Skip to content

Commit 91c8f4e

Browse files
authored
Track maximum runtime of currently running jobs (#17)
Closes #13
1 parent d39772b commit 91c8f4e

File tree

6 files changed

+71
-0
lines changed

6 files changed

+71
-0
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ end
4949
- Number of jobs in dead set (“morgue”): `sidekiq_jobs_dead_count`
5050
- Active processes count: `sidekiq_active_processes`
5151
- Active workers count: `sidekiq_active_workers_count`
52+
- Maximum runtime of currently executing jobs: `sidekiq_running_job_runtime` (useful for detection of hung jobs, segmented by queue and class name)
5253

5354
## Custom tags
5455

lib/yabeda/sidekiq.rb

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ module Sidekiq
3333
gauge :jobs_dead_count, tags: [], comment: "The number of jobs exceeded their retry count."
3434
gauge :active_processes, tags: [], comment: "The number of active Sidekiq worker processes."
3535
gauge :queue_latency, tags: %i[queue], comment: "The queue latency, the difference in seconds since the oldest job in the queue was enqueued"
36+
gauge :running_job_runtime, tags: %i[queue worker], aggregation: :max, unit: :seconds,
37+
comment: "How long currently running jobs are running (useful for detection of hung jobs)"
3638

3739
histogram :job_latency, comment: "The job latency, the difference in seconds between enqueued and running time",
3840
unit: :seconds, per: :job,
@@ -59,6 +61,8 @@ module Sidekiq
5961
sidekiq_queue_latency.set({ queue: queue.name }, queue.latency)
6062
end
6163

64+
Yabeda::Sidekiq.track_max_job_runtime
65+
6266
# That is quite slow if your retry set is large
6367
# I don't want to enable it by default
6468
# retries_by_queues =
@@ -105,6 +109,22 @@ def custom_tags(worker, job)
105109

106110
worker.method(:yabeda_tags).arity.zero? ? worker.yabeda_tags : worker.yabeda_tags(*job["args"])
107111
end
112+
113+
# Hash of hashes containing all currently running jobs' start timestamps
114+
# to calculate maximum durations of currently running not yet completed jobs
115+
# { { queue: "default", worker: "SomeJob" } => { "jid1" => 100500, "jid2" => 424242 } }
116+
attr_accessor :jobs_started_at
117+
118+
# Reports, for every tracked label set, how long its oldest still-running job
# has been executing, by writing that duration to the +running_job_runtime+ gauge.
#
# For each entry of +Yabeda::Sidekiq.jobs_started_at+ the earliest recorded start
# timestamp is subtracted from the current monotonic clock reading and the result,
# rounded to three decimal places, is set on the gauge. Label sets that currently
# have no recorded jobs are reported as 0.
#
# @return [void] the return value is not meaningful
def track_max_job_runtime
  now = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  # Walk a snapshot of the registry rather than the live hash: new label sets are
  # inserted into +jobs_started_at+ (through its default block) whenever a job with
  # unseen labels starts, and Ruby raises "can't add a new key into hash during
  # iteration" if that happens while +each+ is still running over the same hash.
  ::Yabeda::Sidekiq.jobs_started_at.to_a.each do |labels, jobs|
    # +values+ returns a fresh array, so jobs finishing meanwhile cannot disturb +min+.
    oldest_job_started_at = jobs.values.min
    oldest_job_duration = oldest_job_started_at ? (now - oldest_job_started_at).round(3) : 0
    Yabeda.sidekiq.running_job_runtime.set(labels, oldest_job_duration)
  end
end
108126
end
127+
128+
self.jobs_started_at = Concurrent::Hash.new { |hash, key| hash[key] = Concurrent::Hash.new }
109129
end
110130
end

lib/yabeda/sidekiq/server_middleware.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ def call(worker, job, queue)
1212
begin
1313
job_instance = ::Sidekiq::Job.new(job)
1414
Yabeda.sidekiq_job_latency.measure(labels, job_instance.latency)
15+
Yabeda::Sidekiq.jobs_started_at[labels][job["jid"]] = start
1516
Yabeda.with_tags(**custom_tags) do
1617
yield
1718
end
@@ -22,6 +23,7 @@ def call(worker, job, queue)
2223
ensure
2324
Yabeda.sidekiq_job_runtime.measure(labels, elapsed(start))
2425
Yabeda.sidekiq_jobs_executed_total.increment(labels)
26+
Yabeda::Sidekiq.jobs_started_at[labels].delete(job["jid"])
2527
end
2628
end
2729
# rubocop: enable Metrics/AbcSize, Metrics/MethodLength:

spec/support/jobs.rb

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,15 @@ def perform(*_args)
88
end
99
end
1010

11+
# Spec fixture: a Sidekiq worker whose +perform+ blocks briefly, so that it can be
# observed while it is still executing.
class SampleLongRunningJob
  include Sidekiq::Worker

  # Ignores its arguments, sleeps for 0.05 seconds and returns a completion message.
  def perform(*_args)
    sleep(0.05)
    "Phew, I'm done!"
  end
end
19+
1120
class SampleComplexJob
1221
include Sidekiq::Worker
1322

spec/support/sidekiq_inline_middlewares.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ def push(job)
77
return super unless Sidekiq::Testing.inline?
88

99
job = Sidekiq.load_json(Sidekiq.dump_json(job))
10+
job["jid"] ||= SecureRandom.hex(12)
1011
job_class = Sidekiq::Testing.constantize(job["class"])
1112
job_instance = job_class.new
1213
queue = (job_instance.sidekiq_options_hash || {}).fetch("queue", "default")

spec/yabeda/sidekiq_spec.rb

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,5 +167,43 @@
167167
expect(Yabeda.sidekiq.jobs_dead_count.values).to eq({ {} => 3 })
168168
expect(Yabeda.sidekiq.jobs_scheduled_count.values).to eq({ {} => 2 })
169169
end
170+
171+
it "measures maximum runtime of currently running jobs" do
172+
Yabeda.sidekiq.running_job_runtime.values.clear # This is a hack
173+
described_class.jobs_started_at.clear
174+
175+
Sidekiq::Testing.inline! do
176+
workers = []
177+
workers.push(Thread.new { SampleLongRunningJob.perform_async })
178+
sleep 0.012 # Ruby can sleep less than requested
179+
workers.push(Thread.new { SampleLongRunningJob.perform_async })
180+
181+
Yabeda.collectors.each(&:call)
182+
expect(Yabeda.sidekiq.running_job_runtime.values).to include(
183+
{ queue: "default", worker: "SampleLongRunningJob" } => (be >= 0.010),
184+
)
185+
186+
sleep 0.012 # Ruby can sleep less than requested
187+
begin
188+
FailingActiveJob.perform_later
189+
rescue StandardError
190+
nil
191+
end
192+
Yabeda.collectors.each(&:call)
193+
194+
expect(Yabeda.sidekiq.running_job_runtime.values).to include(
195+
{ queue: "default", worker: "SampleLongRunningJob" } => (be >= 0.020),
196+
{ queue: "default", worker: "FailingActiveJob" } => 0,
197+
)
198+
199+
# When all jobs are completed, metric should respond with zero
200+
workers.map(&:join)
201+
Yabeda.collectors.each(&:call)
202+
expect(Yabeda.sidekiq.running_job_runtime.values).to include(
203+
{ queue: "default", worker: "SampleLongRunningJob" } => 0,
204+
{ queue: "default", worker: "FailingActiveJob" } => 0,
205+
)
206+
end
207+
end
170208
end
171209
end

0 commit comments

Comments
 (0)