Skip to content

Commit ddca4f7

Browse files
committed
Fix Delayed::Job probe crash on deserialization errors
Previously, the `Delayed::Job` probe eagerly deserializes the `payload_object` in order to name the trace. This moves the work ahead of where it would normally happen, and in case of an error during deserialization (e.g. syntax error in the YAML payload, or a deleted database record as in the customer issue), the error will be raised outside of the block where it's normally be caught. This fix defers the naming until a spot where we can guarantee DJ's successful deserialization, preventing the crashes while capturing the deserialization (database load) time within the trace. If an error occurs prior to reaching that step, the trace will have a default "unknown" label. It is possible that this would affect some traces that we previously were able to name successfully – specifically, if an error occurs in a lifecycle hook. If this turns out to be an issue, we can try and opportunistically check for `@payload_object` on the `job` instance during the `:error` and `:failure` hook, but this is reaching too deep into the implementation internals for my taste. And honestly it seems fair enough to categorize anything that never made it to the actual `#perform` as generic "unknown+error" traces. Fixes #491
1 parent b277ad3 commit ddca4f7

File tree

2 files changed

+122
-25
lines changed

2 files changed

+122
-25
lines changed

lib/skylight/probes/delayed_job.rb

Lines changed: 40 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,36 @@ module DelayedJob
88
begin
99
require "delayed/plugin"
1010

11+
UNKNOWN = "<Delayed::Job Unknown>"
12+
1113
class Plugin < ::Delayed::Plugin
1214
callbacks do |lifecycle|
1315
lifecycle.around(:perform) { |worker, job, &block| sk_instrument(worker, job, &block) }
14-
1516
lifecycle.after(:error) { |_worker, _job| Skylight.trace&.segment = "error" }
17+
lifecycle.after(:failure) { |_worker, _job| Skylight.trace&.segment = "error" }
1618
end
1719

1820
class << self
1921
include Skylight::Util::Logging
2022

23+
# This is called quite early in Delayed::Worker
24+
#
25+
# Typically, the `:perform` lifecycle hook is called before the
26+
# `payload_object` has been deserialized, so we can't name the
27+
# trace yet.
28+
#
29+
# If we call `payload_object` here, we would move the work of
30+
# loading the object ahead of where it naturally happens, which
31+
# means the database load time won't be instrumented. On the other
32+
# hand, should the deserialization fail, we would have moved the
33+
# timing of the error as well. Crucially – it would have moved it
34+
# outside of the spot where these errors are normally caught and
35+
# reported by the worker.
36+
#
37+
# See https://github.com/skylightio/skylight-ruby/issues/491
2138
def sk_instrument(_worker, job)
22-
endpoint = Skylight::Probes::DelayedJob.handler_name(job)
23-
2439
Skylight.trace(
25-
endpoint,
40+
UNKNOWN,
2641
"app.delayed_job.worker",
2742
"Delayed::Worker#run",
2843
component: :worker,
@@ -41,15 +56,6 @@ def sk_instrument(_worker, job)
4156
$stderr.puts "[SKYLIGHT] The delayed_job probe was requested, but Delayed::Plugin was not defined."
4257
end
4358

44-
UNKNOWN = "<Delayed::Job Unknown>"
45-
46-
def self.handler_name(job)
47-
payload_object =
48-
job.respond_to?(:payload_object_without_sk) ? job.payload_object_without_sk : job.payload_object
49-
50-
payload_object_name(payload_object)
51-
end
52-
5359
def self.payload_object_name(payload_object)
5460
if payload_object.is_a?(::Delayed::PerformableMethod)
5561
payload_object.display_name
@@ -77,18 +83,27 @@ def self.payload_object_source_meta(payload_object)
7783

7884
class InstrumentationProxy < SimpleDelegator
7985
def perform
80-
source_meta = Skylight::Probes::DelayedJob.payload_object_source_meta(__getobj__)
81-
82-
opts = {
83-
category: "app.delayed_job.job",
84-
title: format_source(*source_meta),
85-
meta: {
86-
source_location_hint: source_meta
87-
},
88-
internal: true
89-
}
90-
91-
Skylight.instrument(opts) { __getobj__.perform }
86+
if (trace = Skylight.instrumenter&.current_trace)
87+
if trace.endpoint == UNKNOWN
88+
# At this point, deserialization was, by definition, successful.
89+
# So it'd be safe to set the endpoint name based on the payload
90+
# object here.
91+
trace.endpoint = Skylight::Probes::DelayedJob.payload_object_name(__getobj__)
92+
end
93+
94+
source_meta = Skylight::Probes::DelayedJob.payload_object_source_meta(__getobj__)
95+
96+
opts = {
97+
category: "app.delayed_job.job",
98+
title: format_source(*source_meta),
99+
meta: {
100+
source_location_hint: source_meta
101+
},
102+
internal: true
103+
}
104+
105+
Skylight.instrument(opts) { __getobj__.perform }
106+
end
92107
end
93108

94109
# Used by Delayed::Backend::Base to determine Job#name

spec/integration/delayed_job_spec.rb

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,88 @@ def enqueue_job(*args)
276276
)
277277
end
278278
end
279+
280+
context "with ActiveRecord model" do
281+
let(:users_migration) do
282+
base = ActiveRecord::Migration
283+
base = base::Current if defined?(base::Current)
284+
285+
Class.new(base) do
286+
def self.up
287+
create_table :users, force: true do |table|
288+
table.string :name, null: false
289+
table.timestamps null: true
290+
end
291+
end
292+
293+
def self.down
294+
drop_table :users
295+
end
296+
end
297+
end
298+
299+
around do |example|
300+
with_sqlite(migration: users_migration) do
301+
example.call
302+
end
303+
end
304+
305+
before do
306+
stub_const("SkDelayedRecord", Class.new(ActiveRecord::Base) do
307+
self.table_name = "users"
308+
309+
def good_method
310+
Skylight.instrument(category: "app.zomg") do
311+
SpecHelper.clock.skip 1
312+
end
313+
end
314+
315+
def self.good_method
316+
new.good_method
317+
end
318+
end)
319+
end
320+
321+
# overrides enqueue_job on the outer context
322+
def enqueue_job(_method_name, *, class_method: false)
323+
if class_method
324+
SkDelayedRecord.delay(queue: "queue-name").good_method
325+
else
326+
SkDelayedRecord.create!(name: "test-record").tap do |record|
327+
record.delay(queue: "queue-name").good_method
328+
end
329+
end
330+
end
331+
332+
specify "instance method" do
333+
enqueue_and_process_job(:good_method)
334+
335+
server.wait resource: "/report"
336+
report = server.reports[0].to_simple_report
337+
expect(report.endpoint.name).to eq("SkDelayedRecord#good_method<sk-segment>queue-name</sk-segment>")
338+
end
339+
340+
specify "class method" do
341+
enqueue_and_process_job(:good_method, class_method: true)
342+
343+
server.wait resource: "/report"
344+
report = server.reports[0].to_simple_report
345+
expect(report.endpoint.name).to eq("SkDelayedRecord.good_method<sk-segment>queue-name</sk-segment>")
346+
end
347+
348+
specify "instance method on a deleted record" do
349+
SkDelayedRecord.create!(name: "test-record").tap do |record|
350+
record.delay(queue: "queue-name").good_method
351+
record.destroy!
352+
end
353+
354+
expect { worker.work_off }.not_to raise_error
355+
356+
server.wait resource: "/report"
357+
report = server.reports[0].to_simple_report
358+
expect(report.endpoint.name).to eq("<Delayed::Job Unknown><sk-segment>error</sk-segment>")
359+
end
360+
end
279361
end
280362

281363
def enqueue_job(method_name, *, class_method: false)

0 commit comments

Comments
 (0)