Skip to content

Commit 4b11d91

Browse files
authored
feat(debugger): add special handling for very large collections/objects (#6912)
Skip enumerating collections with more than 500 items to avoid excessive CDP serialization time. Collections above the threshold are flagged with a notCapturedReason and their size is recorded. For objects with more than 500 own properties, capture once but prevent future snapshot attempts for the same probe version within the current process. An evaluation error is recorded to notify the user that no more snapshots will be collected for that probe version. This mitigates performance issues when encountering extremely large objects or collections during snapshot capture.
1 parent 5abf807 commit 4b11d91

File tree

16 files changed

+380
-122
lines changed

16 files changed

+380
-122
lines changed

integration-tests/debugger/snapshot-time-budget.spec.js

Lines changed: 159 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,14 @@
22

33
const assert = require('node:assert')
44
const { setup } = require('./utils')
5+
const {
6+
DEFAULT_MAX_COLLECTION_SIZE,
7+
LARGE_OBJECT_SKIP_THRESHOLD
8+
} = require('../../packages/dd-trace/src/debugger/devtools_client/snapshot/constants')
59

610
describe('Dynamic Instrumentation', function () {
711
describe('input messages', function () {
8-
describe('with snapshot under tight time budget', function () {
12+
describe('with snapshot under time budget', function () {
913
context('1ms time budget', function () {
1014
// Force a very small time budget in ms to trigger partial snapshots
1115
const budget = 1
@@ -17,57 +21,177 @@ describe('Dynamic Instrumentation', function () {
1721
it(
1822
'should include partial snapshot marked with notCapturedReason: timeout',
1923
// A tolerance of 15ms is used to avoid flakiness
20-
test({ t, maxPausedTime: budget + 15, breakpointIndex: 0, maxReferenceDepth: 5 })
24+
test({ t, maxPausedTime: budget + 15, breakpointIndex: 0, maxReferenceDepth: 5 }, (locals) => {
25+
assert.strictEqual(
26+
containsTimeBudget(locals),
27+
true,
28+
'expected at least one field/element to be marked with notCapturedReason: "timeout"'
29+
)
30+
})
2131
)
2232
})
2333

2434
context('default time budget', function () {
25-
const budget = 15 // default time budget in ms
2635
const t = setup({ dependencies: ['fastify'] })
2736

28-
// TODO: Make this pass
29-
// eslint-disable-next-line mocha/no-pending-tests
30-
it.skip(
31-
'should keep budget when state includes an object with 1 million properties',
32-
// A tolerance of 5ms is used to avoid flakiness
33-
test({ t, maxPausedTime: budget + 5, breakpointIndex: 1, maxReferenceDepth: 1 })
37+
it(
38+
'should timeout first, then disable subsequent snapshots and emit error diagnostics',
39+
async function () {
40+
const breakpoint = t.breakpoints[1]
41+
const expectedEvaluationErrors = [{
42+
expr: '',
43+
message: 'An object with 1000000 properties was detected while collecting a snapshot. This exceeds ' +
44+
`the maximum number of allowed properties of ${LARGE_OBJECT_SKIP_THRESHOLD}. Future snapshots for ` +
45+
'existing probes in this location will be skipped until the Node.js process is restarted'
46+
}]
47+
48+
// Listen for the first snapshot payload (should contain notCapturedReason: "timeout")
49+
const firstPayloadReceived = new Promise(/** @type {() => void} */ (resolve) => {
50+
t.agent.once('debugger-input', ({ payload: [{ debugger: { snapshot } }] }) => {
51+
const { locals } = snapshot.captures.lines[breakpoint.line]
52+
assert.strictEqual(
53+
containsTimeBudget(locals),
54+
true,
55+
'expected at least one field/element to be marked with notCapturedReason: "timeout"'
56+
)
57+
assert.deepStrictEqual(snapshot.evaluationErrors, expectedEvaluationErrors)
58+
resolve()
59+
})
60+
})
61+
62+
// Install probe with snapshot capture enabled
63+
t.agent.addRemoteConfig(breakpoint.generateRemoteConfig({
64+
captureSnapshot: true,
65+
capture: { maxReferenceDepth: 1 }
66+
}))
67+
68+
// Trigger once; this run is expected to be slow and mark fields with "timeout"
69+
const result1 = /** @type {import('axios').AxiosResponse<{ paused: number }>} */
70+
(await breakpoint.triggerBreakpoint())
71+
assert.ok(
72+
result1.data.paused >= 1_000,
73+
`expected thread to be paused for at least 1 second, but was paused for ~${result1.data.paused}ms`
74+
)
75+
76+
await firstPayloadReceived
77+
78+
// Prepare to assert that no snapshot is produced on a subsequent trigger
79+
const secondPayloadReceived = new Promise(/** @type {() => void} */ (resolve) => {
80+
t.agent.once('debugger-input', ({ payload: [{ debugger: { snapshot } }] }) => {
81+
assert.ok(!Object.hasOwn(snapshot, 'captures'))
82+
assert.deepStrictEqual(snapshot.evaluationErrors, expectedEvaluationErrors)
83+
resolve()
84+
})
85+
})
86+
87+
// Trigger the same breakpoint again directly
88+
const result2 = await t.axios.get(breakpoint.url)
89+
assert.ok(
90+
result2.data.paused <= 50,
91+
`expected thread to be paused <=50ms, but was paused for ~${result2.data.paused}ms`
92+
)
93+
94+
await secondPayloadReceived
95+
}
3496
)
97+
})
98+
99+
context('large object thresholds', function () {
100+
// The tests in this group, should take a lot longer than 100ms to capture a snapshot if no thresholds are
101+
// applied. But they should take a lot less than the 100ms budget if thresholds are applied. Using 100ms means
102+
// that the tests should not be flaky, but still fail if the thresholds are not applied.
103+
const budget = 100
104+
const t = setup({
105+
dependencies: ['fastify'],
106+
env: { DD_DYNAMIC_INSTRUMENTATION_CAPTURE_TIMEOUT_MS: String(budget) }
107+
})
35108

36-
// TODO: Make this pass
37-
// eslint-disable-next-line mocha/no-pending-tests
38-
it.skip(
39-
'should keep budget when state includes an array of 1 million primitives',
40-
// A tolerance of 5ms is used to avoid flakiness
41-
test({ t, maxPausedTime: budget + 5, breakpointIndex: 2, maxReferenceDepth: 1 })
109+
it(
110+
'should keep budget when state includes collections with 1 million elements',
111+
test({ t, maxPausedTime: budget, breakpointIndex: 2, maxReferenceDepth: 1 }, (locals) => {
112+
const notCapturedReason = `Large collection with too many elements (skip threshold: ${
113+
LARGE_OBJECT_SKIP_THRESHOLD
114+
})`
115+
assert.strictEqual(locals.arrOfPrimitives.notCapturedReason, notCapturedReason)
116+
assert.strictEqual(locals.arrOfPrimitives.size, 1_000_000)
117+
assert.strictEqual(locals.arrOfPrimitives.elements.length, 0)
118+
assert.strictEqual(locals.arrOfObjects.notCapturedReason, notCapturedReason)
119+
assert.strictEqual(locals.arrOfObjects.size, 1_000_000)
120+
assert.strictEqual(locals.arrOfObjects.elements.length, 0)
121+
assert.strictEqual(locals.map.notCapturedReason, notCapturedReason)
122+
assert.strictEqual(locals.map.size, 1_000_000)
123+
assert.strictEqual(locals.map.entries.length, 0)
124+
assert.strictEqual(locals.set.notCapturedReason, notCapturedReason)
125+
assert.strictEqual(locals.set.size, 1_000_000)
126+
assert.strictEqual(locals.set.elements.length, 0)
127+
})
42128
)
43129

44-
// TODO: Make this pass
45-
// eslint-disable-next-line mocha/no-pending-tests
46-
it.skip(
47-
'should keep budget when state includes an array of 1 million objects',
48-
// A tolerance of 5ms is used to avoid flakiness
49-
test({ t, maxPausedTime: budget + 5, breakpointIndex: 3, maxReferenceDepth: 1 })
130+
it(
131+
'should keep budget when state includes collections with less than the size threshold',
132+
test({ t, maxPausedTime: budget, breakpointIndex: 3, maxReferenceDepth: 1 }, (locals) => {
133+
assert.strictEqual(locals.arrOfPrimitives.notCapturedReason, 'collectionSize')
134+
assert.strictEqual(locals.arrOfPrimitives.size, LARGE_OBJECT_SKIP_THRESHOLD - 1)
135+
assert.strictEqual(locals.arrOfPrimitives.elements.length, DEFAULT_MAX_COLLECTION_SIZE)
136+
assert.strictEqual(locals.arrOfObjects.notCapturedReason, 'collectionSize')
137+
assert.strictEqual(locals.arrOfObjects.size, LARGE_OBJECT_SKIP_THRESHOLD - 1)
138+
assert.strictEqual(locals.arrOfObjects.elements.length, DEFAULT_MAX_COLLECTION_SIZE)
139+
assert.strictEqual(locals.map.notCapturedReason, 'collectionSize')
140+
assert.strictEqual(locals.map.size, LARGE_OBJECT_SKIP_THRESHOLD - 1)
141+
assert.strictEqual(locals.map.entries.length, DEFAULT_MAX_COLLECTION_SIZE)
142+
assert.strictEqual(locals.set.notCapturedReason, 'collectionSize')
143+
assert.strictEqual(locals.set.size, LARGE_OBJECT_SKIP_THRESHOLD - 1)
144+
assert.strictEqual(locals.set.elements.length, DEFAULT_MAX_COLLECTION_SIZE)
145+
})
50146
)
51147
})
148+
149+
context('fuzzing', function () {
150+
for (let budget = 0; budget < 20; budget++) {
151+
context(`graceful handling with time budget of ${budget}ms`, function () {
152+
// Anything longer than this, and the debugger worker thread most likely crashed.
153+
// Run test with `DD_TRACE_DEBUG=true` to see more.
154+
this.timeout(2000)
155+
156+
const t = setup({
157+
dependencies: ['fastify'],
158+
env: { DD_DYNAMIC_INSTRUMENTATION_CAPTURE_TIMEOUT_MS: String(budget) }
159+
})
160+
161+
// If this test uncovers any issues, it will show itself as being flaky, as the exact timing of how long it
162+
// takes to collect the snapshot is not deterministic.
163+
it('should send a probe result to the agent', async function () {
164+
t.agent.addRemoteConfig(t.generateRemoteConfig({
165+
captureSnapshot: true,
166+
capture: { maxReferenceDepth: 5 }
167+
}))
168+
t.triggerBreakpoint()
169+
170+
const { captures } = await t.snapshotReceived()
171+
// If the snapshot contains a `captures` field, it means it was captured and processes without any issues
172+
assert.ok(
173+
captures !== null && typeof captures === 'object',
174+
'expected snapshot to contain a `captures` object'
175+
)
176+
// To make this test more future-proof, we also assert that the snapshot contains at least one local
177+
// property, though currently this is not necessary.
178+
assert.ok(
179+
Object.keys(captures.lines[t.breakpoint.line].locals).length > 0,
180+
'expected snapshot to contain at least one local property'
181+
)
182+
})
183+
})
184+
}
185+
})
52186
})
53187
})
54188
})
55189

56-
function test ({ t, maxPausedTime, breakpointIndex, maxReferenceDepth }) {
190+
function test ({ t, maxPausedTime = 0, breakpointIndex, maxReferenceDepth }, assertFn) {
57191
const breakpoint = t.breakpoints[breakpointIndex]
58192

59193
return async function () {
60-
const payloadReceived = new Promise((/** @type {(value?: unknown) => void} */ resolve) => {
61-
t.agent.on('debugger-input', ({ payload: [{ debugger: { snapshot: { captures } } }] }) => {
62-
const { locals } = captures.lines[breakpoint.line]
63-
assert.strictEqual(
64-
containsTimeBudget(locals),
65-
true,
66-
'expected at least one field/element to be marked with notCapturedReason: "timeout"'
67-
)
68-
resolve()
69-
})
70-
})
194+
const snapshotPromise = t.snapshotReceived()
71195

72196
t.agent.addRemoteConfig(breakpoint.generateRemoteConfig({
73197
captureSnapshot: true,
@@ -81,7 +205,8 @@ function test ({ t, maxPausedTime, breakpointIndex, maxReferenceDepth }) {
81205
`expected thread to be paused <=${maxPausedTime}ms, but was paused for ~${data.paused}ms`
82206
)
83207

84-
await payloadReceived
208+
const snapshot = await snapshotPromise
209+
assertFn?.(snapshot.captures.lines[breakpoint.line].locals)
85210
}
86211
}
87212

integration-tests/debugger/target-app/snapshot-time-budget.js

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ require('dd-trace/init')
55

66
// @ts-expect-error This code is running in a sandbox where fastify is available
77
const Fastify = require('fastify')
8+
const {
9+
LARGE_OBJECT_SKIP_THRESHOLD
10+
// @ts-expect-error This code is running in a sandbox where dd-trace is available
11+
} = require('dd-trace/packages/dd-trace/src/debugger/devtools_client/snapshot/constants')
812

913
const fastify = Fastify({ logger: { level: 'error' } })
1014

@@ -18,24 +22,31 @@ fastify.get('/deeply-nested-large-object', function handler () {
1822
return { paused: Number(diff) / 1_000_000 }
1923
})
2024

21-
fastify.get('/large-object-of-primitives', function handler () {
25+
fastify.get('/object-with-many-properties', function handler () {
2226
const obj = generateObject(0, 1_000_000) // eslint-disable-line no-unused-vars
2327
const start = process.hrtime.bigint()
24-
const diff = process.hrtime.bigint() - start // BREAKPOINT: /large-object-of-primitives
28+
const diff = process.hrtime.bigint() - start // BREAKPOINT: /object-with-many-properties
2529
return { paused: Number(diff) / 1_000_000 }
2630
})
2731

28-
fastify.get('/large-array-of-primitives', function handler () {
29-
const arr = Array.from({ length: 1_000_000 }, (_, i) => i) // eslint-disable-line no-unused-vars
32+
fastify.get('/large-collections', function handler () {
33+
const arrOfPrimitives = Array.from({ length: 1_000_000 }, (_, i) => i) // eslint-disable-line no-unused-vars
34+
const arrOfObjects = Array.from({ length: 1_000_000 }, (_, i) => ({ i }))
35+
const map = new Map(arrOfObjects.map((obj, i) => [i, obj])) // eslint-disable-line no-unused-vars
36+
const set = new Set(arrOfObjects) // eslint-disable-line no-unused-vars
3037
const start = process.hrtime.bigint()
31-
const diff = process.hrtime.bigint() - start // BREAKPOINT: /large-array-of-primitives
38+
const diff = process.hrtime.bigint() - start // BREAKPOINT: /large-collections
3239
return { paused: Number(diff) / 1_000_000 }
3340
})
3441

35-
fastify.get('/large-array-of-objects', function handler () {
36-
const arr = Array.from({ length: 1_000_000 }, (_, i) => ({ i })) // eslint-disable-line no-unused-vars
42+
fastify.get('/large-collections-below-size-threshold', function handler () {
43+
// eslint-disable-next-line no-unused-vars
44+
const arrOfPrimitives = Array.from({ length: LARGE_OBJECT_SKIP_THRESHOLD - 1 }, (_, i) => i)
45+
const arrOfObjects = Array.from({ length: LARGE_OBJECT_SKIP_THRESHOLD - 1 }, (_, i) => ({ i }))
46+
const map = new Map(arrOfObjects.map((obj, i) => [i, obj])) // eslint-disable-line no-unused-vars
47+
const set = new Set(arrOfObjects) // eslint-disable-line no-unused-vars
3748
const start = process.hrtime.bigint()
38-
const diff = process.hrtime.bigint() - start // BREAKPOINT: /large-array-of-objects
49+
const diff = process.hrtime.bigint() - start // BREAKPOINT: /large-collections-below-size-threshold
3950
return { paused: Number(diff) / 1_000_000 }
4051
})
4152

integration-tests/debugger/utils.js

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ const pollInterval = 0.1
4141
*
4242
* @typedef {BreakpointInfo & {
4343
* rcConfig: object|null,
44-
* triggerBreakpoint: (url: string) => Promise<import('axios').AxiosResponse<unknown>>,
44+
* triggerBreakpoint: () => Promise<import('axios').AxiosResponse<unknown>>,
4545
* generateRemoteConfig: (overrides?: object) => object,
4646
* generateProbeConfig: BoundGenerateProbeConfigFn
4747
* }} EnrichedBreakpoint
@@ -66,6 +66,7 @@ const pollInterval = 0.1
6666
* once installed.
6767
* @property {(overrides?: object) => object} generateRemoteConfig - Generates RC for the primary breakpoint.
6868
* @property {BoundGenerateProbeConfigFn} generateProbeConfig - Generates probe config for the primary breakpoint.
69+
* @property {() => Promise<object>} snapshotReceived - Waits for a snapshot to be received from the test app.
6970
*/
7071

7172
module.exports = {
@@ -129,7 +130,15 @@ function setup ({ env, testApp, testAppSource, dependencies, silent, stdioHandle
129130
rcConfig: null,
130131
triggerBreakpoint: triggerBreakpoint.bind(null, breakpoints[0].url),
131132
generateRemoteConfig: generateRemoteConfig.bind(null, breakpoints[0]),
132-
generateProbeConfig: generateProbeConfig.bind(null, breakpoints[0])
133+
generateProbeConfig: generateProbeConfig.bind(null, breakpoints[0]),
134+
135+
snapshotReceived () {
136+
return new Promise((/** @type {(value: object) => void} */ resolve) => {
137+
t.agent.on('debugger-input', ({ payload: [{ debugger: { snapshot } }] }) => {
138+
resolve(snapshot)
139+
})
140+
})
141+
}
133142
}
134143

135144
/**

packages/dd-trace/src/ci-visibility/dynamic-instrumentation/worker/index.js

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ session.on('Debugger.paused', async ({ params: { hitBreakpoints: [hitBreakpoint]
3636

3737
const stack = getStackFromCallFrames(callFrames)
3838

39-
const getLocalState = await getLocalStateForCallFrame(callFrames[0])
39+
const { processLocalState } = await getLocalStateForCallFrame(callFrames[0])
4040

4141
await session.post('Debugger.resume')
4242

@@ -48,17 +48,13 @@ session.on('Debugger.paused', async ({ params: { hitBreakpoints: [hitBreakpoint]
4848
version: '0',
4949
location: probe.location
5050
},
51+
captures: {
52+
lines: { [probe.location.lines[0]]: { locals: processLocalState() } }
53+
},
5154
stack,
5255
language: 'javascript'
5356
}
5457

55-
const state = getLocalState()
56-
if (state) {
57-
snapshot.captures = {
58-
lines: { [probe.location.lines[0]]: { locals: state } }
59-
}
60-
}
61-
6258
breakpointHitChannel.postMessage({ snapshot })
6359
})
6460

0 commit comments

Comments
 (0)