Skip to content

Commit 91a10e1

Browse files
nsavoireszegedi
authored andcommitted
[profiling] Add experimental CPU profiler (#3895)
* Add CPU profiler * Avoid restarting profiling for last collect Upon process exit, profiler collects profiles a last time and was restarting all profilers before stopping them. Avoid this useless restart by changing the profiler API and make `profile()` methode take a restart argument. * Bump pprof-nodejs version
1 parent bfd9dcc commit 91a10e1

File tree

15 files changed

+166
-68
lines changed

15 files changed

+166
-68
lines changed

integration-tests/profiler.spec.js

+14-8
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,8 @@ describe('profiler', () => {
191191
const threadIdKey = strings.dedup('thread id')
192192
const osThreadIdKey = strings.dedup('os thread id')
193193
const threadNameValue = strings.dedup('Main Event Loop')
194+
const nonJSThreadNameValue = strings.dedup('Non-JS threads')
195+
194196
for (const sample of profile.sample) {
195197
let ts, spanId, rootSpanId, endpoint, threadName, threadId, osThreadId
196198
for (const label of sample.label) {
@@ -205,14 +207,18 @@ describe('profiler', () => {
205207
default: assert.fail(`Unexpected label key ${strings.dedup(label.key)} ${encoded}`)
206208
}
207209
}
208-
// Timestamp must be defined and be between process start and end time
209-
assert.isDefined(ts, encoded)
210-
assert.isNumber(osThreadId, encoded)
211-
assert.equal(threadId, strings.dedup('0'), encoded)
212-
assert.isTrue(ts <= procEnd, encoded)
213-
assert.isTrue(ts >= procStart, encoded)
214-
// Thread name must be defined and exactly equal "Main Event Loop"
215-
assert.equal(threadName, threadNameValue, encoded)
210+
if (threadName !== nonJSThreadNameValue) {
211+
// Timestamp must be defined and be between process start and end time
212+
assert.isDefined(ts, encoded)
213+
assert.isNumber(osThreadId, encoded)
214+
assert.equal(threadId, strings.dedup('0'), encoded)
215+
assert.isTrue(ts <= procEnd, encoded)
216+
assert.isTrue(ts >= procStart, encoded)
217+
// Thread name must be defined and exactly equal "Main Event Loop"
218+
assert.equal(threadName, threadNameValue, encoded)
219+
} else {
220+
assert.equal(threadId, strings.dedup('NA'), encoded)
221+
}
216222
// Either all or none of span-related labels are defined
217223
if (endpoint === undefined) {
218224
// It is possible to catch a sample executing in tracer's startSpan so

package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@
7272
"@datadog/native-iast-rewriter": "2.2.2",
7373
"@datadog/native-iast-taint-tracking": "1.6.4",
7474
"@datadog/native-metrics": "^2.0.0",
75-
"@datadog/pprof": "4.1.0",
75+
"@datadog/pprof": "5.0.0",
7676
"@datadog/sketches-js": "^2.1.0",
7777
"@opentelemetry/api": "^1.0.0",
7878
"@opentelemetry/core": "^1.14.0",

packages/dd-trace/src/profiling/config.js

+4
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ class Config {
3535
DD_PROFILING_HEAP_ENABLED,
3636
DD_PROFILING_V8_PROFILER_BUG_WORKAROUND,
3737
DD_PROFILING_WALLTIME_ENABLED,
38+
DD_PROFILING_EXPERIMENTAL_CPU_ENABLED,
3839
DD_PROFILING_EXPERIMENTAL_OOM_MONITORING_ENABLED,
3940
DD_PROFILING_EXPERIMENTAL_OOM_HEAP_LIMIT_EXTENSION_SIZE,
4041
DD_PROFILING_EXPERIMENTAL_OOM_MAX_HEAP_EXTENSION_COUNT,
@@ -150,6 +151,9 @@ class Config {
150151
DD_PROFILING_EXPERIMENTAL_CODEHOTSPOTS_ENABLED, false))
151152
logExperimentalVarDeprecation('CODEHOTSPOTS_ENABLED')
152153

154+
this.cpuProfilingEnabled = isTrue(coalesce(options.cpuProfilingEnabled,
155+
DD_PROFILING_EXPERIMENTAL_CPU_ENABLED, false))
156+
153157
this.profilers = ensureProfilers(profilers, this)
154158
}
155159
}

packages/dd-trace/src/profiling/exporters/file.js

+2-1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
const fs = require('fs')
44
const { promisify } = require('util')
5+
const { threadId } = require('worker_threads')
56
const writeFile = promisify(fs.writeFile)
67

78
function formatDateTime (t) {
@@ -19,7 +20,7 @@ class FileExporter {
1920
const types = Object.keys(profiles)
2021
const dateStr = formatDateTime(end)
2122
const tasks = types.map(type => {
22-
return writeFile(`${this._pprofPrefix}${type}_${dateStr}.pprof`, profiles[type])
23+
return writeFile(`${this._pprofPrefix}${type}_worker_${threadId}_${dateStr}.pprof`, profiles[type])
2324
})
2425

2526
return Promise.all(tasks)

packages/dd-trace/src/profiling/profiler.js

+18-14
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
const { EventEmitter } = require('events')
44
const { Config } = require('./config')
55
const { snapshotKinds } = require('./constants')
6+
const { threadNamePrefix } = require('./profilers/shared')
67

78
function maybeSourceMap (sourceMap, SourceMapper, debug) {
89
if (!sourceMap) return
@@ -68,7 +69,7 @@ class Profiler extends EventEmitter {
6869
mapper,
6970
nearOOMCallback: this._nearOOMExport.bind(this)
7071
})
71-
this._logger.debug(`Started ${profiler.type} profiler`)
72+
this._logger.debug(`Started ${profiler.type} profiler in ${threadNamePrefix} thread`)
7273
}
7374

7475
this._capture(this._timeoutInterval, start)
@@ -97,7 +98,7 @@ class Profiler extends EventEmitter {
9798

9899
// collect and export current profiles
99100
// once collect returns, profilers can be safely stopped
100-
this._collect(snapshotKinds.ON_SHUTDOWN)
101+
this._collect(snapshotKinds.ON_SHUTDOWN, false)
101102
this._stop()
102103
}
103104

@@ -108,13 +109,11 @@ class Profiler extends EventEmitter {
108109

109110
for (const profiler of this._config.profilers) {
110111
profiler.stop()
111-
this._logger.debug(`Stopped ${profiler.type} profiler`)
112+
this._logger.debug(`Stopped ${profiler.type} profiler in ${threadNamePrefix} thread`)
112113
}
113114

114115
clearTimeout(this._timer)
115116
this._timer = undefined
116-
117-
return this
118117
}
119118

120119
_capture (timeout, start) {
@@ -128,18 +127,21 @@ class Profiler extends EventEmitter {
128127
}
129128
}
130129

131-
async _collect (snapshotKind) {
130+
async _collect (snapshotKind, restart = true) {
132131
if (!this._enabled) return
133132

134-
const start = this._lastStart
135-
const end = new Date()
133+
const startDate = this._lastStart
134+
const endDate = new Date()
136135
const profiles = []
137136
const encodedProfiles = {}
138137

139138
try {
140139
// collect profiles synchronously so that profilers can be safely stopped asynchronously
141140
for (const profiler of this._config.profilers) {
142-
const profile = profiler.profile(start, end)
141+
const profile = profiler.profile(restart, startDate, endDate)
142+
if (!restart) {
143+
this._logger.debug(`Stopped ${profiler.type} profiler in ${threadNamePrefix} thread`)
144+
}
143145
if (!profile) continue
144146
profiles.push({ profiler, profile })
145147
}
@@ -155,8 +157,10 @@ class Profiler extends EventEmitter {
155157
})
156158
}
157159

158-
this._capture(this._timeoutInterval, end)
159-
await this._submit(encodedProfiles, start, end, snapshotKind)
160+
if (restart) {
161+
this._capture(this._timeoutInterval, endDate)
162+
}
163+
await this._submit(encodedProfiles, startDate, endDate, snapshotKind)
160164
this._logger.debug('Submitted profiles')
161165
} catch (err) {
162166
this._logger.error(err)
@@ -196,10 +200,10 @@ class ServerlessProfiler extends Profiler {
196200
this._flushAfterIntervals = this._config.flushInterval / 1000
197201
}
198202

199-
async _collect (snapshotKind) {
200-
if (this._profiledIntervals >= this._flushAfterIntervals) {
203+
async _collect (snapshotKind, restart = true) {
204+
if (this._profiledIntervals >= this._flushAfterIntervals || !restart) {
201205
this._profiledIntervals = 0
202-
await super._collect(snapshotKind)
206+
await super._collect(snapshotKind, restart)
203207
} else {
204208
this._profiledIntervals += 1
205209
this._capture(this._timeoutInterval, new Date())

packages/dd-trace/src/profiling/profilers/events.js

+10-4
Original file line numberDiff line numberDiff line change
@@ -159,22 +159,24 @@ class EventsProfiler {
159159
}
160160

161161
start () {
162+
// if already started, do nothing
163+
if (this._observer) return
164+
162165
function add (items) {
163166
this.entries.push(...items.getEntries())
164167
}
165-
if (!this._observer) {
166-
this._observer = new PerformanceObserver(add.bind(this))
167-
}
168+
this._observer = new PerformanceObserver(add.bind(this))
168169
this._observer.observe({ entryTypes: Object.keys(decoratorTypes) })
169170
}
170171

171172
stop () {
172173
if (this._observer) {
173174
this._observer.disconnect()
175+
this._observer = undefined
174176
}
175177
}
176178

177-
profile (startDate, endDate) {
179+
profile (restart, startDate, endDate) {
178180
if (this.entries.length === 0) {
179181
// No events in the period; don't produce a profile
180182
return null
@@ -243,6 +245,10 @@ class EventsProfiler {
243245
unit: stringTable.dedup(pprofValueUnit)
244246
})
245247

248+
if (!restart) {
249+
this.stop()
250+
}
251+
246252
return new Profile({
247253
sampleType: [timeValueType],
248254
timeNanos: endDate.getTime() * MS_TO_NS,

packages/dd-trace/src/profiling/profilers/shared.js

+6
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,17 @@ function cacheThreadLabels () {
2929
}
3030
}
3131

32+
function getNonJSThreadsLabels () {
33+
return { [THREAD_NAME_LABEL]: 'Non-JS threads', [THREAD_ID_LABEL]: 'NA', [OS_THREAD_ID_LABEL]: 'NA' }
34+
}
35+
3236
module.exports = {
3337
END_TIMESTAMP_LABEL,
3438
THREAD_NAME_LABEL,
3539
THREAD_ID_LABEL,
40+
OS_THREAD_ID_LABEL,
3641
threadNamePrefix,
3742
eventLoopThreadName,
43+
getNonJSThreadsLabels,
3844
getThreadLabels: cacheThreadLabels()
3945
}

packages/dd-trace/src/profiling/profilers/space.js

+17-2
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,12 @@ class NativeSpaceProfiler {
1414
this._stackDepth = options.stackDepth || 64
1515
this._pprof = undefined
1616
this._oomMonitoring = options.oomMonitoring || {}
17+
this._started = false
1718
}
1819

1920
start ({ mapper, nearOOMCallback } = {}) {
21+
if (this._started) return
22+
2023
this._mapper = mapper
2124
this._pprof = require('@datadog/pprof')
2225
this._pprof.heap.start(this._samplingInterval, this._stackDepth)
@@ -31,18 +34,30 @@ class NativeSpaceProfiler {
3134
strategiesToCallbackMode(strategies, this._pprof.heap.CallbackMode)
3235
)
3336
}
37+
38+
this._started = true
3439
}
3540

36-
profile () {
37-
return this._pprof.heap.profile(undefined, this._mapper, getThreadLabels)
41+
profile (restart) {
42+
const profile = this._pprof.heap.profile(undefined, this._mapper, getThreadLabels)
43+
if (!restart) {
44+
this.stop()
45+
}
46+
return profile
3847
}
3948

4049
encode (profile) {
4150
return this._pprof.encode(profile)
4251
}
4352

4453
stop () {
54+
if (!this._started) return
4555
this._pprof.heap.stop()
56+
this._started = false
57+
}
58+
59+
isStarted () {
60+
return this._started
4661
}
4762
}
4863

packages/dd-trace/src/profiling/profilers/wall.js

+34-21
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ const { HTTP_METHOD, HTTP_ROUTE, RESOURCE_NAME, SPAN_TYPE } = require('../../../
77
const { WEB } = require('../../../../../ext/types')
88
const runtimeMetrics = require('../../runtime_metrics')
99
const telemetryMetrics = require('../../telemetry/metrics')
10-
const { END_TIMESTAMP_LABEL, getThreadLabels } = require('./shared')
10+
const { END_TIMESTAMP_LABEL, getNonJSThreadsLabels, getThreadLabels } = require('./shared')
1111

1212
const beforeCh = dc.channel('dd-trace:storage:before')
1313
const enterCh = dc.channel('dd-trace:storage:enter')
@@ -78,13 +78,15 @@ class NativeWallProfiler {
7878
this._codeHotspotsEnabled = !!options.codeHotspotsEnabled
7979
this._endpointCollectionEnabled = !!options.endpointCollectionEnabled
8080
this._timelineEnabled = !!options.timelineEnabled
81+
this._cpuProfilingEnabled = !!options.cpuProfilingEnabled
8182
// We need to capture span data into the sample context for either code hotspots
8283
// or endpoint collection.
8384
this._captureSpanData = this._codeHotspotsEnabled || this._endpointCollectionEnabled
8485
// We need to run the pprof wall profiler with sample contexts if we're either
8586
// capturing span data or timeline is enabled (so we need sample timestamps, and for now
86-
// timestamps require the sample contexts feature in the pprof wall profiler.)
87-
this._withContexts = this._captureSpanData || this._timelineEnabled
87+
// timestamps require the sample contexts feature in the pprof wall profiler), or
88+
// cpu profiling is enabled.
89+
this._withContexts = this._captureSpanData || this._timelineEnabled || this._cpuProfilingEnabled
8890
this._v8ProfilerBugWorkaroundEnabled = !!options.v8ProfilerBugWorkaroundEnabled
8991
this._mapper = undefined
9092
this._pprof = undefined
@@ -131,7 +133,8 @@ class NativeWallProfiler {
131133
sourceMapper: this._mapper,
132134
withContexts: this._withContexts,
133135
lineNumbers: false,
134-
workaroundV8Bug: this._v8ProfilerBugWorkaroundEnabled
136+
workaroundV8Bug: this._v8ProfilerBugWorkaroundEnabled,
137+
collectCpuTime: this._cpuProfilingEnabled
135138
})
136139

137140
if (this._withContexts) {
@@ -220,22 +223,42 @@ class NativeWallProfiler {
220223

221224
_stop (restart) {
222225
if (!this._started) return
226+
223227
if (this._captureSpanData) {
224228
// update last sample context if needed
225229
this._enter()
226230
this._lastSampleCount = 0
227231
}
228232
const profile = this._pprof.time.stop(restart, this._generateLabels)
233+
229234
if (restart) {
230235
const v8BugDetected = this._pprof.time.v8ProfilerStuckEventLoopDetected()
231236
if (v8BugDetected !== 0) {
232237
this._reportV8bug(v8BugDetected === 1)
233238
}
239+
} else {
240+
if (this._captureSpanData) {
241+
beforeCh.unsubscribe(this._enter)
242+
enterCh.unsubscribe(this._enter)
243+
spanFinishCh.unsubscribe(this._spanFinished)
244+
this._profilerState = undefined
245+
this._lastSpan = undefined
246+
this._lastStartedSpans = undefined
247+
this._lastWebTags = undefined
248+
}
249+
this._started = false
234250
}
251+
235252
return profile
236253
}
237254

238-
_generateLabels (context) {
255+
_generateLabels ({ node, context }) {
256+
// check for special node that represents CPU time all non-JS threads.
257+
// In that case only return a special thread name label since we cannot associate any timestamp/span/endpoint to it.
258+
if (node.name === this._pprof.time.constants.NON_JS_THREADS_FUNCTION_NAME) {
259+
return getNonJSThreadsLabels()
260+
}
261+
239262
if (context == null) {
240263
// generateLabels is also called for samples without context.
241264
// In that case just return thread labels.
@@ -267,30 +290,20 @@ class NativeWallProfiler {
267290
return labels
268291
}
269292

270-
profile () {
271-
return this._stop(true)
293+
profile (restart) {
294+
return this._stop(restart)
272295
}
273296

274297
encode (profile) {
275298
return this._pprof.encode(profile)
276299
}
277300

278301
stop () {
279-
if (!this._started) return
280-
281-
const profile = this._stop(false)
282-
if (this._captureSpanData) {
283-
beforeCh.unsubscribe(this._enter)
284-
enterCh.unsubscribe(this._enter)
285-
spanFinishCh.unsubscribe(this._spanFinished)
286-
this._profilerState = undefined
287-
this._lastSpan = undefined
288-
this._lastStartedSpans = undefined
289-
this._lastWebTags = undefined
290-
}
302+
this._stop(false)
303+
}
291304

292-
this._started = false
293-
return profile
305+
isStarted () {
306+
return this._started
294307
}
295308
}
296309

0 commit comments

Comments
 (0)