Skip to content
This repository was archived by the owner on Jan 21, 2025. It is now read-only.
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions host/host.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,6 @@ type Trace struct {
TID util.TID
APMTraceID libpf.APMTraceID
APMTransactionID libpf.APMTransactionID
AllocSize uint64
AllocAddress libpf.Address
}
6 changes: 6 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,12 @@ func mainWithExitCode() exitCode {
return exitFailure
}

if err := trc.AttachAllocationHook(); err != nil {
msg := fmt.Sprintf("Failed to attach allocation hook: %v", err)
log.Error(msg)
return exitFailure
}

// This log line is used in our system tests to verify that the agent has started. So if you
// change this log line, update the system test as well.
log.Printf("Attached sched monitor")
Expand Down
21 changes: 17 additions & 4 deletions reporter/datadog_reporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,19 @@ type DatadogReporter struct {
// ReportTraceEvent enqueues reported trace events for the Datadog reporter.
func (r *DatadogReporter) ReportTraceEvent(trace *libpf.Trace, timestamp libpf.UnixTime64,
comm, podName, containerID, containerName, apmServiceName string,
pid util.PID, tid util.TID) {
pid util.PID, tid util.TID, allocSize uint64, allocAddress libpf.Address) {
traceEvents := r.traceEvents.WLock()
defer r.traceEvents.WUnlock(&traceEvents)

allocCount := uint64(0)
if allocSize != 0 {
allocCount = 1
}

if tr, exists := (*traceEvents)[trace.Hash]; exists {
tr.timestamps = append(tr.timestamps, uint64(timestamp))
tr.allocCount += allocCount
tr.allocSize += allocSize
(*traceEvents)[trace.Hash] = tr
return
}
Expand All @@ -100,6 +107,8 @@ func (r *DatadogReporter) ReportTraceEvent(trace *libpf.Trace, timestamp libpf.U
pid: pid,
tid: tid,
timestamps: []uint64{uint64(timestamp)},
allocSize: allocSize,
allocCount: allocCount,
}
}

Expand Down Expand Up @@ -369,8 +378,11 @@ func (r *DatadogReporter) getPprofProfile() (profile *pprofile.Profile,
funcMap := make(map[funcInfo]*pprofile.Function)

profile = &pprofile.Profile{
SampleType: []*pprofile.ValueType{{Type: "cpu-samples", Unit: "count"},
{Type: "cpu-time", Unit: "nanoseconds"}},
SampleType: []*pprofile.ValueType{
{Type: "cpu-samples", Unit: "count"},
{Type: "cpu-time", Unit: "nanoseconds"},
{Type: "alloc-samples", Unit: "count"},
{Type: "alloc-space", Unit: "bytes"}},
Sample: make([]*pprofile.Sample, 0, numSamples),
PeriodType: &pprofile.ValueType{Type: "cpu-time", Unit: "nanoseconds"},
Period: int64(r.samplingPeriod),
Expand Down Expand Up @@ -515,7 +527,8 @@ func (r *DatadogReporter) getPprofProfile() (profile *pprofile.Profile,
addTraceLabels(sample.Label, traceInfo)

count := int64(len(traceInfo.timestamps))
sample.Value = append(sample.Value, count, count*int64(r.samplingPeriod))
sample.Value = append(sample.Value, count, count*int64(r.samplingPeriod),
int64(traceInfo.allocCount), int64(traceInfo.allocSize))
profile.Sample = append(profile.Sample, sample)
totalSampleCount += len(traceInfo.timestamps)
}
Expand Down
2 changes: 1 addition & 1 deletion reporter/iface.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ type TraceReporter interface {
// enqueued for reporting, and false if the event was ignored.
ReportTraceEvent(trace *libpf.Trace, timestamp libpf.UnixTime64,
comm, podName, containerID, containerName, apmServiceName string,
pid util.PID, tid util.TID)
pid util.PID, tid util.TID, allocSize uint64, allocAddress libpf.Address)

// SupportsReportTraceEvent returns true if the reporter supports reporting trace events
// via ReportTraceEvent().
Expand Down
5 changes: 4 additions & 1 deletion reporter/otlp_reporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ type traceFramesCounts struct {
pid util.PID
tid util.TID
timestamps []uint64 // in nanoseconds
allocCount uint64
allocSize uint64
}

// OTLPReporter receives and transforms information to be OTLP/profiles compliant.
Expand Down Expand Up @@ -126,7 +128,8 @@ func (r *OTLPReporter) SupportsReportTraceEvent() bool { return true }
// ReportTraceEvent enqueues reported trace events for the OTLP reporter.
func (r *OTLPReporter) ReportTraceEvent(trace *libpf.Trace,
timestamp libpf.UnixTime64, comm, podName, containerID,
containerName, apmServiceName string, pid util.PID, tid util.TID) {
containerName, apmServiceName string, pid util.PID, tid util.TID,
allocSize uint64, allocAddress libpf.Address) {
traceEvents := r.traceEvents.WLock()
defer r.traceEvents.WUnlock(&traceEvents)

Expand Down
46 changes: 46 additions & 0 deletions support/ebpf/dotnet_tracer.ebpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -288,3 +288,49 @@ int unwind_dotnet(struct pt_regs *ctx) {
DEBUG_PRINT("dotnet: tail call for next frame unwinder (%d) failed", unwinder);
return -1;
}

// unwind_dotnet_uprobe is the uprobe-triggered variant of the dotnet unwinder.
// It unwinds up to DOTNET_FRAMES_PER_PROGRAM dotnet frames for the current
// per-CPU trace and then tail-calls into the next unwinder program.
// Returns -1 only if the tail call fails (on success the tail call does not
// return).
SEC("uprobe/unwind_dotnet")
int unwind_dotnet_uprobe(struct pt_regs *ctx) {
  PerCPURecord *record = get_per_cpu_record();
  if (!record) {
    return -1;
  }

  Trace *trace = &record->trace;
  u32 pid = trace->pid;
  DEBUG_PRINT("==== unwind_dotnet %d ====", trace->stack_len);

  int unwinder = PROG_UNWIND_STOP;
  ErrorCode error = ERR_OK;
  // Per-process dotnet unwinding info; without it we cannot unwind this PID.
  DotnetProcInfo *vi = bpf_map_lookup_elem(&dotnet_procs, &pid);
  if (!vi) {
    DEBUG_PRINT("dotnet: no DotnetProcInfo for this pid");
    error = ERR_DOTNET_NO_PROC_INFO;
    increment_metric(metricID_UnwindDotnetErrNoProcInfo);
    goto exit;
  }

  // NOTE(review): RATELIMIT_ACTION_FAST is set here but not in the hotspot
  // uprobe variant below — confirm this asymmetry is intentional.
  record->ratelimitAction = RATELIMIT_ACTION_FAST;
  increment_metric(metricID_UnwindDotnetAttempts);

  // Unwind a bounded number of frames per program invocation; the loop is
  // unrolled so the eBPF verifier sees a finite instruction count.
#pragma unroll
  for (int i = 0; i < DOTNET_FRAMES_PER_PROGRAM; i++) {
    unwinder = PROG_UNWIND_STOP;

    // i == 0 marks the first frame handled by this program invocation.
    error = unwind_one_dotnet_frame(record, vi, i == 0);
    if (error) {
      break;
    }

    // Determine which unwinder handles the next frame; stay in this loop
    // only while it remains the dotnet unwinder.
    error = get_next_unwinder_after_native_frame(record, &unwinder);
    if (error || unwinder != PROG_UNWIND_DOTNET) {
      break;
    }
  }

exit:
  record->state.unwind_error = error;
  // Hand off to the next unwinder; reaching the lines below means the tail
  // call failed.
  tail_call_uprobe(ctx, unwinder);
  DEBUG_PRINT("dotnet: tail call for next frame unwinder (%d) failed", unwinder);
  return -1;
}
1 change: 1 addition & 0 deletions support/ebpf/extmaps.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

// References to map definitions in *.ebpf.c.
extern bpf_map_def progs;
extern bpf_map_def progs_uprobe;
extern bpf_map_def per_cpu_records;
extern bpf_map_def pid_page_to_mapping_info;
extern bpf_map_def metrics;
Expand Down
38 changes: 38 additions & 0 deletions support/ebpf/hotspot_tracer.ebpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -861,3 +861,41 @@ int unwind_hotspot(struct pt_regs *ctx) {
DEBUG_PRINT("jvm: tail call for next frame unwinder (%d) failed", unwinder);
return -1;
}

// unwind_hotspot_uprobe is the uprobe-triggered variant of the HotSpot (JVM)
// unwinder. It unwinds up to HOTSPOT_FRAMES_PER_PROGRAM frames for the
// current per-CPU trace and then tail-calls into the next unwinder program.
// Returns 0 if no HotspotProcInfo exists for the PID, and -1 only if the
// tail call fails (on success the tail call does not return).
SEC("uprobe/unwind_hotspot")
int unwind_hotspot_uprobe(struct pt_regs *ctx) {
  PerCPURecord *record = get_per_cpu_record();
  if (!record)
    return -1;

  Trace *trace = &record->trace;
  pid_t pid = trace->pid;
  DEBUG_PRINT("==== jvm: unwind %d ====", trace->stack_len);

  // Per-process JVM unwinding info; without it we cannot unwind this PID.
  HotspotProcInfo *ji = bpf_map_lookup_elem(&hotspot_procs, &pid);
  if (!ji) {
    DEBUG_PRINT("jvm: no HotspotProcInfo for this pid");
    return 0;
  }

  int unwinder = PROG_UNWIND_STOP;
  ErrorCode error = ERR_OK;
  // Unwind a bounded number of frames per program invocation; the loop is
  // unrolled so the eBPF verifier sees a finite instruction count.
#pragma unroll
  for (int i = 0; i < HOTSPOT_FRAMES_PER_PROGRAM; i++) {
    unwinder = PROG_UNWIND_STOP;
    error = hotspot_unwind_one_frame(record, ji);
    if (error) {
      break;
    }

    // Determine which unwinder handles the next frame; stay in this loop
    // only while it remains the HotSpot unwinder.
    error = get_next_unwinder_after_native_frame(record, &unwinder);
    if (error || unwinder != PROG_UNWIND_HOTSPOT) {
      break;
    }
  }

  record->state.unwind_error = error;
  // Hand off to the next unwinder; reaching the lines below means the tail
  // call failed.
  tail_call_uprobe(ctx, unwinder);
  DEBUG_PRINT("jvm: tail call for next frame unwinder (%d) failed", unwinder);
  return -1;
}
74 changes: 74 additions & 0 deletions support/ebpf/interpreter_dispatcher.ebpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,14 @@ bpf_map_def SEC("maps") progs = {
.max_entries = NUM_TRACER_PROGS,
};

// progs_uprobe maps from a program ID to an eBPF program. It mirrors the
// `progs` tail-call map above for the uprobe-attached variants of the
// unwinder programs (SEC("uprobe/...")).
bpf_map_def SEC("maps") progs_uprobe = {
  .type = BPF_MAP_TYPE_PROG_ARRAY,
  .key_size = sizeof(u32),
  .value_size = sizeof(u32),
  .max_entries = NUM_TRACER_PROGS,
};

// report_events notifies user space about events (GENERIC_PID and TRACES_FOR_SYMBOLIZATION).
//
// As a key the CPU number is used and the value represents a perf event file descriptor.
Expand Down Expand Up @@ -238,6 +246,72 @@ int unwind_stop(struct pt_regs *ctx) {
return 0;
}

// unwind_stop_uprobe is the uprobe-triggered variant of the trace terminator.
// It finalizes the per-CPU trace (APM info, error frames, error metrics),
// optionally drops error-only traces per system configuration, and sends the
// completed trace to user space.
SEC("uprobe/unwind_stop")
int unwind_stop_uprobe(struct pt_regs *ctx) {
  PerCPURecord *record = get_per_cpu_record();
  if (!record)
    return -1;
  Trace *trace = &record->trace;
  UnwindState *state = &record->state;

  maybe_add_apm_info(trace);

  // If the stack is otherwise empty, push an error for that: we should
  // never encounter empty stacks for successful unwinding.
  if (trace->stack_len == 0 && trace->kernel_stack_id < 0) {
    DEBUG_PRINT("unwind_stop called but the stack is empty");
    increment_metric(metricID_ErrEmptyStack);
    if (!state->unwind_error) {
      state->unwind_error = ERR_EMPTY_STACK;
    }
  }

  // If unwinding was aborted due to a critical error, push an error frame.
  if (state->unwind_error) {
    push_error(&record->trace, state->unwind_error);
  }

  // Report the per-trace error metric, with special handling for unknown-PC
  // errors which additionally notify user space about the PID.
  switch (state->error_metric) {
  case -1:
    // No Error
    break;
  case metricID_UnwindNativeErrWrongTextSection:;
    if (report_pid(ctx, trace->pid, record->ratelimitAction)) {
      increment_metric(metricID_NumUnknownPC);
    }
    // Fallthrough to report the error
  default:
    increment_metric(state->error_metric);
  }

  // TEMPORARY HACK
  //
  // If we ended up with a trace that consists of only a single error frame, drop it.
  // This is required as long as the process manager provides the option to filter out
  // error frames, to prevent empty traces from being sent. While it might seem that this
  // filtering should belong into the HA code that does the filtering, it is actually
  // surprisingly hard to implement that way: since traces and their counts are reported
  // through different data structures, we'd have to keep a list of known empty traces to
  // also prevent the corresponding trace counts to be sent out. OTOH, if we do it here,
  // this is trivial.
  if (trace->stack_len == 1 && trace->kernel_stack_id < 0 && state->unwind_error) {
    u32 syscfg_key = 0;
    SystemConfig* syscfg = bpf_map_lookup_elem(&system_config, &syscfg_key);
    if (!syscfg) {
      return -1; // unreachable
    }

    if (syscfg->drop_error_only_traces) {
      return 0;
    }
  }
  // TEMPORARY HACK END

  send_trace(ctx, trace);

  return 0;
}

char _license[] SEC("license") = "GPL";
// this number will be interpreted by the elf loader
// to set the current running kernel version
Expand Down
Loading