diff --git a/include/common_symbol_errors.hpp b/include/common_symbol_errors.hpp
index 0f5c44c78..cb5b3793d 100644
--- a/include/common_symbol_errors.hpp
+++ b/include/common_symbol_errors.hpp
@@ -14,10 +14,14 @@ using namespace std::string_view_literals;
 
 namespace ddprof {
 
-inline constexpr std::array<std::string_view, 6> k_common_frame_names = {
-    "[truncated]"sv,      "[unknown mapping]"sv,
-    "[unwind failure]"sv, "[incomplete]"sv,
-    "[lost]"sv,           "[maximum pids]"sv};
+inline constexpr std::array<std::string_view, 7> k_common_frame_names = {
+    "[truncated]"sv,
+    "[unknown mapping]"sv,
+    "[unwind failure]"sv,
+    "[incomplete]"sv,
+    "[lost]"sv,
+    "[maximum pids]"sv,
+    "[unsampled mapping]"sv};
 
 enum SymbolErrors : std::uint8_t {
   truncated_stack = 0,
@@ -26,6 +30,7 @@ enum SymbolErrors : std::uint8_t {
   incomplete_stack,
   lost_event,
   max_pids,
+  unsampled_mapping,
 };
 
 } // namespace ddprof
diff --git a/include/live_allocation.hpp b/include/live_allocation.hpp
index c8ba012df..631de1952 100644
--- a/include/live_allocation.hpp
+++ b/include/live_allocation.hpp
@@ -10,8 +10,10 @@
 #include "unwind_output_hash.hpp"
 
 #include <cstddef>
+#include <set>
 #include <sys/types.h>
 #include <unordered_map>
+#include <vector>
 
 namespace ddprof {
 
@@ -26,15 +28,39 @@ T &access_resize(std::vector<T> &v, size_t index,
 
 class LiveAllocation {
 public:
-  // For allocations Value is the size
-  // This is the cumulative value and count for a given stack
+  struct SmapsEntry {
+    ProcessAddress_t start;
+    ProcessAddress_t end;
+    size_t rss_kb;
+    size_t accounted_size{};
+  };
+
   struct ValueAndCount {
     int64_t _value = 0;
     int64_t _count = 0;
   };
 
+  struct StackAndMapping {
+    const UnwindOutput *uw_output_ptr; // Pointer to an UnwindOutput in a set
+    ProcessAddress_t start_mmap;       // Start of associated mapping
+
+    bool operator==(const StackAndMapping &other) const {
+      return uw_output_ptr == other.uw_output_ptr &&
+          start_mmap == other.start_mmap;
+    }
+  };
+
+  struct StackAndMappingHash {
+    std::size_t operator()(const StackAndMapping &s) const {
+      size_t seed = std::hash<const UnwindOutput *>{}(s.uw_output_ptr);
+      hash_combine(seed, std::hash<ProcessAddress_t>{}(s.start_mmap));
+      return seed;
+    }
+  };
+
   using PprofStacks =
-      std::unordered_map<UnwindOutput, ValueAndCount, UnwindOutputHash>;
+      std::unordered_map<StackAndMapping, ValueAndCount, StackAndMappingHash>;
+  using MappingValuesMap = std::unordered_map<ProcessAddress_t, ValueAndCount>;
 
   struct ValuePerAddress {
     int64_t _value = 0;
@@ -42,32 +68,34 @@ class LiveAllocation {
   };
 
   using AddressMap = std::unordered_map<uintptr_t, ValuePerAddress>;
+
   struct PidStacks {
     AddressMap _address_map;
     PprofStacks _unique_stacks;
+    std::set<UnwindOutput>
+        unwind_output_set; // Set to store all unique UnwindOutput objects
+    std::vector<SmapsEntry> entries;
+    MappingValuesMap
+        mapping_values; // New map to track memory usage per mapping
   };
 
   using PidMap = std::unordered_map<pid_t, PidStacks>;
   using WatcherVector = std::vector<PidMap>;
-  // NOLINTNEXTLINE(misc-non-private-member-variables-in-classes)
-  WatcherVector _watcher_vector;
 
-  // Allocation should be aggregated per stack trace
-  // instead of a stack, we would have a total size for this unique stack trace
-  // and a count.
   void register_allocation(const UnwindOutput &uo, uintptr_t addr, size_t size,
                            int watcher_pos, pid_t pid) {
     PidMap &pid_map = access_resize(_watcher_vector, watcher_pos);
     PidStacks &pid_stacks = pid_map[pid];
-    register_allocation(uo, addr, size, pid_stacks._unique_stacks,
-                        pid_stacks._address_map);
+    if (pid_stacks.entries.empty()) {
+      pid_stacks.entries = parse_smaps(pid);
+    }
+    register_allocation_internal(uo, addr, size, pid_stacks);
   }
 
   void register_deallocation(uintptr_t addr, int watcher_pos, pid_t pid) {
     PidMap &pid_map = access_resize(_watcher_vector, watcher_pos);
     PidStacks &pid_stacks = pid_map[pid];
-    if (!register_deallocation(addr, pid_stacks._unique_stacks,
-                               pid_stacks._address_map)) {
+    if (!register_deallocation_internal(addr, pid_stacks)) {
       ++_stats._unmatched_deallocations;
     }
   }
@@ -83,24 +111,30 @@ class LiveAllocation {
     }
   }
 
-  [[nodiscard]] [[nodiscard]] unsigned get_nb_unmatched_deallocations() const {
+  [[nodiscard]] unsigned get_nb_unmatched_deallocations() const {
     return _stats._unmatched_deallocations;
   }
 
+  static std::vector<SmapsEntry> parse_smaps(pid_t pid);
+
+  static int64_t upscale_with_mapping(const PprofStacks::value_type &stack,
+                                      PidStacks &pid_stacks);
+
   void cycle() { _stats = {}; }
 
+  // no lint to avoid warning about member being public (should be refactored)
+  WatcherVector _watcher_vector; // NOLINT
 private:
-  // returns true if the deallocation was registered
-  static bool register_deallocation(uintptr_t address, PprofStacks &stacks,
-                                    AddressMap &address_map);
-
-  // returns true if the allocation was registerd
-  static bool register_allocation(const UnwindOutput &uo, uintptr_t address,
-                                  int64_t value, PprofStacks &stacks,
-                                  AddressMap &address_map);
+  static bool register_deallocation_internal(uintptr_t address,
+                                             PidStacks &pid_stacks);
+
+  static bool register_allocation_internal(const UnwindOutput &uo,
+                                           uintptr_t address, int64_t value,
+                                           PidStacks &pid_stacks);
+
   struct {
     unsigned _unmatched_deallocations = {};
   } _stats;
 };
 
-} // namespace ddprof
\ No newline at end of file
+} // namespace ddprof
diff --git a/include/pprof/ddprof_pprof.hpp b/include/pprof/ddprof_pprof.hpp
index e0dfdda0c..c0796856f 100644
--- a/include/pprof/ddprof_pprof.hpp
+++ b/include/pprof/ddprof_pprof.hpp
@@ -21,6 +21,8 @@ namespace ddprof {
 class Symbolizer;
 struct SymbolHdr;
 
+using NumToStrCache = std::unordered_map<ProcessAddress_t, std::string>;
+
 struct DDProfPProf {
   /* single profile gathering several value types */
   ddog_prof_Profile _profile{};
@@ -28,7 +30,7 @@ struct DDProfPProf {
   Tags _tags;
   bool use_process_adresses{true};
   // avoid re-creating strings for all pid numbers
-  std::unordered_map<pid_t, std::string> _pid_str;
+  NumToStrCache _pid_str;
 };
 
 struct DDProfValuePack {
@@ -39,6 +41,11 @@ struct DDProfValuePack {
 
 DDRes pprof_create_profile(DDProfPProf *pprof, DDProfContext &ctx);
 
+struct AggregationConfig {
+  EventAggregationModePos value_pos{kSumPos};
+  bool show_samples{false};
+  bool adjust_locations{true};
+};
 /**
  * Aggregate to the existing profile the provided unwinding output.
  * @param uw_output
@@ -49,9 +56,8 @@ DDRes pprof_create_profile(DDProfPProf *pprof, DDProfContext &ctx);
 DDRes pprof_aggregate(const UnwindOutput *uw_output,
                       const SymbolHdr &symbol_hdr, const DDProfValuePack &pack,
                       const PerfWatcher *watcher,
-                      const FileInfoVector &file_infos, bool show_samples,
-                      EventAggregationModePos value_pos, Symbolizer *symbolizer,
-                      DDProfPProf *pprof);
+                      const FileInfoVector &file_infos, AggregationConfig conf,
+                      Symbolizer *symbolizer, DDProfPProf *pprof);
 
 DDRes pprof_reset(DDProfPProf *pprof);
 
diff --git a/include/unwind_output.hpp b/include/unwind_output.hpp
index 7316365eb..f82c15bcb 100644
--- a/include/unwind_output.hpp
+++ b/include/unwind_output.hpp
@@ -31,11 +31,13 @@ struct FunLoc {
 struct UnwindOutput {
   void clear() {
     locs.clear();
+    labels.clear();
     container_id = k_container_id_unknown;
     exe_name = {};
     thread_name = {};
   }
   std::vector<FunLoc> locs;
+  std::vector<std::pair<std::string, std::string>> labels;
   std::string_view container_id;
   std::string_view exe_name;
   std::string_view thread_name;
diff --git a/src/ddprof_worker.cc b/src/ddprof_worker.cc
index 020af43b8..ff6c65128 100644
--- a/src/ddprof_worker.cc
+++ b/src/ddprof_worker.cc
@@ -5,6 +5,7 @@
 
 #include "ddprof_worker.hpp"
 
+#include "common_symbol_errors.hpp"
 #include "ddprof_context.hpp"
 #include "ddprof_perf_event.hpp"
 #include "ddprof_stats.hpp"
@@ -59,6 +60,12 @@ DDRes report_lost_events(DDProfContext &ctx) {
 
     if (nb_lost > 0) {
       PerfWatcher *watcher = &ctx.watchers[watcher_idx];
+      // todo: check for alternative ways to report lost events
+      // this only happens when we are in live mode only
+      if (watcher->pprof_indices[kSumPos].pprof_index == -1) {
+        LG_DBG("Watcher #%d can not report lost events", watcher_idx);
+        continue;
+      }
       UnwindState *us = ctx.worker_ctx.us;
       us->output.clear();
       add_common_frame(us, SymbolErrors::lost_event);
@@ -74,8 +81,8 @@ DDRes report_lost_events(DDProfContext &ctx) {
              nb_lost, value, watcher_idx);
       DDRES_CHECK_FWD(pprof_aggregate(
           &us->output, us->symbol_hdr, {value, nb_lost, 0}, watcher,
-          ctx.worker_ctx.us->dso_hdr.get_file_info_vector(), false, kSumPos,
-          ctx.worker_ctx.symbolizer,
+          ctx.worker_ctx.us->dso_hdr.get_file_info_vector(),
+          AggregationConfig{.value_pos = kSumPos}, ctx.worker_ctx.symbolizer,
           ctx.worker_ctx.pprof[ctx.worker_ctx.i_current_pprof]));
       ctx.worker_ctx.lost_events_per_watcher[watcher_idx] = 0;
     }
@@ -243,60 +250,116 @@ void ddprof_reset_worker_stats() {
 
 DDRes aggregate_livealloc_stack(
     const LiveAllocation::PprofStacks::value_type &alloc_info,
-    DDProfContext &ctx, const PerfWatcher *watcher, DDProfPProf *pprof,
-    const SymbolHdr &symbol_hdr) {
+    int64_t upscalled_value, DDProfContext &ctx, const PerfWatcher *watcher,
+    DDProfPProf *pprof, const SymbolHdr &symbol_hdr) {
+
+  if (upscalled_value) {
+    LG_DBG("Upscaling from %ld to %ld", alloc_info.second._value,
+           upscalled_value);
+  }
+  // default to the sampled value
   const DDProfValuePack pack{
-      alloc_info.second._value,
+      upscalled_value ? upscalled_value : alloc_info.second._value,
       static_cast<uint64_t>(std::max<int64_t>(0, alloc_info.second._count)), 0};
+  AggregationConfig conf{.value_pos = kLiveSumPos,
+                         .show_samples = ctx.params.show_samples};
+  DDRES_CHECK_FWD(
+      pprof_aggregate(alloc_info.first.uw_output_ptr, symbol_hdr, pack, watcher,
+                      ctx.worker_ctx.us->dso_hdr.get_file_info_vector(), conf,
+                      ctx.worker_ctx.symbolizer, pprof));
+  return {};
+}
+
+DDRes aggregate_live_allocations_common(DDProfContext &ctx,
+                                        unsigned watcher_pos, pid_t pid,
+                                        LiveAllocation::PidStacks &pid_stacks) {
+  UnwindState *us = ctx.worker_ctx.us;
+  DDProfPProf *pprof = ctx.worker_ctx.pprof[ctx.worker_ctx.i_current_pprof];
+  const SymbolHdr &symbol_hdr = us->symbol_hdr;
+  const PerfWatcher *watcher = &ctx.watchers[watcher_pos];
+
+  // Parse smaps entries for the current pid
+  auto new_entries = LiveAllocation::parse_smaps(pid);
+  if (!new_entries.empty()) { // Normal operation
+    pid_stacks.entries = new_entries;
+  } else { // Handle case when we get no entries (e.g., during shutdown)
+    for (auto &el : pid_stacks.entries) {
+      el.accounted_size = 0;
+    }
+  }
+
+  // Step 1: Process existing samples
+  for (const auto &alloc_info : pid_stacks._unique_stacks) {
+    const int64_t upscaled_value =
+        ddprof::LiveAllocation::upscale_with_mapping(alloc_info, pid_stacks);
+    DDRES_CHECK_FWD(aggregate_livealloc_stack(alloc_info, upscaled_value, ctx,
+                                              watcher, pprof, symbol_hdr));
+  }
+
+  // Step 2: Add fake unwind frames for mappings without samples
+  // Create a fake unwind output to account for unsampled mappings
+  us->output.clear();
+  us->output.pid = pid;
+  us->output.tid = 0;
+  add_common_frame(us, SymbolErrors::unsampled_mapping);
+  add_virtual_base_frame(us);
+  UnwindOutput &uo = us->output;
+  uo.labels.emplace_back("mapping", "other");
+  for (const auto &el : pid_stacks.entries) {
+    if (el.accounted_size == 0) {
+      // Use the RSS value for this mapping as the "unsampled" value
+      const int64_t unsampled_value = el.rss_kb * 1000; // RSS in bytes
+      // Add the frame using the fake UnwindOutput
+      const DDProfValuePack pack{unsampled_value, 1, 0};
+      AggregationConfig conf{.value_pos = kLiveSumPos,
+                             .show_samples = false,
+                             .adjust_locations = false};
+      DDRES_CHECK_FWD(
+          pprof_aggregate(&uo, us->symbol_hdr, pack, watcher,
+                          ctx.worker_ctx.us->dso_hdr.get_file_info_vector(),
+                          conf, ctx.worker_ctx.symbolizer, pprof));
+    }
+  }
+
+  LG_NTC(
+      "<%u> Number of Live allocations for PID %d = %lu, Unique stacks = %lu",
+      watcher_pos, pid, pid_stacks._address_map.size(),
+      pid_stacks._unique_stacks.size());
 
-  DDRES_CHECK_FWD(pprof_aggregate(
-      &alloc_info.first, symbol_hdr, pack, watcher,
-      ctx.worker_ctx.us->dso_hdr.get_file_info_vector(),
-      ctx.params.show_samples, kLiveSumPos, ctx.worker_ctx.symbolizer, pprof));
   return {};
 }
 
+// Unified function for aggregating live allocations for a specific pid
 DDRes aggregate_live_allocations_for_pid(DDProfContext &ctx, pid_t pid) {
-  struct UnwindState *us = ctx.worker_ctx.us;
-  int const i_export = ctx.worker_ctx.i_current_pprof;
-  DDProfPProf *pprof = ctx.worker_ctx.pprof[i_export];
-  const SymbolHdr &symbol_hdr = us->symbol_hdr;
   LiveAllocation &live_allocations = ctx.worker_ctx.live_allocation;
+
   for (unsigned watcher_pos = 0;
        watcher_pos < live_allocations._watcher_vector.size(); ++watcher_pos) {
     auto &pid_map = live_allocations._watcher_vector[watcher_pos];
-    const PerfWatcher *watcher = &ctx.watchers[watcher_pos];
-    auto &pid_stacks = pid_map[pid];
-    for (const auto &alloc_info : pid_stacks._unique_stacks) {
-      DDRES_CHECK_FWD(aggregate_livealloc_stack(alloc_info, ctx, watcher, pprof,
-                                                symbol_hdr));
-    }
+    auto &pid_stacks = pid_map[pid]; // Get PidStacks for the specific pid
+    // Call the common function to handle the aggregation
+    DDRES_CHECK_FWD(
+        aggregate_live_allocations_common(ctx, watcher_pos, pid, pid_stacks));
   }
+
   return {};
 }
 
+// Unified function for aggregating live allocations across all PIDs
 DDRes aggregate_live_allocations(DDProfContext &ctx) {
-  // this would be more efficient if we could reuse the same stacks in
-  // libdatadog
-  UnwindState *us = ctx.worker_ctx.us;
-  int const i_export = ctx.worker_ctx.i_current_pprof;
-  DDProfPProf *pprof = ctx.worker_ctx.pprof[i_export];
-  const SymbolHdr &symbol_hdr = us->symbol_hdr;
-  const LiveAllocation &live_allocations = ctx.worker_ctx.live_allocation;
+  LiveAllocation &live_allocations = ctx.worker_ctx.live_allocation;
+
   for (unsigned watcher_pos = 0;
        watcher_pos < live_allocations._watcher_vector.size(); ++watcher_pos) {
-    const auto &pid_map = live_allocations._watcher_vector[watcher_pos];
-    const PerfWatcher *watcher = &ctx.watchers[watcher_pos];
-    for (const auto &pid_vt : pid_map) {
-      for (const auto &alloc_info : pid_vt.second._unique_stacks) {
-        DDRES_CHECK_FWD(aggregate_livealloc_stack(alloc_info, ctx, watcher,
-                                                  pprof, symbol_hdr));
-      }
-      LG_NTC("<%u> Number of Live allocations for PID%d=%lu, Unique stacks=%lu",
-             watcher_pos, pid_vt.first, pid_vt.second._address_map.size(),
-             pid_vt.second._unique_stacks.size());
+    auto &pid_map = live_allocations._watcher_vector[watcher_pos];
+    for (auto &pid_vt : pid_map) {
+      auto &pid_stacks = pid_vt.second;
+      // Call the common function to handle the aggregation for each PID
+      DDRES_CHECK_FWD(aggregate_live_allocations_common(
+          ctx, watcher_pos, pid_vt.first, pid_stacks));
     }
   }
+
   return {};
 }
 
@@ -427,6 +490,9 @@ DDRes ddprof_pr_sample(DDProfContext &ctx, perf_event_sample *sample,
   // Aggregate if unwinding went well (todo : fatal error propagation)
   if (!IsDDResFatal(res)) {
     struct UnwindState *us = ctx.worker_ctx.us;
+    if (sample->addr) { // todo: is this the correct trigger ?
+      us->output.labels.emplace_back("mapping", "heap");
+    }
     if (Any(EventAggregationMode::kLiveSum & watcher->aggregation_mode) &&
         sample->addr) {
       // null address means we should not account it
@@ -450,11 +516,11 @@ DDRes ddprof_pr_sample(DDProfContext &ctx, perf_event_sample *sample,
       }
       const DDProfValuePack pack{static_cast<int64_t>(sample_val), 1,
                                  timestamp};
-
-      DDRES_CHECK_FWD(pprof_aggregate(
-          &us->output, us->symbol_hdr, pack, watcher,
-          us->dso_hdr.get_file_info_vector(), ctx.params.show_samples, kSumPos,
-          ctx.worker_ctx.symbolizer, pprof));
+      AggregationConfig conf{.show_samples = ctx.params.show_samples};
+      DDRES_CHECK_FWD(pprof_aggregate(&us->output, us->symbol_hdr, pack,
+                                      watcher,
+                                      us->dso_hdr.get_file_info_vector(), conf,
+                                      ctx.worker_ctx.symbolizer, pprof));
     }
   }
   // We need to free the PID only after any aggregation operations
diff --git a/src/live_allocation.cc b/src/live_allocation.cc
index 7a27a7f53..d95a0d408 100644
--- a/src/live_allocation.cc
+++ b/src/live_allocation.cc
@@ -7,11 +7,19 @@
 
 #include "logger.hpp"
 
+#include <absl/strings/str_format.h>
+#include <cassert>
+#include <charconv>
+
 namespace ddprof {
+using SmapsEntry = LiveAllocation::SmapsEntry;
+
+bool LiveAllocation::register_deallocation_internal(uintptr_t address,
+                                                    PidStacks &pid_stacks) {
+  auto &stacks = pid_stacks._unique_stacks;
+  auto &address_map = pid_stacks._address_map;
+  auto &mapping_values = pid_stacks.mapping_values;
 
-bool LiveAllocation::register_deallocation(uintptr_t address,
-                                           PprofStacks &stacks,
-                                           AddressMap &address_map) {
   // Find the ValuePerAddress object corresponding to the address
   auto map_iter = address_map.find(address);
   if (map_iter == address_map.end()) {
@@ -21,16 +29,25 @@ bool LiveAllocation::register_deallocation(uintptr_t address,
     LG_DBG("Unmatched de-allocation at %lx", address);
     return false;
   }
+
   ValuePerAddress const &v = map_iter->second;
 
   // Decrement count and value of the corresponding PprofStacks::value_type
   // object
   if (v._unique_stack) {
+    // Adjust the mapping values map for the allocation being deallocated
+    const ProcessAddress_t mapping_start = v._unique_stack->first.start_mmap;
+    mapping_values[mapping_start]._value -= v._value;
+    if (mapping_values[mapping_start]._count > 0) {
+      --mapping_values[mapping_start]._count;
+    }
+
     v._unique_stack->second._value -= v._value;
-    if (v._unique_stack->second._count) {
+    if (v._unique_stack->second._count > 0) {
       --(v._unique_stack->second._count);
     }
-    if (!v._unique_stack->second._count) {
+
+    if (v._unique_stack->second._count == 0) {
       // If count reaches 0, remove the UnwindOutput from stacks
       stacks.erase(v._unique_stack->first);
     }
@@ -41,48 +58,216 @@ bool LiveAllocation::register_deallocation(uintptr_t address,
   return true;
 }
 
-bool LiveAllocation::register_allocation(const UnwindOutput &uo,
-                                         uintptr_t address, int64_t value,
-                                         PprofStacks &stacks,
-                                         AddressMap &address_map) {
+bool LiveAllocation::register_allocation_internal(const UnwindOutput &uo,
+                                                  uintptr_t address,
+                                                  int64_t value,
+                                                  PidStacks &pid_stacks) {
+
   if (uo.locs.empty()) {
-    // avoid sending empty stacks
+    // Avoid sending empty stacks
     LG_DBG("(LIVE_ALLOC) Avoid registering empty stack");
     return false;
   }
-  // Find or create the PprofStacks::value_type object corresponding to the
+
+  // Find or insert the UnwindOutput in the set
+  auto [uo_iter, inserted] = pid_stacks.unwind_output_set.insert(uo);
+  const UnwindOutput *uo_ptr = &(*uo_iter);
+
+  // Find the corresponding smaps entry for the given address
+  auto entry = std::lower_bound(pid_stacks.entries.begin(),
+                                pid_stacks.entries.end(), address,
+                                [](const LiveAllocation::SmapsEntry &l,
+                                   uintptr_t addr) { return l.start < addr; });
+  // If lower_bound points to the end or the start address is greater than the
+  // address, adjust the entry.
+  if (entry != pid_stacks.entries.begin() &&
+      (entry == pid_stacks.entries.end() || address < entry->start)) {
+    --entry; // Move to the previous entry if it's not the beginning
+  }
+  ProcessAddress_t start_addr = 0;
+  if (entry == pid_stacks.entries.end() || address < entry->start) {
+    // Address not within any known mapping
+    LG_DBG("(LIVE_ALLOC) Address not within any known mapping: %lx", address);
+  } else {
+    start_addr = entry->start;
+  }
+
+  // Create or find the PprofStacks::value_type object corresponding to the
   // UnwindOutput
-  auto iter = stacks.find(uo);
+  auto &stacks = pid_stacks._unique_stacks;
+  const StackAndMapping stack_key{uo_ptr, start_addr};
+  auto iter = stacks.find(stack_key);
   if (iter == stacks.end()) {
-    iter = stacks.emplace(uo, ValueAndCount{}).first;
+    iter = stacks.emplace(stack_key, ValueAndCount{}).first;
   }
+
   PprofStacks::value_type &unique_stack = *iter;
 
-  // Add the value to the address map
+  // Update the address map
+  auto &address_map = pid_stacks._address_map;
   ValuePerAddress &v = address_map[address];
-  if (v._value) {
-    // unexpected, we already have an allocation here
-    // This means we missed a previous free
-    LG_DBG("Existing allocation: %lx (cleaning up)", address);
+
+  if (unlikely(v._value)) { // we should not have double counts
+    // Existing allocation: handle cleanup
+    LG_DBG("(LIVE_ALLOC) Existing allocation found: %lx (cleaning up)",
+           address);
     if (v._unique_stack) {
-      // we should decrement count / value
+      // Adjust the mapping values map for the previous allocation
+      pid_stacks.mapping_values[v._unique_stack->first.start_mmap]._value -=
+          v._value;
+      if (pid_stacks.mapping_values[v._unique_stack->first.start_mmap]._count >
+          0) {
+        --pid_stacks.mapping_values[v._unique_stack->first.start_mmap]._count;
+      }
+
       v._unique_stack->second._value -= v._value;
-      if (v._unique_stack->second._count) {
+      if (v._unique_stack->second._count > 0) {
         --(v._unique_stack->second._count);
       }
-      // Should we erase the element here ?
-      // only if we are sure it is not the same as the one we are inserting.
-      if (v._unique_stack != &unique_stack && !v._unique_stack->second._count) {
+
+      // Remove the old stack if its count is zero and not the same as the
+      // current unique stack
+      if (v._unique_stack != &unique_stack &&
+          v._unique_stack->second._count == 0) {
         stacks.erase(v._unique_stack->first);
       }
     }
   }
 
+  // Set the new allocation value and stack association
   v._value = value;
   v._unique_stack = &unique_stack;
   v._unique_stack->second._value += value;
   ++(v._unique_stack->second._count);
+
+  // Update the mapping values map
+  pid_stacks.mapping_values[unique_stack.first.start_mmap]._value += value;
+  ++pid_stacks.mapping_values[unique_stack.first.start_mmap]._count;
+
   return true;
 }
 
+std::vector<SmapsEntry> LiveAllocation::parse_smaps(pid_t pid) {
+  const std::string smaps_file = absl::StrFormat("%s/%d/smaps", "/proc", pid);
+
+  std::vector<SmapsEntry> entries;
+  FILE *file = fopen(smaps_file.c_str(), "r");
+
+  if (!file) {
+    LG_WRN("Unable to access smaps file for %d", pid);
+    return entries;
+  }
+  char buffer[256];
+  SmapsEntry current_entry;
+
+  while (fgets(buffer, sizeof(buffer), file)) {
+    const std::string_view line(buffer);
+
+    if (line.find("Rss:") == 0) {
+      // Extract RSS value (take characters after "Rss:    ")
+      size_t rss = 0;
+      std::string_view rss_str = line.substr(4, line.find("kB") - 4);
+      rss_str.remove_prefix(std::min(rss_str.find_first_not_of(' '),
+                                     rss_str.size())); // trim leading spaces
+      auto res =
+          std::from_chars(rss_str.data(), rss_str.data() + rss_str.size(), rss);
+      if (res.ec != std::errc()) {
+        LG_DBG("Failed to convert RSS value in smaps file for %d", pid);
+        continue;
+      }
+      current_entry.rss_kb = rss;
+      // push back as we are not parsing other values
+      entries.push_back(current_entry);
+      current_entry = SmapsEntry(); // Reset for next entry
+    } else if (line.find('-') != std::string_view::npos) {
+      // Extract address
+      const std::string_view address = line.substr(0, line.find(' '));
+      const size_t dash_position = address.find('-');
+      unsigned long long start;
+      unsigned long long end;
+      // Convert start address
+      auto result = std::from_chars(address.data(),
+                                    address.data() + dash_position, start, 16);
+      if (result.ec != std::errc()) {
+        LG_DBG("Failed to convert start address in smaps file for %d", pid);
+        continue;
+      }
+      // Convert end address
+      result = std::from_chars(address.data() + dash_position + 1,
+                               address.data() + address.size(), end, 16);
+      if (result.ec != std::errc()) {
+        LG_DBG("Failed to convert end address in smaps file for %d", pid);
+        // todo skip next rss
+        continue;
+      }
+      current_entry.end = end;
+      current_entry.start = start;
+    }
+  }
+
+  fclose(file);
+  // should be a no-op
+  std::sort(entries.begin(), entries.end(),
+            [](const SmapsEntry &a, const SmapsEntry &b) {
+              return a.start < b.start;
+            });
+
+  return entries;
+}
+
+int64_t
+LiveAllocation::upscale_with_mapping(const PprofStacks::value_type &stack,
+                                     PidStacks &pid_stacks) {
+  const ValueAndCount &accounted_value =
+      pid_stacks.mapping_values[stack.first.start_mmap];
+  // Find the corresponding smaps entry for the given address
+  auto entry =
+      std::lower_bound(pid_stacks.entries.begin(), pid_stacks.entries.end(),
+                       stack.first.start_mmap,
+                       [](const LiveAllocation::SmapsEntry &l, uintptr_t addr) {
+                         return l.start < addr;
+                       });
+
+  // We should already be matching an existing entry
+  // ie. entry->start == stack.first.start_mmap
+  if (entry != pid_stacks.entries.begin() &&
+      (entry == pid_stacks.entries.end() ||
+       stack.first.start_mmap < entry->start)) {
+    --entry; // Move to the previous entry if it's not the beginning
+  }
+
+  // Check if a valid entry was found
+  if (entry == pid_stacks.entries.end() ||
+      (stack.first.start_mmap != entry->start)) {
+    // todo: think about these cases
+    LG_DBG("Unable to upscale address for mapping at %lx",
+           stack.first.start_mmap);
+    return 0;
+  }
+  if (accounted_value._value == 0) {
+    LG_DBG("No accounted memory for mapping at %lx", stack.first.start_mmap);
+    return 0;
+  }
+  // save how much mem we have for this one
+  entry->accounted_size += stack.second._value;
+  //
+  // Mapping is 10 megs of RSS
+  // foo -> 10 samples / 100 kb; 10% of the memory in this mapping
+  // bar -> 90 samples / 900 kb; 90% of the memory in this mapping
+  //
+  // foo is upscaled to 1 Meg
+  // bar is upscaled to 9 megs
+  //
+  // Cases of interest:
+  // RSS is 0 -> is that OK not to show the allocations ?
+  // RSS does not shrink even if we no longer have allocations
+  // -> this is where malloc stats would help us
+  //
+  // What if we have a different profile type to show this?
+  //
+  constexpr int KILOBYTES_TO_BYTES = 1000;
+  return stack.second._value * entry->rss_kb * KILOBYTES_TO_BYTES /
+      accounted_value._value;
+}
+
 } // namespace ddprof
diff --git a/src/pprof/ddprof_pprof.cc b/src/pprof/ddprof_pprof.cc
index 9114df611..aed30a965 100644
--- a/src/pprof/ddprof_pprof.cc
+++ b/src/pprof/ddprof_pprof.cc
@@ -30,7 +30,7 @@ using namespace std::string_view_literals;
 namespace ddprof {
 
 namespace {
-constexpr size_t k_max_pprof_labels{8};
+constexpr size_t k_max_pprof_labels{10};
 
 constexpr int k_max_value_types =
     DDPROF_PWT_LENGTH * static_cast<int>(kNbEventAggregationModes);
@@ -40,13 +40,12 @@ struct ActiveIdsResult {
   PerfWatcher *default_watcher = nullptr;
 };
 
-std::string_view pid_str(pid_t pid,
-                         std::unordered_map<pid_t, std::string> &pid_strs) {
-  auto it = pid_strs.find(pid);
+std::string_view pid_str(ProcessAddress_t num, NumToStrCache &pid_strs) {
+  auto it = pid_strs.find(num);
   if (it != pid_strs.end()) {
     return it->second;
   }
-  const auto pair = pid_strs.emplace(pid, std::to_string(pid));
+  const auto pair = pid_strs.emplace(num, std::to_string(num));
   return pair.first->second;
 }
 
@@ -235,7 +234,7 @@ ProfValueTypes compute_pprof_values(const ActiveIdsResult &active_ids) {
 }
 
 size_t prepare_labels(const UnwindOutput &uw_output, const PerfWatcher &watcher,
-                      std::unordered_map<pid_t, std::string> &pid_strs,
+                      NumToStrCache &pid_strs,
                       std::span<ddog_prof_Label> labels) {
   constexpr std::string_view k_container_id_label = "container_id"sv;
   constexpr std::string_view k_process_id_label = "process_id"sv;
@@ -246,6 +245,7 @@ size_t prepare_labels(const UnwindOutput &uw_output, const PerfWatcher &watcher,
   constexpr std::string_view k_thread_name_label = "thread_name"sv;
   constexpr std::string_view k_tracepoint_label = "tracepoint_type"sv;
   size_t labels_num = 0;
+
   if (!uw_output.container_id.empty()) {
     labels[labels_num].key = to_CharSlice(k_container_id_label);
     labels[labels_num].str = to_CharSlice(uw_output.container_id);
@@ -260,7 +260,7 @@ size_t prepare_labels(const UnwindOutput &uw_output, const PerfWatcher &watcher,
     labels[labels_num].str = to_CharSlice(pid_str(uw_output.pid, pid_strs));
     ++labels_num;
   }
-  if (!watcher.suppress_tid) {
+  if (!watcher.suppress_tid && uw_output.tid != 0) {
     labels[labels_num].key = to_CharSlice(k_thread_id_label);
     labels[labels_num].str = to_CharSlice(pid_str(uw_output.tid, pid_strs));
     ++labels_num;
@@ -286,6 +286,15 @@ size_t prepare_labels(const UnwindOutput &uw_output, const PerfWatcher &watcher,
     labels[labels_num].str = to_CharSlice(uw_output.thread_name);
     ++labels_num;
   }
+  for (const auto &el : uw_output.labels) {
+    if (labels_num >= labels.size()) {
+      LG_WRN("Labels buffer exceeded at %s (%d)", __FUNCTION__, __LINE__);
+      break;
+    }
+    labels[labels_num].key = to_CharSlice(el.first);
+    labels[labels_num].str = to_CharSlice(el.second);
+    ++labels_num;
+  }
   DDPROF_DCHECK_FATAL(labels_num <= labels.size(),
                       "pprof_aggregate - label buffer exceeded");
   return labels_num;
@@ -488,6 +497,8 @@ DDRes pprof_create_profile(DDProfPProf *pprof, DDProfContext &ctx) {
     // Allow the data to be split by container-id
     pprof->_tags.emplace_back(std::string("ddprof.custom_ctx"),
                               std::string("container_id"));
+    pprof->_tags.emplace_back(std::string("ddprof.custom_ctx"),
+                              std::string("mapping"));
   }
 
   if (ctx.params.remote_symbolization) {
@@ -512,11 +523,10 @@ DDRes pprof_free_profile(DDProfPProf *pprof) {
 DDRes pprof_aggregate(const UnwindOutput *uw_output,
                       const SymbolHdr &symbol_hdr, const DDProfValuePack &pack,
                       const PerfWatcher *watcher,
-                      const FileInfoVector &file_infos, bool show_samples,
-                      EventAggregationModePos value_pos, Symbolizer *symbolizer,
-                      DDProfPProf *pprof) {
+                      const FileInfoVector &file_infos, AggregationConfig conf,
+                      Symbolizer *symbolizer, DDProfPProf *pprof) {
 
-  const PProfIndices &pprof_indices = watcher->pprof_indices[value_pos];
+  const PProfIndices &pprof_indices = watcher->pprof_indices[conf.value_pos];
   ddog_prof_Profile *profile = &pprof->_profile;
   int64_t values[k_max_value_types] = {};
   assert(pprof_indices.pprof_index != -1);
@@ -528,7 +538,9 @@ DDRes pprof_aggregate(const UnwindOutput *uw_output,
 
   std::array<ddog_prof_Location, kMaxStackDepth> locations_buff;
   std::span locs{uw_output->locs};
-  locs = adjust_locations(watcher, locs);
+  if (conf.adjust_locations) {
+    locs = adjust_locations(watcher, locs);
+  }
 
   // Blaze results should remain alive until we aggregate the pprof data
   Symbolizer::BlazeResultsWrapper session_results;
@@ -549,10 +561,10 @@ DDRes pprof_aggregate(const UnwindOutput *uw_output,
       .labels = {.ptr = labels.data(), .len = labels_num},
   };
 
-  if (show_samples) {
+  if (conf.show_samples) {
     ddprof_print_sample(std::span{locations_buff.data(), write_index},
-                        pack.value, uw_output->pid, uw_output->tid, value_pos,
-                        *watcher);
+                        pack.value, uw_output->pid, uw_output->tid,
+                        conf.value_pos, *watcher);
   }
   auto res = ddog_prof_Profile_add(profile, sample, pack.timestamp);
   if (res.tag != DDOG_PROF_PROFILE_RESULT_OK) {
@@ -575,4 +587,5 @@ DDRes pprof_reset(DDProfPProf *pprof) {
   pprof->_pid_str.clear();
   return {};
 }
+
 } // namespace ddprof
diff --git a/test/ddprof_exporter-ut.cc b/test/ddprof_exporter-ut.cc
index 648b3217c..d233892c3 100644
--- a/test/ddprof_exporter-ut.cc
+++ b/test/ddprof_exporter-ut.cc
@@ -163,7 +163,7 @@ TEST(DDProfExporter, simple) {
     res = pprof_create_profile(&pprofs, ctx);
     EXPECT_TRUE(IsDDResOK(res));
     res = pprof_aggregate(&mock_output, symbol_hdr, {1000, 1, 0},
-                          &ctx.watchers[0], file_infos, false, kSumPos,
+                          &ctx.watchers[0], file_infos, AggregationConfig{},
                           ctx.worker_ctx.symbolizer, &pprofs);
     EXPECT_TRUE(IsDDResOK(res));
   }
diff --git a/test/ddprof_pprof-ut.cc b/test/ddprof_pprof-ut.cc
index 517c21ec2..4d4ec9845 100644
--- a/test/ddprof_pprof-ut.cc
+++ b/test/ddprof_pprof-ut.cc
@@ -72,8 +72,9 @@ TEST(DDProfPProf, aggregate) {
   DDRes res = pprof_create_profile(&pprof, ctx);
   EXPECT_TRUE(ctx.watchers[0].pprof_indices[kSumPos].pprof_index != -1);
   EXPECT_TRUE(ctx.watchers[0].pprof_indices[kSumPos].pprof_count_index != -1);
+
   res = pprof_aggregate(&mock_output, symbol_hdr, {1000, 1, 0},
-                        &ctx.watchers[0], file_infos, false, kSumPos,
+                        &ctx.watchers[0], file_infos, AggregationConfig{},
                         ctx.worker_ctx.symbolizer, &pprof);
 
   EXPECT_TRUE(IsDDResOK(res));
@@ -114,9 +115,10 @@ TEST(DDProfPProf, just_live) {
   EXPECT_TRUE(ctx.watchers[1].pprof_indices[kLiveSumPos].pprof_count_index !=
               -1);
   FileInfoVector file_infos;
-  res = pprof_aggregate(&mock_output, symbol_hdr, {1000, 1, 0},
-                        &ctx.watchers[1], file_infos, false, kLiveSumPos,
-                        ctx.worker_ctx.symbolizer, &pprof);
+  res =
+      pprof_aggregate(&mock_output, symbol_hdr, {1000, 1, 0}, &ctx.watchers[1],
+                      file_infos, AggregationConfig{.value_pos = kLiveSumPos},
+                      ctx.worker_ctx.symbolizer, &pprof);
   EXPECT_TRUE(IsDDResOK(res));
   test_pprof(&pprof);
   res = pprof_free_profile(&pprof);
diff --git a/test/live_allocation-ut.cc b/test/live_allocation-ut.cc
index c21e94a12..1f6a6d5b8 100644
--- a/test/live_allocation-ut.cc
+++ b/test/live_allocation-ut.cc
@@ -39,8 +39,9 @@ TEST(LiveAllocationTest, simple) {
   EXPECT_EQ(pid_stacks._address_map.size(), nb_registered_allocs);
   // though the stack is the same
   ASSERT_EQ(pid_stacks._unique_stacks.size(), 1);
-  const auto &el = pid_stacks._unique_stacks[uo];
-  EXPECT_EQ(el._value, 100);
+  for (const auto &el : pid_stacks._unique_stacks) {
+    EXPECT_EQ(el.second._value, 100);
+  }
 
   { // allocate 10
     uintptr_t addr = 0x10;
@@ -110,9 +111,10 @@ TEST(LiveAllocationTest, overlap_registrations) {
   EXPECT_EQ(pid_stacks._unique_stacks.size(), 1);
 
   // Check that the value and count have the latest value
-  auto &el = pid_stacks._unique_stacks[uo];
-  EXPECT_EQ(el._value, value * 2);
-  EXPECT_EQ(el._count, 1);
+  for (const auto &el : pid_stacks._unique_stacks) {
+    EXPECT_EQ(el.second._value, value * 2);
+    EXPECT_EQ(el.second._count, 1);
+  }
 
   // Deallocate the first allocation
   live_alloc.register_deallocation(addr, watcher_pos, pid);
@@ -135,4 +137,13 @@ TEST(LiveAllocationTest, stats) {
   EXPECT_EQ(live_alloc.get_nb_unmatched_deallocations(), 0);
 }
 
+TEST(LiveAllocationTest, smaps) {
+  LogHandle handle;
+  pid_t pid = getpid();
+  auto entries = LiveAllocation::parse_smaps(pid);
+  for (auto entry : entries) {
+    LG_DBG("address: %ld, rss_kb: %zu", entry.start, entry.rss_kb);
+  }
+}
+
 } // namespace ddprof