Commit 6e85e8a

Tutorial for DebugMode

1 parent 7f8b6dc

1 file changed: +234 additions, 0 deletions
# -*- coding: utf-8 -*-

"""
DebugMode: Recording Dispatched Operations and Numerical Debugging
===================================================================

**Authors:** Pian Pawakapan, Shangdi Yu
"""


######################################################################
# Overview
# --------
#
# ``DebugMode`` (:class:`torch.utils._debug_mode.DebugMode`) is a
# ``TorchDispatchMode`` that intercepts PyTorch runtime calls and emits a
# hierarchical log of operations. It is particularly useful when you need to
# understand *what* actually ran, both in eager mode and under ``torch.compile``,
# or when you need to pinpoint numerical divergence between two runs.
#
# Key capabilities:
#
# * **Runtime logging** – Records dispatched operations and TorchInductor-compiled
#   Triton kernels.
# * **Tensor hashing** – Attaches deterministic hashes to inputs/outputs so you
#   can diff runs and locate numerical divergences.
# * **Dispatch hooks** – Lets you register custom hooks to annotate each call.
######################################################################
# Quick start
# -----------
#
# The snippet below captures a small eager workload and prints the debug string:

import torch
from torch.utils._debug_mode import DebugMode


def run_once():
    x = torch.randn(8, 8)
    y = torch.randn(8, 8)
    return torch.mm(torch.relu(x), y)


with DebugMode() as debug_mode:
    out = run_once()

print("DebugMode output:")
print(debug_mode.debug_string())
######################################################################
# Getting more metadata
# ---------------------
#
# For most investigations, you'll want to enable stack traces, tensor IDs, and
# tensor hashing. These features provide the metadata needed to correlate
# operations back to model code.
#
# ``DebugMode.log_tensor_hashes`` decorates the log with hashes for every call.
# The ``hash_tensor`` hash function uses ``torch.hash_tensor``, which returns 0
# for tensors whose elements are all the same. The ``norm`` hash function uses
# the L1 norm (``norm`` with ``p=1``).

with (
    DebugMode(
        record_output=True,
        record_stack_trace=True,
        record_ids=True,
    ) as debug_mode,
    DebugMode.log_tensor_hashes(
        hash_fn=["norm", "hash_tensor"],
        hash_inputs=True,
    ),
):
    result = run_once()

print("DebugMode output with more metadata:")
print(debug_mode.debug_string(show_stack_trace=True))
######################################################################
# Interpreting the log
# --------------------
#
# Each line follows ``op(args) -> outputs``. When ``record_ids`` is enabled,
# tensors are suffixed with ``$<id>`` and DTensors are labeled ``dt``.
#
# Indentation generally reflects the dynamic call stack, but it is not
# guaranteed to match the runtime call stack exactly, especially for DTensor
# calls.
#
# When several hash functions are requested, the hash is reported as a tuple,
# e.g. ``'hash': (25.47251951135695, 9216239975761182720)``; each element
# corresponds to one function in the ``hash_fn`` list, in order.
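######################################################################
# As a small, purely illustrative example, the snippet below filters the log
# captured above down to the lines that carry hash annotations. It uses only
# ``debug_string()`` and standard string operations; note that the exact log
# formatting is not a stable interface and may change across PyTorch versions.

for line in debug_mode.debug_string().splitlines():
    # Keep only the entries that were annotated with tensor hashes.
    if "hash" in line:
        print(line)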
######################################################################
# Log Triton kernels
# ------------------
#
# Although Triton kernels are not dispatched operations, DebugMode has custom
# logic that logs their inputs and outputs.
#
# Inductor-generated Triton kernels show up with a ``[triton]`` prefix.
# Pre/post hash annotations report buffer hashes around each kernel call, which
# is helpful when isolating incorrect kernels.


def f(x):
    return torch.mm(torch.relu(x), x.T)


# This example targets a CUDA device, since Inductor emits Triton kernels for GPUs.
x = torch.randn(3, 3, device="cuda")

with (
    DebugMode(record_output=True) as debug_mode,
    DebugMode.log_tensor_hashes(
        hash_fn=["norm"],
        hash_inputs=True,
    ),
):
    a = torch.compile(f)(x)

print("Triton in DebugMode logs:")
print(debug_mode.debug_string())
######################################################################
# Numerical debugging with tensor hashes
# --------------------------------------
#
# If you see numerical divergence between two modes, you can use DebugMode to
# find where the divergence originates.
# In the example below, all tensor hashes match between eager mode and compiled
# mode. If any hash looked different, that call would be where the numerical
# divergence comes from.


def run_model(model, data, *, compile_with=None):
    if compile_with is not None:
        model = torch.compile(model, backend=compile_with)
    with DebugMode(record_output=True) as dm, DebugMode.log_tensor_hashes(
        hash_fn=["norm"],
        hash_inputs=True,
    ):
        dm_out = model(*data)
    return dm, dm_out


class Toy(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x).mm(x.T)


inputs = (torch.randn(4, 4),)
dm_eager, _ = run_model(Toy(), inputs)
dm_compiled, _ = run_model(Toy(), inputs, compile_with="aot_eager")

print("Eager mode:")
print(dm_eager.debug_string())
print("Compiled aot_eager mode:")
print(dm_compiled.debug_string())
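######################################################################
# To locate a divergence programmatically rather than by eye, you can compare
# the hash annotations from the two logs. The sketch below is purely
# illustrative: the ``extract_hashes`` helper and its regular expression are
# assumptions based on the ``'hash': (...)`` format shown earlier, not a
# stable parsing API, and the two logs may contain different numbers of
# entries when compilation decomposes operations differently.

import re

HASH_RE = re.compile(r"'hash': \(?([0-9eE+.\-]+)")


def extract_hashes(dm):
    # Pull the first numeric hash of each annotated call out of the log text.
    return HASH_RE.findall(dm.debug_string())


eager_hashes = extract_hashes(dm_eager)
compiled_hashes = extract_hashes(dm_compiled)

mismatches = [
    (i, a, b)
    for i, (a, b) in enumerate(zip(eager_hashes, compiled_hashes))
    if a != b
]
if mismatches:
    i, a, b = mismatches[0]
    print(f"First differing hash at entry {i}: eager={a}, compiled={b}")
else:
    print("All compared hash entries match.")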
######################################################################
# Custom dispatch hooks
# ---------------------
#
# Hooks allow you to annotate each call with custom metadata such as GPU memory
# usage. The function passed as ``log_hook`` returns a mapping that is rendered
# inline with the debug string.

MB = 1024 * 1024.0


def memory_hook(func, types, args, kwargs, result):
    # Report current and peak CUDA memory (in MB) for each dispatched call;
    # fall back to zeros on CPU-only machines.
    if torch.cuda.is_available():
        mem = torch.cuda.memory_allocated() / MB
        peak = torch.cuda.max_memory_allocated() / MB
        torch.cuda.reset_peak_memory_stats()
    else:
        mem = peak = 0.0
    return {"mem": f"{mem:.3f} MB", "peak": f"{peak:.3f} MB"}


with (
    DebugMode() as dm,
    DebugMode.dispatch_hooks(log_hook=memory_hook),
):
    run_once()

print("DebugMode output with memory usage:")
print(dm.debug_string())
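######################################################################
# Hooks are not limited to memory statistics. As another minimal sketch
# (assuming only the ``(func, types, args, kwargs, result)`` hook signature
# used above), the hypothetical ``shape_hook`` below annotates each call with
# the shapes of its tensor outputs:


def shape_hook(func, types, args, kwargs, result):
    # Record the output shape(s) as strings, mirroring the memory_hook format.
    if isinstance(result, torch.Tensor):
        return {"out_shape": str(tuple(result.shape))}
    if isinstance(result, (tuple, list)):
        shapes = [str(tuple(r.shape)) for r in result if isinstance(r, torch.Tensor)]
        return {"out_shapes": ", ".join(shapes)}
    return {}


with DebugMode() as dm, DebugMode.dispatch_hooks(log_hook=shape_hook):
    run_once()

print("DebugMode output with output shapes:")
print(dm.debug_string())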
######################################################################
# Module boundaries
# -----------------
#
# ``record_nn_module=True`` inserts ``[nn.Mod]`` markers that show which
# module executed each set of operations. As of PyTorch 2.10 it only works in
# eager mode, but support for compiled modes is under development.


class Foo(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = torch.nn.Linear(4, 4)
        self.l2 = torch.nn.Linear(4, 4)

    def forward(self, x):
        return self.l2(self.l1(x))


class Bar(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.abc = Foo()
        self.xyz = torch.nn.Linear(4, 4)

    def forward(self, x):
        return self.xyz(self.abc(x))


mod = Bar()
inp = torch.randn(4, 4)
with DebugMode(record_nn_module=True, record_output=False) as debug_mode:
    _ = mod(inp)

print("DebugMode output with stack traces and module boundaries:")
print(debug_mode.debug_string(show_stack_trace=True))
######################################################################
# Annotation
# ----------
#
# You can insert annotations into DebugMode logs by calling ``DebugMode._annotate``.

x = torch.randn(8, 8)


class Foo(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = torch.nn.Linear(8, 8)

    def forward(self, x):
        DebugMode._annotate("Foo")
        return self.l1(x)


mod = Foo()
with DebugMode(record_nn_module=True) as debug_mode:
    DebugMode._annotate("forward")
    mod(x)

print("DebugMode output with annotation:")
print(debug_mode.debug_string())
