
Commit 90c4075

[Tracing] Better runtime error messages (#1307)
## Purpose ##
* Add better exception messages when encountering tracing errors

## Example ##
* Below is an example of a potential tracing runtime error (this particular error was forced for demonstration purposes)

````
Traceback (most recent call last):
  File "/home/kyle/llm-compressor/src/llmcompressor/pipelines/sequential/helpers.py", line 45, in forward
    outputs = forward_fn(*args, **kwargs)
  File "<string>", line 12, in forward
TypeError: iter(v, w): v must be callable

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/kyle/llm-compressor/src/llmcompressor/modifiers/quantization/gptq/base.py", line 234, in on_initialize
    run_sequential(
  File "/home/kyle/llm-compressor/src/llmcompressor/pipelines/sequential/pipeline.py", line 67, in run_pipeline
    subgraph.forward(model, **inputs)
  File "/home/kyle/llm-compressor/src/llmcompressor/pipelines/sequential/helpers.py", line 47, in forward
    raise RuntimeError(
RuntimeError: Raised an exception during execution of the following code:
```
1 
2 
3 
4 def forward(self, input_ids : torch.Tensor, attention_mask : torch.Tensor):
5     model_rotary_emb_inv_freq = self.model.rotary_emb.inv_freq
6     getitem_10 = model_rotary_emb_inv_freq[(None, slice(None, None, None), None)];  model_rotary_emb_inv_freq = None
7     model_embed_tokens = self.model.embed_tokens(input_ids);  input_ids = None
8     size_3 = attention_mask.size();  size_3 = None
9     dim = attention_mask.dim()
10     size_6 = attention_mask.size()
11     getitem_8 = attention_mask[(slice(None, None, None), None, None, slice(None, None, None))]
12     iter_6 = iter(attention_mask, 'device');  attention_mask = None
13     float_1 = getitem_10.float();  getitem_10 = None
14     size = model_embed_tokens.size()
15     iter_1 = iter(model_embed_tokens, 'device')
```
````

## Changes ##
* Move the forward call inside the `Subgraph` class and wrap it in order to catch and propagate exceptions

---------

Signed-off-by: Kyle Sayers <[email protected]>
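For illustration, the following is a minimal, self-contained sketch of the catch-and-annotate pattern this change introduces. The toy `forward_src` string and its deliberate `iter` misuse are hypothetical, chosen to mirror the forced error above; `add_line_numbers` matches the helper added in the diff below.

```python
# Hypothetical reproduction of the new error-reporting behavior: execute
# generated code and, if it fails, re-raise with the numbered source attached.
forward_src = "def forward(self, x):\n    return iter(x, 'device')\n"


def add_line_numbers(text: str) -> str:
    # prefix each line of the generated source with its 1-indexed line number
    return "\n".join(f"{i + 1} {line}" for i, line in enumerate(text.splitlines()))


namespace: dict = {}
exec(forward_src, namespace)
forward_fn = namespace["forward"]

try:
    forward_fn(None, [1, 2, 3])  # iter(list, sentinel) raises TypeError
except Exception as exception:
    # chain the original exception so the root cause stays in the traceback
    raise RuntimeError(
        "Raised an exception during execution of the following code:\n"
        f"```\n{add_line_numbers(forward_src)}\n```"
    ) from exception
```

Running this prints both the original `TypeError` and the annotated `RuntimeError`, chained via `__cause__`, just as in the example above.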
1 parent 1f2b796 commit 90c4075

File tree

2 files changed: +33 -12 lines changed

src/llmcompressor/pipelines/sequential/helpers.py

Lines changed: 31 additions & 7 deletions
```diff
@@ -1,11 +1,12 @@
 import inspect
 from collections import deque
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, Set, Union
+from typing import Any, Callable, Dict, List, Optional, Set, Union
 
 from compressed_tensors import has_offloaded_params
 from compressed_tensors.quantization import find_name_or_class_matches
 from torch.fx import Graph, GraphModule, Node
+from torch.fx.graph import PythonCode
 from torch.fx.proxy import Argument
 from torch.nn import Module
 from transformers import PreTrainedModel
@@ -32,16 +33,33 @@ class Subgraph:
     graph: Graph
     input_names: Set[str]
     consumed_names: Set[str]
+    _code: Optional[PythonCode] = None
 
-    def compile_forward(self) -> Callable[[Any], Any]:
+    def forward(self, *args, **kwargs) -> Dict[str, Any]:
         """
-        Generate and compile code for executing this subgraph
+        Execute the operations within the subgraph
 
-        :return: function which, when called, executes this subgraph
+        :param \\*args: argument inputs to subgraph forward function
+        :param \\**kwargs: keyword inputs to subgraph forward function
+        :return: keyword outputs of subgraph forward function (non-consumed variables)
         """
-        code = self.graph.python_code("self")
-        exec(code.src, code.globals)
-        return code.globals.get("forward")
+        if self._code is None:
+            self._code = self.graph.python_code("self")
+            exec(self._code.src, self._code.globals)
+
+        forward_fn = self._code.globals.get("forward")
+
+        try:
+            outputs = forward_fn(*args, **kwargs)
+        except Exception as exception:
+            raise RuntimeError(
+                "Raised an exception during execution of the following code:\n"
+                f"```\n{add_line_numbers(self._code.src)}\n```\n"
+                "This is likely due to a violation of shape assumptions made when "
+                "tracing"
+            ) from exception
+
+        return outputs
 
 
 def trace_subgraphs(
@@ -376,3 +394,9 @@ def match_modules(model: Module, target_names: List[str]) -> Set[Module]:
         for name, module in model.named_modules()
         if find_name_or_class_matches(name, module, target_names)
     )
+
+
+def add_line_numbers(text: str) -> str:
+    lines = text.splitlines()
+    numbered_lines = [f"{i + 1} {line}" for i, line in enumerate(lines)]
+    return "\n".join(numbered_lines)
```
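As a quick illustration (hypothetical snippet, not part of the diff), `add_line_numbers` simply prefixes each line of the generated source, so the numbers in the error message line up with the `File "<string>", line N` frames in the traceback:

```python
src = "def forward(self, x):\n    return x.size()\n"
print(add_line_numbers(src))
# 1 def forward(self, x):
# 2     return x.size()
```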

src/llmcompressor/pipelines/sequential/pipeline.py

Lines changed: 2 additions & 5 deletions
```diff
@@ -61,13 +61,10 @@ def run_pipeline(
         calib_desc = f"({subgraph_index + 1}/{num_subgraphs}): Calibrating"
         prop_desc = f"({subgraph_index + 1}/{num_subgraphs}): Propagating"
 
-        # compile subgraph forward function
-        forward_function = subgraph.compile_forward()
-
         # do a preliminary pass to trigger modifier hooks
         for batch_index in tqdm.tqdm(range(len(dataloader)), desc=calib_desc):
             inputs = intermediates.fetch(batch_index, subgraph.input_names)
-            forward_function(model, **inputs)
+            subgraph.forward(model, **inputs)
 
         # TODO: replace with a lifecycle event
         if callback_modifier:
@@ -78,7 +75,7 @@ def run_pipeline(
         with HooksMixin.disable_hooks():
             for batch_index in tqdm.tqdm(range(len(dataloader)), desc=prop_desc):
                 inputs = intermediates.fetch(batch_index, subgraph.input_names)
-                output = forward_function(model, **inputs)
+                output = subgraph.forward(model, **inputs)
 
                 if subgraph_index < num_subgraphs - 1:
                     intermediates.update(batch_index, output)
```
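With compilation now cached on the dataclass, call sites simply invoke `subgraph.forward(...)` once per batch: the first call compiles and caches `self._code`, and later calls reuse it. A self-contained sketch against the updated `helpers.py` (the `Tiny` module and the tensors are hypothetical):

```python
import torch
from torch.fx import symbolic_trace

from llmcompressor.pipelines.sequential.helpers import Subgraph


class Tiny(torch.nn.Module):
    def forward(self, x):
        return {"y": x + 1}


model = Tiny()
traced = symbolic_trace(model)
subgraph = Subgraph(graph=traced.graph, input_names={"x"}, consumed_names=set())

out = subgraph.forward(model, x=torch.ones(2))   # first call compiles and caches
out = subgraph.forward(model, x=torch.zeros(2))  # subsequent calls reuse self._code
```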
