Binary search to locate the faulty subgraph

TelGome · TelGome · commit 8019407ce9e2 · 2025-11-21T10:51:37.000+08:00
diff --git a/graph_net/torch/decompose_util.py b/graph_net/torch/decompose_util.py
@@ -215,13 +215,22 @@ def _get_submodule_inputs_and_outputs(
     )
     node_list = list(gm.graph.nodes)
 
+    def _hashable(obj):
+        # Hashable stand-in for `obj`: lists/tuples become (nested) tuples, a slice becomes a tagged tuple, anything else is returned as is.
+        if isinstance(obj, (list, tuple)):
+            return tuple(map(_hashable, obj))
+        if isinstance(obj, slice):
+            return ("__slice__", obj.start, obj.stop, obj.step)
+        return obj
+
     def get_related_node(node):
         for arg in node.args:
             if isinstance(arg, tuple):
-                yield from arg
+                for x in arg:
+                    yield _hashable(x)
             else:
-                yield arg
-        yield node
+                yield _hashable(arg)
+        yield _hashable(node)
 
     for node in node_list[0:start_node_idx]:
         for related_node in get_related_node(node):
diff --git a/graph_net/torch/native_decomposer_extractor.py b/graph_net/torch/native_decomposer_extractor.py
@@ -0,0 +1,287 @@
+import os
+import sys
+import re
+import json
+import base64
+import shutil
+import argparse
+import subprocess
+from pathlib import Path
+import torch
+from graph_net.torch import utils as gn_utils
+from graph_net.torch.decompose_util import convert_to_submodules_graph
+from graph_net.torch.extractor import GraphExtractor as BuiltinGraphExtractor
+import graph_net.imp_util as imp_util
+
+# ----------------------------
+# Helpers
+# ----------------------------
+def contains_nan_or_inf_in_file(path: str) -> bool:
+    """Return True if the file at `path` contains a standalone nan/inf/infinity token (any letter case); a missing file gives False."""
+    if not os.path.exists(path):
+        return False
+    # Whole-token match: a bare substring test also fires on "[INFO]", "inference" or "nanoseconds".
+    with open(path, "r", encoding="utf-8", errors="ignore") as f:
+        return any(re.search(r"(?<![a-z0-9_])(?:nan|inf(?:inity)?)(?![a-z0-9_])", line, re.IGNORECASE) for line in f)
+
+
+def parse_correctness_line_for_nan(log_path: str, log_prompt: str = "graph-net-test-compiler-log"):
+    """Report NaN/Inf for a log: judge by its first `<log_prompt>...[Correctness][max_diff]` line, else scan the whole file; a missing log gives False."""
+    if not os.path.exists(log_path):
+        return False
+    max_diff_re = re.compile(re.escape(log_prompt) + r".*\[Correctness\]\[max_diff\].*", re.IGNORECASE)
+    with open(log_path, "r", encoding="utf-8", errors="ignore") as log_file:
+        first_hit = next((ln.lower() for ln in log_file if max_diff_re.search(ln)), None)
+    if first_hit is None:  # no correctness line was logged: fall back to a whole-file scan
+        return contains_nan_or_inf_in_file(log_path)
+    return any(marker in first_hit for marker in ("nan", "inf"))
+
+
+def get_graph_net_root() -> str:
+    """Locate the graph_net package directory by importing graph_net in a child interpreter.
+
+    Returns the child's stripped stdout; raises RuntimeError (carrying the child's stderr) on a non-zero exit.
+    """
+    probe_code = "import graph_net, os; print(os.path.dirname(graph_net.__file__))"
+    probe = subprocess.run([sys.executable, "-c", probe_code], capture_output=True, text=True)
+    if probe.returncode == 0:
+        return probe.stdout.strip()
+    raise RuntimeError("Cannot locate graph_net package root: " + probe.stderr)
+
+
+# ----------------------------
+# Extraction (reuse run_model decorator approach)
+# ----------------------------
+def run_naive_extractor(model_path: str, output_dir: str, split_positions: list, group_head_and_tail: bool = True, filter_path: str = None):
+    """Decompose the model at `model_path` into subgraphs by running graph_net.torch.run_model in a child process.
+
+    The child receives a decorator config that points at extractor.py, with
+    naive_graph_decomposer.py as custom extractor, and that forwards `output_dir`, `split_positions`
+    and `group_head_and_tail`. A falsy `filter_path` selects the bundled naive_subgraph_filter.py.
+
+    Raises:
+        RuntimeError: if the package root cannot be located or the child exits with a non-zero code.
+    """
+    GRAPH_NET_ROOT = get_graph_net_root()
+
+    # Built inside-out: options of the decomposer first, then the decorator that wraps them.
+    extractor_options = {
+        "output_dir": output_dir,
+        "split_positions": split_positions,
+        "group_head_and_tail": group_head_and_tail,
+        "filter_path": filter_path or f"{GRAPH_NET_ROOT}/torch/naive_subgraph_filter.py",
+        "filter_config": {},
+    }
+    decorator_options = {
+        "name": os.path.basename(model_path.rstrip("/")),
+        "custom_extractor_path": f"{GRAPH_NET_ROOT}/torch/naive_graph_decomposer.py",
+        "custom_extractor_config": extractor_options,
+    }
+    decorator_config = {"decorator_path": f"{GRAPH_NET_ROOT}/torch/extractor.py", "decorator_config": decorator_options}
+
+    # The whole config is handed over as a single --decorator-config argument, serialised
+    # as base64-encoded JSON.
+    deco_b64 = base64.b64encode(json.dumps(decorator_config).encode()).decode()
+    cmd = [sys.executable, "-m", "graph_net.torch.run_model", "--model-path", model_path, "--decorator-config", deco_b64]
+
+    print("[RUN] extracting subgraphs with naive_graph_decomposer:")
+    print(" ".join(cmd))
+    rc = subprocess.run(cmd).returncode
+    if rc != 0:
+        raise RuntimeError(f"Extractor failed (rc={rc})")
+
+
+# ----------------------------
+# Find subgraph directories under output_dir
+# ----------------------------
+def find_subgraphs(output_dir: str):
+    """Return the sorted absolute paths of every directory under `output_dir` (itself included) that holds a subgraph marker file.
+
+    A directory qualifies when it directly contains graph_code.json, graph_net.json or model.py.
+    """
+    marker_files = {"graph_code.json", "graph_net.json", "model.py"}
+    matches = {os.path.abspath(folder) for folder, _subdirs, names in os.walk(output_dir) if marker_files.intersection(names)}
+    return sorted(matches)
+
+
+# ----------------------------
+# Run test_compiler on a subgraph dir and write log
+# ----------------------------
+def run_test_compiler_on_subgraph(subgraph_dir: str, log_path: str, compiler: str, device: str, warmup: int = 1, trials: int = 1, log_prompt: str = "graph-net-test-compiler-log"):
+    """Run graph_net.torch.test_compiler on one subgraph in a child process and report whether its log shows NaN/Inf.
+
+    `subgraph_dir` is passed as --model-path; `compiler`, `device`, `warmup`, `trials` and `log_prompt` are
+    forwarded as the matching CLI options. The child's combined stdout/stderr overwrites `log_path`.
+    Returns parse_correctness_line_for_nan() for that log; the exit code is only printed, never checked.
+    """
+    cli_options = {
+        "--model-path": subgraph_dir,
+        "--compiler": compiler,
+        "--device": device,
+        "--warmup": str(warmup),
+        "--trials": str(trials),
+        "--log-prompt": log_prompt,
+    }
+    cmd = [sys.executable, "-m", "graph_net.torch.test_compiler"]
+    for flag, value in cli_options.items():
+        cmd += [flag, value]
+
+    print(f"[RUN] test_compiler on {subgraph_dir}")
+    # stderr is folded into stdout so that one file captures both streams.
+    with open(log_path, "wb") as log_sink:
+        exit_code = subprocess.run(cmd, stdout=log_sink, stderr=subprocess.STDOUT).returncode
+
+    has_nan = parse_correctness_line_for_nan(log_path, log_prompt=log_prompt)
+    print(f"[LOG] {subgraph_dir} -> nan={has_nan} (rc={exit_code})")
+    return has_nan
+
+
+# ----------------------------
+# Count nodes in the FX graph
+# ----------------------------
+def count_graph_nodes(gm: torch.fx.GraphModule):
+    """Return how many nodes `gm.graph` holds (every node yielded by `gm.graph.nodes` is counted)."""
+    return sum(1 for _node in gm.graph.nodes)
+
+
+# ----------------------------
+# Recursive binary classification
+# ----------------------------
+def binary_classify_subgraphs(subgraphs: list, tmp_log_dir: str, compiler: str, device: str, warmup: int, trials: int, log_prompt: str):
+    """
+    Split `subgraphs` into (good, bad) by running test_compiler on them, bisecting the list recursively.
+
+    A half in which no subgraph reports NaN/Inf is accepted as good wholesale; a half with a hit is
+    bisected again until single subgraphs remain. Verdicts are cached, so every subgraph is executed
+    at most once even though a half is probed first and then descended into.
+    Each subgraph is tested on its own (halves are never merged into one run), so the bisection
+    only decides the order of the runs; the result equals classifying every subgraph once.
+
+    Args:
+        subgraphs: subgraph directories to classify.
+        tmp_log_dir: kept for interface compatibility; nothing is written there any more. Each run
+            logs to `<subgraph>/compiler_test.log`, so a later run cannot overwrite an earlier log.
+        compiler, device, warmup, trials, log_prompt: forwarded to run_test_compiler_on_subgraph().
+
+    Returns:
+        (good, bad): two lists of subgraph directories, each keeping the order of `subgraphs`.
+    """
+    good = []
+    bad = []
+    # subgraph dir -> has_nan. One entry per executed subgraph: a compiler run is expensive, and
+    # re-running could even flip a verdict between the probe and the final classification.
+    verdicts = {}
+
+    def has_nan(g):
+        """Run test_compiler on `g` on first request; answer later requests from the cache."""
+        if g not in verdicts:
+            os.makedirs(g, exist_ok=True)
+            log_path = os.path.join(g, "compiler_test.log")
+            verdicts[g] = run_test_compiler_on_subgraph(g, log_path, compiler, device, warmup, trials, log_prompt)
+        return verdicts[g]
+
+    def solve(lst):
+        """Append every entry of `lst` to `good` or `bad`."""
+        if not lst:
+            return
+        if len(lst) == 1:
+            g = lst[0]
+            if has_nan(g):
+                bad.append(g)
+            else:
+                good.append(g)
+            return
+        mid = len(lst) // 2
+        for half in (lst[:mid], lst[mid:]):
+            # Probe the half in order; any() stops at the first hit and the cache keeps the
+            # verdicts gathered so far, so solve(half) only runs the subgraphs not yet probed.
+            if any(has_nan(g) for g in half):
+                solve(half)
+            else:
+                good.extend(half)
+
+    solve(subgraphs)
+    return good, bad
+
+
+# ----------------------------
+# Main
+# ----------------------------
+def main():
+    """Command-line driver: inspect a run log for NaN/Inf, extract subgraphs from the model, then classify each with test_compiler.
+
+    Exit codes: 0 = nothing to do or every subgraph good; 2 = main log missing without --force-extract;
+    3 = the extractor produced no subgraph directory; 4 = at least one bad subgraph was found.
+    """
+    parser = argparse.ArgumentParser(description="GraphNet: check log -> if nan -> extract subgraphs -> binary classify via test_compiler")
+    add_option = parser.add_argument
+    add_option("--log-file", type=str, required=True, help="Path to main run log to check for nan")
+    add_option("--model-path", type=str, required=True, help="GraphNet model dir (contains model.py, graph_net.json, inputs...)")
+    add_option("--output-dir", type=str, default="/tmp/naive_decompose_workspace", help="workspace to dump extracted subgraphs")
+    add_option("--split-positions", type=int, nargs="*", default=[], help="split positions to pass to extractor")
+    add_option("--compiler", type=str, default="inductor", help="compiler backend to use when running test_compiler")
+    add_option("--device", type=str, default="cuda", help="device for test_compiler")
+    add_option("--warmup", type=int, default=1, help="warmup for test_compiler runs")
+    add_option("--trials", type=int, default=1, help="trials for test_compiler runs")
+    add_option("--log-prompt", type=str, default="graph-net-test-compiler-log", help="log prompt used by test_compiler")
+    add_option("--force-extract", action="store_true", help="always run extractor even if no nan in main log")
+
+    args = parser.parse_args()
+
+    # 1) check main log
+    print(f"[INFO] Checking main log: {args.log_file}")
+    if os.path.exists(args.log_file):
+        main_log_has_nan = contains_nan_or_inf_in_file(args.log_file)
+        print(f"[INFO] main log contains nan/inf? {main_log_has_nan}")
+    else:
+        print(f"[WARN] main log not found: {args.log_file}")
+        # a missing log is fatal unless extraction is forced
+        if not args.force_extract:
+            print("[ERROR] main log missing and not forcing extraction. Exiting.")
+            sys.exit(2)
+        main_log_has_nan = False
+
+    if not (main_log_has_nan or args.force_extract):
+        print("[INFO] No NaN found in main log. Exiting without extraction.")
+        sys.exit(0)
+
+    # 2) run extractor to produce subgraphs (an existing output dir is deleted first)
+    print("[STEP] Running naive_graph_decomposer to extract subgraphs...")
+    if os.path.exists(args.output_dir):
+        print(f"[INFO] clearing existing output dir: {args.output_dir}")
+        shutil.rmtree(args.output_dir)
+    os.makedirs(args.output_dir, exist_ok=True)
+    run_naive_extractor(args.model_path, args.output_dir, args.split_positions)
+
+    # 3) find subgraphs
+    print("[STEP] Searching for subgraphs in output dir...")
+    subgraphs = find_subgraphs(args.output_dir)
+    print(f"[INFO] Found {len(subgraphs)} candidate subgraph dirs")
+    if not subgraphs:
+        print("[ERROR] No subgraphs found; make sure extractor produced files (graph_code.json or graph_net.json or model.py)")
+        sys.exit(3)
+
+    # 4) binary classify using test_compiler
+    print("[STEP] Running binary classification on extracted subgraphs...")
+    tmp_log_dir = os.path.join(args.output_dir, "_tmp_logs")
+    os.makedirs(tmp_log_dir, exist_ok=True)
+    good, bad = binary_classify_subgraphs(subgraphs, tmp_log_dir, args.compiler, args.device, args.warmup, args.trials, args.log_prompt)
+
+    # 5) output result
+    print("\n===== RESULT =====")
+    print(f"Good subgraphs ({len(good)}):")
+    for path in good:
+        print("  [GOOD]", path)
+    print(f"\nBad subgraphs ({len(bad)}):")
+    for path in bad:
+        print("  [BAD]", path)
+
+    if bad:
+        print("\nDetected bad subgraphs -> exit code 4")
+        sys.exit(4)
+    print("\nAll subgraphs OK -> exit code 0")
+    sys.exit(0)
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not when imported
+    main()