Merge branch 'main' into RC-TEST-2.9

svekars · web-flow · commit 2aef2a9ad13c · 2025-09-15T10:36:37.000-07:00
diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt
@@ -16,7 +16,7 @@ pandocfilters==1.5.1
 markdown==3.8.2
 
 # PyTorch Theme
--e git+https://github.com/pytorch/pytorch_sphinx_theme.git@e0cbfaf789a51899859f2c7626e0ad3a78ad4c2e#egg=pytorch_sphinx_theme2
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@3066b6f62b0da01c52570c928281bca68287933d#egg=pytorch_sphinx_theme2
 
 # Tutorial dependencies
 tqdm==4.66.1
diff --git a/beginner_source/onnx/export_control_flow_model_to_onnx_tutorial.py b/beginner_source/onnx/export_control_flow_model_to_onnx_tutorial.py
@@ -96,19 +96,6 @@ def forward(self, x):
 except Exception as e:
     print(e)
 
-###############################################################################
-# Using :func:`torch.onnx.export` with JIT Tracing
-# ----------------------------------------
-#
-# When exporting the model using :func:`torch.onnx.export` with the dynamo=True
-# argument, the exporter defaults to using JIT tracing. This fallback allows
-# the model to export, but the resulting ONNX graph may not faithfully represent
-# the original model logic due to the limitations of tracing.
-
-
-onnx_program = torch.onnx.export(model, (x,), dynamo=True) 
-print(onnx_program.model)
-
 
 ###############################################################################
 # Suggested Patch: Refactoring with :func:`torch.cond`
@@ -182,4 +169,4 @@ def neg(x):
 #
 # .. toctree::
 #    :hidden:
-#
+#
diff --git a/recipes_index.rst b/recipes_index.rst
@@ -333,6 +333,13 @@ from our full-length tutorials.
    :link: recipes/distributed_comm_debug_mode.html
    :tags: Distributed-Training
 
+.. customcarditem::
+   :header: Reducing AoT cold start compilation time with regional compilation
+   :card_description: Learn how to use regional compilation to control AoT cold start compile time
+   :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png
+   :link: recipes/regional_aot.html
+   :tags: Model-Optimization
+
 .. End of tutorial card section
 
 .. -----------------------------------------
@@ -378,6 +385,7 @@ from our full-length tutorials.
    recipes/torch_compile_caching_tutorial
    recipes/torch_compile_caching_configuration_tutorial
    recipes/regional_compilation
+   recipes/regional_aot
    recipes/intel_extension_for_pytorch.html
    recipes/intel_neural_compressor_for_pytorch
    recipes/distributed_device_mesh
diff --git a/recipes_source/regional_aot.py b/recipes_source/regional_aot.py
@@ -0,0 +1,241 @@
+
+"""
+Reducing AoT cold start compilation time with regional compilation
+============================================================================
+
+**Author:** `Sayak Paul <https://huggingface.co/sayakpaul>`_, `Charles Bensimon <https://huggingface.co/cbensimon>`_, `Angela Yi <https://github.com/angelayi>`_
+
+In the `regional compilation recipe <https://docs.pytorch.org/tutorials/recipes/regional_compilation.html>`__, we showed
+how to reduce cold start compilation times while retaining (almost) full compilation benefits. This was demonstrated for
+just-in-time (JIT) compilation.
+
+This recipe shows how to apply similar principles when compiling a model ahead-of-time (AoT). If you
+are not familiar with AOTInductor and ``torch.export``, we recommend that you check out `this tutorial <https://docs.pytorch.org/tutorials/recipes/torch_export_aoti_python.html>`__.
+
+Prerequisites
+----------------
+
+* PyTorch 2.6 or later
+* Familiarity with regional compilation
+* Familiarity with AOTInductor and ``torch.export``
+
+Setup
+-----
+Before we begin, we need to install ``torch`` if it is not already
+available.
+
+.. code-block:: sh
+
+   pip install torch
+"""
+
+######################################################################
+# Steps
+# -----
+#
+# In this recipe, we will follow the same steps as the regional compilation recipe mentioned above:
+#
+# 1. Import all necessary libraries.
+# 2. Define and initialize a neural network with repeated regions.
+# 3. Measure the compilation time of the full model and the regional compilation with AoT.
+#
+# First, let's import the necessary libraries:
+#
+
+import torch
+torch.set_grad_enabled(False)
+
+from time import perf_counter
+
+###################################################################################
+# Defining the Neural Network
+# ---------------------------
+#
+# We will use the same neural network structure as the regional compilation recipe.
+#
+# We will use a network composed of repeated layers. This mimics a
+# large language model, which is typically composed of many Transformer blocks. In this recipe,
+# we will create a ``Layer`` using the ``nn.Module`` class as a proxy for a repeated region.
+# We will then create a ``Model`` which is composed of 64 instances of this
+# ``Layer`` class.
+#
+class Layer(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.linear1 = torch.nn.Linear(10, 10)
+        self.relu1 = torch.nn.ReLU()
+        self.linear2 = torch.nn.Linear(10, 10)
+        self.relu2 = torch.nn.ReLU()
+
+    def forward(self, x):
+        a = self.linear1(x)
+        a = self.relu1(a)
+        a = torch.sigmoid(a)
+        b = self.linear2(a)
+        b = self.relu2(b)
+        return b
+
+
+class Model(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.linear = torch.nn.Linear(10, 10)
+        self.layers = torch.nn.ModuleList([Layer() for _ in range(64)])
+
+    def forward(self, x):
+        # In regional compilation, the self.linear is outside of the scope of ``torch.compile``.
+        x = self.linear(x)
+        for layer in self.layers:
+            x = layer(x)
+        return x
+
+
+##################################################################################
+# Compiling the model ahead-of-time
+# ---------------------------------
+#
+# Since we're compiling the model ahead-of-time, we need to prepare representative
+# input examples that we expect the model to see during actual deployments.
+#
+# Let's create an instance of ``Model`` and pass it some sample input data.
+#
+
+model = Model().cuda()
+input = torch.randn(10, 10, device="cuda")
+output = model(input)
+print(f"{output.shape=}")
+
+###############################################################################################
+# Now, let's compile our model ahead-of-time. We will use ``input`` created above to pass
+# to ``torch.export``. This will yield a ``torch.export.ExportedProgram`` which we can compile.
+
+path = torch._inductor.aoti_compile_and_package(
+    torch.export.export(model, args=(input,))
+)
+
+#################################################################
+# We can load from this ``path`` and use it to perform inference.
+
+compiled_binary = torch._inductor.aoti_load_package(path)
+output_compiled = compiled_binary(input)
+print(f"{output_compiled.shape=}")
+
+######################################################################################
+# Compiling *regions* of the model ahead-of-time
+# ----------------------------------------------
+#
+# Compiling model regions ahead-of-time, on the other hand, requires a few key changes.
+#
+# Since the compute pattern is shared by all the blocks that
+# are repeated in a model (``Layer`` instances in this case), we can just
+# compile a single block and let the inductor reuse it.
+
+model = Model().cuda()
+path = torch._inductor.aoti_compile_and_package(
+    torch.export.export(model.layers[0], args=(input,)),
+    inductor_configs={
+        # compile artifact w/o saving params in the artifact
+        "aot_inductor.package_constants_in_so": False,
+    }
+)
+
+###################################################
+# An exported program (``torch.export.ExportedProgram``) contains the Tensor computation,
+# a ``state_dict`` containing tensor values of all lifted parameters and buffers, alongside
+# other metadata. We set ``aot_inductor.package_constants_in_so`` to ``False`` so that
+# the model parameters are not serialized in the generated artifact.
+#
+# Now, when loading the compiled binary, we can reuse the existing parameters of
+# each block. This lets us take advantage of the compiled binary obtained above.
+#
+
+for layer in model.layers:
+    compiled_layer = torch._inductor.aoti_load_package(path)
+    compiled_layer.load_constants(
+        layer.state_dict(), check_full_update=True, user_managed=True
+    )
+    layer.forward = compiled_layer
+
+output_regional_compiled = model(input)
+print(f"{output_regional_compiled.shape=}")
+
+#####################################################
+# Just like JIT regional compilation, compiling regions within a model ahead-of-time
+# leads to significantly reduced cold start times. The actual number will vary from
+# model to model.
+#
+# Even though full model compilation offers the fullest scope of optimizations,
+# for practical purposes and depending on the type of model, we have seen regional
+# compilation (both JIT and AoT) providing similar speed benefits, while drastically
+# reducing the cold start times.
+
+###################################################
+# Measuring compilation time
+# --------------------------
+# Next, let's measure the compilation time of the full model and the regional compilation.
+#
+
+def measure_compile_time(input, regional=False):
+    start = perf_counter()
+    model = aot_compile_load_model(regional=regional)
+    torch.cuda.synchronize()
+    end = perf_counter()
+    # make sure the model works.
+    _ = model(input)
+    return end - start
+
+def aot_compile_load_model(regional=False) -> torch.nn.Module:
+    input = torch.randn(10, 10, device="cuda")
+    model = Model().cuda()
+
+    inductor_configs = {}
+    if regional:
+        inductor_configs = {"aot_inductor.package_constants_in_so": False}
+
+    # Reset the compiler caches to ensure no reuse between different runs
+    torch.compiler.reset()
+    with torch._inductor.utils.fresh_inductor_cache():
+        path = torch._inductor.aoti_compile_and_package(
+            torch.export.export(
+                model.layers[0] if regional else model,
+                args=(input,)
+            ),
+            inductor_configs=inductor_configs,
+        )
+
+        if regional:
+            for layer in model.layers:
+                compiled_layer = torch._inductor.aoti_load_package(path)
+                compiled_layer.load_constants(
+                    layer.state_dict(), check_full_update=True, user_managed=True
+                )
+                layer.forward = compiled_layer
+        else:
+            model = torch._inductor.aoti_load_package(path)
+    return model
+
+input = torch.randn(10, 10, device="cuda")
+full_model_compilation_latency = measure_compile_time(input, regional=False)
+print(f"Full model compilation time = {full_model_compilation_latency:.2f} seconds")
+
+regional_compilation_latency = measure_compile_time(input, regional=True)
+print(f"Regional compilation time = {regional_compilation_latency:.2f} seconds")
+
+assert regional_compilation_latency < full_model_compilation_latency
+
+############################################################################
+# There may also be layers in a model that are incompatible with compilation. In that case,
+# full compilation will result in a fragmented computation graph, leading
+# to potential latency degradation. In these cases, regional compilation
+# can be beneficial.
+#
+
+############################################################################
+# Conclusion
+# -----------
+#
+# This recipe shows how to control the cold start time when compiling your
+# model ahead-of-time. This becomes effective when your model has repeated
+# blocks, which is typically seen in large generative models. We used this
+# recipe on various models to speed up real-time performance. Learn more
+# `here <https://huggingface.co/blog/zerogpu-aoti>`__.
diff --git a/redirects.py b/redirects.py
@@ -37,4 +37,5 @@
     "recipes/bundled_inputs.html": "../index.html",
     "recipes/recipes_index.html": "../recipes_index.html",
     "recipes/torchserve_vertexai_tutorial.html": "../index.html",
+    "unstable_source/vulkan_workflow.rst": "../index.html",
 }
diff --git a/unstable_index.rst b/unstable_index.rst
@@ -171,7 +171,6 @@ decide if we want to upgrade the level of commitment or to fail fast.
    unstable/inductor_cpp_wrapper_tutorial
    unstable/inductor_windows
    unstable/vmap_recipe
-   unstable/vulkan_workflow
    unstable/nestedtensor
    unstable/maskedtensor_overview
    unstable/maskedtensor_sparsity
diff --git a/unstable_source/README.txt b/unstable_source/README.txt
@@ -2,16 +2,12 @@ Prototype Tutorials
 ------------------
 1. distributed_rpc_profiling.rst
            Profiling PyTorch RPC-Based Workloads
-           https://github.com/pytorch/tutorials/blob/main/prototype_source/distributed_rpc_profiling.rst
+           https://github.com/pytorch/tutorials/blob/main/unstable_source/distributed_rpc_profiling.rst
 
-2. vulkan_workflow.rst
-           Vulkan Backend User Workflow
-           https://pytorch.org/tutorials/prototype/vulkan_workflow.html
-           
-3. flight_recorder_tutorial.rst
+2. flight_recorder_tutorial.rst
 	   Flight Recorder User Guide
 	   https://pytorch.org/tutorials/prototype/flight_recorder_tutorial.html
 
-4. python_extension_autoload.rst
+3. python_extension_autoload.rst
 	   Autoloading Out-of-Tree Extension
-	   https://pytorch.org/tutorials/prototype/python_extension_autoload.html
+	   https://github.com/pytorch/tutorials/blob/main/unstable_source/python_extension_autoload.rst
diff --git a/unstable_source/vulkan_workflow.rst b/unstable_source/vulkan_workflow.rst

Original file line number	Diff line number	Diff line change
`@@ -37,4 +37,5 @@`
`37`	`37`	`"recipes/bundled_inputs.html": "../index.html",`
`38`	`38`	`"recipes/recipes_index.html": "../recipes_index.html",`
`39`	`39`	`"recipes/torchserve_vertexai_tutorial.html": "../index.html",`
	`40`	`+ "unstable_source/vulkan_workflow.rst": "../index.html",`
`40`	`41`	`}`