Skip to content

Commit 03d8132

Browse files
fix recompile in distributed mode with given path
1 parent 31757ba commit 03d8132

File tree

4 files changed

+18
-16
lines changed

4 files changed

+18
-16
lines changed

examples/multi_host/multicontroller_vqe_with_path.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,16 +98,17 @@ def run_vqe_main(coordinator_address: str, num_processes: int, process_id: int):
9898
# The contractor will use this concrete array to run its (now internal)
9999
# "find path on 0 and broadcast" logic.},
100100

101+
# Shard the parameters onto devices for the actual GPU/TPU computation.
102+
params_sharding = NamedSharding(global_mesh, P(*([None] * len(params_shape))))
103+
params = jax.device_put(params_cpu, params_sharding)
104+
101105
DC = DistributedContractor.from_path(
102106
filepath="tree.pkl",
103107
nodes_fn=nodes_fn,
108+
params=params,
104109
mesh=global_mesh,
105110
)
106111

107-
# Shard the parameters onto devices for the actual GPU/TPU computation.
108-
params_sharding = NamedSharding(global_mesh, P(*([None] * len(params_shape))))
109-
params = jax.device_put(params_cpu, params_sharding)
110-
111112
# Initialize the optimizer and its state.
112113
optimizer = optax.adam(2e-2)
113114
opt_state = optimizer.init(params) # Can init directly with sharded params

examples/multi_host/slurm_vqe_with_path.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,17 +83,17 @@ def run_vqe_main():
8383
# Broadcast the CPU array. Now all processes have a concrete `params_cpu`.
8484
# This is CRITICAL to prevent the NoneType error upon contractor initialization.
8585
params_cpu = broadcast_py_object(params_cpu)
86+
# Shard the parameters onto devices for the actual GPU/TPU computation.
87+
params_sharding = NamedSharding(global_mesh, P(*([None] * len(params_shape))))
88+
params = jax.device_put(params_cpu, params_sharding)
8689

8790
DC = DistributedContractor.from_path(
8891
filepath="tree.pkl",
8992
nodes_fn=nodes_fn,
9093
mesh=global_mesh,
94+
params=params,
9195
)
9296

93-
# Shard the parameters onto devices for the actual GPU/TPU computation.
94-
params_sharding = NamedSharding(global_mesh, P(*([None] * len(params_shape))))
95-
params = jax.device_put(params_cpu, params_sharding)
96-
9797
# Initialize the optimizer and its state.
9898
optimizer = optax.adam(2e-2)
9999
opt_state = optimizer.init(params) # Can init directly with sharded params

llm.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,6 @@ pip install -r requirements/requirements-types.txt
142142

143143
### Branch Strategy
144144

145-
- main/master branch for stable releases
145+
- master branch for stable releases
146146
- beta branch for nightly builds (as seen in nightly_release.yml)
147147
- pull requests for feature development

tensorcircuit/experimental.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -780,6 +780,10 @@ def __init__(
780780
logger.info("DistributedContractor is running on a single device.")
781781

782782
self._params_template = params
783+
self.params_sharding = jaxlib.tree_util.tree_map(
784+
lambda x: NamedSharding(self.mesh, P(*((None,) * x.ndim))),
785+
self._params_template,
786+
)
783787
self._backend = "jax"
784788
self._compiled_v_fns: Dict[
785789
Tuple[Callable[[Tensor], Tensor], str],
@@ -932,6 +936,7 @@ def from_path(
932936
nodes_fn: Callable[[Tensor], List[Gate]],
933937
devices: Optional[List[Any]] = None, # backward compatibility
934938
mesh: Optional[Any] = None,
939+
params: Any = None,
935940
) -> "DistributedContractor":
936941
with open(filepath, "rb") as f:
937942
tree_data = pickle.load(f)
@@ -940,7 +945,7 @@ def from_path(
940945
# We pass the loaded `tree_data` directly to __init__ to trigger the second workflow.
941946
return cls(
942947
nodes_fn=nodes_fn,
943-
params=None,
948+
params=params,
944949
mesh=mesh,
945950
devices=devices,
946951
tree_data=tree_data,
@@ -1107,19 +1112,15 @@ def global_aggregated_fn(
11071112

11081113
# Compile the global function with jax.jit and specify shardings.
11091114
# `params` are replicated (available everywhere).
1110-
params_sharding = jaxlib.tree_util.tree_map(
1111-
lambda x: NamedSharding(self.mesh, P(*((None,) * x.ndim))),
1112-
self._params_template,
1113-
)
11141115

1115-
in_shardings = (params_sharding, self.sharding)
1116+
in_shardings = (self.params_sharding, self.sharding)
11161117

11171118
if is_grad_fn:
11181119
# Returns (value, grad), so out_sharding must be a 2-tuple.
11191120
# `value` is a replicated scalar -> P()
11201121
sharding_for_value = NamedSharding(self.mesh, P())
11211122
# `grad` is a replicated PyTree with the same structure as params.
1122-
sharding_for_grad = params_sharding
1123+
sharding_for_grad = self.params_sharding
11231124
out_shardings = (sharding_for_value, sharding_for_grad)
11241125
else:
11251126
# Returns a single scalar value -> P()

0 commit comments

Comments
 (0)