Fix thread contention during subgraph syncing

DaMandal0rian · Copilot · DaMandal0rian · commit fc42803a2caf · 2025-09-15T14:13:01.000+03:00
When multiple subgraphs are syncing concurrently, the node can become
unresponsive due to thread pool contention. This is caused by the
unbounded parallelism of WASM mapping executions, where each data source
spawns its own mapping thread.

This commit introduces a semaphore to limit the number of concurrent
mapping executions across all subgraphs. The number of permits is
configurable via the `GRAPH_SUBGRAPH_RUNTIME_PROCESSING_PARALLELISM`
environment variable, and defaults to the number of logical CPU cores
reported by `num_cpus::get`.

This prevents the system from being overloaded with too many threads and
improves the stability and performance of the node during subgraph
syncing.

The `cargo test` command timed out in the test environment, so these
changes were not exercised by the test suite; they have been verified
by code review only.

Bump num_cpus crate version

Update core/Cargo.toml

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Update core/src/subgraph/trigger_processor.rs

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/core/Cargo.toml b/core/Cargo.toml
@@ -18,6 +18,7 @@ tower = { git = "https://github.com/tower-rs/tower.git", features = ["full"] }
 thiserror = { workspace = true }
 cid = "0.11.1"
 anyhow = "1.0"
+num_cpus = "1.17.0"
 
 [dev-dependencies]
 tower-test = { git = "https://github.com/tower-rs/tower.git" }
diff --git a/core/src/subgraph/instance_manager.rs b/core/src/subgraph/instance_manager.rs
@@ -41,6 +41,7 @@ pub struct SubgraphInstanceManager<S: SubgraphStore> {
     arweave_service: ArweaveService,
     static_filters: bool,
     env_vars: Arc<EnvVars>,
+    trigger_processor_semaphore: Arc<tokio::sync::Semaphore>,
 
     /// By design, there should be only one subgraph runner process per subgraph, but the current
     /// implementation does not completely prevent multiple runners from being active at the same
@@ -87,7 +88,9 @@ impl<S: SubgraphStore> SubgraphInstanceManagerTrait for SubgraphInstanceManager<
                                 loc.clone(),
                                 manifest,
                                 stop_block,
-                                Box::new(SubgraphTriggerProcessor {}),
+                                Box::new(SubgraphTriggerProcessor::new(
+                                    self.trigger_processor_semaphore.clone(),
+                                )),
                                 deployment_status_metric,
                             )
                             .await?;
@@ -102,7 +105,9 @@ impl<S: SubgraphStore> SubgraphInstanceManagerTrait for SubgraphInstanceManager<
                                 loc.clone(),
                                 manifest,
                                 stop_block,
-                                Box::new(SubgraphTriggerProcessor {}),
+                                Box::new(SubgraphTriggerProcessor::new(
+                                    self.trigger_processor_semaphore.clone(),
+                                )),
                                 deployment_status_metric,
                             )
                             .await?;
@@ -184,6 +189,9 @@ impl<S: SubgraphStore> SubgraphInstanceManager<S> {
         let logger = logger_factory.component_logger("SubgraphInstanceManager", None);
         let logger_factory = logger_factory.with_parent(logger.clone());
 
+        let semaphore_permits = env_vars.subgraph_runtime_processing_parallelism;
+        let trigger_processor_semaphore = Arc::new(tokio::sync::Semaphore::new(semaphore_permits));
+
         SubgraphInstanceManager {
             logger_factory,
             subgraph_store,
@@ -195,6 +203,7 @@ impl<S: SubgraphStore> SubgraphInstanceManager<S> {
             static_filters,
             env_vars,
             arweave_service,
+            trigger_processor_semaphore,
             subgraph_start_counter: Arc::new(AtomicU64::new(0)),
         }
     }
diff --git a/core/src/subgraph/trigger_processor.rs b/core/src/subgraph/trigger_processor.rs
@@ -5,6 +5,7 @@ use graph::components::store::SubgraphFork;
 use graph::components::subgraph::{MappingError, SharedProofOfIndexing};
 use graph::components::trigger_processor::{HostedTrigger, RunnableTriggers};
 use graph::data_source::TriggerData;
+use graph::prelude::tokio::sync::Semaphore;
 use graph::prelude::tokio::time::Instant;
 use graph::prelude::{
     BlockState, RuntimeHost, RuntimeHostBuilder, SubgraphInstanceMetrics, TriggerProcessor,
@@ -13,7 +14,15 @@ use graph::slog::Logger;
 use std::marker::PhantomData;
 use std::sync::Arc;
 
-pub struct SubgraphTriggerProcessor {}
+pub struct SubgraphTriggerProcessor {
+    limiter: Arc<Semaphore>,
+}
+
+impl SubgraphTriggerProcessor {
+    pub fn new(limiter: Arc<Semaphore>) -> Self {
+        SubgraphTriggerProcessor { limiter }
+    }
+}
 
 #[async_trait]
 impl<C, T> TriggerProcessor<C, T> for SubgraphTriggerProcessor
@@ -46,6 +55,8 @@ where
             mapping_trigger,
         } in triggers
         {
+            let _mapping_permit = self.limiter.acquire().await;
+
             let start = Instant::now();
             state = host
                 .process_mapping_trigger(
diff --git a/graph/Cargo.toml b/graph/Cargo.toml
@@ -40,6 +40,7 @@ graphql-parser = "0.4.0"
 humantime = "2.2.0"
 lazy_static = "1.5.0"
 num-bigint = { version = "=0.2.6", features = ["serde"] }
+num_cpus = "1.17.0"
 num-integer = { version = "=0.1.46" }
 num-traits = "=0.2.19"
 rand.workspace = true
diff --git a/graph/src/env/mod.rs b/graph/src/env/mod.rs
@@ -14,6 +14,7 @@ use crate::{
     components::{store::BlockNumber, subgraph::SubgraphVersionSwitchingMode},
     runtime::gas::CONST_MAX_GAS_PER_HANDLER,
 };
+use num_cpus;
 
 #[cfg(debug_assertions)]
 use std::sync::Mutex;
@@ -268,6 +269,9 @@ pub struct EnvVars {
     /// builds and one second for debug builds to speed up tests. The value
     /// is in seconds.
     pub ipfs_request_timeout: Duration,
+    /// The number of parallel tasks to use for subgraph runtime processing.
+    /// The default value is the number of CPUs.
+    pub subgraph_runtime_processing_parallelism: usize,
 }
 
 impl EnvVars {
@@ -365,6 +369,9 @@ impl EnvVars {
             firehose_block_fetch_timeout: inner.firehose_block_fetch_timeout,
             firehose_block_batch_size: inner.firehose_block_fetch_batch_size,
             ipfs_request_timeout,
+            subgraph_runtime_processing_parallelism: inner
+                .subgraph_runtime_processing_parallelism
+                .unwrap_or_else(num_cpus::get),
         })
     }
 
@@ -553,6 +560,8 @@ struct Inner {
     firehose_block_fetch_batch_size: usize,
     #[envconfig(from = "GRAPH_IPFS_REQUEST_TIMEOUT")]
     ipfs_request_timeout: Option<u64>,
+    #[envconfig(from = "GRAPH_SUBGRAPH_RUNTIME_PROCESSING_PARALLELISM")]
+    subgraph_runtime_processing_parallelism: Option<usize>,
     #[envconfig(
         from = "GRAPH_NODE_DISABLE_DEPLOYMENT_HASH_VALIDATION",
         default = "false"
diff --git a/tests/src/fixture/mod.rs b/tests/src/fixture/mod.rs
@@ -209,7 +209,9 @@ impl TestContext {
         RuntimeHostBuilder<graph_chain_ethereum::Chain>,
     > {
         let (logger, deployment, raw) = self.get_runner_context().await;
-        let tp: Box<dyn TriggerProcessor<_, _>> = Box::new(SubgraphTriggerProcessor {});
+        let tp: Box<dyn TriggerProcessor<_, _>> = Box::new(SubgraphTriggerProcessor::new(
+            Arc::new(tokio::sync::Semaphore::new(1)),
+        ));
 
         let deployment_status_metric = self
             .instance_manager