ServiceNow · AlexPiche · Jun 16, 2025 · Jun 17, 2025 · Jun 17, 2025 · Jun 18, 2025
diff --git a/.gitignore b/.gitignore
@@ -120,6 +120,7 @@ celerybeat.pid
 
 # SageMath parsed files
 *.sage.py
+node_modules/
 
 # Environments
 .env
@@ -185,4 +186,4 @@ results
 results/
 data/
 cache/
-dump.rdb
+dump.rdb
diff --git a/conf/base.yaml b/conf/base.yaml
@@ -5,6 +5,7 @@ defaults:
   - _self_
 
 seed: 42
+use_ray: false
 
 finetune:
   seed: ${..seed}
@@ -23,9 +24,9 @@ preprocess:
   input: actor
   output: training_data
   n_workers: 8
-  chunk_n_groups: 2
+  chunk_n_groups: 8
   # queue for loaded raw groups
-  raw_queue_size: 8
+  raw_queue_size: 128
   # queue for processed chunks of multiple groups  
   input_queue_size: 32
   # queue for ready chunks for multiple groups
@@ -47,7 +48,7 @@ llm:
     temperature: 1.0
 test_llm:
   parameters: 
-    max_tokens: 16000
+    max_tokens: 8192
     temperature: 1.0
     top_p: 0.95
     top_k: 50
@@ -67,6 +68,7 @@ vllm_config:
     tensor-parallel-size: 1
     pipeline-parallel-size: 1
     generation-config: vllm
+    max_model_len: 16000
 
 world:
   replicas: 1
@@ -75,10 +77,13 @@ world:
   preprocessor_fraction: 0
   finetune_fraction: 4
 
-  env_replicas: 2
+  # Number of environment servers per actor VLLM server
+  env_replicas_per_actor: 1
 
   actor_group_port: 9000
   environment_start_port: 7777
+  # Environment execution strategy: remote or embedded
+  environment_mode: remote
 # this will be autocreated based on the config
 jobs: []
 

diff --git a/conf/finetune/base.yaml b/conf/finetune/base.yaml
@@ -36,7 +36,7 @@ learning_rate: 1e-6
 # How much to clip the gradient (no clipping if null)
 gradient_clipping_threshold: 0.3
 # Learning rate scheduler type (indexed by completed_steps).
-lr_scheduler_type: cosine # could be cosine, constant_with_warmup
+lr_scheduler_type: constant # could be constant, cosine, constant_with_warmup
 # Number of warmup (completed) steps in the learning rate schedule.
 num_warmup_steps: 50
 # Number of gradient accumulation steps.

diff --git a/conf/mcp.yaml b/conf/mcp.yaml
@@ -0,0 +1,155 @@
+defaults:
+  - base  # start from conf/base.yaml
+  - override finetune: grpo  # replace the finetune group selection with grpo
+  - _self_  # merge this file's own keys last so they win over the defaults above
+
+use_ray: true  # conf/base.yaml sets this to false
+
+llm:
+  use_cache: false
+  parameters:
+    max_tokens: 8192
+
+test_llm:
+  parameters:
+    max_tokens: 8192  # same value conf/base.yaml now uses for test_llm
+
+rewards:
+  correct_answer_not_finished: 0.0
+  buffer_tokens: 2000
+
+actor:
+  rollout_policy: pipelinerl.domains.mcp.generate_mcp_rollout_with_local_env
+  system_prompt: Please reason step by step, and put your final answer within \boxed{{}}.
+  rollout_workers: 64
+  llm_max_rollouts: 256
+  problem_queue_size: 256
+  task_template: |-
+    {task}
+  shared_memory_entry_size: 200000000
+
+preprocess:
+  shared_memory_entry_size: 2000000000  # NOTE(review): 10x actor.shared_memory_entry_size (200000000) — confirm intended
+
+finetune:
+  seq_length: 32000
+  seq_parallel: 8
+
+dataset_loader: pipelinerl.domains.math.load_datasets  # loader taken from the math domain package
+train_dataset_names:
+  - open_reasoner_zero_57k
+  - open_reasoner_zero_extended_72k
+test_dataset_names:
+  - aime_2025
+
+vllm_config:
+  use_v1: true
+  vllm_kwargs:
+    enable-auto-tool-choice: ""  # empty value; presumably emitted as a bare flag — confirm
+    tool-call-parser: rl_tool
+    tool-parser-plugin: ${hydra:runtime.cwd}/pipelinerl/rl_tool_parser_plugin.py
+    max-num-seqs: 256
+    max-num-batched-tokens: 32000
+    max_model_len: 32000  # overrides the 16000 in conf/base.yaml; keep the underscore spelling so the two keys merge
+    gpu-memory-utilization: 0.9
+
+environment:
+  _target_: tapeagents.mcp.MCPEnvironment
+  config_path: ${hydra:runtime.cwd}/conf/mcp/python.json
+  tools_whitelist:
+    - run_python_code
+  read_timeout_seconds: 600
+  use_cache: false
+
+
+world:
+  env_replicas_per_actor: 8  # conf/base.yaml default is 1
+  environment_mode: embedded  # conf/base.yaml default is remote
+
+agent_max_loops: 3
+agent:
+  _target_: tapeagents.agent.Agent
+  name: mcp_agent
+  max_iterations: 3  # same value as the top-level agent_max_loops
+  store_llm_calls: true
+  templates:
+    system_prompt: |
+      You are a math-focused AI Agent. Solve problems by combining clear symbolic reasoning
+      with short, deterministic Python code.
+      Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration.
+      Always present the final answer in LaTeX \boxed{{}}.
+      Do not express emotions or opinions about user questions.
+
+      Workflow:
+      1. Draft a brief plan in plain text.
+      2. Execute one run_python_code call to compute or verify the result.
+      3. Finalize by calling MathAnswer with the LaTeX-formatted answer.
+
+      Python execution policy (run_python_code):
+      - Use Python strictly for pure computation to verify and validate the final answer.
+      - No network, file system, OS or environment access.
+      - Keep snippets minimal and self-contained; avoid large outputs and long-running loops; print only the final result.
+
+      Validation:
+      - Cross-check results (alternative derivation, invariants, higher precision) before finalizing.
+      - If execution fails, propose the minimal fix and retry.
+      Keep replies direct and avoid unnecessary text.
+    allowed_tools: |
+      You can call the following tools:
+      {tools_description}
+      - run_python_code: deterministic math code; print only the final value.
+      - MathAnswer: return the LaTeX \boxed{{}} answer when the solution is verified.
+      Always verify with run_python_code before invoking MathAnswer.
+    thought_format: |
+      Important! Respond with the plain text, do not include any JSON or code.
+      Do not output anything besides what I asked in this message.
+    allowed_steps: |  # NOTE(review): no node in this file interpolates agent.templates.allowed_steps — confirm it is consumed elsewhere
+      Workflow summary:
+      - Plan briefly in plain text.
+      - Call run_python_code exactly once per loop to compute/verify.
+      - Finish with a single MathAnswer tool call carrying the \boxed{{}} result.
+    format: |  # NOTE(review): no node in this file interpolates agent.templates.format — confirm it is consumed elsewhere
+      For finalization, reply with a single short sentence that ends in the \boxed{{}} answer,
+      immediately followed by the MathAnswer function call containing the same \boxed{{}} value.
+      Never emit unrelated JSON wrappers or duplicate the final thought.
+
+
+  nodes:
+    - _target_: tapeagents.nodes.StandardNode
+      name: plan
+      system_prompt: ${agent.templates.system_prompt}
+      guidance: |
+        Produce a concise math plan (formulas/checks). You will ALWAYS verify by executing Python code.
+        ${agent.templates.thought_format}
+      steps_prompt: ${agent.templates.allowed_tools}
+      trim_obs_except_last_n: 2
+
+    - _target_: tapeagents.nodes.StandardNode
+      name: code
+      system_prompt: ${agent.templates.system_prompt}
+      guidance: |
+        ALWAYS call run_python_code once to compute/verify the result.
+        Use exact, deterministic code; print only the final scalar or tuple.
+        If code fails, fix minimally and call run_python_code again after reviewing the error.
+      use_known_actions: true
+      use_function_calls: true
+      trim_obs_except_last_n: 2
+
+    - _target_: tapeagents.nodes.StandardNode
+      name: finalize
+      system_prompt: ${agent.templates.system_prompt}
+      guidance: |  # NOTE(review): text mixes doubled braces (\boxed{{}}) with a single-brace JSON example — confirm whether it is str.format-ed
+        Read the last Python stdout value. First, state the answer in one short sentence that ends with LaTeX \boxed{{}}.
+        Immediately after that sentence, call the MathAnswer tool exactly once with:
+          name: MathAnswer
+          arguments: {"answer": "<final answer in LaTeX \\boxed{}>"}
+        Do not add any extra text around the tool call. Once the sentence is emitted, return only the MathAnswer function call.
+      steps:
+        - pipelinerl.domains.mcp.steps.MathAnswer
+      use_known_actions: true
+      use_function_calls: true
+      trim_obs_except_last_n: 2
+      next_node: code  # NOTE(review): points back at the "code" node above rather than ending the flow — confirm intended
+
+model_path: Qwen/Qwen3-8B
+# model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft
diff --git a/conf/mcp/python.json b/conf/mcp/python.json
@@ -0,0 +1,11 @@
+{
+    "mcpServers": {
+        "python_exec": {
+            "command": "bash",
+            "args": [
+                "-c",
+                "deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio"
+                ]
+        }
+    }
+}