
Commit de9cfba

fix
Signed-off-by: Zhenhuan Chen <[email protected]>
1 parent c033d5b commit de9cfba

8 files changed (+27, -42 lines)

examples/scaffolding/contrib/Dynasor/scaffolding_dynasor_run.py

Lines changed: 6 additions & 6 deletions
@@ -58,13 +58,13 @@ async def task(prompt: str):
             async for result in llm.generate_async(prompt):
                 i += 1
                 print(">>>", i, result)
-                async for output in result.output:
+                async for output in result.cur_output:
                     print(">>>", i, len(output.outputs[0].token_ids), "\n",
                           output.outputs[0].text)
-            print(
-                f">>> final output {len(result.output.outputs[0].token_ids)}\n",
-                result.output.outputs[0].text)
+            print(f">>> final output {len(result.outputs[0].token_ids)}\n",
+                  result.outputs[0].text)
 
+        # Need to provide LLM's event loop to get results in the middle of the whole process.
         asyncio.run_coroutine_threadsafe(task(prompts[0]), llm.loop).result()
     else:
         results = llm.generate(prompts)

@@ -83,8 +83,8 @@ def main():
 
     prompts = [
         "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?\r\n\r\n",
-        # "There exist real numbers $x$ and $y$, both greater than 1, such that $\\log_x\\left(y^x\\right)=\\log_y\\left(x^{4y}\\right)=10$. Find $xy$.",
-        # "Find the largest possible real part of \\[(75+117i)z+\\frac{96+144i}{z}\\]where $z$ is a complex number with $|z|=4$.",
+        "There exist real numbers $x$ and $y$, both greater than 1, such that $\\log_x\\left(y^x\\right)=\\log_y\\left(x^{4y}\\right)=10$. Find $xy$.",
+        "Find the largest possible real part of \\[(75+117i)z+\\frac{96+144i}{z}\\]where $z$ is a complex number with $|z|=4$.",
     ]
 
     llm_worker = TRTLLMWorker.init_with_new_llm(
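For reference, the streaming pattern this diff moves the example to boils down to the following. This is a minimal sketch, assuming an already-constructed ScaffoldingLlm instance named llm and the prompts list from the example; it is not a standalone script.

    import asyncio

    async def consume(prompt: str):
        async for result in llm.generate_async(prompt):
            # cur_output replaces the old output attribute on the result and is
            # itself async-iterable while tokens are still streaming in.
            async for output in result.cur_output:
                print(len(output.outputs[0].token_ids), output.outputs[0].text)
        # The new outputs property forwards to cur_output.outputs, so the final
        # text is reachable directly on the result after iteration finishes.
        print(result.outputs[0].text)

    # Run the coroutine on the LLM's own event loop so intermediate results are
    # observable mid-generation.
    asyncio.run_coroutine_threadsafe(consume(prompts[0]), llm.loop).result()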

examples/scaffolding/run_basic_generation.py

Lines changed: 10 additions & 7 deletions
@@ -19,16 +19,19 @@ def parse_arguments():
 
 
 def test_sync(prompts, proposer_worker):
-    prototype_controller = NativeGenerationController(
-        sampling_params={"temperature": 0.9})
+    prototype_controller = NativeGenerationController(sampling_params={
+        "temperature": 0.9,
+        "max_tokens": 1024,
+    })
 
     llm = ScaffoldingLlm(
         prototype_controller,
         {NativeGenerationController.WorkerTag.GENERATION: proposer_worker},
     )
     results = llm.generate(prompts)
     for result in results:
-        print(result.output.outputs[0].text)
+        print(len(result.outputs[0].token_ids))
+        print(result.outputs[0].text)
     print(f'main shutting down...')
     llm.shutdown()
     print(f'worker shutting down...')

@@ -42,7 +45,7 @@ async def test_async_func(prompt, proposer_worker):
     prototype_controller = NativeGenerationController(
         sampling_params={
             "temperature": 0.9,
-            "max_tokens": 64
+            "max_tokens": 1024,
         },
         streaming=True,
     )

@@ -55,11 +58,11 @@ async def test_async_func(prompt, proposer_worker):
     async for result in llm.generate_async(prompt):
         i += 1
         print(">>>", i, result)
-        async for output in result.output:
+        async for output in result.cur_output:
             print(">>>", i, len(output.outputs[0].token_ids), "\n",
                   output.outputs[0].text)
-    print(f">>> final output {len(output.outputs[0].token_ids)}\n",
-          output.outputs[0].text)
+    print(f">>> final output {len(result.outputs[0].token_ids)}\n",
+          result.outputs[0].text)
 
     print(f'main shutting down...')
     llm.shutdown()
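The non-streaming path changes in the same direction: results are read through the new outputs property rather than result.output. A minimal sketch of the updated synchronous flow, assuming prompts and proposer_worker are constructed as in the script (the sampling values are just the ones used above):

    prototype_controller = NativeGenerationController(sampling_params={
        "temperature": 0.9,
        # max_tokens is set explicitly here; this commit changes the
        # GenerationTask default from 2048 to None (see task.py below).
        "max_tokens": 1024,
    })
    llm = ScaffoldingLlm(
        prototype_controller,
        {NativeGenerationController.WorkerTag.GENERATION: proposer_worker},
    )
    for result in llm.generate(prompts):
        # result.outputs forwards to cur_output.outputs on ScaffoldingResult.
        print(len(result.outputs[0].token_ids))
        print(result.outputs[0].text)
    llm.shutdown()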

tensorrt_llm/scaffolding/contrib/Dynasor/dynasor_controller.py

Lines changed: 1 addition & 6 deletions
@@ -102,8 +102,6 @@ def process(self, tasks: List[GenerationTask], **kwargs):
             probe_task.input_str = current_prompt + self.probe_suffix
 
             # For the probe task, append the suffix to force a chain-of-thought leading to an answer.
-            print("[DynasorGenerationController] probe_task")
-            # yield [probe_task, proposer_task]
             yield [proposer_task, probe_task]
 
             # Retrieve the output from the probe task.

@@ -141,10 +139,7 @@ def process(self, tasks: List[GenerationTask], **kwargs):
                                 probe_answers[-1] + "}\n\\]")
                 return
 
-            # if not confident, do another round of generation
-            # print("[DynasorGenerationController] proposer_task")
-            # yield [proposer_task]
-
+            # If not confident, do another round of generation
             # Append the newly generated text from the proposer to the current prompt for the next iteration.
             current_prompt += proposer_task.output_str
 
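The yield [proposer_task, probe_task] line reflects the controller protocol visible in controller.py below: process is a generator that yields lists of tasks, the scaffolding layer executes each yielded batch (here the proposer and probe run in the same round), and the generator resumes with the tasks' outputs filled in. A minimal sketch of one such round (hypothetical simplified controller; max_iterations and is_confident are assumed names, the field names follow the diff):

    # Hypothetical, simplified controller loop following the yield protocol above.
    def process(self, tasks, **kwargs):
        current_prompt = tasks[0].input_str
        for _ in range(self.max_iterations):  # assumed iteration bound
            proposer_task = GenerationTask()
            proposer_task.input_str = current_prompt
            probe_task = GenerationTask()
            probe_task.input_str = current_prompt + self.probe_suffix
            # Hand both tasks to the scaffolding layer; execution happens here.
            yield [proposer_task, probe_task]
            # After resuming, output_str is populated on each task.
            if self.is_confident(probe_task.output_str):  # assumed helper
                return
            current_prompt += proposer_task.output_str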

tensorrt_llm/scaffolding/controller.py

Lines changed: 0 additions & 3 deletions
@@ -11,8 +11,6 @@
 from tensorrt_llm.scaffolding.math_utils import get_digit_majority_vote_result
 from tensorrt_llm.scaffolding.task import GenerationTask, Task
 
-# from .result import ScaffoldingOutput
-
 
 class Controller(ABC):

@@ -27,7 +25,6 @@ def generate(self, prompt: str, **kwargs) -> GenerationResult:
 
         yield from self.process([task], **kwargs)
 
-        # print("[Controller.generate] task.output in generate", task.result)
         return task.create_scaffolding_output()
 
     def process(self, tasks: List[Task], **kwargs):

tensorrt_llm/scaffolding/result.py

Lines changed: 7 additions & 10 deletions
@@ -17,35 +17,33 @@ class ScaffoldingResult:
     def __init__(self, streaming_event: Optional[asyncio.Event] = None):
         super().__init__()
         self.aqueue = asyncio.Queue()
-        self.output = None
+        self.cur_output = None
         self._done = False
         self.task_collections = None
         self.streaming_event = streaming_event
 
     def set_output(self, output: GenerationResult):
-        print("[set_output] called")
         self.aqueue.put_nowait(output)
         self._done = True
-        print("[set_output] put")
 
     async def set_output_async(self, output: GenerationResult):
-        print("[set_output_async] called")
         await self.aqueue.put(output)
-        print("[set_output_async] put")
 
     def set_task_collections(self, task_collections: Mapping[str,
                                                              "TaskCollection"]):
         self.task_collections = task_collections
 
+    @property
+    def outputs(self):
+        return self.cur_output.outputs if self.cur_output else None
+
     @property
     def finished(self) -> bool:
-        return self.output is not None and self.output.finished
+        return self.cur_output is not None and self.cur_output.finished
 
     async def _aresult_step(self):
-        print("[_aresult_step] waiting for response")
         # TODO: error handling or raise exception?
         response = await self.aqueue.get()
-        print("[_aresult_step] response received")
         if response is None:
             raise Exception("ScaffoldingLlm execution failed")
         self._handle_response(response)

@@ -79,7 +77,6 @@ def __aiter__(self):
 
     async def __anext__(self):
         if self.finished:
-            print("[_aresult_step] streaming_event set")
             self.streaming_event.set() if self.streaming_event else None
         if self._done and self.finished:
             raise StopAsyncIteration

@@ -88,4 +85,4 @@ async def __anext__(self):
         return self
 
     def _handle_response(self, response: GenerationResult):
-        self.output = response  # .outputs[0].text
+        self.cur_output = response
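Taken together, the rename from output to cur_output and the new forwarding property give callers one accessor that works both mid-stream and after completion. A stripped-down sketch of just that accessor pattern (ResultView is an illustrative stand-in; the real ScaffoldingResult also manages the asyncio.Queue and streaming event shown above):

    class ResultView:
        def __init__(self):
            self.cur_output = None  # most recent GenerationResult received

        @property
        def outputs(self):
            # Forward to the wrapped result so callers can write
            # result.outputs[0] without caring whether generation is finished.
            return self.cur_output.outputs if self.cur_output else None

        @property
        def finished(self) -> bool:
            return self.cur_output is not None and self.cur_output.finished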

tensorrt_llm/scaffolding/scaffolding_llm.py

Lines changed: 0 additions & 5 deletions
@@ -30,7 +30,6 @@ def __init__(
         self.workers = workers
 
         self.loop = self._get_loop()
-        print("own_loop:", self.own_loop)
         asyncio.set_event_loop(self.loop)
         self.task_queue = asyncio.Queue()
         self.main_loop_stop_event = asyncio.Event()

@@ -85,7 +84,6 @@ async def _handle_task_list(self,
         for task in tasks:
             if task.streaming:
                 await request.result.set_output_async(task.result)
-                print("[_handle_task_list] streaming_event wait")
                 self.streaming_event.clear()
                 await self.streaming_event.wait()

@@ -113,8 +111,6 @@ async def _handle_single_request(self, request: ScaffoldingRequest):
         finally:
             self.running_req_count -= 1
             self._maybe_schedule()
-            print(f"[Request finished] running_req_count: "
-                  f"{self.running_req_count}")
 
     def _create_controller_generator(self, request: ScaffoldingRequest):
         """Create a generator wrapper for the controller."""

@@ -141,7 +137,6 @@ def _maybe_schedule(self, request: ScaffoldingRequest = None):
 
         while (self.running_req_count < self.max_parallel_requests
                and self.pending_queue):
-            print(f"[Scheduling] running_req_count: {self.running_req_count}")
             next_request = self.pending_queue.popleft()
             self._schedule_request(next_request)
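The deleted prints were tracing the streaming handshake between _handle_task_list (producer side) and ScaffoldingResult.__anext__ (consumer side): the producer enqueues a partial result, clears the event, and waits; the consumer drains the queue and sets the event to release the producer. A self-contained sketch of that handshake with hypothetical names, not the actual classes:

    import asyncio

    async def producer(queue: asyncio.Queue, event: asyncio.Event, chunks):
        for chunk in chunks:
            await queue.put(chunk)  # like request.result.set_output_async(...)
            event.clear()
            await event.wait()      # block until the consumer catches up

    async def consumer(queue: asyncio.Queue, event: asyncio.Event, n: int):
        for _ in range(n):
            chunk = await queue.get()  # like ScaffoldingResult._aresult_step()
            print("got", chunk)
            event.set()                # like __anext__ releasing the producer

    async def main():
        queue, event = asyncio.Queue(), asyncio.Event()
        await asyncio.gather(producer(queue, event, ["a", "b"]),
                             consumer(queue, event, 2))

    asyncio.run(main())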

tensorrt_llm/scaffolding/task.py

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ class GenerationTask(Task):
     frequency_penalty: Optional[float] = 0.0
     logit_bias: Optional[Dict[str, float]] = None
     num_logprobs: Optional[int] = None
-    max_tokens: Optional[int] = 2048
+    max_tokens: Optional[int] = None
     n: int = 1
     presence_penalty: Optional[float] = 0.0
     seed: Optional[int] = None
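Changing the max_tokens default from 2048 to None makes "no explicit limit" representable, so a worker can skip the field instead of always forwarding 2048. A hedged sketch of that kind of filtering (to_sampling_kwargs is a hypothetical helper; the actual body of convert_task_params is not shown in this commit):

    # Hypothetical helper: build sampling kwargs from a task, dropping fields
    # the controller never set (possible now that max_tokens defaults to None).
    def to_sampling_kwargs(task) -> dict:
        fields = ("temperature", "top_p", "max_tokens")
        return {name: getattr(task, name)
                for name in fields if getattr(task, name, None) is not None}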

tensorrt_llm/scaffolding/worker.py

Lines changed: 2 additions & 4 deletions
@@ -187,18 +187,16 @@ def convert_task_params(self, task: GenerationTask):
     async def generation_handler(self, task: GenerationTask) -> TaskStatus:
         sampling_params = self.convert_task_params(task)
 
-        print("[generation_handler] task.streaming:", task.streaming)
+        # If the task is streaming, return the result directly for async
+        # iteration outside. Otherwise, wait for completion.
         if task.streaming:
-            # If the task is streaming, we need to use the async generate method
-            # and handle the streaming output.
             result = self.llm.generate_async(task.input_str,
                                              sampling_params=sampling_params,
                                              streaming=True)
         else:
             result = await self.llm.generate_async(
                 task.input_str, sampling_params=sampling_params)
         task.result = result
-        # print("[generation_handler] task.result:", task.result)
 
         # TODO: error handle
         return TaskStatus.SUCCESS
