 from transformers import AutoModelForCausalLM, PreTrainedModel, PreTrainedTokenizer, get_scheduler
 from transformers.integrations import HfDeepSpeedConfig
 
-from open_instruct import logger_utils, rl_utils2, vllm_utils3
+from open_instruct import logger_utils, vllm_utils3
 from open_instruct.actor_manager import ActorManager
 from open_instruct.dataset_transformation import (
     GROUND_TRUTHS_KEY,
@@ -111,6 +111,7 @@
     push_folder_to_hub,
 )
 from open_instruct.queue_types import GenerationResult, PromptRequest, RequestInfo, TokenStatistics
+from open_instruct.rl_utils2 import PackedSequences, Timer, pack_sequences
 from open_instruct.utils import (
     ArgumentParserPlus,
     BeakerRuntimeConfig,
@@ -149,7 +150,7 @@ class ShutdownSentinel:
 class PackedData:
     """Container for packed sequences and associated metadata."""
 
-    packed_sequences: rl_utils2.PackedSequences
+    packed_sequences: PackedSequences
     collated_data: list  # Collated training data for each device
     metrics: dict  # Training metrics
     responses_count: int  # Number of responses
@@ -931,7 +932,7 @@ def train(
 
         # Calculate the logprob of the reference policy
         collated_ref_logprobs = []
-        with rl_utils2.Timer("Inference Calculation", noop=self.rank != 0):
+        with Timer("Inference Calculation", noop=self.rank != 0):
             with torch.no_grad():
                 for i in range(len(collated_query_responses)):
                     query_response = collated_query_responses[i]
@@ -961,7 +962,7 @@ def train(
         # from the generator (note that async mode means these are a bit diff!)
         old_logprobs = [None for _ in range(len(collated_query_responses))]
         if num_mini_batches > 1:
-            with rl_utils2.Timer("Old logprobs Calculation", noop=self.rank != 0):
+            with Timer("Old logprobs Calculation", noop=self.rank != 0):
                 with torch.no_grad():
                     for i in range(len(collated_query_responses)):
                         query_response = collated_query_responses[i]
@@ -988,7 +989,7 @@ def train(
 
         local_step = 0
         # Do multiple epochs of training on on-policy data (PPO-style), with a fresh random shuffle in each epoch
-        with rl_utils2.Timer("[Training Processes] Loss calculation", noop=self.rank != 0):
+        with Timer("[Training Processes] Loss calculation", noop=self.rank != 0):
             kl1_stats = torch.zeros(len(collated_query_responses))
             kl2_stats = torch.zeros(len(collated_query_responses))
             kl3_stats = torch.zeros(len(collated_query_responses))
@@ -1411,7 +1412,6 @@ def accumulate_inference_batches(
     all_ground_truths = []
     all_datasets = []
     all_raw_queries = []
-    all_indices = []
     for i in tqdm(
         range(num_prompts),
         total=num_prompts,
@@ -1438,7 +1438,6 @@ def accumulate_inference_batches(
         all_ground_truths.append(ground_truth)
         all_datasets.append(dataset)
         all_raw_queries.append(raw_query)
-        all_indices.append(result.dataset_index)
 
     # Combine all results into a single GenerationResult
     combined_responses = []
@@ -1495,13 +1494,13 @@ def accumulate_inference_batches(
     if actor_manager is not None:
         ray.get(actor_manager.report_token_statistics.remote(accumulated_stats))
 
-    # Create batch with preserved dataset indices
+    # Note: We don't have dataset_indices here, but they're not needed for the returned batch
     batch = Batch(
         queries=all_queries,
         ground_truths=all_ground_truths,
         datasets=all_datasets,
         raw_queries=all_raw_queries,
-        indices=all_indices,  # Preserve the dataset indices for MFU/MBU calculations
+        indices=None,  # Not meaningful for combined results
     )
     return combined_result, batch
 
@@ -1520,7 +1519,7 @@ def data_preparation_thread(
 ):
     for training_step in range(resume_training_step, num_training_steps + 1):
         # Streaming accumulation: collect results as they arrive
-        with rl_utils2.Timer("🚀 [Data Preparation Thread] Getting response ids") as timer:
+        with Timer("🚀 [Data Preparation Thread] Getting response ids") as timer:
             result, batch = accumulate_inference_batches(
                 inference_results_Q,
                 pending_queries_map,
@@ -1562,14 +1561,14 @@ def data_preparation_thread(
             ):
                 result.responses[i].append(tokenizer.eos_token_id)
                 result.masks[i].append(1)  # never mask the eos token for
-        with rl_utils2.Timer("🔥 [Data Preparation Thread] Decoding responses", noop=True):
+        with Timer("🔥 [Data Preparation Thread] Decoding responses", noop=True):
             decoded_responses = tokenizer.batch_decode(result.responses, skip_special_tokens=True)
             decoded_queries = batch.raw_queries
             stop_rate = sum(int(finish_reason == "stop") for finish_reason in result.finish_reasons) / len(
                 result.finish_reasons
             )
 
-        with rl_utils2.Timer("💰 [Data Preparation Thread] Calculating rewards and advantages"):
+        with Timer("💰 [Data Preparation Thread] Calculating rewards and advantages"):
             scores, reward_metrics = asyncio.run(
                 reward_fn(
                     result.responses,
@@ -1593,7 +1592,7 @@ def data_preparation_thread(
             else:
                 raise ValueError(f"Invalid advantage normalization type: {args.advantage_normalization_type}")
 
-        with rl_utils2.Timer("📦 [Data Preparation Thread] Filtering sequences"):
+        with Timer("📦 [Data Preparation Thread] Filtering sequences"):
             # Here we get the max possible score for each prompt, and see how many prompts are unsolved
             max_possible_score = 0
             if args.apply_verifiable_reward:
@@ -1640,7 +1639,7 @@ def data_preparation_thread(
             finish_reasons = [finish_reasons[i] for i in stop_idxes]
 
         if args.fill_completions:
-            with rl_utils2.Timer("⏱ [Data Preparation Thread] Refill completions"):
+            with Timer("⏱ [Data Preparation Thread] Refill completions"):
                 current_batch_size = len(scores)
                 original_prompt_cnt = original_batch_size // args.num_samples_per_prompt_rollout
                 current_prompt_cnt = current_batch_size // args.num_samples_per_prompt_rollout
@@ -1694,8 +1693,8 @@ def data_preparation_thread(
                 f"({all_zero_groups / total_groups:.1%})"
             )
 
-        with rl_utils2.Timer("📦 [Data Preparation Thread] Packing sequences"):
-            packed_sequences = rl_utils2.pack_sequences(
+        with Timer("📦 [Data Preparation Thread] Packing sequences"):
+            packed_sequences = pack_sequences(
                 queries=batch.queries,
                 responses=responses,
                 masks=masks,
@@ -1716,7 +1715,7 @@ def data_preparation_thread(
         # if we have less batches than world size, we need to pad out so each world is fine
         # ideally, you should avoid this since its wasting computation.
         if args.allow_world_padding:
-            with rl_utils2.Timer("🤺 [Data Preparation Thread] Padding sequences for world size"):
+            with Timer("🤺 [Data Preparation Thread] Padding sequences for world size"):
                 shortfall = args.world_size - len(packed_sequences.query_responses)
                 if shortfall > 0:
                     logger.warning(
@@ -1738,7 +1737,7 @@ def data_preparation_thread(
                         packed_sequences.response_masks.append(dummy_response_mask)
                         packed_sequences.advantages.append(dummy_advantage)
 
-        with rl_utils2.Timer("🔄 [Data Preparation Thread] Prepare collated data for each worker"):
+        with Timer("🔄 [Data Preparation Thread] Prepare collated data for each worker"):
             B = (
                 len(packed_sequences.query_responses) // args.world_size
             )  # essentially doing `drop_last=True`, which is fine.
@@ -1861,15 +1860,9 @@ def data_preparation_thread(
             logger.warning(f"No responses in batch {training_step}.")
 
         # Put the packed sequences and metrics into the output queue
-        # For MFU/MBU calculations, we need unique prompt lengths, not repeated ones
-        # Use indices to identify unique prompts
-        seen_indices = set()
-        unique_queries = []
-        for idx, query in zip(batch.indices, batch.queries):
-            if idx not in seen_indices:
-                seen_indices.add(idx)
-                unique_queries.append(query)
-        unique_prompt_lengths = [len(q) for q in unique_queries]
+        # For training MFU, we need all prompt lengths (including repeated ones)
+        # since we're calculating total tokens processed during training
+        prompt_lengths = [len(q) for q in batch.queries]
 
         packed_sequences_Q.put(
             PackedData(
@@ -1879,7 +1872,7 @@ def data_preparation_thread(
                 responses_count=len(responses),
                 num_new_tokens=num_new_tokens,
                 batch_size=B,
-                prompt_lengths=unique_prompt_lengths,
+                prompt_lengths=prompt_lengths,
                 response_lengths=[len(r) for r in responses],
             )
         )
@@ -2143,7 +2136,7 @@ def load_data_from_packing_thread(
     Returns:
         Tuple of (collated_data, data_thread_metrics, num_total_tokens, packed_data)
     """
-    with rl_utils2.Timer("[Main Thread] 📦 Getting packed sequences from thread") as timer:
+    with Timer("[Main Thread] 📦 Getting packed sequences from thread") as timer:
         while True:
             if stop_event.is_set():
                 logger.warning("[Main Thread] Stop event detected while waiting for packed sequences")
@@ -2189,7 +2182,7 @@ def weight_sync_thread(
         # Clear the event for next iteration
         weight_sync_trigger_event.clear()
 
-        with rl_utils2.Timer("[Weight Sync]") as timer:
+        with Timer("[Weight Sync]") as timer:
             logger.debug("[Weight Sync Thread] Starting weight sync")
 
             # Set actors to stop
@@ -2223,7 +2216,7 @@ def generate_thread(args, vllm_engines, resume_training_step, stop_event, genera
     """Thread function that repeatedly calls process_from_queue on vllm engines."""
     logger.info("[Generate Thread] 🚀 Starting generation thread")
     while not stop_event.is_set():
-        with rl_utils2.Timer("🔥 Generation time") as timer:
+        with Timer("🔥 Generation time") as timer:
             processed_results = ray_get_with_progress(
                 [engine.process_from_queue.remote(timeout=20) for engine in vllm_engines],
                 desc="[Generate Thread] Waiting for vLLM engines to process",
@@ -2267,14 +2260,10 @@ def calculate_utilization_metrics(
 
     # For training, we need to calculate total sequence lengths (prompt + response)
     # This represents the full sequence that the model is trained on
+    # Since we now have all prompts (including repeated ones), we can directly zip
     total_sequence_lengths = []
-    response_idx = 0
-    for prompt_len in packed_data.prompt_lengths:
-        # For each unique prompt, get all its response lengths
-        for _ in range(args.num_samples_per_prompt_rollout):
-            response_len = packed_data.response_lengths[response_idx]
-            total_sequence_lengths.append(prompt_len + response_len)
-            response_idx += 1
+    for prompt_len, response_len in zip(packed_data.prompt_lengths, packed_data.response_lengths):
+        total_sequence_lengths.append(prompt_len + response_len)
 
     # Create a new ModelDims instance with is_training=True
     training_model_dims = utils.ModelDims(
@@ -2299,7 +2288,8 @@ def calculate_utilization_metrics(
     flops_per_second = total_flops / train_duration
     mfu = 100 * flops_per_second / device_flops
 
-    # MBU is not reported during training as requested
+    # We currently only report a single metric. This will expand to include actor MFU/MBU.
+    # We don't include MBU as it's currently broken for training.
    return {"mfu": mfu}
 
 
@@ -2324,7 +2314,7 @@ def one_training_step(
 ) -> None:
     """Train the model for one step."""
     update_ref_policy_future = []
-    with rl_utils2.Timer("[Main Thread] 🗡️ Training") as train_timer:
+    with Timer("[Main Thread] 🗡️ Training") as train_timer:
         metrics_list: List[dict[str, float]] = ray_get_with_progress(
             [
                 policy_group.models[i].train.remote(
@@ -2345,7 +2335,7 @@ def one_training_step(
 
     save_time = 0
     if args.save_freq > 0 and training_step % args.save_freq == 0 and (args.eval_on_step_0 or training_step > 1):
-        with rl_utils2.Timer("[Main Thread] 🗡️ Saving model") as timer:
+        with Timer("[Main Thread] 🗡️ Saving model") as timer:
             checkpoint_dir = f"{args.output_dir}_checkpoints"
             step_dir = os.path.join(checkpoint_dir, f"step_{training_step}")
             logger.info(f"Saving model at step {training_step} to {step_dir}")
@@ -2365,7 +2355,7 @@ def one_training_step(
             save_time += timer.duration
 
     if len(update_ref_policy_future) > 0:
-        with rl_utils2.Timer("[Main Thread] 🔃 Updating reference policy"):
+        with Timer("[Main Thread] 🔃 Updating reference policy"):
             ray_get_with_progress(update_ref_policy_future, desc="Updating reference policy")
 
     ray.get(actor_manager.report_training_step_time.remote(train_timer.duration))
@@ -2499,7 +2489,7 @@ def save_final_model(
 ):
     """Save the final model and launch evaluation jobs if configured."""
     logger.info(f"Saving final model at step {training_step} to {args.output_dir}")
-    with rl_utils2.Timer("[Main Thread] 🗡️ Saving model"):
+    with Timer("[Main Thread] 🗡️ Saving model"):
         ray_get_with_progress(
             [
                 policy_group.models[i].save_model.remote(args.output_dir, chat_template_name, tokenizer)
@@ -2558,7 +2548,7 @@ async def reward_fn(
     metrics = {}
 
     if args.apply_r1_style_format_reward:
-        with rl_utils2.Timer("[Data Preparation Thread] Calculating rewards -- 🧮 Calculating format reward"):
+        with Timer("[Data Preparation Thread] Calculating rewards -- 🧮 Calculating format reward"):
             format_scores = soft_format_reward_func(decoded_responses, args.r1_style_format_reward)
             if len(format_scores) != len(scores):
                 raise ValueError(f"{len(format_scores)=} != {len(scores)=}")
@@ -2567,7 +2557,7 @@ async def reward_fn(
             metrics["val/format_scores"] = np.array(format_scores).mean()
 
     if args.apply_verifiable_reward:
-        with rl_utils2.Timer("[Data Preparation Thread] Calculating rewards -- 🏆 Applying verifiable reward"):
+        with Timer("[Data Preparation Thread] Calculating rewards -- 🏆 Applying verifiable reward"):
             verifiable_rewards, per_func_rewards = await apply_verifiable_reward(
                 reward_fn_mapping,
                 responses,
@@ -2603,7 +2593,7 @@ async def reward_fn(
 
     # this gets applied at the very end since it replaces (rather than adds to) the existing reward.
     if args.non_stop_penalty:
-        with rl_utils2.Timer("[Data Preparation Thread] Calculating rewards -- 🦖 Applying non stop penalty"):
+        with Timer("[Data Preparation Thread] Calculating rewards -- 🦖 Applying non stop penalty"):
             assert len(finish_reasons) == len(scores)
             for i in range(len(finish_reasons)):
                 if finish_reasons[i] != "stop":
@@ -2845,7 +2835,7 @@ def health_check_fn():
             and training_step % args.checkpoint_state_freq == 0
             and args.checkpoint_state_dir is not None
         ):
-            with rl_utils2.Timer("[Main Thread] 🗡️ Saving checkpoint state"):
+            with Timer("[Main Thread] 🗡️ Saving checkpoint state"):
                 # Save comprehensive client state including ShufflingIterator state
                 client_state = {
                     "training_step": training_step,
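The prompt-length change above can be illustrated with a small, self-contained sketch. This is only an illustration under the assumption that `batch.queries` already contains each prompt repeated once per sampled response, so `prompt_lengths` and `response_lengths` line up one-to-one; the `total_sequence_lengths` helper and the example data below are hypothetical and not part of the codebase.

```python
# Hypothetical sketch: with one prompt length per response, pairing prompt and
# response lengths is a direct zip; no index bookkeeping per unique prompt
# (the removed response_idx loop) is needed.
def total_sequence_lengths(prompt_lengths: list, response_lengths: list) -> list:
    assert len(prompt_lengths) == len(response_lengths), "one prompt length per response"
    return [p + r for p, r in zip(prompt_lengths, response_lengths)]

# Example with 2 prompts and 2 samples per prompt (prompts already repeated per sample).
queries = [[1, 2, 3], [1, 2, 3], [7, 8], [7, 8]]   # token ids, one entry per response
responses = [[9], [9, 9], [9], [9, 9, 9]]          # one sampled response each
prompt_lengths = [len(q) for q in queries]         # [3, 3, 2, 2]
response_lengths = [len(r) for r in responses]     # [1, 2, 1, 3]
print(total_sequence_lengths(prompt_lengths, response_lengths))  # [4, 5, 3, 5]
```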