@@ -2244,7 +2244,7 @@ def generate_thread(args, vllm_engines, resume_training_step, stop_event, genera
 def calculate_mfu_mbu(
     model_dims: utils.ModelDims, packed_data: PackedData, train_duration: float, args: Args
 ) -> dict[str, float]:
-    """Calculate Model FLOPs Utilization (MFU) and Model Bandwidth Utilization (MBU).
+    """Calculate Model FLOPs Utilization (MFU) for training.
 
     Args:
         model_dims: Model dimensions for FLOPs/memory calculations
@@ -2253,7 +2253,7 @@ def calculate_mfu_mbu(
         args: Training arguments
 
     Returns:
-        Dictionary with 'mfu' and 'mbu' keys as percentages
+        Dictionary with 'mfu' key as percentage (MBU not reported for training)
     """
     assert model_dims is not None, "model_dims must not be None"
     assert packed_data is not None, "packed_data must not be None"
@@ -2264,42 +2264,43 @@ def calculate_mfu_mbu(
     # Get GPU specifications
     device_name = utils.get_device_name(torch.cuda.get_device_name(0))
     device_flops = utils.GPU_SPECS[device_name]["flops"]
-    device_memory_bandwidth = utils.GPU_SPECS[device_name]["memory_bandwidth"]
-
-    # For GRPO, we have multiple samples per prompt
-    # prompt_lengths contains lengths for unique prompts
-    # response_lengths contains lengths for all samples (num_prompts * samples_per_prompt)
-    assert (
-        len(packed_data.response_lengths) == len(packed_data.prompt_lengths) * args.num_samples_per_prompt_rollout
-    ), (
-        f"Expected {len(packed_data.prompt_lengths) * args.num_samples_per_prompt_rollout} response lengths, "
-        f"got {len(packed_data.response_lengths)}"
+
+    # For training, we need to calculate total sequence lengths (prompt + response)
+    # This represents the full sequence that the model is trained on
+    total_sequence_lengths = []
+    response_idx = 0
+    for prompt_len in packed_data.prompt_lengths:
+        # For each unique prompt, get all its response lengths
+        for _ in range(args.num_samples_per_prompt_rollout):
+            response_len = packed_data.response_lengths[response_idx]
+            total_sequence_lengths.append(prompt_len + response_len)
+            response_idx += 1
+
+    # Create a new ModelDims instance with is_training=True
+    training_model_dims = utils.ModelDims(
+        num_layers=model_dims.num_layers,
+        hidden_size=model_dims.hidden_size,
+        intermediate_size=model_dims.intermediate_size,
+        vocab_size=model_dims.vocab_size,
+        num_attn_heads=model_dims.num_attn_heads,
+        num_kv_heads=model_dims.num_kv_heads,
+        is_training=True,
     )
 
-    # Calculate FLOPs with proper handling of samples_per_prompt
-    # Note: prompt prefill is only done once per unique prompt, not per sample
-    total_flops = model_dims.flops(
-        prompt_lengths=packed_data.prompt_lengths,
-        response_lengths=packed_data.response_lengths,
-        samples_per_prompt=args.num_samples_per_prompt_rollout,
+    # Calculate FLOPs for training (forward + backward + gradient)
+    # Pass the total sequence lengths as prompt_lengths, with response_lengths=None
+    total_flops = training_model_dims.flops(
+        prompt_lengths=total_sequence_lengths,
+        response_lengths=None,  # None for training mode
+        samples_per_prompt=1,  # Each sequence is treated independently for training
     )
 
     # MFU = (FLOPs / time) / peak_FLOPS * 100
     flops_per_second = total_flops / train_duration
     mfu = 100 * flops_per_second / device_flops
 
-    # Calculate memory bandwidth utilization
-    total_memory_bytes = model_dims.memory_bytes(
-        prompt_lengths=packed_data.prompt_lengths,
-        response_lengths=packed_data.response_lengths,
-        samples_per_prompt=args.num_samples_per_prompt_rollout,
-    )
-
-    # MBU = (Memory bytes / time) / peak_bandwidth * 100
-    bytes_per_second = total_memory_bytes / train_duration
-    mbu = 100 * bytes_per_second / device_memory_bandwidth
-
-    return {"mfu": mfu, "mbu": mbu}
+    # MBU is not reported during training as requested
+    return {"mfu": mfu}
 
 
 def one_training_step(
@@ -2383,7 +2384,6 @@ def one_training_step(
23832384 "epoch" : episode / args .num_samples_per_prompt_rollout / len (train_dataset ),
23842385 "learner_tokens_per_second" : num_total_tokens / total_time ,
23852386 "learner_mfu" : utilization_metrics ["mfu" ],
2386- "learner_mbu" : utilization_metrics ["mbu" ],
23872387 "time/total" : total_time ,
23882388 "time/training" : train_timer .duration ,
23892389 "time/saving" : save_time ,