
Commit 1f0439a

Fix OOM bug in lm eval
1 parent f111aea

File tree: 4 files changed (+36, -13 lines)


algoperf/random_utils.py

Lines changed: 2 additions & 2 deletions
@@ -35,13 +35,13 @@ def _signed_to_unsigned(seed: SeedType) -> SeedType:
 
 def _fold_in(seed: SeedType, data: Any) -> List[Union[SeedType, Any]]:
   rng = np.random.RandomState(seed=_signed_to_unsigned(seed))
-  new_seed = rng.randint(MIN_INT32, MAX_INT32, dtype=np.int32)
+  new_seed = rng.randint(MIN_INT32, MAX_INT32, dtype=np.uint32)
   return [new_seed, data]
 
 
 def _split(seed: SeedType, num: int = 2) -> SeedType:
   rng = np.random.RandomState(seed=_signed_to_unsigned(seed))
-  return rng.randint(MIN_INT32, MAX_INT32, dtype=np.int32, size=[num, 2])
+  return rng.randint(MIN_INT32, MAX_INT32, dtype=np.uint32, size=[num, 2])
 
 
 def _PRNGKey(seed: SeedType) -> SeedType:  # pylint: disable=invalid-name
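
The change above only swaps the dtype of the derived seeds from int32 to uint32. For context, here is a minimal sketch of the seed-folding pattern this file implements; signed_to_unsigned below is a hypothetical stand-in for the repo's _signed_to_unsigned, and the bounds are illustrative, not algoperf's MIN_INT32/MAX_INT32:

import numpy as np

def signed_to_unsigned(seed: int) -> int:
  # np.random.RandomState only accepts seeds in [0, 2**32 - 1], so map a
  # possibly negative signed seed into that range first.
  return seed % (2**32)

def fold_in(seed: int, data: int) -> list:
  # Deterministically derive a new seed from (seed, data): the same
  # inputs always produce the same output seed.
  rng = np.random.RandomState(seed=signed_to_unsigned(seed))
  new_seed = rng.randint(0, 2**32 - 1, dtype=np.uint32)
  return [new_seed, data]

print(fold_in(-42, 7))  # reproducible: prints the same pair every run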

algoperf/workloads/lm/lm_pytorch/workload.py

Lines changed: 21 additions & 7 deletions
@@ -1,5 +1,6 @@
 """LM workload implemented in PyTorch."""
 
+import contextlib
 from itertools import islice
 from typing import Any, Dict, Iterator, Optional, Tuple
 
@@ -8,7 +9,7 @@
 import torch.distributed as dist
 from torch.nn.parallel import DistributedDataParallel as DDP
 
-from algoperf import data_utils, param_utils, pytorch_utils, spec
+from algoperf import param_utils, pytorch_utils, spec
 from algoperf.workloads.lm.lm_pytorch.plainlm_model import (
   ModelConfig,
   Transformer,
@@ -72,12 +73,23 @@ def model_fn(
     del model_state, rng, update_batch_norm, dropout_rate
     model = params
 
-    # Convert one-hot inputs to token IDs if needed
-    inputs = augmented_and_preprocessed_input_batch['inputs']
-    if inputs.dim() == 3:  # one-hot encoded
+    # Set model to eval or train mode based on the mode parameter
+    if mode == spec.ForwardPassMode.EVAL:
+      model.eval()
+    elif mode == spec.ForwardPassMode.TRAIN:
+      model.train()
+    contexts = {
+      spec.ForwardPassMode.EVAL: torch.no_grad,
+      spec.ForwardPassMode.TRAIN: contextlib.nullcontext,
+    }
+    with contexts[mode]():
+      # Convert one-hot inputs to token IDs if needed
+      inputs = augmented_and_preprocessed_input_batch['inputs']
+      if inputs.dim() == 3:  # one-hot encoded
         inputs = inputs.argmax(dim=-1)
 
-    logits = model(inputs)
+      logits = model(inputs)
+
     return logits, None
 
   def _build_input_queue(
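
This hunk is the main part of the OOM fix: evaluation forward passes previously ran with autograd recording every intermediate activation. A minimal sketch of the same mode-plus-context pattern; run_forward is an illustrative name, not from the repo:

import contextlib
import torch
import torch.nn as nn

def run_forward(model: nn.Module, inputs: torch.Tensor, is_eval: bool):
  # Pick the module mode and a matching autograd context. Under
  # torch.no_grad() no computation graph is built, so activations are
  # freed as soon as the forward pass finishes.
  model.eval() if is_eval else model.train()
  ctx = torch.no_grad if is_eval else contextlib.nullcontext
  with ctx():
    return model(inputs)

model = nn.Linear(8, 2)
logits = run_forward(model, torch.randn(4, 8), is_eval=True)
print(logits.requires_grad)  # False: nothing to backprop, nothing retained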
@@ -90,12 +102,14 @@ def _build_input_queue(
       repeat_final_dataset: bool = False) -> Iterator[Dict[str, spec.Tensor]]:
     """Build an input queue for the given split."""
     local_batch_size = global_batch_size // N_GPUS
+    # In DDP mode, pass local_device_count=1 to prevent shard_and_maybe_pad_np
+    # from seeing all GPUs via torch.cuda.device_count()
     loader = get_data_iter(
       data_rng=data_rng,
       split=split,
       data_dir=data_dir,
       global_batch_size=local_batch_size,
-      num_batches=num_batches
+      num_batches=num_batches,
     )
     if USE_PYTORCH_DDP:
       loader = islice(loader, RANK, None, N_GPUS)
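
The islice call above is a round-robin shard for DDP: rank r consumes batches r, r + N_GPUS, r + 2 * N_GPUS, and so on from the same underlying iterator. A minimal sketch with illustrative values:

from itertools import islice

def shard_batches(loader, rank: int, world_size: int):
  # Rank r sees batches r, r + world_size, r + 2 * world_size, ...
  return islice(loader, rank, None, world_size)

batches = iter(range(10))  # stand-in for a batch iterator
print(list(shard_batches(batches, rank=1, world_size=4)))  # [1, 5, 9]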
@@ -104,7 +118,7 @@
       batch = {
         'inputs': torch.tensor(batch['inputs'], device=DEVICE, dtype=dtype),
         'targets': torch.tensor(batch['targets'], device=DEVICE, dtype=torch.int64),
-        'weights': None,
+        'weights': torch.tensor(batch['weights'], device=DEVICE, dtype=torch.float32) if batch['weights'] is not None else None,
       }
       yield batch
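
The batch's weights are now materialized as a tensor instead of being dropped as None, so the eval loss can weight out invalid positions. A minimal sketch of why such weights matter, assuming a simple per-token cross-entropy; the workload's compute_weighted_cross_entropy may differ:

import torch
import torch.nn.functional as F

logits = torch.randn(2, 3, 5)           # (batch, seq_len, vocab)
targets = torch.randint(0, 5, (2, 3))
weights = torch.tensor([[1., 1., 0.],   # 0 marks a padded position
                        [1., 1., 1.]])

# cross_entropy expects (batch, vocab, seq_len) for per-position losses.
per_token = F.cross_entropy(logits.transpose(1, 2), targets, reduction='none')
summed = (per_token * weights).sum()    # padded positions contribute 0
n_valid = weights.sum()
print((summed / n_valid).item())        # mean loss over valid tokens only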

algoperf/workloads/lm/workload.py

Lines changed: 11 additions & 4 deletions
@@ -73,7 +73,7 @@ def num_test_examples(self) -> int:
 
   @property
   def eval_batch_size(self) -> int:
-    return 64
+    return 256
 
   @property
   def train_mean(self):
@@ -138,6 +138,11 @@ def _eval_model_on_split(
   ) -> Dict[str, float]:
     """Run a full evaluation of the model."""
     num_batches = int(math.ceil(num_examples / global_batch_size))
+
+    # Handle edge case where num_batches is 0 (e.g., test split with 0 examples)
+    if num_batches == 0:
+      return {'loss': 0.0, 'ppl': 1.0}
+
     if split not in self._eval_iters:
       # These iterators will repeat indefinitely.
       self._eval_iters[split] = self._build_input_queue(
@@ -159,7 +164,7 @@ def _eval_model_on_split(
         eval_metrics[metric_name] += metric_value
 
     eval_results = self._normalize_eval_metrics(num_examples, eval_metrics)
-    eval_results['ppl'] = np.exp(eval_results['loss']).item()
+    eval_results['ppl'] = np.exp(eval_results['loss']).item()
     return eval_results
 
 
@@ -173,9 +178,11 @@ def _eval_batch(self,
       params, batch, model_state, spec.ForwardPassMode.EVAL, rng, False)
     # Calculate cross-entropy loss
     metrics = self.compute_weighted_cross_entropy(logits, batch['targets'], batch['weights'])
+    # CRITICAL: Detach tensors to free computation graph and activations
+    # Without this, all intermediate activations are kept in memory!
    return {
-      'loss': metrics['summed'],
-      'denominator': metrics['n_valid_examples'],
+      'loss': metrics['summed'].detach(),
+      'denominator': metrics['n_valid_examples'].detach(),
     }
algorithms/baselines/external_tuning/pytorch_nadamw_full_budget.py

Lines changed: 2 additions & 0 deletions
@@ -372,6 +372,8 @@ def get_batch_size(workload_name):
     return 128
   elif workload_name == 'mnist':
     return 16
+  elif workload_name == 'lm':
+    return 64
   else:
     raise ValueError(f'Unsupported workload name: {workload_name}.')
