@@ -1626,11 +1626,14 @@ def train(
             lr_scheduler=lr_scheduler,
         )
         # Evaluation
-        if (
-            neox_args.eval_interval
-            and iteration % neox_args.eval_interval == 0
-            and neox_args.do_valid
-        ):
+        is_eval_interval = neox_args.eval_interval and iteration % neox_args.eval_interval == 0
+        is_validation_configured = bool(neox_args.do_valid) or (isinstance(neox_args.eval_tasks, list) and len(neox_args.eval_tasks) > 0)
+        # if (
+        #     neox_args.eval_interval
+        #     and iteration % neox_args.eval_interval == 0
+        #     # and neox_args.do_valid
+        # ):
+        if is_eval_interval and is_validation_configured:
             prefix = "iteration {}".format(iteration)
             evaluate_and_print_results(
                 neox_args=neox_args,
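
To make the new trigger concrete, here is a minimal, self-contained sketch (not part of the diff); `should_evaluate` and the `SimpleNamespace` stand-in for `neox_args` are illustrative only. Evaluation now fires on interval boundaries whenever either a validation split or a non-empty list of eval-harness tasks is configured, rather than only when `do_valid` is set.

from types import SimpleNamespace

def should_evaluate(neox_args, iteration):
    # Fire only on eval_interval boundaries...
    is_eval_interval = bool(neox_args.eval_interval) and iteration % neox_args.eval_interval == 0
    # ...and only if there is something to evaluate: validation data or eval harness tasks.
    is_validation_configured = bool(neox_args.do_valid) or (
        isinstance(neox_args.eval_tasks, list) and len(neox_args.eval_tasks) > 0
    )
    return is_eval_interval and is_validation_configured

args = SimpleNamespace(eval_interval=100, do_valid=False, eval_tasks=["lambada"])
assert should_evaluate(args, iteration=200)      # harness tasks alone now trigger evaluation
assert not should_evaluate(args, iteration=150)  # off-interval iterations are skipped
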
@@ -1683,46 +1686,49 @@ def evaluate(
     if neox_args.char_level_ppl:
         data_iterator = CharCounter(data_iterator, neox_args.tokenizer)
 
-    with torch.no_grad():
-        iteration = 0
-        while iteration < neox_args.eval_iters:
-            iteration += 1
-            if verbose and iteration % neox_args.log_interval == 0:
-                print_rank_0(
-                    "Evaluating iter {}/{}".format(iteration, neox_args.eval_iters)
-                )
+    eval_results = {}
+    if data_iterator is not None:
+        with torch.no_grad():
+            iteration = 0
+            while iteration < neox_args.eval_iters:
+                iteration += 1
+                if verbose and iteration % neox_args.log_interval == 0:
+                    print_rank_0(
+                        "Evaluating iter {}/{}".format(iteration, neox_args.eval_iters)
+                    )
 
-            # although we're not accumulating gradients here, we count one iter as train_batch_size_per_gpu * g.a.s
-            # to be consistent with deepspeed's pipe parallel engine
-            # since pipe parallel already takes gradient_accumulation_steps into account - default to 1 here if pipe parallel is true
-            for _ in range(
-                1
-                if neox_args.is_pipe_parallel
-                else neox_args.gradient_accumulation_steps
-            ):
-                # Forward evaluation
-                loss, metric_dict = forward_step_fn(
-                    model=model,
-                    data_iterator=data_iterator,
-                    neox_args=neox_args,
-                    timers=timers,
-                    reference_model=reference_model,
-                )
-                losses.append(loss)
-                for key in metric_dict.keys():
-                    metric_dicts[key].append(metric_dict[key])
-            # When contiguous memory optimizations are enabled, the buffers
-            # allocated by the optimizations are deallocated during backward pass
-            # in the absence of backward pass the buffers should be reset after each
-            # forward pass
-            if neox_args.deepspeed and neox_args.deepspeed_activation_checkpointing:
-                deepspeed.checkpointing.reset()
-
-    # reduces losses across processes for logging & run eval harness tasks
-    eval_results = {"lm_loss": reduce_losses(losses).mean().item()}
-    for key in metric_dicts.keys():
-        eval_results[key] = reduce_losses(metric_dicts[key]).mean().item()
-    eval_results["lm_loss_ppl"] = math.exp(eval_results["lm_loss"])
+                # although we're not accumulating gradients here, we count one iter as train_batch_size_per_gpu * g.a.s
+                # to be consistent with deepspeed's pipe parallel engine
+                # since pipe parallel already takes gradient_accumulation_steps into account - default to 1 here if pipe parallel is true
+                for _ in range(
+                    1
+                    if neox_args.is_pipe_parallel
+                    else neox_args.gradient_accumulation_steps
+                ):
+                    # Forward evaluation
+                    loss, metric_dict = forward_step_fn(
+                        model=model,
+                        data_iterator=data_iterator,
+                        neox_args=neox_args,
+                        timers=timers,
+                        reference_model=reference_model,
+                    )
+                    losses.append(loss)
+                    for key in metric_dict.keys():
+                        metric_dicts[key].append(metric_dict[key])
+                # When contiguous memory optimizations are enabled, the buffers
+                # allocated by the optimizations are deallocated during backward pass
+                # in the absence of backward pass the buffers should be reset after each
+                # forward pass
+                if neox_args.deepspeed and neox_args.deepspeed_activation_checkpointing:
+                    deepspeed.checkpointing.reset()
+
+        # reduces losses across processes for logging & run eval harness tasks
+        eval_results = {"lm_loss": reduce_losses(losses).mean().item()}
+        for key in metric_dicts.keys():
+            eval_results[key] = reduce_losses(metric_dicts[key]).mean().item()
+
+        eval_results["lm_loss_ppl"] = math.exp(eval_results["lm_loss"])
 
     if neox_args.char_level_ppl:
         # calculate character level perplexity, if specified
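
For context, the guard added in `evaluate()` can be read in isolation as the sketch below (illustration only; `evaluate_sketch` and `forward_step_fn` are hypothetical, and a local mean stands in for the repository's `reduce_losses` all-reduce). Ranks whose `data_iterator` is `None` skip the no-grad loop entirely and return an empty result dict, while ranks with data still count one evaluation iteration as `gradient_accumulation_steps` forward passes unless pipe parallelism already folds those steps into a single engine call.

import math
import torch

def evaluate_sketch(model, data_iterator, eval_iters, grad_accum_steps,
                    is_pipe_parallel, forward_step_fn):
    # Illustration of the guarded evaluation flow; not the repository's evaluate().
    losses = []
    eval_results = {}
    if data_iterator is not None:  # ranks without an iterator go straight to the return
        with torch.no_grad():  # evaluation never needs gradients
            for _ in range(eval_iters):
                # Pipe-parallel engines already account for gradient accumulation,
                # so only loop over the accumulation steps in the non-pipe case.
                for _ in range(1 if is_pipe_parallel else grad_accum_steps):
                    # forward_step_fn is assumed to return a scalar loss tensor
                    losses.append(forward_step_fn(model, data_iterator))
        # Populate results only when this rank actually produced losses,
        # so the perplexity below never reads a missing "lm_loss" key.
        eval_results["lm_loss"] = torch.stack(losses).mean().item()
        eval_results["lm_loss_ppl"] = math.exp(eval_results["lm_loss"])
    return eval_results  # {} when there was nothing to evaluate on this rank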