Commit 4889c02

adding keep_module_scores flag
1 parent 0a44512 commit 4889c02

File tree: 1 file changed, +36 −56 lines

dspy/teleprompt/gepa/gepa.py

Lines changed: 36 additions & 56 deletions
@@ -197,60 +197,38 @@ def metric(
     # pareto_frontier is a list of scores, one for each task in the batch.
     ```
 
-    Args:
-        metric: The metric function to use for feedback and evaluation.
-        auto: The auto budget to use for the run. Options: "light", "medium", "heavy".
-        max_full_evals: The maximum number of full evaluations to perform.
-        max_metric_calls: The maximum number of metric calls to perform.
-        reflection_minibatch_size: The number of examples to use for reflection in a single GEPA step. Default is 3.
-        candidate_selection_strategy: The strategy to use for candidate selection. Default is "pareto",
-            which stochastically selects candidates from the Pareto frontier of all validation scores.
-            Options: "pareto", "current_best".
-        reflection_lm: The language model to use for reflection. Required parameter. GEPA benefits from
-            a strong reflection model. Consider using `dspy.LM(model='gpt-5', temperature=1.0, max_tokens=32000)`
-            for optimal performance.
-        skip_perfect_score: Whether to skip examples with perfect scores during reflection. Default is True.
-        add_format_failure_as_feedback: Whether to add format failures as feedback. Default is False.
-        use_merge: Whether to use merge-based optimization. Default is True.
-        max_merge_invocations: The maximum number of merge invocations to perform. Default is 5.
-        num_threads: The number of threads to use for evaluation with `Evaluate`. Optional.
-        failure_score: The score to assign to failed examples. Default is 0.0.
-        perfect_score: The maximum score achievable by the metric. Default is 1.0. Used by GEPA
-            to determine if all examples in a minibatch are perfect.
-        log_dir: The directory to save the logs. GEPA saves elaborate logs, along with all candidate
-            programs, in this directory. Running GEPA with the same `log_dir` will resume the run
-            from the last checkpoint.
-        track_stats: Whether to return detailed results and all proposed programs in the `detailed_results`
-            attribute of the optimized program. Default is False.
-        use_wandb: Whether to use wandb for logging. Default is False.
-        wandb_api_key: The API key to use for wandb. If not provided, wandb will use the API key
-            from the environment variable `WANDB_API_KEY`.
-        wandb_init_kwargs: Additional keyword arguments to pass to `wandb.init`.
-        track_best_outputs: Whether to track the best outputs on the validation set. track_stats must
-            be True if track_best_outputs is True. The optimized program's `detailed_results.best_outputs_valset`
-            will contain the best outputs for each task in the validation set.
-        seed: The random seed to use for reproducibility. Default is 0.
-
-    Note:
-        Budget Configuration: Exactly one of `auto`, `max_full_evals`, or `max_metric_calls` must be provided.
-        The `auto` parameter provides preset configurations: "light" for quick experimentation, "medium" for
-        balanced optimization, and "heavy" for thorough optimization.
-
-        Reflection Configuration: The `reflection_lm` parameter is required and should be a strong language model.
-        GEPA performs best with models like `dspy.LM(model='gpt-5', temperature=1.0, max_tokens=32000)`.
-        The reflection process analyzes failed examples to generate feedback for program improvement.
-
-        Merge Configuration: GEPA can merge successful program variants using `use_merge=True`.
-        The `max_merge_invocations` parameter controls how many merge attempts are made during optimization.
-
-        Evaluation Configuration: Use `num_threads` to parallelize evaluation. The `failure_score` and
-        `perfect_score` parameters help GEPA understand your metric's range and optimize accordingly.
-
-        Logging Configuration: Set `log_dir` to save detailed logs and enable checkpoint resuming.
-        Use `track_stats=True` to access detailed optimization results via the `detailed_results` attribute.
-        Enable `use_wandb=True` for experiment tracking and visualization.
-
-        Reproducibility: Set `seed` to ensure consistent results across runs with the same configuration.
+    Parameters:
+    - metric: The metric function to use for feedback and evaluation.
+
+    Budget configuration (exactly one of the following must be provided):
+    - auto: The auto budget to use for the run.
+    - max_full_evals: The maximum number of full evaluations to perform.
+    - max_metric_calls: The maximum number of metric calls to perform.
+
+    Reflection based configuration:
+    - reflection_minibatch_size: The number of examples to use for reflection in a single GEPA step.
+    - candidate_selection_strategy: The strategy to use for candidate selection. Default is "pareto", which stochastically selects candidates from the Pareto frontier of all validation scores.
+    - reflection_lm: [Required] The language model to use for reflection. GEPA benefits from a strong reflection model, and you can use `dspy.LM(model='gpt-5', temperature=1.0, max_tokens=32000)` to get a good reflection model.
+
+    Merge-based configuration:
+    - use_merge: Whether to use merge-based optimization. Default is True.
+    - max_merge_invocations: The maximum number of merge invocations to perform. Default is 5.
+
+    Evaluation configuration:
+    - num_threads: The number of threads to use for evaluation with `Evaluate`
+    - failure_score: The score to assign to failed examples. Default is 0.0.
+    - perfect_score: The maximum score achievable by the metric. Default is 1.0. Used by GEPA to determine if all examples in a minibatch are perfect.
+
+    Logging configuration:
+    - log_dir: The directory to save the logs. GEPA saves elaborate logs, along with all the candidate programs, in this directory. Running GEPA with the same `log_dir` will resume the run from the last checkpoint.
+    - track_stats: Whether to return detailed results and all proposed programs in the `detailed_results` attribute of the optimized program. Default is False.
+    - use_wandb: Whether to use wandb for logging. Default is False.
+    - wandb_api_key: The API key to use for wandb. If not provided, wandb will use the API key from the environment variable `WANDB_API_KEY`.
+    - wandb_init_kwargs: Additional keyword arguments to pass to `wandb.init`.
+    - track_best_outputs: Whether to track the best outputs on the validation set. track_stats must be True if track_best_outputs is True. `optimized_program.detailed_results.best_outputs_valset` will contain the best outputs for each task in the validation set.
+
+    Reproducibility:
+    - seed: The random seed to use for reproducibility. Default is 0.
     """
     def __init__(
         self,
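The budget rule stated in the docstring (exactly one of `auto`, `max_full_evals`, or `max_metric_calls` must be provided) can be sketched as a small standalone check. This is an illustrative mock, not dspy's actual validation code, and `check_budget` is a hypothetical helper name:

```python
def check_budget(auto=None, max_full_evals=None, max_metric_calls=None):
    """Require exactly one budget option, mirroring the docstring's rule."""
    provided = [name for name, value in (
        ("auto", auto),
        ("max_full_evals", max_full_evals),
        ("max_metric_calls", max_metric_calls),
    ) if value is not None]
    if len(provided) != 1:
        raise ValueError(
            f"Exactly one budget option must be set, got {provided or 'none'}"
        )
    return provided[0]

print(check_budget(auto="light"))  # -> auto
```

Passing zero options or more than one raises, which matches the "exactly one" constraint described above.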
@@ -282,6 +260,7 @@ def __init__(
         track_best_outputs: bool = False,
         # Reproducibility
         seed: int | None = 0,
+        keep_module_scores: bool = False,
     ):
         try:
             inspect.signature(metric).bind(None, None, None, None, None)
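The `candidate_selection_strategy="pareto"` behavior described in the docstring (stochastically selecting from the Pareto frontier of per-task validation scores) can be illustrated with a toy sketch. This is an assumption, not GEPA's implementation; `select_candidate` and its score layout are hypothetical:

```python
import random

def select_candidate(scores_per_candidate, rng):
    """Pick a candidate uniformly at random from the Pareto frontier.

    scores_per_candidate[c][t] is the score of candidate c on validation
    task t. A candidate is on the frontier here if it attains the best
    score on at least one task (a simplified frontier criterion).
    """
    num_tasks = len(scores_per_candidate[0])
    frontier = set()
    for t in range(num_tasks):
        best = max(scores[t] for scores in scores_per_candidate)
        for c, scores in enumerate(scores_per_candidate):
            if scores[t] == best:
                frontier.add(c)
    return rng.choice(sorted(frontier))

rng = random.Random(0)
# Candidate 2 is never best on any task, so it is never selected.
print(select_candidate([[0.9, 0.1], [0.2, 0.8], [0.5, 0.5]], rng))
```

The stochastic pick keeps exploration alive among candidates that each excel on different validation tasks, instead of always choosing one "current best".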
@@ -334,6 +313,8 @@ def __init__(
         self.wandb_api_key = wandb_api_key
         self.wandb_init_kwargs = wandb_init_kwargs
 
+        self.keep_module_scores = keep_module_scores
+
         if track_best_outputs:
             assert track_stats, "track_stats must be True if track_best_outputs is True."
         self.track_best_outputs = track_best_outputs
@@ -452,6 +433,7 @@ def feedback_fn(
             num_threads=self.num_threads,
             add_format_failure_as_feedback=self.add_format_failure_as_feedback,
             rng=rng,
+            keep_module_scores=self.keep_module_scores,
         )
 
         reflection_lm = self.reflection_lm
@@ -486,8 +468,6 @@ def feedback_fn(
             wandb_api_key=self.wandb_api_key,
             wandb_init_kwargs=self.wandb_init_kwargs,
             track_best_outputs=self.track_best_outputs,
-            display_progress_bar=True,
-            raise_on_exception=True,
 
             # Reproducibility
             seed=self.seed,
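Putting the diff together, the new flag is just one more keyword passed through to the optimizer. A minimal sketch, assuming the GEPA keyword API shown above; `my_metric` and `reflection_lm` are hypothetical placeholders, and the dspy calls are left commented out so the snippet stays self-contained:

```python
# Collect GEPA options as documented in the diff above. The
# `keep_module_scores` key is the flag added by this commit; all other
# keys appear in the docstring.
gepa_kwargs = dict(
    auto="light",                 # exactly one budget option may be set
    reflection_minibatch_size=3,
    candidate_selection_strategy="pareto",
    keep_module_scores=True,      # new in commit 4889c02
    track_stats=True,
    seed=0,
)

# With dspy installed, this would be used roughly as:
# optimizer = dspy.GEPA(metric=my_metric, reflection_lm=reflection_lm, **gepa_kwargs)
# optimized = optimizer.compile(student, trainset=trainset, valset=valset)
print(sorted(gepa_kwargs))
```

Note that the diff also drops the hard-coded `display_progress_bar=True` and `raise_on_exception=True` arguments from the internal call, so those are no longer forced on.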

0 commit comments