Commit 8355611
adding flag to switch between feedback score and module score
1 parent 4889c02 commit 8355611

dspy/teleprompt/gepa/gepa.py

Lines changed: 57 additions & 33 deletions
@@ -197,38 +197,60 @@ def metric(
     # pareto_frontier is a list of scores, one for each task in the batch.
     ```
-    Parameters:
-    - metric: The metric function to use for feedback and evaluation.
-
-    Budget configuration (exactly one of the following must be provided):
-    - auto: The auto budget to use for the run.
-    - max_full_evals: The maximum number of full evaluations to perform.
-    - max_metric_calls: The maximum number of metric calls to perform.
-
-    Reflection based configuration:
-    - reflection_minibatch_size: The number of examples to use for reflection in a single GEPA step.
-    - candidate_selection_strategy: The strategy to use for candidate selection. Default is "pareto", which stochastically selects candidates from the Pareto frontier of all validation scores.
-    - reflection_lm: [Required] The language model to use for reflection. GEPA benefits from a strong reflection model, and you can use `dspy.LM(model='gpt-5', temperature=1.0, max_tokens=32000)` to get a good reflection model.
-
-    Merge-based configuration:
-    - use_merge: Whether to use merge-based optimization. Default is True.
-    - max_merge_invocations: The maximum number of merge invocations to perform. Default is 5.
-
-    Evaluation configuration:
-    - num_threads: The number of threads to use for evaluation with `Evaluate`
-    - failure_score: The score to assign to failed examples. Default is 0.0.
-    - perfect_score: The maximum score achievable by the metric. Default is 1.0. Used by GEPA to determine if all examples in a minibatch are perfect.
-
-    Logging configuration:
-    - log_dir: The directory to save the logs. GEPA saves elaborate logs, along with all the candidate programs, in this directory. Running GEPA with the same `log_dir` will resume the run from the last checkpoint.
-    - track_stats: Whether to return detailed results and all proposed programs in the `detailed_results` attribute of the optimized program. Default is False.
-    - use_wandb: Whether to use wandb for logging. Default is False.
-    - wandb_api_key: The API key to use for wandb. If not provided, wandb will use the API key from the environment variable `WANDB_API_KEY`.
-    - wandb_init_kwargs: Additional keyword arguments to pass to `wandb.init`.
-    - track_best_outputs: Whether to track the best outputs on the validation set. track_stats must be True if track_best_outputs is True. `optimized_program.detailed_results.best_outputs_valset` will contain the best outputs for each task in the validation set.
-
-    Reproducibility:
-    - seed: The random seed to use for reproducibility. Default is 0.
+    Args:
+        metric: The metric function to use for feedback and evaluation.
+        auto: The auto budget to use for the run. Options: "light", "medium", "heavy".
+        max_full_evals: The maximum number of full evaluations to perform.
+        max_metric_calls: The maximum number of metric calls to perform.
+        reflection_minibatch_size: The number of examples to use for reflection in a single GEPA step. Default is 3.
+        candidate_selection_strategy: The strategy to use for candidate selection. Default is "pareto",
+            which stochastically selects candidates from the Pareto frontier of all validation scores.
+            Options: "pareto", "current_best".
+        reflection_lm: The language model to use for reflection. Required parameter. GEPA benefits from
+            a strong reflection model. Consider using `dspy.LM(model='gpt-5', temperature=1.0, max_tokens=32000)`
+            for optimal performance.
+        skip_perfect_score: Whether to skip examples with perfect scores during reflection. Default is True.
+        add_format_failure_as_feedback: Whether to add format failures as feedback. Default is False.
+        use_merge: Whether to use merge-based optimization. Default is True.
+        max_merge_invocations: The maximum number of merge invocations to perform. Default is 5.
+        num_threads: The number of threads to use for evaluation with `Evaluate`. Optional.
+        failure_score: The score to assign to failed examples. Default is 0.0.
+        perfect_score: The maximum score achievable by the metric. Default is 1.0. Used by GEPA
+            to determine if all examples in a minibatch are perfect.
+        log_dir: The directory to save the logs. GEPA saves elaborate logs, along with all candidate
+            programs, in this directory. Running GEPA with the same `log_dir` will resume the run
+            from the last checkpoint.
+        track_stats: Whether to return detailed results and all proposed programs in the `detailed_results`
+            attribute of the optimized program. Default is False.
+        use_wandb: Whether to use wandb for logging. Default is False.
+        wandb_api_key: The API key to use for wandb. If not provided, wandb will use the API key
+            from the environment variable `WANDB_API_KEY`.
+        wandb_init_kwargs: Additional keyword arguments to pass to `wandb.init`.
+        track_best_outputs: Whether to track the best outputs on the validation set. track_stats must
+            be True if track_best_outputs is True. The optimized program's `detailed_results.best_outputs_valset`
+            will contain the best outputs for each task in the validation set.
+        seed: The random seed to use for reproducibility. Default is 0.
+
+    Note:
+        Budget Configuration: Exactly one of `auto`, `max_full_evals`, or `max_metric_calls` must be provided.
+        The `auto` parameter provides preset configurations: "light" for quick experimentation, "medium" for
+        balanced optimization, and "heavy" for thorough optimization.
+
+        Reflection Configuration: The `reflection_lm` parameter is required and should be a strong language model.
+        GEPA performs best with models like `dspy.LM(model='gpt-5', temperature=1.0, max_tokens=32000)`.
+        The reflection process analyzes failed examples to generate feedback for program improvement.
+
+        Merge Configuration: GEPA can merge successful program variants using `use_merge=True`.
+        The `max_merge_invocations` parameter controls how many merge attempts are made during optimization.
+
+        Evaluation Configuration: Use `num_threads` to parallelize evaluation. The `failure_score` and
+        `perfect_score` parameters help GEPA understand your metric's range and optimize accordingly.
+
+        Logging Configuration: Set `log_dir` to save detailed logs and enable checkpoint resuming.
+        Use `track_stats=True` to access detailed optimization results via the `detailed_results` attribute.
+        Enable `use_wandb=True` for experiment tracking and visualization.
+
+        Reproducibility: Set `seed` to ensure consistent results across runs with the same configuration.
     """
     def __init__(
         self,
@@ -260,7 +282,7 @@ def __init__(
         track_best_outputs: bool = False,
         # Reproducibility
         seed: int | None = 0,
-        keep_module_scores: bool = False,
+        keep_module_scores: bool = False
     ):
         try:
             inspect.signature(metric).bind(None, None, None, None, None)
@@ -468,6 +490,8 @@ def feedback_fn(
             wandb_api_key=self.wandb_api_key,
             wandb_init_kwargs=self.wandb_init_kwargs,
             track_best_outputs=self.track_best_outputs,
+            display_progress_bar=True,
+            raise_on_exception=True,

             # Reproducibility
             seed=self.seed,
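The docstring above states that exactly one of `auto`, `max_full_evals`, or `max_metric_calls` may be provided. A minimal stand-alone sketch of that mutual-exclusion check, useful for understanding the rule (the function name and return shape here are illustrative, not GEPA's actual internals):

```python
def resolve_budget(auto=None, max_full_evals=None, max_metric_calls=None):
    """Return the single budget option that was set, or raise ValueError."""
    # Collect only the options the caller actually provided.
    provided = {
        name: value
        for name, value in {
            "auto": auto,
            "max_full_evals": max_full_evals,
            "max_metric_calls": max_metric_calls,
        }.items()
        if value is not None
    }
    # Exactly one budget option must be set.
    if len(provided) != 1:
        raise ValueError(
            "Exactly one of auto, max_full_evals, max_metric_calls must be "
            f"provided; got {sorted(provided) or 'none'}"
        )
    # The auto budget only accepts the three documented presets.
    if auto is not None and auto not in ("light", "medium", "heavy"):
        raise ValueError("auto must be 'light', 'medium', or 'heavy'")
    return next(iter(provided.items()))
```

For example, `resolve_budget(auto="light")` returns `("auto", "light")`, while calling it with no options, or with both `auto` and `max_full_evals`, raises `ValueError`.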

0 commit comments