stanfordnlp · okhat · Jul 19, 2025 · Jul 18, 2025
diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py
@@ -51,6 +51,7 @@ class EvaluationResult(Prediction):
     - score: An float value (e.g., 67.30) representing the overall performance
     - results: a list of (example, prediction, score) tuples for each example in devset
     """
+
     def __init__(self, score: float, results: list[tuple["dspy.Example", "dspy.Example", Any]]):
         super().__init__(score=score, results=results)
 
@@ -126,9 +127,9 @@ def __call__(
 
         Returns:
             The evaluation results are returned as a dspy.EvaluationResult object containing the following attributes:
-            
+
             - score: A float percentage score (e.g., 67.30) representing overall performance
-            
+
             - results: a list of (example, prediction, score) tuples for each example in devset
         """
         metric = metric if metric is not None else self.metric
@@ -145,25 +146,14 @@ def __call__(
         executor = ParallelExecutor(
             num_threads=num_threads,
             disable_progress_bar=not display_progress,
-            max_errors=(
-                self.max_errors
-                if self.max_errors is not None
-                else dspy.settings.max_errors
-            ),
+            max_errors=(self.max_errors if self.max_errors is not None else dspy.settings.max_errors),
             provide_traceback=self.provide_traceback,
             compare_results=True,
         )
 
         def process_item(example):
             prediction = program(**example.inputs())
             score = metric(example, prediction)
-
-            # Increment assert and suggest failures to program's attributes
-            if hasattr(program, "_assert_failures"):
-                program._assert_failures += dspy.settings.get("assert_failures")
-            if hasattr(program, "_suggest_failures"):
-                program._suggest_failures += dspy.settings.get("suggest_failures")
-
             return prediction, score
 
         results = executor.execute(process_item, devset)
@@ -191,7 +181,6 @@ def process_item(example):
             results=results,
         )
 
-
     def _construct_result_table(
         self, results: list[tuple["dspy.Example", "dspy.Example", Any]], metric_name: str
     ) -> "pd.DataFrame":

diff --git a/dspy/teleprompt/bootstrap.py b/dspy/teleprompt/bootstrap.py
@@ -85,10 +85,6 @@ def compile(self, student, *, teacher=None, trainset):
         self.student = self._train()
         self.student._compiled = True
 
-        # set assert_failures and suggest_failures as attributes of student w/ value 0
-        self.student._assert_failures = 0
-        self.student._suggest_failures = 0
-
         return self.student
 
     def _prepare_student_and_teacher(self, student, teacher):
@@ -111,7 +107,9 @@ def _prepare_predictor_mappings(self):
             teacher.predictors(),
         ), "Student and teacher must have the same number of predictors."
 
-        for (name1, predictor1), (name2, predictor2) in zip(student.named_predictors(), teacher.named_predictors(), strict=False):
+        for (name1, predictor1), (name2, predictor2) in zip(
+            student.named_predictors(), teacher.named_predictors(), strict=False
+        ):
             assert name1 == name2, "Student and teacher must have the same program structure."
             if hasattr(predictor1.signature, "equals"):
                 assert predictor1.signature.equals(
@@ -210,11 +208,7 @@ def _bootstrap_one_example(self, example, round_idx=0):
             with self.error_lock:
                 self.error_count += 1
                 current_error_count = self.error_count
-            effective_max_errors = (
-                self.max_errors
-                if self.max_errors is not None
-                else dspy.settings.max_errors
-            )
+            effective_max_errors = self.max_errors if self.max_errors is not None else dspy.settings.max_errors
             if current_error_count >= effective_max_errors:
                 raise e
             logger.error(f"Failed to run or to evaluate example {example} with {self.metric} due to {e}.")
@@ -244,7 +238,6 @@ def _bootstrap_one_example(self, example, round_idx=0):
 
             # Update the traces
             for name, demos in name2traces.items():
-
                 # If there are multiple traces for the same predictor in the sample example,
                 # sample 50/50 from the first N-1 traces or the last trace.
                 if len(demos) > 1:

diff --git a/dspy/teleprompt/random_search.py b/dspy/teleprompt/random_search.py
@@ -58,11 +58,7 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
         self.trainset = trainset
         self.valset = valset or trainset  # TODO: FIXME: Note this choice.
 
-        effective_max_errors = (
-            self.max_errors
-            if self.max_errors is not None
-            else dspy.settings.max_errors
-        )
+        effective_max_errors = self.max_errors if self.max_errors is not None else dspy.settings.max_errors
 
         scores = []
         all_subscores = []
@@ -129,13 +125,6 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
 
             all_subscores.append(subscores)
 
-            ############ Assertion-aware Optimization ############
-            if hasattr(program, "_suggest_failures"):
-                score = score - program._suggest_failures * 0.2
-            if hasattr(program, "_assert_failures"):
-                score = 0 if program._assert_failures > 0 else score
-            ######################################################
-
             if len(scores) == 0 or score > max(scores):
                 print("New best score:", score, "for seed", seed)
                 best_program = program