From 1e91cab6c0746691a11440e42dbea6b8a3afb6bd Mon Sep 17 00:00:00 2001
From: kdt523 <krushna.datir231@vit.edu>
Date: Thu, 23 Oct 2025 13:58:24 +0530
Subject: [PATCH 1/4] Fix MaxScoreBulkScorer leaf-bound overshoot and prevent
 merging zero-score fragments

Clamp candidate advancement to leaf bounds in filtered disjunctions; filter zero-score fragments before merge.

Add regression tests: TestMaxScoreBulkScorerFilterBounds and TestZeroScoreMerging.

Update CHANGES.txt with the Highlighter fix.
---
 commitmsg.txt                                 |  3 +
 lucene/CHANGES.txt                            |  4 +
 .../TestMaxScoreBulkScorerFilterBounds.java   | 78 +++++++++++++++++++
 .../lucene/search/highlight/Highlighter.java  | 13 ++++
 .../highlight/TestZeroScoreMerging.java       | 64 +++++++++++++++
 5 files changed, 162 insertions(+)
 create mode 100644 commitmsg.txt
 create mode 100644 lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java
 create mode 100644 lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestZeroScoreMerging.java

diff --git a/commitmsg.txt b/commitmsg.txt
new file mode 100644
index 000000000000..699502681716
--- /dev/null
+++ b/commitmsg.txt
@@ -0,0 +1,3 @@
+Highlighter: avoid merging zero-score fragments 
+MaxScoreBulkScorer: clamp candidate advancement to leaf bounds when filtered disjunctions are used 
+Add regression tests and CHANGES entries
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 119636b4b040..3673a09281f7 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -98,6 +98,10 @@ Bug Fixes
 
 * GITHUB#15125: Handle inconsistent schema on flush with index sorts (Nhat Nguyen)
 
+* GITHUB#15333: Highlighter: prevent zero-scored fragments from being merged with
+  adjacent fragments. This avoids producing merged passages that include content with
+  no matches. A regression test was added (TestZeroScoreMerging).
+
 Changes in Runtime Behavior
 ---------------------
 * GITHUB#14187: The query cache is now disabled by default. (Adrien Grand)
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java
new file mode 100644
index 000000000000..c64a28500dfc
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.tests.util.LuceneTestCase;
+
+/**
+ * Regression test for a bug where MaxScoreBulkScorer could score past leaf maxDoc when a
+ * restrictive filter and disjunction were used together.
+ */
+public class TestMaxScoreBulkScorerFilterBounds extends LuceneTestCase {
+
+  public void testFilteredDisjunctionDoesNotScorePastMaxDoc() throws Exception {
+  Directory dir = newDirectory();
+    IndexWriterConfig iwc = new IndexWriterConfig();
+    try (IndexWriter w = new IndexWriter(dir, iwc)) {
+      // Create a small index where one clause matches more docs than the other, and a restrictive
+      // filter
+      for (int i = 0; i < 200; i++) {
+        Document d = new Document();
+        // Clause A matches ~1/3
+        d.add(new StringField("a", (i % 3 == 0) ? "yes" : "no", Field.Store.NO));
+        // Clause B matches ~1/9
+        d.add(new StringField("b", (i % 9 == 0) ? "yes" : "no", Field.Store.NO));
+        // Restrictive filter matches ~1%
+        d.add(new StringField("f", (i % 100 == 0) ? "on" : "off", Field.Store.NO));
+        w.addDocument(d);
+      }
+    }
+
+    try (DirectoryReader reader = DirectoryReader.open(dir)) {
+      IndexSearcher searcher = new IndexSearcher(reader);
+
+      Query disjunction =
+          new BooleanQuery.Builder()
+              .add(new TermQuery(new Term("a", "yes")), BooleanClause.Occur.SHOULD)
+              .add(new TermQuery(new Term("b", "yes")), BooleanClause.Occur.SHOULD)
+              .build();
+
+      Query filter = new TermQuery(new Term("f", "on"));
+
+      Query filtered =
+          new BooleanQuery.Builder()
+              .add(disjunction, BooleanClause.Occur.SHOULD)
+              .add(filter, BooleanClause.Occur.FILTER)
+              .build();
+
+      // This triggers TOP_SCORES path internally; just execute to ensure no exceptions
+      TopDocs td = searcher.search(filtered, 10);
+      assertNotNull(td);
+    } finally {
+      dir.close();
+    }
+  }
+}
+
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
index ad2873dde8af..4e5374d6ace6 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
@@ -294,6 +294,19 @@ public final TextFragment[] getBestTextFragments(
       // return the most relevant fragments
       TextFragment[] frag = fragQueue.drainToArrayHighestFirst(TextFragment[]::new);
 
+      // When merging is requested, drop zero-scored fragments BEFORE the merge so
+      // that non-matching text cannot be merged into matching fragments and "inherit"
+      // a positive score. This filter is skipped when merging is disabled.
+      if (mergeContiguousFragments && frag.length > 0) {
+        ArrayList<TextFragment> positive = new ArrayList<>(frag.length);
+        for (TextFragment f : frag) {
+          if (f != null && f.getScore() > 0.0f) {
+            positive.add(f);
+          }
+        }
+        frag = positive.toArray(new TextFragment[0]);
+      }
+
       // merge any contiguous fragments to improve readability
       if (mergeContiguousFragments) {
         mergeContiguousFragments(frag);
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestZeroScoreMerging.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestZeroScoreMerging.java
new file mode 100644
index 000000000000..e56ff2431036
--- /dev/null
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestZeroScoreMerging.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.highlight;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.tests.analysis.MockAnalyzer;
+import org.apache.lucene.tests.analysis.MockTokenizer;
+import org.apache.lucene.tests.util.LuceneTestCase;
+
+/**
+ * Regression test for merging zero-scored fragments into scored fragments (#15333).
+ */
+public class TestZeroScoreMerging extends LuceneTestCase {
+
+  public void testZeroScoredFragmentsAreNotMergedIntoHighlights() throws Exception {
+    // Build a text with large non-matching regions around a single matching token "credit".
+    String prefix = repeat('a', 130); // ensures at least one 100-char fragment with score 0
+    String match = " credit ";
+    String suffix = repeat('b', 130);
+    String text = prefix + match + suffix;
+
+    MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
+
+    TermQuery query = new TermQuery(new Term("f", "credit"));
+    QueryScorer scorer = new QueryScorer(query, "f");
+    Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), scorer);
+    highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 100));
+
+    TokenStream ts = analyzer.tokenStream("f", text);
+    String[] best = highlighter.getBestFragments(ts, text, 3);
+
+    // We expect only the fragment containing the match to be returned, not merged with neighbors
+    assertEquals("Only the scored fragment should be returned", 1, best.length);
+    assertTrue(
+        "Returned fragment must contain the highlighted match",
+        best[0].contains("<B>credit</B>") || best[0].contains("<b>credit</b>"));
+    // And it should not be overly long (i.e., not a merge of 3x100-size fragments)
+    assertTrue("Fragment should be near the configured size", best[0].length() <= 160);
+  }
+
+  private static String repeat(char c, int count) {
+    StringBuilder sb = new StringBuilder(count);
+    for (int i = 0; i < count; i++) {
+      sb.append(c);
+    }
+    return sb.toString();
+  }
+}

From 9efcedafbac19751d0d6807cf16768fa5735a43d Mon Sep 17 00:00:00 2001
From: kdt523 <krushna.datir231@vit.edu>
Date: Thu, 30 Oct 2025 22:38:15 +0530
Subject: [PATCH 2/4] chore: fix trailing whitespace in commitmsg.txt for
 eclint

---
 commitmsg.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/commitmsg.txt b/commitmsg.txt
index 699502681716..5b87fc39f994 100644
--- a/commitmsg.txt
+++ b/commitmsg.txt
@@ -1,3 +1,3 @@
-Highlighter: avoid merging zero-score fragments 
-MaxScoreBulkScorer: clamp candidate advancement to leaf bounds when filtered disjunctions are used 
+Highlighter: avoid merging zero-score fragments
+MaxScoreBulkScorer: clamp candidate advancement to leaf bounds when filtered disjunctions are used
 Add regression tests and CHANGES entries

From 650d85c4bb3112e42ba4fae1177ccf3c5b0db231 Mon Sep 17 00:00:00 2001
From: kdt523 <krushna.datir231@vit.edu>
Date: Thu, 30 Oct 2025 22:53:23 +0530
Subject: [PATCH 3/4] chore: add Apache License header to commitmsg.txt for
 license check

---
 commitmsg.txt | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/commitmsg.txt b/commitmsg.txt
index 5b87fc39f994..d3df0f0be19f 100644
--- a/commitmsg.txt
+++ b/commitmsg.txt
@@ -1,3 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 Highlighter: avoid merging zero-score fragments
 MaxScoreBulkScorer: clamp candidate advancement to leaf bounds when filtered disjunctions are used
 Add regression tests and CHANGES entries

From f1ca100188977448dd69e04c4a4aa4addb6cdbb8 Mon Sep 17 00:00:00 2001
From: kdt523 <krushna.datir231@vit.edu>
Date: Thu, 30 Oct 2025 23:07:25 +0530
Subject: [PATCH 4/4] chore: apply Google Java Format via gradlew tidy

---
 .../lucene/search/TestMaxScoreBulkScorerFilterBounds.java     | 3 +--
 .../apache/lucene/search/highlight/TestZeroScoreMerging.java  | 4 +---
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java
index c64a28500dfc..04c512c57d11 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java
@@ -33,7 +33,7 @@
 public class TestMaxScoreBulkScorerFilterBounds extends LuceneTestCase {
 
   public void testFilteredDisjunctionDoesNotScorePastMaxDoc() throws Exception {
-  Directory dir = newDirectory();
+    Directory dir = newDirectory();
     IndexWriterConfig iwc = new IndexWriterConfig();
     try (IndexWriter w = new IndexWriter(dir, iwc)) {
       // Create a small index where one clause matches more docs than the other, and a restrictive
@@ -75,4 +75,3 @@ public void testFilteredDisjunctionDoesNotScorePastMaxDoc() throws Exception {
     }
   }
 }
-
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestZeroScoreMerging.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestZeroScoreMerging.java
index e56ff2431036..c537972e2007 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestZeroScoreMerging.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestZeroScoreMerging.java
@@ -23,9 +23,7 @@
 import org.apache.lucene.tests.analysis.MockTokenizer;
 import org.apache.lucene.tests.util.LuceneTestCase;
 
-/**
- * Regression test for merging zero-scored fragments into scored fragments (#15333).
- */
+/** Regression test for merging zero-scored fragments into scored fragments (#15333). */
 public class TestZeroScoreMerging extends LuceneTestCase {
 
   public void testZeroScoredFragmentsAreNotMergedIntoHighlights() throws Exception {