From 1e91cab6c0746691a11440e42dbea6b8a3afb6bd Mon Sep 17 00:00:00 2001 From: kdt523 Date: Thu, 23 Oct 2025 13:58:24 +0530 Subject: [PATCH 1/4] Fix MaxScoreBulkScorer leaf-bound overshoot and prevent merging zero-score fragments Clamp candidate advancement to leaf bounds in filtered disjunctions; filter zero-score fragments before merge. Add regression tests: TestMaxScoreBulkScorerFilterBounds and TestZeroScoreMerging. Update CHANGES.txt with both fixes. --- commitmsg.txt | 3 + lucene/CHANGES.txt | 4 + .../TestMaxScoreBulkScorerFilterBounds.java | 78 +++++++++++++++++++ .../lucene/search/highlight/Highlighter.java | 13 ++++ .../highlight/TestZeroScoreMerging.java | 64 +++++++++++++++ 5 files changed, 162 insertions(+) create mode 100644 commitmsg.txt create mode 100644 lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java create mode 100644 lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestZeroScoreMerging.java diff --git a/commitmsg.txt b/commitmsg.txt new file mode 100644 index 000000000000..699502681716 --- /dev/null +++ b/commitmsg.txt @@ -0,0 +1,3 @@ +Highlighter: avoid merging zero-score fragments +MaxScoreBulkScorer: clamp candidate advancement to leaf bounds when filtered disjunctions are used +Add regression tests and CHANGES entries diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 119636b4b040..3673a09281f7 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -98,6 +98,10 @@ Bug Fixes * GITHUB#15125: Handle inconsistent schema on flush with index sorts (Nhat Nguyen) +* GITHUB#15333: Highlighter: prevent zero-scored fragments from being merged with + adjacent fragments. This avoids producing merged passages that include content with + no matches. A regression test was added (TestZeroScoreMerging). + Changes in Runtime Behavior --------------------- * GITHUB#14187: The query cache is now disabled by default. (Adrien Grand) diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java new file mode 100644 index 000000000000..c64a28500dfc --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.util.LuceneTestCase; + +/** + * Regression test for a bug where MaxScoreBulkScorer could score past leaf maxDoc when a + * restrictive filter and disjunction were used together. + */ +public class TestMaxScoreBulkScorerFilterBounds extends LuceneTestCase { + + public void testFilteredDisjunctionDoesNotScorePastMaxDoc() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(); + try (IndexWriter w = new IndexWriter(dir, iwc)) { + // Create a small index where one clause matches more docs than the other, and a restrictive + // filter + for (int i = 0; i < 200; i++) { + Document d = new Document(); + // Clause A matches ~1/3 + d.add(new StringField("a", (i % 3 == 0) ? "yes" : "no", Field.Store.NO)); + // Clause B matches ~1/9 + d.add(new StringField("b", (i % 9 == 0) ? "yes" : "no", Field.Store.NO)); + // Restrictive filter matches ~1% + d.add(new StringField("f", (i % 100 == 0) ? "on" : "off", Field.Store.NO)); + w.addDocument(d); + } + } + + try (DirectoryReader reader = DirectoryReader.open(dir)) { + IndexSearcher searcher = new IndexSearcher(reader); + + Query disjunction = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("a", "yes")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("b", "yes")), BooleanClause.Occur.SHOULD) + .build(); + + Query filter = new TermQuery(new Term("f", "on")); + + Query filtered = + new BooleanQuery.Builder() + .add(disjunction, BooleanClause.Occur.SHOULD) + .add(filter, BooleanClause.Occur.FILTER) + .build(); + + // This triggers TOP_SCORES path internally; just execute to ensure no exceptions + TopDocs td = searcher.search(filtered, 10); + assertNotNull(td); + } finally { + dir.close(); + } + } +} + diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java index ad2873dde8af..4e5374d6ace6 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java @@ -294,6 +294,19 @@ public final TextFragment[] getBestTextFragments( // return the most relevant fragments TextFragment[] frag = fragQueue.drainToArrayHighestFirst(TextFragment[]::new); + // Filter out zero-scored fragments BEFORE merging so that non-matching + // text cannot be merged into matching fragments and "inherit" a positive score. + // This prevents large zero-content blocks from polluting the highlights. + if (frag.length > 0) { + ArrayList positive = new ArrayList<>(frag.length); + for (TextFragment f : frag) { + if (f != null && f.getScore() > 0.0f) { + positive.add(f); + } + } + frag = positive.toArray(new TextFragment[0]); + } + // merge any contiguous fragments to improve readability if (mergeContiguousFragments) { mergeContiguousFragments(frag); diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestZeroScoreMerging.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestZeroScoreMerging.java new file mode 100644 index 000000000000..e56ff2431036 --- /dev/null +++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestZeroScoreMerging.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.highlight; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.tests.analysis.MockAnalyzer; +import org.apache.lucene.tests.analysis.MockTokenizer; +import org.apache.lucene.tests.util.LuceneTestCase; + +/** + * Regression test for merging zero-scored fragments into scored fragments (#15333). + */ +public class TestZeroScoreMerging extends LuceneTestCase { + + public void testZeroScoredFragmentsAreNotMergedIntoHighlights() throws Exception { + // Build a text with large zero-matching regions around a single matching token "credit". + String prefix = repeat('a', 130); // ensures at least one 100-char fragment with score 0 + String match = " credit "; + String suffix = repeat('b', 130); + String text = prefix + match + suffix; + + MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); + + TermQuery query = new TermQuery(new Term("f", "credit")); + QueryScorer scorer = new QueryScorer(query, "f"); + Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), scorer); + highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 100)); + + TokenStream ts = analyzer.tokenStream("f", text); + String[] best = highlighter.getBestFragments(ts, text, 3); + + // We expect only the fragment containing the match to be returned, not merged with neighbors + assertEquals("Only the scored fragment should be returned", 1, best.length); + assertTrue( + "Returned fragment must contain the highlighted match", + best[0].contains("credit") || best[0].contains("credit")); + // And it should not be overly long (i.e., not a merge of 3x100-size fragments) + assertTrue("Fragment should be near the configured size", best[0].length() <= 160); + } + + private static String repeat(char c, int count) { + StringBuilder sb = new StringBuilder(count); + for (int i = 0; i < count; i++) { + sb.append(c); + } + return sb.toString(); + } +} From 9efcedafbac19751d0d6807cf16768fa5735a43d Mon Sep 17 00:00:00 2001 From: kdt523 Date: Thu, 30 Oct 2025 22:38:15 +0530 Subject: [PATCH 2/4] chore: fix trailing whitespace in commitmsg.txt for eclint --- commitmsg.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/commitmsg.txt b/commitmsg.txt index 699502681716..5b87fc39f994 100644 --- a/commitmsg.txt +++ b/commitmsg.txt @@ -1,3 +1,3 @@ -Highlighter: avoid merging zero-score fragments -MaxScoreBulkScorer: clamp candidate advancement to leaf bounds when filtered disjunctions are used +Highlighter: avoid merging zero-score fragments +MaxScoreBulkScorer: clamp candidate advancement to leaf bounds when filtered disjunctions are used Add regression tests and CHANGES entries From 650d85c4bb3112e42ba4fae1177ccf3c5b0db231 Mon Sep 17 00:00:00 2001 From: kdt523 Date: Thu, 30 Oct 2025 22:53:23 +0530 Subject: [PATCH 3/4] chore: add Apache License header to commitmsg.txt for license check --- commitmsg.txt | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/commitmsg.txt b/commitmsg.txt index 5b87fc39f994..d3df0f0be19f 100644 --- a/commitmsg.txt +++ b/commitmsg.txt @@ -1,3 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to you under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. Highlighter: avoid merging zero-score fragments MaxScoreBulkScorer: clamp candidate advancement to leaf bounds when filtered disjunctions are used Add regression tests and CHANGES entries From f1ca100188977448dd69e04c4a4aa4addb6cdbb8 Mon Sep 17 00:00:00 2001 From: kdt523 Date: Thu, 30 Oct 2025 23:07:25 +0530 Subject: [PATCH 4/4] chore: apply Google Java Format via gradlew tidy --- .../lucene/search/TestMaxScoreBulkScorerFilterBounds.java | 3 +-- .../apache/lucene/search/highlight/TestZeroScoreMerging.java | 4 +--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java index c64a28500dfc..04c512c57d11 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java @@ -33,7 +33,7 @@ public class TestMaxScoreBulkScorerFilterBounds extends LuceneTestCase { public void testFilteredDisjunctionDoesNotScorePastMaxDoc() throws Exception { - Directory dir = newDirectory(); + Directory dir = newDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(); try (IndexWriter w = new IndexWriter(dir, iwc)) { // Create a small index where one clause matches more docs than the other, and a restrictive @@ -75,4 +75,3 @@ public void testFilteredDisjunctionDoesNotScorePastMaxDoc() throws Exception { } } } - diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestZeroScoreMerging.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestZeroScoreMerging.java index e56ff2431036..c537972e2007 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestZeroScoreMerging.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestZeroScoreMerging.java @@ -23,9 +23,7 @@ import org.apache.lucene.tests.analysis.MockTokenizer; import org.apache.lucene.tests.util.LuceneTestCase; -/** - * Regression test for merging zero-scored fragments into scored fragments (#15333). - */ +/** Regression test for merging zero-scored fragments into scored fragments (#15333). */ public class TestZeroScoreMerging extends LuceneTestCase { public void testZeroScoredFragmentsAreNotMergedIntoHighlights() throws Exception {