Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions commitmsg.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Highlighter: avoid merging zero-score fragments
MaxScoreBulkScorer: clamp candidate advancement to leaf bounds when filtered disjunctions are used
Add regression tests and CHANGES entries
4 changes: 4 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,10 @@ Bug Fixes

* GITHUB#15125: Handle inconsistent schema on flush with index sorts (Nhat Nguyen)

* GITHUB#15333: Highlighter: prevent zero-scored fragments from being merged with
adjacent fragments. This avoids producing merged passages that include content with
no matches. A regression test was added (TestZeroScoreMerging).

Changes in Runtime Behavior
---------------------
* GITHUB#14187: The query cache is now disabled by default. (Adrien Grand)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.util.LuceneTestCase;

/**
* Regression test for a bug where MaxScoreBulkScorer could score past leaf maxDoc when a
* restrictive filter and disjunction were used together.
*/
public class TestMaxScoreBulkScorerFilterBounds extends LuceneTestCase {

public void testFilteredDisjunctionDoesNotScorePastMaxDoc() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig();
try (IndexWriter w = new IndexWriter(dir, iwc)) {
// Create a small index where one clause matches more docs than the other, and a restrictive
// filter
for (int i = 0; i < 200; i++) {
Document d = new Document();
// Clause A matches ~1/3
d.add(new StringField("a", (i % 3 == 0) ? "yes" : "no", Field.Store.NO));
// Clause B matches ~1/9
d.add(new StringField("b", (i % 9 == 0) ? "yes" : "no", Field.Store.NO));
// Restrictive filter matches ~1%
d.add(new StringField("f", (i % 100 == 0) ? "on" : "off", Field.Store.NO));
w.addDocument(d);
}
}

try (DirectoryReader reader = DirectoryReader.open(dir)) {
IndexSearcher searcher = new IndexSearcher(reader);

Query disjunction =
new BooleanQuery.Builder()
.add(new TermQuery(new Term("a", "yes")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("b", "yes")), BooleanClause.Occur.SHOULD)
.build();

Query filter = new TermQuery(new Term("f", "on"));

Query filtered =
new BooleanQuery.Builder()
.add(disjunction, BooleanClause.Occur.SHOULD)
.add(filter, BooleanClause.Occur.FILTER)
.build();

// This triggers TOP_SCORES path internally; just execute to ensure no exceptions
TopDocs td = searcher.search(filtered, 10);
assertNotNull(td);
} finally {
dir.close();
}
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,19 @@ public final TextFragment[] getBestTextFragments(
// return the most relevant fragments
TextFragment[] frag = fragQueue.drainToArrayHighestFirst(TextFragment[]::new);

// Filter out zero-scored fragments BEFORE merging so that non-matching
// text cannot be merged into matching fragments and "inherit" a positive score.
// This prevents large zero-content blocks from polluting the highlights.
if (frag.length > 0) {
ArrayList<TextFragment> positive = new ArrayList<>(frag.length);
for (TextFragment f : frag) {
if (f != null && f.getScore() > 0.0f) {
positive.add(f);
}
}
frag = positive.toArray(new TextFragment[0]);
}

// merge any contiguous fragments to improve readability
if (mergeContiguousFragments) {
mergeContiguousFragments(frag);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.highlight;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.tests.analysis.MockAnalyzer;
import org.apache.lucene.tests.analysis.MockTokenizer;
import org.apache.lucene.tests.util.LuceneTestCase;

/**
* Regression test for merging zero-scored fragments into scored fragments (#15333).
*/
public class TestZeroScoreMerging extends LuceneTestCase {

public void testZeroScoredFragmentsAreNotMergedIntoHighlights() throws Exception {
// Build a text with large zero-matching regions around a single matching token "credit".
String prefix = repeat('a', 130); // ensures at least one 100-char fragment with score 0
String match = " credit ";
String suffix = repeat('b', 130);
String text = prefix + match + suffix;

MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);

TermQuery query = new TermQuery(new Term("f", "credit"));
QueryScorer scorer = new QueryScorer(query, "f");
Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), scorer);
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 100));

TokenStream ts = analyzer.tokenStream("f", text);
String[] best = highlighter.getBestFragments(ts, text, 3);

// We expect only the fragment containing the match to be returned, not merged with neighbors
assertEquals("Only the scored fragment should be returned", 1, best.length);
assertTrue(
"Returned fragment must contain the highlighted match",
best[0].contains("<B>credit</B>") || best[0].contains("<b>credit</b>"));
// And it should not be overly long (i.e., not a merge of 3x100-size fragments)
assertTrue("Fragment should be near the configured size", best[0].length() <= 160);
}

private static String repeat(char c, int count) {
StringBuilder sb = new StringBuilder(count);
for (int i = 0; i < count; i++) {
sb.append(c);
}
return sb.toString();
}
}
Loading