Ensure cross_fields always uses valid term statistics (#90278) #90314

Merged 2 commits on Sep 23, 2022
5 changes: 5 additions & 0 deletions docs/changelog/90314.yaml
@@ -0,0 +1,5 @@
pr: 90314
summary: Ensure `cross_fields` always uses valid term statistics
area: Ranking
type: regression
issues: []
11 changes: 11 additions & 0 deletions docs/reference/release-notes/8.4.2.asciidoc
@@ -3,6 +3,17 @@

Also see <<breaking-changes-8.4,Breaking changes in 8.4>>.

=== Known issues

* **This version contains a regression in `multi_match` queries that use the
`cross_fields` scoring type.**
+
When running a <<query-dsl-multi-match-query,`multi_match`>> query with the
`cross_fields` type, {es} can sometimes throw an `IllegalArgumentException`
with the message "totalTermFreq must be at least docFreq". If you use the
`cross_fields` scoring type, it is recommended that you skip version 8.4.2.
This regression was fixed in version 8.4.3.
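
For reference, the affected query shape looks like the following. This is a minimal Java sketch using the server-side `QueryBuilders` helper; the query text and field names are illustrative, not taken from the PR:

    import org.elasticsearch.index.query.MultiMatchQueryBuilder;
    import org.elasticsearch.index.query.QueryBuilders;

    // Any cross_fields multi_match over fields with very different sparsity
    // can hit the 8.4.2 regression; "title" and "body" are example fields.
    MultiMatchQueryBuilder query = QueryBuilders.multiMatchQuery("quick fox", "title", "body")
        .type(MultiMatchQueryBuilder.Type.CROSS_FIELDS);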

[[bug-8.4.2]]
[float]
=== Bug fixes
server/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java
@@ -150,7 +150,15 @@ protected int compare(int i, int j) {
}

int docCount = reader.getDocCount(terms[i].field());
int newDocFreq = Math.min(actualDf, docCount);

// IMPORTANT: we make two adjustments here to ensure the new document frequency is valid:
// 1. We take a minimum with docCount, which is the total number of documents that contain
// this field. The document frequency can never validly exceed the document count.
// 2. We also take a minimum with maxDoc. Earlier in this method, maxDoc was lowered to the
// minimum of maxDoc and minTTF. Taking this minimum ensures that the document frequency
// is never greater than the total term frequency, which would be illegal.
int newDocFreq = Math.min(Math.min(actualDf, docCount), maxDoc);

contexts[i] = ctx = adjustDF(reader.getContext(), ctx, newDocFreq);
prev = current;
sumTTF += ctx.totalTermFreq();
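
The comment above is the heart of the fix. As a minimal standalone sketch (a hypothetical helper, not part of `BlendedTermQuery`'s API), the clamping reduces to a pure function, assuming maxDoc has already been lowered to min(maxDoc, minTTF) as the comment describes:

    // Hypothetical helper illustrating the clamp above; not Elasticsearch source.
    static int clampedDocFreq(int actualDf, int docCount, int maxDoc) {
        // df <= docCount: a term cannot appear in more documents than contain the field.
        // df <= maxDoc, where maxDoc was previously lowered to min(maxDoc, minTTF):
        // since each blended totalTermFreq is at least minTTF, this keeps
        // df <= totalTermFreq and avoids Lucene's
        // "totalTermFreq must be at least docFreq" IllegalArgumentException.
        return Math.min(Math.min(actualDf, docCount), maxDoc);
    }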
server/src/test/java/org/apache/lucene/queries/BlendedTermQueryTests.java
@@ -37,8 +37,10 @@
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import static org.hamcrest.Matchers.containsInAnyOrder;
@@ -228,16 +230,22 @@ public void testMinTTF() throws IOException {
Document d = new Document();
d.add(new TextField("id", Integer.toString(i), Field.Store.YES));
d.add(new Field("dense", "foo foo foo", ft));
if (i % 10 == 0) {
if (i % 2 == 0) {
d.add(new Field("sparse", "foo", ft));
}
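// more_sparse matches only every tenth document, so its term statistics
// are far lower than those of the other fields.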
if (i % 10 == 0) {
d.add(new Field("more_sparse", "foo", ft));
}
w.addDocument(d);
}

w.commit();
w.forceMerge(1);

DirectoryReader reader = DirectoryReader.open(w);
IndexSearcher searcher = setSimilarity(newSearcher(reader));
{
String[] fields = new String[] { "dense", "sparse" };
String[] fields = new String[] { "dense", "sparse", "more_sparse" };
Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "foo"), 0.1f);
TopDocs search = searcher.search(query, 10);
ScoreDoc[] scoreDocs = search.scoreDocs;
@@ -248,6 +256,55 @@ public void testMinTTF() throws IOException {
dir.close();
}

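// Randomized coverage for the docFreq clamping above: index fields of varying
// sparsity and check that a blended query scores documents without tripping
// the "totalTermFreq must be at least docFreq" validation.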
public void testRandomFields() throws IOException {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.freeze();

Map<String, Float> fields = new HashMap<>();
fields.put("field", 1.0f);

int numRandomFields = random().nextInt(7);
for (int i = 0; i < numRandomFields; i++) {
String field = "field" + i;
float probability = randomBoolean() ? 1.0f : randomFloat();
fields.put(field, probability);
}

int numDocs = atLeast(100);
for (int i = 0; i < numDocs; i++) {
Document d = new Document();
for (Map.Entry<String, Float> entry : fields.entrySet()) {
String field = entry.getKey();
float probability = entry.getValue();
if (randomFloat() < probability) {
String value = randomBoolean() ? "foo" : "foo foo foo";
d.add(new Field(field, value, ft));
}
if (randomFloat() < probability) {
d.add(new Field(field, "bar bar", ft));
}
}
w.addDocument(d);
}

w.commit();

DirectoryReader reader = DirectoryReader.open(w);
IndexSearcher searcher = setSimilarity(newSearcher(reader));
{
String[] fieldNames = fields.keySet().toArray(new String[0]);
Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fieldNames, "foo"), 0.1f);
TopDocs search = searcher.search(query, 10);
assertTrue(search.totalHits.value > 0);
assertTrue(search.scoreDocs.length > 0);
}
reader.close();
w.close();
dir.close();
}

public void testMissingFields() throws IOException {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));