Ensure cross_fields always uses valid term statistics (#90278) (#90314)
In #89016 we adjusted the `cross_fields` scoring formula to prevent negative
scores. This fix accidentally dropped another important fix that was added in
#41938. Specifically, we need to make sure to take the minimum between the
document frequency (`actualDf`) and the minimum total term frequency
(`minTTF`). Otherwise, we can produce invalid term statistics where the total
term frequency is less than the document frequency.

Fixes #90275
jtibshirani committed Sep 23, 2022
1 parent 16e5fa4 commit 655ce96
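
For background on the exception this prevents: Lucene's `TermStatistics` constructor enforces the invariant itself, rejecting statistics where the total term frequency is below the document frequency. A minimal sketch (assuming `lucene-core` on the classpath; the class name and numbers are illustrative, not from the commit):

```java
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;

public class TermStatsInvariantDemo {
    public static void main(String[] args) {
        // Valid: "foo" occurs 10 times in total across 4 documents.
        new TermStatistics(new BytesRef("foo"), 4, 10);

        // Invalid: docFreq (4) exceeds totalTermFreq (2). Lucene throws an
        // IllegalArgumentException ("totalTermFreq must be at least docFreq"),
        // the failure mode this commit prevents.
        new TermStatistics(new BytesRef("foo"), 4, 2);
    }
}
```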
Showing 4 changed files with 84 additions and 3 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/90314.yaml
@@ -0,0 +1,5 @@
+pr: 90314
+summary: Ensure `cross_fields` always uses valid term statistics
+area: Ranking
+type: regression
+issues: []
11 changes: 11 additions & 0 deletions docs/reference/release-notes/8.4.2.asciidoc
@@ -3,6 +3,17 @@
 
 Also see <<breaking-changes-8.4,Breaking changes in 8.4>>.
 
+=== Known issues
+
+* **This version contains a regression in `multi_match` queries that use the
+`cross_fields` scoring type.**
++
+When running a <<query-dsl-multi-match-query,`multi_match`>> query with the
+`cross_fields` type, {es} can sometimes throw an `IllegalArgumentException`
+with the message "totalTermFreq must be at least docFreq". If you use the
+`cross_fields` scoring type, it is recommended that you skip version 8.4.2.
+This regression was fixed in version 8.4.3.
+
 [[bug-8.4.2]]
 [float]
 === Bug fixes
10 changes: 9 additions & 1 deletion server/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java
@@ -150,7 +150,15 @@ protected int compare(int i, int j) {
             }
 
             int docCount = reader.getDocCount(terms[i].field());
-            int newDocFreq = Math.min(actualDf, docCount);
+
+            // IMPORTANT: we make two adjustments here to ensure the new document frequency is valid:
+            // 1. We take a minimum with docCount, the total number of documents that contain this
+            // field. The document frequency can never exceed the document count.
+            // 2. We also take a minimum with maxDoc. Earlier in this method, maxDoc is adjusted to
+            // the minimum of maxDoc and minTTF. So taking the minimum ensures that the document
+            // frequency is never greater than the total term frequency, which would be illegal.
+            int newDocFreq = Math.min(Math.min(actualDf, docCount), maxDoc);
+
             contexts[i] = ctx = adjustDF(reader.getContext(), ctx, newDocFreq);
             prev = current;
             sumTTF += ctx.totalTermFreq();
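
To make the second adjustment concrete, a toy walk-through with made-up numbers (illustrative values, not from the diff): suppose the rarest field's total term frequency for the term is `minTTF = 5`, so `maxDoc` has already been lowered to 5 earlier in the surrounding method. Without the extra clamp, a blended `actualDf` of 8 would report `docFreq = 8` against `totalTermFreq = 5`, which Lucene rejects.

```java
int actualDf = 8;   // blended document frequency before clamping
int docCount = 100; // documents that contain this field
int maxDoc = 5;     // already min(maxDoc, minTTF) at this point
int newDocFreq = Math.min(Math.min(actualDf, docCount), maxDoc); // == 5, restoring docFreq <= totalTermFreq
```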
61 changes: 59 additions & 2 deletions server/src/test/java/org/apache/lucene/queries/BlendedTermQueryTests.java
@@ -37,8 +37,10 @@
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 
 import static org.hamcrest.Matchers.containsInAnyOrder;
@@ -228,16 +230,22 @@ public void testMinTTF() throws IOException {
             Document d = new Document();
             d.add(new TextField("id", Integer.toString(i), Field.Store.YES));
             d.add(new Field("dense", "foo foo foo", ft));
-            if (i % 10 == 0) {
+            if (i % 2 == 0) {
                 d.add(new Field("sparse", "foo", ft));
             }
+            if (i % 10 == 0) {
+                d.add(new Field("more_sparse", "foo", ft));
+            }
             w.addDocument(d);
         }
+
         w.commit();
+        w.forceMerge(1);
+
         DirectoryReader reader = DirectoryReader.open(w);
         IndexSearcher searcher = setSimilarity(newSearcher(reader));
         {
-            String[] fields = new String[] { "dense", "sparse" };
+            String[] fields = new String[] { "dense", "sparse", "more_sparse" };
             Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "foo"), 0.1f);
             TopDocs search = searcher.search(query, 10);
             ScoreDoc[] scoreDocs = search.scoreDocs;
@@ -248,6 +256,55 @@ public void testMinTTF() throws IOException {
         dir.close();
     }
 
+    public void testRandomFields() throws IOException {
+        Directory dir = newDirectory();
+        IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
+        FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
+        ft.freeze();
+
+        Map<String, Float> fields = new HashMap<>();
+        fields.put("field", 1.0f);
+
+        int numRandomFields = random().nextInt(7);
+        for (int i = 0; i < numRandomFields; i++) {
+            String field = "field" + i;
+            float probability = randomBoolean() ? 1.0f : randomFloat();
+            fields.put(field, probability);
+        }
+
+        int numDocs = atLeast(100);
+        for (int i = 0; i < numDocs; i++) {
+            Document d = new Document();
+            for (Map.Entry<String, Float> entry : fields.entrySet()) {
+                String field = entry.getKey();
+                float probability = entry.getValue();
+                if (randomFloat() < probability) {
+                    String value = randomBoolean() ? "foo" : "foo foo foo";
+                    d.add(new Field(field, value, ft));
+                }
+                if (randomFloat() < probability) {
+                    d.add(new Field(field, "bar bar", ft));
+                }
+            }
+            w.addDocument(d);
+        }
+
+        w.commit();
+
+        DirectoryReader reader = DirectoryReader.open(w);
+        IndexSearcher searcher = setSimilarity(newSearcher(reader));
+        {
+            String[] fieldNames = fields.keySet().toArray(new String[0]);
+            Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fieldNames, "foo"), 0.1f);
+            TopDocs search = searcher.search(query, 10);
+            assertTrue(search.totalHits.value > 0);
+            assertTrue(search.scoreDocs.length > 0);
+        }
+        reader.close();
+        w.close();
+        dir.close();
+    }
+
     public void testMissingFields() throws IOException {
         Directory dir = newDirectory();
         IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
