From 727d024a23971d869a62c2e0872caa57482ee098 Mon Sep 17 00:00:00 2001
From: Paul Masurel <paul@quickwit.io>
Date: Wed, 19 Oct 2022 19:25:32 +0900
Subject: [PATCH] Bugfix position broken.

For a Field with several FieldValues, one of which
contained no token at all, the token position
was reset to 0.

As a result, PhraseQueries could return false positives.
In addition, computing the position delta could
underflow u32 and end up with a gigantic delta.

We haven't been able to fully explain the bug in #1629, but we
assume that in some corner cases these deltas can cause a panic.

Closes #1629
---
 src/indexer/segment_writer.rs   | 34 +++++++++++++++++++++++++++++++++
 src/postings/postings_writer.rs |  2 +-
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs
index 724c70e235..baa3b4f1ad 100644
--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -751,4 +751,38 @@ mod tests {
         let phrase_query = PhraseQuery::new(vec![nothello_term, happy_term]);
         assert_eq!(searcher.search(&phrase_query, &Count).unwrap(), 0);
     }
+
+    #[test]
+    fn test_bug_regression_1629_position_when_array_with_a_field_value_that_does_not_contain_any_token(
+    ) {
+        // Regression test for #1629: a field value yielding no token (here `""`) used to reset
+        // the token position to 0, so computing the position delta could underflow.
+        //
+        // See the commit that introduced this unit test for the details.
+        let mut schema_builder = Schema::builder();
+        let text = schema_builder.add_text_field("text", TEXT);
+        let schema = schema_builder.build();
+        let doc = schema
+            .parse_document(r#"{"text": [ "bbb", "aaa", "", "aaa"]}"#)
+            .unwrap();
+        let index = Index::create_in_ram(schema);
+        let mut index_writer = index.writer_for_tests().unwrap();
+        index_writer.add_document(doc).unwrap();
+        // In debug builds, this commit used to panic on the underflow.
+        index_writer.commit().unwrap();
+        let reader = index.reader().unwrap();
+        let searcher = reader.searcher();
+        let seg_reader = searcher.segment_reader(0);
+        let inv_index = seg_reader.inverted_index(text).unwrap();
+        let term = Term::from_field_text(text, "aaa");
+        let mut postings = inv_index
+            .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
+            .unwrap()
+            .unwrap();
+        assert_eq!(postings.doc(), 0u32);
+        let mut positions = Vec::new();
+        postings.positions(&mut positions);
+        // In release builds, this used to be [2, 1] (note the decreasing values).
+        assert_eq!(positions, &[2, 5]);
+    }
 }
diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs
index 5581479aa5..915a627481 100644
--- a/src/postings/postings_writer.rs
+++ b/src/postings/postings_writer.rs
@@ -155,7 +155,7 @@ pub(crate) trait PostingsWriter: Send + Sync {
     ) {
         let end_of_path_idx = term_buffer.len_bytes();
         let mut num_tokens = 0;
-        let mut end_position = 0;
+        let mut end_position = indexing_position.end_position;
         token_stream.process(&mut |token: &Token| {
             // We skip all tokens with a len greater than u16.
             if token.text.len() > MAX_TOKEN_LEN {