
Commit

Merge branch 'main' into support-datetime-field
evanxg852000 committed Jul 11, 2022
2 parents b4a80b0 + 2406d92 commit e61a59f
Showing 35 changed files with 300 additions and 129 deletions.
15 changes: 10 additions & 5 deletions .github/workflows/long_running.yml
@@ -9,16 +9,21 @@ env:
NUM_FUNCTIONAL_TEST_ITERATIONS: 20000

jobs:
functional_test_unsorted:
test:

runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
- name: Install stable
uses: actions-rs/toolchain@v1
with:
toolchain: stable
override: true
components: rustfmt, clippy

- name: Run indexing_unsorted
run: cargo test indexing_unsorted -- --ignored
functional_test_sorted:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Run indexing_sorted
run: cargo test indexing_sorted -- --ignored

7 changes: 4 additions & 3 deletions .github/workflows/test.yml
@@ -16,22 +16,23 @@ jobs:

steps:
- uses: actions/checkout@v3
- name: Build
run: cargo build --verbose --workspace
- name: Install latest nightly to test also against unstable feature flag
uses: actions-rs/toolchain@v1
with:
toolchain: nightly
override: true
components: rustfmt

- name: Install latest nightly to test also against unstable feature flag
- name: Install stable
uses: actions-rs/toolchain@v1
with:
toolchain: stable
override: true
components: rustfmt, clippy

- name: Build
run: cargo build --verbose --workspace

- name: Run tests
run: cargo +stable test --features mmap,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints --verbose --workspace

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -71,7 +71,7 @@ proptest = "1.0.0"
criterion = "0.3.5"
test-log = "0.2.10"
env_logger = "0.9.0"
pprof = { version = "0.9.0", features = ["flamegraph", "criterion"] }
pprof = { version = "0.10.0", features = ["flamegraph", "criterion"] }
futures = "0.3.21"

[dev-dependencies.fail]
2 changes: 1 addition & 1 deletion bitpacker/Cargo.toml
@@ -1,7 +1,7 @@
[package]
name = "tantivy-bitpacker"
version = "0.2.0"
edition = "2018"
edition = "2021"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = []
2 changes: 1 addition & 1 deletion common/Cargo.toml
@@ -3,7 +3,7 @@ name = "tantivy-common"
version = "0.3.0"
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2018"
edition = "2021"
description = "common traits and utility functions used by multiple tantivy subcrates"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
2 changes: 1 addition & 1 deletion fastfield_codecs/Cargo.toml
@@ -3,7 +3,7 @@ name = "fastfield_codecs"
version = "0.2.0"
authors = ["Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2018"
edition = "2021"
description = "Fast field codecs used by tantivy"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
2 changes: 1 addition & 1 deletion ownedbytes/Cargo.toml
@@ -2,7 +2,7 @@
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
name = "ownedbytes"
version = "0.3.0"
edition = "2018"
edition = "2021"
description = "Expose data as static slice"
license = "MIT"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
4 changes: 2 additions & 2 deletions query-grammar/Cargo.toml
@@ -9,9 +9,9 @@ homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2018"
edition = "2021"

[dependencies]
combine = {version="4", default-features=false, features=[] }
once_cell = "1.7.2"
regex ={ version = "1.5.4", default-features = false, features = ["std"] }
regex ={ version = "1.5.4", default-features = false, features = ["std", "unicode"] }
63 changes: 49 additions & 14 deletions query-grammar/src/query_grammar.rs
@@ -16,9 +16,9 @@ use crate::Occur;
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to
// special characters.
const SPECIAL_CHARS: &[char] = &[
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '~', '!', '\\', '*', ' ',
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '!', '\\', '*', ' ',
];
const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|\~|!|\\|\*|\s)"#;
const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|!|\\|\*|\s)"#;

/// Parses a field_name
/// A field name must have at least one character and be followed by a colon.
@@ -120,22 +120,36 @@ fn date_time<'a>() -> impl Parser<&'a str, Output = String> {

fn term_val<'a>() -> impl Parser<&'a str, Output = String> {
let phrase = char('"').with(many1(satisfy(|c| c != '"'))).skip(char('"'));
phrase.or(word())
negative_number().or(phrase.or(word()))
}

fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> {
let term_val_with_field = negative_number().or(term_val());
(field_name(), term_val_with_field).map(|(field_name, phrase)| UserInputLiteral {
(field_name(), term_val(), slop_val()).map(|(field_name, phrase, slop)| UserInputLiteral {
field_name: Some(field_name),
phrase,
slop,
})
}

fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
let slop =
(char('~'), many1(digit())).and_then(|(_, slop): (_, String)| match slop.parse::<u32>() {
Ok(d) => Ok(d),
_ => Err(StringStreamError::UnexpectedParse),
});
optional(slop).map(|slop| match slop {
Some(d) => d,
_ => 0,
})
}

fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
let term_default_field = term_val().map(|phrase| UserInputLiteral {
let term_default_field = (term_val(), slop_val()).map(|(phrase, slop)| UserInputLiteral {
field_name: None,
phrase,
slop,
});

attempt(term_query())
.or(term_default_field)
.map(UserInputLeaf::from)
@@ -522,18 +536,10 @@ mod test {
super::field_name().parse(".my.field.name:a"),
Ok((".my.field.name".to_string(), "a"))
);
assert_eq!(
super::field_name().parse(r#"my\ field:a"#),
Ok(("my field".to_string(), "a"))
);
assert_eq!(
super::field_name().parse(r#"にんじん:a"#),
Ok(("にんじん".to_string(), "a"))
);
assert_eq!(
super::field_name().parse("my\\ field\\ name:a"),
Ok(("my field name".to_string(), "a"))
);
assert_eq!(
super::field_name().parse(r#"my\field:a"#),
Ok((r#"my\field"#.to_string(), "a"))
@@ -562,6 +568,17 @@ mod test {
super::field_name().parse("_my_field:a"),
Ok(("_my_field".to_string(), "a"))
);
assert_eq!(
super::field_name().parse("~my~field:a"),
Ok(("~my~field".to_string(), "a"))
);
for special_char in SPECIAL_CHARS.iter() {
let query = &format!("\\{special_char}my\\{special_char}field:a");
assert_eq!(
super::field_name().parse(&query),
Ok((format!("{special_char}my{special_char}field"), "a"))
);
}
}

#[test]
@@ -714,4 +731,22 @@ mod test {
);
test_is_parse_err("abc + ");
}

#[test]
fn test_slop() {
assert!(parse_to_ast().parse("\"a b\"~").is_err());
assert!(parse_to_ast().parse("foo:\"a b\"~").is_err());
assert!(parse_to_ast().parse("\"a b\"~a").is_err());
assert!(parse_to_ast().parse("\"a b\"~100000000000000000").is_err());

test_parse_query_to_ast_helper("\"a b\"^2~4", "(*(\"a b\")^2 *\"~4\")");
test_parse_query_to_ast_helper("\"~Document\"", "\"~Document\"");
test_parse_query_to_ast_helper("~Document", "\"~Document\"");
test_parse_query_to_ast_helper("a~2", "\"a~2\"");
test_parse_query_to_ast_helper("\"a b\"~0", "\"a b\"");
test_parse_query_to_ast_helper("\"a b\"~1", "\"a b\"~1");
test_parse_query_to_ast_helper("\"a b\"~3", "\"a b\"~3");
test_parse_query_to_ast_helper("foo:\"a b\"~300", "\"foo\":\"a b\"~300");
test_parse_query_to_ast_helper("\"a b\"~300^2", "(\"a b\"~300)^2");
}
}
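
The slop suffix exercised in `test_slop` can be reproduced in isolation. Below is a minimal, self-contained sketch of the `slop_val` combinator added above, assuming the same `combine` 4.x API the grammar already uses; it is an illustration of the parsing behaviour, not the crate's public API.

```rust
// Minimal sketch of the slop parser added above, assuming combine 4.x.
// It parses an optional `~<digits>` suffix into a u32 slop, defaulting to 0.
use combine::parser::char::{char, digit};
use combine::{many1, optional, Parser};

fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
    let slop = (char('~'), many1(digit())).and_then(|(_, digits): (_, String)| {
        digits
            .parse::<u32>()
            .map_err(|_| combine::error::StringStreamError::UnexpectedParse)
    });
    optional(slop).map(|slop| slop.unwrap_or(0))
}

fn main() {
    // `~3` yields a slop of 3; a missing suffix falls back to 0,
    // matching the `"a b"~3` and `"a b"` cases in the tests above.
    assert_eq!(slop_val().parse("~3").map(|(v, _)| v), Ok(3));
    assert_eq!(slop_val().parse("").map(|(v, _)| v), Ok(0));
}
```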
11 changes: 8 additions & 3 deletions query-grammar/src/user_input_ast.rs
@@ -40,14 +40,19 @@ impl Debug for UserInputLeaf {
pub struct UserInputLiteral {
pub field_name: Option<String>,
pub phrase: String,
pub slop: u32,
}

impl fmt::Debug for UserInputLiteral {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
match self.field_name {
Some(ref field_name) => write!(formatter, "\"{}\":\"{}\"", field_name, self.phrase),
None => write!(formatter, "\"{}\"", self.phrase),
if let Some(ref field) = self.field_name {
write!(formatter, "\"{}\":", field)?;
}
write!(formatter, "\"{}\"", self.phrase)?;
if self.slop > 0 {
write!(formatter, "~{}", self.slop)?;
}
Ok(())
}
}

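The reworked `Debug` impl only prints the field prefix and the `~slop` suffix when present. A small hedged sketch of the resulting output follows; the crate/import path is an assumption, and `UserInputLiteral` is taken to have exactly the fields shown in the hunk above.

```rust
// Hedged sketch of the new Debug formatting; the import path below is assumed,
// and the struct is taken to have only the fields shown in the diff.
use tantivy_query_grammar::UserInputLiteral;

fn main() {
    let with_field = UserInputLiteral {
        field_name: Some("foo".to_string()),
        phrase: "a b".to_string(),
        slop: 3,
    };
    // Field prefix and non-zero slop are both rendered.
    assert_eq!(format!("{:?}", with_field), r#""foo":"a b"~3"#);

    let bare = UserInputLiteral {
        field_name: None,
        phrase: "a b".to_string(),
        slop: 0,
    };
    // No field prefix, and a zero slop is omitted entirely.
    assert_eq!(format!("{:?}", bare), r#""a b""#);
}
```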
2 changes: 2 additions & 0 deletions src/aggregation/collector.rs
@@ -9,6 +9,7 @@ use crate::aggregation::agg_req_with_accessor::get_aggs_with_accessor_and_valida
use crate::collector::{Collector, SegmentCollector};
use crate::{SegmentReader, TantivyError};

/// The default max bucket count, before the aggregation fails.
pub const MAX_BUCKET_COUNT: u32 = 65000;

/// Collector for aggregations.
@@ -22,6 +23,7 @@ pub struct AggregationCollector {
impl AggregationCollector {
/// Create collector from aggregation request.
///
/// Aggregation fails when the total bucket count is higher than max_bucket_count.
/// max_bucket_count will default to `MAX_BUCKET_COUNT` (65000) when unset
pub fn from_aggs(agg: Aggregations, max_bucket_count: Option<u32>) -> Self {
Self {
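A minimal usage sketch of the documented default: passing `None` falls back to `MAX_BUCKET_COUNT`, while an explicit value caps the bucket count earlier. The module paths are assumptions for this version of the crate, and the aggregation request `agg` is taken to exist already.

```rust
// Hedged sketch: module paths are assumed, and `agg` is an already-built request.
use tantivy::aggregation::agg_req::Aggregations;
use tantivy::aggregation::AggregationCollector;

fn build_collector(agg: Aggregations, strict: bool) -> AggregationCollector {
    if strict {
        // Fail the aggregation once more than 10_000 buckets would be created.
        AggregationCollector::from_aggs(agg, Some(10_000))
    } else {
        // `None` falls back to MAX_BUCKET_COUNT (65_000).
        AggregationCollector::from_aggs(agg, None)
    }
}
```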
13 changes: 4 additions & 9 deletions src/aggregation/intermediate_agg_result.rs
@@ -280,11 +280,9 @@ impl IntermediateBucketResult {
.collect::<crate::Result<Vec<_>>>()?;

buckets.sort_by(|left, right| {
// TODO use total_cmp next stable rust release
left.from
.unwrap_or(f64::MIN)
.partial_cmp(&right.from.unwrap_or(f64::MIN))
.unwrap_or(Ordering::Equal)
.total_cmp(&right.from.unwrap_or(f64::MIN))
});
Ok(BucketResult::Range { buckets })
}
@@ -441,12 +439,9 @@ impl IntermediateTermBucketResult {
})
.collect::<crate::Result<Vec<_>>>()?;

buckets_with_val.sort_by(|(_, val1), (_, val2)| {
// TODO use total_cmp in next rust stable release
match &order {
Order::Desc => val2.partial_cmp(val1).unwrap_or(std::cmp::Ordering::Equal),
Order::Asc => val1.partial_cmp(val2).unwrap_or(std::cmp::Ordering::Equal),
}
buckets_with_val.sort_by(|(_, val1), (_, val2)| match &order {
Order::Desc => val2.total_cmp(val1),
Order::Asc => val1.total_cmp(val2),
});
buckets = buckets_with_val
.into_iter()
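The `partial_cmp(...).unwrap_or(Ordering::Equal)` fallback is replaced by `f64::total_cmp`, stabilized in Rust 1.62, which defines a total order over all floats. A standalone sketch of the ordering behaviour:

```rust
// Standalone sketch: total_cmp yields a total order, so sorting no longer needs
// an `unwrap_or(Ordering::Equal)` escape hatch for NaN comparisons.
fn main() {
    let mut froms = vec![2.5_f64, f64::NAN, 0.5, f64::MIN];
    froms.sort_by(|a, b| a.total_cmp(b));
    // f64::MIN sorts first, and a (positive) NaN sorts after every other value.
    assert_eq!(froms[0], f64::MIN);
    assert_eq!(froms[1], 0.5);
    assert_eq!(froms[2], 2.5);
    assert!(froms[3].is_nan());
}
```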
1 change: 1 addition & 0 deletions src/aggregation/mod.rs
@@ -166,6 +166,7 @@ use std::fmt::Display;

pub use collector::{
AggregationCollector, AggregationSegmentCollector, DistributedAggregationCollector,
MAX_BUCKET_COUNT,
};
use itertools::Itertools;
use serde::{Deserialize, Serialize};
4 changes: 2 additions & 2 deletions src/collector/facet_collector.rs
@@ -271,8 +271,8 @@ impl Collector for FacetCollector {
let mut facet_streamer = facet_reader.facet_dict().range().into_stream()?;
if facet_streamer.advance() {
'outer: loop {
// at the begining of this loop, facet_streamer
// is positionned on a term that has not been processed yet.
// at the beginning of this loop, facet_streamer
// is positioned on a term that has not been processed yet.
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
match skip_result {
SkipResult::Found => {
6 changes: 2 additions & 4 deletions src/collector/tests.rs
@@ -69,10 +69,8 @@ pub fn test_filter_collector() -> crate::Result<()> {

/// Stores all of the doc ids.
/// This collector is only used for tests.
/// It is unusable in pr
///
/// actise, as it does not store
/// the segment ordinals
/// It is unusable in practise, as it does
/// not store the segment ordinals
pub struct TestCollector {
pub compute_score: bool,
}
2 changes: 1 addition & 1 deletion src/collector/top_collector.rs
@@ -137,7 +137,7 @@
/// sorted by type `T`.
///
/// The implementation is based on a `BinaryHeap`.
/// The theorical complexity for collecting the top `K` out of `n` documents
/// The theoretical complexity for collecting the top `K` out of `n` documents
/// is `O(n log K)`.
pub(crate) struct TopSegmentCollector<T> {
limit: usize,
4 changes: 2 additions & 2 deletions src/collector/top_score_collector.rs
@@ -79,7 +79,7 @@
/// sorted by their score.
///
/// The implementation is based on a `BinaryHeap`.
/// The theorical complexity for collecting the top `K` out of `n` documents
/// The theoretical complexity for collecting the top `K` out of `n` documents
/// is `O(n log K)`.
///
/// This collector guarantees a stable sorting in case of a tie on the
@@ -283,7 +283,7 @@ impl TopDocs {
///
/// # See also
///
/// To confortably work with `u64`s, `i64`s, `f64`s, or `date`s, please refer to
/// To comfortably work with `u64`s, `i64`s, `f64`s, or `date`s, please refer to
/// [.order_by_fast_field(...)](#method.order_by_fast_field) method.
pub fn order_by_u64_field(
self,
18 changes: 16 additions & 2 deletions src/core/searcher.rs
@@ -6,7 +6,7 @@ use crate::core::{Executor, SegmentReader};
use crate::query::Query;
use crate::schema::{Document, Schema, Term};
use crate::space_usage::SearcherSpaceUsage;
use crate::store::StoreReader;
use crate::store::{CacheStats, StoreReader};
use crate::{DocAddress, Index, Opstamp, SegmentId, TrackedObject};

/// Identifies the searcher generation accessed by a [Searcher].
@@ -77,11 +77,13 @@ impl Searcher {
index: Index,
segment_readers: Vec<SegmentReader>,
generation: TrackedObject<SearcherGeneration>,
doc_store_cache_size: usize,
) -> io::Result<Searcher> {
let store_readers: Vec<StoreReader> = segment_readers
.iter()
.map(SegmentReader::get_store_reader)
.map(|segment_reader| segment_reader.get_store_reader(doc_store_cache_size))
.collect::<io::Result<Vec<_>>>()?;

Ok(Searcher {
schema,
index,
@@ -110,6 +112,18 @@ impl Searcher {
store_reader.get(doc_address.doc_id)
}

/// The cache stats for the underlying store reader.
///
/// Aggregates the sum for each segment store reader.
pub fn doc_store_cache_stats(&self) -> CacheStats {
let cache_stats: CacheStats = self
.store_readers
.iter()
.map(|reader| reader.cache_stats())
.sum();
cache_stats
}

/// Fetches a document in an asynchronous manner.
#[cfg(feature = "quickwit")]
pub async fn doc_async(&self, doc_address: DocAddress) -> crate::Result<Document> {
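The new accessor sums the per-segment store-reader cache statistics. A hedged sketch of reading them from an open searcher, assuming `CacheStats` implements `Debug` (its fields are not shown in this hunk):

```rust
// Hedged sketch: `CacheStats` is assumed to implement Debug; only the aggregate is printed.
use tantivy::Searcher;

fn log_doc_store_cache(searcher: &Searcher) {
    // Sum of the cache stats across every segment's StoreReader.
    let stats = searcher.doc_store_cache_stats();
    println!("doc store cache: {:?}", stats);
}
```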
