Commit b4a80b0

Updated DateTime to hold timestamp in microseconds, while making date field precision configurable

evanxg852000 committed Jul 11, 2022
1 parent 11e4225 commit b4a80b0
Showing 42 changed files with 1,269 additions and 483 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,10 @@
+Tantivy 0.19
+================================
+- Updated [Date Field Type](https://github.com/quickwit-oss/tantivy/pull/1396)
+The `DateTime` type has been updated to hold timestamps with microseconds precision.
+`DateOptions` and `DatePrecision` have been added to configure date fields. The precision is used as a hint for fast-field value compression; everywhere else (i.e. terms, indexing), seconds precision is used.
+
+
Tantivy 0.18
================================
- For date values `chrono` has been replaced with `time` (@uklotzde) #1304 :
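For illustration, a small sketch of what the 0.19 entry implies for `DateTime` itself. The accessor names below are assumptions based on the changelog wording, not taken from this diff:

```rust
use tantivy::DateTime;

fn main() {
    // Assumed API: DateTime now holds microseconds internally, so a
    // microsecond round-trip should be lossless, while the seconds-level
    // accessor truncates the sub-second part.
    let dt = DateTime::from_timestamp_micros(1_655_902_430_530_000);
    assert_eq!(dt.into_timestamp_micros(), 1_655_902_430_530_000);
    assert_eq!(dt.into_timestamp_secs(), 1_655_902_430);
}
```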
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -49,7 +49,7 @@ thiserror = "1.0.30"
htmlescape = "0.3.1"
fail = "0.5.0"
murmurhash32 = "0.2.0"
time = { version = "0.3.9", features = ["serde-well-known"] }
time = { version = "0.3.10", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.7.5"
11 changes: 10 additions & 1 deletion README.md
@@ -152,4 +152,13 @@ You can also find other bindings on [GitHub](https://github.com/search?q=tantivy
- and [more](https://github.com/search?q=tantivy)!

### On average, how much faster is Tantivy compared to Lucene?
-- According to our [search latency benchmark](https://tantivy-search.github.io/bench/), Tantivy is approximately 2x faster than Lucene.
+- According to our [search latency benchmark](https://tantivy-search.github.io/bench/), Tantivy is approximately 2x faster than Lucene.
+
+### Does tantivy support incremental indexing?
+- Yes.
+
+### How can I edit documents?
+- Data in tantivy is immutable. To edit a document, delete it and reindex it.
+
+### When will my documents be searchable during indexing?
+- Documents become searchable after `commit` is called on an `IndexWriter`. Existing `IndexReader`s also need to be reloaded to reflect the changes. Finally, changes are only visible to newly acquired `Searcher`s.
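A minimal sketch of that lifecycle (the schema and the writer heap size are arbitrary, not from the repository):

```rust
use tantivy::collector::Count;
use tantivy::query::AllQuery;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer(50_000_000)?;
    let reader = index.reader()?;

    index_writer.add_document(doc!(body => "hello"))?;
    // Nothing committed yet: the document is not searchable.
    assert_eq!(reader.searcher().search(&AllQuery, &Count)?, 0);

    index_writer.commit()?;
    // Reload the reader so the committed segments become visible.
    reader.reload()?;
    // A newly acquired searcher reflects the committed changes.
    assert_eq!(reader.searcher().search(&AllQuery, &Count)?, 1);
    Ok(())
}
```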
2 changes: 1 addition & 1 deletion common/src/writer.rs
@@ -62,7 +62,7 @@ impl<W: TerminatingWrite> TerminatingWrite for CountingWriter<W> {
pub struct AntiCallToken(());

/// Trait used to indicate that no more writes need to be done on a writer.
-pub trait TerminatingWrite: Write {
+pub trait TerminatingWrite: Write + Send {
    /// Indicate that the writer will no longer be used. Internally calls `terminate_ref`.
    fn terminate(mut self) -> io::Result<()>
    where Self: Sized {
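The only change in this hunk is the new `Send` supertrait. A self-contained sketch of what it enables; the trait is a simplified mirror of the one above, and `VecWriter`/`flush_in_background` are illustrative, not part of tantivy:

```rust
use std::io::{self, Write};
use std::thread;

// Simplified mirror of the trait above; `Send` is the new supertrait.
pub trait TerminatingWrite: Write + Send {
    /// Consume the writer, flushing and finalizing it.
    fn terminate(self) -> io::Result<()>
    where
        Self: Sized;
}

struct VecWriter(Vec<u8>);

impl Write for VecWriter {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        self.0.extend_from_slice(buf);
        Ok(buf.len())
    }
    fn flush(&mut self) -> io::Result<()> {
        Ok(())
    }
}

impl TerminatingWrite for VecWriter {
    fn terminate(mut self) -> io::Result<()> {
        self.flush()
    }
}

// Thanks to the `Send` supertrait, any `TerminatingWrite` can be moved to
// another thread, e.g. to finish writing a file off the indexing thread.
fn flush_in_background<W: TerminatingWrite + 'static>(
    mut writer: W,
) -> thread::JoinHandle<io::Result<()>> {
    thread::spawn(move || {
        writer.write_all(b"segment data")?;
        writer.terminate()
    })
}

fn main() -> io::Result<()> {
    flush_in_background(VecWriter(Vec::new())).join().unwrap()
}
```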
2 changes: 1 addition & 1 deletion examples/aggregation.rs
@@ -117,7 +117,7 @@ fn main() -> tantivy::Result<()> {
    .into_iter()
    .collect();

-    let collector = AggregationCollector::from_aggs(agg_req_1);
+    let collector = AggregationCollector::from_aggs(agg_req_1, None);

    let searcher = reader.searcher();
    let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
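(The added `None` argument appears to correspond to the optional bucket-count limit threaded through `src/aggregation` later in this commit; passing `None` presumably keeps the default limit.)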
69 changes: 69 additions & 0 deletions examples/date_time_field.rs
@@ -0,0 +1,69 @@
// # DateTime field example
//
// This example shows how the DateTime field can be used

use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Cardinality, DateOptions, Schema, Value, INDEXED, STORED, STRING};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    // # Defining the schema
    let mut schema_builder = Schema::builder();
    let opts = DateOptions::from(INDEXED)
        .set_stored()
        .set_fast(Cardinality::SingleValue)
        .set_precision(tantivy::DatePrecision::Seconds);
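    // Assumption from the changelog above: this precision is a hint for
    // fast-field value compression; indexed terms always use seconds.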
    let occurred_at = schema_builder.add_date_field("occurred_at", opts);
    let event_type = schema_builder.add_text_field("event", STRING | STORED);
    let schema = schema_builder.build();

    // # Indexing documents
    let index = Index::create_in_ram(schema.clone());

    let mut index_writer = index.writer(50_000_000)?;
    let doc = schema.parse_document(
        r#"{
            "occurred_at": "2022-06-22T12:53:50.53Z",
            "event": "pull-request"
        }"#,
    )?;
    index_writer.add_document(doc)?;
    let doc = schema.parse_document(
        r#"{
            "occurred_at": "2022-06-22T13:00:00.22Z",
            "event": "comment"
        }"#,
    )?;
    index_writer.add_document(doc)?;
    index_writer.commit()?;

    let reader = index.reader()?;
    let searcher = reader.searcher();

    // # Default fields: event_type
    let query_parser = QueryParser::for_index(&index, vec![event_type]);
    {
        let query = query_parser.parse_query("event:comment")?;
        let count_docs = searcher.search(&*query, &TopDocs::with_limit(5))?;
        assert_eq!(count_docs.len(), 1);
    }
    {
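        // Range query syntax: '[' makes the lower bound inclusive,
        // '}' makes the upper bound exclusive.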
        let query = query_parser
            .parse_query(r#"occurred_at:[2022-06-22T12:58:00Z TO 2022-06-23T00:00:00Z}"#)?;
        let count_docs = searcher.search(&*query, &TopDocs::with_limit(4))?;
        assert_eq!(count_docs.len(), 1);
        for (_score, doc_address) in count_docs {
            let retrieved_doc = searcher.doc(doc_address)?;
            assert!(matches!(
                retrieved_doc.get_first(occurred_at),
                Some(Value::Date(_))
            ));
            assert_eq!(
                schema.to_json(&retrieved_doc),
                r#"{"event":["comment"],"occurred_at":["2022-06-22T13:00:00.22Z"]}"#
            );
        }
    }
    Ok(())
}
2 changes: 1 addition & 1 deletion fastfield_codecs/src/bitpacked.rs
@@ -14,7 +14,7 @@ pub struct BitpackedFastFieldReader {
    pub max_value_u64: u64,
}

-impl<'data> FastFieldCodecReader for BitpackedFastFieldReader {
+impl FastFieldCodecReader for BitpackedFastFieldReader {
    /// Opens a fast field given a file.
    fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
        let (_data, mut footer) = bytes.split_at(bytes.len() - 16);
21 changes: 20 additions & 1 deletion src/aggregation/agg_req_with_accessor.rs
@@ -1,10 +1,13 @@
//! This will enhance the request tree with access to the fastfield and metadata.

+use std::rc::Rc;
+use std::sync::atomic::AtomicU32;
use std::sync::Arc;

use super::agg_req::{Aggregation, Aggregations, BucketAggregationType, MetricAggregation};
use super::bucket::{HistogramAggregation, RangeAggregation, TermsAggregation};
use super::metric::{AverageAggregation, StatsAggregation};
+use super::segment_agg_result::BucketCount;
use super::VecWithNames;
use crate::fastfield::{
    type_and_cardinality, DynamicFastFieldReader, FastType, MultiValuedFastFieldReader,
Expand Down Expand Up @@ -60,13 +63,16 @@ pub struct BucketAggregationWithAccessor {
    pub(crate) field_type: Type,
    pub(crate) bucket_agg: BucketAggregationType,
    pub(crate) sub_aggregation: AggregationsWithAccessor,
+    pub(crate) bucket_count: BucketCount,
}

impl BucketAggregationWithAccessor {
    fn try_from_bucket(
        bucket: &BucketAggregationType,
        sub_aggregation: &Aggregations,
        reader: &SegmentReader,
+        bucket_count: Rc<AtomicU32>,
+        max_bucket_count: u32,
    ) -> crate::Result<BucketAggregationWithAccessor> {
        let mut inverted_index = None;
        let (accessor, field_type) = match &bucket {
@@ -92,9 +98,18 @@ impl BucketAggregationWithAccessor {
        Ok(BucketAggregationWithAccessor {
            accessor,
            field_type,
-            sub_aggregation: get_aggs_with_accessor_and_validate(&sub_aggregation, reader)?,
+            sub_aggregation: get_aggs_with_accessor_and_validate(
+                &sub_aggregation,
+                reader,
+                bucket_count.clone(),
+                max_bucket_count,
+            )?,
            bucket_agg: bucket.clone(),
            inverted_index,
+            bucket_count: BucketCount {
+                bucket_count,
+                max_bucket_count,
+            },
        })
    }
}
@@ -134,6 +149,8 @@ impl MetricAggregationWithAccessor {
pub(crate) fn get_aggs_with_accessor_and_validate(
    aggs: &Aggregations,
    reader: &SegmentReader,
+    bucket_count: Rc<AtomicU32>,
+    max_bucket_count: u32,
) -> crate::Result<AggregationsWithAccessor> {
    let mut metrics = vec![];
    let mut buckets = vec![];
@@ -145,6 +162,8 @@ pub(crate) fn get_aggs_with_accessor_and_validate(
                &bucket.bucket_agg,
                &bucket.sub_aggregation,
                reader,
+                Rc::clone(&bucket_count),
+                max_bucket_count,
            )?,
        )),
        Aggregation::Metric(metric) => metrics.push((
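The new parameters above thread a shared counter plus a limit through every level of the aggregation tree. A self-contained sketch of that guard pattern; the struct fields mirror the diff, but the method names and the 65_000 limit are illustrative assumptions:

```rust
use std::rc::Rc;
use std::sync::atomic::{AtomicU32, Ordering};

// Every (sub-)aggregation shares one counter; exceeding the configured
// maximum fails the request instead of exhausting memory.
struct BucketCount {
    bucket_count: Rc<AtomicU32>,
    max_bucket_count: u32,
}

impl BucketCount {
    fn add_count(&self, count: u32) {
        self.bucket_count.fetch_add(count, Ordering::Relaxed);
    }

    fn validate(&self) -> Result<(), String> {
        if self.bucket_count.load(Ordering::Relaxed) > self.max_bucket_count {
            return Err("aggregation created too many buckets".to_string());
        }
        Ok(())
    }
}

fn main() {
    let root = BucketCount {
        bucket_count: Rc::new(AtomicU32::new(0)),
        max_bucket_count: 65_000,
    };
    // A sub-aggregation clones the Rc, so all levels bump the same counter.
    let sub = BucketCount {
        bucket_count: Rc::clone(&root.bucket_count),
        max_bucket_count: root.max_bucket_count,
    };
    sub.add_count(40_000);
    root.add_count(30_000);
    assert!(root.validate().is_err()); // 70_000 > 65_000
}
```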
