diff --git a/benches/index-bench.rs b/benches/index-bench.rs index 350305b1ba..1ad5017473 100644 --- a/benches/index-bench.rs +++ b/benches/index-bench.rs @@ -21,6 +21,11 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) { schema_builder.add_text_field("severity", STRING | STORED); schema_builder.build() }; + let dynamic_schema = { + let mut schema_builder = tantivy::schema::SchemaBuilder::new(); + schema_builder.add_json_field("json", TEXT); + schema_builder.build() + }; let mut group = c.benchmark_group("index-hdfs"); group.sample_size(20); @@ -74,6 +79,38 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) { index_writer.commit().unwrap(); }) }); + group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| { + b.iter(|| { + let index = Index::create_in_ram(dynamic_schema.clone()); + let json_field = dynamic_schema.get_field("json").unwrap(); + let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap(); + for _ in 0..NUM_REPEATS { + for doc_json in HDFS_LOGS.trim().split("\n") { + let json_val: serde_json::Map<String, serde_json::Value> = + serde_json::from_str(doc_json).unwrap(); + let doc = tantivy::doc!(json_field=>json_val); + index_writer.add_document(doc).unwrap(); + } + } + index_writer.commit().unwrap(); + }) + }); + group.bench_function("index-hdfs-with-commit-json-without-docstore", |b| { + b.iter(|| { + let index = Index::create_in_ram(dynamic_schema.clone()); + let json_field = dynamic_schema.get_field("json").unwrap(); + let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap(); + for _ in 0..NUM_REPEATS { + for doc_json in HDFS_LOGS.trim().split("\n") { + let json_val: serde_json::Map<String, serde_json::Value> = + serde_json::from_str(doc_json).unwrap(); + let doc = tantivy::doc!(json_field=>json_val); + index_writer.add_document(doc).unwrap(); + } + } + index_writer.commit().unwrap(); + }) + }); } criterion_group! { diff --git a/examples/json_field.rs b/examples/json_field.rs new file mode 100644 index 0000000000..1649e4e800 --- /dev/null +++ b/examples/json_field.rs @@ -0,0 +1,61 @@ +// # Json field example +// +// This example shows how the json field can be used +// to make tantivy partially schemaless.
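The example below works because the JSON field flattens nested objects into dotted paths at indexing time, which is what makes queries such as cart.product_id:103 possible. As a rough mental model only (this is not tantivy's internal code, and the flatten helper is made up for illustration), the flattening can be sketched with plain serde_json values:

use serde_json::{json, Value};

// Recursively collect (dotted_path, leaf_value) pairs from a JSON object.
// Arrays keep the prefix of their parent, mirroring how the JSON field
// indexes array elements under the same path.
fn flatten(prefix: &str, value: &Value, out: &mut Vec<(String, Value)>) {
    match value {
        Value::Object(map) => {
            for (key, child) in map {
                let path = if prefix.is_empty() {
                    key.clone()
                } else {
                    format!("{prefix}.{key}")
                };
                flatten(&path, child, out);
            }
        }
        Value::Array(items) => {
            for item in items {
                flatten(prefix, item, out);
            }
        }
        leaf => out.push((prefix.to_string(), leaf.clone())),
    }
}

fn main() {
    let attributes = json!({
        "target": "submit-button",
        "cart": {"product_id": 103},
        "description": "the best"
    });
    let mut paths = Vec::new();
    flatten("", &attributes, &mut paths);
    // Prints: target => "submit-button", cart.product_id => 103, description => "the best"
    for (path, value) in paths {
        println!("{path} => {value}");
    }
}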
+ +use tantivy::collector::{Count, TopDocs}; +use tantivy::query::QueryParser; +use tantivy::schema::*; +use tantivy::{doc, Index, ReloadPolicy}; + +fn main() -> tantivy::Result<()> { + // # Defining the schema + // + // We need two fields: + // - a timestamp + // - a json object field + let mut schema_builder = Schema::builder(); + schema_builder.add_date_field("timestamp", FAST | STORED); + let event_type = schema_builder.add_text_field("event_type", STRING | STORED); + let attributes = schema_builder.add_json_field("attributes", STORED | TEXT); + let schema = schema_builder.build(); + + // # Indexing documents + let index = Index::create_in_ram(schema.clone()); + + let mut index_writer = index.writer(50_000_000)?; + let doc = schema.parse_document( + r#"{ + "timestamp": "2022-02-22T23:20:50.53Z", + "event_type": "click", + "attributes": { + "target": "submit-button", + "cart": {"product_id": 103}, + "description": "the best" + } + }"#, + )?; + index_writer.add_document(doc)?; + index_writer.commit()?; + + let reader = index.reader()?; + let searcher = reader.searcher(); + + let query_parser = QueryParser::for_index(&index, vec![event_type, attributes]); + { + let query = query_parser.parse_query("target:submit-button")?; + let count_docs = searcher.search(&*query, &Count)?; + assert_eq!(count_docs, 1); + } + { + let query = query_parser.parse_query("cart.product_id:103")?; + let count_docs = searcher.search(&*query, &Count)?; + assert_eq!(count_docs, 1); + } + { + let query = query_parser.parse_query("cart.product_id:103")?; + let count_docs = searcher.search(&*query, &Count)?; + assert_eq!(count_docs, 1); + } + Ok(()) +} diff --git a/src/aggregation/mod.rs b/src/aggregation/mod.rs index 493c0e2e80..bb8ceffa31 100644 --- a/src/aggregation/mod.rs +++ b/src/aggregation/mod.rs @@ -243,7 +243,7 @@ pub(crate) fn f64_from_fastfield_u64(val: u64, field_type: &Type) -> f64 { Type::U64 => val as f64, Type::I64 => i64::from_u64(val) as f64, Type::F64 => f64::from_u64(val), - Type::Date | Type::Str | Type::Facet | Type::Bytes => unimplemented!(), + Type::Date | Type::Str | Type::Facet | Type::Bytes | Type::Json => unimplemented!(), } } @@ -262,7 +262,7 @@ pub(crate) fn f64_to_fastfield_u64(val: f64, field_type: &Type) -> u64 { Type::U64 => val as u64, Type::I64 => (val as i64).to_u64(), Type::F64 => val.to_u64(), - Type::Date | Type::Str | Type::Facet | Type::Bytes => unimplemented!(), + Type::Date | Type::Str | Type::Facet | Type::Bytes | Type::Json => unimplemented!(), } } diff --git a/src/indexer/json_term_writer.rs b/src/indexer/json_term_writer.rs new file mode 100644 index 0000000000..ffb7cbe9e9 --- /dev/null +++ b/src/indexer/json_term_writer.rs @@ -0,0 +1,444 @@ +// Copyright (C) 2021 Quickwit, Inc. +// +// Quickwit is offered under the AGPL v3.0 and as commercial software. +// For commercial licensing, contact us at hello@quickwit.io. +// +// AGPL: +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . 
+// + +use chrono::Utc; +use fnv::FnvHashMap; +use murmurhash32::murmurhash2; + +use crate::fastfield::FastValue; +use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter}; +use crate::schema::term::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP}; +use crate::schema::Type; +use crate::tokenizer::TextAnalyzer; +use crate::{DocId, Term}; + +/// This object is a map storing the last position for a given path for the current document +/// being indexed. +/// +/// It is key to solving the following problem: +/// If we index a JsonObject emitting several terms with the same path, +/// we do not want to create false positives in phrase queries. +/// +/// For instance: +/// +/// ```json +/// {"bands": [ +/// {"band_name": "Elliot Smith"}, +/// {"band_name": "The Who"}, +/// ]} +/// ``` +/// +/// If we are careless and index each band name independently, +/// `Elliot` and `The` will end up indexed at position 0, and `Smith` and `Who` will be indexed at +/// position 1. +/// As a result, with lemmatization, "The Smiths" will match our object. +/// +/// Worse, if the same term appears in the second object, a non-increasing value would be pushed +/// to the position recorder, probably provoking a panic. +/// +/// This problem is solved for regular multivalued objects by offsetting the positions +/// of values with a position gap. Here we would like `The` and `Who` to get indexed at +/// positions 2 and 3 respectively. +/// +/// With regular fields, we sort the fields beforehand, so that all terms with the same +/// path are indexed consecutively. +/// +/// In a JSON object, we do not have this comfort, so we need to record these position offsets in +/// a map. +/// +/// Note that using a single position for the entire object would not hurt correctness. +/// It would, however, hurt compression. +/// +/// We can therefore afford to work with a map that is imperfect. It is fine if several +/// paths map to the same position as long as the probability is relatively low.
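A toy, self-contained sketch of the mechanism described above. The HashMap keyed by the path string and the function name are illustrative only; the real structure below hashes the full term bytes and stores an IndexingPosition per entry:

use std::collections::HashMap;

// Next free position for each JSON path seen in the current document.
fn index_tokens(
    path: &str,
    tokens: &[&str],
    next_position: &mut HashMap<String, u32>,
) -> Vec<(String, u32)> {
    let start = next_position.get(path).copied().unwrap_or(0);
    let indexed: Vec<(String, u32)> = tokens
        .iter()
        .enumerate()
        .map(|(i, tok)| (format!("{path}:{tok}"), start + i as u32))
        .collect();
    // Remember where the next value for this path must start.
    // (tantivy additionally inserts a position gap between values.)
    next_position.insert(path.to_string(), start + tokens.len() as u32);
    indexed
}

fn main() {
    let mut next_position = HashMap::new();
    // Both objects of the `bands` array emit terms under the same path.
    let first = index_tokens("bands.band_name", &["elliot", "smith"], &mut next_position);
    let second = index_tokens("bands.band_name", &["the", "who"], &mut next_position);
    assert_eq!(first[1].1, 1); // "smith" at position 1
    assert_eq!(second[0].1, 2); // "the" at position 2, not 0
    assert_eq!(second[1].1, 3); // "who" at position 3
    println!("{first:?} {second:?}");
}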
+struct IndexingPositionsPerPath { + positions_per_path: FnvHashMap, +} + +impl Default for IndexingPositionsPerPath { + fn default() -> Self { + IndexingPositionsPerPath { + positions_per_path: Default::default(), + } + } +} + +impl IndexingPositionsPerPath { + fn get_position(&mut self, term: &Term) -> &mut IndexingPosition { + self.positions_per_path + .entry(murmurhash2(term.as_slice())) + .or_insert_with(Default::default) + } +} + +pub(crate) fn index_json_values<'a>( + doc: DocId, + json_values: impl Iterator>>, + text_analyzer: &TextAnalyzer, + term_buffer: &mut Term, + postings_writer: &mut dyn PostingsWriter, + ctx: &mut IndexingContext, +) -> crate::Result<()> { + let mut json_term_writer = JsonTermWriter::wrap(term_buffer); + let mut positions_per_path: IndexingPositionsPerPath = Default::default(); + for json_value_res in json_values { + let json_value = json_value_res?; + index_json_object( + doc, + json_value, + text_analyzer, + &mut json_term_writer, + postings_writer, + ctx, + &mut positions_per_path, + ); + } + Ok(()) +} + +fn index_json_object<'a>( + doc: DocId, + json_value: &serde_json::Map, + text_analyzer: &TextAnalyzer, + json_term_writer: &mut JsonTermWriter<'a>, + postings_writer: &mut dyn PostingsWriter, + ctx: &mut IndexingContext, + positions_per_path: &mut IndexingPositionsPerPath, +) { + for (json_path_segment, json_value) in json_value { + json_term_writer.push_path_segment(json_path_segment); + index_json_value( + doc, + json_value, + text_analyzer, + json_term_writer, + postings_writer, + ctx, + positions_per_path, + ); + json_term_writer.pop_path_segment(); + } +} + +fn index_json_value<'a>( + doc: DocId, + json_value: &serde_json::Value, + text_analyzer: &TextAnalyzer, + json_term_writer: &mut JsonTermWriter<'a>, + postings_writer: &mut dyn PostingsWriter, + ctx: &mut IndexingContext, + positions_per_path: &mut IndexingPositionsPerPath, +) { + match json_value { + serde_json::Value::Null => {} + serde_json::Value::Bool(val_bool) => { + let bool_u64 = if *val_bool { 1u64 } else { 0u64 }; + json_term_writer.set_fast_value(bool_u64); + postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx); + } + serde_json::Value::Number(number) => { + if let Some(number_u64) = number.as_u64() { + json_term_writer.set_fast_value(number_u64); + } else if let Some(number_i64) = number.as_i64() { + json_term_writer.set_fast_value(number_i64); + } else if let Some(number_f64) = number.as_f64() { + json_term_writer.set_fast_value(number_f64); + } + postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx); + } + serde_json::Value::String(text) => match infer_type_from_str(text) { + TextOrDateTime::Text(text) => { + let mut token_stream = text_analyzer.token_stream(text); + // TODO make sure the chain position works out. 
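In the statements that follow, the writer first closes the current path (appending the end-of-path byte and the Str type code), then looks up the per-path IndexingPosition, so that if the same path occurs again later in the document its tokens continue from a strictly larger position; index_text then appends each token's bytes after that fixed prefix.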
+ json_term_writer.close_path_and_set_type(Type::Str); + let indexing_position = positions_per_path.get_position(json_term_writer.term()); + postings_writer.index_text( + doc, + &mut *token_stream, + json_term_writer.term_buffer, + ctx, + indexing_position, + ); + } + TextOrDateTime::DateTime(dt) => { + json_term_writer.set_fast_value(dt); + postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx); + } + }, + serde_json::Value::Array(arr) => { + for val in arr { + index_json_value( + doc, + val, + text_analyzer, + json_term_writer, + postings_writer, + ctx, + positions_per_path, + ); + } + } + serde_json::Value::Object(map) => { + index_json_object( + doc, + map, + text_analyzer, + json_term_writer, + postings_writer, + ctx, + positions_per_path, + ); + } + } +} + +enum TextOrDateTime<'a> { + Text(&'a str), + DateTime(crate::DateTime), +} + +fn infer_type_from_str(text: &str) -> TextOrDateTime { + match chrono::DateTime::parse_from_rfc3339(text) { + Ok(dt) => { + let dt_utc = dt.with_timezone(&Utc); + TextOrDateTime::DateTime(dt_utc) + } + Err(_) => TextOrDateTime::Text(text), + } +} + +pub struct JsonTermWriter<'a> { + term_buffer: &'a mut Term, + path_stack: Vec, +} + +impl<'a> JsonTermWriter<'a> { + pub fn wrap(term_buffer: &'a mut Term) -> Self { + let buffer = term_buffer.as_mut(); + buffer.resize(5, 0); + buffer[4] = Type::Json.to_code(); + let mut path_stack = Vec::with_capacity(10); + path_stack.push(5); + Self { + term_buffer, + path_stack, + } + } + + fn trim_to_end_of_path(&mut self) { + let end_of_path = *self.path_stack.last().unwrap(); + self.term_buffer.as_mut().resize(end_of_path, 0u8); + } + + pub fn close_path_and_set_type(&mut self, typ: Type) { + self.trim_to_end_of_path(); + let buffer = self.term_buffer.as_mut(); + let buffer_len = buffer.len(); + buffer[buffer_len - 1] = JSON_END_OF_PATH; + buffer.push(typ.to_code()); + } + + pub fn push_path_segment(&mut self, segment: &str) { + // the path stack should never be empty. + self.trim_to_end_of_path(); + let buffer = self.term_buffer.as_mut(); + let buffer_len = buffer.len(); + if self.path_stack.len() > 1 { + buffer[buffer_len - 1] = JSON_PATH_SEGMENT_SEP; + } + buffer.extend(segment.as_bytes()); + buffer.push(JSON_PATH_SEGMENT_SEP); + self.path_stack.push(buffer.len()); + } + + pub fn pop_path_segment(&mut self) { + self.path_stack.pop(); + assert!(!self.path_stack.is_empty()); + self.trim_to_end_of_path(); + } + + /// Returns the json path of the term being currently built. 
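For reference, the byte layout these methods build up, and which the unit tests further down assert verbatim, is: a 4-byte big-endian field id, the j code for Type::Json, the path segments separated by JSON_PATH_SEGMENT_SEP (\x01), a JSON_END_OF_PATH byte (\x00), a one-byte code for the value type, and finally the value bytes. For field 1, path attributes.color and string value "red" this gives:

\x00\x00\x00\x01   field id (big endian)
j                  Type::Json
attributes         first path segment
\x01               JSON_PATH_SEGMENT_SEP
color              second path segment
\x00               JSON_END_OF_PATH
s                  Type::Str (value type)
red                value bytes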
+ #[cfg(test)] + pub(crate) fn path(&self) -> &[u8] { + let end_of_path = self.path_stack.last().cloned().unwrap_or(6); + &self.term().as_slice()[5..end_of_path - 1] + } + + pub fn set_fast_value(&mut self, val: T) { + self.close_path_and_set_type(T::to_type()); + self.term_buffer + .as_mut() + .extend_from_slice(val.to_u64().to_be_bytes().as_slice()); + } + + #[cfg(test)] + pub(crate) fn set_str(&mut self, text: &str) { + self.close_path_and_set_type(Type::Str); + self.term_buffer.as_mut().extend_from_slice(text.as_bytes()); + } + + pub fn term(&self) -> &Term { + self.term_buffer + } +} + +#[cfg(test)] +mod tests { + use super::JsonTermWriter; + use crate::schema::{Field, Type}; + use crate::Term; + + #[test] + fn test_json_writer() { + let field = Field::from_field_id(1); + let mut term = Term::new(); + term.set_field(Type::Json, field); + let mut json_writer = JsonTermWriter::wrap(&mut term); + json_writer.push_path_segment("attributes"); + json_writer.push_path_segment("color"); + json_writer.set_str("red"); + assert_eq!( + format!("{:?}", json_writer.term()), + "Term(type=Json, field=1, path=attributes.color, vtype=Str, \"red\")" + ); + json_writer.set_str("blue"); + assert_eq!( + format!("{:?}", json_writer.term()), + "Term(type=Json, field=1, path=attributes.color, vtype=Str, \"blue\")" + ); + json_writer.pop_path_segment(); + json_writer.push_path_segment("dimensions"); + json_writer.push_path_segment("width"); + json_writer.set_fast_value(400i64); + assert_eq!( + format!("{:?}", json_writer.term()), + "Term(type=Json, field=1, path=attributes.dimensions.width, vtype=I64, 400)" + ); + json_writer.pop_path_segment(); + json_writer.push_path_segment("height"); + json_writer.set_fast_value(300i64); + assert_eq!( + format!("{:?}", json_writer.term()), + "Term(type=Json, field=1, path=attributes.dimensions.height, vtype=I64, 300)" + ); + } + + #[test] + fn test_string_term() { + let field = Field::from_field_id(1); + let mut term = Term::new(); + term.set_field(Type::Json, field); + let mut json_writer = JsonTermWriter::wrap(&mut term); + json_writer.push_path_segment("color"); + json_writer.set_str("red"); + assert_eq!( + json_writer.term().as_slice(), + b"\x00\x00\x00\x01jcolor\x00sred" + ) + } + + #[test] + fn test_i64_term() { + let field = Field::from_field_id(1); + let mut term = Term::new(); + term.set_field(Type::Json, field); + let mut json_writer = JsonTermWriter::wrap(&mut term); + json_writer.push_path_segment("color"); + json_writer.set_fast_value(-4i64); + assert_eq!( + json_writer.term().as_slice(), + b"\x00\x00\x00\x01jcolor\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc" + ) + } + + #[test] + fn test_u64_term() { + let field = Field::from_field_id(1); + let mut term = Term::new(); + term.set_field(Type::Json, field); + let mut json_writer = JsonTermWriter::wrap(&mut term); + json_writer.push_path_segment("color"); + json_writer.set_fast_value(4u64); + assert_eq!( + json_writer.term().as_slice(), + b"\x00\x00\x00\x01jcolor\x00u\x00\x00\x00\x00\x00\x00\x00\x04" + ) + } + + #[test] + fn test_f64_term() { + let field = Field::from_field_id(1); + let mut term = Term::new(); + term.set_field(Type::Json, field); + let mut json_writer = JsonTermWriter::wrap(&mut term); + json_writer.push_path_segment("color"); + json_writer.set_fast_value(4.0f64); + assert_eq!( + json_writer.term().as_slice(), + b"\x00\x00\x00\x01jcolor\x00f\xc0\x10\x00\x00\x00\x00\x00\x00" + ) + } + + #[test] + fn test_push_after_set_path_segment() { + let field = Field::from_field_id(1); + let mut term = Term::new(); 
+ term.set_field(Type::Json, field); + let mut json_writer = JsonTermWriter::wrap(&mut term); + json_writer.push_path_segment("attribute"); + json_writer.set_str("something"); + json_writer.push_path_segment("color"); + json_writer.set_str("red"); + assert_eq!( + json_writer.term().as_slice(), + b"\x00\x00\x00\x01jattribute\x01color\x00sred" + ) + } + + #[test] + fn test_pop_segment() { + let field = Field::from_field_id(1); + let mut term = Term::new(); + term.set_field(Type::Json, field); + let mut json_writer = JsonTermWriter::wrap(&mut term); + json_writer.push_path_segment("color"); + json_writer.push_path_segment("hue"); + json_writer.pop_path_segment(); + json_writer.set_str("red"); + assert_eq!( + json_writer.term().as_slice(), + b"\x00\x00\x00\x01jcolor\x00sred" + ) + } + + #[test] + fn test_json_writer_path() { + let field = Field::from_field_id(1); + let mut term = Term::new(); + term.set_field(Type::Json, field); + let mut json_writer = JsonTermWriter::wrap(&mut term); + json_writer.push_path_segment("color"); + assert_eq!(json_writer.path(), b"color"); + json_writer.push_path_segment("hue"); + assert_eq!(json_writer.path(), b"color\x01hue"); + json_writer.set_str("pink"); + assert_eq!(json_writer.path(), b"color\x01hue"); + } +} diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 34b7b7daa1..d8a1ea9b82 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -307,16 +307,16 @@ impl IndexMerger { } None => {} }, - FieldType::Str(_) => { - // We don't handle str fast field for the moment - // They can be implemented using what is done - // for facets in the future. - } FieldType::Bytes(byte_options) => { if byte_options.is_fast() { self.write_bytes_fast_field(field, fast_field_serializer, doc_id_mapping)?; } } + FieldType::Str(_) | FieldType::JsonObject(_) => { + // We don't handle json / string fast field for the moment + // They can be implemented using what is done + // for facets in the future + } } } Ok(()) diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index d8f9b0568c..b0634d2bac 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -5,6 +5,7 @@ pub mod doc_id_mapping; mod doc_opstamp_mapping; pub mod index_writer; mod index_writer_status; +mod json_term_writer; mod log_merge_policy; mod merge_operation; pub mod merge_policy; @@ -24,6 +25,7 @@ use crossbeam::channel; use smallvec::SmallVec; pub use self::index_writer::IndexWriter; +pub(crate) use self::json_term_writer::JsonTermWriter; pub use self::log_merge_policy::LogMergePolicy; pub use self::merge_operation::MergeOperation; pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy}; diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index cbd2241fb2..009bce4747 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -3,12 +3,13 @@ use super::operation::AddOperation; use crate::core::Segment; use crate::fastfield::FastFieldsWriter; use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter}; +use crate::indexer::json_term_writer::index_json_values; use crate::indexer::segment_serializer::SegmentSerializer; use crate::postings::{ compute_table_size, serialize_postings, IndexingContext, IndexingPosition, PerFieldPostingsWriter, PostingsWriter, }; -use crate::schema::{Field, FieldEntry, FieldType, FieldValue, Schema, Term, Type, Value}; +use crate::schema::{FieldEntry, FieldType, FieldValue, Schema, Term, Value}; use crate::store::{StoreReader, StoreWriter}; use crate::tokenizer::{ BoxTokenStream, FacetTokenizer, 
PreTokenizedStream, TextAnalyzer, Tokenizer, @@ -61,7 +62,7 @@ pub struct SegmentWriter { pub(crate) fast_field_writers: FastFieldsWriter, pub(crate) fieldnorms_writer: FieldNormsWriter, pub(crate) doc_opstamps: Vec, - tokenizers: Vec>, + per_field_text_analyzers: Vec, term_buffer: Term, schema: Schema, } @@ -85,19 +86,23 @@ impl SegmentWriter { let table_size = compute_initial_table_size(memory_budget_in_bytes)?; let segment_serializer = SegmentSerializer::for_segment(segment, false)?; let per_field_postings_writers = PerFieldPostingsWriter::for_schema(&schema); - let tokenizers = schema + let per_field_text_analyzers = schema .fields() - .map( - |(_, field_entry): (Field, &FieldEntry)| match field_entry.field_type() { - FieldType::Str(ref text_options) => text_options - .get_indexing_options() - .and_then(|text_index_option| { - let tokenizer_name = &text_index_option.tokenizer(); - tokenizer_manager.get(tokenizer_name) - }), + .map(|(_, field_entry): (_, &FieldEntry)| { + let text_options = match field_entry.field_type() { + FieldType::Str(ref text_options) => text_options.get_indexing_options(), + FieldType::JsonObject(ref json_object_options) => { + json_object_options.get_text_indexing_options() + } _ => None, - }, - ) + }; + text_options + .and_then(|text_index_option| { + let tokenizer_name = &text_index_option.tokenizer(); + tokenizer_manager.get(tokenizer_name) + }) + .unwrap_or_default() + }) .collect(); Ok(SegmentWriter { max_doc: 0, @@ -107,7 +112,7 @@ impl SegmentWriter { segment_serializer, fast_field_writers: FastFieldsWriter::from_schema(&schema), doc_opstamps: Vec::with_capacity(1_000), - tokenizers, + per_field_text_analyzers, term_buffer: Term::new(), schema, }) @@ -165,9 +170,9 @@ impl SegmentWriter { let (term_buffer, ctx) = (&mut self.term_buffer, &mut self.ctx); let postings_writer: &mut dyn PostingsWriter = self.per_field_postings_writers.get_for_field_mut(field); + term_buffer.set_field(field_entry.field_type().value_type(), field); match *field_entry.field_type() { FieldType::Facet(_) => { - term_buffer.set_field(Type::Facet, field); for value in values { let facet = value.as_facet().ok_or_else(make_schema_error)?; let facet_str = facet.encoded_str(); @@ -205,13 +210,11 @@ impl SegmentWriter { .push(PreTokenizedStream::from(tok_str.clone()).into()); } Value::Str(ref text) => { - if let Some(ref mut tokenizer) = - self.tokenizers[field.field_id() as usize] - { - offsets.push(total_offset); - total_offset += text.len(); - token_streams.push(tokenizer.token_stream(text)); - } + let text_analyzer = + &self.per_field_text_analyzers[field.field_id() as usize]; + offsets.push(total_offset); + total_offset += text.len(); + token_streams.push(text_analyzer.token_stream(text)); } _ => (), } @@ -219,9 +222,9 @@ impl SegmentWriter { let mut indexing_position = IndexingPosition::default(); for mut token_stream in token_streams { + assert_eq!(term_buffer.as_mut().len(), 5); postings_writer.index_text( doc_id, - field, &mut *token_stream, term_buffer, ctx, @@ -233,7 +236,6 @@ impl SegmentWriter { } FieldType::U64(_) => { for value in values { - term_buffer.set_field(Type::U64, field); let u64_val = value.as_u64().ok_or_else(make_schema_error)?; term_buffer.set_u64(u64_val); postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx); @@ -241,7 +243,6 @@ impl SegmentWriter { } FieldType::Date(_) => { for value in values { - term_buffer.set_field(Type::Date, field); let date_val = value.as_date().ok_or_else(make_schema_error)?; term_buffer.set_i64(date_val.timestamp()); 
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx); @@ -249,7 +250,6 @@ impl SegmentWriter { } FieldType::I64(_) => { for value in values { - term_buffer.set_field(Type::I64, field); let i64_val = value.as_i64().ok_or_else(make_schema_error)?; term_buffer.set_i64(i64_val); postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx); @@ -257,7 +257,6 @@ impl SegmentWriter { } FieldType::F64(_) => { for value in values { - term_buffer.set_field(Type::F64, field); let f64_val = value.as_f64().ok_or_else(make_schema_error)?; term_buffer.set_f64(f64_val); postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx); @@ -265,12 +264,36 @@ impl SegmentWriter { } FieldType::Bytes(_) => { for value in values { - term_buffer.set_field(Type::Bytes, field); let bytes = value.as_bytes().ok_or_else(make_schema_error)?; term_buffer.set_bytes(bytes); postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx); } } + FieldType::JsonObject(_) => { + let text_analyzer = &self.per_field_text_analyzers[field.field_id() as usize]; + let json_values_it = values + .iter() + .map(|value| value.as_json().ok_or_else(make_schema_error)); + index_json_values( + doc_id, + json_values_it, + text_analyzer, + term_buffer, + postings_writer, + ctx, + )?; + // for value in values { + // let json = value.as_json().ok_or_else(make_schema_error)?; + // index_json( + // doc_id, + // json, + // text_analyzer, + // term_buffer, + // postings_writer, + // ctx, + // ); + // } + } } } Ok(()) @@ -398,10 +421,16 @@ pub fn prepare_doc_for_store(doc: Document, schema: &Schema) -> Document { #[cfg(test)] mod tests { + use chrono::Utc; + use super::compute_initial_table_size; - use crate::schema::{Schema, STORED, TEXT}; + use crate::collector::Count; + use crate::indexer::json_term_writer::JsonTermWriter; + use crate::postings::TermInfo; + use crate::query::PhraseQuery; + use crate::schema::{IndexRecordOption, Schema, Type, STORED, STRING, TEXT}; use crate::tokenizer::{PreTokenizedString, Token}; - use crate::Document; + use crate::{DocAddress, DocSet, Document, Index, Postings, Term, TERMINATED}; #[test] fn test_hashmap_size() { @@ -440,4 +469,247 @@ mod tests { Some("title") ); } + + #[test] + fn test_json_indexing() { + let mut schema_builder = Schema::builder(); + let json_field = schema_builder.add_json_field("json", STORED | TEXT); + let schema = schema_builder.build(); + let json_val: serde_json::Map = serde_json::from_str( + r#"{ + "toto": "titi", + "float": -0.2, + "unsigned": 1, + "signed": -2, + "complexobject": { + "field.with.dot": 1 + }, + "date": "1985-04-12T23:20:50.52Z", + "my_arr": [2, 3, {"my_key": "two tokens"}, 4] + }"#, + ) + .unwrap(); + let doc = doc!(json_field=>json_val.clone()); + let index = Index::create_in_ram(schema.clone()); + let mut writer = index.writer_for_tests().unwrap(); + writer.add_document(doc).unwrap(); + writer.commit().unwrap(); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + let doc = searcher + .doc(DocAddress { + segment_ord: 0u32, + doc_id: 0u32, + }) + .unwrap(); + let serdeser_json_val = serde_json::from_str::>( + &schema.to_json(&doc), + ) + .unwrap() + .get("json") + .unwrap()[0] + .as_object() + .unwrap() + .clone(); + assert_eq!(json_val, serdeser_json_val); + let segment_reader = searcher.segment_reader(0u32); + let inv_idx = segment_reader.inverted_index(json_field).unwrap(); + let term_dict = inv_idx.terms(); + + let mut term = Term::new(); + term.set_field(Type::Json, json_field); + let mut term_stream = term_dict.stream().unwrap(); + + let mut 
json_term_writer = JsonTermWriter::wrap(&mut term); + json_term_writer.push_path_segment("complexobject"); + json_term_writer.push_path_segment("field.with.dot"); + json_term_writer.set_fast_value(1u64); + assert!(term_stream.advance()); + assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + + json_term_writer.pop_path_segment(); + json_term_writer.pop_path_segment(); + json_term_writer.push_path_segment("date"); + json_term_writer.set_fast_value( + chrono::DateTime::parse_from_rfc3339("1985-04-12T23:20:50.52Z") + .unwrap() + .with_timezone(&Utc), + ); + assert!(term_stream.advance()); + assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + + json_term_writer.pop_path_segment(); + json_term_writer.push_path_segment("float"); + json_term_writer.set_fast_value(-0.2f64); + assert!(term_stream.advance()); + assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + + json_term_writer.pop_path_segment(); + json_term_writer.push_path_segment("my_arr"); + json_term_writer.set_fast_value(2u64); + assert!(term_stream.advance()); + assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + + json_term_writer.set_fast_value(3u64); + assert!(term_stream.advance()); + assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + + json_term_writer.set_fast_value(4u64); + assert!(term_stream.advance()); + assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + + json_term_writer.push_path_segment("my_key"); + json_term_writer.set_str("tokens"); + assert!(term_stream.advance()); + assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + + json_term_writer.set_str("two"); + assert!(term_stream.advance()); + assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + + json_term_writer.pop_path_segment(); + json_term_writer.pop_path_segment(); + json_term_writer.push_path_segment("signed"); + json_term_writer.set_fast_value(-2i64); + assert!(term_stream.advance()); + assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + + json_term_writer.pop_path_segment(); + json_term_writer.push_path_segment("toto"); + json_term_writer.set_str("titi"); + assert!(term_stream.advance()); + assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + + json_term_writer.pop_path_segment(); + json_term_writer.push_path_segment("unsigned"); + json_term_writer.set_fast_value(1u64); + assert!(term_stream.advance()); + assert_eq!(term_stream.key(), json_term_writer.term().value_bytes()); + assert!(!term_stream.advance()); + } + + #[test] + fn test_json_tokenized_with_position() { + let mut schema_builder = Schema::builder(); + let json_field = schema_builder.add_json_field("json", STORED | TEXT); + let schema = schema_builder.build(); + let mut doc = Document::default(); + let json_val: serde_json::Map = + serde_json::from_str(r#"{"mykey": "repeated token token"}"#).unwrap(); + doc.add_json_object(json_field, json_val.clone()); + let index = Index::create_in_ram(schema.clone()); + let mut writer = index.writer_for_tests().unwrap(); + writer.add_document(doc).unwrap(); + writer.commit().unwrap(); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + let segment_reader = searcher.segment_reader(0u32); + let inv_index = segment_reader.inverted_index(json_field).unwrap(); + let mut term = Term::new(); + term.set_field(Type::Json, json_field); + let mut json_term_writer = JsonTermWriter::wrap(&mut term); + json_term_writer.push_path_segment("mykey"); + 
json_term_writer.set_str("token"); + let term_info = inv_index + .get_term_info(json_term_writer.term()) + .unwrap() + .unwrap(); + assert_eq!( + term_info, + TermInfo { + doc_freq: 1, + postings_range: 2..4, + positions_range: 2..5 + } + ); + let mut postings = inv_index + .read_postings(&term, IndexRecordOption::WithFreqsAndPositions) + .unwrap() + .unwrap(); + assert_eq!(postings.doc(), 0); + assert_eq!(postings.term_freq(), 2); + let mut positions = Vec::new(); + postings.positions(&mut positions); + assert_eq!(&positions[..], &[1, 2]); + assert_eq!(postings.advance(), TERMINATED); + } + + #[test] + fn test_json_raw_no_position() { + let mut schema_builder = Schema::builder(); + let json_field = schema_builder.add_json_field("json", STRING); + let schema = schema_builder.build(); + let json_val: serde_json::Map = + serde_json::from_str(r#"{"mykey": "two tokens"}"#).unwrap(); + let doc = doc!(json_field=>json_val); + let index = Index::create_in_ram(schema.clone()); + let mut writer = index.writer_for_tests().unwrap(); + writer.add_document(doc).unwrap(); + writer.commit().unwrap(); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + let segment_reader = searcher.segment_reader(0u32); + let inv_index = segment_reader.inverted_index(json_field).unwrap(); + let mut term = Term::new(); + term.set_field(Type::Json, json_field); + let mut json_term_writer = JsonTermWriter::wrap(&mut term); + json_term_writer.push_path_segment("mykey"); + json_term_writer.set_str("two tokens"); + let term_info = inv_index + .get_term_info(json_term_writer.term()) + .unwrap() + .unwrap(); + assert_eq!( + term_info, + TermInfo { + doc_freq: 1, + postings_range: 0..1, + positions_range: 0..0 + } + ); + let mut postings = inv_index + .read_postings(&term, IndexRecordOption::WithFreqs) + .unwrap() + .unwrap(); + assert_eq!(postings.doc(), 0); + assert_eq!(postings.term_freq(), 1); + let mut positions = Vec::new(); + postings.positions(&mut positions); + assert_eq!(postings.advance(), TERMINATED); + } + + #[test] + fn test_position_overlapping_path() { + // This test checks that we do not end up detecting phrase query due + // to several string literal in the same json object being overlapping. 
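Concretely, "hello happy tax payer" is indexed under the path mykey.field at consecutive positions, so the phrase "hello happy" matches; when the second array element is indexed under the very same path, the per-path position map (plus the position gap) makes "nothello" start well after "payer", so the phrase "nothello happy" cannot match even though both strings share the path.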
+ let mut schema_builder = Schema::builder(); + let json_field = schema_builder.add_json_field("json", TEXT); + let schema = schema_builder.build(); + let json_val: serde_json::Map = serde_json::from_str( + r#"{"mykey": [{"field": "hello happy tax payer"}, {"field": "nothello"}]}"#, + ) + .unwrap(); + let doc = doc!(json_field=>json_val); + let index = Index::create_in_ram(schema.clone()); + let mut writer = index.writer_for_tests().unwrap(); + writer.add_document(doc).unwrap(); + writer.commit().unwrap(); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + let mut term = Term::new(); + term.set_field(Type::Json, json_field); + let mut json_term_writer = JsonTermWriter::wrap(&mut term); + json_term_writer.push_path_segment("mykey"); + json_term_writer.push_path_segment("field"); + json_term_writer.set_str("hello"); + let hello_term = json_term_writer.term().clone(); + json_term_writer.set_str("nothello"); + let nothello_term = json_term_writer.term().clone(); + json_term_writer.set_str("happy"); + let happy_term = json_term_writer.term().clone(); + let phrase_query = PhraseQuery::new(vec![hello_term, happy_term.clone()]); + assert_eq!(searcher.search(&phrase_query, &Count).unwrap(), 1); + let phrase_query = PhraseQuery::new(vec![nothello_term, happy_term]); + assert_eq!(searcher.search(&phrase_query, &Count).unwrap(), 0); + } } diff --git a/src/postings/json_postings_writer.rs b/src/postings/json_postings_writer.rs new file mode 100644 index 0000000000..4035944a73 --- /dev/null +++ b/src/postings/json_postings_writer.rs @@ -0,0 +1,96 @@ +use std::io; + +use crate::indexer::doc_id_mapping::DocIdMapping; +use crate::postings::postings_writer::SpecializedPostingsWriter; +use crate::postings::recorder::{BufferLender, NothingRecorder, Recorder}; +use crate::postings::stacker::Addr; +use crate::postings::{ + FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter, UnorderedTermId, +}; +use crate::schema::term::as_json_path_type_value_bytes; +use crate::schema::Type; +use crate::tokenizer::TokenStream; +use crate::{DocId, Term}; + +pub(crate) struct JsonPostingsWriter { + str_posting_writer: SpecializedPostingsWriter, + non_str_posting_writer: SpecializedPostingsWriter, +} + +impl JsonPostingsWriter { + pub(crate) fn new() -> Self { + JsonPostingsWriter { + str_posting_writer: SpecializedPostingsWriter::::new(), + non_str_posting_writer: SpecializedPostingsWriter::::new(), + } + } +} + +impl PostingsWriter for JsonPostingsWriter { + fn subscribe( + &mut self, + doc: crate::DocId, + pos: u32, + term: &crate::Term, + ctx: &mut IndexingContext, + ) -> UnorderedTermId { + self.non_str_posting_writer.subscribe(doc, pos, term, ctx) + } + + fn index_text( + &mut self, + doc_id: DocId, + token_stream: &mut dyn TokenStream, + term_buffer: &mut Term, + ctx: &mut IndexingContext, + indexing_position: &mut IndexingPosition, + ) { + self.str_posting_writer.index_text( + doc_id, + token_stream, + term_buffer, + ctx, + indexing_position, + ); + } + + /// The actual serialization format is handled by the `PostingsSerializer`. + fn serialize( + &self, + term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)], + doc_id_map: Option<&DocIdMapping>, + ctx: &IndexingContext, + serializer: &mut FieldSerializer, + ) -> io::Result<()> { + let mut buffer_lender = BufferLender::default(); + for (term, addr, _) in term_addrs { + // TODO optimization opportunity here. 
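The dispatch below inspects the value-type byte encoded in each JSON term: terms carrying a text value are serialized through the recorder Rec (which, depending on the index record option, tracks frequencies and positions), while numeric, date and boolean fast values only need document ids and therefore go through the NothingRecorder, mirroring the split between index_text and subscribe above.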
+ if let Some((_, typ, _)) = as_json_path_type_value_bytes(term.value_bytes()) { + if typ == Type::Str { + SpecializedPostingsWriter::::serialize_one_term( + term, + *addr, + doc_id_map, + &mut buffer_lender, + ctx, + serializer, + )?; + } else { + SpecializedPostingsWriter::::serialize_one_term( + term, + *addr, + doc_id_map, + &mut buffer_lender, + ctx, + serializer, + )?; + } + } + } + Ok(()) + } + + fn total_num_tokens(&self) -> u64 { + self.str_posting_writer.total_num_tokens() + self.non_str_posting_writer.total_num_tokens() + } +} diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 4df8beee7d..8f4e3078ae 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -7,6 +7,7 @@ pub(crate) use self::block_search::branchless_binary_search; mod block_segment_postings; pub(crate) mod compression; mod indexing_context; +mod json_postings_writer; mod per_field_postings_writer; mod postings; mod postings_writer; diff --git a/src/postings/per_field_postings_writer.rs b/src/postings/per_field_postings_writer.rs index 4c32cb51e8..14341519cc 100644 --- a/src/postings/per_field_postings_writer.rs +++ b/src/postings/per_field_postings_writer.rs @@ -1,3 +1,4 @@ +use crate::postings::json_postings_writer::JsonPostingsWriter; use crate::postings::postings_writer::SpecializedPostingsWriter; use crate::postings::recorder::{NothingRecorder, TermFrequencyRecorder, TfAndPositionRecorder}; use crate::postings::PostingsWriter; @@ -49,5 +50,25 @@ fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box SpecializedPostingsWriter::::new_boxed(), + FieldType::JsonObject(ref json_object_options) => { + if let Some(text_indexing_option) = json_object_options.get_text_indexing_options() { + match text_indexing_option.index_option() { + IndexRecordOption::Basic => { + Box::new(JsonPostingsWriter::::new()) + as Box + } + IndexRecordOption::WithFreqs => { + Box::new(JsonPostingsWriter::::new()) + as Box + } + IndexRecordOption::WithFreqsAndPositions => { + Box::new(JsonPostingsWriter::::new()) + as Box + } + } + } else { + Box::new(JsonPostingsWriter::::new()) + } + } } } diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index f4ca6394b6..39628b36bf 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -13,7 +13,7 @@ use crate::postings::{ FieldSerializer, IndexingContext, InvertedIndexSerializer, PerFieldPostingsWriter, UnorderedTermId, }; -use crate::schema::{Field, FieldType, Schema, Term, Type}; +use crate::schema::{Field, FieldType, Schema, Term}; use crate::termdict::TermOrdinal; use crate::tokenizer::{Token, TokenStream, MAX_TOKEN_LEN}; use crate::DocId; @@ -85,6 +85,7 @@ pub(crate) fn serialize_postings( } FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) | FieldType::Date(_) => {} FieldType::Bytes(_) => {} + FieldType::JsonObject(_) => {} } let postings_writer = per_field_postings_writers.get_for_field(field); @@ -142,13 +143,12 @@ pub(crate) trait PostingsWriter { fn index_text( &mut self, doc_id: DocId, - field: Field, token_stream: &mut dyn TokenStream, term_buffer: &mut Term, ctx: &mut IndexingContext, indexing_position: &mut IndexingPosition, ) { - term_buffer.set_field(Type::Str, field); + let end_of_path_idx = term_buffer.as_mut().len(); let mut num_tokens = 0; let mut end_position = 0; token_stream.process(&mut |token: &Token| { @@ -162,7 +162,10 @@ pub(crate) trait PostingsWriter { ); return; } - term_buffer.set_text(token.text.as_str()); + term_buffer.as_mut().truncate(end_of_path_idx); + term_buffer + 
.as_mut() + .extend_from_slice(token.text.as_bytes()); let start_position = indexing_position.end_position + token.position as u32; end_position = start_position + token.position_length as u32; self.subscribe(doc_id, start_position, term_buffer, ctx); @@ -170,6 +173,7 @@ pub(crate) trait PostingsWriter { }); indexing_position.end_position = end_position + POSITION_GAP; indexing_position.num_tokens += num_tokens; + term_buffer.as_mut().truncate(end_of_path_idx); } fn total_num_tokens(&self) -> u64; @@ -177,27 +181,43 @@ pub(crate) trait PostingsWriter { /// The `SpecializedPostingsWriter` is just here to remove dynamic /// dispatch to the recorder information. -pub(crate) struct SpecializedPostingsWriter { +#[derive(Default)] +pub(crate) struct SpecializedPostingsWriter { total_num_tokens: u64, _recorder_type: PhantomData, } -impl SpecializedPostingsWriter { - /// constructor - pub fn new() -> SpecializedPostingsWriter { - SpecializedPostingsWriter { +impl SpecializedPostingsWriter { + pub fn new() -> Self { + Self { total_num_tokens: 0u64, _recorder_type: PhantomData, } } - /// Builds a `SpecializedPostingsWriter` storing its data in a memory arena. pub fn new_boxed() -> Box { - Box::new(SpecializedPostingsWriter::::new()) + Box::new(Self::new()) + } + + #[inline] + pub(crate) fn serialize_one_term( + term: &Term<&[u8]>, + addr: Addr, + doc_id_map: Option<&DocIdMapping>, + buffer_lender: &mut BufferLender, + ctx: &IndexingContext, + serializer: &mut FieldSerializer, + ) -> io::Result<()> { + let recorder: Rec = ctx.term_index.read(addr); + let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32); + serializer.new_term(term.value_bytes(), term_doc_freq)?; + recorder.serialize(&ctx.arena, doc_id_map, serializer, buffer_lender); + serializer.close_term()?; + Ok(()) } } -impl PostingsWriter for SpecializedPostingsWriter { +impl PostingsWriter for SpecializedPostingsWriter { fn subscribe( &mut self, doc: DocId, @@ -235,11 +255,7 @@ impl PostingsWriter for SpecializedPostingsWriter ) -> io::Result<()> { let mut buffer_lender = BufferLender::default(); for (term, addr, _) in term_addrs { - let recorder: Rec = ctx.term_index.read(*addr); - let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32); - serializer.new_term(term.value_bytes(), term_doc_freq)?; - recorder.serialize(&ctx.arena, doc_id_map, serializer, &mut buffer_lender); - serializer.close_term()?; + Self::serialize_one_term(term, *addr, doc_id_map, &mut buffer_lender, ctx, serializer)?; } Ok(()) } diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 90c37b6542..fbf0be4ecf 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -76,7 +76,7 @@ impl InvertedIndexSerializer { field: Field, total_num_tokens: u64, fieldnorm_reader: Option, - ) -> io::Result> { + ) -> io::Result { let field_entry: &FieldEntry = self.schema.get_field_entry(field); let term_dictionary_write = self.terms_write.for_field(field); let postings_write = self.postings_write.for_field(field); @@ -122,24 +122,21 @@ impl<'a> FieldSerializer<'a> { fieldnorm_reader: Option, ) -> io::Result> { total_num_tokens.serialize(postings_write)?; - let mode = match field_type { - FieldType::Str(ref text_options) => { - if let Some(text_indexing_options) = text_options.get_indexing_options() { - text_indexing_options.index_option() - } else { - IndexRecordOption::Basic - } - } - _ => IndexRecordOption::Basic, - }; + let index_record_option = field_type + .index_record_option() + .unwrap_or(IndexRecordOption::Basic); let 
term_dictionary_builder = TermDictionaryBuilder::create(term_dictionary_write)?; let average_fieldnorm = fieldnorm_reader .as_ref() .map(|ff_reader| (total_num_tokens as Score / ff_reader.num_docs() as Score)) .unwrap_or(0.0); - let postings_serializer = - PostingsSerializer::new(postings_write, average_fieldnorm, mode, fieldnorm_reader); - let positions_serializer_opt = if mode.has_positions() { + let postings_serializer = PostingsSerializer::new( + postings_write, + average_fieldnorm, + index_record_option, + fieldnorm_reader, + ); + let positions_serializer_opt = if index_record_option.has_positions() { Some(PositionSerializer::new(positions_write)) } else { None @@ -203,6 +200,7 @@ impl<'a> FieldSerializer<'a> { self.current_term_info.doc_freq += 1; self.postings_serializer.write_doc(doc_id, term_freq); if let Some(ref mut positions_serializer) = self.positions_serializer_opt.as_mut() { + assert_eq!(term_freq as usize, position_deltas.len()); positions_serializer.write_positions_delta(position_deltas); } } diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 2cfcec7f58..695bac436a 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -4,16 +4,19 @@ use std::num::{ParseFloatError, ParseIntError}; use std::ops::Bound; use std::str::FromStr; -use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf}; +use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral}; use super::logical_ast::*; use crate::core::Index; +use crate::indexer::JsonTermWriter; use crate::query::{ AllQuery, BooleanQuery, BoostQuery, EmptyQuery, Occur, PhraseQuery, Query, RangeQuery, TermQuery, }; -use crate::schema::{Facet, FacetParseError, Field, FieldType, IndexRecordOption, Schema, Term}; -use crate::tokenizer::TokenizerManager; +use crate::schema::{ + Facet, FacetParseError, Field, FieldType, IndexRecordOption, Schema, Term, Type, +}; +use crate::tokenizer::{TextAnalyzer, TokenizerManager}; use crate::Score; /// Possible error that may happen when parsing a query. @@ -29,15 +32,15 @@ pub enum QueryParserError { /// The query contains a term for a `u64` or `i64`-field, but the value /// is neither. #[error("Expected a valid integer: '{0:?}'")] - ExpectedInt(ParseIntError), + ExpectedInt(#[from] ParseIntError), /// The query contains a term for a bytes field, but the value is not valid /// base64. #[error("Expected base64: '{0:?}'")] - ExpectedBase64(base64::DecodeError), + ExpectedBase64(#[from] base64::DecodeError), /// The query contains a term for a `f64`-field, but the value /// is not a f64. #[error("Invalid query: Only excluding terms given")] - ExpectedFloat(ParseFloatError), + ExpectedFloat(#[from] ParseFloatError), /// It is forbidden queries that are only "excluding". (e.g. -title:pop) #[error("Invalid query: Only excluding terms given")] AllButQueryForbidden, @@ -63,34 +66,10 @@ pub enum QueryParserError { RangeMustNotHavePhrase, /// The format for the date field is not RFC 3339 compliant. #[error("The date field has an invalid format")] - DateFormatError(chrono::ParseError), + DateFormatError(#[from] chrono::ParseError), /// The format for the facet field is invalid. 
#[error("The facet field is malformed: {0}")] - FacetFormatError(FacetParseError), -} - -impl From for QueryParserError { - fn from(err: ParseIntError) -> QueryParserError { - QueryParserError::ExpectedInt(err) - } -} - -impl From for QueryParserError { - fn from(err: ParseFloatError) -> QueryParserError { - QueryParserError::ExpectedFloat(err) - } -} - -impl From for QueryParserError { - fn from(err: chrono::ParseError) -> QueryParserError { - QueryParserError::DateFormatError(err) - } -} - -impl From for QueryParserError { - fn from(err: FacetParseError) -> QueryParserError { - QueryParserError::FacetFormatError(err) - } + FacetFormatError(#[from] FacetParseError), } /// Recursively remove empty clause from the AST @@ -284,110 +263,101 @@ impl QueryParser { Ok(ast) } - fn compute_terms_for_string( + fn compute_logical_ast_for_leaf( &self, field: Field, + json_path: &str, phrase: &str, - ) -> Result, QueryParserError> { + ) -> Result, QueryParserError> { let field_entry = self.schema.get_field_entry(field); let field_type = field_entry.field_type(); + let field_name = field_entry.name(); if !field_type.is_indexed() { - let field_name = field_entry.name().to_string(); - return Err(QueryParserError::FieldNotIndexed(field_name)); + return Err(QueryParserError::FieldNotIndexed(field_name.to_string())); } match *field_type { + FieldType::U64(_) => { + let val: u64 = u64::from_str(phrase)?; + let i64_term = Term::from_field_u64(field, val); + Ok(vec![LogicalLiteral::Term(i64_term)]) + } FieldType::I64(_) => { let val: i64 = i64::from_str(phrase)?; - let term = Term::from_field_i64(field, val); - Ok(vec![(0, term)]) + let i64_term = Term::from_field_i64(field, val); + Ok(vec![LogicalLiteral::Term(i64_term)]) } FieldType::F64(_) => { let val: f64 = f64::from_str(phrase)?; - let term = Term::from_field_f64(field, val); - Ok(vec![(0, term)]) + let f64_term = Term::from_field_f64(field, val); + Ok(vec![LogicalLiteral::Term(f64_term)]) } - FieldType::Date(_) => match chrono::DateTime::parse_from_rfc3339(phrase) { - Ok(x) => Ok(vec![( - 0, - Term::from_field_date(field, &x.with_timezone(&chrono::Utc)), - )]), - Err(e) => Err(QueryParserError::DateFormatError(e)), - }, - FieldType::U64(_) => { - let val: u64 = u64::from_str(phrase)?; - let term = Term::from_field_u64(field, val); - Ok(vec![(0, term)]) + FieldType::Date(_) => { + let dt = chrono::DateTime::parse_from_rfc3339(phrase)?; + let dt_term = Term::from_field_date(field, &dt.with_timezone(&chrono::Utc)); + Ok(vec![LogicalLiteral::Term(dt_term)]) } FieldType::Str(ref str_options) => { - if let Some(option) = str_options.get_indexing_options() { - let tokenizer = - self.tokenizer_manager - .get(option.tokenizer()) - .ok_or_else(|| { - QueryParserError::UnknownTokenizer( - field_entry.name().to_string(), - option.tokenizer().to_string(), - ) - })?; - let mut terms: Vec<(usize, Term)> = Vec::new(); - let mut token_stream = tokenizer.token_stream(phrase); - token_stream.process(&mut |token| { - let term = Term::from_field_text(field, &token.text); - terms.push((token.position, term)); - }); - if terms.is_empty() { - Ok(vec![]) - } else if terms.len() == 1 { - Ok(terms) - } else { - let field_entry = self.schema.get_field_entry(field); - let field_type = field_entry.field_type(); - if let Some(index_record_option) = field_type.get_index_record_option() { - if index_record_option.has_positions() { - Ok(terms) - } else { - let fieldname = self.schema.get_field_name(field).to_string(); - Err(QueryParserError::FieldDoesNotHavePositionsIndexed( - 
fieldname, - )) - } - } else { - let fieldname = self.schema.get_field_name(field).to_string(); - Err(QueryParserError::FieldNotIndexed(fieldname)) - } - } - } else { + let option = str_options.get_indexing_options().ok_or_else(|| { // This should have been seen earlier really. - Err(QueryParserError::FieldNotIndexed( - field_entry.name().to_string(), - )) - } + QueryParserError::FieldNotIndexed(field_name.to_string()) + })?; + let text_analyzer = + self.tokenizer_manager + .get(option.tokenizer()) + .ok_or_else(|| { + QueryParserError::UnknownTokenizer( + field_name.to_string(), + option.tokenizer().to_string(), + ) + })?; + let index_record_option = option.index_option(); + generate_literals_for_str( + field_name, + field, + phrase, + &text_analyzer, + index_record_option, + ) + } + FieldType::JsonObject(ref json_options) => { + let option = json_options.get_text_indexing_options().ok_or_else(|| { + // This should have been seen earlier really. + QueryParserError::FieldNotIndexed(field_name.to_string()) + })?; + let text_analyzer = + self.tokenizer_manager + .get(option.tokenizer()) + .ok_or_else(|| { + QueryParserError::UnknownTokenizer( + field_name.to_string(), + option.tokenizer().to_string(), + ) + })?; + let index_record_option = option.index_option(); + generate_literals_for_json_object( + field_name, + field, + json_path, + phrase, + &text_analyzer, + index_record_option, + ) } FieldType::Facet(_) => match Facet::from_text(phrase) { - Ok(facet) => Ok(vec![(0, Term::from_facet(field, &facet))]), + Ok(facet) => { + let facet_term = Term::from_facet(field, &facet); + Ok(vec![LogicalLiteral::Term(facet_term)]) + } Err(e) => Err(QueryParserError::from(e)), }, FieldType::Bytes(_) => { let bytes = base64::decode(phrase).map_err(QueryParserError::ExpectedBase64)?; - let term = Term::from_field_bytes(field, &bytes); - Ok(vec![(0, term)]) + let bytes_term = Term::from_field_bytes(field, &bytes); + Ok(vec![LogicalLiteral::Term(bytes_term)]) } } } - fn compute_logical_ast_for_leaf( - &self, - field: Field, - phrase: &str, - ) -> Result, QueryParserError> { - let terms = self.compute_terms_for_string(field, phrase)?; - match &terms[..] { - [] => Ok(None), - [(_, term)] => Ok(Some(LogicalLiteral::Term(term.clone()))), - _ => Ok(Some(LogicalLiteral::Phrase(terms.clone()))), - } - } - fn default_occur(&self) -> Occur { if self.conjunction_by_default { Occur::Must @@ -398,22 +368,24 @@ impl QueryParser { fn resolve_bound( &self, - field: Field, + _field: Field, bound: &UserInputBound, ) -> Result, QueryParserError> { if bound.term_str() == "*" { return Ok(Bound::Unbounded); } - let terms = self.compute_terms_for_string(field, bound.term_str())?; - if terms.len() != 1 { - return Err(QueryParserError::RangeMustNotHavePhrase); - } - let (_, term) = terms.into_iter().next().unwrap(); - match *bound { - UserInputBound::Inclusive(_) => Ok(Bound::Included(term)), - UserInputBound::Exclusive(_) => Ok(Bound::Excluded(term)), - UserInputBound::Unbounded => Ok(Bound::Unbounded), - } + unimplemented!() + // let terms = self.compute_terms_for_string(field, bound.term_str())?; + // TODO We need to be careful about JSON fields add new terms here!! 
+ // if terms.len() != 1 { + // return Err(QueryParserError::RangeMustNotHavePhrase); + // } + // let (_, term) = terms.into_iter().next().unwrap(); + // match *bound { + // UserInputBound::Inclusive(_) => Ok(Bound::Included(term)), + // UserInputBound::Exclusive(_) => Ok(Bound::Excluded(term)), + // UserInputBound::Unbounded => Ok(Bound::Unbounded), + // } } fn resolved_fields( @@ -459,31 +431,58 @@ impl QueryParser { self.boost.get(&field).cloned().unwrap_or(1.0) } + fn default_indexed_json_fields(&self) -> impl Iterator + '_ { + let schema = self.schema.clone(); + self.default_fields.iter().cloned().filter(move |field| { + let field_type = schema.get_field_entry(*field).field_type(); + field_type.value_type() == Type::Json && field_type.is_indexed() + }) + } + + fn compute_path_triplet_for_literal<'a>( + &self, + literal: &'a UserInputLiteral, + ) -> Result, QueryParserError> { + match &literal.field_name { + Some(ref full_path) => { + // We need to add terms associated to json default fields. + let (field_name, path) = full_path.split_once(".").unwrap_or((full_path, "")); + if let Ok(field) = self.resolve_field_name(field_name) { + return Ok(vec![(field, path, literal.phrase.as_str())]); + } + let triplets: Vec<(Field, &str, &str)> = self + .default_indexed_json_fields() + .map(|json_field| (json_field, full_path.as_str(), literal.phrase.as_str())) + .collect(); + if triplets.is_empty() { + return Err(QueryParserError::FieldDoesNotExist(field_name.to_string())); + } + Ok(triplets) + } + None => { + if self.default_fields.is_empty() { + return Err(QueryParserError::NoDefaultFieldDeclared); + } + Ok(self + .default_fields + .iter() + .map(|default_field| (*default_field, "", literal.phrase.as_str())) + .collect::>()) + } + } + } + fn compute_logical_ast_from_leaf( &self, leaf: UserInputLeaf, ) -> Result { match leaf { UserInputLeaf::Literal(literal) => { - let term_phrases: Vec<(Field, String)> = match literal.field_name { - Some(ref field_name) => { - let field = self.resolve_field_name(field_name)?; - vec![(field, literal.phrase.clone())] - } - None => { - if self.default_fields.is_empty() { - return Err(QueryParserError::NoDefaultFieldDeclared); - } else { - self.default_fields - .iter() - .map(|default_field| (*default_field, literal.phrase.clone())) - .collect::>() - } - } - }; + let term_phrases: Vec<(Field, &str, &str)> = + self.compute_path_triplet_for_literal(&literal)?; let mut asts: Vec = Vec::new(); - for (field, phrase) in term_phrases { - if let Some(ast) = self.compute_logical_ast_for_leaf(field, &phrase)? { + for (field, json_path, phrase) in term_phrases { + for ast in self.compute_logical_ast_for_leaf(field, json_path, &phrase)? { // Apply some field specific boost defined at the query parser level. 
let boost = self.field_boost(field); asts.push(LogicalAst::Leaf(Box::new(ast)).boost(boost)); @@ -552,6 +551,114 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box { } } +fn generate_literals_for_str( + field_name: &str, + field: Field, + phrase: &str, + text_analyzer: &TextAnalyzer, + index_record_option: IndexRecordOption, +) -> Result, QueryParserError> { + let mut terms: Vec<(usize, Term)> = Vec::new(); + let mut token_stream = text_analyzer.token_stream(phrase); + token_stream.process(&mut |token| { + let term = Term::from_field_text(field, &token.text); + terms.push((token.position, term)); + }); + if terms.len() <= 1 { + return Ok(terms + .into_iter() + .map(|(_, term)| LogicalLiteral::Term(term)) + .collect()); + } + if !index_record_option.has_positions() { + return Err(QueryParserError::FieldDoesNotHavePositionsIndexed( + field_name.to_string(), + )); + } + Ok(vec![LogicalLiteral::Phrase(terms)]) +} + +enum NumValue { + U64(u64), + I64(i64), + F64(f64), + DateTime(crate::DateTime), +} + +fn infer_type_num(phrase: &str) -> Option { + if let Ok(dt) = chrono::DateTime::parse_from_rfc3339(phrase) { + let dt_utc = dt.with_timezone(&chrono::Utc); + return Some(NumValue::DateTime(dt_utc)); + } + if let Ok(u64_val) = str::parse::(phrase) { + return Some(NumValue::U64(u64_val)); + } + if let Ok(i64_val) = str::parse::(phrase) { + return Some(NumValue::I64(i64_val)); + } + if let Ok(f64_val) = str::parse::(phrase) { + return Some(NumValue::F64(f64_val)); + } + None +} + +fn generate_literals_for_json_object( + field_name: &str, + field: Field, + json_path: &str, + phrase: &str, + text_analyzer: &TextAnalyzer, + index_record_option: IndexRecordOption, +) -> Result, QueryParserError> { + let mut logical_literals = Vec::new(); + let mut term = Term::new(); + term.set_field(Type::Json, field); + let mut json_term_writer = JsonTermWriter::wrap(&mut term); + for segment in json_path.split(".") { + json_term_writer.push_path_segment(segment); + } + if let Some(num_value) = infer_type_num(phrase) { + match num_value { + NumValue::U64(u64_val) => { + json_term_writer.set_fast_value(u64_val); + } + NumValue::I64(i64_val) => { + json_term_writer.set_fast_value(i64_val); + } + NumValue::F64(f64_val) => { + json_term_writer.set_fast_value(f64_val); + } + NumValue::DateTime(dt_val) => { + json_term_writer.set_fast_value(dt_val); + } + } + logical_literals.push(LogicalLiteral::Term(json_term_writer.term().clone())); + } + json_term_writer.close_path_and_set_type(Type::Str); + drop(json_term_writer); + let term_num_bytes = term.as_slice().len(); + let mut token_stream = text_analyzer.token_stream(phrase); + let mut terms: Vec<(usize, Term)> = Vec::new(); + token_stream.process(&mut |token| { + term.as_mut().truncate(term_num_bytes); + term.as_mut().extend_from_slice(token.text.as_bytes()); + terms.push((token.position, term.clone())); + }); + if terms.len() <= 1 { + for (_, term) in terms { + logical_literals.push(LogicalLiteral::Term(term)); + } + return Ok(logical_literals); + } + if !index_record_option.has_positions() { + return Err(QueryParserError::FieldDoesNotHavePositionsIndexed( + field_name.to_string(), + )); + } + logical_literals.push(LogicalLiteral::Phrase(terms)); + Ok(logical_literals) +} + fn convert_to_query(logical_ast: LogicalAst) -> Box { match trim_ast(logical_ast) { Some(LogicalAst::Clause(trimmed_clause)) => { @@ -615,13 +722,15 @@ mod test { schema_builder.add_facet_field("facet", FacetOptions::default()); schema_builder.add_bytes_field("bytes", INDEXED); 
schema_builder.add_bytes_field("bytes_not_indexed", STORED); + schema_builder.add_json_field("json", TEXT); + schema_builder.add_json_field("json_not_indexed", STORED); schema_builder.build() } - fn make_query_parser() -> QueryParser { + fn make_query_parser_with_default_fields(default_fields: &[&'static str]) -> QueryParser { let schema = make_schema(); - let default_fields: Vec = vec!["title", "text"] - .into_iter() + let default_fields: Vec = default_fields + .iter() .flat_map(|field_name| schema.get_field(field_name)) .collect(); let tokenizer_manager = TokenizerManager::default(); @@ -634,6 +743,10 @@ mod test { QueryParser::new(schema, default_fields, tokenizer_manager) } + fn make_query_parser() -> QueryParser { + make_query_parser_with_default_fields(&["title", "text"]) + } + fn parse_query_to_logical_ast( query: &str, default_conjunction: bool, @@ -661,7 +774,7 @@ mod test { let query = query_parser.parse_query("facet:/root/branch/leaf").unwrap(); assert_eq!( format!("{:?}", query), - r#"TermQuery(Term(type=Facet, field=11, val="/root/branch/leaf"))"# + r#"TermQuery(Term(type=Facet, field=11, "/root/branch/leaf"))"# ); } @@ -674,7 +787,7 @@ mod test { let query = query_parser.parse_query("text:hello").unwrap(); assert_eq!( format!("{:?}", query), - r#"Boost(query=TermQuery(Term(type=Str, field=1, val="hello")), boost=2)"# + r#"Boost(query=TermQuery(Term(type=Str, field=1, "hello")), boost=2)"# ); } @@ -701,7 +814,7 @@ mod test { let query = query_parser.parse_query("text:hello^2").unwrap(); assert_eq!( format!("{:?}", query), - r#"Boost(query=Boost(query=TermQuery(Term(type=Str, field=1, val="hello")), boost=2), boost=2)"# + r#"Boost(query=Boost(query=TermQuery(Term(type=Str, field=1, "hello")), boost=2), boost=2)"# ); } @@ -736,7 +849,7 @@ mod test { pub fn test_parse_query_untokenized() { test_parse_query_to_logical_ast_helper( "nottokenized:\"wordone wordtwo\"", - r#"Term(type=Str, field=7, val="wordone wordtwo")"#, + r#"Term(type=Str, field=7, "wordone wordtwo")"#, false, ); } @@ -779,7 +892,7 @@ mod test { .is_ok()); test_parse_query_to_logical_ast_helper( "unsigned:2324", - "Term(type=U64, field=3, val=2324)", + "Term(type=U64, field=3, 2324)", false, ); @@ -806,7 +919,7 @@ mod test { fn test_parse_bytes() { test_parse_query_to_logical_ast_helper( "bytes:YnVidQ==", - "Term(type=Bytes, field=12, val=[98, 117, 98, 117])", + "Term(type=Bytes, field=12, [98, 117, 98, 117])", false, ); } @@ -817,11 +930,100 @@ mod test { assert!(matches!(error, QueryParserError::FieldNotIndexed(_))); } + #[test] + fn test_json_field() { + test_parse_query_to_logical_ast_helper( + "json.titi:hello", + "Term(type=Json, field=14, path=titi, vtype=Str, \"hello\")", + false, + ); + } + + #[test] + fn test_json_field_possibly_a_number() { + test_parse_query_to_logical_ast_helper( + "json.titi:5", + r#"(Term(type=Json, field=14, path=titi, vtype=U64, 5) Term(type=Json, field=14, path=titi, vtype=Str, "5"))"#, + true, + ); + test_parse_query_to_logical_ast_helper( + "json.titi:-5", + r#"(Term(type=Json, field=14, path=titi, vtype=I64, -5) Term(type=Json, field=14, path=titi, vtype=Str, "5"))"#, //< Yes this is a bit weird after going through the tokenizer we lose the "-". 
+ true + ); + test_parse_query_to_logical_ast_helper( + "json.titi:-5.2", + r#"(Term(type=Json, field=14, path=titi, vtype=F64, -5.2) "[(0, Term(type=Json, field=14, path=titi, vtype=Str, "5")), (1, Term(type=Json, field=14, path=titi, vtype=Str, "2"))]")"#, + true, + ); + } + + #[test] + fn test_json_field_possibly_a_date() { + test_parse_query_to_logical_ast_helper( + r#"json.date:"2019-10-12T07:20:50.52Z""#, + r#"(Term(type=Json, field=14, path=date, vtype=Date, 2019-10-12T07:20:50Z) "[(0, Term(type=Json, field=14, path=date, vtype=Str, "2019")), (1, Term(type=Json, field=14, path=date, vtype=Str, "10")), (2, Term(type=Json, field=14, path=date, vtype=Str, "12t07")), (3, Term(type=Json, field=14, path=date, vtype=Str, "20")), (4, Term(type=Json, field=14, path=date, vtype=Str, "50")), (5, Term(type=Json, field=14, path=date, vtype=Str, "52z"))]")"#, + true, + ); + } + + #[test] + fn test_json_field_not_indexed() { + let error = parse_query_to_logical_ast("json_not_indexed.titi:hello", false).unwrap_err(); + assert!(matches!(error, QueryParserError::FieldNotIndexed(_))); + } + + fn test_query_to_logical_ast_with_default_json( + query: &str, + expected: &str, + default_conjunction: bool, + ) { + let mut query_parser = make_query_parser_with_default_fields(&["json"]); + if default_conjunction { + query_parser.set_conjunction_by_default(); + } + let ast = query_parser.parse_query_to_logical_ast(query).unwrap(); + let ast_str = format!("{ast:?}"); + assert_eq!(ast_str, expected); + } + + #[test] + fn test_json_default() { + test_query_to_logical_ast_with_default_json( + "titi:4", + "(Term(type=Json, field=14, path=titi, vtype=U64, 4) Term(type=Json, field=14, \ + path=titi, vtype=Str, \"4\"))", + false, + ); + } + + #[test] + fn test_json_default_with_different_field() { + for conjunction in [false, true] { + test_query_to_logical_ast_with_default_json( + "text:4", + r#"Term(type=Str, field=1, "4")"#, + conjunction, + ); + } + } + + #[test] + fn test_json_default_with_same_field() { + for conjunction in [false, true] { + test_query_to_logical_ast_with_default_json( + "json:4", + r#"(Term(type=Json, field=14, path=, vtype=U64, 4) Term(type=Json, field=14, path=, vtype=Str, "4"))"#, + conjunction, + ); + } + } + #[test] fn test_parse_bytes_phrase() { test_parse_query_to_logical_ast_helper( "bytes:\"YnVidQ==\"", - "Term(type=Bytes, field=12, val=[98, 117, 98, 117])", + "Term(type=Bytes, field=12, [98, 117, 98, 117])", false, ); } @@ -837,12 +1039,12 @@ mod test { fn test_parse_query_to_ast_ab_c() { test_parse_query_to_logical_ast_helper( "(+title:a +title:b) title:c", - r#"((+Term(type=Str, field=0, val="a") +Term(type=Str, field=0, val="b")) Term(type=Str, field=0, val="c"))"#, + r#"((+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b")) Term(type=Str, field=0, "c"))"#, false, ); test_parse_query_to_logical_ast_helper( "(+title:a +title:b) title:c", - r#"(+(+Term(type=Str, field=0, val="a") +Term(type=Str, field=0, val="b")) +Term(type=Str, field=0, val="c"))"#, + r#"(+(+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b")) +Term(type=Str, field=0, "c"))"#, true, ); } @@ -851,17 +1053,17 @@ mod test { pub fn test_parse_query_to_ast_single_term() { test_parse_query_to_logical_ast_helper( "title:toto", - r#"Term(type=Str, field=0, val="toto")"#, + r#"Term(type=Str, field=0, "toto")"#, false, ); test_parse_query_to_logical_ast_helper( "+title:toto", - r#"Term(type=Str, field=0, val="toto")"#, + r#"Term(type=Str, field=0, "toto")"#, false, ); test_parse_query_to_logical_ast_helper( 
"+title:toto -titi", - r#"(+Term(type=Str, field=0, val="toto") -(Term(type=Str, field=0, val="titi") Term(type=Str, field=1, val="titi")))"#, + r#"(+Term(type=Str, field=0, "toto") -(Term(type=Str, field=0, "titi") Term(type=Str, field=1, "titi")))"#, false, ); } @@ -878,12 +1080,12 @@ mod test { pub fn test_parse_query_to_ast_two_terms() { test_parse_query_to_logical_ast_helper( "title:a b", - r#"(Term(type=Str, field=0, val="a") (Term(type=Str, field=0, val="b") Term(type=Str, field=1, val="b")))"#, + r#"(Term(type=Str, field=0, "a") (Term(type=Str, field=0, "b") Term(type=Str, field=1, "b")))"#, false, ); test_parse_query_to_logical_ast_helper( r#"title:"a b""#, - r#""[(0, Term(type=Str, field=0, val="a")), (1, Term(type=Str, field=0, val="b"))]""#, + r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]""#, false, ); } @@ -892,37 +1094,37 @@ mod test { pub fn test_parse_query_to_ast_ranges() { test_parse_query_to_logical_ast_helper( "title:[a TO b]", - r#"(Included(Term(type=Str, field=0, val="a")) TO Included(Term(type=Str, field=0, val="b")))"#, + r#"(Included(Term(type=Str, field=0, "a")) TO Included(Term(type=Str, field=0, "b")))"#, false, ); test_parse_query_to_logical_ast_helper( "[a TO b]", - r#"((Included(Term(type=Str, field=0, val="a")) TO Included(Term(type=Str, field=0, val="b"))) (Included(Term(type=Str, field=1, val="a")) TO Included(Term(type=Str, field=1, val="b"))))"#, + r#"((Included(Term(type=Str, field=0, "a")) TO Included(Term(type=Str, field=0, "b"))) (Included(Term(type=Str, field=1, "a")) TO Included(Term(type=Str, field=1, "b"))))"#, false, ); test_parse_query_to_logical_ast_helper( "title:{titi TO toto}", - r#"(Excluded(Term(type=Str, field=0, val="titi")) TO Excluded(Term(type=Str, field=0, val="toto")))"#, + r#"(Excluded(Term(type=Str, field=0, "titi")) TO Excluded(Term(type=Str, field=0, "toto")))"#, false, ); test_parse_query_to_logical_ast_helper( "title:{* TO toto}", - r#"(Unbounded TO Excluded(Term(type=Str, field=0, val="toto")))"#, + r#"(Unbounded TO Excluded(Term(type=Str, field=0, "toto")))"#, false, ); test_parse_query_to_logical_ast_helper( "title:{titi TO *}", - r#"(Excluded(Term(type=Str, field=0, val="titi")) TO Unbounded)"#, + r#"(Excluded(Term(type=Str, field=0, "titi")) TO Unbounded)"#, false, ); test_parse_query_to_logical_ast_helper( "signed:{-5 TO 3}", - r#"(Excluded(Term(type=I64, field=2, val=-5)) TO Excluded(Term(type=I64, field=2, val=3)))"#, + r#"(Excluded(Term(type=I64, field=2, -5)) TO Excluded(Term(type=I64, field=2, 3)))"#, false, ); test_parse_query_to_logical_ast_helper( "float:{-1.5 TO 1.5}", - r#"(Excluded(Term(type=F64, field=10, val=-1.5)) TO Excluded(Term(type=F64, field=10, val=1.5)))"#, + r#"(Excluded(Term(type=F64, field=10, -1.5)) TO Excluded(Term(type=F64, field=10, 1.5)))"#, false, ); test_parse_query_to_logical_ast_helper("*", "*", false); @@ -1051,27 +1253,27 @@ mod test { pub fn test_parse_query_to_ast_conjunction() { test_parse_query_to_logical_ast_helper( "title:toto", - r#"Term(type=Str, field=0, val="toto")"#, + r#"Term(type=Str, field=0, "toto")"#, true, ); test_parse_query_to_logical_ast_helper( "+title:toto", - r#"Term(type=Str, field=0, val="toto")"#, + r#"Term(type=Str, field=0, "toto")"#, true, ); test_parse_query_to_logical_ast_helper( "+title:toto -titi", - r#"(+Term(type=Str, field=0, val="toto") -(Term(type=Str, field=0, val="titi") Term(type=Str, field=1, val="titi")))"#, + r#"(+Term(type=Str, field=0, "toto") -(Term(type=Str, field=0, "titi") Term(type=Str, field=1, 
"titi")))"#, true, ); test_parse_query_to_logical_ast_helper( "title:a b", - r#"(+Term(type=Str, field=0, val="a") +(Term(type=Str, field=0, val="b") Term(type=Str, field=1, val="b")))"#, + r#"(+Term(type=Str, field=0, "a") +(Term(type=Str, field=0, "b") Term(type=Str, field=1, "b")))"#, true, ); test_parse_query_to_logical_ast_helper( "title:\"a b\"", - r#""[(0, Term(type=Str, field=0, val="a")), (1, Term(type=Str, field=0, val="b"))]""#, + r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]""#, true, ); } @@ -1080,7 +1282,7 @@ mod test { pub fn test_query_parser_hyphen() { test_parse_query_to_logical_ast_helper( "title:www-form-encoded", - r#""[(0, Term(type=Str, field=0, val="www")), (1, Term(type=Str, field=0, val="form")), (2, Term(type=Str, field=0, val="encoded"))]""#, + r#""[(0, Term(type=Str, field=0, "www")), (1, Term(type=Str, field=0, "form")), (2, Term(type=Str, field=0, "encoded"))]""#, false, ); } @@ -1090,7 +1292,7 @@ mod test { for &default_conjunction in &[false, true] { test_parse_query_to_logical_ast_helper( "title:a AND title:b", - r#"(+Term(type=Str, field=0, val="a") +Term(type=Str, field=0, val="b"))"#, + r#"(+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b"))"#, default_conjunction, ); } @@ -1101,7 +1303,7 @@ mod test { for &default_conjunction in &[false, true] { test_parse_query_to_logical_ast_helper( "title:a OR title:b", - r#"(Term(type=Str, field=0, val="a") Term(type=Str, field=0, val="b"))"#, + r#"(Term(type=Str, field=0, "a") Term(type=Str, field=0, "b"))"#, default_conjunction, ); } diff --git a/src/query/term_query/mod.rs b/src/query/term_query/mod.rs index f997cdc51f..b6c1067215 100644 --- a/src/query/term_query/mod.rs +++ b/src/query/term_query/mod.rs @@ -174,7 +174,7 @@ mod tests { ); assert_eq!( format!("{:?}", term_query), - r#"TermQuery(Term(type=Str, field=1, val="hello"))"# + r#"TermQuery(Term(type=Str, field=1, "hello"))"# ); } diff --git a/src/schema/document.rs b/src/schema/document.rs index 2833e27d7a..c9da05b53e 100644 --- a/src/schema/document.rs +++ b/src/schema/document.rs @@ -117,7 +117,16 @@ impl Document { /// Add a bytes field pub fn add_bytes>>(&mut self, field: Field, value: T) { - self.add_field_value(field, value.into()) + self.add_field_value(field, value.into()); + } + + /// Add a bytes field + pub fn add_json_object( + &mut self, + field: Field, + json_object: serde_json::Map, + ) { + self.add_field_value(field, json_object); } /// Add a (field, value) to the document. diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index 099f0ed278..b58d3128cb 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -1,7 +1,9 @@ use serde::{Deserialize, Serialize}; use crate::schema::bytes_options::BytesOptions; -use crate::schema::{is_valid_field_name, FacetOptions, FieldType, IntOptions, TextOptions}; +use crate::schema::{ + is_valid_field_name, FacetOptions, FieldType, IntOptions, JsonObjectOptions, TextOptions, +}; /// A `FieldEntry` represents a field and its configuration. /// `Schema` are a collection of `FieldEntry` @@ -27,71 +29,44 @@ impl FieldEntry { } } - /// Creates a new u64 field entry in the schema, given - /// a name, and some options. + /// Creates a new text field entry. 
pub fn new_text(field_name: String, text_options: TextOptions) -> FieldEntry { - assert!(is_valid_field_name(&field_name)); - FieldEntry { - name: field_name, - field_type: FieldType::Str(text_options), - } + Self::new(field_name, FieldType::Str(text_options)) } - /// Creates a new u64 field entry in the schema, given - /// a name, and some options. - pub fn new_u64(field_name: String, field_type: IntOptions) -> FieldEntry { - assert!(is_valid_field_name(&field_name)); - FieldEntry { - name: field_name, - field_type: FieldType::U64(field_type), - } + /// Creates a new u64 field entry. + pub fn new_u64(field_name: String, int_options: IntOptions) -> FieldEntry { + Self::new(field_name, FieldType::U64(int_options)) } - /// Creates a new i64 field entry in the schema, given - /// a name, and some options. - pub fn new_i64(field_name: String, field_type: IntOptions) -> FieldEntry { - assert!(is_valid_field_name(&field_name)); - FieldEntry { - name: field_name, - field_type: FieldType::I64(field_type), - } + /// Creates a new i64 field entry. + pub fn new_i64(field_name: String, int_options: IntOptions) -> FieldEntry { + Self::new(field_name, FieldType::I64(int_options)) } - /// Creates a new f64 field entry in the schema, given - /// a name, and some options. - pub fn new_f64(field_name: String, field_type: IntOptions) -> FieldEntry { - assert!(is_valid_field_name(&field_name)); - FieldEntry { - name: field_name, - field_type: FieldType::F64(field_type), - } + /// Creates a new f64 field entry. + pub fn new_f64(field_name: String, f64_options: IntOptions) -> FieldEntry { + Self::new(field_name, FieldType::F64(f64_options)) } - /// Creates a new date field entry in the schema, given - /// a name, and some options. - pub fn new_date(field_name: String, field_type: IntOptions) -> FieldEntry { - assert!(is_valid_field_name(&field_name)); - FieldEntry { - name: field_name, - field_type: FieldType::Date(field_type), - } + /// Creates a new date field entry. + pub fn new_date(field_name: String, date_options: IntOptions) -> FieldEntry { + Self::new(field_name, FieldType::Date(date_options)) } /// Creates a field entry for a facet. 
- pub fn new_facet(field_name: String, field_type: FacetOptions) -> FieldEntry { - assert!(is_valid_field_name(&field_name)); - FieldEntry { - name: field_name, - field_type: FieldType::Facet(field_type), - } + pub fn new_facet(field_name: String, facet_options: FacetOptions) -> FieldEntry { + Self::new(field_name, FieldType::Facet(facet_options)) } /// Creates a field entry for a bytes field - pub fn new_bytes(field_name: String, bytes_type: BytesOptions) -> FieldEntry { - FieldEntry { - name: field_name, - field_type: FieldType::Bytes(bytes_type), - } + pub fn new_bytes(field_name: String, bytes_options: BytesOptions) -> FieldEntry { + Self::new(field_name, FieldType::Bytes(bytes_options)) + } + + /// Creates a field entry for a json field + pub fn new_json(field_name: String, json_object_options: JsonObjectOptions) -> FieldEntry { + Self::new(field_name, FieldType::JsonObject(json_object_options)) } /// Returns the name of the field @@ -137,6 +112,7 @@ impl FieldEntry { FieldType::Str(ref options) => options.is_stored(), FieldType::Facet(ref options) => options.is_stored(), FieldType::Bytes(ref options) => options.is_stored(), + FieldType::JsonObject(ref options) => options.is_stored(), } } } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index c0e429a947..dd9fe49edb 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -5,7 +5,9 @@ use thiserror::Error; use crate::schema::bytes_options::BytesOptions; use crate::schema::facet_options::FacetOptions; -use crate::schema::{Facet, IndexRecordOption, IntOptions, TextFieldIndexing, TextOptions, Value}; +use crate::schema::{ + Facet, IndexRecordOption, IntOptions, JsonObjectOptions, TextFieldIndexing, TextOptions, Value, +}; use crate::tokenizer::PreTokenizedString; /// Possible error that may occur while parsing a field value @@ -47,9 +49,11 @@ pub enum Type { Facet = b'h', /// `Vec` Bytes = b'b', + /// Leaf in a Json object. 
+ Json = b'j', } -const ALL_TYPES: [Type; 7] = [ +const ALL_TYPES: [Type; 8] = [ Type::Str, Type::U64, Type::I64, @@ -57,6 +61,7 @@ const ALL_TYPES: [Type; 7] = [ Type::Date, Type::Facet, Type::Bytes, + Type::Json, ]; impl Type { @@ -81,6 +86,7 @@ impl Type { Type::Date => "Date", Type::Facet => "Facet", Type::Bytes => "Bytes", + Type::Json => "Json", } } @@ -95,6 +101,7 @@ impl Type { b'd' => Some(Type::Date), b'h' => Some(Type::Facet), b'b' => Some(Type::Bytes), + b'j' => Some(Type::Json), _ => None, } } @@ -121,6 +128,8 @@ pub enum FieldType { Facet(FacetOptions), /// Bytes (one per document) Bytes(BytesOptions), + /// Json object + JsonObject(JsonObjectOptions), } impl FieldType { @@ -134,6 +143,7 @@ impl FieldType { FieldType::Date(_) => Type::Date, FieldType::Facet(_) => Type::Facet, FieldType::Bytes(_) => Type::Bytes, + FieldType::JsonObject(_) => Type::Json, } } @@ -147,6 +157,7 @@ impl FieldType { FieldType::Date(ref date_options) => date_options.is_indexed(), FieldType::Facet(ref _facet_options) => true, FieldType::Bytes(ref bytes_options) => bytes_options.is_indexed(), + FieldType::JsonObject(ref json_object_options) => json_object_options.is_indexed(), } } @@ -158,6 +169,9 @@ impl FieldType { FieldType::Str(text_options) => text_options .get_indexing_options() .map(|text_indexing| text_indexing.index_option()), + FieldType::JsonObject(json_object_options) => json_object_options + .get_text_indexing_options() + .map(|text_indexing| text_indexing.index_option()), field_type => { if field_type.is_indexed() { Some(IndexRecordOption::Basic) @@ -181,6 +195,7 @@ impl FieldType { | FieldType::Date(ref int_options) => int_options.fieldnorms(), FieldType::Facet(_) => false, FieldType::Bytes(ref bytes_options) => bytes_options.fieldnorms(), + FieldType::JsonObject(ref _json_object_options) => false, } } @@ -215,6 +230,9 @@ impl FieldType { None } } + FieldType::JsonObject(ref json_obj_options) => json_obj_options + .get_text_indexing_options() + .map(TextFieldIndexing::index_option), } } @@ -249,6 +267,10 @@ impl FieldType { base64: field_text.clone(), } }), + FieldType::JsonObject(_) => Err(ValueParsingError::TypeError { + expected: "a json object", + json: json.clone(), + }), }, JsonValue::Number(ref field_val_num) => match *self { FieldType::I64(_) | FieldType::Date(_) => { @@ -287,8 +309,12 @@ impl FieldType { json: json.clone(), }) } + FieldType::JsonObject(_) => Err(ValueParsingError::TypeError { + expected: "a json object", + json: json.clone(), + }), }, - JsonValue::Object(_) => match *self { + JsonValue::Object(ref json_map) => match *self { FieldType::Str(_) => { if let Ok(tok_str_val) = serde_json::from_value::(json.clone()) @@ -301,6 +327,7 @@ impl FieldType { }) } } + FieldType::JsonObject(_) => Ok(Value::JsonObject(json_map.clone())), _ => Err(ValueParsingError::TypeError { expected: self.value_type().name(), json: json.clone(), diff --git a/src/schema/json_object_options.rs b/src/schema/json_object_options.rs new file mode 100644 index 0000000000..2a0795c121 --- /dev/null +++ b/src/schema/json_object_options.rs @@ -0,0 +1,109 @@ +use std::ops::BitOr; + +use serde::{Deserialize, Serialize}; + +use crate::schema::flags::{SchemaFlagList, StoredFlag}; +use crate::schema::{TextFieldIndexing, TextOptions}; + +/// The `JsonObjectOptions` make it possible to +/// configure how a json object field should be indexed and stored. 
+#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct JsonObjectOptions { + stored: bool, + // If set to some, int, date, f64 and text will be indexed. + // Text will use the TextFieldIndexing setting for indexing. + indexing: Option, +} + +impl JsonObjectOptions { + /// Returns `true` iff the json object should be stored. + pub fn is_stored(&self) -> bool { + self.stored + } + + /// Returns `true` iff the json object should be indexed. + pub fn is_indexed(&self) -> bool { + self.indexing.is_some() + } + + /// Returns the text indexing options. + /// + /// If set to `Some` then both int and str values will be indexed. + /// The inner `TextFieldIndexing` will however, only apply to the str values + /// in the json object. + pub fn get_text_indexing_options(&self) -> Option<&TextFieldIndexing> { + self.indexing.as_ref() + } +} + +impl From for JsonObjectOptions { + fn from(_stored_flag: StoredFlag) -> Self { + JsonObjectOptions { + stored: true, + indexing: None, + } + } +} + +impl From<()> for JsonObjectOptions { + fn from(_: ()) -> Self { + Self::default() + } +} + +impl> BitOr for JsonObjectOptions { + type Output = JsonObjectOptions; + + fn bitor(self, other: T) -> Self { + let other = other.into(); + JsonObjectOptions { + indexing: self.indexing.or(other.indexing), + stored: self.stored | other.stored, + } + } +} + +impl From> for JsonObjectOptions +where + Head: Clone, + Tail: Clone, + Self: BitOr + From + From, +{ + fn from(head_tail: SchemaFlagList) -> Self { + Self::from(head_tail.head) | Self::from(head_tail.tail) + } +} + +impl From for JsonObjectOptions { + fn from(text_options: TextOptions) -> Self { + JsonObjectOptions { + stored: text_options.is_stored(), + indexing: text_options.get_indexing_options().cloned(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::schema::{STORED, TEXT}; + + #[test] + fn test_json_options() { + { + let json_options: JsonObjectOptions = (STORED | TEXT).into(); + assert!(json_options.is_stored()); + assert!(json_options.is_indexed()); + } + { + let json_options: JsonObjectOptions = TEXT.into(); + assert!(!json_options.is_stored()); + assert!(json_options.is_indexed()); + } + { + let json_options: JsonObjectOptions = STORED.into(); + assert!(json_options.is_stored()); + assert!(!json_options.is_indexed()); + } + } +} diff --git a/src/schema/mod.rs b/src/schema/mod.rs index e64ccc4381..c9b9b899a0 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -104,7 +104,7 @@ mod document; mod facet; mod facet_options; mod schema; -mod term; +pub(crate) mod term; mod field_entry; mod field_type; @@ -112,14 +112,14 @@ mod field_value; mod bytes_options; mod field; +mod flags; mod index_record_option; mod int_options; +mod json_object_options; mod named_field_document; mod text_options; mod value; -mod flags; - pub use self::bytes_options::BytesOptions; pub use self::document::Document; pub(crate) use self::facet::FACET_SEP_BYTE; @@ -132,6 +132,7 @@ pub use self::field_value::FieldValue; pub use self::flags::{FAST, INDEXED, STORED}; pub use self::index_record_option::IndexRecordOption; pub use self::int_options::{Cardinality, IntOptions}; +pub use self::json_object_options::JsonObjectOptions; pub use self::named_field_document::NamedFieldDocument; pub use self::schema::{DocParsingError, Schema, SchemaBuilder}; pub use self::term::Term; diff --git a/src/schema/schema.rs b/src/schema/schema.rs index b501ee70b6..6768eaee8f 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -173,6 +173,16 @@ impl 
SchemaBuilder { self.add_field(field_entry) } + /// Adds a json object field to the schema. + pub fn add_json_field>( + &mut self, + field_name: &str, + field_options: T, + ) -> Field { + let field_entry = FieldEntry::new_json(field_name.to_string(), field_options.into()); + self.add_field(field_entry) + } + /// Adds a field entry to the schema in build. pub fn add_field(&mut self, field_entry: FieldEntry) -> Field { let field = Field::from_field_id(self.fields.len() as u32); diff --git a/src/schema/term.rs b/src/schema/term.rs index c6e3cfbb08..9257cdba3f 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -1,3 +1,4 @@ +use std::convert::TryInto; use std::hash::{Hash, Hasher}; use std::{fmt, str}; @@ -8,8 +9,26 @@ use crate::DateTime; /// Size (in bytes) of the buffer of a fast value (u64, i64, f64, or date) term. /// + + +/// +/// - is a big endian encoded u32 field id +/// - 's most significant bit expresses whether the term is a json term or not +/// The remaining 7 bits are used to encode the type of the value. +/// If this is a JSON term, the type is the type of the leaf of the json. +/// +/// - is, if this is not the json term, a binary representation specific to the type. +/// If it is a JSON Term, then it is preprended with the path that leads to this leaf value. const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8; +/// Separates the different segments of +/// the json path. +pub const JSON_PATH_SEGMENT_SEP: u8 = 1u8; +pub const JSON_PATH_SEGMENT_SEP_STR: &str = + unsafe { std::str::from_utf8_unchecked(&[JSON_PATH_SEGMENT_SEP]) }; + +/// Separates the json path and the value in +/// a JSON term binary representation. +pub const JSON_END_OF_PATH: u8 = 0u8; + /// Term represents the value that the token can take. /// /// It actually wraps a `Vec`. @@ -17,6 +36,12 @@ const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8; pub struct Term>(B) where B: AsRef<[u8]>; +impl AsMut> for Term { + fn as_mut(&mut self) -> &mut Vec { + &mut self.0 + } +} + impl Term { pub(crate) fn new() -> Term { Term(Vec::with_capacity(100)) @@ -164,13 +189,16 @@ where B: AsRef<[u8]> Term(data) } + fn typ_code(&self) -> u8 { + *self + .as_slice() + .get(4) + .expect("the byte representation is too short") + } + /// Return the type of the term. pub fn typ(&self) -> Type { - assert!( - self.as_slice().len() >= 5, - "the type does byte representation is too short" - ); - Type::from_code(self.as_slice()[4]).expect("The term has an invalid type code") + Type::from_code(self.typ_code()).expect("The term has an invalid type code") } /// Returns the field. 
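// ---------------------------------------------------------------------------
// Illustrative layout (not part of the diff) of a json term built through
// `JsonTermWriter`, assuming a json field with field id 5, the json path
// "attributes.color" and the single token "red" (the id and names are made up).
// It matches the decoding performed by `as_json_path_type_value_bytes` below:
//
//   [0, 0, 0, 5]    big-endian u32 field id
//   [b'j']          type code of the field: Type::Json
//   b"attributes"   first path segment
//   [1u8]           JSON_PATH_SEGMENT_SEP
//   b"color"        second path segment
//   [0u8]           JSON_END_OF_PATH
//   [b's']          type code of the leaf value: Type::Str
//   b"red"          leaf value bytes
// ---------------------------------------------------------------------------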
@@ -189,10 +217,14 @@ where B: AsRef<[u8]> } fn get_fast_type(&self) -> Option { - if self.typ() != T::to_type() || self.as_slice().len() != FAST_VALUE_TERM_LEN { + if self.typ() != T::to_type() { return None; } let mut value_bytes = [0u8; 8]; + let bytes = self.value_bytes(); + if bytes.len() != 8 { + return None; + } value_bytes.copy_from_slice(self.value_bytes()); let value_u64 = u64::from_be_bytes(value_bytes); Some(FastValue::from_u64(value_u64)) @@ -290,40 +322,74 @@ fn write_opt(f: &mut fmt::Formatter, val_opt: Option) -> Ok(()) } -impl fmt::Debug for Term { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let field_id = self.field().field_id(); - let typ = self.typ(); - write!(f, "Term(type={:?}, field={}, val=", typ, field_id,)?; - match typ { - Type::Str => { - let s = str::from_utf8(self.value_bytes()).ok(); - write_opt(f, s)?; - } - Type::U64 => { - write_opt(f, self.as_u64())?; - } - Type::I64 => { - let val_i64 = self.as_i64(); - write_opt(f, val_i64)?; - } - Type::F64 => { - let val_f64 = self.as_f64(); - write_opt(f, val_f64)?; - } - // TODO pretty print these types too. - Type::Date => { - let val_date = self.as_date(); - write_opt(f, val_date)?; - } - Type::Facet => { - let facet = self.as_facet().map(|facet| facet.to_path_string()); - write_opt(f, facet)?; - } - Type::Bytes => { - write_opt(f, self.as_bytes())?; +fn as_str(value_bytes: &[u8]) -> Option<&str> { + std::str::from_utf8(value_bytes).ok() +} + +fn get_fast_type(bytes: &[u8]) -> Option { + let value_u64 = u64::from_be_bytes(bytes.try_into().ok()?); + Some(FastValue::from_u64(value_u64)) +} + +/// Returns the json path (without non-human friendly separators, the type of the value, and the +/// value bytes). Returns None if the value is not JSON or is not valid. +pub(crate) fn as_json_path_type_value_bytes(bytes: &[u8]) -> Option<(&str, Type, &[u8])> { + let pos = bytes.iter().cloned().position(|b| b == JSON_END_OF_PATH)?; + let json_path = str::from_utf8(&bytes[..pos]).ok()?; + let type_code = *bytes.get(pos + 1)?; + let typ = Type::from_code(type_code)?; + Some((json_path, typ, &bytes[pos + 2..])) +} + +fn debug_value_bytes(typ: Type, bytes: &[u8], f: &mut fmt::Formatter) -> fmt::Result { + match typ { + Type::Str => { + let s = as_str(bytes); + write_opt(f, s)?; + } + Type::U64 => { + write_opt(f, get_fast_type::(bytes))?; + } + Type::I64 => { + write_opt(f, get_fast_type::(bytes))?; + } + Type::F64 => { + write_opt(f, get_fast_type::(bytes))?; + } + // TODO pretty print these types too. 
+ Type::Date => { + write_opt(f, get_fast_type::(bytes))?; + } + Type::Facet => { + let facet_str = str::from_utf8(bytes) + .ok() + .map(ToString::to_string) + .map(Facet::from_encoded_string) + .map(|facet| facet.to_path_string()); + write_opt(f, facet_str)?; + } + Type::Bytes => { + write_opt(f, Some(bytes))?; + } + Type::Json => { + if let Some((path, typ, bytes)) = as_json_path_type_value_bytes(bytes) { + let path_pretty = path.replace(JSON_PATH_SEGMENT_SEP_STR, "."); + write!(f, "path={path_pretty}, vtype={typ:?}, ")?; + debug_value_bytes(typ, bytes, f)?; } } + } + Ok(()) +} + +impl fmt::Debug for Term +where B: AsRef<[u8]> +{ + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let field_id = self.field().field_id(); + let typ = self.typ(); + write!(f, "Term(type={typ:?}, field={field_id}, ")?; + debug_value_bytes(typ, self.value_bytes(), f)?; write!(f, ")",)?; Ok(()) } diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index 83b2dd674a..45fa3b4888 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -46,7 +46,7 @@ impl TextOptions { /// Essentially, should we store the term frequency and/or the positions (See /// [`IndexRecordOption`](./enum.IndexRecordOption.html)). /// - the name of the `Tokenizer` that should be used to process the field. -#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)] +#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)] pub struct TextFieldIndexing { record: IndexRecordOption, fieldnorms: bool, diff --git a/src/schema/value.rs b/src/schema/value.rs index df83930d60..5c7d25cfea 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -2,6 +2,7 @@ use std::fmt; use serde::de::Visitor; use serde::{Deserialize, Deserializer, Serialize, Serializer}; +use serde_json::Map; use crate::schema::Facet; use crate::tokenizer::PreTokenizedString; @@ -27,6 +28,8 @@ pub enum Value { Facet(Facet), /// Arbitrarily sized byte array Bytes(Vec), + /// Json object value. + JsonObject(serde_json::Map), } impl Eq for Value {} @@ -43,6 +46,7 @@ impl Serialize for Value { Value::Date(ref date) => serializer.serialize_str(&date.to_rfc3339()), Value::Facet(ref facet) => facet.serialize(serializer), Value::Bytes(ref bytes) => serializer.serialize_bytes(bytes), + Value::JsonObject(ref obj) => obj.serialize(serializer), } } } @@ -168,6 +172,17 @@ impl Value { None } } + + /// Returns the json object, provided the value is of the JsonObject type. + /// + /// Returns None if the value is not of type JsonObject. 
+ pub fn as_json(&self) -> Option<&Map> { + if let Value::JsonObject(json) = self { + Some(json) + } else { + None + } + } } impl From for Value { @@ -230,6 +245,23 @@ impl From for Value { } } +impl From> for Value { + fn from(json_object: serde_json::Map) -> Value { + Value::JsonObject(json_object) + } +} + +impl From for Value { + fn from(json_value: serde_json::Value) -> Value { + match json_value { + serde_json::Value::Object(json_object) => Value::JsonObject(json_object), + _ => { + panic!("Expected a json object."); + } + } + } +} + mod binary_serialize { use std::io::{self, Read, Write}; @@ -248,6 +280,7 @@ mod binary_serialize { const DATE_CODE: u8 = 5; const F64_CODE: u8 = 6; const EXT_CODE: u8 = 7; + const JSON_OBJ_CODE: u8 = 8; // extended types @@ -296,8 +329,14 @@ mod binary_serialize { BYTES_CODE.serialize(writer)?; bytes.serialize(writer) } + Value::JsonObject(ref map) => { + JSON_OBJ_CODE.serialize(writer)?; + serde_json::to_writer(writer, &map)?; + Ok(()) + } } } + fn deserialize(reader: &mut R) -> io::Result { let type_code = u8::deserialize(reader)?; match type_code { @@ -347,6 +386,10 @@ mod binary_serialize { )), } } + JSON_OBJ_CODE => { + let map = serde_json::from_reader(reader)?; + Ok(Value::JsonObject(map)) + } _ => Err(io::Error::new( io::ErrorKind::InvalidData, format!("No field type is associated with code {:?}", type_code), diff --git a/src/tokenizer/empty_tokenizer.rs b/src/tokenizer/empty_tokenizer.rs new file mode 100644 index 0000000000..1dca0006dd --- /dev/null +++ b/src/tokenizer/empty_tokenizer.rs @@ -0,0 +1,41 @@ +use crate::tokenizer::{BoxTokenStream, Token, TokenStream, Tokenizer}; + +#[derive(Clone)] +pub(crate) struct EmptyTokenizer; + +impl Tokenizer for EmptyTokenizer { + fn token_stream<'a>(&self, _text: &'a str) -> BoxTokenStream<'a> { + EmptyTokenStream::default().into() + } +} + +#[derive(Default)] +struct EmptyTokenStream { + token: Token, +} + +impl TokenStream for EmptyTokenStream { + fn advance(&mut self) -> bool { + false + } + + fn token(&self) -> &super::Token { + &self.token + } + + fn token_mut(&mut self) -> &mut super::Token { + &mut self.token + } +} + +#[cfg(test)] +mod tests { + use crate::tokenizer::Tokenizer; + + #[test] + fn test_empty_tokenizer() { + let tokenizer = super::EmptyTokenizer; + let mut empty = tokenizer.token_stream("whatever string"); + assert!(!empty.advance()); + } +} diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 506b4d60a9..9549fa1820 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -119,6 +119,7 @@ //! ``` mod alphanum_only; mod ascii_folding_filter; +mod empty_tokenizer; mod facet_tokenizer; mod lower_caser; mod ngram_tokenizer; diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index e895ad0f1a..12a0dfab22 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -5,6 +5,8 @@ use std::ops::{Deref, DerefMut}; use serde::{Deserialize, Serialize}; +use crate::tokenizer::empty_tokenizer::EmptyTokenizer; + /// Token #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] pub struct Token { @@ -43,6 +45,12 @@ pub struct TextAnalyzer { token_filters: Vec, } +impl Default for TextAnalyzer { + fn default() -> TextAnalyzer { + TextAnalyzer::from(EmptyTokenizer) + } +} + impl From for TextAnalyzer { fn from(tokenizer: T) -> Self { TextAnalyzer::new(tokenizer, Vec::new())
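
As a closing illustration, here is a hedged end-to-end sketch (not part of the diff) exercising the new public surface introduced above: `SchemaBuilder::add_json_field`, `Document::add_json_object` and `Value::as_json`. The field name, the json content and the `serde_json` dependency are assumptions of the example, not something prescribed by the change.

use tantivy::schema::{Document, Schema, Value, STORED, TEXT};

fn main() {
    let mut schema_builder = Schema::builder();
    // Hypothetical field name; STORED | TEXT converts into JsonObjectOptions.
    let metadata = schema_builder.add_json_field("metadata", STORED | TEXT);
    let _schema = schema_builder.build();

    let mut json_object = serde_json::Map::new();
    json_object.insert(
        "color".to_string(),
        serde_json::Value::String("red".to_string()),
    );

    let mut doc = Document::default();
    doc.add_json_object(metadata, json_object);

    // The value is held on the document as a `Value::JsonObject`.
    let stored_value: &Value = doc.get_first(metadata).unwrap();
    assert!(stored_value.as_json().is_some());
}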