From 400a20b7af9b569892b38ec586f309198b0a4561 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 20 Sep 2022 16:17:34 +0800 Subject: [PATCH 01/27] add ip field add u128 multivalue reader and writer add ip to schema add ip writers, handle merge --- Cargo.toml | 1 + fastfield_codecs/benches/bench.rs | 2 +- fastfield_codecs/src/compact_space/mod.rs | 2 +- fastfield_codecs/src/lib.rs | 10 +- fastfield_codecs/src/main.rs | 2 +- src/fastfield/mod.rs | 5 +- src/fastfield/multivalued/mod.rs | 4 +- src/fastfield/multivalued/reader.rs | 151 +++++++++++++++- src/fastfield/multivalued/writer.rs | 142 ++++++++++++++- src/fastfield/readers.rs | 58 ++++++- src/fastfield/serializer/mod.rs | 5 + src/fastfield/writer.rs | 200 +++++++++++++++++++++- src/indexer/index_writer.rs | 134 +++++++++++++-- src/indexer/merger.rs | 161 ++++++++++++++++- src/indexer/segment_writer.rs | 7 + src/postings/per_field_postings_writer.rs | 1 + src/postings/postings_writer.rs | 1 + src/query/query_parser/query_parser.rs | 2 + src/schema/field_entry.rs | 7 + src/schema/field_type.rs | 35 +++- src/schema/schema.rs | 23 +++ src/schema/term.rs | 8 + src/schema/value.rs | 38 +++- 23 files changed, 966 insertions(+), 33 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 330d963625..1bbe0220bb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -60,6 +60,7 @@ measure_time = "0.8.2" ciborium = { version = "0.2", optional = true} async-trait = "0.1.53" arc-swap = "1.5.0" +roaring = "0.10.1" [target.'cfg(windows)'.dependencies] winapi = "0.3.9" diff --git a/fastfield_codecs/benches/bench.rs b/fastfield_codecs/benches/bench.rs index 526036d4a2..0bf46ae6e0 100644 --- a/fastfield_codecs/benches/bench.rs +++ b/fastfield_codecs/benches/bench.rs @@ -102,7 +102,7 @@ mod tests { let mut out = vec![]; serialize_u128(VecColumn::from(&data), &mut out).unwrap(); let out = OwnedBytes::new(out); - open_u128(out).unwrap() + open_u128::(out).unwrap() } #[bench] diff --git a/fastfield_codecs/src/compact_space/mod.rs b/fastfield_codecs/src/compact_space/mod.rs index 389bccf6e7..72283bb481 100644 --- a/fastfield_codecs/src/compact_space/mod.rs +++ b/fastfield_codecs/src/compact_space/mod.rs @@ -604,7 +604,7 @@ mod tests { ]; let mut out = Vec::new(); serialize_u128(VecColumn::from(vals), &mut out).unwrap(); - let decomp = open_u128(OwnedBytes::new(out)).unwrap(); + let decomp = open_u128::(OwnedBytes::new(out)).unwrap(); assert_eq!(decomp.get_between_vals(199..=200), vec![0]); assert_eq!(decomp.get_between_vals(199..=201), vec![0, 1]); diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index 1f66a27e9b..286564a867 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -22,6 +22,7 @@ mod compact_space; mod line; mod linear; mod monotonic_mapping; +mod monotonic_mapping_u128; mod column; mod gcd; @@ -32,6 +33,7 @@ use self::blockwise_linear::BlockwiseLinearCodec; pub use self::column::{monotonic_map_column, Column, VecColumn}; use self::linear::LinearCodec; pub use self::monotonic_mapping::MonotonicallyMappableToU64; +pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128; pub use self::serialize::{ estimate, serialize, serialize_and_load, serialize_u128, NormalizedHeader, }; @@ -73,8 +75,12 @@ impl FastFieldCodecType { } /// Returns the correct codec reader wrapped in the `Arc` for the data. 
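+/// A minimal round-trip sketch based on the usage in this patch's benches and
+/// tests; the buffer and value names below are illustrative only:
+///
+/// ```ignore
+/// let vals: Vec<u128> = vec![1, 2, 3];
+/// let mut out = Vec::new();
+/// serialize_u128(VecColumn::from(&vals), &mut out).unwrap();
+/// let column = open_u128::<u128>(OwnedBytes::new(out)).unwrap();
+/// assert_eq!(column.get_val(0), 1u128);
+/// ```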
-pub fn open_u128(bytes: OwnedBytes) -> io::Result>> { - Ok(Arc::new(CompactSpaceDecompressor::open(bytes)?)) +pub fn open_u128( + bytes: OwnedBytes, +) -> io::Result>> { + let monotonic_mapping = move |val: u128| Item::from_u128(val); + let reader = CompactSpaceDecompressor::open(bytes)?; + Ok(Arc::new(monotonic_map_column(reader, monotonic_mapping))) } /// Returns the correct codec reader wrapped in the `Arc` for the data. diff --git a/fastfield_codecs/src/main.rs b/fastfield_codecs/src/main.rs index d3d9c06f8d..7b963dc128 100644 --- a/fastfield_codecs/src/main.rs +++ b/fastfield_codecs/src/main.rs @@ -110,7 +110,7 @@ fn bench_ip() { (data.len() * 8) as f32 / dataset.len() as f32 ); - let decompressor = open_u128(OwnedBytes::new(data)).unwrap(); + let decompressor = open_u128::(OwnedBytes::new(data)).unwrap(); // Sample some ranges for value in dataset.iter().take(1110).skip(1100).cloned() { print_time!("get range"); diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 3fca75fce4..c825ee85c8 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -27,7 +27,10 @@ pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter}; pub use self::error::{FastFieldNotAvailableError, Result}; pub use self::facet_reader::FacetReader; pub(crate) use self::multivalued::{get_fastfield_codecs_for_multivalue, MultivalueStartIndex}; -pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter}; +pub use self::multivalued::{ + MultiValueU128FastFieldWriter, MultiValuedFastFieldReader, MultiValuedFastFieldWriter, + MultiValuedU128FastFieldReader, +}; pub use self::readers::FastFieldReaders; pub(crate) use self::readers::{type_and_cardinality, FastType}; pub use self::serializer::{Column, CompositeFastFieldSerializer}; diff --git a/src/fastfield/multivalued/mod.rs b/src/fastfield/multivalued/mod.rs index 26b49abd7b..c625a2e76d 100644 --- a/src/fastfield/multivalued/mod.rs +++ b/src/fastfield/multivalued/mod.rs @@ -3,9 +3,9 @@ mod writer; use fastfield_codecs::FastFieldCodecType; -pub use self::reader::MultiValuedFastFieldReader; -pub use self::writer::MultiValuedFastFieldWriter; +pub use self::reader::{MultiValuedFastFieldReader, MultiValuedU128FastFieldReader}; pub(crate) use self::writer::MultivalueStartIndex; +pub use self::writer::{MultiValueU128FastFieldWriter, MultiValuedFastFieldWriter}; /// The valid codecs for multivalue values excludes the linear interpolation codec. /// diff --git a/src/fastfield/multivalued/reader.rs b/src/fastfield/multivalued/reader.rs index f8e41f2e1c..994c03c7e7 100644 --- a/src/fastfield/multivalued/reader.rs +++ b/src/fastfield/multivalued/reader.rs @@ -1,7 +1,7 @@ -use std::ops::Range; +use std::ops::{Range, RangeInclusive}; use std::sync::Arc; -use fastfield_codecs::Column; +use fastfield_codecs::{Column, MonotonicallyMappableToU128}; use crate::fastfield::{FastValue, MultiValueLength}; use crate::DocId; @@ -99,6 +99,153 @@ impl MultiValueLength for MultiValuedFastFieldReader { self.total_num_vals() as u64 } } + +/// Reader for a multivalued `u128` fast field. +/// +/// The reader is implemented as a `u64` fast field for the index and a `u128` fast field. +/// +/// The `vals_reader` will access the concatenated list of all +/// values for all reader. +/// The `idx_reader` associated, for each document, the index of its first value. 
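+///
+/// A short usage sketch; `reader` is assumed to have been obtained through
+/// `FastFieldReaders::u128s` (or `ip_addrs`) and `doc` is an assumed `DocId`:
+///
+/// ```ignore
+/// let mut vals = Vec::new();
+/// reader.get_vals(doc, &mut vals);
+/// for val in vals {
+///     // each `val` is one of the document's u128 values
+/// }
+/// ```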
+#[derive(Clone)] +pub struct MultiValuedU128FastFieldReader { + idx_reader: Arc>, + vals_reader: Arc>, +} + +impl MultiValuedU128FastFieldReader { + pub(crate) fn open( + idx_reader: Arc>, + vals_reader: Arc>, + ) -> MultiValuedU128FastFieldReader { + Self { + idx_reader, + vals_reader, + } + } + + /// Returns `[start, end)`, such that the values associated + /// to the given document are `start..end`. + #[inline] + fn range(&self, doc: DocId) -> Range { + let start = self.idx_reader.get_val(doc as u64); + let end = self.idx_reader.get_val(doc as u64 + 1); + start..end + } + + /// Returns the array of values associated to the given `doc`. + #[inline] + pub fn get_first_val(&self, doc: DocId) -> Option { + let range = self.range(doc); + if range.is_empty() { + return None; + } + Some(self.vals_reader.get_val(range.start)) + } + + /// Returns the array of values associated to the given `doc`. + #[inline] + fn get_vals_for_range(&self, range: Range, vals: &mut Vec) { + let len = (range.end - range.start) as usize; + vals.resize(len, T::from_u128(0)); + self.vals_reader.get_range(range.start, &mut vals[..]); + } + + /// Returns the array of values associated to the given `doc`. + #[inline] + pub fn get_vals(&self, doc: DocId, vals: &mut Vec) { + let range = self.range(doc); + self.get_vals_for_range(range, vals); + } + + /// Returns all docids which are in the provided value range + pub fn get_between_vals(&self, range: RangeInclusive) -> Vec { + let positions = self.vals_reader.get_between_vals(range); + + positions_to_docids(&positions, self) + } + + /// Iterates over all elements in the fast field + pub fn iter(&self) -> impl Iterator + '_ { + self.vals_reader.iter() + } + + /// Returns the minimum value for this fast field. + /// + /// The min value does not take in account of possible + /// deleted document, and should be considered as a lower bound + /// of the actual mimimum value. + pub fn min_value(&self) -> T { + self.vals_reader.min_value() + } + + /// Returns the maximum value for this fast field. + /// + /// The max value does not take in account of possible + /// deleted document, and should be considered as an upper bound + /// of the actual maximum value. + pub fn max_value(&self) -> T { + self.vals_reader.max_value() + } + + /// Returns the number of values associated with the document `DocId`. + #[inline] + pub fn num_vals(&self, doc: DocId) -> usize { + let range = self.range(doc); + (range.end - range.start) as usize + } + + /// Returns the overall number of values in this field. + #[inline] + pub fn total_num_vals(&self) -> u64 { + self.idx_reader.max_value() + } +} + +impl MultiValueLength for MultiValuedU128FastFieldReader { + fn get_range(&self, doc_id: DocId) -> std::ops::Range { + self.range(doc_id) + } + fn get_len(&self, doc_id: DocId) -> u64 { + self.num_vals(doc_id) as u64 + } + fn get_total_len(&self) -> u64 { + self.total_num_vals() as u64 + } +} + +/// Converts a list of positions of values in a 1:n index to the corresponding list of DocIds. +/// +/// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the index. +/// +/// Correctness: positions needs to be sorted. +/// +/// TODO: Instead of a linear scan we can employ a binary search to match a docid to its value +/// position. 
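+///
+/// For example, with per-document value ranges `[0..2)`, `[2..2)` and `[2..5)`,
+/// the sorted positions `[1, 3, 4]` map to the doc ids `[0, 2]`; doc `2` is
+/// reported only once even though two of its value positions match.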
+fn positions_to_docids(positions: &[u64], multival_idx: &T) -> Vec { + let mut docs = vec![]; + let mut cur_doc = 0u32; + let mut last_doc = None; + + for pos in positions { + loop { + let range = multival_idx.get_range(cur_doc); + if range.contains(pos) { + // avoid duplicates + if Some(cur_doc) == last_doc { + break; + } + docs.push(cur_doc); + last_doc = Some(cur_doc); + break; + } + cur_doc += 1; + } + } + + docs +} + #[cfg(test)] mod tests { diff --git a/src/fastfield/multivalued/writer.rs b/src/fastfield/multivalued/writer.rs index 0fb30caf63..c5012911ed 100644 --- a/src/fastfield/multivalued/writer.rs +++ b/src/fastfield/multivalued/writer.rs @@ -1,6 +1,8 @@ use std::io; -use fastfield_codecs::{Column, MonotonicallyMappableToU64, VecColumn}; +use fastfield_codecs::{ + serialize_u128, Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64, VecColumn, +}; use fnv::FnvHashMap; use super::get_fastfield_codecs_for_multivalue; @@ -264,6 +266,144 @@ fn iter_remapped_multivalue_index<'a, C: Column>( })) } +/// Writer for multi-valued (as in, more than one value per document) +/// int fast field. +/// +/// This `Writer` is only useful for advanced users. +/// The normal way to get your multivalued int in your index +/// is to +/// - declare your field with fast set to `Cardinality::MultiValues` +/// in your schema +/// - add your document simply by calling `.add_document(...)`. +/// +/// The `MultiValuedFastFieldWriter` can be acquired from the + +pub struct MultiValueU128FastFieldWriter { + field: Field, + vals: Vec, + doc_index: Vec, +} + +impl MultiValueU128FastFieldWriter { + /// Creates a new `U128MultiValueFastFieldWriter` + pub(crate) fn new(field: Field) -> Self { + MultiValueU128FastFieldWriter { + field, + vals: Vec::new(), + doc_index: Vec::new(), + } + } + + /// The memory used (inclusive childs) + pub fn mem_usage(&self) -> usize { + self.vals.capacity() * std::mem::size_of::() + + self.doc_index.capacity() * std::mem::size_of::() + } + + /// Finalize the current document. + pub(crate) fn next_doc(&mut self) { + self.doc_index.push(self.vals.len() as u64); + } + + /// Pushes a new value to the current document. + pub(crate) fn add_val(&mut self, val: u128) { + self.vals.push(val); + } + + /// Shift to the next document and adds + /// all of the matching field values present in the document. + pub fn add_document(&mut self, doc: &Document) { + self.next_doc(); + for field_value in doc.field_values() { + if field_value.field == self.field { + let value = field_value.value(); + let ip_addr = value.as_ip().unwrap(); + let value = ip_addr.to_u128(); + self.add_val(value); + } + } + } + + /// Returns an iterator over values per doc_id in ascending doc_id order. + /// + /// Normally the order is simply iterating self.doc_id_index. + /// With doc_id_map it accounts for the new mapping, returning values in the order of the + /// new doc_ids. + fn get_ordered_values<'a: 'b, 'b>( + &'a self, + doc_id_map: Option<&'b DocIdMapping>, + ) -> impl Iterator { + get_ordered_values(&self.vals, &self.doc_index, doc_id_map) + } + + /// Serializes fast field values. 
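+    ///
+    /// The data is written as two columns: the per-document offset index goes
+    /// under index `0` (as a regular `u64` fast field), and the concatenated
+    /// `u128` values go under index `1` via `serialize_u128`.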
+ pub fn serialize( + mut self, + serializer: &mut CompositeFastFieldSerializer, + doc_id_map: Option<&DocIdMapping>, + ) -> io::Result<()> { + { + // writing the offset index + // + self.doc_index.push(self.vals.len() as u64); + let col = VecColumn::from(&self.doc_index[..]); + if let Some(doc_id_map) = doc_id_map { + let multi_value_start_index = MultivalueStartIndex::new(&col, doc_id_map); + serializer.create_auto_detect_u64_fast_field_with_idx( + self.field, + multi_value_start_index, + 0, + )?; + } else { + serializer.create_auto_detect_u64_fast_field_with_idx(self.field, col, 0)?; + } + } + { + let field_write = serializer.get_field_writer(self.field, 1); + + let mut values = Vec::with_capacity(self.vals.len()); + for vals in self.get_ordered_values(doc_id_map) { + for &val in vals { + values.push(val); + } + } + let col = VecColumn::from(&values[..]); + + serialize_u128(col, field_write)?; + } + Ok(()) + } +} + +/// Returns an iterator over values per doc_id in ascending doc_id order. +/// +/// Normally the order is simply iterating self.doc_id_index. +/// With doc_id_map it accounts for the new mapping, returning values in the order of the +/// new doc_ids. +fn get_ordered_values<'a: 'b, 'b, T>( + vals: &'a [T], + doc_index: &'a [u64], + doc_id_map: Option<&'b DocIdMapping>, +) -> impl Iterator { + let doc_id_iter: Box> = if let Some(doc_id_map) = doc_id_map { + Box::new(doc_id_map.iter_old_doc_ids()) + } else { + let max_doc = doc_index.len() as DocId; + Box::new(0..max_doc) + }; + doc_id_iter.map(move |doc_id| get_values_for_doc_id(doc_id, vals, doc_index)) +} + +/// returns all values for a doc_id +fn get_values_for_doc_id<'a, T>(doc_id: u32, vals: &'a [T], doc_index: &'a [u64]) -> &'a [T] { + let start_pos = doc_index[doc_id as usize] as usize; + let end_pos = doc_index + .get(doc_id as usize + 1) + .cloned() + .unwrap_or(vals.len() as u64) as usize; // special case, last doc_id has no offset information + &vals[start_pos..end_pos] +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs index 68f9a811f3..e4dbbd8588 100644 --- a/src/fastfield/readers.rs +++ b/src/fastfield/readers.rs @@ -1,7 +1,9 @@ +use std::net::IpAddr; use std::sync::Arc; -use fastfield_codecs::{open, Column}; +use fastfield_codecs::{open, open_u128, Column}; +use super::multivalued::MultiValuedU128FastFieldReader; use crate::directory::{CompositeFile, FileSlice}; use crate::fastfield::{ BytesFastFieldReader, FastFieldNotAvailableError, FastValue, MultiValuedFastFieldReader, @@ -23,6 +25,7 @@ pub struct FastFieldReaders { pub(crate) enum FastType { I64, U64, + U128, F64, Bool, Date, @@ -49,6 +52,9 @@ pub(crate) fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, FieldType::Str(options) if options.is_fast() => { Some((FastType::U64, Cardinality::MultiValues)) } + FieldType::Ip(options) => options + .get_fastfield_cardinality() + .map(|cardinality| (FastType::U128, cardinality)), _ => None, } } @@ -143,6 +149,56 @@ impl FastFieldReaders { self.typed_fast_field_reader(field) } + /// Returns the `ip` fast field reader reader associated to `field`. + /// + /// If `field` is not a u128 fast field, this method returns an Error. + pub fn ip_addr(&self, field: Field) -> crate::Result>> { + self.check_type(field, FastType::U128, Cardinality::SingleValue)?; + let bytes = self.fast_field_data(field, 0)?.read_bytes()?; + Ok(open_u128::(bytes)?) + } + + /// Returns the `ip` fast field reader reader associated to `field`. 
+ /// + /// If `field` is not a u128 fast field, this method returns an Error. + pub fn ip_addrs(&self, field: Field) -> crate::Result> { + self.check_type(field, FastType::U128, Cardinality::MultiValues)?; + let idx_reader: Arc> = self.typed_fast_field_reader(field)?; + + let bytes = self.fast_field_data(field, 1)?.read_bytes()?; + let vals_reader = open_u128::(bytes)?; + + Ok(MultiValuedU128FastFieldReader::open( + idx_reader, + vals_reader, + )) + } + + /// Returns the `u128` fast field reader reader associated to `field`. + /// + /// If `field` is not a u128 fast field, this method returns an Error. + pub fn u128(&self, field: Field) -> crate::Result>> { + self.check_type(field, FastType::U128, Cardinality::SingleValue)?; + let bytes = self.fast_field_data(field, 0)?.read_bytes()?; + Ok(open_u128::(bytes)?) + } + + /// Returns the `u128` multi-valued fast field reader reader associated to `field`. + /// + /// If `field` is not a u128 multi-valued fast field, this method returns an Error. + pub fn u128s(&self, field: Field) -> crate::Result> { + self.check_type(field, FastType::U128, Cardinality::MultiValues)?; + let idx_reader: Arc> = self.typed_fast_field_reader(field)?; + + let bytes = self.fast_field_data(field, 1)?.read_bytes()?; + let vals_reader = open_u128::(bytes)?; + + Ok(MultiValuedU128FastFieldReader::open( + idx_reader, + vals_reader, + )) + } + /// Returns the `u64` fast field reader reader associated with `field`, regardless of whether /// the given field is effectively of type `u64` or not. /// diff --git a/src/fastfield/serializer/mod.rs b/src/fastfield/serializer/mod.rs index 6ca2929317..f58f28a123 100644 --- a/src/fastfield/serializer/mod.rs +++ b/src/fastfield/serializer/mod.rs @@ -93,6 +93,11 @@ impl CompositeFastFieldSerializer { self.composite_write.for_field_with_idx(field, 1) } + /// Gets the underlying writer + pub fn get_field_writer(&mut self, field: Field, idx: usize) -> &mut impl Write { + self.composite_write.for_field_with_idx(field, idx) + } + /// Closes the serializer /// /// After this call the data must be persistently saved on disk. diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 5d1a0810e4..972e0dde2f 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -2,11 +2,14 @@ use std::collections::HashMap; use std::io; use common; -use fastfield_codecs::{Column, MonotonicallyMappableToU64}; +use fastfield_codecs::{ + serialize_u128, Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64, +}; use fnv::FnvHashMap; +use roaring::RoaringBitmap; use tantivy_bitpacker::BlockedBitpacker; -use super::multivalued::MultiValuedFastFieldWriter; +use super::multivalued::{MultiValueU128FastFieldWriter, MultiValuedFastFieldWriter}; use super::FastFieldType; use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer}; use crate::indexer::doc_id_mapping::DocIdMapping; @@ -19,6 +22,8 @@ use crate::DatePrecision; pub struct FastFieldsWriter { term_id_writers: Vec, single_value_writers: Vec, + u128_value_writers: Vec, + u128_multi_value_writers: Vec, multi_values_writers: Vec, bytes_value_writers: Vec, } @@ -34,6 +39,8 @@ fn fast_field_default_value(field_entry: &FieldEntry) -> u64 { impl FastFieldsWriter { /// Create all `FastFieldWriter` required by the schema. 
pub fn from_schema(schema: &Schema) -> FastFieldsWriter { + let mut u128_value_writers = Vec::new(); + let mut u128_multi_value_writers = Vec::new(); let mut single_value_writers = Vec::new(); let mut term_id_writers = Vec::new(); let mut multi_values_writers = Vec::new(); @@ -97,10 +104,27 @@ impl FastFieldsWriter { bytes_value_writers.push(fast_field_writer); } } + FieldType::Ip(opt) => { + if opt.is_fast() { + match opt.get_fastfield_cardinality() { + Some(Cardinality::SingleValue) => { + let fast_field_writer = U128FastFieldWriter::new(field); + u128_value_writers.push(fast_field_writer); + } + Some(Cardinality::MultiValues) => { + let fast_field_writer = MultiValueU128FastFieldWriter::new(field); + u128_multi_value_writers.push(fast_field_writer); + } + None => {} + } + } + } FieldType::Str(_) | FieldType::JsonObject(_) => {} } } FastFieldsWriter { + u128_value_writers, + u128_multi_value_writers, term_id_writers, single_value_writers, multi_values_writers, @@ -129,6 +153,16 @@ impl FastFieldsWriter { .iter() .map(|w| w.mem_usage()) .sum::() + + self + .u128_value_writers + .iter() + .map(|w| w.mem_usage()) + .sum::() + + self + .u128_multi_value_writers + .iter() + .map(|w| w.mem_usage()) + .sum::() } /// Get the `FastFieldWriter` associated with a field. @@ -190,7 +224,6 @@ impl FastFieldsWriter { .iter_mut() .find(|field_writer| field_writer.field() == field) } - /// Indexes all of the fastfields of a new document. pub fn add_document(&mut self, doc: &Document) { for field_writer in &mut self.term_id_writers { @@ -205,6 +238,12 @@ impl FastFieldsWriter { for field_writer in &mut self.bytes_value_writers { field_writer.add_document(doc); } + for field_writer in &mut self.u128_value_writers { + field_writer.add_document(doc); + } + for field_writer in &mut self.u128_multi_value_writers { + field_writer.add_document(doc); + } } /// Serializes all of the `FastFieldWriter`s by pushing them in @@ -230,6 +269,161 @@ impl FastFieldsWriter { for field_writer in self.bytes_value_writers { field_writer.serialize(serializer, doc_id_map)?; } + for field_writer in self.u128_value_writers { + field_writer.serialize(serializer, doc_id_map)?; + } + for field_writer in self.u128_multi_value_writers { + field_writer.serialize(serializer, doc_id_map)?; + } + + Ok(()) + } +} + +/// Fast field writer for u128 values. +/// The fast field writer just keeps the values in memory. +/// +/// Only when the segment writer can be closed and +/// persisted on disc, the fast field writer is +/// sent to a `FastFieldSerializer` via the `.serialize(...)` +/// method. +/// +/// We cannot serialize earlier as the values are +/// compressed to a compact number space and the number of +/// bits required for bitpacking can only been known once +/// we have seen all of the values. +pub struct U128FastFieldWriter { + field: Field, + vals: Vec, + val_count: u32, + + null_values: RoaringBitmap, +} + +impl U128FastFieldWriter { + /// Creates a new `IntFastFieldWriter` + pub fn new(field: Field) -> Self { + Self { + field, + vals: vec![], + val_count: 0, + null_values: RoaringBitmap::new(), + } + } + + /// The memory used (inclusive childs) + pub fn mem_usage(&self) -> usize { + self.vals.len() * 16 + } + + /// Records a new value. + /// + /// The n-th value being recorded is implicitely + /// associated to the document with the `DocId` n. 
+ /// (Well, `n-1` actually because of 0-indexing) + pub fn add_val(&mut self, val: u128) { + self.vals.push(val); + } + + /// Extract the fast field value from the document + /// (or use the default value) and records it. + /// + /// Extract the value associated to the fast field for + /// this document. + pub fn add_document(&mut self, doc: &Document) { + match doc.get_first(self.field) { + Some(v) => { + let ip_addr = v.as_ip().unwrap(); + let value = ip_addr.to_u128(); + self.add_val(value); + } + None => { + self.null_values.insert(self.val_count as u32); + } + }; + self.val_count += 1; + } + + /// Push the fast fields value to the `FastFieldWriter`. + pub fn serialize( + &self, + serializer: &mut CompositeFastFieldSerializer, + doc_id_map: Option<&DocIdMapping>, + ) -> io::Result<()> { + // To get the actual value, we could materialize the vec with u128 including nulls, but + // that could cost a lot of memory. Instead we just compute the index for of + // the values + let mut idx_to_val_idx = vec![]; + idx_to_val_idx.resize(self.val_count as usize, 0); + + let mut val_idx = 0; + for idx in 0..self.val_count { + if !self.null_values.contains(idx as u32) { + idx_to_val_idx[idx as usize] = val_idx as u32; + val_idx += 1; + } + } + + struct RemappedFFWriter<'a> { + doc_id_map: Option<&'a DocIdMapping>, + null_values: &'a RoaringBitmap, + vals: &'a [u128], + idx_to_val_idx: Vec, + val_count: u32, + } + impl<'a> Column for RemappedFFWriter<'a> { + fn get_val(&self, _idx: u64) -> u128 { + // unused by codec + unreachable!() + } + + fn min_value(&self) -> u128 { + // unused by codec + unreachable!() + } + + fn max_value(&self) -> u128 { + // unused by codec + unreachable!() + } + + fn num_vals(&self) -> u64 { + self.val_count as u64 + } + fn iter(&self) -> Box + '_> { + if let Some(doc_id_map) = self.doc_id_map { + let iter = doc_id_map.iter_old_doc_ids().map(|idx| { + if self.null_values.contains(idx as u32) { + 0 // TODO properly handle nulls + } else { + self.vals[self.idx_to_val_idx[idx as usize] as usize] + } + }); + Box::new(iter) + } else { + let iter = (0..self.val_count).map(|idx| { + if self.null_values.contains(idx as u32) { + 0 // TODO properly handle nulls + } else { + self.vals[self.idx_to_val_idx[idx as usize] as usize] + } + }); + Box::new(iter) + } + } + } + + let column = RemappedFFWriter { + doc_id_map, + null_values: &self.null_values, + vals: &self.vals, + idx_to_val_idx, + val_count: self.val_count, + }; + + let field_write = serializer.get_field_writer(self.field, 0); + serialize_u128(column, field_write)?; + Ok(()) } } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 3caa0f4aa3..9b3b6bfc9e 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -803,7 +803,9 @@ impl Drop for IndexWriter { #[cfg(test)] mod tests { use std::collections::{HashMap, HashSet}; + use std::net::IpAddr; + use fastfield_codecs::MonotonicallyMappableToU128; use proptest::prelude::*; use proptest::prop_oneof; use proptest::strategy::Strategy; @@ -815,7 +817,7 @@ mod tests { use crate::indexer::NoMergePolicy; use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery}; use crate::schema::{ - self, Cardinality, Facet, FacetOptions, IndexRecordOption, NumericOptions, + self, Cardinality, Facet, FacetOptions, IndexRecordOption, IpOptions, NumericOptions, TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT, }; use crate::store::DOCSTORE_CACHE_CAPACITY; @@ -1593,6 +1595,11 @@ mod tests { force_end_merge: bool, ) -> 
crate::Result<()> { let mut schema_builder = schema::Schema::builder(); + let ip_field = schema_builder.add_ip_field("ip", FAST | INDEXED | STORED); + let ips_field = schema_builder.add_ip_field( + "ips", + IpOptions::default().set_fast(Cardinality::MultiValues), + ); let id_field = schema_builder.add_u64_field("id", FAST | INDEXED | STORED); let bytes_field = schema_builder.add_bytes_field("bytes", FAST | INDEXED | STORED); let bool_field = schema_builder.add_bool_field("bool", FAST | INDEXED | STORED); @@ -1648,17 +1655,37 @@ mod tests { match op { IndexingOp::AddDoc { id } => { let facet = Facet::from(&("/cola/".to_string() + &id.to_string())); - index_writer.add_document(doc!(id_field=>id, - bytes_field => id.to_le_bytes().as_slice(), - multi_numbers=> id, - multi_numbers => id, - bool_field => (id % 2u64) != 0, - multi_bools => (id % 2u64) != 0, - multi_bools => (id % 2u64) == 0, - text_field => id.to_string(), - facet_field => facet, - large_text_field=> LOREM - ))?; + let ip_from_id = IpAddr::from_u128(id as u128); + + if id % 3 == 0 { + // every 3rd doc has no ip field + index_writer.add_document(doc!(id_field=>id, + bytes_field => id.to_le_bytes().as_slice(), + multi_numbers=> id, + multi_numbers => id, + bool_field => (id % 2u64) != 0, + multi_bools => (id % 2u64) != 0, + multi_bools => (id % 2u64) == 0, + text_field => id.to_string(), + facet_field => facet, + large_text_field=> LOREM + ))?; + } else { + index_writer.add_document(doc!(id_field=>id, + bytes_field => id.to_le_bytes().as_slice(), + ip_field => ip_from_id, + ips_field => ip_from_id, + ips_field => ip_from_id, + multi_numbers=> id, + multi_numbers => id, + bool_field => (id % 2u64) != 0, + multi_bools => (id % 2u64) != 0, + multi_bools => (id % 2u64) == 0, + text_field => id.to_string(), + facet_field => facet, + large_text_field=> LOREM + ))?; + } } IndexingOp::DeleteDoc { id } => { index_writer.delete_term(Term::from_field_u64(id_field, id)); @@ -1744,6 +1771,59 @@ mod tests { .collect::>() ); + // Load all ips addr + let ips: HashSet = searcher + .segment_readers() + .iter() + .flat_map(|segment_reader| { + let ff_reader = segment_reader.fast_fields().ip_addr(ip_field).unwrap(); + segment_reader.doc_ids_alive().flat_map(move |doc| { + let val = ff_reader.get_val(doc as u64); + if val == IpAddr::from_u128(0) { + None + } else { + Some(val) + } + }) + }) + .collect(); + + let expected_ips = expected_ids_and_num_occurrences + .keys() + .flat_map(|id| { + if id % 3 == 0 { + None + } else { + Some(IpAddr::from_u128(*id as u128)) + } + }) + .collect::>(); + assert_eq!(ips, expected_ips); + + let expected_ips = expected_ids_and_num_occurrences + .keys() + .filter_map(|id| { + if id % 3 == 0 { + None + } else { + Some(IpAddr::from_u128(*id as u128)) + } + }) + .collect::>(); + let ips: HashSet = searcher + .segment_readers() + .iter() + .flat_map(|segment_reader| { + let ff_reader = segment_reader.fast_fields().ip_addrs(ips_field).unwrap(); + segment_reader.doc_ids_alive().flat_map(move |doc| { + let mut vals = vec![]; + ff_reader.get_vals(doc, &mut vals); + vals.into_iter().filter(|val| val.to_u128() != 0) + }) + }) + .collect(); + assert_eq!(ips, expected_ips); + // multivalue fast field tests for segment_reader in searcher.segment_readers().iter() { let id_reader = segment_reader.fast_fields().u64(id_field).unwrap(); @@ -1847,6 +1927,36 @@ mod tests { Ok(()) } + #[test] + fn test_minimal() { + assert!(test_operation_strategy( + &[ + IndexingOp::AddDoc { id: 23 }, + IndexingOp::AddDoc { id: 13 }, + IndexingOp::DeleteDoc 
{ id: 13 } + ], + true, + false + ) + .is_ok()); + + assert!(test_operation_strategy( + &[ + IndexingOp::AddDoc { id: 23 }, + IndexingOp::AddDoc { id: 13 }, + IndexingOp::DeleteDoc { id: 13 } + ], + false, + false + ) + .is_ok()); + } + + #[test] + fn test_minimal_sort_merge() { + assert!(test_operation_strategy(&[IndexingOp::AddDoc { id: 3 },], true, true).is_ok()); + } + proptest! { #![proptest_config(ProptestConfig::with_cases(20))] #[test] diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 0ed47a9156..84762091f2 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use std::io::Write; use std::sync::Arc; -use fastfield_codecs::VecColumn; +use fastfield_codecs::{serialize_u128, VecColumn}; use itertools::Itertools; use measure_time::debug_time; @@ -11,8 +11,8 @@ use crate::core::{Segment, SegmentReader}; use crate::docset::{DocSet, TERMINATED}; use crate::error::DataCorruption; use crate::fastfield::{ - get_fastfield_codecs_for_multivalue, AliveBitSet, Column, CompositeFastFieldSerializer, - MultiValueLength, MultiValuedFastFieldReader, + get_fastfield_codecs_for_multivalue, AliveBitSet, Column, CompositeFastFieldSerializer, MultiValueLength, + MultiValuedFastFieldReader, MultiValuedU128FastFieldReader, }; use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter}; use crate::indexer::doc_id_mapping::{expect_field_id_for_sort_field, SegmentDocIdMapping}; @@ -295,6 +295,24 @@ impl IndexMerger { self.write_bytes_fast_field(field, fast_field_serializer, doc_id_mapping)?; } } + FieldType::Ip(options) => match options.get_fastfield_cardinality() { + Some(Cardinality::SingleValue) => { + self.write_u128_single_fast_field( + field, + fast_field_serializer, + doc_id_mapping, + )?; + } + Some(Cardinality::MultiValues) => { + self.write_u128_multi_fast_field( + field, + fast_field_serializer, + doc_id_mapping, + )?; + } + None => {} + }, + FieldType::JsonObject(_) | FieldType::Facet(_) | FieldType::Str(_) => { // We don't handle json fast field for the moment // They can be implemented using what is done @@ -305,6 +323,143 @@ impl IndexMerger { Ok(()) } + // used to merge `u128` single fast fields. + fn write_u128_multi_fast_field( + &self, + field: Field, + fast_field_serializer: &mut CompositeFastFieldSerializer, + doc_id_mapping: &SegmentDocIdMapping, + ) -> crate::Result<()> { + let segment_and_ff_readers = self + .readers + .iter() + .map(|segment_reader| { + let ff_reader: MultiValuedU128FastFieldReader = + segment_reader.fast_fields().u128s(field).expect( + "Failed to find index for multivalued field. 
This is a bug in tantivy, \ + please report.", + ); + (segment_reader, ff_reader) + }) + .collect::>(); + + Self::write_1_n_fast_field_idx_generic( + field, + fast_field_serializer, + doc_id_mapping, + &segment_and_ff_readers, + )?; + + let fast_field_readers = segment_and_ff_readers + .into_iter() + .map(|(_, ff_reader)| ff_reader) + .collect::>(); + + struct RemappedFFReader<'a> { + doc_id_mapping: &'a SegmentDocIdMapping, + fast_field_readers: Vec>, + } + impl<'a> Column for RemappedFFReader<'a> { + fn get_val(&self, _idx: u64) -> u128 { + // unused by codec + unreachable!() + } + + fn min_value(&self) -> u128 { + // unused by codec + unreachable!() + } + + fn max_value(&self) -> u128 { + // unused by codec + unreachable!() + } + + fn num_vals(&self) -> u64 { + self.doc_id_mapping.len() as u64 + } + fn iter<'b>(&'b self) -> Box + 'b> { + Box::new( + self.doc_id_mapping + .iter_old_doc_addrs() + .flat_map(|doc_addr| { + let fast_field_reader = + &self.fast_field_readers[doc_addr.segment_ord as usize]; + let mut out = vec![]; + fast_field_reader.get_vals(doc_addr.doc_id, &mut out); + out.into_iter() + }), + ) + } + } + let column = RemappedFFReader { + doc_id_mapping, + fast_field_readers, + }; + let field_write = fast_field_serializer.get_field_writer(field, 1); + serialize_u128(column, field_write)?; + + Ok(()) + } + + // used to merge `u128` single fast fields. + fn write_u128_single_fast_field( + &self, + field: Field, + fast_field_serializer: &mut CompositeFastFieldSerializer, + doc_id_mapping: &SegmentDocIdMapping, + ) -> crate::Result<()> { + let fast_field_readers = self + .readers + .iter() + .map(|reader| { + let u128_reader: Arc> = reader.fast_fields().u128(field).expect( + "Failed to find a reader for single fast field. This is a tantivy bug and it \ + should never happen.", + ); + u128_reader + }) + .collect::>(); + + struct RemappedFFReader<'a> { + doc_id_mapping: &'a SegmentDocIdMapping, + fast_field_readers: Vec>>, + } + impl<'a> Column for RemappedFFReader<'a> { + fn get_val(&self, _idx: u64) -> u128 { + // unused by codec + unreachable!() + } + + fn min_value(&self) -> u128 { + // unused by codec + unreachable!() + } + + fn max_value(&self) -> u128 { + // unused by codec + unreachable!() + } + + fn num_vals(&self) -> u64 { + self.doc_id_mapping.len() as u64 + } + fn iter<'b>(&'b self) -> Box + 'b> { + Box::new(self.doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| { + let fast_field_reader = &self.fast_field_readers[doc_addr.segment_ord as usize]; + fast_field_reader.get_val(doc_addr.doc_id as u64) + })) + } + } + let column = RemappedFFReader { + doc_id_mapping, + fast_field_readers, + }; + let field_write = fast_field_serializer.get_field_writer(field, 0); + serialize_u128(column, field_write)?; + Ok(()) + } + // used both to merge field norms, `u64/i64` single fast fields. 
fn write_single_fast_field( &self, diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 415378752c..36348518eb 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -294,6 +294,13 @@ impl SegmentWriter { ctx, )?; } + FieldType::Ip(_) => { + for value in values { + let ip_val = value.as_ip().ok_or_else(make_schema_error)?; + term_buffer.set_text(&ip_val.to_string()); + postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx); + } + } } } Ok(()) diff --git a/src/postings/per_field_postings_writer.rs b/src/postings/per_field_postings_writer.rs index 61d02752f7..d0ca89b11d 100644 --- a/src/postings/per_field_postings_writer.rs +++ b/src/postings/per_field_postings_writer.rs @@ -50,6 +50,7 @@ fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box Box::new(SpecializedPostingsWriter::::default()), FieldType::JsonObject(ref json_object_options) => { if let Some(text_indexing_option) = json_object_options.get_text_indexing_options() { diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 84c95739e6..ed2bd24341 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -89,6 +89,7 @@ pub(crate) fn serialize_postings( | FieldType::Bool(_) => {} FieldType::Bytes(_) => {} FieldType::JsonObject(_) => {} + FieldType::Ip(_) => {} // TODO check } let postings_writer = per_field_postings_writers.get_for_field(field); diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index f9e032f7a0..497bb80e6d 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -400,6 +400,7 @@ impl QueryParser { let bytes = base64::decode(phrase).map_err(QueryParserError::ExpectedBase64)?; Ok(Term::from_field_bytes(field, &bytes)) } + FieldType::Ip(_) => Ok(Term::from_field_text(field, phrase)), } } @@ -506,6 +507,7 @@ impl QueryParser { let bytes_term = Term::from_field_bytes(field, &bytes); Ok(vec![LogicalLiteral::Term(bytes_term)]) } + FieldType::Ip(_) => Err(QueryParserError::FieldNotIndexed(field_name.to_string())), } } diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index 997fbd2564..e3c23687e6 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -1,5 +1,6 @@ use serde::{Deserialize, Serialize}; +use super::ip_options::IpOptions; use crate::schema::bytes_options::BytesOptions; use crate::schema::{ is_valid_field_name, DateOptions, FacetOptions, FieldType, JsonObjectOptions, NumericOptions, @@ -60,6 +61,11 @@ impl FieldEntry { Self::new(field_name, FieldType::Date(date_options)) } + /// Creates a new ip field entry. + pub fn new_ip(field_name: String, ip_options: IpOptions) -> FieldEntry { + Self::new(field_name, FieldType::Ip(ip_options)) + } + /// Creates a field entry for a facet. 
pub fn new_facet(field_name: String, facet_options: FacetOptions) -> FieldEntry { Self::new(field_name, FieldType::Facet(facet_options)) @@ -114,6 +120,7 @@ impl FieldEntry { FieldType::Facet(ref options) => options.is_stored(), FieldType::Bytes(ref options) => options.is_stored(), FieldType::JsonObject(ref options) => options.is_stored(), + FieldType::Ip(ref options) => options.is_stored(), } } } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 3a631697e8..9c51f9a6fa 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -1,8 +1,12 @@ +use std::net::IpAddr; +use std::str::FromStr; + use serde::{Deserialize, Serialize}; use serde_json::Value as JsonValue; use thiserror::Error; use super::Cardinality; +use super::ip_options::IpOptions; use crate::schema::bytes_options::BytesOptions; use crate::schema::facet_options::FacetOptions; use crate::schema::{ @@ -62,9 +66,13 @@ pub enum Type { Bytes = b'b', /// Leaf in a Json object. Json = b'j', + /// IpAddr + Ip = b'p', + /// IpAddr + U128 = b'1', } -const ALL_TYPES: [Type; 9] = [ +const ALL_TYPES: [Type; 11] = [ Type::Str, Type::U64, Type::I64, @@ -74,6 +82,8 @@ const ALL_TYPES: [Type; 9] = [ Type::Facet, Type::Bytes, Type::Json, + Type::Ip, + Type::U128, ]; impl Type { @@ -100,6 +110,8 @@ impl Type { Type::Facet => "Facet", Type::Bytes => "Bytes", Type::Json => "Json", + Type::Ip => "Ip", + Type::U128 => "U128", } } @@ -116,6 +128,8 @@ impl Type { b'h' => Some(Type::Facet), b'b' => Some(Type::Bytes), b'j' => Some(Type::Json), + b'p' => Some(Type::Ip), + b'1' => Some(Type::U128), _ => None, } } @@ -146,6 +160,8 @@ pub enum FieldType { Bytes(BytesOptions), /// Json object JsonObject(JsonObjectOptions), + /// IpAddr field + Ip(IpOptions), } impl FieldType { @@ -161,6 +177,7 @@ impl FieldType { FieldType::Facet(_) => Type::Facet, FieldType::Bytes(_) => Type::Bytes, FieldType::JsonObject(_) => Type::Json, + FieldType::Ip(_) => Type::Ip, } } @@ -176,6 +193,7 @@ impl FieldType { FieldType::Facet(ref _facet_options) => true, FieldType::Bytes(ref bytes_options) => bytes_options.is_indexed(), FieldType::JsonObject(ref json_object_options) => json_object_options.is_indexed(), + FieldType::Ip(_) => false, } } @@ -210,6 +228,7 @@ impl FieldType { | FieldType::F64(ref int_options) | FieldType::Bool(ref int_options) => int_options.is_fast(), FieldType::Date(ref date_options) => date_options.is_fast(), + FieldType::Ip(ref options) => options.is_fast(), FieldType::Facet(_) => true, FieldType::JsonObject(_) => false, } @@ -250,6 +269,7 @@ impl FieldType { FieldType::Facet(_) => false, FieldType::Bytes(ref bytes_options) => bytes_options.fieldnorms(), FieldType::JsonObject(ref _json_object_options) => false, + FieldType::Ip(_) => false, } } @@ -294,6 +314,7 @@ impl FieldType { FieldType::JsonObject(ref json_obj_options) => json_obj_options .get_text_indexing_options() .map(TextFieldIndexing::index_option), + FieldType::Ip(_) => None, } } @@ -333,6 +354,14 @@ impl FieldType { expected: "a json object", json: JsonValue::String(field_text), }), + FieldType::Ip(_) => { + Ok(Value::Ip(IpAddr::from_str(&field_text).map_err(|err| { + ValueParsingError::ParseError { + error: err.to_string(), + json: JsonValue::String(field_text), + } + })?)) + } } } JsonValue::Number(field_val_num) => match self { @@ -380,6 +409,10 @@ impl FieldType { expected: "a json object", json: JsonValue::Number(field_val_num), }), + FieldType::Ip(_) => Err(ValueParsingError::TypeError { + expected: "a string with an ip addr", + json: 
JsonValue::Number(field_val_num), + }), }, JsonValue::Object(json_map) => match self { FieldType::Str(_) => { diff --git a/src/schema/schema.rs b/src/schema/schema.rs index 783ce11fec..e85c9d5ae2 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -7,6 +7,7 @@ use serde::ser::SerializeSeq; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde_json::{self, Value as JsonValue}; +use super::ip_options::IpOptions; use super::*; use crate::schema::bytes_options::BytesOptions; use crate::schema::field_type::ValueParsingError; @@ -144,6 +145,28 @@ impl SchemaBuilder { self.add_field(field_entry) } + /// Adds a ip field. + /// Returns the associated field handle + /// Internally, Tantivy simply stores ips as u64, + /// while the user supplies IpAddr values for convenience. + /// + /// # Caution + /// + /// Appending two fields with the same name + /// will result in the shadowing of the first + /// by the second one. + /// The first field will get a field id + /// but only the second one will be indexed + pub fn add_ip_field>( + &mut self, + field_name_str: &str, + field_options: T, + ) -> Field { + let field_name = String::from(field_name_str); + let field_entry = FieldEntry::new_ip(field_name, field_options.into()); + self.add_field(field_entry) + } + /// Adds a new text field. /// Returns the associated field handle /// diff --git a/src/schema/term.rs b/src/schema/term.rs index 99f3e5ed50..79546d1dd2 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -415,6 +415,14 @@ fn debug_value_bytes(typ: Type, bytes: &[u8], f: &mut fmt::Formatter) -> fmt::Re debug_value_bytes(typ, bytes, f)?; } } + Type::Ip => { + let s = as_str(bytes); // TODO: change when serialization changes + write_opt(f, s)?; + } + Type::U128 => { + let s = as_str(bytes); // TODO: change when serialization changes + write_opt(f, s)?; + } } Ok(()) } diff --git a/src/schema/value.rs b/src/schema/value.rs index bcfcfb74b3..55cdc4dd77 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -1,4 +1,5 @@ use std::fmt; +use std::net::IpAddr; use serde::de::Visitor; use serde::{Deserialize, Deserializer, Serialize, Serializer}; @@ -32,6 +33,8 @@ pub enum Value { Bytes(Vec), /// Json object value. JsonObject(serde_json::Map), + /// Ip + Ip(IpAddr), } impl Eq for Value {} @@ -50,6 +53,7 @@ impl Serialize for Value { Value::Facet(ref facet) => facet.serialize(serializer), Value::Bytes(ref bytes) => serializer.serialize_bytes(bytes), Value::JsonObject(ref obj) => obj.serialize(serializer), + Value::Ip(ref obj) => obj.serialize(serializer), // TODO check serialization } } } @@ -201,6 +205,16 @@ impl Value { None } } + + /// Returns the ip addr, provided the value is of the `Ip` type. 
+ /// (Returns None if the value is not of the `Ip` type) + pub fn as_ip(&self) -> Option { + if let Value::Ip(val) = self { + Some(*val) + } else { + None + } + } } impl From for Value { @@ -209,6 +223,12 @@ impl From for Value { } } +impl From for Value { + fn from(v: IpAddr) -> Value { + Value::Ip(v) + } +} + impl From for Value { fn from(v: u64) -> Value { Value::U64(v) @@ -287,7 +307,9 @@ impl From for Value { } mod binary_serialize { - use std::io::{self, Read, Write}; + use std::io::{self, ErrorKind, Read, Write}; + use std::net::IpAddr; + use std::str::FromStr; use common::{f64_to_u64, u64_to_f64, BinarySerializable}; @@ -306,6 +328,7 @@ mod binary_serialize { const EXT_CODE: u8 = 7; const JSON_OBJ_CODE: u8 = 8; const BOOL_CODE: u8 = 9; + const IP_CODE: u8 = 10; // extended types @@ -366,6 +389,10 @@ mod binary_serialize { serde_json::to_writer(writer, &map)?; Ok(()) } + Value::Ip(ref ip) => { + IP_CODE.serialize(writer)?; + ip.to_string().serialize(writer) // TODO Check best format + } } } @@ -418,7 +445,7 @@ mod binary_serialize { _ => Err(io::Error::new( io::ErrorKind::InvalidData, format!( - "No extended field type is associated with code {:?}", + "No extened field type is associated with code {:?}", ext_type_code ), )), @@ -436,6 +463,13 @@ mod binary_serialize { let json_map = as serde::Deserialize>::deserialize(&mut de)?; Ok(Value::JsonObject(json_map)) } + IP_CODE => { + let text = String::deserialize(reader)?; + Ok(Value::Ip(IpAddr::from_str(&text).map_err(|err| { + io::Error::new(ErrorKind::Other, err.to_string()) + })?)) + } + _ => Err(io::Error::new( io::ErrorKind::InvalidData, format!("No field type is associated with code {:?}", type_code), From 6113e0408cbc4dd4a0ed03984a6438306cdb0739 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 26 Sep 2022 17:03:29 +0800 Subject: [PATCH 02/27] remove comment --- src/postings/postings_writer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index ed2bd24341..c9eba39d75 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -89,7 +89,7 @@ pub(crate) fn serialize_postings( | FieldType::Bool(_) => {} FieldType::Bytes(_) => {} FieldType::JsonObject(_) => {} - FieldType::Ip(_) => {} // TODO check + FieldType::Ip(_) => {} } let postings_writer = per_field_postings_writers.get_for_field(field); From c8713a01edb759f8a199e6172ba275d3fee3fd60 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 27 Sep 2022 09:50:49 +0800 Subject: [PATCH 03/27] use iter api --- fastfield_codecs/benches/bench.rs | 3 +- fastfield_codecs/src/compact_space/mod.rs | 17 +++-- fastfield_codecs/src/main.rs | 7 +- fastfield_codecs/src/serialize.rs | 11 ++- src/fastfield/multivalued/writer.rs | 11 +-- src/fastfield/writer.rs | 80 ++++++-------------- src/indexer/merger.rs | 91 ++++------------------- src/schema/document.rs | 6 ++ 8 files changed, 69 insertions(+), 157 deletions(-) diff --git a/fastfield_codecs/benches/bench.rs b/fastfield_codecs/benches/bench.rs index 0bf46ae6e0..d56cbc51e1 100644 --- a/fastfield_codecs/benches/bench.rs +++ b/fastfield_codecs/benches/bench.rs @@ -100,7 +100,8 @@ mod tests { fn get_u128_column_from_data(data: &[u128]) -> Arc> { let mut out = vec![]; - serialize_u128(VecColumn::from(&data), &mut out).unwrap(); + let iter = || data.iter().cloned(); + serialize_u128(iter, data.len() as u64, &mut out).unwrap(); let out = OwnedBytes::new(out); open_u128::(out).unwrap() } diff --git 
a/fastfield_codecs/src/compact_space/mod.rs b/fastfield_codecs/src/compact_space/mod.rs index 72283bb481..dd6dfbdbbe 100644 --- a/fastfield_codecs/src/compact_space/mod.rs +++ b/fastfield_codecs/src/compact_space/mod.rs @@ -171,10 +171,10 @@ pub struct IPCodecParams { impl CompactSpaceCompressor { /// Taking the vals as Vec may cost a lot of memory. It is used to sort the vals. - pub fn train_from(column: &impl Column) -> Self { + pub fn train_from(iter: impl Iterator, num_vals: u64) -> Self { let mut values_sorted = BTreeSet::new(); - values_sorted.extend(column.iter()); - let total_num_values = column.num_vals(); + values_sorted.extend(iter); + let total_num_values = num_vals; let compact_space = get_compact_space(&values_sorted, total_num_values, COST_PER_BLANK_IN_BITS); @@ -443,7 +443,7 @@ impl CompactSpaceDecompressor { mod tests { use super::*; - use crate::{open_u128, serialize_u128, VecColumn}; + use crate::{open_u128, serialize_u128}; #[test] fn compact_space_test() { @@ -513,7 +513,12 @@ mod tests { fn test_aux_vals(u128_vals: &[u128]) -> OwnedBytes { let mut out = Vec::new(); - serialize_u128(VecColumn::from(u128_vals), &mut out).unwrap(); + serialize_u128( + || u128_vals.iter().cloned(), + u128_vals.len() as u64, + &mut out, + ) + .unwrap(); let data = OwnedBytes::new(out); test_all(data.clone(), u128_vals); @@ -603,7 +608,7 @@ mod tests { 5_000_000_000, ]; let mut out = Vec::new(); - serialize_u128(VecColumn::from(vals), &mut out).unwrap(); + serialize_u128(|| vals.iter().cloned(), vals.len() as u64, &mut out).unwrap(); let decomp = open_u128::(OwnedBytes::new(out)).unwrap(); assert_eq!(decomp.get_between_vals(199..=200), vec![0]); diff --git a/fastfield_codecs/src/main.rs b/fastfield_codecs/src/main.rs index 7b963dc128..8e81c41f5f 100644 --- a/fastfield_codecs/src/main.rs +++ b/fastfield_codecs/src/main.rs @@ -90,7 +90,7 @@ fn bench_ip() { { let mut data = vec![]; for dataset in dataset.chunks(500_000) { - serialize_u128(VecColumn::from(dataset), &mut data).unwrap(); + serialize_u128(|| dataset.iter().cloned(), dataset.len() as u64, &mut data).unwrap(); } let compression = data.len() as f64 / (dataset.len() * 16) as f64; println!("Compression 50_000 chunks {:.4}", compression); @@ -101,7 +101,10 @@ fn bench_ip() { } let mut data = vec![]; - serialize_u128(VecColumn::from(&dataset), &mut data).unwrap(); + { + print_time!("creation"); + serialize_u128(|| dataset.iter().cloned(), dataset.len() as u64, &mut data).unwrap(); + } let compression = data.len() as f64 / (dataset.len() * 16) as f64; println!("Compression {:.2}", compression); diff --git a/fastfield_codecs/src/serialize.rs b/fastfield_codecs/src/serialize.rs index 92f55f5d0f..9f1188f511 100644 --- a/fastfield_codecs/src/serialize.rs +++ b/fastfield_codecs/src/serialize.rs @@ -142,15 +142,14 @@ pub fn estimate( } } -pub fn serialize_u128( - typed_column: impl Column, +pub fn serialize_u128 I, I: Iterator>( + iter_gen: F, + num_vals: u64, output: &mut impl io::Write, ) -> io::Result<()> { // TODO write header, to later support more codecs - let compressor = CompactSpaceCompressor::train_from(&typed_column); - compressor - .compress_into(typed_column.iter(), output) - .unwrap(); + let compressor = CompactSpaceCompressor::train_from(iter_gen(), num_vals); + compressor.compress_into(iter_gen(), output).unwrap(); Ok(()) } diff --git a/src/fastfield/multivalued/writer.rs b/src/fastfield/multivalued/writer.rs index c5012911ed..f694ca8df7 100644 --- a/src/fastfield/multivalued/writer.rs +++ b/src/fastfield/multivalued/writer.rs 
@@ -361,15 +361,8 @@ impl MultiValueU128FastFieldWriter { { let field_write = serializer.get_field_writer(self.field, 1); - let mut values = Vec::with_capacity(self.vals.len()); - for vals in self.get_ordered_values(doc_id_map) { - for &val in vals { - values.push(val); - } - } - let col = VecColumn::from(&values[..]); - - serialize_u128(col, field_write)?; + let iter = || self.get_ordered_values(doc_id_map).flatten().cloned(); + serialize_u128(iter, self.vals.len() as u64, field_write)?; } Ok(()) } diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 972e0dde2f..75d3fbebe0 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -364,66 +364,32 @@ impl U128FastFieldWriter { } } - struct RemappedFFWriter<'a> { - doc_id_map: Option<&'a DocIdMapping>, - null_values: &'a RoaringBitmap, - vals: &'a [u128], - idx_to_val_idx: Vec, - val_count: u32, - } - impl<'a> Column for RemappedFFWriter<'a> { - fn get_val(&self, _idx: u64) -> u128 { - // unused by codec - unreachable!() - } - - fn min_value(&self) -> u128 { - // unused by codec - unreachable!() - } - - fn max_value(&self) -> u128 { - // unused by codec - unreachable!() - } + let field_write = serializer.get_field_writer(self.field, 0); - fn num_vals(&self) -> u64 { - self.val_count as u64 - } - fn iter(&self) -> Box + '_> { - if let Some(doc_id_map) = self.doc_id_map { - let iter = doc_id_map.iter_old_doc_ids().map(|idx| { - if self.null_values.contains(idx as u32) { - 0 // TODO properly handle nulls - } else { - self.vals[self.idx_to_val_idx[idx as usize] as usize] - } - }); - Box::new(iter) - } else { - let iter = (0..self.val_count).map(|idx| { - if self.null_values.contains(idx as u32) { - 0 // TODO properly handle nulls - } else { - self.vals[self.idx_to_val_idx[idx as usize] as usize] - } - }); - Box::new(iter) - } - } + if let Some(doc_id_map) = doc_id_map { + let iter = || { + doc_id_map.iter_old_doc_ids().map(|idx| { + if self.null_values.contains(idx as u32) { + 0 // TODO properly handle nulls + } else { + self.vals[idx_to_val_idx[idx as usize] as usize] + } + }) + }; + serialize_u128(iter, self.val_count as u64, field_write)?; + } else { + let iter = || { + (0..self.val_count).map(|idx| { + if self.null_values.contains(idx as u32) { + 0 // TODO properly handle nulls + } else { + self.vals[idx_to_val_idx[idx as usize] as usize] + } + }) + }; + serialize_u128(iter, self.val_count as u64, field_write)?; } - let column = RemappedFFWriter { - doc_id_map, - null_values: &self.null_values, - vals: &self.vals, - idx_to_val_idx, - val_count: self.val_count, - }; - - let field_write = serializer.get_field_writer(self.field, 0); - serialize_u128(column, field_write)?; - Ok(()) } } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 84762091f2..6121aa2aaf 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -355,49 +355,16 @@ impl IndexMerger { .map(|(_, ff_reader)| ff_reader) .collect::>(); - struct RemappedFFReader<'a> { - doc_id_mapping: &'a SegmentDocIdMapping, - fast_field_readers: Vec>, - } - impl<'a> Column for RemappedFFReader<'a> { - fn get_val(&self, _idx: u64) -> u128 { - // unused by codec - unreachable!() - } - - fn min_value(&self) -> u128 { - // unused by codec - unreachable!() - } - - fn max_value(&self) -> u128 { - // unused by codec - unreachable!() - } - - fn num_vals(&self) -> u64 { - self.doc_id_mapping.len() as u64 - } - fn iter<'b>(&'b self) -> Box + 'b> { - Box::new( - self.doc_id_mapping - .iter_old_doc_addrs() - .flat_map(|doc_addr| { - let fast_field_reader = - 
&self.fast_field_readers[doc_addr.segment_ord as usize]; - let mut out = vec![]; - fast_field_reader.get_vals(doc_addr.doc_id, &mut out); - out.into_iter() - }), - ) - } - } - let column = RemappedFFReader { - doc_id_mapping, - fast_field_readers, + let iter = || { + doc_id_mapping.iter_old_doc_addrs().flat_map(|doc_addr| { + let fast_field_reader = &fast_field_readers[doc_addr.segment_ord as usize]; + let mut out = vec![]; + fast_field_reader.get_vals(doc_addr.doc_id, &mut out); + out.into_iter() + }) }; let field_write = fast_field_serializer.get_field_writer(field, 1); - serialize_u128(column, field_write)?; + serialize_u128(iter, doc_id_mapping.len() as u64, field_write)?; Ok(()) } @@ -421,42 +388,14 @@ impl IndexMerger { }) .collect::>(); - struct RemappedFFReader<'a> { - doc_id_mapping: &'a SegmentDocIdMapping, - fast_field_readers: Vec>>, - } - impl<'a> Column for RemappedFFReader<'a> { - fn get_val(&self, _idx: u64) -> u128 { - // unused by codec - unreachable!() - } - - fn min_value(&self) -> u128 { - // unused by codec - unreachable!() - } - - fn max_value(&self) -> u128 { - // unused by codec - unreachable!() - } - - fn num_vals(&self) -> u64 { - self.doc_id_mapping.len() as u64 - } - fn iter<'b>(&'b self) -> Box + 'b> { - Box::new(self.doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| { - let fast_field_reader = &self.fast_field_readers[doc_addr.segment_ord as usize]; - fast_field_reader.get_val(doc_addr.doc_id as u64) - })) - } - } - let column = RemappedFFReader { - doc_id_mapping, - fast_field_readers, - }; let field_write = fast_field_serializer.get_field_writer(field, 0); - serialize_u128(column, field_write)?; + let iter = || { + doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| { + let fast_field_reader = &fast_field_readers[doc_addr.segment_ord as usize]; + fast_field_reader.get_val(doc_addr.doc_id as u64) + }) + }; + fastfield_codecs::serialize_u128(iter, doc_id_mapping.len() as u64, field_write)?; Ok(()) } diff --git a/src/schema/document.rs b/src/schema/document.rs index 3bde526b1a..b39acedc83 100644 --- a/src/schema/document.rs +++ b/src/schema/document.rs @@ -1,6 +1,7 @@ use std::collections::{HashMap, HashSet}; use std::io::{self, Read, Write}; use std::mem; +use std::net::IpAddr; use common::{BinarySerializable, VInt}; @@ -97,6 +98,11 @@ impl Document { self.add_field_value(field, value); } + /// Add a u64 field + pub fn add_ip(&mut self, field: Field, value: IpAddr) { + self.add_field_value(field, value); + } + /// Add a i64 field pub fn add_i64(&mut self, field: Field, value: i64) { self.add_field_value(field, value); From 5a76e6c5d35b5fd0f05549bfd0c89e7f686140d0 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Wed, 28 Sep 2022 16:01:35 +0800 Subject: [PATCH 04/27] fix get_between_vals forwarding fix get_between_vals forwarding in monotonicmapping column by adding an additional conversion function Output->Input --- fastfield_codecs/src/column.rs | 52 +++++++++++++++++++++---------- fastfield_codecs/src/lib.rs | 18 +++++++++-- fastfield_codecs/src/serialize.rs | 16 +++++++--- 3 files changed, 61 insertions(+), 25 deletions(-) diff --git a/fastfield_codecs/src/column.rs b/fastfield_codecs/src/column.rs index 2962472f9a..bcc5f50846 100644 --- a/fastfield_codecs/src/column.rs +++ b/fastfield_codecs/src/column.rs @@ -124,7 +124,8 @@ impl<'a, T: Copy + PartialOrd + Send + Sync> Column for VecColumn<'a, T> { } impl<'a, T: Copy + Ord + Default, V> From<&'a V> for VecColumn<'a, T> -where V: AsRef<[T]> + ?Sized +where + V: AsRef<[T]> + ?Sized, { fn from(values: &'a V) 
-> Self { let values = values.as_ref(); @@ -137,52 +138,57 @@ where V: AsRef<[T]> + ?Sized } } -struct MonotonicMappingColumn { +struct MonotonicMappingColumn { from_column: C, - monotonic_mapping: T, + monotonic_mapping_to_output: T, + monotonic_mapping_to_input: U, _phantom: PhantomData, } /// Creates a view of a column transformed by a monotonic mapping. -pub fn monotonic_map_column( +pub fn monotonic_map_column( from_column: C, - monotonic_mapping: T, + monotonic_mapping_to_output: T, + monotonic_mapping_to_input: U, ) -> impl Column where C: Column, T: Fn(Input) -> Output + Send + Sync, + U: Fn(Output) -> Input + Send + Sync, Input: Send + Sync, Output: Send + Sync, { MonotonicMappingColumn { from_column, - monotonic_mapping, + monotonic_mapping_to_output, + monotonic_mapping_to_input, _phantom: PhantomData, } } -impl Column - for MonotonicMappingColumn +impl Column + for MonotonicMappingColumn where C: Column, T: Fn(Input) -> Output + Send + Sync, + U: Fn(Output) -> Input + Send + Sync, Input: Send + Sync, - Output: Send + Sync, + Output: Send + Sync + Clone, { #[inline] fn get_val(&self, idx: u64) -> Output { let from_val = self.from_column.get_val(idx); - (self.monotonic_mapping)(from_val) + (self.monotonic_mapping_to_output)(from_val) } fn min_value(&self) -> Output { let from_min_value = self.from_column.min_value(); - (self.monotonic_mapping)(from_min_value) + (self.monotonic_mapping_to_output)(from_min_value) } fn max_value(&self) -> Output { let from_max_value = self.from_column.max_value(); - (self.monotonic_mapping)(from_max_value) + (self.monotonic_mapping_to_output)(from_max_value) } fn num_vals(&self) -> u64 { @@ -190,7 +196,18 @@ where } fn iter(&self) -> Box + '_> { - Box::new(self.from_column.iter().map(&self.monotonic_mapping)) + Box::new( + self.from_column + .iter() + .map(&self.monotonic_mapping_to_output), + ) + } + + fn get_between_vals(&self, range: RangeInclusive) -> Vec { + self.from_column.get_between_vals( + (self.monotonic_mapping_to_input)(range.start().clone()) + ..=(self.monotonic_mapping_to_input)(range.end().clone()), + ) } // We voluntarily do not implement get_range as it yields a regression, @@ -200,7 +217,8 @@ where pub struct IterColumn(T); impl From for IterColumn -where T: Iterator + Clone + ExactSizeIterator +where + T: Iterator + Clone + ExactSizeIterator, { fn from(iter: T) -> Self { IterColumn(iter) @@ -242,7 +260,7 @@ mod tests { fn test_monotonic_mapping() { let vals = &[1u64, 3u64][..]; let col = VecColumn::from(vals); - let mapped = monotonic_map_column(col, |el| el + 4); + let mapped = monotonic_map_column(col, |el| el + 4, |el| el); assert_eq!(mapped.min_value(), 5u64); assert_eq!(mapped.max_value(), 7u64); assert_eq!(mapped.num_vals(), 2); @@ -262,7 +280,7 @@ mod tests { fn test_monotonic_mapping_iter() { let vals: Vec = (-1..99).map(i64::to_u64).collect(); let col = VecColumn::from(&vals); - let mapped = monotonic_map_column(col, |el| i64::from_u64(el) * 10i64); + let mapped = monotonic_map_column(col, |el| i64::from_u64(el) * 10i64, i64::to_u64); let val_i64s: Vec = mapped.iter().collect(); for i in 0..100 { assert_eq!(val_i64s[i as usize], mapped.get_val(i)); @@ -273,7 +291,7 @@ mod tests { fn test_monotonic_mapping_get_range() { let vals: Vec = (-1..99).map(i64::to_u64).collect(); let col = VecColumn::from(&vals); - let mapped = monotonic_map_column(col, |el| i64::from_u64(el) * 10i64); + let mapped = monotonic_map_column(col, |el| i64::from_u64(el) * 10i64, i64::to_u64); assert_eq!(mapped.min_value(), -10i64); 
assert_eq!(mapped.max_value(), 980i64); assert_eq!(mapped.num_vals(), 100); diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index 286564a867..e89f8584f5 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -80,7 +80,11 @@ pub fn open_u128( ) -> io::Result>> { let monotonic_mapping = move |val: u128| Item::from_u128(val); let reader = CompactSpaceDecompressor::open(bytes)?; - Ok(Arc::new(monotonic_map_column(reader, monotonic_mapping))) + Ok(Arc::new(monotonic_map_column( + reader, + monotonic_mapping, + Item::to_u128, + ))) } /// Returns the correct codec reader wrapped in the `Arc` for the data. @@ -106,10 +110,18 @@ fn open_specific_codec( let min_value = header.min_value; if let Some(gcd) = header.gcd { let monotonic_mapping = move |val: u64| Item::from_u64(min_value + val * gcd.get()); - Ok(Arc::new(monotonic_map_column(reader, monotonic_mapping))) + Ok(Arc::new(monotonic_map_column( + reader, + monotonic_mapping, + Item::to_u64, + ))) } else { let monotonic_mapping = move |val: u64| Item::from_u64(min_value + val); - Ok(Arc::new(monotonic_map_column(reader, monotonic_mapping))) + Ok(Arc::new(monotonic_map_column( + reader, + monotonic_mapping, + Item::to_u64, + ))) } } diff --git a/fastfield_codecs/src/serialize.rs b/fastfield_codecs/src/serialize.rs index 9f1188f511..60eaec0fa4 100644 --- a/fastfield_codecs/src/serialize.rs +++ b/fastfield_codecs/src/serialize.rs @@ -69,7 +69,11 @@ impl Header { let min_value = self.min_value; let gcd = self.gcd.map(|gcd| gcd.get()).unwrap_or(1); let divider = DividerU64::divide_by(gcd); - monotonic_map_column(from_column, move |val| divider.divide(val - min_value)) + monotonic_map_column( + from_column, + move |val| divider.divide(val - min_value), + |val| val, + ) } pub fn compute_header( @@ -82,7 +86,8 @@ impl Header { let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value)) .filter(|gcd| gcd.get() > 1u64); let divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64)); - let shifted_column = monotonic_map_column(&column, |val| divider.divide(val - min_value)); + let shifted_column = + monotonic_map_column(&column, |val| divider.divide(val - min_value), |val| val); let codec_type = detect_codec(shifted_column, codecs)?; Some(Header { num_vals, @@ -129,12 +134,13 @@ pub fn estimate( typed_column: impl Column, codec_type: FastFieldCodecType, ) -> Option { - let column = monotonic_map_column(typed_column, T::to_u64); + let column = monotonic_map_column(typed_column, T::to_u64, T::from_u64); let min_value = column.min_value(); let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value)) .filter(|gcd| gcd.get() > 1u64); let divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64)); - let normalized_column = monotonic_map_column(&column, |val| divider.divide(val - min_value)); + let normalized_column = + monotonic_map_column(&column, |val| divider.divide(val - min_value), |val| val); match codec_type { FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&normalized_column), FastFieldCodecType::Linear => LinearCodec::estimate(&normalized_column), @@ -159,7 +165,7 @@ pub fn serialize( output: &mut impl io::Write, codecs: &[FastFieldCodecType], ) -> io::Result<()> { - let column = monotonic_map_column(typed_column, T::to_u64); + let column = monotonic_map_column(typed_column, T::to_u64, T::from_u64); let header = Header::compute_header(&column, codecs).ok_or_else(|| { io::Error::new( io::ErrorKind::InvalidInput, From 
309449dba392206c00f0506a8493af8df2b8fb5c Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 30 Sep 2022 10:07:44 +0800 Subject: [PATCH 05/27] rename to IpAddr --- src/fastfield/readers.rs | 2 +- src/fastfield/writer.rs | 2 +- src/indexer/merger.rs | 2 +- src/indexer/segment_writer.rs | 2 +- src/postings/per_field_postings_writer.rs | 2 +- src/postings/postings_writer.rs | 2 +- src/query/query_parser/query_parser.rs | 4 ++-- src/schema/document.rs | 8 +++++--- src/schema/field_entry.rs | 4 ++-- src/schema/field_type.rs | 16 ++++++++-------- 10 files changed, 23 insertions(+), 21 deletions(-) diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs index e4dbbd8588..9214a2a44f 100644 --- a/src/fastfield/readers.rs +++ b/src/fastfield/readers.rs @@ -52,7 +52,7 @@ pub(crate) fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, FieldType::Str(options) if options.is_fast() => { Some((FastType::U64, Cardinality::MultiValues)) } - FieldType::Ip(options) => options + FieldType::IpAddr(options) => options .get_fastfield_cardinality() .map(|cardinality| (FastType::U128, cardinality)), _ => None, diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 75d3fbebe0..b2a0ae9948 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -104,7 +104,7 @@ impl FastFieldsWriter { bytes_value_writers.push(fast_field_writer); } } - FieldType::Ip(opt) => { + FieldType::IpAddr(opt) => { if opt.is_fast() { match opt.get_fastfield_cardinality() { Some(Cardinality::SingleValue) => { diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 6121aa2aaf..ccdb660433 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -295,7 +295,7 @@ impl IndexMerger { self.write_bytes_fast_field(field, fast_field_serializer, doc_id_mapping)?; } } - FieldType::Ip(options) => match options.get_fastfield_cardinality() { + FieldType::IpAddr(options) => match options.get_fastfield_cardinality() { Some(Cardinality::SingleValue) => { self.write_u128_single_fast_field( field, diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 36348518eb..731a7c17f8 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -294,7 +294,7 @@ impl SegmentWriter { ctx, )?; } - FieldType::Ip(_) => { + FieldType::IpAddr(_) => { for value in values { let ip_val = value.as_ip().ok_or_else(make_schema_error)?; term_buffer.set_text(&ip_val.to_string()); diff --git a/src/postings/per_field_postings_writer.rs b/src/postings/per_field_postings_writer.rs index d0ca89b11d..a414870710 100644 --- a/src/postings/per_field_postings_writer.rs +++ b/src/postings/per_field_postings_writer.rs @@ -50,7 +50,7 @@ fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box Box::new(SpecializedPostingsWriter::::default()), FieldType::JsonObject(ref json_object_options) => { if let Some(text_indexing_option) = json_object_options.get_text_indexing_options() { diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index c9eba39d75..552964a2d5 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -89,7 +89,7 @@ pub(crate) fn serialize_postings( | FieldType::Bool(_) => {} FieldType::Bytes(_) => {} FieldType::JsonObject(_) => {} - FieldType::Ip(_) => {} + FieldType::IpAddr(_) => {} } let postings_writer = per_field_postings_writers.get_for_field(field); diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 497bb80e6d..d533e599ca 100644 --- 
a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -400,7 +400,7 @@ impl QueryParser { let bytes = base64::decode(phrase).map_err(QueryParserError::ExpectedBase64)?; Ok(Term::from_field_bytes(field, &bytes)) } - FieldType::Ip(_) => Ok(Term::from_field_text(field, phrase)), + FieldType::IpAddr(_) => Ok(Term::from_field_text(field, phrase)), } } @@ -507,7 +507,7 @@ impl QueryParser { let bytes_term = Term::from_field_bytes(field, &bytes); Ok(vec![LogicalLiteral::Term(bytes_term)]) } - FieldType::Ip(_) => Err(QueryParserError::FieldNotIndexed(field_name.to_string())), + FieldType::IpAddr(_) => Err(QueryParserError::FieldNotIndexed(field_name.to_string())), } } diff --git a/src/schema/document.rs b/src/schema/document.rs index b39acedc83..0311b70621 100644 --- a/src/schema/document.rs +++ b/src/schema/document.rs @@ -76,7 +76,9 @@ impl Document { /// Adding a facet to the document. pub fn add_facet(&mut self, field: Field, path: F) - where Facet: From { + where + Facet: From, + { let facet = Facet::from(path); let value = Value::Facet(facet); self.add_field_value(field, value); @@ -98,8 +100,8 @@ impl Document { self.add_field_value(field, value); } - /// Add a u64 field - pub fn add_ip(&mut self, field: Field, value: IpAddr) { + /// Add a IP address field + pub fn add_ip_addr(&mut self, field: Field, value: IpAddr) { self.add_field_value(field, value); } diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index e3c23687e6..80a8d5c1e4 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -63,7 +63,7 @@ impl FieldEntry { /// Creates a new ip field entry. pub fn new_ip(field_name: String, ip_options: IpOptions) -> FieldEntry { - Self::new(field_name, FieldType::Ip(ip_options)) + Self::new(field_name, FieldType::IpAddr(ip_options)) } /// Creates a field entry for a facet. 
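[Editor's note: the sketch below is illustrative only and is not part of the patch. It shows how the IP address field added in this series is meant to be used from the public API. Method names are given as they read after the later renames in this series (the schema-builder method is still `add_ip_field` at this point and only becomes `add_ip_addr_field` in a later patch).]

    use std::net::{IpAddr, Ipv6Addr};
    use tantivy::schema::{Schema, FAST, STORED};
    use tantivy::Document;

    fn index_an_ip_addr() {
        // Declare an IP address field that is stored and available as a fast field.
        let mut schema_builder = Schema::builder();
        let ip_field = schema_builder.add_ip_addr_field("ip", FAST | STORED);
        let _schema = schema_builder.build();

        // Values are supplied as `std::net::IpAddr`; the fast field keeps them as u128.
        let mut doc = Document::default();
        doc.add_ip_addr(ip_field, IpAddr::V6(Ipv6Addr::LOCALHOST));
    }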
@@ -120,7 +120,7 @@ impl FieldEntry { FieldType::Facet(ref options) => options.is_stored(), FieldType::Bytes(ref options) => options.is_stored(), FieldType::JsonObject(ref options) => options.is_stored(), - FieldType::Ip(ref options) => options.is_stored(), + FieldType::IpAddr(ref options) => options.is_stored(), } } } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 9c51f9a6fa..ebcfa6d2b0 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -161,7 +161,7 @@ pub enum FieldType { /// Json object JsonObject(JsonObjectOptions), /// IpAddr field - Ip(IpOptions), + IpAddr(IpOptions), } impl FieldType { @@ -177,7 +177,7 @@ impl FieldType { FieldType::Facet(_) => Type::Facet, FieldType::Bytes(_) => Type::Bytes, FieldType::JsonObject(_) => Type::Json, - FieldType::Ip(_) => Type::Ip, + FieldType::IpAddr(_) => Type::Ip, } } @@ -193,7 +193,7 @@ impl FieldType { FieldType::Facet(ref _facet_options) => true, FieldType::Bytes(ref bytes_options) => bytes_options.is_indexed(), FieldType::JsonObject(ref json_object_options) => json_object_options.is_indexed(), - FieldType::Ip(_) => false, + FieldType::IpAddr(_) => false, } } @@ -228,7 +228,7 @@ impl FieldType { | FieldType::F64(ref int_options) | FieldType::Bool(ref int_options) => int_options.is_fast(), FieldType::Date(ref date_options) => date_options.is_fast(), - FieldType::Ip(ref options) => options.is_fast(), + FieldType::IpAddr(ref options) => options.is_fast(), FieldType::Facet(_) => true, FieldType::JsonObject(_) => false, } @@ -269,7 +269,7 @@ impl FieldType { FieldType::Facet(_) => false, FieldType::Bytes(ref bytes_options) => bytes_options.fieldnorms(), FieldType::JsonObject(ref _json_object_options) => false, - FieldType::Ip(_) => false, + FieldType::IpAddr(_) => false, } } @@ -314,7 +314,7 @@ impl FieldType { FieldType::JsonObject(ref json_obj_options) => json_obj_options .get_text_indexing_options() .map(TextFieldIndexing::index_option), - FieldType::Ip(_) => None, + FieldType::IpAddr(_) => None, } } @@ -354,7 +354,7 @@ impl FieldType { expected: "a json object", json: JsonValue::String(field_text), }), - FieldType::Ip(_) => { + FieldType::IpAddr(_) => { Ok(Value::Ip(IpAddr::from_str(&field_text).map_err(|err| { ValueParsingError::ParseError { error: err.to_string(), @@ -409,7 +409,7 @@ impl FieldType { expected: "a json object", json: JsonValue::Number(field_val_num), }), - FieldType::Ip(_) => Err(ValueParsingError::TypeError { + FieldType::IpAddr(_) => Err(ValueParsingError::TypeError { expected: "a string with an ip addr", json: JsonValue::Number(field_val_num), }), From 087beaf328c1ad0e709da5bc838c1294b4045f9c Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 30 Sep 2022 10:15:35 +0800 Subject: [PATCH 06/27] remove null handling --- src/fastfield/writer.rs | 39 +++++---------------------------------- 1 file changed, 5 insertions(+), 34 deletions(-) diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index b2a0ae9948..147f2f0b6b 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -296,8 +296,6 @@ pub struct U128FastFieldWriter { field: Field, vals: Vec, val_count: u32, - - null_values: RoaringBitmap, } impl U128FastFieldWriter { @@ -307,7 +305,6 @@ impl U128FastFieldWriter { field, vals: vec![], val_count: 0, - null_values: RoaringBitmap::new(), } } @@ -338,7 +335,7 @@ impl U128FastFieldWriter { self.add_val(value); } None => { - self.null_values.insert(self.val_count as u32); + self.add_val(0); // TODO fix null handling } }; self.val_count += 1; @@ -350,43 +347,17 
@@ impl U128FastFieldWriter { serializer: &mut CompositeFastFieldSerializer, doc_id_map: Option<&DocIdMapping>, ) -> io::Result<()> { - // To get the actual value, we could materialize the vec with u128 including nulls, but - // that could cost a lot of memory. Instead we just compute the index for of - // the values - let mut idx_to_val_idx = vec![]; - idx_to_val_idx.resize(self.val_count as usize, 0); - - let mut val_idx = 0; - for idx in 0..self.val_count { - if !self.null_values.contains(idx as u32) { - idx_to_val_idx[idx as usize] = val_idx as u32; - val_idx += 1; - } - } - let field_write = serializer.get_field_writer(self.field, 0); if let Some(doc_id_map) = doc_id_map { let iter = || { - doc_id_map.iter_old_doc_ids().map(|idx| { - if self.null_values.contains(idx as u32) { - 0 // TODO properly handle nulls - } else { - self.vals[idx_to_val_idx[idx as usize] as usize] - } - }) + doc_id_map + .iter_old_doc_ids() + .map(|idx| self.vals[idx as usize]) }; serialize_u128(iter, self.val_count as u64, field_write)?; } else { - let iter = || { - (0..self.val_count).map(|idx| { - if self.null_values.contains(idx as u32) { - 0 // TODO properly handle nulls - } else { - self.vals[idx_to_val_idx[idx as usize] as usize] - } - }) - }; + let iter = || (0..self.val_count).map(|idx| self.vals[idx as usize]); serialize_u128(iter, self.val_count as u64, field_write)?; } From eeb1f1909375f2309194be990a074e07db65deb7 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 30 Sep 2022 10:49:28 +0800 Subject: [PATCH 07/27] rename to iter_gen --- fastfield_codecs/src/column.rs | 6 ++---- src/fastfield/multivalued/writer.rs | 4 ++-- src/fastfield/writer.rs | 8 ++++---- src/indexer/merger.rs | 32 ++++++++++++++--------------- src/schema/document.rs | 4 +--- 5 files changed, 25 insertions(+), 29 deletions(-) diff --git a/fastfield_codecs/src/column.rs b/fastfield_codecs/src/column.rs index bcc5f50846..a6bb06f7d2 100644 --- a/fastfield_codecs/src/column.rs +++ b/fastfield_codecs/src/column.rs @@ -124,8 +124,7 @@ impl<'a, T: Copy + PartialOrd + Send + Sync> Column for VecColumn<'a, T> { } impl<'a, T: Copy + Ord + Default, V> From<&'a V> for VecColumn<'a, T> -where - V: AsRef<[T]> + ?Sized, +where V: AsRef<[T]> + ?Sized { fn from(values: &'a V) -> Self { let values = values.as_ref(); @@ -217,8 +216,7 @@ where pub struct IterColumn(T); impl From for IterColumn -where - T: Iterator + Clone + ExactSizeIterator, +where T: Iterator + Clone + ExactSizeIterator { fn from(iter: T) -> Self { IterColumn(iter) diff --git a/src/fastfield/multivalued/writer.rs b/src/fastfield/multivalued/writer.rs index f694ca8df7..59d958c6f9 100644 --- a/src/fastfield/multivalued/writer.rs +++ b/src/fastfield/multivalued/writer.rs @@ -361,8 +361,8 @@ impl MultiValueU128FastFieldWriter { { let field_write = serializer.get_field_writer(self.field, 1); - let iter = || self.get_ordered_values(doc_id_map).flatten().cloned(); - serialize_u128(iter, self.vals.len() as u64, field_write)?; + let iter_gen = || self.get_ordered_values(doc_id_map).flatten().cloned(); + serialize_u128(iter_gen, self.vals.len() as u64, field_write)?; } Ok(()) } diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 147f2f0b6b..831811f944 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -350,15 +350,15 @@ impl U128FastFieldWriter { let field_write = serializer.get_field_writer(self.field, 0); if let Some(doc_id_map) = doc_id_map { - let iter = || { + let iter_gen = || { doc_id_map .iter_old_doc_ids() .map(|idx| self.vals[idx as usize]) }; 
- serialize_u128(iter, self.val_count as u64, field_write)?; + serialize_u128(iter_gen, self.val_count as u64, field_write)?; } else { - let iter = || (0..self.val_count).map(|idx| self.vals[idx as usize]); - serialize_u128(iter, self.val_count as u64, field_write)?; + let iter_gen = || (0..self.val_count).map(|idx| self.vals[idx as usize]); + serialize_u128(iter_gen, self.val_count as u64, field_write)?; } Ok(()) diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index ccdb660433..c16bfc22d7 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -330,18 +330,18 @@ impl IndexMerger { fast_field_serializer: &mut CompositeFastFieldSerializer, doc_id_mapping: &SegmentDocIdMapping, ) -> crate::Result<()> { - let segment_and_ff_readers = self - .readers - .iter() - .map(|segment_reader| { - let ff_reader: MultiValuedU128FastFieldReader = - segment_reader.fast_fields().u128s(field).expect( - "Failed to find index for multivalued field. This is a bug in tantivy, \ - please report.", - ); - (segment_reader, ff_reader) - }) - .collect::>(); + let segment_and_ff_readers: Vec<(&SegmentReader, MultiValuedU128FastFieldReader)> = + self.readers + .iter() + .map(|segment_reader| { + let ff_reader: MultiValuedU128FastFieldReader = + segment_reader.fast_fields().u128s(field).expect( + "Failed to find index for multivalued field. This is a bug in \ + tantivy, please report.", + ); + (segment_reader, ff_reader) + }) + .collect::>(); Self::write_1_n_fast_field_idx_generic( field, @@ -355,7 +355,7 @@ impl IndexMerger { .map(|(_, ff_reader)| ff_reader) .collect::>(); - let iter = || { + let iter_gen = || { doc_id_mapping.iter_old_doc_addrs().flat_map(|doc_addr| { let fast_field_reader = &fast_field_readers[doc_addr.segment_ord as usize]; let mut out = vec![]; @@ -364,7 +364,7 @@ impl IndexMerger { }) }; let field_write = fast_field_serializer.get_field_writer(field, 1); - serialize_u128(iter, doc_id_mapping.len() as u64, field_write)?; + serialize_u128(iter_gen, doc_id_mapping.len() as u64, field_write)?; Ok(()) } @@ -389,13 +389,13 @@ impl IndexMerger { .collect::>(); let field_write = fast_field_serializer.get_field_writer(field, 0); - let iter = || { + let iter_gen = || { doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| { let fast_field_reader = &fast_field_readers[doc_addr.segment_ord as usize]; fast_field_reader.get_val(doc_addr.doc_id as u64) }) }; - fastfield_codecs::serialize_u128(iter, doc_id_mapping.len() as u64, field_write)?; + fastfield_codecs::serialize_u128(iter_gen, doc_id_mapping.len() as u64, field_write)?; Ok(()) } diff --git a/src/schema/document.rs b/src/schema/document.rs index 0311b70621..4940c8778e 100644 --- a/src/schema/document.rs +++ b/src/schema/document.rs @@ -76,9 +76,7 @@ impl Document { /// Adding a facet to the document. 
pub fn add_facet(&mut self, field: Field, path: F) - where - Facet: From, - { + where Facet: From { let facet = Facet::from(path); let value = Value::Facet(facet); self.add_field_value(field, value); From f5039f18466c212f5286bb747e500ad7ad9609fc Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 30 Sep 2022 10:50:29 +0800 Subject: [PATCH 08/27] remove roaring --- Cargo.toml | 1 - src/fastfield/writer.rs | 1 - 2 files changed, 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 1bbe0220bb..330d963625 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -60,7 +60,6 @@ measure_time = "0.8.2" ciborium = { version = "0.2", optional = true} async-trait = "0.1.53" arc-swap = "1.5.0" -roaring = "0.10.1" [target.'cfg(windows)'.dependencies] winapi = "0.3.9" diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 831811f944..8dbfb954ad 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -6,7 +6,6 @@ use fastfield_codecs::{ serialize_u128, Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64, }; use fnv::FnvHashMap; -use roaring::RoaringBitmap; use tantivy_bitpacker::BlockedBitpacker; use super::multivalued::{MultiValueU128FastFieldWriter, MultiValuedFastFieldWriter}; From 787a37bacf41497e57fc73356b4ea331bb416554 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 30 Sep 2022 11:01:12 +0800 Subject: [PATCH 09/27] expect instead of unwrap --- src/fastfield/multivalued/writer.rs | 4 +++- src/fastfield/writer.rs | 5 ++++- src/schema/field_type.rs | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/fastfield/multivalued/writer.rs b/src/fastfield/multivalued/writer.rs index 59d958c6f9..66bb501db1 100644 --- a/src/fastfield/multivalued/writer.rs +++ b/src/fastfield/multivalued/writer.rs @@ -317,7 +317,9 @@ impl MultiValueU128FastFieldWriter { for field_value in doc.field_values() { if field_value.field == self.field { let value = field_value.value(); - let ip_addr = value.as_ip().unwrap(); + let ip_addr = value + .as_ip() + .expect(&format!("expected and ip, but got {:?}", value)); let value = ip_addr.to_u128(); self.add_val(value); } diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 8dbfb954ad..68d3d9f400 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -329,7 +329,10 @@ impl U128FastFieldWriter { pub fn add_document(&mut self, doc: &Document) { match doc.get_first(self.field) { Some(v) => { - let ip_addr = v.as_ip().unwrap(); + let ip_addr = v + .as_ip() + .expect(&format!("expected and ip, but got {:?}", v)); + let value = ip_addr.to_u128(); self.add_val(value); } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index ebcfa6d2b0..19bc09f297 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -5,8 +5,8 @@ use serde::{Deserialize, Serialize}; use serde_json::Value as JsonValue; use thiserror::Error; -use super::Cardinality; use super::ip_options::IpOptions; +use super::Cardinality; use crate::schema::bytes_options::BytesOptions; use crate::schema::facet_options::FacetOptions; use crate::schema::{ From 67f453b534d588d76ff45834536daaf3a24a429b Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 30 Sep 2022 11:50:51 +0800 Subject: [PATCH 10/27] rename to iter_gen --- fastfield_codecs/benches/bench.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fastfield_codecs/benches/bench.rs b/fastfield_codecs/benches/bench.rs index d56cbc51e1..5546d2af70 100644 --- a/fastfield_codecs/benches/bench.rs +++ b/fastfield_codecs/benches/bench.rs @@ -100,8 
+100,8 @@ mod tests { fn get_u128_column_from_data(data: &[u128]) -> Arc> { let mut out = vec![]; - let iter = || data.iter().cloned(); - serialize_u128(iter, data.len() as u64, &mut out).unwrap(); + let iter_gen = || data.iter().cloned(); + serialize_u128(iter_gen, data.len() as u64, &mut out).unwrap(); let out = OwnedBytes::new(out); open_u128::(out).unwrap() } From cdc8e3a8bebe4e7cb7f7a73a86b6c3d984386e2a Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 4 Oct 2022 16:53:03 +0800 Subject: [PATCH 11/27] group montonic mapping and inverse fix mapping inverse remove ip indexing add get_between_vals test --- fastfield_codecs/src/column.rs | 47 ++++++++++++--------- fastfield_codecs/src/lib.rs | 30 ++++++++++---- fastfield_codecs/src/monotonic_mapping.rs | 50 +++++++++++++++++++++++ fastfield_codecs/src/serialize.rs | 44 ++++++++++++-------- src/fastfield/multivalued/writer.rs | 2 +- src/fastfield/writer.rs | 2 +- src/indexer/segment_writer.rs | 8 +--- 7 files changed, 130 insertions(+), 53 deletions(-) diff --git a/fastfield_codecs/src/column.rs b/fastfield_codecs/src/column.rs index a6bb06f7d2..02975a56c2 100644 --- a/fastfield_codecs/src/column.rs +++ b/fastfield_codecs/src/column.rs @@ -139,16 +139,27 @@ where V: AsRef<[T]> + ?Sized struct MonotonicMappingColumn { from_column: C, - monotonic_mapping_to_output: T, - monotonic_mapping_to_input: U, + monotonic_mapping: T, + monotonic_mapping_inv: U, _phantom: PhantomData, } /// Creates a view of a column transformed by a monotonic mapping. +/// E.g. apply a gcd monotonic_mapping([100, 200, 300]) == [1, 2, 3] +/// The provided mappings need to be the inverse of each other. +/// +/// The inverse of the mapping is required for: +/// `fn get_between_vals(&self, range: RangeInclusive) -> Vec ` +/// The user provides the original value range and we need to monotonic map them in the same way the +/// serialization does before calling the underlying column. +/// +/// Note that when opening a codec, the monotonic_mapping should be the inverse of the mapping +/// during serialization. And therefore the monotonic_mapping_inv when opening is the same as +/// monotonic_mapping during serialization. 
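[Editor's note: the sketch below is illustrative only and is not part of the patch. It demonstrates the invariant stated in the doc comment above: the second closure passed to `monotonic_map_column` must be the inverse of the first, so that a caller-supplied value range can be translated back into the wrapped column's domain before `get_between_vals` is forwarded. It assumes the default, scanning implementation of `get_between_vals` on `Column`.]

    use fastfield_codecs::{monotonic_map_column, Column, VecColumn};

    fn inverse_mapping_sketch() {
        let vals = &[100u64, 200u64, 300u64][..];
        let col = VecColumn::from(vals);
        // The forward mapping strips a gcd of 100; the inverse multiplies it back in.
        let mapped = monotonic_map_column(col, |val| val / 100, |val| val * 100);
        assert_eq!(mapped.get_val(1), 2);
        // The query range 1..=2 is mapped back to 100..=200 before the wrapped
        // column is scanned, so the positions of 100 and 200 are returned.
        assert_eq!(mapped.get_between_vals(1..=2), vec![0u64, 1]);
    }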
pub fn monotonic_map_column( from_column: C, - monotonic_mapping_to_output: T, - monotonic_mapping_to_input: U, + monotonic_mapping: T, + monotonic_mapping_inv: U, ) -> impl Column where C: Column, @@ -159,8 +170,8 @@ where { MonotonicMappingColumn { from_column, - monotonic_mapping_to_output, - monotonic_mapping_to_input, + monotonic_mapping, + monotonic_mapping_inv, _phantom: PhantomData, } } @@ -177,17 +188,17 @@ where #[inline] fn get_val(&self, idx: u64) -> Output { let from_val = self.from_column.get_val(idx); - (self.monotonic_mapping_to_output)(from_val) + (self.monotonic_mapping)(from_val) } fn min_value(&self) -> Output { let from_min_value = self.from_column.min_value(); - (self.monotonic_mapping_to_output)(from_min_value) + (self.monotonic_mapping)(from_min_value) } fn max_value(&self) -> Output { let from_max_value = self.from_column.max_value(); - (self.monotonic_mapping_to_output)(from_max_value) + (self.monotonic_mapping)(from_max_value) } fn num_vals(&self) -> u64 { @@ -195,17 +206,13 @@ where } fn iter(&self) -> Box + '_> { - Box::new( - self.from_column - .iter() - .map(&self.monotonic_mapping_to_output), - ) + Box::new(self.from_column.iter().map(&self.monotonic_mapping)) } fn get_between_vals(&self, range: RangeInclusive) -> Vec { self.from_column.get_between_vals( - (self.monotonic_mapping_to_input)(range.start().clone()) - ..=(self.monotonic_mapping_to_input)(range.end().clone()), + (self.monotonic_mapping_inv)(range.start().clone()) + ..=(self.monotonic_mapping_inv)(range.end().clone()), ) } @@ -258,7 +265,7 @@ mod tests { fn test_monotonic_mapping() { let vals = &[1u64, 3u64][..]; let col = VecColumn::from(vals); - let mapped = monotonic_map_column(col, |el| el + 4, |el| el); + let mapped = monotonic_map_column(col, |el| el + 4, |_el| unimplemented!()); assert_eq!(mapped.min_value(), 5u64); assert_eq!(mapped.max_value(), 7u64); assert_eq!(mapped.num_vals(), 2); @@ -278,7 +285,8 @@ mod tests { fn test_monotonic_mapping_iter() { let vals: Vec = (-1..99).map(i64::to_u64).collect(); let col = VecColumn::from(&vals); - let mapped = monotonic_map_column(col, |el| i64::from_u64(el) * 10i64, i64::to_u64); + let mapped = + monotonic_map_column(col, |el| i64::from_u64(el) * 10i64, |_| unimplemented!()); let val_i64s: Vec = mapped.iter().collect(); for i in 0..100 { assert_eq!(val_i64s[i as usize], mapped.get_val(i)); @@ -289,7 +297,8 @@ mod tests { fn test_monotonic_mapping_get_range() { let vals: Vec = (-1..99).map(i64::to_u64).collect(); let col = VecColumn::from(&vals); - let mapped = monotonic_map_column(col, |el| i64::from_u64(el) * 10i64, i64::to_u64); + let mapped = + monotonic_map_column(col, |el| i64::from_u64(el) * 10i64, |_| unimplemented!()); assert_eq!(mapped.min_value(), -10i64); assert_eq!(mapped.max_value(), 980i64); assert_eq!(mapped.num_vals(), 100); diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index e89f8584f5..c537d2faa2 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -13,6 +13,9 @@ use std::sync::Arc; use common::BinarySerializable; use compact_space::CompactSpaceDecompressor; +use fastdivide::DividerU64; +use monotonic_mapping::gcd_min_val_mapping_pairs::{from_gcd_normalized_u64, normalize_with_gcd}; +use monotonic_mapping::min_val_mapping_pairs::{from_normalized_u64, normalize}; use ownedbytes::OwnedBytes; use serialize::Header; @@ -78,11 +81,10 @@ impl FastFieldCodecType { pub fn open_u128( bytes: OwnedBytes, ) -> io::Result>> { - let monotonic_mapping = move |val: u128| Item::from_u128(val); let 
reader = CompactSpaceDecompressor::open(bytes)?; Ok(Arc::new(monotonic_map_column( reader, - monotonic_mapping, + Item::from_u128, Item::to_u128, ))) } @@ -109,18 +111,17 @@ fn open_specific_codec( let reader = C::open_from_bytes(bytes, normalized_header)?; let min_value = header.min_value; if let Some(gcd) = header.gcd { - let monotonic_mapping = move |val: u64| Item::from_u64(min_value + val * gcd.get()); + let divider = DividerU64::divide_by(gcd.get()); Ok(Arc::new(monotonic_map_column( reader, - monotonic_mapping, - Item::to_u64, + move |val: u64| from_gcd_normalized_u64(val, min_value, gcd.get()), + move |val| normalize_with_gcd(val, min_value, ÷r), ))) } else { - let monotonic_mapping = move |val: u64| Item::from_u64(min_value + val); Ok(Arc::new(monotonic_map_column( reader, - monotonic_mapping, - Item::to_u64, + move |val: u64| from_normalized_u64(val, min_value), + move |val| normalize(val, min_value), ))) } } @@ -161,6 +162,7 @@ pub const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [ #[cfg(test)] mod tests { + use proptest::prelude::*; use proptest::strategy::Strategy; use proptest::{prop_oneof, proptest}; @@ -195,6 +197,18 @@ mod tests { `{data:?}`", ); } + + if !data.is_empty() { + let test_rand_idx = rand::thread_rng().gen_range(0..=data.len() - 1); + let expected_positions: Vec = data + .iter() + .enumerate() + .filter(|(_, el)| **el == data[test_rand_idx]) + .map(|(pos, _)| pos as u64) + .collect(); + let positions = reader.get_between_vals(data[test_rand_idx]..=data[test_rand_idx]); + assert_eq!(expected_positions, positions); + } Some((estimation, actual_compression)) } diff --git a/fastfield_codecs/src/monotonic_mapping.rs b/fastfield_codecs/src/monotonic_mapping.rs index d4e673040f..0a8310a8e5 100644 --- a/fastfield_codecs/src/monotonic_mapping.rs +++ b/fastfield_codecs/src/monotonic_mapping.rs @@ -1,3 +1,5 @@ +use fastdivide::DividerU64; + pub trait MonotonicallyMappableToU64: 'static + PartialOrd + Copy + Send + Sync { /// Converts a value to u64. 
/// @@ -11,6 +13,54 @@ pub trait MonotonicallyMappableToU64: 'static + PartialOrd + Copy + Send + Sync fn from_u64(val: u64) -> Self; } +// Mapping pairs for the case we subtract the min_value and apply a gcd (greatest common divisor) +pub mod gcd_min_val_mapping_pairs { + + use super::*; + pub fn from_gcd_normalized_u64( + val: u64, + min_value: u64, + gcd: u64, + ) -> Item { + Item::from_u64(min_value + val * gcd) + } + + pub fn normalize_with_gcd( + val: Item, + min_value: u64, + gcd_divider: &DividerU64, + ) -> u64 { + gcd_divider.divide(Item::to_u64(val) - min_value) + } + + #[test] + fn monotonic_mapping_roundtrip_test() { + let gcd = std::num::NonZeroU64::new(10).unwrap(); + let divider = DividerU64::divide_by(gcd.get()); + + let orig_value: u64 = 500; + let normalized_val: u64 = normalize_with_gcd(orig_value, 100, ÷r); + assert_eq!(normalized_val, 40); + assert_eq!( + from_gcd_normalized_u64::(normalized_val, 100, gcd.get()), + 500 + ); + } +} + +// Mapping pairs for the case we subtract the min_value +pub mod min_val_mapping_pairs { + use super::*; + + pub fn from_normalized_u64(val: u64, min_value: u64) -> Item { + Item::from_u64(min_value + val) + } + + pub fn normalize(val: Item, min_value: u64) -> u64 { + Item::to_u64(val) - min_value + } +} + impl MonotonicallyMappableToU64 for u64 { fn to_u64(self) -> u64 { self diff --git a/fastfield_codecs/src/serialize.rs b/fastfield_codecs/src/serialize.rs index 60eaec0fa4..9d653cab1a 100644 --- a/fastfield_codecs/src/serialize.rs +++ b/fastfield_codecs/src/serialize.rs @@ -30,6 +30,7 @@ use crate::bitpacked::BitpackedCodec; use crate::blockwise_linear::BlockwiseLinearCodec; use crate::compact_space::CompactSpaceCompressor; use crate::linear::LinearCodec; +use crate::monotonic_mapping::gcd_min_val_mapping_pairs::normalize_with_gcd; use crate::{ monotonic_map_column, Column, FastFieldCodec, FastFieldCodecType, MonotonicallyMappableToU64, VecColumn, ALL_CODEC_TYPES, @@ -57,8 +58,9 @@ pub(crate) struct Header { impl Header { pub fn normalized(self) -> NormalizedHeader { - let max_value = - (self.max_value - self.min_value) / self.gcd.map(|gcd| gcd.get()).unwrap_or(1); + let gcd = self.gcd.map(|gcd| gcd.get()).unwrap_or(1); + let gcd_divider = DividerU64::divide_by(gcd); + let max_value = normalize_with_gcd(self.max_value, self.min_value, &gcd_divider); NormalizedHeader { num_vals: self.num_vals, max_value, @@ -66,14 +68,7 @@ impl Header { } pub fn normalize_column(&self, from_column: C) -> impl Column { - let min_value = self.min_value; - let gcd = self.gcd.map(|gcd| gcd.get()).unwrap_or(1); - let divider = DividerU64::divide_by(gcd); - monotonic_map_column( - from_column, - move |val| divider.divide(val - min_value), - |val| val, - ) + normalize_column(from_column, self.min_value, self.gcd) } pub fn compute_header( @@ -85,10 +80,8 @@ impl Header { let max_value = column.max_value(); let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value)) .filter(|gcd| gcd.get() > 1u64); - let divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64)); - let shifted_column = - monotonic_map_column(&column, |val| divider.divide(val - min_value), |val| val); - let codec_type = detect_codec(shifted_column, codecs)?; + let normalized_column = normalize_column(column, min_value, gcd); + let codec_type = detect_codec(normalized_column, codecs)?; Some(Header { num_vals, min_value, @@ -99,6 +92,20 @@ impl Header { } } +pub fn normalize_column( + from_column: C, + min_value: u64, + gcd: Option, +) -> impl Column { + let gcd = 
gcd.map(|gcd| gcd.get()).unwrap_or(1); + let gcd_divider = DividerU64::divide_by(gcd); + monotonic_map_column( + from_column, + move |val| normalize_with_gcd(val, min_value, &gcd_divider), + move |_val| unimplemented!(), // This code is only used in serialization + ) +} + impl BinarySerializable for Header { fn serialize(&self, writer: &mut W) -> io::Result<()> { VInt(self.num_vals).serialize(writer)?; @@ -138,9 +145,12 @@ pub fn estimate( let min_value = column.min_value(); let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value)) .filter(|gcd| gcd.get() > 1u64); - let divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64)); - let normalized_column = - monotonic_map_column(&column, |val| divider.divide(val - min_value), |val| val); + let gcd_divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64)); + let normalized_column = monotonic_map_column( + &column, + |val| normalize_with_gcd(val, min_value, &gcd_divider), + |_val| unimplemented!(), + ); match codec_type { FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&normalized_column), FastFieldCodecType::Linear => LinearCodec::estimate(&normalized_column), diff --git a/src/fastfield/multivalued/writer.rs b/src/fastfield/multivalued/writer.rs index 66bb501db1..1f7b6bb32b 100644 --- a/src/fastfield/multivalued/writer.rs +++ b/src/fastfield/multivalued/writer.rs @@ -319,7 +319,7 @@ impl MultiValueU128FastFieldWriter { let value = field_value.value(); let ip_addr = value .as_ip() - .expect(&format!("expected and ip, but got {:?}", value)); + .unwrap_or_else(|| panic!("expected and ip, but got {:?}", value)); let value = ip_addr.to_u128(); self.add_val(value); } diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 68d3d9f400..cc66ae544a 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -331,7 +331,7 @@ impl U128FastFieldWriter { Some(v) => { let ip_addr = v .as_ip() - .expect(&format!("expected and ip, but got {:?}", v)); + .unwrap_or_else(|| panic!("expected and ip, but got {:?}", v)); let value = ip_addr.to_u128(); self.add_val(value); diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 731a7c17f8..3e33933923 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -294,13 +294,7 @@ impl SegmentWriter { ctx, )?; } - FieldType::IpAddr(_) => { - for value in values { - let ip_val = value.as_ip().ok_or_else(make_schema_error)?; - term_buffer.set_text(&ip_val.to_string()); - postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx); - } - } + FieldType::IpAddr(_) => {} } } Ok(()) From 4d29ff4d01c233ba58ca0f4a3985d41d5189aa11 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 6 Oct 2022 11:22:38 +0800 Subject: [PATCH 12/27] finalize ip addr rename --- src/fastfield/writer.rs | 4 ++-- src/indexer/index_writer.rs | 8 ++++---- src/schema/field_entry.rs | 4 ++-- src/schema/field_type.rs | 14 +++++++------- src/schema/ip_options.rs | 32 ++++++++++++++++---------------- src/schema/mod.rs | 2 +- src/schema/schema.rs | 6 +++--- src/schema/term.rs | 2 +- 8 files changed, 36 insertions(+), 36 deletions(-) diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index cc66ae544a..f6a3162c45 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -283,7 +283,7 @@ impl FastFieldsWriter { /// The fast field writer just keeps the values in memory. 
/// /// Only when the segment writer can be closed and -/// persisted on disc, the fast field writer is +/// persisted on disk, the fast field writer is /// sent to a `FastFieldSerializer` via the `.serialize(...)` /// method. /// @@ -371,7 +371,7 @@ impl U128FastFieldWriter { /// The fast field writer just keeps the values in memory. /// /// Only when the segment writer can be closed and -/// persisted on disc, the fast field writer is +/// persisted on disk, the fast field writer is /// sent to a `FastFieldSerializer` via the `.serialize(...)` /// method. /// diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 9b3b6bfc9e..619fa4a7b8 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -817,7 +817,7 @@ mod tests { use crate::indexer::NoMergePolicy; use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery}; use crate::schema::{ - self, Cardinality, Facet, FacetOptions, IndexRecordOption, IpOptions, NumericOptions, + self, Cardinality, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions, TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT, }; use crate::store::DOCSTORE_CACHE_CAPACITY; @@ -1595,10 +1595,10 @@ mod tests { force_end_merge: bool, ) -> crate::Result<()> { let mut schema_builder = schema::Schema::builder(); - let ip_field = schema_builder.add_ip_field("ip", FAST | INDEXED | STORED); - let ips_field = schema_builder.add_ip_field( + let ip_field = schema_builder.add_ip_addr_field("ip", FAST | INDEXED | STORED); + let ips_field = schema_builder.add_ip_addr_field( "ips", - IpOptions::default().set_fast(Cardinality::MultiValues), + IpAddrOptions::default().set_fast(Cardinality::MultiValues), ); let id_field = schema_builder.add_u64_field("id", FAST | INDEXED | STORED); let bytes_field = schema_builder.add_bytes_field("bytes", FAST | INDEXED | STORED); diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index 80a8d5c1e4..db25300403 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -1,6 +1,6 @@ use serde::{Deserialize, Serialize}; -use super::ip_options::IpOptions; +use super::ip_options::IpAddrOptions; use crate::schema::bytes_options::BytesOptions; use crate::schema::{ is_valid_field_name, DateOptions, FacetOptions, FieldType, JsonObjectOptions, NumericOptions, @@ -62,7 +62,7 @@ impl FieldEntry { } /// Creates a new ip field entry. - pub fn new_ip(field_name: String, ip_options: IpOptions) -> FieldEntry { + pub fn new_ip_addr(field_name: String, ip_options: IpAddrOptions) -> FieldEntry { Self::new(field_name, FieldType::IpAddr(ip_options)) } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 19bc09f297..4ae6071e90 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -5,7 +5,7 @@ use serde::{Deserialize, Serialize}; use serde_json::Value as JsonValue; use thiserror::Error; -use super::ip_options::IpOptions; +use super::ip_options::IpAddrOptions; use super::Cardinality; use crate::schema::bytes_options::BytesOptions; use crate::schema::facet_options::FacetOptions; @@ -67,7 +67,7 @@ pub enum Type { /// Leaf in a Json object. 
Json = b'j', /// IpAddr - Ip = b'p', + IpAddr = b'p', /// IpAddr U128 = b'1', } @@ -82,7 +82,7 @@ const ALL_TYPES: [Type; 11] = [ Type::Facet, Type::Bytes, Type::Json, - Type::Ip, + Type::IpAddr, Type::U128, ]; @@ -110,7 +110,7 @@ impl Type { Type::Facet => "Facet", Type::Bytes => "Bytes", Type::Json => "Json", - Type::Ip => "Ip", + Type::IpAddr => "IpAddr", Type::U128 => "U128", } } @@ -128,7 +128,7 @@ impl Type { b'h' => Some(Type::Facet), b'b' => Some(Type::Bytes), b'j' => Some(Type::Json), - b'p' => Some(Type::Ip), + b'p' => Some(Type::IpAddr), b'1' => Some(Type::U128), _ => None, } @@ -161,7 +161,7 @@ pub enum FieldType { /// Json object JsonObject(JsonObjectOptions), /// IpAddr field - IpAddr(IpOptions), + IpAddr(IpAddrOptions), } impl FieldType { @@ -177,7 +177,7 @@ impl FieldType { FieldType::Facet(_) => Type::Facet, FieldType::Bytes(_) => Type::Bytes, FieldType::JsonObject(_) => Type::Json, - FieldType::IpAddr(_) => Type::Ip, + FieldType::IpAddr(_) => Type::IpAddr, } } diff --git a/src/schema/ip_options.rs b/src/schema/ip_options.rs index 195d469167..ce998f43fe 100644 --- a/src/schema/ip_options.rs +++ b/src/schema/ip_options.rs @@ -7,13 +7,13 @@ use super::Cardinality; /// Define how an ip field should be handled by tantivy. #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] -pub struct IpOptions { +pub struct IpAddrOptions { #[serde(skip_serializing_if = "Option::is_none")] fast: Option, stored: bool, } -impl IpOptions { +impl IpAddrOptions { /// Returns true iff the value is a fast field. pub fn is_fast(&self) -> bool { self.fast.is_some() @@ -52,52 +52,52 @@ impl IpOptions { } } -impl From<()> for IpOptions { - fn from(_: ()) -> IpOptions { - IpOptions::default() +impl From<()> for IpAddrOptions { + fn from(_: ()) -> IpAddrOptions { + IpAddrOptions::default() } } -impl From for IpOptions { +impl From for IpAddrOptions { fn from(_: FastFlag) -> Self { - IpOptions { + IpAddrOptions { stored: false, fast: Some(Cardinality::SingleValue), } } } -impl From for IpOptions { +impl From for IpAddrOptions { fn from(_: StoredFlag) -> Self { - IpOptions { + IpAddrOptions { stored: true, fast: None, } } } -impl From for IpOptions { +impl From for IpAddrOptions { fn from(_: IndexedFlag) -> Self { - IpOptions { + IpAddrOptions { stored: false, fast: None, } } } -impl> BitOr for IpOptions { - type Output = IpOptions; +impl> BitOr for IpAddrOptions { + type Output = IpAddrOptions; - fn bitor(self, other: T) -> IpOptions { + fn bitor(self, other: T) -> IpAddrOptions { let other = other.into(); - IpOptions { + IpAddrOptions { stored: self.stored | other.stored, fast: self.fast.or(other.fast), } } } -impl From> for IpOptions +impl From> for IpAddrOptions where Head: Clone, Tail: Clone, diff --git a/src/schema/mod.rs b/src/schema/mod.rs index 4d966a8b9b..c64eef788a 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -138,7 +138,7 @@ pub use self::field_type::{FieldType, Type}; pub use self::field_value::FieldValue; pub use self::flags::{FAST, INDEXED, STORED}; pub use self::index_record_option::IndexRecordOption; -pub use self::ip_options::IpOptions; +pub use self::ip_options::IpAddrOptions; pub use self::json_object_options::JsonObjectOptions; pub use self::named_field_document::NamedFieldDocument; pub use self::numeric_options::NumericOptions; diff --git a/src/schema/schema.rs b/src/schema/schema.rs index e85c9d5ae2..a8e2be29f5 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -7,7 +7,7 @@ use serde::ser::SerializeSeq; use serde::{Deserialize, 
Deserializer, Serialize, Serializer}; use serde_json::{self, Value as JsonValue}; -use super::ip_options::IpOptions; +use super::ip_options::IpAddrOptions; use super::*; use crate::schema::bytes_options::BytesOptions; use crate::schema::field_type::ValueParsingError; @@ -157,13 +157,13 @@ impl SchemaBuilder { /// by the second one. /// The first field will get a field id /// but only the second one will be indexed - pub fn add_ip_field>( + pub fn add_ip_addr_field>( &mut self, field_name_str: &str, field_options: T, ) -> Field { let field_name = String::from(field_name_str); - let field_entry = FieldEntry::new_ip(field_name, field_options.into()); + let field_entry = FieldEntry::new_ip_addr(field_name, field_options.into()); self.add_field(field_entry) } diff --git a/src/schema/term.rs b/src/schema/term.rs index 79546d1dd2..d0c37b7674 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -415,7 +415,7 @@ fn debug_value_bytes(typ: Type, bytes: &[u8], f: &mut fmt::Formatter) -> fmt::Re debug_value_bytes(typ, bytes, f)?; } } - Type::Ip => { + Type::IpAddr => { let s = as_str(bytes); // TODO: change when serialization changes write_opt(f, s)?; } From 5d6602a8d98acc09c779eb43631d99399a136e63 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 6 Oct 2022 11:52:02 +0800 Subject: [PATCH 13/27] mark null handling TODO --- src/indexer/index_writer.rs | 3 ++- src/indexer/merger.rs | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 619fa4a7b8..ba57fce360 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -1780,6 +1780,7 @@ mod tests { segment_reader.doc_ids_alive().flat_map(move |doc| { let val = ff_reader.get_val(doc as u64); if val == IpAddr::from_u128(0) { + // TODO Fix null handling None } else { Some(val) @@ -1818,7 +1819,7 @@ mod tests { segment_reader.doc_ids_alive().flat_map(move |doc| { let mut vals = vec![]; ff_reader.get_vals(doc, &mut vals); - vals.into_iter().filter(|val| val.to_u128() != 0) + vals.into_iter().filter(|val| val.to_u128() != 0) // TODO Fix null handling }) }) .collect(); diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index c16bfc22d7..5ea5b5da18 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -11,8 +11,8 @@ use crate::core::{Segment, SegmentReader}; use crate::docset::{DocSet, TERMINATED}; use crate::error::DataCorruption; use crate::fastfield::{ - get_fastfield_codecs_for_multivalue, AliveBitSet, Column, CompositeFastFieldSerializer, MultiValueLength, - MultiValuedFastFieldReader, MultiValuedU128FastFieldReader, + get_fastfield_codecs_for_multivalue, AliveBitSet, Column, CompositeFastFieldSerializer, + MultiValueLength, MultiValuedFastFieldReader, MultiValuedU128FastFieldReader, }; use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter}; use crate::indexer::doc_id_mapping::{expect_field_id_for_sort_field, SegmentDocIdMapping}; From 0b86658389ea9e0d0edb4acbee658fabb7e60f23 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 6 Oct 2022 13:13:05 +0800 Subject: [PATCH 14/27] rename ip addr, use buffer --- src/fastfield/multivalued/writer.rs | 6 +-- src/fastfield/writer.rs | 4 +- src/indexer/merger.rs | 13 ++--- src/query/query_parser/query_parser.rs | 6 ++- src/schema/field_entry.rs | 2 +- src/schema/field_type.rs | 70 +++++++++++++------------- src/schema/schema.rs | 4 +- src/schema/value.rs | 22 ++++---- 8 files changed, 66 insertions(+), 61 deletions(-) diff --git 
a/src/fastfield/multivalued/writer.rs b/src/fastfield/multivalued/writer.rs index 1f7b6bb32b..4cefeadac4 100644 --- a/src/fastfield/multivalued/writer.rs +++ b/src/fastfield/multivalued/writer.rs @@ -318,10 +318,10 @@ impl MultiValueU128FastFieldWriter { if field_value.field == self.field { let value = field_value.value(); let ip_addr = value - .as_ip() + .as_ip_addr() .unwrap_or_else(|| panic!("expected and ip, but got {:?}", value)); - let value = ip_addr.to_u128(); - self.add_val(value); + let ip_addr_u128 = ip_addr.to_u128(); + self.add_val(ip_addr_u128); } } } diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index f6a3162c45..53cd2a2116 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -330,7 +330,7 @@ impl U128FastFieldWriter { match doc.get_first(self.field) { Some(v) => { let ip_addr = v - .as_ip() + .as_ip_addr() .unwrap_or_else(|| panic!("expected and ip, but got {:?}", v)); let value = ip_addr.to_u128(); @@ -359,7 +359,7 @@ impl U128FastFieldWriter { }; serialize_u128(iter_gen, self.val_count as u64, field_write)?; } else { - let iter_gen = || (0..self.val_count).map(|idx| self.vals[idx as usize]); + let iter_gen = || self.vals.iter().cloned(); serialize_u128(iter_gen, self.val_count as u64, field_write)?; } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 5ea5b5da18..3d489d6625 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -6,6 +6,7 @@ use fastfield_codecs::{serialize_u128, VecColumn}; use itertools::Itertools; use measure_time::debug_time; +use super::flat_map_with_buffer::FlatMapWithBufferIter; use super::sorted_doc_id_multivalue_column::RemappedDocIdMultiValueIndexColumn; use crate::core::{Segment, SegmentReader}; use crate::docset::{DocSet, TERMINATED}; @@ -356,12 +357,12 @@ impl IndexMerger { .collect::>(); let iter_gen = || { - doc_id_mapping.iter_old_doc_addrs().flat_map(|doc_addr| { - let fast_field_reader = &fast_field_readers[doc_addr.segment_ord as usize]; - let mut out = vec![]; - fast_field_reader.get_vals(doc_addr.doc_id, &mut out); - out.into_iter() - }) + doc_id_mapping + .iter_old_doc_addrs() + .flat_map_with_buffer(|doc_addr, buffer| { + let fast_field_reader = &fast_field_readers[doc_addr.segment_ord as usize]; + fast_field_reader.get_vals(doc_addr.doc_id, buffer); + }) }; let field_write = fast_field_serializer.get_field_writer(field, 1); serialize_u128(iter_gen, doc_id_mapping.len() as u64, field_write)?; diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index d533e599ca..19b027bd33 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -400,7 +400,11 @@ impl QueryParser { let bytes = base64::decode(phrase).map_err(QueryParserError::ExpectedBase64)?; Ok(Term::from_field_bytes(field, &bytes)) } - FieldType::IpAddr(_) => Ok(Term::from_field_text(field, phrase)), + FieldType::IpAddr(_) => { + return Err(QueryParserError::UnsupportedQuery( + "Range query are not supported on IpAddr field.".to_string(), + )); + } } } diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index db25300403..9c66663af7 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -61,7 +61,7 @@ impl FieldEntry { Self::new(field_name, FieldType::Date(date_options)) } - /// Creates a new ip field entry. + /// Creates a new ip address field entry. 
pub fn new_ip_addr(field_name: String, ip_options: IpAddrOptions) -> FieldEntry { Self::new(field_name, FieldType::IpAddr(ip_options)) } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 4ae6071e90..21390e09f9 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -228,7 +228,7 @@ impl FieldType { | FieldType::F64(ref int_options) | FieldType::Bool(ref int_options) => int_options.is_fast(), FieldType::Date(ref date_options) => date_options.is_fast(), - FieldType::IpAddr(ref options) => options.is_fast(), + FieldType::IpAddr(ref ip_addr_options) => ip_addr_options.is_fast(), FieldType::Facet(_) => true, FieldType::JsonObject(_) => false, } @@ -325,45 +325,43 @@ impl FieldType { /// target field is a `Str`, this method will return an Error. pub fn value_from_json(&self, json: JsonValue) -> Result { match json { - JsonValue::String(field_text) => { - match self { - FieldType::Date(_) => { - let dt_with_fixed_tz = OffsetDateTime::parse(&field_text, &Rfc3339) - .map_err(|_err| ValueParsingError::TypeError { + JsonValue::String(field_text) => match self { + FieldType::Date(_) => { + let dt_with_fixed_tz = + OffsetDateTime::parse(&field_text, &Rfc3339).map_err(|_err| { + ValueParsingError::TypeError { expected: "rfc3339 format", json: JsonValue::String(field_text), - })?; - Ok(DateTime::from_utc(dt_with_fixed_tz).into()) - } - FieldType::Str(_) => Ok(Value::Str(field_text)), - FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => { - Err(ValueParsingError::TypeError { - expected: "an integer", - json: JsonValue::String(field_text), - }) - } - FieldType::Bool(_) => Err(ValueParsingError::TypeError { - expected: "a boolean", - json: JsonValue::String(field_text), - }), - FieldType::Facet(_) => Ok(Value::Facet(Facet::from(&field_text))), - FieldType::Bytes(_) => base64::decode(&field_text) - .map(Value::Bytes) - .map_err(|_| ValueParsingError::InvalidBase64 { base64: field_text }), - FieldType::JsonObject(_) => Err(ValueParsingError::TypeError { - expected: "a json object", - json: JsonValue::String(field_text), - }), - FieldType::IpAddr(_) => { - Ok(Value::Ip(IpAddr::from_str(&field_text).map_err(|err| { - ValueParsingError::ParseError { - error: err.to_string(), - json: JsonValue::String(field_text), } - })?)) - } + })?; + Ok(DateTime::from_utc(dt_with_fixed_tz).into()) } - } + FieldType::Str(_) => Ok(Value::Str(field_text)), + FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => { + Err(ValueParsingError::TypeError { + expected: "an integer", + json: JsonValue::String(field_text), + }) + } + FieldType::Bool(_) => Err(ValueParsingError::TypeError { + expected: "a boolean", + json: JsonValue::String(field_text), + }), + FieldType::Facet(_) => Ok(Value::Facet(Facet::from(&field_text))), + FieldType::Bytes(_) => base64::decode(&field_text) + .map(Value::Bytes) + .map_err(|_| ValueParsingError::InvalidBase64 { base64: field_text }), + FieldType::JsonObject(_) => Err(ValueParsingError::TypeError { + expected: "a json object", + json: JsonValue::String(field_text), + }), + FieldType::IpAddr(_) => Ok(Value::IpAddr(IpAddr::from_str(&field_text).map_err( + |err| ValueParsingError::ParseError { + error: err.to_string(), + json: JsonValue::String(field_text), + }, + )?)), + }, JsonValue::Number(field_val_num) => match self { FieldType::I64(_) | FieldType::Date(_) => { if let Some(field_val_i64) = field_val_num.as_i64() { diff --git a/src/schema/schema.rs b/src/schema/schema.rs index a8e2be29f5..0884ef71be 100644 --- a/src/schema/schema.rs +++ 
b/src/schema/schema.rs @@ -146,9 +146,7 @@ impl SchemaBuilder { } /// Adds a ip field. - /// Returns the associated field handle - /// Internally, Tantivy simply stores ips as u64, - /// while the user supplies IpAddr values for convenience. + /// Returns the associated field handle. /// /// # Caution /// diff --git a/src/schema/value.rs b/src/schema/value.rs index 55cdc4dd77..7bc3ad2f27 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -34,14 +34,16 @@ pub enum Value { /// Json object value. JsonObject(serde_json::Map), /// Ip - Ip(IpAddr), + IpAddr(IpAddr), } impl Eq for Value {} impl Serialize for Value { fn serialize(&self, serializer: S) -> Result - where S: Serializer { + where + S: Serializer, + { match *self { Value::Str(ref v) => serializer.serialize_str(v), Value::PreTokStr(ref v) => v.serialize(serializer), @@ -53,14 +55,16 @@ impl Serialize for Value { Value::Facet(ref facet) => facet.serialize(serializer), Value::Bytes(ref bytes) => serializer.serialize_bytes(bytes), Value::JsonObject(ref obj) => obj.serialize(serializer), - Value::Ip(ref obj) => obj.serialize(serializer), // TODO check serialization + Value::IpAddr(ref obj) => obj.serialize(serializer), // TODO check serialization } } } impl<'de> Deserialize<'de> for Value { fn deserialize(deserializer: D) -> Result - where D: Deserializer<'de> { + where + D: Deserializer<'de>, + { struct ValueVisitor; impl<'de> Visitor<'de> for ValueVisitor { @@ -208,8 +212,8 @@ impl Value { /// Returns the ip addr, provided the value is of the `Ip` type. /// (Returns None if the value is not of the `Ip` type) - pub fn as_ip(&self) -> Option { - if let Value::Ip(val) = self { + pub fn as_ip_addr(&self) -> Option { + if let Value::IpAddr(val) = self { Some(*val) } else { None @@ -225,7 +229,7 @@ impl From for Value { impl From for Value { fn from(v: IpAddr) -> Value { - Value::Ip(v) + Value::IpAddr(v) } } @@ -389,7 +393,7 @@ mod binary_serialize { serde_json::to_writer(writer, &map)?; Ok(()) } - Value::Ip(ref ip) => { + Value::IpAddr(ref ip) => { IP_CODE.serialize(writer)?; ip.to_string().serialize(writer) // TODO Check best format } @@ -465,7 +469,7 @@ mod binary_serialize { } IP_CODE => { let text = String::deserialize(reader)?; - Ok(Value::Ip(IpAddr::from_str(&text).map_err(|err| { + Ok(Value::IpAddr(IpAddr::from_str(&text).map_err(|err| { io::Error::new(ErrorKind::Other, err.to_string()) })?)) } From e50e74acf825f947cbbe2e8e5b966cd7344b0fa3 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 6 Oct 2022 13:36:44 +0800 Subject: [PATCH 15/27] remove u128 type --- src/query/query_parser/query_parser.rs | 8 +++----- src/schema/field_type.rs | 7 +------ src/schema/term.rs | 7 +------ src/schema/value.rs | 8 ++------ 4 files changed, 7 insertions(+), 23 deletions(-) diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 19b027bd33..d14a09f218 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -400,11 +400,9 @@ impl QueryParser { let bytes = base64::decode(phrase).map_err(QueryParserError::ExpectedBase64)?; Ok(Term::from_field_bytes(field, &bytes)) } - FieldType::IpAddr(_) => { - return Err(QueryParserError::UnsupportedQuery( - "Range query are not supported on IpAddr field.".to_string(), - )); - } + FieldType::IpAddr(_) => Err(QueryParserError::UnsupportedQuery( + "Range query are not supported on IpAddr field.".to_string(), + )), } } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 21390e09f9..b357287910 100644 --- 
a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -68,11 +68,9 @@ pub enum Type { Json = b'j', /// IpAddr IpAddr = b'p', - /// IpAddr - U128 = b'1', } -const ALL_TYPES: [Type; 11] = [ +const ALL_TYPES: [Type; 10] = [ Type::Str, Type::U64, Type::I64, @@ -83,7 +81,6 @@ const ALL_TYPES: [Type; 11] = [ Type::Bytes, Type::Json, Type::IpAddr, - Type::U128, ]; impl Type { @@ -111,7 +108,6 @@ impl Type { Type::Bytes => "Bytes", Type::Json => "Json", Type::IpAddr => "IpAddr", - Type::U128 => "U128", } } @@ -129,7 +125,6 @@ impl Type { b'b' => Some(Type::Bytes), b'j' => Some(Type::Json), b'p' => Some(Type::IpAddr), - b'1' => Some(Type::U128), _ => None, } } diff --git a/src/schema/term.rs b/src/schema/term.rs index d0c37b7674..9bfa7614b9 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -416,12 +416,7 @@ fn debug_value_bytes(typ: Type, bytes: &[u8], f: &mut fmt::Formatter) -> fmt::Re } } Type::IpAddr => { - let s = as_str(bytes); // TODO: change when serialization changes - write_opt(f, s)?; - } - Type::U128 => { - let s = as_str(bytes); // TODO: change when serialization changes - write_opt(f, s)?; + write!(f, "")?; // TODO change once we actually have IP address terms. } } Ok(()) diff --git a/src/schema/value.rs b/src/schema/value.rs index 7bc3ad2f27..bbd6c48247 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -41,9 +41,7 @@ impl Eq for Value {} impl Serialize for Value { fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { + where S: Serializer { match *self { Value::Str(ref v) => serializer.serialize_str(v), Value::PreTokStr(ref v) => v.serialize(serializer), @@ -62,9 +60,7 @@ impl Serialize for Value { impl<'de> Deserialize<'de> for Value { fn deserialize(deserializer: D) -> Result - where - D: Deserializer<'de>, - { + where D: Deserializer<'de> { struct ValueVisitor; impl<'de> Visitor<'de> for ValueVisitor { From 5171ff611bd19c77d8fa4dd671a8d04ecf1add3b Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 6 Oct 2022 15:51:11 +0800 Subject: [PATCH 16/27] serialize ip as u128, add test for positions_to_docid --- common/src/serialize.rs | 13 ++++++++++ src/fastfield/multivalued/reader.rs | 38 +++++++++++++++++++++++++++-- src/schema/value.rs | 18 ++++++-------- 3 files changed, 57 insertions(+), 12 deletions(-) diff --git a/common/src/serialize.rs b/common/src/serialize.rs index bc893f7d06..7b96316ee4 100644 --- a/common/src/serialize.rs +++ b/common/src/serialize.rs @@ -107,6 +107,19 @@ impl FixedSize for u64 { const SIZE_IN_BYTES: usize = 8; } +impl BinarySerializable for u128 { + fn serialize(&self, writer: &mut W) -> io::Result<()> { + writer.write_u128::(*self) + } + fn deserialize(reader: &mut R) -> io::Result { + reader.read_u128::() + } +} + +impl FixedSize for u128 { + const SIZE_IN_BYTES: usize = 16; +} + impl BinarySerializable for f32 { fn serialize(&self, writer: &mut W) -> io::Result<()> { writer.write_f32::(*self) diff --git a/src/fastfield/multivalued/reader.rs b/src/fastfield/multivalued/reader.rs index 994c03c7e7..67759b2809 100644 --- a/src/fastfield/multivalued/reader.rs +++ b/src/fastfield/multivalued/reader.rs @@ -220,8 +220,8 @@ impl MultiValueLength for MultiValuedU128FastFie /// /// Correctness: positions needs to be sorted. /// -/// TODO: Instead of a linear scan we can employ a binary search to match a docid to its value -/// position. +/// TODO: Instead of a linear scan we can employ a expotential search into binary search to match a +/// docid to its value position. 
fn positions_to_docids(positions: &[u64], multival_idx: &T) -> Vec { let mut docs = vec![]; let mut cur_doc = 0u32; @@ -250,8 +250,42 @@ fn positions_to_docids(positions: &[u64], multival_idx: &T) mod tests { use crate::core::Index; + use crate::fastfield::multivalued::reader::positions_to_docids; + use crate::fastfield::MultiValueLength; use crate::schema::{Cardinality, Facet, FacetOptions, NumericOptions, Schema}; + #[test] + fn test_positions_to_docid() { + let positions = vec![10u64, 11, 15, 20, 21, 22]; + + let offsets = vec![0, 10, 12, 15, 22, 23]; + + struct MultiValueLenghtIdx { + offsets: Vec, + } + + impl MultiValueLength for MultiValueLenghtIdx { + fn get_range(&self, doc_id: crate::DocId) -> std::ops::Range { + let idx = doc_id as u64; + let start = self.offsets[idx as usize]; + let end = self.offsets[idx as usize + 1]; + start..end + } + + fn get_len(&self, _doc_id: crate::DocId) -> u64 { + todo!() + } + + fn get_total_len(&self) -> u64 { + todo!() + } + } + + let idx = MultiValueLenghtIdx { offsets }; + let docids = positions_to_docids(&positions, &idx); + assert_eq!(docids, vec![1, 3, 4]); + } + #[test] fn test_multifastfield_reader() -> crate::Result<()> { let mut schema_builder = Schema::builder(); diff --git a/src/schema/value.rs b/src/schema/value.rs index bbd6c48247..559b0f2ea8 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -33,7 +33,7 @@ pub enum Value { Bytes(Vec), /// Json object value. JsonObject(serde_json::Map), - /// Ip + /// Ip Address value IpAddr(IpAddr), } @@ -53,7 +53,7 @@ impl Serialize for Value { Value::Facet(ref facet) => facet.serialize(serializer), Value::Bytes(ref bytes) => serializer.serialize_bytes(bytes), Value::JsonObject(ref obj) => obj.serialize(serializer), - Value::IpAddr(ref obj) => obj.serialize(serializer), // TODO check serialization + Value::IpAddr(ref obj) => obj.serialize(serializer), } } } @@ -307,11 +307,11 @@ impl From for Value { } mod binary_serialize { - use std::io::{self, ErrorKind, Read, Write}; + use std::io::{self, Read, Write}; use std::net::IpAddr; - use std::str::FromStr; use common::{f64_to_u64, u64_to_f64, BinarySerializable}; + use fastfield_codecs::MonotonicallyMappableToU128; use super::Value; use crate::schema::Facet; @@ -391,7 +391,7 @@ mod binary_serialize { } Value::IpAddr(ref ip) => { IP_CODE.serialize(writer)?; - ip.to_string().serialize(writer) // TODO Check best format + ip.to_u128().serialize(writer) } } } @@ -445,7 +445,7 @@ mod binary_serialize { _ => Err(io::Error::new( io::ErrorKind::InvalidData, format!( - "No extened field type is associated with code {:?}", + "No extended field type is associated with code {:?}", ext_type_code ), )), @@ -464,10 +464,8 @@ mod binary_serialize { Ok(Value::JsonObject(json_map)) } IP_CODE => { - let text = String::deserialize(reader)?; - Ok(Value::IpAddr(IpAddr::from_str(&text).map_err(|err| { - io::Error::new(ErrorKind::Other, err.to_string()) - })?)) + let value = u128::deserialize(reader)?; + Ok(Value::IpAddr(IpAddr::from_u128(value))) } _ => Err(io::Error::new( From 2864bf7123c2264bcbb4279099e656c54e54f057 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 6 Oct 2022 16:13:35 +0800 Subject: [PATCH 17/27] use serializer for u128 --- src/fastfield/multivalued/writer.rs | 12 ++++++++---- src/fastfield/serializer/mod.rs | 20 +++++++++++++++----- src/fastfield/writer.rs | 21 ++++++++++++++------- src/indexer/merger.rs | 19 ++++++++++++++----- 4 files changed, 51 insertions(+), 21 deletions(-) diff --git a/src/fastfield/multivalued/writer.rs 
b/src/fastfield/multivalued/writer.rs index 4cefeadac4..127416f590 100644 --- a/src/fastfield/multivalued/writer.rs +++ b/src/fastfield/multivalued/writer.rs @@ -1,7 +1,7 @@ use std::io; use fastfield_codecs::{ - serialize_u128, Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64, VecColumn, + Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64, VecColumn, }; use fnv::FnvHashMap; @@ -361,10 +361,14 @@ impl MultiValueU128FastFieldWriter { } } { - let field_write = serializer.get_field_writer(self.field, 1); - let iter_gen = || self.get_ordered_values(doc_id_map).flatten().cloned(); - serialize_u128(iter_gen, self.vals.len() as u64, field_write)?; + + serializer.create_u128_fast_field_with_idx( + self.field, + iter_gen, + self.vals.len() as u64, + 1, + )?; } Ok(()) } diff --git a/src/fastfield/serializer/mod.rs b/src/fastfield/serializer/mod.rs index f58f28a123..e0fb6e64b6 100644 --- a/src/fastfield/serializer/mod.rs +++ b/src/fastfield/serializer/mod.rs @@ -84,6 +84,21 @@ impl CompositeFastFieldSerializer { Ok(()) } + /// Serialize data into a new u128 fast field. The codec will be compact space compressor, + /// which is optimized for scanning the fast field for a given range. + pub fn create_u128_fast_field_with_idx I, I: Iterator>( + &mut self, + field: Field, + iter_gen: F, + num_vals: u64, + idx: usize, + ) -> io::Result<()> { + let field_write = self.composite_write.for_field_with_idx(field, idx); + fastfield_codecs::serialize_u128(iter_gen, num_vals, field_write)?; + + Ok(()) + } + /// Start serializing a new [u8] fast field. Use the returned writer to write data into the /// bytes field. To associate the bytes with documents a seperate index must be created on /// index 0. See bytes/writer.rs::serialize for an example. @@ -93,11 +108,6 @@ impl CompositeFastFieldSerializer { self.composite_write.for_field_with_idx(field, 1) } - /// Gets the underlying writer - pub fn get_field_writer(&mut self, field: Field, idx: usize) -> &mut impl Write { - self.composite_write.for_field_with_idx(field, idx) - } - /// Closes the serializer /// /// After this call the data must be persistently saved on disk. 
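The new `create_u128_fast_field_with_idx` entry point replaces the raw `get_field_writer` access: callers hand over an iterator-generating closure, so the u128 codec can make several passes over the values instead of writing into a pre-positioned writer. A minimal sketch of a call site, assuming a `vals: Vec<u128>` buffer and a `field: Field` handle as in the writers changed below:

    // Sketch only; mirrors how the single-value writer below uses the new API.
    // `serializer`, `field` and `vals` are assumed to exist in the caller.
    let iter_gen = || vals.iter().cloned(); // closure, so the codec can iterate more than once
    serializer.create_u128_fast_field_with_idx(
        field,
        iter_gen,
        vals.len() as u64, // number of values
        0,                 // column idx: 0 for plain values, 1 when paired with a doc index column
    )?;
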
diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 53cd2a2116..4e1fe50bcb 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -2,9 +2,7 @@ use std::collections::HashMap; use std::io; use common; -use fastfield_codecs::{ - serialize_u128, Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64, -}; +use fastfield_codecs::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64}; use fnv::FnvHashMap; use tantivy_bitpacker::BlockedBitpacker; @@ -349,18 +347,27 @@ impl U128FastFieldWriter { serializer: &mut CompositeFastFieldSerializer, doc_id_map: Option<&DocIdMapping>, ) -> io::Result<()> { - let field_write = serializer.get_field_writer(self.field, 0); - if let Some(doc_id_map) = doc_id_map { let iter_gen = || { doc_id_map .iter_old_doc_ids() .map(|idx| self.vals[idx as usize]) }; - serialize_u128(iter_gen, self.val_count as u64, field_write)?; + + serializer.create_u128_fast_field_with_idx( + self.field, + iter_gen, + self.val_count as u64, + 0, + )?; } else { let iter_gen = || self.vals.iter().cloned(); - serialize_u128(iter_gen, self.val_count as u64, field_write)?; + serializer.create_u128_fast_field_with_idx( + self.field, + iter_gen, + self.val_count as u64, + 0, + )?; } Ok(()) diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 3d489d6625..b1963e6749 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use std::io::Write; use std::sync::Arc; -use fastfield_codecs::{serialize_u128, VecColumn}; +use fastfield_codecs::VecColumn; use itertools::Itertools; use measure_time::debug_time; @@ -364,8 +364,13 @@ impl IndexMerger { fast_field_reader.get_vals(doc_addr.doc_id, buffer); }) }; - let field_write = fast_field_serializer.get_field_writer(field, 1); - serialize_u128(iter_gen, doc_id_mapping.len() as u64, field_write)?; + + fast_field_serializer.create_u128_fast_field_with_idx( + field, + iter_gen, + doc_id_mapping.len() as u64, + 1, + )?; Ok(()) } @@ -389,14 +394,18 @@ impl IndexMerger { }) .collect::>(); - let field_write = fast_field_serializer.get_field_writer(field, 0); let iter_gen = || { doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| { let fast_field_reader = &fast_field_readers[doc_addr.segment_ord as usize]; fast_field_reader.get_val(doc_addr.doc_id as u64) }) }; - fastfield_codecs::serialize_u128(iter_gen, doc_id_mapping.len() as u64, field_write)?; + fast_field_serializer.create_u128_fast_field_with_idx( + field, + iter_gen, + doc_id_mapping.len() as u64, + 0, + )?; Ok(()) } From 226a49338f01f6bcfd54876d1238a6767d584df7 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 6 Oct 2022 17:49:05 +0800 Subject: [PATCH 18/27] add StrictlyMonotonicFn --- fastfield_codecs/src/column.rs | 135 ++++++++++--------- fastfield_codecs/src/lib.rs | 36 +++-- fastfield_codecs/src/monotonic_mapping.rs | 152 +++++++++++++++++----- fastfield_codecs/src/serialize.rs | 34 ++--- 4 files changed, 219 insertions(+), 138 deletions(-) diff --git a/fastfield_codecs/src/column.rs b/fastfield_codecs/src/column.rs index 02975a56c2..e12a8014b6 100644 --- a/fastfield_codecs/src/column.rs +++ b/fastfield_codecs/src/column.rs @@ -3,6 +3,8 @@ use std::ops::RangeInclusive; use tantivy_bitpacker::minmax; +use crate::monotonic_mapping::StrictlyMonotonicFn; + pub trait Column: Send + Sync { /// Return the value associated with the given idx. 
/// @@ -137,15 +139,14 @@ where V: AsRef<[T]> + ?Sized } } -struct MonotonicMappingColumn { +struct MonotonicMappingColumn { from_column: C, monotonic_mapping: T, - monotonic_mapping_inv: U, _phantom: PhantomData, } -/// Creates a view of a column transformed by a monotonic mapping. -/// E.g. apply a gcd monotonic_mapping([100, 200, 300]) == [1, 2, 3] +/// Creates a view of a column transformed by a strictly monotonic mapping. See +/// [`StrictlyMonotonicFn`]. E.g. apply a gcd monotonic_mapping([100, 200, 300]) == [1, 2, 3] /// The provided mappings need to be the inverse of each other. /// /// The inverse of the mapping is required for: @@ -156,49 +157,44 @@ struct MonotonicMappingColumn { /// Note that when opening a codec, the monotonic_mapping should be the inverse of the mapping /// during serialization. And therefore the monotonic_mapping_inv when opening is the same as /// monotonic_mapping during serialization. -pub fn monotonic_map_column( +pub fn monotonic_map_column( from_column: C, monotonic_mapping: T, - monotonic_mapping_inv: U, ) -> impl Column where C: Column, - T: Fn(Input) -> Output + Send + Sync, - U: Fn(Output) -> Input + Send + Sync, - Input: Send + Sync, - Output: Send + Sync, + T: StrictlyMonotonicFn + Send + Sync, + Input: PartialOrd + Send + Sync + Clone, + Output: PartialOrd + Send + Sync + Clone, { MonotonicMappingColumn { from_column, monotonic_mapping, - monotonic_mapping_inv, _phantom: PhantomData, } } -impl Column - for MonotonicMappingColumn +impl Column for MonotonicMappingColumn where C: Column, - T: Fn(Input) -> Output + Send + Sync, - U: Fn(Output) -> Input + Send + Sync, - Input: Send + Sync, - Output: Send + Sync + Clone, + T: StrictlyMonotonicFn + Send + Sync, + Input: PartialOrd + Send + Sync + Clone, + Output: PartialOrd + Send + Sync + Clone, { #[inline] fn get_val(&self, idx: u64) -> Output { let from_val = self.from_column.get_val(idx); - (self.monotonic_mapping)(from_val) + self.monotonic_mapping.mapping(from_val) } fn min_value(&self) -> Output { let from_min_value = self.from_column.min_value(); - (self.monotonic_mapping)(from_min_value) + self.monotonic_mapping.mapping(from_min_value) } fn max_value(&self) -> Output { let from_max_value = self.from_column.max_value(); - (self.monotonic_mapping)(from_max_value) + self.monotonic_mapping.mapping(from_max_value) } fn num_vals(&self) -> u64 { @@ -206,13 +202,17 @@ where } fn iter(&self) -> Box + '_> { - Box::new(self.from_column.iter().map(&self.monotonic_mapping)) + Box::new( + self.from_column + .iter() + .map(|el| self.monotonic_mapping.mapping(el)), + ) } fn get_between_vals(&self, range: RangeInclusive) -> Vec { self.from_column.get_between_vals( - (self.monotonic_mapping_inv)(range.start().clone()) - ..=(self.monotonic_mapping_inv)(range.end().clone()), + self.monotonic_mapping.inverse(range.start().clone()) + ..=self.monotonic_mapping.inverse(range.end().clone()), ) } @@ -259,20 +259,20 @@ where #[cfg(test)] mod tests { use super::*; - use crate::MonotonicallyMappableToU64; - - #[test] - fn test_monotonic_mapping() { - let vals = &[1u64, 3u64][..]; - let col = VecColumn::from(vals); - let mapped = monotonic_map_column(col, |el| el + 4, |_el| unimplemented!()); - assert_eq!(mapped.min_value(), 5u64); - assert_eq!(mapped.max_value(), 7u64); - assert_eq!(mapped.num_vals(), 2); - assert_eq!(mapped.num_vals(), 2); - assert_eq!(mapped.get_val(0), 5); - assert_eq!(mapped.get_val(1), 7); - } + // use crate::MonotonicallyMappableToU64; + + //#[test] + // fn test_monotonic_mapping() { + // let vals = 
&[1u64, 3u64][..]; + // let col = VecColumn::from(vals); + // let mapped = monotonic_map_column(col, |el| el + 4, |_el| unimplemented!()); + // assert_eq!(mapped.min_value(), 5u64); + // assert_eq!(mapped.max_value(), 7u64); + // assert_eq!(mapped.num_vals(), 2); + // assert_eq!(mapped.num_vals(), 2); + // assert_eq!(mapped.get_val(0), 5); + // assert_eq!(mapped.get_val(1), 7); + //} #[test] fn test_range_as_col() { @@ -281,35 +281,34 @@ mod tests { assert_eq!(col.max_value(), 99); } - #[test] - fn test_monotonic_mapping_iter() { - let vals: Vec = (-1..99).map(i64::to_u64).collect(); - let col = VecColumn::from(&vals); - let mapped = - monotonic_map_column(col, |el| i64::from_u64(el) * 10i64, |_| unimplemented!()); - let val_i64s: Vec = mapped.iter().collect(); - for i in 0..100 { - assert_eq!(val_i64s[i as usize], mapped.get_val(i)); - } - } - - #[test] - fn test_monotonic_mapping_get_range() { - let vals: Vec = (-1..99).map(i64::to_u64).collect(); - let col = VecColumn::from(&vals); - let mapped = - monotonic_map_column(col, |el| i64::from_u64(el) * 10i64, |_| unimplemented!()); - assert_eq!(mapped.min_value(), -10i64); - assert_eq!(mapped.max_value(), 980i64); - assert_eq!(mapped.num_vals(), 100); - let val_i64s: Vec = mapped.iter().collect(); - assert_eq!(val_i64s.len(), 100); - for i in 0..100 { - assert_eq!(val_i64s[i as usize], mapped.get_val(i)); - assert_eq!(val_i64s[i as usize], i64::from_u64(vals[i as usize]) * 10); - } - let mut buf = [0i64; 20]; - mapped.get_range(7, &mut buf[..]); - assert_eq!(&val_i64s[7..][..20], &buf); - } + //#[test] + // fn test_monotonic_mapping_iter() { + // let vals: Vec = (-1..99).map(i64::to_u64).collect(); + // let col = VecColumn::from(&vals); + // let mapped = + // monotonic_map_column(col, |el| i64::from_u64(el) * 10i64, |_| unimplemented!()); + // let val_i64s: Vec = mapped.iter().collect(); + // for i in 0..100 { + // assert_eq!(val_i64s[i as usize], mapped.get_val(i)); + //} + + //#[test] + // fn test_monotonic_mapping_get_range() { + // let vals: Vec = (-1..99).map(i64::to_u64).collect(); + // let col = VecColumn::from(&vals); + // let mapped = + // monotonic_map_column(col, |el| i64::from_u64(el) * 10i64, |_| unimplemented!()); + // assert_eq!(mapped.min_value(), -10i64); + // assert_eq!(mapped.max_value(), 980i64); + // assert_eq!(mapped.num_vals(), 100); + // let val_i64s: Vec = mapped.iter().collect(); + // assert_eq!(val_i64s.len(), 100); + // for i in 0..100 { + // assert_eq!(val_i64s[i as usize], mapped.get_val(i)); + // assert_eq!(val_i64s[i as usize], i64::from_u64(vals[i as usize]) * 10); + //} + // let mut buf = [0i64; 20]; + // mapped.get_range(7, &mut buf[..]); + // assert_eq!(&val_i64s[7..][..20], &buf); + //} } diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index c537d2faa2..07a86cc763 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -13,9 +13,10 @@ use std::sync::Arc; use common::BinarySerializable; use compact_space::CompactSpaceDecompressor; -use fastdivide::DividerU64; -use monotonic_mapping::gcd_min_val_mapping_pairs::{from_gcd_normalized_u64, normalize_with_gcd}; -use monotonic_mapping::min_val_mapping_pairs::{from_normalized_u64, normalize}; +use monotonic_mapping::{ + StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal, + StrictlyMonotonicMappingToInternalBaseval, StrictlyMonotonicMappingToInternalGCDBaseval, +}; use ownedbytes::OwnedBytes; use serialize::Header; @@ -35,7 +36,7 @@ use self::bitpacked::BitpackedCodec; use 
self::blockwise_linear::BlockwiseLinearCodec; pub use self::column::{monotonic_map_column, Column, VecColumn}; use self::linear::LinearCodec; -pub use self::monotonic_mapping::MonotonicallyMappableToU64; +pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn}; pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128; pub use self::serialize::{ estimate, serialize, serialize_and_load, serialize_u128, NormalizedHeader, @@ -82,11 +83,9 @@ pub fn open_u128( bytes: OwnedBytes, ) -> io::Result>> { let reader = CompactSpaceDecompressor::open(bytes)?; - Ok(Arc::new(monotonic_map_column( - reader, - Item::from_u128, - Item::to_u128, - ))) + let inverted: StrictlyMonotonicMappingInverter> = + StrictlyMonotonicMappingToInternal::::new().into(); + Ok(Arc::new(monotonic_map_column(reader, inverted))) } /// Returns the correct codec reader wrapped in the `Arc` for the data. @@ -111,18 +110,15 @@ fn open_specific_codec( let reader = C::open_from_bytes(bytes, normalized_header)?; let min_value = header.min_value; if let Some(gcd) = header.gcd { - let divider = DividerU64::divide_by(gcd.get()); - Ok(Arc::new(monotonic_map_column( - reader, - move |val: u64| from_gcd_normalized_u64(val, min_value, gcd.get()), - move |val| normalize_with_gcd(val, min_value, ÷r), - ))) + let mapping = StrictlyMonotonicMappingInverter::from( + StrictlyMonotonicMappingToInternalGCDBaseval::new(gcd.get(), min_value), + ); + Ok(Arc::new(monotonic_map_column(reader, mapping))) } else { - Ok(Arc::new(monotonic_map_column( - reader, - move |val: u64| from_normalized_u64(val, min_value), - move |val| normalize(val, min_value), - ))) + let mapping = StrictlyMonotonicMappingInverter::from( + StrictlyMonotonicMappingToInternalBaseval::new(min_value), + ); + Ok(Arc::new(monotonic_map_column(reader, mapping))) } } diff --git a/fastfield_codecs/src/monotonic_mapping.rs b/fastfield_codecs/src/monotonic_mapping.rs index 0a8310a8e5..1b6db74c33 100644 --- a/fastfield_codecs/src/monotonic_mapping.rs +++ b/fastfield_codecs/src/monotonic_mapping.rs @@ -1,5 +1,9 @@ +use std::marker::PhantomData; + use fastdivide::DividerU64; +use crate::MonotonicallyMappableToU128; + pub trait MonotonicallyMappableToU64: 'static + PartialOrd + Copy + Send + Sync { /// Converts a value to u64. /// @@ -13,51 +17,133 @@ pub trait MonotonicallyMappableToU64: 'static + PartialOrd + Copy + Send + Sync fn from_u64(val: u64) -> Self; } -// Mapping pairs for the case we subtract the min_value and apply a gcd (greatest common divisor) -pub mod gcd_min_val_mapping_pairs { +/// Values need to be strictly monotonic mapped to a `Internal` value (u64 or u128) that can be +/// used in fast field codecs. +/// +/// The monotonic mapping is required so that `PartialOrd` can be used on `Internal` without +/// converting to `External`. +/// +/// All strictly monotonic functions are invertible because they are guaranteed to have a one-to-one +/// mapping from their range to their domain. The `inverse` method is required when opening a codec, +/// so a value can be converted back to its original domain (e.g. ip address or f64) from its +/// internal representation. +pub trait StrictlyMonotonicFn { + /// Strictly monotonically maps the value from External to Internal. + fn mapping(&self, inp: External) -> Internal; + /// Inverse of `mapping`. Maps the value from Internal to External. + fn inverse(&self, out: Internal) -> External; +} + +/// Inverts a strictly monotonic mapping from `StrictlyMonotonicFn` to +/// `StrictlyMonotonicFn`. 
+pub(crate) struct StrictlyMonotonicMappingInverter { + orig_mapping: T, +} +impl From for StrictlyMonotonicMappingInverter { + fn from(orig_mapping: T) -> Self { + Self { orig_mapping } + } +} + +impl StrictlyMonotonicFn for StrictlyMonotonicMappingInverter +where T: StrictlyMonotonicFn +{ + fn mapping(&self, val: To) -> From { + self.orig_mapping.inverse(val) + } + + fn inverse(&self, val: From) -> To { + self.orig_mapping.mapping(val) + } +} + +/// Applies the strictly monotonic mapping from `T` without any additional changes. +pub(crate) struct StrictlyMonotonicMappingToInternal { + _phantom: PhantomData, +} + +impl StrictlyMonotonicMappingToInternal { + pub(crate) fn new() -> StrictlyMonotonicMappingToInternal { + Self { + _phantom: PhantomData, + } + } +} - use super::*; - pub fn from_gcd_normalized_u64( - val: u64, - min_value: u64, - gcd: u64, - ) -> Item { - Item::from_u64(min_value + val * gcd) +impl + StrictlyMonotonicFn for StrictlyMonotonicMappingToInternal +where T: MonotonicallyMappableToU128 +{ + fn mapping(&self, inp: External) -> u128 { + External::to_u128(inp) } - pub fn normalize_with_gcd( - val: Item, - min_value: u64, - gcd_divider: &DividerU64, - ) -> u64 { - gcd_divider.divide(Item::to_u64(val) - min_value) + fn inverse(&self, out: u128) -> External { + External::from_u128(out) } +} - #[test] - fn monotonic_mapping_roundtrip_test() { - let gcd = std::num::NonZeroU64::new(10).unwrap(); - let divider = DividerU64::divide_by(gcd.get()); +impl + StrictlyMonotonicFn for StrictlyMonotonicMappingToInternal +where T: MonotonicallyMappableToU64 +{ + fn mapping(&self, inp: External) -> u64 { + External::to_u64(inp) + } - let orig_value: u64 = 500; - let normalized_val: u64 = normalize_with_gcd(orig_value, 100, ÷r); - assert_eq!(normalized_val, 40); - assert_eq!( - from_gcd_normalized_u64::(normalized_val, 100, gcd.get()), - 500 - ); + fn inverse(&self, out: u64) -> External { + External::from_u64(out) } } -// Mapping pairs for the case we subtract the min_value -pub mod min_val_mapping_pairs { - use super::*; +/// Strictly monotonic mapping with a gcd and a base value. +pub(crate) struct StrictlyMonotonicMappingToInternalGCDBaseval { + gcd_divider: DividerU64, + gcd: u64, + min_value: u64, +} +impl StrictlyMonotonicMappingToInternalGCDBaseval { + pub(crate) fn new(gcd: u64, min_value: u64) -> Self { + let gcd_divider = DividerU64::divide_by(gcd); + Self { + gcd_divider, + gcd, + min_value, + } + } +} +impl StrictlyMonotonicFn + for StrictlyMonotonicMappingToInternalGCDBaseval +{ + fn mapping(&self, inp: External) -> u64 { + self.gcd_divider + .divide(External::to_u64(inp) - self.min_value) + } + + fn inverse(&self, out: u64) -> External { + External::from_u64(self.min_value + out * self.gcd) + } +} + +/// Strictly monotonic mapping with a base value. 
+pub(crate) struct StrictlyMonotonicMappingToInternalBaseval { + min_value: u64, +} +impl StrictlyMonotonicMappingToInternalBaseval { + pub(crate) fn new(min_value: u64) -> Self { + Self { min_value } + } +} - pub fn from_normalized_u64(val: u64, min_value: u64) -> Item { - Item::from_u64(min_value + val) +impl StrictlyMonotonicFn + for StrictlyMonotonicMappingToInternalBaseval +{ + fn mapping(&self, val: External) -> u64 { + External::to_u64(val) - self.min_value } - pub fn normalize(val: Item, min_value: u64) -> u64 { - Item::to_u64(val) - min_value + fn inverse(&self, val: u64) -> External { + External::from_u64(self.min_value + val) } } diff --git a/fastfield_codecs/src/serialize.rs b/fastfield_codecs/src/serialize.rs index 9d653cab1a..f6373a0245 100644 --- a/fastfield_codecs/src/serialize.rs +++ b/fastfield_codecs/src/serialize.rs @@ -22,7 +22,6 @@ use std::num::NonZeroU64; use std::sync::Arc; use common::{BinarySerializable, VInt}; -use fastdivide::DividerU64; use log::warn; use ownedbytes::OwnedBytes; @@ -30,7 +29,10 @@ use crate::bitpacked::BitpackedCodec; use crate::blockwise_linear::BlockwiseLinearCodec; use crate::compact_space::CompactSpaceCompressor; use crate::linear::LinearCodec; -use crate::monotonic_mapping::gcd_min_val_mapping_pairs::normalize_with_gcd; +use crate::monotonic_mapping::{ + StrictlyMonotonicFn, StrictlyMonotonicMappingToInternal, + StrictlyMonotonicMappingToInternalGCDBaseval, +}; use crate::{ monotonic_map_column, Column, FastFieldCodec, FastFieldCodecType, MonotonicallyMappableToU64, VecColumn, ALL_CODEC_TYPES, @@ -59,8 +61,10 @@ pub(crate) struct Header { impl Header { pub fn normalized(self) -> NormalizedHeader { let gcd = self.gcd.map(|gcd| gcd.get()).unwrap_or(1); - let gcd_divider = DividerU64::divide_by(gcd); - let max_value = normalize_with_gcd(self.max_value, self.min_value, &gcd_divider); + let gcd_min_val_mapping = + StrictlyMonotonicMappingToInternalGCDBaseval::new(gcd, self.min_value); + + let max_value = gcd_min_val_mapping.mapping(self.max_value); NormalizedHeader { num_vals: self.num_vals, max_value, @@ -98,12 +102,9 @@ pub fn normalize_column( gcd: Option, ) -> impl Column { let gcd = gcd.map(|gcd| gcd.get()).unwrap_or(1); - let gcd_divider = DividerU64::divide_by(gcd); - monotonic_map_column( - from_column, - move |val| normalize_with_gcd(val, min_value, &gcd_divider), - move |_val| unimplemented!(), // This code is only used in serialization - ) + + let mapping = StrictlyMonotonicMappingToInternalGCDBaseval::new(gcd, min_value); + monotonic_map_column(from_column, mapping) } impl BinarySerializable for Header { @@ -141,16 +142,15 @@ pub fn estimate( typed_column: impl Column, codec_type: FastFieldCodecType, ) -> Option { - let column = monotonic_map_column(typed_column, T::to_u64, T::from_u64); + let column = monotonic_map_column(typed_column, StrictlyMonotonicMappingToInternal::::new()); let min_value = column.min_value(); let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value)) .filter(|gcd| gcd.get() > 1u64); - let gcd_divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64)); - let normalized_column = monotonic_map_column( - &column, - |val| normalize_with_gcd(val, min_value, &gcd_divider), - |_val| unimplemented!(), + let mapping = StrictlyMonotonicMappingToInternalGCDBaseval::new( + gcd.map(|gcd| gcd.get()).unwrap_or(1u64), + min_value, ); + let normalized_column = monotonic_map_column(&column, mapping); match codec_type { FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&normalized_column), 
FastFieldCodecType::Linear => LinearCodec::estimate(&normalized_column), @@ -175,7 +175,7 @@ pub fn serialize( output: &mut impl io::Write, codecs: &[FastFieldCodecType], ) -> io::Result<()> { - let column = monotonic_map_column(typed_column, T::to_u64, T::from_u64); + let column = monotonic_map_column(typed_column, StrictlyMonotonicMappingToInternal::::new()); let header = Header::compute_header(&column, codecs).ok_or_else(|| { io::Error::new( io::ErrorKind::InvalidInput, From a8a36b62cd51703321293da7ea6e47d6b50432df Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 7 Oct 2022 09:55:58 +0800 Subject: [PATCH 19/27] enable test --- fastfield_codecs/src/column.rs | 107 +++++++++++++++++++-------------- 1 file changed, 61 insertions(+), 46 deletions(-) diff --git a/fastfield_codecs/src/column.rs b/fastfield_codecs/src/column.rs index e12a8014b6..8b9f50d942 100644 --- a/fastfield_codecs/src/column.rs +++ b/fastfield_codecs/src/column.rs @@ -126,7 +126,8 @@ impl<'a, T: Copy + PartialOrd + Send + Sync> Column for VecColumn<'a, T> { } impl<'a, T: Copy + Ord + Default, V> From<&'a V> for VecColumn<'a, T> -where V: AsRef<[T]> + ?Sized +where + V: AsRef<[T]> + ?Sized, { fn from(values: &'a V) -> Self { let values = values.as_ref(); @@ -223,7 +224,8 @@ where pub struct IterColumn(T); impl From for IterColumn -where T: Iterator + Clone + ExactSizeIterator +where + T: Iterator + Clone + ExactSizeIterator, { fn from(iter: T) -> Self { IterColumn(iter) @@ -259,20 +261,23 @@ where #[cfg(test)] mod tests { use super::*; - // use crate::MonotonicallyMappableToU64; - - //#[test] - // fn test_monotonic_mapping() { - // let vals = &[1u64, 3u64][..]; - // let col = VecColumn::from(vals); - // let mapped = monotonic_map_column(col, |el| el + 4, |_el| unimplemented!()); - // assert_eq!(mapped.min_value(), 5u64); - // assert_eq!(mapped.max_value(), 7u64); - // assert_eq!(mapped.num_vals(), 2); - // assert_eq!(mapped.num_vals(), 2); - // assert_eq!(mapped.get_val(0), 5); - // assert_eq!(mapped.get_val(1), 7); - //} + use crate::monotonic_mapping::{ + StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternalBaseval, + StrictlyMonotonicMappingToInternalGCDBaseval, + }; + + #[test] + fn test_monotonic_mapping() { + let vals = &[3u64, 5u64][..]; + let col = VecColumn::from(vals); + let mapped = monotonic_map_column(col, StrictlyMonotonicMappingToInternalBaseval::new(2)); + assert_eq!(mapped.min_value(), 1u64); + assert_eq!(mapped.max_value(), 3u64); + assert_eq!(mapped.num_vals(), 2); + assert_eq!(mapped.num_vals(), 2); + assert_eq!(mapped.get_val(0), 1); + assert_eq!(mapped.get_val(1), 3); + } #[test] fn test_range_as_col() { @@ -281,34 +286,44 @@ mod tests { assert_eq!(col.max_value(), 99); } - //#[test] - // fn test_monotonic_mapping_iter() { - // let vals: Vec = (-1..99).map(i64::to_u64).collect(); - // let col = VecColumn::from(&vals); - // let mapped = - // monotonic_map_column(col, |el| i64::from_u64(el) * 10i64, |_| unimplemented!()); - // let val_i64s: Vec = mapped.iter().collect(); - // for i in 0..100 { - // assert_eq!(val_i64s[i as usize], mapped.get_val(i)); - //} - - //#[test] - // fn test_monotonic_mapping_get_range() { - // let vals: Vec = (-1..99).map(i64::to_u64).collect(); - // let col = VecColumn::from(&vals); - // let mapped = - // monotonic_map_column(col, |el| i64::from_u64(el) * 10i64, |_| unimplemented!()); - // assert_eq!(mapped.min_value(), -10i64); - // assert_eq!(mapped.max_value(), 980i64); - // assert_eq!(mapped.num_vals(), 100); - // let val_i64s: Vec = 
mapped.iter().collect(); - // assert_eq!(val_i64s.len(), 100); - // for i in 0..100 { - // assert_eq!(val_i64s[i as usize], mapped.get_val(i)); - // assert_eq!(val_i64s[i as usize], i64::from_u64(vals[i as usize]) * 10); - //} - // let mut buf = [0i64; 20]; - // mapped.get_range(7, &mut buf[..]); - // assert_eq!(&val_i64s[7..][..20], &buf); - //} + #[test] + fn test_monotonic_mapping_iter() { + let vals: Vec = (10..110u64).map(|el| el * 10).collect(); + let col = VecColumn::from(&vals); + let mapped = monotonic_map_column( + col, + StrictlyMonotonicMappingInverter::from( + StrictlyMonotonicMappingToInternalGCDBaseval::new(10, 100), + ), + ); + let val_i64s: Vec = mapped.iter().collect(); + for i in 0..100 { + assert_eq!(val_i64s[i as usize], mapped.get_val(i)); + } + } + + #[test] + fn test_monotonic_mapping_get_range() { + let vals: Vec = (0..100u64).map(|el| el * 10).collect(); + let col = VecColumn::from(&vals); + let mapped = monotonic_map_column( + col, + StrictlyMonotonicMappingInverter::from( + StrictlyMonotonicMappingToInternalGCDBaseval::new(10, 0), + ), + ); + + assert_eq!(mapped.min_value(), 0u64); + assert_eq!(mapped.max_value(), 9900u64); + assert_eq!(mapped.num_vals(), 100); + let val_u64s: Vec = mapped.iter().collect(); + assert_eq!(val_u64s.len(), 100); + for i in 0..100 { + assert_eq!(val_u64s[i as usize], mapped.get_val(i)); + assert_eq!(val_u64s[i as usize], vals[i as usize] * 10); + } + let mut buf = [0u64; 20]; + mapped.get_range(7, &mut buf[..]); + assert_eq!(&val_u64s[7..][..20], &buf); + } } From 39f4e5845079bd688f14585a129e3a623fdf420c Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 7 Oct 2022 10:01:43 +0800 Subject: [PATCH 20/27] improve comment --- fastfield_codecs/src/column.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fastfield_codecs/src/column.rs b/fastfield_codecs/src/column.rs index 8b9f50d942..afbb7785fa 100644 --- a/fastfield_codecs/src/column.rs +++ b/fastfield_codecs/src/column.rs @@ -126,8 +126,7 @@ impl<'a, T: Copy + PartialOrd + Send + Sync> Column for VecColumn<'a, T> { } impl<'a, T: Copy + Ord + Default, V> From<&'a V> for VecColumn<'a, T> -where - V: AsRef<[T]> + ?Sized, +where V: AsRef<[T]> + ?Sized { fn from(values: &'a V) -> Self { let values = values.as_ref(); @@ -147,8 +146,11 @@ struct MonotonicMappingColumn { } /// Creates a view of a column transformed by a strictly monotonic mapping. See -/// [`StrictlyMonotonicFn`]. E.g. apply a gcd monotonic_mapping([100, 200, 300]) == [1, 2, 3] -/// The provided mappings need to be the inverse of each other. +/// [`StrictlyMonotonicFn`]. +/// +/// E.g. 
apply a gcd monotonic_mapping([100, 200, 300]) == [1, 2, 3] +/// monotonic_mapping.mapping() is expected to be injective, and we should always have +/// monotonic_mapping.inverse(monotonic_mapping.mapping(el)) == el /// /// The inverse of the mapping is required for: /// `fn get_between_vals(&self, range: RangeInclusive) -> Vec ` @@ -224,8 +226,7 @@ where pub struct IterColumn(T); impl From for IterColumn -where - T: Iterator + Clone + ExactSizeIterator, +where T: Iterator + Clone + ExactSizeIterator { fn from(iter: T) -> Self { IterColumn(iter) From 9a1609d364772f0e9f5a514ebe03ce743d2d31dc Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 7 Oct 2022 16:24:49 +0800 Subject: [PATCH 21/27] add test --- fastfield_codecs/src/monotonic_mapping.rs | 14 ++++++++++++++ src/fastfield/multivalued/reader.rs | 6 +++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/fastfield_codecs/src/monotonic_mapping.rs b/fastfield_codecs/src/monotonic_mapping.rs index 1b6db74c33..225ec9bc39 100644 --- a/fastfield_codecs/src/monotonic_mapping.rs +++ b/fastfield_codecs/src/monotonic_mapping.rs @@ -190,3 +190,17 @@ impl MonotonicallyMappableToU64 for f64 { common::u64_to_f64(val) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn strictly_monotonic_test() { + let mapping = StrictlyMonotonicMappingToInternalGCDBaseval::new(10, 100); + + let test_val = 100u64; + assert_eq!(mapping.mapping(test_val), 0); + assert_eq!(mapping.inverse(mapping.mapping(test_val)), test_val); + } +} diff --git a/src/fastfield/multivalued/reader.rs b/src/fastfield/multivalued/reader.rs index 67759b2809..9f2a0b2ca2 100644 --- a/src/fastfield/multivalued/reader.rs +++ b/src/fastfield/multivalued/reader.rs @@ -260,11 +260,11 @@ mod tests { let offsets = vec![0, 10, 12, 15, 22, 23]; - struct MultiValueLenghtIdx { + struct MultiValueLengthIdx { offsets: Vec, } - impl MultiValueLength for MultiValueLenghtIdx { + impl MultiValueLength for MultiValueLengthIdx { fn get_range(&self, doc_id: crate::DocId) -> std::ops::Range { let idx = doc_id as u64; let start = self.offsets[idx as usize]; @@ -281,7 +281,7 @@ mod tests { } } - let idx = MultiValueLenghtIdx { offsets }; + let idx = MultiValueLengthIdx { offsets }; let docids = positions_to_docids(&positions, &idx); assert_eq!(docids, vec![1, 3, 4]); } From 96315df20d363f63e8d0cb7257f9ec19ae6f9a06 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 7 Oct 2022 16:54:04 +0800 Subject: [PATCH 22/27] use idx part only for positions_to_docid --- src/fastfield/multivalued/reader.rs | 41 +++++++++-------------------- 1 file changed, 12 insertions(+), 29 deletions(-) diff --git a/src/fastfield/multivalued/reader.rs b/src/fastfield/multivalued/reader.rs index 9f2a0b2ca2..054bb01e4d 100644 --- a/src/fastfield/multivalued/reader.rs +++ b/src/fastfield/multivalued/reader.rs @@ -162,7 +162,7 @@ impl MultiValuedU128FastFieldReader { pub fn get_between_vals(&self, range: RangeInclusive) -> Vec { let positions = self.vals_reader.get_between_vals(range); - positions_to_docids(&positions, self) + positions_to_docids(&positions, self.idx_reader.as_ref()) } /// Iterates over all elements in the fast field @@ -218,19 +218,20 @@ impl MultiValueLength for MultiValuedU128FastFie /// /// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the index. /// -/// Correctness: positions needs to be sorted. +/// Correctness: positions needs to be sorted. idx_reader needs to contain monotonically increasing +/// positions. 
/// /// TODO: Instead of a linear scan we can employ a expotential search into binary search to match a /// docid to its value position. -fn positions_to_docids(positions: &[u64], multival_idx: &T) -> Vec { +fn positions_to_docids(positions: &[u64], idx_reader: &C) -> Vec { let mut docs = vec![]; let mut cur_doc = 0u32; let mut last_doc = None; for pos in positions { loop { - let range = multival_idx.get_range(cur_doc); - if range.contains(pos) { + let end = idx_reader.get_val(cur_doc as u64 + 1); + if end > *pos { // avoid duplicates if Some(cur_doc) == last_doc { break; @@ -249,9 +250,10 @@ fn positions_to_docids(positions: &[u64], multival_idx: &T) #[cfg(test)] mod tests { + use fastfield_codecs::VecColumn; + use crate::core::Index; use crate::fastfield::multivalued::reader::positions_to_docids; - use crate::fastfield::MultiValueLength; use crate::schema::{Cardinality, Facet, FacetOptions, NumericOptions, Schema}; #[test] @@ -259,31 +261,12 @@ mod tests { let positions = vec![10u64, 11, 15, 20, 21, 22]; let offsets = vec![0, 10, 12, 15, 22, 23]; + { + let column = VecColumn::from(&offsets); - struct MultiValueLengthIdx { - offsets: Vec, - } - - impl MultiValueLength for MultiValueLengthIdx { - fn get_range(&self, doc_id: crate::DocId) -> std::ops::Range { - let idx = doc_id as u64; - let start = self.offsets[idx as usize]; - let end = self.offsets[idx as usize + 1]; - start..end - } - - fn get_len(&self, _doc_id: crate::DocId) -> u64 { - todo!() - } - - fn get_total_len(&self) -> u64 { - todo!() - } + let docids = positions_to_docids(&positions, &column); + assert_eq!(docids, vec![1, 3, 4]); } - - let idx = MultiValueLengthIdx { offsets }; - let docids = positions_to_docids(&positions, &idx); - assert_eq!(docids, vec![1, 3, 4]); } #[test] From f4651738725eb7f34425aa5b9cb9984c7a78ce37 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Fri, 7 Oct 2022 10:20:46 +0200 Subject: [PATCH 23/27] Apply suggestions from code review Co-authored-by: Paul Masurel --- fastfield_codecs/src/monotonic_mapping.rs | 10 +++++++++- fastfield_codecs/src/serialize.rs | 1 - src/fastfield/readers.rs | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/fastfield_codecs/src/monotonic_mapping.rs b/fastfield_codecs/src/monotonic_mapping.rs index 225ec9bc39..0b93c5834c 100644 --- a/fastfield_codecs/src/monotonic_mapping.rs +++ b/fastfield_codecs/src/monotonic_mapping.rs @@ -36,6 +36,11 @@ pub trait StrictlyMonotonicFn { /// Inverts a strictly monotonic mapping from `StrictlyMonotonicFn` to /// `StrictlyMonotonicFn`. +/// +/// # Warning +/// +/// This type comes with a footgun. A type being strictly monotonic does not impose that the inverse mapping is strictly monotonic over the entire space External. e.g. a -> a * 2. +/// Use at your own risks. pub(crate) struct StrictlyMonotonicMappingInverter { orig_mapping: T, } @@ -96,7 +101,10 @@ where T: MonotonicallyMappableToU64 } } -/// Strictly monotonic mapping with a gcd and a base value. +/// Mapping dividing by gcd and a base value. +/// +/// The function is assumed to be only called on values divided by passed +/// gcd value. (It is necessary for the function to be monotonic.) 
pub(crate) struct StrictlyMonotonicMappingToInternalGCDBaseval { gcd_divider: DividerU64, gcd: u64, diff --git a/fastfield_codecs/src/serialize.rs b/fastfield_codecs/src/serialize.rs index f6373a0245..c916c758ec 100644 --- a/fastfield_codecs/src/serialize.rs +++ b/fastfield_codecs/src/serialize.rs @@ -102,7 +102,6 @@ pub fn normalize_column( gcd: Option, ) -> impl Column { let gcd = gcd.map(|gcd| gcd.get()).unwrap_or(1); - let mapping = StrictlyMonotonicMappingToInternalGCDBaseval::new(gcd, min_value); monotonic_map_column(from_column, mapping) } diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs index 9214a2a44f..d951d0afa8 100644 --- a/src/fastfield/readers.rs +++ b/src/fastfield/readers.rs @@ -177,7 +177,7 @@ impl FastFieldReaders { /// Returns the `u128` fast field reader reader associated to `field`. /// /// If `field` is not a u128 fast field, this method returns an Error. - pub fn u128(&self, field: Field) -> crate::Result>> { + pub(crate) fn u128(&self, field: Field) -> crate::Result>> { self.check_type(field, FastType::U128, Cardinality::SingleValue)?; let bytes = self.fast_field_data(field, 0)?.read_bytes()?; Ok(open_u128::(bytes)?) From 534b1d33c3f4cb38c3a638ae5d8ca595c5969a74 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Fri, 7 Oct 2022 10:54:32 +0200 Subject: [PATCH 24/27] use ipv6 Co-authored-by: Paul Masurel --- src/fastfield/readers.rs | 2 +- src/schema/document.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs index d951d0afa8..712edae620 100644 --- a/src/fastfield/readers.rs +++ b/src/fastfield/readers.rs @@ -152,7 +152,7 @@ impl FastFieldReaders { /// Returns the `ip` fast field reader reader associated to `field`. /// /// If `field` is not a u128 fast field, this method returns an Error. - pub fn ip_addr(&self, field: Field) -> crate::Result>> { + pub fn ip_addr(&self, field: Field) -> crate::Result>> { self.check_type(field, FastType::U128, Cardinality::SingleValue)?; let bytes = self.fast_field_data(field, 0)?.read_bytes()?; Ok(open_u128::(bytes)?) diff --git a/src/schema/document.rs b/src/schema/document.rs index 4940c8778e..8ce9bb55d0 100644 --- a/src/schema/document.rs +++ b/src/schema/document.rs @@ -99,7 +99,7 @@ impl Document { } /// Add a IP address field - pub fn add_ip_addr(&mut self, field: Field, value: IpAddr) { + pub fn add_ip_addr(&mut self, field: Field, value: IpAddrV6) { self.add_field_value(field, value); } From b9b913510e84110f2731a51afe2facd1e99a807f Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 7 Oct 2022 16:56:19 +0800 Subject: [PATCH 25/27] fmt --- fastfield_codecs/src/monotonic_mapping.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fastfield_codecs/src/monotonic_mapping.rs b/fastfield_codecs/src/monotonic_mapping.rs index 0b93c5834c..c885203e29 100644 --- a/fastfield_codecs/src/monotonic_mapping.rs +++ b/fastfield_codecs/src/monotonic_mapping.rs @@ -39,8 +39,9 @@ pub trait StrictlyMonotonicFn { /// /// # Warning /// -/// This type comes with a footgun. A type being strictly monotonic does not impose that the inverse mapping is strictly monotonic over the entire space External. e.g. a -> a * 2. -/// Use at your own risks. +/// This type comes with a footgun. A type being strictly monotonic does not impose that the inverse +/// mapping is strictly monotonic over the entire space External. e.g. a -> a * 2. Use at your own +/// risks. 
pub(crate) struct StrictlyMonotonicMappingInverter { orig_mapping: T, } @@ -102,7 +103,7 @@ where T: MonotonicallyMappableToU64 } /// Mapping dividing by gcd and a base value. -/// +/// /// The function is assumed to be only called on values divided by passed /// gcd value. (It is necessary for the function to be monotonic.) pub(crate) struct StrictlyMonotonicMappingToInternalGCDBaseval { From b2ca83a93cecc72402dff1c9cbed56aad7e11911 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 7 Oct 2022 18:22:11 +0800 Subject: [PATCH 26/27] switch to ipv6, add monotonic_mapping tests --- fastfield_codecs/src/monotonic_mapping.rs | 20 ++++- .../src/monotonic_mapping_u128.rs | 14 ++-- src/fastfield/readers.rs | 11 ++- src/indexer/index_writer.rs | 14 ++-- src/schema/document.rs | 10 ++- src/schema/field_type.rs | 73 ++++++++++--------- src/schema/value.rs | 24 +++--- 7 files changed, 97 insertions(+), 69 deletions(-) diff --git a/fastfield_codecs/src/monotonic_mapping.rs b/fastfield_codecs/src/monotonic_mapping.rs index c885203e29..ebd34e2fad 100644 --- a/fastfield_codecs/src/monotonic_mapping.rs +++ b/fastfield_codecs/src/monotonic_mapping.rs @@ -202,14 +202,30 @@ impl MonotonicallyMappableToU64 for f64 { #[cfg(test)] mod tests { + use super::*; #[test] fn strictly_monotonic_test() { + // identity mapping + test_round_trip(&StrictlyMonotonicMappingToInternal::::new(), 100u64); + // round trip to i64 + test_round_trip(&StrictlyMonotonicMappingToInternal::::new(), 100u64); + // identity mapping + test_round_trip(&StrictlyMonotonicMappingToInternal::::new(), 100u128); + + // base value to i64 round trip + let mapping = StrictlyMonotonicMappingToInternalBaseval::new(100); + test_round_trip::<_, _, u64>(&mapping, 100i64); + // base value and gcd to u64 round trip let mapping = StrictlyMonotonicMappingToInternalGCDBaseval::new(10, 100); + test_round_trip::<_, _, u64>(&mapping, 100u64); + } - let test_val = 100u64; - assert_eq!(mapping.mapping(test_val), 0); + fn test_round_trip, K: std::fmt::Debug + Eq + Copy, L>( + mapping: &T, + test_val: K, + ) { assert_eq!(mapping.inverse(mapping.mapping(test_val)), test_val); } } diff --git a/fastfield_codecs/src/monotonic_mapping_u128.rs b/fastfield_codecs/src/monotonic_mapping_u128.rs index 9f8c0d8cb9..979d6c8c39 100644 --- a/fastfield_codecs/src/monotonic_mapping_u128.rs +++ b/fastfield_codecs/src/monotonic_mapping_u128.rs @@ -1,4 +1,4 @@ -use std::net::{IpAddr, Ipv6Addr}; +use std::net::Ipv6Addr; pub trait MonotonicallyMappableToU128: 'static + PartialOrd + Copy + Send + Sync { /// Converts a value to u128. 
@@ -23,20 +23,16 @@ impl MonotonicallyMappableToU128 for u128 { } } -impl MonotonicallyMappableToU128 for IpAddr { +impl MonotonicallyMappableToU128 for Ipv6Addr { fn to_u128(self) -> u128 { ip_to_u128(self) } fn from_u128(val: u128) -> Self { - IpAddr::from(val.to_be_bytes()) + Ipv6Addr::from(val.to_be_bytes()) } } -fn ip_to_u128(ip_addr: IpAddr) -> u128 { - let ip_addr_v6: Ipv6Addr = match ip_addr { - IpAddr::V4(v4) => v4.to_ipv6_mapped(), - IpAddr::V6(v6) => v6, - }; - u128::from_be_bytes(ip_addr_v6.octets()) +fn ip_to_u128(ip_addr: Ipv6Addr) -> u128 { + u128::from_be_bytes(ip_addr.octets()) } diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs index 712edae620..257c8345a0 100644 --- a/src/fastfield/readers.rs +++ b/src/fastfield/readers.rs @@ -1,4 +1,4 @@ -use std::net::IpAddr; +use std::net::Ipv6Addr; use std::sync::Arc; use fastfield_codecs::{open, open_u128, Column}; @@ -155,18 +155,21 @@ impl FastFieldReaders { pub fn ip_addr(&self, field: Field) -> crate::Result>> { self.check_type(field, FastType::U128, Cardinality::SingleValue)?; let bytes = self.fast_field_data(field, 0)?.read_bytes()?; - Ok(open_u128::(bytes)?) + Ok(open_u128::(bytes)?) } /// Returns the `ip` fast field reader reader associated to `field`. /// /// If `field` is not a u128 fast field, this method returns an Error. - pub fn ip_addrs(&self, field: Field) -> crate::Result> { + pub fn ip_addrs( + &self, + field: Field, + ) -> crate::Result> { self.check_type(field, FastType::U128, Cardinality::MultiValues)?; let idx_reader: Arc> = self.typed_fast_field_reader(field)?; let bytes = self.fast_field_data(field, 1)?.read_bytes()?; - let vals_reader = open_u128::(bytes)?; + let vals_reader = open_u128::(bytes)?; Ok(MultiValuedU128FastFieldReader::open( idx_reader, diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index ba57fce360..e7cd65574f 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -803,7 +803,7 @@ impl Drop for IndexWriter { #[cfg(test)] mod tests { use std::collections::{HashMap, HashSet}; - use std::net::IpAddr; + use std::net::Ipv6Addr; use fastfield_codecs::MonotonicallyMappableToU128; use proptest::prelude::*; @@ -1655,7 +1655,7 @@ mod tests { match op { IndexingOp::AddDoc { id } => { let facet = Facet::from(&("/cola/".to_string() + &id.to_string())); - let ip_from_id = IpAddr::from_u128(id as u128); + let ip_from_id = Ipv6Addr::from_u128(id as u128); if id % 3 == 0 { // every 3rd doc has no ip field @@ -1772,14 +1772,14 @@ mod tests { ); // Load all ips addr - let ips: HashSet = searcher + let ips: HashSet = searcher .segment_readers() .iter() .flat_map(|segment_reader| { let ff_reader = segment_reader.fast_fields().ip_addr(ip_field).unwrap(); segment_reader.doc_ids_alive().flat_map(move |doc| { let val = ff_reader.get_val(doc as u64); - if val == IpAddr::from_u128(0) { + if val == Ipv6Addr::from_u128(0) { // TODO Fix null handling None } else { @@ -1795,7 +1795,7 @@ mod tests { if id % 3 == 0 { None } else { - Some(IpAddr::from_u128(*id as u128)) + Some(Ipv6Addr::from_u128(*id as u128)) } }) .collect::>(); @@ -1807,11 +1807,11 @@ mod tests { if id % 3 == 0 { None } else { - Some(IpAddr::from_u128(*id as u128)) + Some(Ipv6Addr::from_u128(*id as u128)) } }) .collect::>(); - let ips: HashSet = searcher + let ips: HashSet = searcher .segment_readers() .iter() .flat_map(|segment_reader| { diff --git a/src/schema/document.rs b/src/schema/document.rs index 8ce9bb55d0..5c542bebf9 100644 --- a/src/schema/document.rs +++ b/src/schema/document.rs @@ -1,7 
+1,7 @@
 use std::collections::{HashMap, HashSet};
 use std::io::{self, Read, Write};
 use std::mem;
-use std::net::IpAddr;
+use std::net::Ipv6Addr;
 
 use common::{BinarySerializable, VInt};
 
@@ -76,7 +76,9 @@ impl Document {
 
     /// Adding a facet to the document.
     pub fn add_facet<F>(&mut self, field: Field, path: F)
-    where Facet: From<F> {
+    where
+        Facet: From<F>,
+    {
         let facet = Facet::from(path);
         let value = Value::Facet(facet);
         self.add_field_value(field, value);
@@ -98,8 +100,8 @@ impl Document {
         self.add_field_value(field, value);
     }
 
-    /// Add a IP address field
-    pub fn add_ip_addr(&mut self, field: Field, value: IpAddrV6) {
+    /// Add an IP address field. Internally, only `Ipv6Addr` is used.
+    pub fn add_ip_addr(&mut self, field: Field, value: Ipv6Addr) {
         self.add_field_value(field, value);
     }
 
diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs
index b357287910..557d2ec4d4 100644
--- a/src/schema/field_type.rs
+++ b/src/schema/field_type.rs
@@ -1,4 +1,4 @@
-use std::net::IpAddr;
+use std::net::{IpAddr, Ipv6Addr};
 use std::str::FromStr;
 
 use serde::{Deserialize, Serialize};
@@ -320,43 +320,50 @@ impl FieldType {
     /// target field is a `Str`, this method will return an Error.
     pub fn value_from_json(&self, json: JsonValue) -> Result<Value, ValueParsingError> {
         match json {
-            JsonValue::String(field_text) => match self {
-                FieldType::Date(_) => {
-                    let dt_with_fixed_tz =
-                        OffsetDateTime::parse(&field_text, &Rfc3339).map_err(|_err| {
-                            ValueParsingError::TypeError {
+            JsonValue::String(field_text) => {
+                match self {
+                    FieldType::Date(_) => {
+                        let dt_with_fixed_tz = OffsetDateTime::parse(&field_text, &Rfc3339)
+                            .map_err(|_err| ValueParsingError::TypeError {
                                 expected: "rfc3339 format",
                                 json: JsonValue::String(field_text),
+                            })?;
+                        Ok(DateTime::from_utc(dt_with_fixed_tz).into())
+                    }
+                    FieldType::Str(_) => Ok(Value::Str(field_text)),
+                    FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => {
+                        Err(ValueParsingError::TypeError {
+                            expected: "an integer",
+                            json: JsonValue::String(field_text),
+                        })
+                    }
+                    FieldType::Bool(_) => Err(ValueParsingError::TypeError {
+                        expected: "a boolean",
+                        json: JsonValue::String(field_text),
+                    }),
+                    FieldType::Facet(_) => Ok(Value::Facet(Facet::from(&field_text))),
+                    FieldType::Bytes(_) => base64::decode(&field_text)
+                        .map(Value::Bytes)
+                        .map_err(|_| ValueParsingError::InvalidBase64 { base64: field_text }),
+                    FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
+                        expected: "a json object",
+                        json: JsonValue::String(field_text),
+                    }),
+                    FieldType::IpAddr(_) => {
+                        let ip_addr: IpAddr = IpAddr::from_str(&field_text).map_err(|err| {
+                            ValueParsingError::ParseError {
+                                error: err.to_string(),
+                                json: JsonValue::String(field_text),
                             }
                         })?;
-                    Ok(DateTime::from_utc(dt_with_fixed_tz).into())
-                }
-                FieldType::Str(_) => Ok(Value::Str(field_text)),
-                FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => {
-                    Err(ValueParsingError::TypeError {
-                        expected: "an integer",
-                        json: JsonValue::String(field_text),
-                    })
+                        let ip_addr_v6: Ipv6Addr = match ip_addr {
+                            IpAddr::V4(v4) => v4.to_ipv6_mapped(),
+                            IpAddr::V6(v6) => v6,
+                        };
+                        Ok(Value::IpAddr(ip_addr_v6))
+                    }
                 }
-                FieldType::Bool(_) => Err(ValueParsingError::TypeError {
-                    expected: "a boolean",
-                    json: JsonValue::String(field_text),
-                }),
-                FieldType::Facet(_) => Ok(Value::Facet(Facet::from(&field_text))),
-                FieldType::Bytes(_) => base64::decode(&field_text)
-                    .map(Value::Bytes)
-                    .map_err(|_| ValueParsingError::InvalidBase64 { base64: field_text }),
-                FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
-                    expected: "a json object",
-                    json: JsonValue::String(field_text),
-                }),
-                FieldType::IpAddr(_) => Ok(Value::IpAddr(IpAddr::from_str(&field_text).map_err(
-                    |err| ValueParsingError::ParseError {
-                        error: err.to_string(),
-                        json: JsonValue::String(field_text),
-                    },
-                )?)),
-            },
+            }
             JsonValue::Number(field_val_num) => match self {
                 FieldType::I64(_) | FieldType::Date(_) => {
                     if let Some(field_val_i64) = field_val_num.as_i64() {
diff --git a/src/schema/value.rs b/src/schema/value.rs
index 559b0f2ea8..c863f0302b 100644
--- a/src/schema/value.rs
+++ b/src/schema/value.rs
@@ -1,5 +1,5 @@
 use std::fmt;
-use std::net::IpAddr;
+use std::net::Ipv6Addr;
 
 use serde::de::Visitor;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
@@ -33,15 +33,17 @@ pub enum Value {
     Bytes(Vec<u8>),
     /// Json object value.
     JsonObject(serde_json::Map<String, serde_json::Value>),
-    /// Ip Address value
-    IpAddr(IpAddr),
+    /// IPv6 address. Internally there is no IPv4; it needs to be converted to `Ipv6Addr`.
+    IpAddr(Ipv6Addr),
 }
 
 impl Eq for Value {}
 
 impl Serialize for Value {
     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where S: Serializer {
+    where
+        S: Serializer,
+    {
         match *self {
             Value::Str(ref v) => serializer.serialize_str(v),
             Value::PreTokStr(ref v) => v.serialize(serializer),
@@ -60,7 +62,9 @@ impl Serialize for Value {
 
 impl<'de> Deserialize<'de> for Value {
     fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where D: Deserializer<'de> {
+    where
+        D: Deserializer<'de>,
+    {
         struct ValueVisitor;
 
         impl<'de> Visitor<'de> for ValueVisitor {
@@ -208,7 +212,7 @@ impl Value {
 
     /// Returns the ip addr, provided the value is of the `Ip` type.
     /// (Returns None if the value is not of the `Ip` type)
-    pub fn as_ip_addr(&self) -> Option<IpAddr> {
+    pub fn as_ip_addr(&self) -> Option<Ipv6Addr> {
         if let Value::IpAddr(val) = self {
             Some(*val)
         } else {
@@ -223,8 +227,8 @@ impl From for Value {
     }
 }
 
-impl From<IpAddr> for Value {
-    fn from(v: IpAddr) -> Value {
+impl From<Ipv6Addr> for Value {
+    fn from(v: Ipv6Addr) -> Value {
         Value::IpAddr(v)
     }
 }
@@ -308,7 +312,7 @@ impl From for Value {
 
 mod binary_serialize {
     use std::io::{self, Read, Write};
-    use std::net::IpAddr;
+    use std::net::Ipv6Addr;
 
     use common::{f64_to_u64, u64_to_f64, BinarySerializable};
     use fastfield_codecs::MonotonicallyMappableToU128;
@@ -465,7 +469,7 @@
             }
             IP_CODE => {
                 let value = u128::deserialize(reader)?;
-                Ok(Value::IpAddr(IpAddr::from_u128(value)))
+                Ok(Value::IpAddr(Ipv6Addr::from_u128(value)))
             }
             _ => Err(io::Error::new(

From 5c9cbee29d2ea91d1502698ea21c2f481723cdd5 Mon Sep 17 00:00:00 2001
From: Pascal Seitz
Date: Fri, 7 Oct 2022 19:52:00 +0800
Subject: [PATCH 27/27] handle IpV4 serialization case

---
 src/schema/document.rs |  4 +---
 src/schema/schema.rs   | 35 +++++++++++++++++++++++++++++++++++
 src/schema/value.rs    | 17 ++++++++++-------
 3 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/src/schema/document.rs b/src/schema/document.rs
index 5c542bebf9..253c38081d 100644
--- a/src/schema/document.rs
+++ b/src/schema/document.rs
@@ -76,9 +76,7 @@ impl Document {
 
     /// Adding a facet to the document.
     pub fn add_facet<F>(&mut self, field: Field, path: F)
-    where
-        Facet: From<F>,
-    {
+    where Facet: From<F> {
         let facet = Facet::from(path);
         let value = Value::Facet(facet);
         self.add_field_value(field, value);
diff --git a/src/schema/schema.rs b/src/schema/schema.rs
index 0884ef71be..3b7fd22d0c 100644
--- a/src/schema/schema.rs
+++ b/src/schema/schema.rs
@@ -619,12 +619,14 @@
         schema_builder.add_text_field("title", TEXT);
         schema_builder.add_text_field("author", STRING);
         schema_builder.add_u64_field("count", count_options);
+        schema_builder.add_ip_addr_field("ip", FAST | STORED);
         schema_builder.add_bool_field("is_read", is_read_options);
         let schema = schema_builder.build();
         let doc_json = r#"{
            "title": "my title",
            "author": "fulmicoton",
            "count": 4,
+           "ip": "127.0.0.1",
            "is_read": true
        }"#;
         let doc = schema.parse_document(doc_json).unwrap();
@@ -633,6 +635,39 @@
         assert_eq!(doc, doc_serdeser);
     }
 
+    #[test]
+    pub fn test_document_to_ipv4_json() {
+        let mut schema_builder = Schema::builder();
+        schema_builder.add_ip_addr_field("ip", FAST | STORED);
+        let schema = schema_builder.build();
+
+        // IPv4 loopback
+        let doc_json = r#"{
+            "ip": "127.0.0.1"
+        }"#;
+        let doc = schema.parse_document(doc_json).unwrap();
+        let value: serde_json::Value = serde_json::from_str(&schema.to_json(&doc)).unwrap();
+        assert_eq!(value["ip"][0], "127.0.0.1");
+
+        // Special case: the IPv6 loopback `::1` must not be mapped to IPv4.
+        let doc_json = r#"{
+            "ip": "::1"
+        }"#;
+        let doc = schema.parse_document(doc_json).unwrap();
+
+        let value: serde_json::Value = serde_json::from_str(&schema.to_json(&doc)).unwrap();
+        assert_eq!(value["ip"][0], "::1");
+
+        // Test a private IPv4 address (192.168.0.1).
+        let doc_json = r#"{
+            "ip": "192.168.0.1"
+        }"#;
+        let doc = schema.parse_document(doc_json).unwrap();
+
+        let value: serde_json::Value = serde_json::from_str(&schema.to_json(&doc)).unwrap();
+        assert_eq!(value["ip"][0], "192.168.0.1");
+    }
+
     #[test]
     pub fn test_document_from_nameddoc() {
         let mut schema_builder = Schema::builder();
diff --git a/src/schema/value.rs b/src/schema/value.rs
index c863f0302b..a92e093fb9 100644
--- a/src/schema/value.rs
+++ b/src/schema/value.rs
@@ -41,9 +41,7 @@ impl Eq for Value {}
 
 impl Serialize for Value {
     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: Serializer,
-    {
+    where S: Serializer {
         match *self {
             Value::Str(ref v) => serializer.serialize_str(v),
             Value::PreTokStr(ref v) => v.serialize(serializer),
@@ -55,16 +53,21 @@ impl Serialize for Value {
             Value::Facet(ref facet) => facet.serialize(serializer),
             Value::Bytes(ref bytes) => serializer.serialize_bytes(bytes),
             Value::JsonObject(ref obj) => obj.serialize(serializer),
-            Value::IpAddr(ref obj) => obj.serialize(serializer),
+            Value::IpAddr(ref obj) => {
+                // Serialize IPv4-mapped addresses as IPv4; real IPv6 addresses (e.g. the loopback `::1`) stay IPv6.
+                if let Some(ip_v4) = obj.to_ipv4_mapped() {
+                    ip_v4.serialize(serializer)
+                } else {
+                    obj.serialize(serializer)
+                }
+            }
         }
     }
 }
 
 impl<'de> Deserialize<'de> for Value {
     fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: Deserializer<'de>,
-    {
+    where D: Deserializer<'de> {
         struct ValueVisitor;
 
         impl<'de> Visitor<'de> for ValueVisitor {