Skip to content

Commit

Permalink
apply gcd on fastfield as preprocessing
Browse files Browse the repository at this point in the history
  • Loading branch information
PSeitz committed Jul 20, 2022
1 parent 23fe73a commit 4592fc7
Show file tree
Hide file tree
Showing 13 changed files with 591 additions and 111 deletions.
2 changes: 2 additions & 0 deletions Cargo.toml
Expand Up @@ -60,6 +60,8 @@ pretty_assertions = "1.2.1"
serde_cbor = { version = "0.11.2", optional = true }
async-trait = "0.1.53"
arc-swap = "1.5.0"
gcd = "2.1.0"
libdivide = "0.4.0"

[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"
Expand Down
6 changes: 3 additions & 3 deletions fastfield_codecs/src/bitpacked.rs
Expand Up @@ -105,9 +105,9 @@ impl FastFieldCodecSerializer for BitpackedFastFieldSerializer {
/// It requires a `min_value` and a `max_value` to compute
/// the minimum number of bits required to encode
/// values.
fn serialize(
write: &mut impl Write,
_fastfield_accessor: &impl FastFieldDataAccess,
fn serialize<W: Write>(
write: &mut W,
_fastfield_accessor: &dyn FastFieldDataAccess,
stats: FastFieldStats,
data_iter: impl Iterator<Item = u64>,
_data_iter1: impl Iterator<Item = u64>,
Expand Down
10 changes: 6 additions & 4 deletions fastfield_codecs/src/lib.rs
Expand Up @@ -40,13 +40,15 @@ pub trait FastFieldCodecSerializer {
/// Serializes the data using the serializer into write.
/// There are multiple iterators, in case the codec needs to read the data multiple times.
/// The iterators should be preferred over using fastfield_accessor for performance reasons.
fn serialize(
write: &mut impl Write,
fastfield_accessor: &impl FastFieldDataAccess,
fn serialize<W>(
write: &mut W,
fastfield_accessor: &dyn FastFieldDataAccess,
stats: FastFieldStats,
data_iter: impl Iterator<Item = u64>,
data_iter1: impl Iterator<Item = u64>,
) -> io::Result<()>;
) -> io::Result<()>
where
W: Write;
}

/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation.
Expand Down
11 changes: 7 additions & 4 deletions fastfield_codecs/src/linearinterpol.rs
Expand Up @@ -109,13 +109,16 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
const NAME: &'static str = "LinearInterpol";
const ID: u8 = 2;
/// Serializes the fast field data into `write`.
fn serialize(
write: &mut impl Write,
fastfield_accessor: &impl FastFieldDataAccess,
fn serialize<W>(
write: &mut W,
fastfield_accessor: &dyn FastFieldDataAccess,
stats: FastFieldStats,
data_iter: impl Iterator<Item = u64>,
data_iter1: impl Iterator<Item = u64>,
) -> io::Result<()> {
) -> io::Result<()>
where
W: Write,
{
assert!(stats.min_value <= stats.max_value);

let first_val = fastfield_accessor.get_val(0);
Expand Down
12 changes: 8 additions & 4 deletions fastfield_codecs/src/multilinearinterpol.rs
Expand Up @@ -75,6 +75,7 @@ impl BinarySerializable for Function {
self.positive_val_offset.serialize(write)?;
self.slope.serialize(write)?;
self.num_bits.serialize(write)?;

Ok(())
}

Expand Down Expand Up @@ -193,13 +194,16 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
const NAME: &'static str = "MultiLinearInterpol";
const ID: u8 = 3;
/// Serializes the fast field data into `write`.
fn serialize(
write: &mut impl Write,
fastfield_accessor: &impl FastFieldDataAccess,
fn serialize<W>(
write: &mut W,
fastfield_accessor: &dyn FastFieldDataAccess,
stats: FastFieldStats,
data_iter: impl Iterator<Item = u64>,
_data_iter1: impl Iterator<Item = u64>,
) -> io::Result<()> {
) -> io::Result<()>
where
W: Write,
{
assert!(stats.min_value <= stats.max_value);

let first_val = fastfield_accessor.get_val(0);
Expand Down
2 changes: 1 addition & 1 deletion src/fastfield/bytes/mod.rs
Expand Up @@ -11,7 +11,7 @@ mod tests {
use crate::{DocAddress, DocSet, Index, Searcher, Term};

#[test]
fn test_bytes() -> crate::Result<()> {
fn test_bytes2() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let bytes_field = schema_builder.add_bytes_field("bytesfield", FAST);
let schema = schema_builder.build();
Expand Down
90 changes: 75 additions & 15 deletions src/fastfield/mod.rs
Expand Up @@ -276,10 +276,17 @@ mod tests {
schema_builder.build()
});

/// Lazily-built schema containing a single fast `i64` field named `"field"`.
pub static SCHEMAI64: Lazy<Schema> = Lazy::new(|| {
    let mut builder = Schema::builder();
    builder.add_i64_field("field", FAST);
    builder.build()
});

/// Handle of the `"field"` entry of `SCHEMA`, resolved lazily on first access.
/// Panics on first access if the schema has no such field.
pub static FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("field").unwrap());
/// Handle of the `"field"` entry of `SCHEMAI64`, resolved lazily on first access.
/// Panics on first access if the schema has no such field.
pub static FIELDI64: Lazy<Field> = Lazy::new(|| SCHEMAI64.get_field("field").unwrap());

#[test]
pub fn test_fastfield() {
pub fn test_fastfield2() {
let test_fastfield = DynamicFastFieldReader::<u64>::from(vec![100, 200, 300]);
assert_eq!(test_fastfield.get(0), 100);
assert_eq!(test_fastfield.get(1), 200);
Expand Down Expand Up @@ -309,7 +316,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 37);
assert_eq!(file.len(), 55);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?;
Expand Down Expand Up @@ -340,7 +347,7 @@ mod tests {
serializer.close()?;
}
let file = directory.open_read(path)?;
assert_eq!(file.len(), 62);
assert_eq!(file.len(), 80);
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
Expand Down Expand Up @@ -376,7 +383,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 35);
assert_eq!(file.len(), 53);
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
Expand Down Expand Up @@ -408,7 +415,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 80043);
assert_eq!(file.len(), 80061);
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
Expand Down Expand Up @@ -448,7 +455,8 @@ mod tests {
}
let file = directory.open_read(path).unwrap();
// assert_eq!(file.len(), 17710 as usize); //bitpacked size
assert_eq!(file.len(), 10175_usize); // linear interpol size
// assert_eq!(file.len(), 10201_usize); // linear interpol size, before gcd = min_value
assert_eq!(file.len(), 93_usize);
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(i64_field).unwrap();
Expand Down Expand Up @@ -505,10 +513,15 @@ mod tests {
permutation
}

#[test]
fn test_intfastfield_permutation() -> crate::Result<()> {
/// Returns the multiples of 1000 from 1_000 to 99_999_000 (inclusive) in shuffled order.
///
/// Warning: the RNG seed is fixed, so every call produces the same permutation.
pub fn generate_permutation_gcd() -> Vec<u64> {
    let mut values: Vec<u64> = (1u64..100_000u64).map(|el| el * 1000).collect();
    let mut rng = StdRng::from_seed([1u8; 32]);
    values.shuffle(&mut rng);
    values
}

fn test_intfastfield_permutation_with_data(permutation: Vec<u64>) -> crate::Result<()> {
let path = Path::new("test");
let permutation = generate_permutation();
let n = permutation.len();
let directory = RamDirectory::create();
{
Expand All @@ -527,15 +540,27 @@ mod tests {
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;

let mut a = 0u64;
for _ in 0..n {
for a in 0..n {
assert_eq!(fast_field_reader.get(a as u32), permutation[a as usize]);
a = fast_field_reader.get(a as u32);
}
}
Ok(())
}

/// Round-trips the shuffled multiples-of-1000 data set through the fast field
/// writer and reader.
#[test]
fn test_intfastfield_permutation_gcd() -> crate::Result<()> {
    test_intfastfield_permutation_with_data(generate_permutation_gcd())
}

/// Round-trips the data set produced by `generate_permutation` through the
/// fast field writer and reader.
#[test]
fn test_intfastfield_permutation() -> crate::Result<()> {
    test_intfastfield_permutation_with_data(generate_permutation())
}

#[test]
fn test_merge_missing_date_fast_field() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
Expand Down Expand Up @@ -861,7 +886,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 36);
assert_eq!(file.len(), 54);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(field).unwrap();
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
Expand Down Expand Up @@ -897,7 +922,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 48);
assert_eq!(file.len(), 66);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(field).unwrap();
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
Expand Down Expand Up @@ -931,7 +956,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 35);
assert_eq!(file.len(), 53);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(field).unwrap();
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
Expand All @@ -946,6 +971,7 @@ mod bench {
use std::collections::HashMap;
use std::path::Path;

use crate::fastfield::tests::generate_permutation_gcd;
use test::{self, Bencher};

use super::tests::{generate_permutation, FIELD, SCHEMA};
Expand Down Expand Up @@ -1046,4 +1072,38 @@ mod bench {
});
}
}

/// Benchmarks data-dependent fast field lookups over values that share a
/// common factor of 1000 (see `generate_permutation_gcd`).
///
/// The data set is written into a `RamDirectory`, read back through a
/// `DynamicFastFieldReader<u64>`, and each iteration performs 1000 chained
/// lookups where the next doc id is derived from the previously read value.
#[bench]
fn bench_intfastfield_fflookup_gcd(b: &mut Bencher) {
    let path = Path::new("test");
    let permutation = generate_permutation_gcd();
    // One document is added per permutation entry, so this is the number of
    // valid doc ids.
    let num_docs = permutation.len() as u64;
    let directory: RamDirectory = RamDirectory::create();
    {
        let write: WritePtr = directory.open_write(path).unwrap();
        let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
        let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
        for &x in &permutation {
            fast_field_writers.add_document(&doc!(*FIELD=>x));
        }
        fast_field_writers
            .serialize(&mut serializer, &HashMap::new(), None)
            .unwrap();
        serializer.close().unwrap();
    }
    let file = directory.open_read(path).unwrap();
    {
        let fast_fields_composite = CompositeFile::open(&file).unwrap();
        let data = fast_fields_composite.open_read(*FIELD).unwrap();
        let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();

        b.iter(|| {
            let n = test::black_box(1000u32);
            let mut a = 0u32;
            for _ in 0u32..n {
                // The stored values are multiples of 1000 up to 99_999_000,
                // far beyond the number of documents. Fold each value back
                // into the valid doc id range so the chained lookup never
                // asks for a document that was not written.
                a = (fast_field_reader.get(a) % num_docs) as u32;
            }
            a
        });
    }
}
}
7 changes: 7 additions & 0 deletions src/fastfield/multivalued/mod.rs
Expand Up @@ -346,6 +346,13 @@ mod tests {
assert!(test_multivalued_no_panic(&ops[..]).is_ok());
}
}
/// Regression case: two documents with the same id followed by a merge must
/// not make `test_multivalued_no_panic` fail.
#[test]
fn test_multivalued_proptest_gcd() {
    use IndexingOp::*;
    let operations = [AddDoc { id: 9 }, AddDoc { id: 9 }, Merge];
    let outcome = test_multivalued_no_panic(&operations[..]);

    assert!(outcome.is_ok());
}

#[test]
fn test_multivalued_proptest_off_by_one_bug_1151() {
Expand Down

0 comments on commit 4592fc7

Please sign in to comment.