
Commit

Merge branch 'main' into support-datetime-field
evanxg852000 committed Jul 11, 2022
2 parents b4a80b0 + 2406d92 commit e61a59f
Showing 35 changed files with 300 additions and 129 deletions.
15 changes: 10 additions & 5 deletions .github/workflows/long_running.yml
@@ -9,16 +9,21 @@ env:
NUM_FUNCTIONAL_TEST_ITERATIONS: 20000

jobs:
functional_test_unsorted:
test:

runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
- name: Install stable
uses: actions-rs/toolchain@v1
with:
toolchain: stable
override: true
components: rustfmt, clippy

- name: Run indexing_unsorted
run: cargo test indexing_unsorted -- --ignored
functional_test_sorted:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Run indexing_sorted
run: cargo test indexing_sorted -- --ignored

7 changes: 4 additions & 3 deletions .github/workflows/test.yml
@@ -16,22 +16,23 @@ jobs:

steps:
- uses: actions/checkout@v3
- name: Build
run: cargo build --verbose --workspace
- name: Install latest nightly to test also against unstable feature flag
uses: actions-rs/toolchain@v1
with:
toolchain: nightly
override: true
components: rustfmt

- name: Install latest nightly to test also against unstable feature flag
- name: Install stable
uses: actions-rs/toolchain@v1
with:
toolchain: stable
override: true
components: rustfmt, clippy

- name: Build
run: cargo build --verbose --workspace

- name: Run tests
run: cargo +stable test --features mmap,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints --verbose --workspace

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -71,7 +71,7 @@ proptest = "1.0.0"
criterion = "0.3.5"
test-log = "0.2.10"
env_logger = "0.9.0"
pprof = { version = "0.9.0", features = ["flamegraph", "criterion"] }
pprof = { version = "0.10.0", features = ["flamegraph", "criterion"] }
futures = "0.3.21"

[dev-dependencies.fail]
2 changes: 1 addition & 1 deletion bitpacker/Cargo.toml
@@ -1,7 +1,7 @@
[package]
name = "tantivy-bitpacker"
version = "0.2.0"
edition = "2018"
edition = "2021"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = []
2 changes: 1 addition & 1 deletion common/Cargo.toml
@@ -3,7 +3,7 @@ name = "tantivy-common"
version = "0.3.0"
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2018"
edition = "2021"
description = "common traits and utility functions used by multiple tantivy subcrates"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
2 changes: 1 addition & 1 deletion fastfield_codecs/Cargo.toml
@@ -3,7 +3,7 @@ name = "fastfield_codecs"
version = "0.2.0"
authors = ["Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2018"
edition = "2021"
description = "Fast field codecs used by tantivy"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
2 changes: 1 addition & 1 deletion ownedbytes/Cargo.toml
@@ -2,7 +2,7 @@
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
name = "ownedbytes"
version = "0.3.0"
edition = "2018"
edition = "2021"
description = "Expose data as static slice"
license = "MIT"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
4 changes: 2 additions & 2 deletions query-grammar/Cargo.toml
@@ -9,9 +9,9 @@ homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2018"
edition = "2021"

[dependencies]
combine = {version="4", default-features=false, features=[] }
once_cell = "1.7.2"
regex ={ version = "1.5.4", default-features = false, features = ["std"] }
regex ={ version = "1.5.4", default-features = false, features = ["std", "unicode"] }
63 changes: 49 additions & 14 deletions query-grammar/src/query_grammar.rs
@@ -16,9 +16,9 @@ use crate::Occur;
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to
// special characters.
const SPECIAL_CHARS: &[char] = &[
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '~', '!', '\\', '*', ' ',
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '!', '\\', '*', ' ',
];
const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|\~|!|\\|\*|\s)"#;
const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|!|\\|\*|\s)"#;

/// Parses a field_name
/// A field name must have at least one character and be followed by a colon.
@@ -120,22 +120,36 @@ fn date_time<'a>() -> impl Parser<&'a str, Output = String> {

fn term_val<'a>() -> impl Parser<&'a str, Output = String> {
let phrase = char('"').with(many1(satisfy(|c| c != '"'))).skip(char('"'));
phrase.or(word())
negative_number().or(phrase.or(word()))
}

fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> {
let term_val_with_field = negative_number().or(term_val());
(field_name(), term_val_with_field).map(|(field_name, phrase)| UserInputLiteral {
(field_name(), term_val(), slop_val()).map(|(field_name, phrase, slop)| UserInputLiteral {
field_name: Some(field_name),
phrase,
slop,
})
}

fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
let slop =
(char('~'), many1(digit())).and_then(|(_, slop): (_, String)| match slop.parse::<u32>() {
Ok(d) => Ok(d),
_ => Err(StringStreamError::UnexpectedParse),
});
optional(slop).map(|slop| match slop {
Some(d) => d,
_ => 0,
})
}

fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
let term_default_field = term_val().map(|phrase| UserInputLiteral {
let term_default_field = (term_val(), slop_val()).map(|(phrase, slop)| UserInputLiteral {
field_name: None,
phrase,
slop,
});

attempt(term_query())
.or(term_default_field)
.map(UserInputLeaf::from)
@@ -522,18 +536,10 @@ mod test {
super::field_name().parse(".my.field.name:a"),
Ok((".my.field.name".to_string(), "a"))
);
assert_eq!(
super::field_name().parse(r#"my\ field:a"#),
Ok(("my field".to_string(), "a"))
);
assert_eq!(
super::field_name().parse(r#"にんじん:a"#),
Ok(("にんじん".to_string(), "a"))
);
assert_eq!(
super::field_name().parse("my\\ field\\ name:a"),
Ok(("my field name".to_string(), "a"))
);
assert_eq!(
super::field_name().parse(r#"my\field:a"#),
Ok((r#"my\field"#.to_string(), "a"))
@@ -562,6 +568,17 @@ mod test {
super::field_name().parse("_my_field:a"),
Ok(("_my_field".to_string(), "a"))
);
assert_eq!(
super::field_name().parse("~my~field:a"),
Ok(("~my~field".to_string(), "a"))
);
for special_char in SPECIAL_CHARS.iter() {
let query = &format!("\\{special_char}my\\{special_char}field:a");
assert_eq!(
super::field_name().parse(&query),
Ok((format!("{special_char}my{special_char}field"), "a"))
);
}
}

#[test]
@@ -714,4 +731,22 @@ mod test {
);
test_is_parse_err("abc + ");
}

#[test]
fn test_slop() {
assert!(parse_to_ast().parse("\"a b\"~").is_err());
assert!(parse_to_ast().parse("foo:\"a b\"~").is_err());
assert!(parse_to_ast().parse("\"a b\"~a").is_err());
assert!(parse_to_ast().parse("\"a b\"~100000000000000000").is_err());

test_parse_query_to_ast_helper("\"a b\"^2~4", "(*(\"a b\")^2 *\"~4\")");
test_parse_query_to_ast_helper("\"~Document\"", "\"~Document\"");
test_parse_query_to_ast_helper("~Document", "\"~Document\"");
test_parse_query_to_ast_helper("a~2", "\"a~2\"");
test_parse_query_to_ast_helper("\"a b\"~0", "\"a b\"");
test_parse_query_to_ast_helper("\"a b\"~1", "\"a b\"~1");
test_parse_query_to_ast_helper("\"a b\"~3", "\"a b\"~3");
test_parse_query_to_ast_helper("foo:\"a b\"~300", "\"foo\":\"a b\"~300");
test_parse_query_to_ast_helper("\"a b\"~300^2", "(\"a b\"~300)^2");
}
}
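
The slop suffix exercised in `test_slop` can be reproduced in isolation. Below is a minimal, self-contained sketch of the `slop_val` combinator added above, assuming the same `combine` 4.x API the grammar already uses; it is an illustration of the parsing behaviour, not the crate's public API.

```rust
// Minimal sketch of the slop parser added above, assuming combine 4.x.
// It parses an optional `~<digits>` suffix into a u32 slop, defaulting to 0.
use combine::parser::char::{char, digit};
use combine::{many1, optional, Parser};

fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
    let slop = (char('~'), many1(digit())).and_then(|(_, digits): (_, String)| {
        digits
            .parse::<u32>()
            .map_err(|_| combine::error::StringStreamError::UnexpectedParse)
    });
    optional(slop).map(|slop| slop.unwrap_or(0))
}

fn main() {
    // `~3` yields a slop of 3; a missing suffix falls back to 0,
    // matching the `"a b"~3` and `"a b"` cases in the tests above.
    assert_eq!(slop_val().parse("~3").map(|(v, _)| v), Ok(3));
    assert_eq!(slop_val().parse("").map(|(v, _)| v), Ok(0));
}
```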
11 changes: 8 additions & 3 deletions query-grammar/src/user_input_ast.rs
@@ -40,14 +40,19 @@ impl Debug for UserInputLeaf {
pub struct UserInputLiteral {
pub field_name: Option<String>,
pub phrase: String,
pub slop: u32,
}

impl fmt::Debug for UserInputLiteral {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
match self.field_name {
Some(ref field_name) => write!(formatter, "\"{}\":\"{}\"", field_name, self.phrase),
None => write!(formatter, "\"{}\"", self.phrase),
if let Some(ref field) = self.field_name {
write!(formatter, "\"{}\":", field)?;
}
write!(formatter, "\"{}\"", self.phrase)?;
if self.slop > 0 {
write!(formatter, "~{}", self.slop)?;
}
Ok(())
}
}

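The reworked `Debug` impl only prints the field prefix and the `~slop` suffix when present. A small hedged sketch of the resulting output follows; the crate/import path is an assumption, and `UserInputLiteral` is taken to have exactly the fields shown in the hunk above.

```rust
// Hedged sketch of the new Debug formatting; the import path below is assumed,
// and the struct is taken to have only the fields shown in the diff.
use tantivy_query_grammar::UserInputLiteral;

fn main() {
    let with_field = UserInputLiteral {
        field_name: Some("foo".to_string()),
        phrase: "a b".to_string(),
        slop: 3,
    };
    // Field prefix and non-zero slop are both rendered.
    assert_eq!(format!("{:?}", with_field), r#""foo":"a b"~3"#);

    let bare = UserInputLiteral {
        field_name: None,
        phrase: "a b".to_string(),
        slop: 0,
    };
    // No field prefix, and a zero slop is omitted entirely.
    assert_eq!(format!("{:?}", bare), r#""a b""#);
}
```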
2 changes: 2 additions & 0 deletions src/aggregation/collector.rs
@@ -9,6 +9,7 @@ use crate::aggregation::agg_req_with_accessor::get_aggs_with_accessor_and_valida
use crate::collector::{Collector, SegmentCollector};
use crate::{SegmentReader, TantivyError};

/// The default max bucket count, before the aggregation fails.
pub const MAX_BUCKET_COUNT: u32 = 65000;

/// Collector for aggregations.
@@ -22,6 +23,7 @@ pub struct AggregationCollector {
impl AggregationCollector {
/// Create collector from aggregation request.
///
/// Aggregation fails when the total bucket count is higher than max_bucket_count.
/// max_bucket_count will default to `MAX_BUCKET_COUNT` (65000) when unset
pub fn from_aggs(agg: Aggregations, max_bucket_count: Option<u32>) -> Self {
Self {
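A minimal usage sketch of the documented default: passing `None` falls back to `MAX_BUCKET_COUNT`, while an explicit value caps the bucket count earlier. The module paths are assumptions for this version of the crate, and the aggregation request `agg` is taken to exist already.

```rust
// Hedged sketch: module paths are assumed, and `agg` is an already-built request.
use tantivy::aggregation::agg_req::Aggregations;
use tantivy::aggregation::AggregationCollector;

fn build_collector(agg: Aggregations, strict: bool) -> AggregationCollector {
    if strict {
        // Fail the aggregation once more than 10_000 buckets would be created.
        AggregationCollector::from_aggs(agg, Some(10_000))
    } else {
        // `None` falls back to MAX_BUCKET_COUNT (65_000).
        AggregationCollector::from_aggs(agg, None)
    }
}
```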
13 changes: 4 additions & 9 deletions src/aggregation/intermediate_agg_result.rs
@@ -280,11 +280,9 @@ impl IntermediateBucketResult {
.collect::<crate::Result<Vec<_>>>()?;

buckets.sort_by(|left, right| {
// TODO use total_cmp next stable rust release
left.from
.unwrap_or(f64::MIN)
.partial_cmp(&right.from.unwrap_or(f64::MIN))
.unwrap_or(Ordering::Equal)
.total_cmp(&right.from.unwrap_or(f64::MIN))
});
Ok(BucketResult::Range { buckets })
}
@@ -441,12 +439,9 @@ impl IntermediateTermBucketResult {
})
.collect::<crate::Result<Vec<_>>>()?;

buckets_with_val.sort_by(|(_, val1), (_, val2)| {
// TODO use total_cmp in next rust stable release
match &order {
Order::Desc => val2.partial_cmp(val1).unwrap_or(std::cmp::Ordering::Equal),
Order::Asc => val1.partial_cmp(val2).unwrap_or(std::cmp::Ordering::Equal),
}
buckets_with_val.sort_by(|(_, val1), (_, val2)| match &order {
Order::Desc => val2.total_cmp(val1),
Order::Asc => val1.total_cmp(val2),
});
buckets = buckets_with_val
.into_iter()
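The `partial_cmp(...).unwrap_or(Ordering::Equal)` fallback is replaced by `f64::total_cmp`, stabilized in Rust 1.62, which defines a total order over all floats. A standalone sketch of the ordering behaviour:

```rust
// Standalone sketch: total_cmp yields a total order, so sorting no longer needs
// an `unwrap_or(Ordering::Equal)` escape hatch for NaN comparisons.
fn main() {
    let mut froms = vec![2.5_f64, f64::NAN, 0.5, f64::MIN];
    froms.sort_by(|a, b| a.total_cmp(b));
    // f64::MIN sorts first, and a (positive) NaN sorts after every other value.
    assert_eq!(froms[0], f64::MIN);
    assert_eq!(froms[1], 0.5);
    assert_eq!(froms[2], 2.5);
    assert!(froms[3].is_nan());
}
```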
1 change: 1 addition & 0 deletions src/aggregation/mod.rs
@@ -166,6 +166,7 @@ use std::fmt::Display;

pub use collector::{
AggregationCollector, AggregationSegmentCollector, DistributedAggregationCollector,
MAX_BUCKET_COUNT,
};
use itertools::Itertools;
use serde::{Deserialize, Serialize};
4 changes: 2 additions & 2 deletions src/collector/facet_collector.rs
@@ -271,8 +271,8 @@ impl Collector for FacetCollector {
let mut facet_streamer = facet_reader.facet_dict().range().into_stream()?;
if facet_streamer.advance() {
'outer: loop {
// at the begining of this loop, facet_streamer
// is positionned on a term that has not been processed yet.
// at the beginning of this loop, facet_streamer
// is positioned on a term that has not been processed yet.
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
match skip_result {
SkipResult::Found => {
6 changes: 2 additions & 4 deletions src/collector/tests.rs
@@ -69,10 +69,8 @@ pub fn test_filter_collector() -> crate::Result<()> {

/// Stores all of the doc ids.
/// This collector is only used for tests.
/// It is unusable in pr
///
/// actise, as it does not store
/// the segment ordinals
/// It is unusable in practise, as it does
/// not store the segment ordinals
pub struct TestCollector {
pub compute_score: bool,
}
2 changes: 1 addition & 1 deletion src/collector/top_collector.rs
@@ -137,7 +137,7 @@
/// sorted by type `T`.
///
/// The implementation is based on a `BinaryHeap`.
/// The theorical complexity for collecting the top `K` out of `n` documents
/// The theoretical complexity for collecting the top `K` out of `n` documents
/// is `O(n log K)`.
pub(crate) struct TopSegmentCollector<T> {
limit: usize,
4 changes: 2 additions & 2 deletions src/collector/top_score_collector.rs
@@ -79,7 +79,7 @@
/// sorted by their score.
///
/// The implementation is based on a `BinaryHeap`.
/// The theorical complexity for collecting the top `K` out of `n` documents
/// The theoretical complexity for collecting the top `K` out of `n` documents
/// is `O(n log K)`.
///
/// This collector guarantees a stable sorting in case of a tie on the
@@ -283,7 +283,7 @@ impl TopDocs {
///
/// # See also
///
/// To confortably work with `u64`s, `i64`s, `f64`s, or `date`s, please refer to
/// To comfortably work with `u64`s, `i64`s, `f64`s, or `date`s, please refer to
/// [.order_by_fast_field(...)](#method.order_by_fast_field) method.
pub fn order_by_u64_field(
self,
18 changes: 16 additions & 2 deletions src/core/searcher.rs
@@ -6,7 +6,7 @@ use crate::core::{Executor, SegmentReader};
use crate::query::Query;
use crate::schema::{Document, Schema, Term};
use crate::space_usage::SearcherSpaceUsage;
use crate::store::StoreReader;
use crate::store::{CacheStats, StoreReader};
use crate::{DocAddress, Index, Opstamp, SegmentId, TrackedObject};

/// Identifies the searcher generation accessed by a [Searcher].
@@ -77,11 +77,13 @@ impl Searcher {
index: Index,
segment_readers: Vec<SegmentReader>,
generation: TrackedObject<SearcherGeneration>,
doc_store_cache_size: usize,
) -> io::Result<Searcher> {
let store_readers: Vec<StoreReader> = segment_readers
.iter()
.map(SegmentReader::get_store_reader)
.map(|segment_reader| segment_reader.get_store_reader(doc_store_cache_size))
.collect::<io::Result<Vec<_>>>()?;

Ok(Searcher {
schema,
index,
@@ -110,6 +112,18 @@ impl Searcher {
store_reader.get(doc_address.doc_id)
}

/// The cache stats for the underlying store reader.
///
/// Aggregates the sum for each segment store reader.
pub fn doc_store_cache_stats(&self) -> CacheStats {
let cache_stats: CacheStats = self
.store_readers
.iter()
.map(|reader| reader.cache_stats())
.sum();
cache_stats
}

/// Fetches a document in an asynchronous manner.
#[cfg(feature = "quickwit")]
pub async fn doc_async(&self, doc_address: DocAddress) -> crate::Result<Document> {
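The new accessor sums the per-segment store-reader cache statistics. A hedged sketch of reading them from an open searcher, assuming `CacheStats` implements `Debug` (its fields are not shown in this hunk):

```rust
// Hedged sketch: `CacheStats` is assumed to implement Debug; only the aggregate is printed.
use tantivy::Searcher;

fn log_doc_store_cache(searcher: &Searcher) {
    // Sum of the cache stats across every segment's StoreReader.
    let stats = searcher.doc_store_cache_stats();
    println!("doc store cache: {:?}", stats);
}
```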
