Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move split_full_path to Schema #1692

Merged
merged 5 commits into from Nov 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
50 changes: 1 addition & 49 deletions src/query/query_parser/query_parser.rs
Expand Up @@ -183,7 +183,6 @@ pub struct QueryParser {
conjunction_by_default: bool,
tokenizer_manager: TokenizerManager,
boost: HashMap<Field, Score>,
field_names: HashMap<String, Field>,
}

fn all_negative(ast: &LogicalAst) -> bool {
Expand All @@ -196,31 +195,6 @@ fn all_negative(ast: &LogicalAst) -> bool {
}
}

// Collects the byte offsets of every '.' in `field_path` that is not escaped
// by a preceding backslash.
//
// The scan works on raw bytes rather than codepoints. This is sound because,
// in UTF-8, no byte of a multi-byte sequence can be mistaken for the ASCII
// bytes '.' or '\'.
fn locate_splitting_dots(field_path: &str) -> Vec<usize> {
    // Set when the previous byte was an unescaped backslash.
    let mut skip_next = false;
    field_path
        .bytes()
        .enumerate()
        .filter_map(|(offset, byte)| {
            if skip_next {
                // This byte is escaped: it can neither split nor escape.
                skip_next = false;
                return None;
            }
            skip_next = byte == b'\\';
            if byte == b'.' {
                Some(offset)
            } else {
                None
            }
        })
        .collect()
}

impl QueryParser {
/// Creates a `QueryParser`, given
/// * schema - index Schema
Expand All @@ -230,34 +204,19 @@ impl QueryParser {
default_fields: Vec<Field>,
tokenizer_manager: TokenizerManager,
) -> QueryParser {
let field_names = schema
.fields()
.map(|(field, field_entry)| (field_entry.name().to_string(), field))
.collect();
QueryParser {
schema,
default_fields,
tokenizer_manager,
conjunction_by_default: false,
boost: Default::default(),
field_names,
}
}

/// Splits a `full_path`, as written in a query, into a field and a
/// JSON path.
pub(crate) fn split_full_path<'a>(&self, full_path: &'a str) -> Option<(Field, &'a str)> {
if let Some(field) = self.field_names.get(full_path) {
return Some((*field, ""));
}
let mut splitting_period_pos: Vec<usize> = locate_splitting_dots(full_path);
while let Some(pos) = splitting_period_pos.pop() {
let (prefix, suffix) = full_path.split_at(pos);
if let Some(field) = self.field_names.get(prefix) {
return Some((*field, &suffix[1..]));
}
}
None
// Delegates the lookup to the schema.
self.schema.find_field(full_path)
}

/// Creates a `QueryParser`, given
Expand Down Expand Up @@ -1566,13 +1525,6 @@ mod test {
assert_eq!(query_parser.split_full_path("firsty"), None);
}

#[test]
fn test_locate_splitting_dots() {
    // (input path, byte offsets of its unescaped dots)
    let cases = [
        ("a.b.c", vec![1usize, 3]),
        (r#"a\.b.c"#, vec![4]),
        (r#"a\..b.c"#, vec![3, 5]),
    ];
    for (field_path, expected_offsets) in &cases {
        let offsets = super::locate_splitting_dots(field_path);
        assert_eq!(&offsets, expected_offsets);
    }
}

#[test]
pub fn test_phrase_slop() {
test_parse_query_to_logical_ast_helper(
Expand Down
96 changes: 96 additions & 0 deletions src/schema/schema.rs
Expand Up @@ -252,6 +252,31 @@ impl Eq for InnerSchema {}
#[derive(Clone, Eq, PartialEq, Debug)]
pub struct Schema(Arc<InnerSchema>);

// Finds the unescaped '.' characters of `field_path` and returns their
// positions as byte offsets, in increasing order.
//
// A backslash escapes the byte that immediately follows it. Working on bytes
// (and not on codepoints) is safe here: in UTF-8, the bytes making up a
// multi-byte sequence are never equal to the ASCII bytes '.' or '\'.
fn locate_splitting_dots(field_path: &str) -> Vec<usize> {
    let mut dot_offsets = Vec::new();
    let mut bytes = field_path.bytes().enumerate();
    while let Some((offset, byte)) = bytes.next() {
        if byte == b'\\' {
            // Consume the escaped byte so it is never treated as a separator
            // nor as the start of another escape.
            bytes.next();
        } else if byte == b'.' {
            dot_offsets.push(offset);
        }
    }
    dot_offsets
}

impl Schema {
/// Return the `FieldEntry` associated with a `Field`.
pub fn get_field_entry(&self, field: Field) -> &FieldEntry {
Expand Down Expand Up @@ -358,6 +383,28 @@ impl Schema {
}
Ok(doc)
}

/// Searches for a full_path in the schema, returning the matching field and a JSON path.
///
/// This function first checks whether a field exists for the exact given full_path.
/// If none does, it splits the full_path at non-escaped '.' chars and tries to match the
/// prefix with the field names, favoring the longest field names.
///
/// This does not check if the field is a JSON field. It is possible for this function to
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it make sense to improve the spec and avoid this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be. I changed the function to continue when the prefix is not a JSON field, and all of the tests passed (probably does not mean it is completely ok to change), but I think it might be better to open a separate PR for it.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you open a ticket?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/// return a non-empty JSON path with a non-JSON field.
pub fn find_field<'a>(&self, full_path: &'a str) -> Option<(Field, &'a str)> {
// Exact match first: the whole path is itself a field name, so the JSON path is empty.
if let Some(field) = self.0.fields_map.get(full_path) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No unit tests for find_field!? Can we add some?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mostly copied the same unit test on QueryParser. I can remove the test from QueryParser or add more tests to Schema if that's what we want.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes please that would be awesome!

return Some((*field, ""));
}
// Try the unescaped dots from the rightmost to the leftmost, so that the
// longest prefix that is a known field name wins.
let mut splitting_period_pos: Vec<usize> = locate_splitting_dots(full_path);
while let Some(pos) = splitting_period_pos.pop() {
let (prefix, suffix) = full_path.split_at(pos);
if let Some(field) = self.0.fields_map.get(prefix) {
// The suffix starts with the splitting dot: skip it to get the JSON path.
return Some((*field, &suffix[1..]));
}
}
None
}
}

impl Serialize for Schema {
Expand Down Expand Up @@ -436,6 +483,13 @@ mod tests {
use crate::schema::schema::DocParsingError::InvalidJson;
use crate::schema::*;

#[test]
fn test_locate_splitting_dots() {
    // (input path, byte offsets of its unescaped dots)
    let cases = [
        ("a.b.c", vec![1usize, 3]),
        (r#"a\.b.c"#, vec![4]),
        (r#"a\..b.c"#, vec![3, 5]),
    ];
    for (field_path, expected_offsets) in &cases {
        let offsets = super::locate_splitting_dots(field_path);
        assert_eq!(&offsets, expected_offsets);
    }
}

#[test]
pub fn is_indexed_test() {
let mut schema_builder = Schema::builder();
Expand Down Expand Up @@ -936,4 +990,46 @@ mod tests {
]"#;
assert_eq!(schema_json, expected);
}

#[test]
fn test_find_field() {
    // "foo" is the only JSON field; every other field is a text field, some of
    // them with dots in their names.
    let mut schema_builder = Schema::builder();
    schema_builder.add_json_field("foo", STRING);

    schema_builder.add_text_field("bar", STRING);
    schema_builder.add_text_field("foo.bar", STRING);
    schema_builder.add_text_field("foo.bar.baz", STRING);
    schema_builder.add_text_field("bar.a.b.c", STRING);
    let schema = schema_builder.build();

    // (queried path, name of the field expected to match, expected JSON path)
    let matching_paths = [
        // Exact field names yield an empty JSON path.
        ("foo.bar", "foo.bar", ""),
        ("foo.bar.baz", "foo.bar.baz", ""),
        // Otherwise the longest field name that is a prefix wins.
        ("foo.bar.bar", "foo.bar", "bar"),
        ("foo.toto", "foo", "toto"),
        // `find_field` does not check that the matched field is a JSON field:
        // "bar" is a text field, yet a non-empty JSON path is returned.
        ("bar.toto.titi", "bar", "toto.titi"),
    ];
    for &(full_path, field_name, json_path) in &matching_paths {
        let expected_field = schema.get_field(field_name).unwrap();
        assert_eq!(
            schema.find_field(full_path),
            Some((expected_field, json_path)),
            "unexpected result for path {:?}",
            full_path
        );
    }

    // Paths for which no prefix is a known field name.
    let unknown_paths = [
        "hello",
        "",
        "thiswouldbeareallylongfieldname",
        "baz.bar.foo",
    ];
    for &full_path in &unknown_paths {
        assert_eq!(
            schema.find_field(full_path),
            None,
            "path {:?} should not match any field",
            full_path
        );
    }
}
}