Skip to content

Commit

Permalink
allow setting tokenizer manager on index (#1362)
Browse files Browse the repository at this point in the history
handle json in tokenizer_for_field
  • Loading branch information
PSeitz committed May 9, 2022
1 parent aab6549 commit 7f45a6a
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 19 deletions.
74 changes: 58 additions & 16 deletions src/core/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ fn load_metas(
pub struct IndexBuilder {
// Schema the index is built with. Optional: the builder starts out with `None`.
schema: Option<Schema>,
// Settings copied into the `IndexMeta` when a new index is created.
index_settings: IndexSettings,
// Tokenizer manager installed on the resulting `Index` through
// `Index::set_tokenizers`; the builder starts out with `TokenizerManager::default()`.
tokenizer_manager: TokenizerManager,
}
impl Default for IndexBuilder {
fn default() -> Self {
Expand All @@ -86,6 +87,7 @@ impl IndexBuilder {
Self {
schema: None,
index_settings: IndexSettings::default(),
tokenizer_manager: TokenizerManager::default(),
}
}

Expand All @@ -103,6 +105,12 @@ impl IndexBuilder {
self
}

/// Sets the tokenizer manager to install on the index produced by this builder.
///
/// Takes the builder by value and returns it, so the call can be chained with the
/// other builder methods. The given manager replaces the one the builder held before.
pub fn tokenizers(mut self, tokenizers: TokenizerManager) -> Self {
self.tokenizer_manager = tokenizers;
self
}

/// Creates a new index using the `RAMDirectory`.
///
/// The index will be allocated in anonymous memory.
Expand Down Expand Up @@ -154,7 +162,8 @@ impl IndexBuilder {
if !Index::exists(&*dir)? {
return self.create(dir);
}
let index = Index::open(dir)?;
let mut index = Index::open(dir)?;
index.set_tokenizers(self.tokenizer_manager.clone());
if index.schema() == self.get_expect_schema()? {
Ok(index)
} else {
Expand All @@ -176,7 +185,8 @@ impl IndexBuilder {
)?;
let mut metas = IndexMeta::with_schema(self.get_expect_schema()?);
metas.index_settings = self.index_settings;
let index = Index::open_from_metas(directory, &metas, SegmentMetaInventory::default());
let mut index = Index::open_from_metas(directory, &metas, SegmentMetaInventory::default());
index.set_tokenizers(self.tokenizer_manager);
Ok(index)
}
}
Expand Down Expand Up @@ -304,6 +314,11 @@ impl Index {
}
}

/// Setter for the tokenizer manager.
///
/// Replaces the tokenizer manager held by this index with `tokenizers`; the
/// `tokenizers()` accessor returns a reference to the new manager afterwards.
pub fn set_tokenizers(&mut self, tokenizers: TokenizerManager) {
self.tokenizers = tokenizers;
}

/// Accessor for the tokenizer manager.
pub fn tokenizers(&self) -> &TokenizerManager {
&self.tokenizers
Expand All @@ -314,20 +329,31 @@ impl Index {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
let tokenizer_manager: &TokenizerManager = self.tokenizers();
let tokenizer_name_opt: Option<TextAnalyzer> = match field_type {
FieldType::Str(text_options) => text_options
.get_indexing_options()
.map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
.and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name)),
_ => None,
let indexing_options_opt = match field_type {
FieldType::JsonObject(options) => options.get_text_indexing_options(),
FieldType::Str(options) => options.get_indexing_options(),
_ => {
return Err(TantivyError::SchemaError(format!(
"{:?} is not a text field.",
field_entry.name()
)))
}
};
match tokenizer_name_opt {
Some(tokenizer) => Ok(tokenizer),
None => Err(TantivyError::SchemaError(format!(
"{:?} is not a text field.",
field_entry.name()
))),
}
let indexing_options = indexing_options_opt.ok_or_else(|| {
TantivyError::InvalidArgument(format!(
"No indexing options set for field {:?}",
field_entry
))
})?;

tokenizer_manager
.get(indexing_options.tokenizer())
.ok_or_else(|| {
TantivyError::InvalidArgument(format!(
"No Tokenizer found for field {:?}",
field_entry
))
})
}

/// Create a default `IndexReader` for the given index.
Expand Down Expand Up @@ -557,7 +583,8 @@ impl fmt::Debug for Index {
mod tests {
use crate::directory::{RamDirectory, WatchCallback};
use crate::schema::{Field, Schema, INDEXED, TEXT};
use crate::{Directory, Index, IndexReader, IndexSettings, ReloadPolicy};
use crate::tokenizer::TokenizerManager;
use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy};

#[test]
fn test_indexer_for_field() {
Expand All @@ -573,6 +600,21 @@ mod tests {
);
}

#[test]
fn test_set_tokenizer_manager() {
    // Build a small schema with one numeric and one text field.
    let schema = {
        let mut builder = Schema::builder();
        builder.add_u64_field("num_likes", INDEXED);
        builder.add_text_field("body", TEXT);
        builder.build()
    };

    // A manager created with `new()` is empty, so the lookup of "raw" below must fail
    // if the builder really installed it on the index.
    let empty_manager = TokenizerManager::new();
    let index = IndexBuilder::new()
        .tokenizers(empty_manager)
        .schema(schema)
        .create_in_ram()
        .unwrap();

    let raw_tokenizer = index.tokenizers().get("raw");
    assert!(raw_tokenizer.is_none());
}

#[test]
fn test_index_exists() {
let directory: Box<dyn Directory> = Box::new(RamDirectory::create());
Expand Down
11 changes: 8 additions & 3 deletions src/tokenizer/tokenizer_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@ pub struct TokenizerManager {
}

impl TokenizerManager {
/// Creates a tokenizer manager with no tokenizer registered.
pub fn new() -> Self {
    // Start from an empty name -> tokenizer map, wrapped in a lock behind an `Arc`.
    let registry = RwLock::new(HashMap::new());
    Self {
        tokenizers: Arc::new(registry),
    }
}

/// Registers a new tokenizer associated with a given name.
pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
where TextAnalyzer: From<T> {
Expand Down Expand Up @@ -52,9 +59,7 @@ impl Default for TokenizerManager {
/// - en_stem
/// - ja
fn default() -> TokenizerManager {
let manager = TokenizerManager {
tokenizers: Arc::new(RwLock::new(HashMap::new())),
};
let manager = TokenizerManager::new();
manager.register("raw", RawTokenizer);
manager.register(
"default",
Expand Down

0 comments on commit 7f45a6a

Please sign in to comment.