Add an option to opt out fieldnorms for indexed fields.
Closes #922
fulmicoton committed Nov 3, 2020
1 parent 3d192c0 commit bd769f6
Showing 11 changed files with 198 additions and 59 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -7,6 +7,7 @@ Tantivy 0.14.0
- Added support for Brotli compression in the DocStore. (@ppodolsky)
- Added helper for building intersections and unions in BooleanQuery (@guilload)
- Bugfix in `Query::explain`
- Made it possible to opt out of generating fieldnorm information for indexed fields. This change breaks compatibility, as the meta.json file format is slightly changed. (#922, @pmasurel)

Tantivy 0.13.2
===================
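For context, a minimal sketch of what opting out looks like on the schema side. The `fieldnorms()` getter on the text indexing options appears later in this diff; the matching `set_fieldnorms` setter and the exact builder chain shown here are assumptions rather than something this commit confirms.

    use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};

    fn main() {
        // `set_fieldnorms` is assumed to mirror the `fieldnorms()` getter
        // used in src/schema/field_entry.rs below.
        let indexing = TextFieldIndexing::default()
            .set_tokenizer("default")
            .set_index_option(IndexRecordOption::WithFreqsAndPositions)
            .set_fieldnorms(false);
        let options = TextOptions::default().set_indexing_options(indexing);
        let mut schema_builder = Schema::builder();
        schema_builder.add_text_field("body", options);
        let _schema = schema_builder.build();
    }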
2 changes: 1 addition & 1 deletion src/core/index_meta.rs
@@ -301,7 +301,7 @@ mod tests {
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
assert_eq!(
json,
r#"{"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#
r#"{"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default","fieldnorms":true},"stored":false}}],"opstamp":0}"#
);
}
}
38 changes: 27 additions & 11 deletions src/fieldnorm/reader.rs
@@ -49,7 +49,7 @@ impl FieldNormReaders {
///
/// This metric is important to compute the score of a
/// document: a document having a query word in one of its short fields
/// (e.g. title) is likely to be more relevant than in one of its longer fields
/// (e.g. body).
///
/// tantivy encodes `fieldnorm` on one byte with some precision loss,
@@ -61,20 +61,32 @@ impl FieldNormReaders {
/// precompute computationally expensive functions of the fieldnorm
/// in a very short array.
#[derive(Clone)]
pub struct FieldNormReader {
data: OwnedBytes,
pub enum FieldNormReader {
ConstFieldNorm {
fieldnorm_id: u8,
num_docs: u32
},
OneByte(OwnedBytes)
}

impl FieldNormReader {

/// Creates a `FieldNormReader` that returns the same `fieldnorm_id`
/// for every document in the segment.
pub fn const_fieldnorm_id(fieldnorm_id: u8, num_docs: u32) -> FieldNormReader {
FieldNormReader::ConstFieldNorm { fieldnorm_id, num_docs }
}

/// Opens a field norm reader given its file.
pub fn open(fieldnorm_file: FileSlice) -> crate::Result<Self> {
let data = fieldnorm_file.read_bytes()?;
Ok(FieldNormReader { data })
Ok(FieldNormReader::OneByte(data))
}

/// Returns the number of documents in this segment.
pub fn num_docs(&self) -> u32 {
self.data.len() as u32
match self {
FieldNormReader::ConstFieldNorm { num_docs, .. } => *num_docs,
FieldNormReader::OneByte(vals) => vals.len() as u32,
}
}

/// Returns the `fieldnorm` associated to a doc id.
@@ -86,15 +98,21 @@ impl FieldNormReader {
///
/// The fieldnorm is effectively decoded from the
/// `fieldnorm_id` by doing a simple table lookup.
#[inline(always)]
pub fn fieldnorm(&self, doc_id: DocId) -> u32 {
let fieldnorm_id = self.fieldnorm_id(doc_id);
id_to_fieldnorm(fieldnorm_id)
}

/// Returns the `fieldnorm_id` associated to a document.
#[inline(always)]
pub fn fieldnorm_id(&self, doc_id: DocId) -> u8 {
self.data.as_slice()[doc_id as usize]
match self {
FieldNormReader::ConstFieldNorm { fieldnorm_id, .. } => *fieldnorm_id,
FieldNormReader::OneByte(data) => data.as_slice()[doc_id as usize],
}
}

/// Converts a `fieldnorm_id` into a fieldnorm.
@@ -118,9 +136,7 @@ impl FieldNormReader {
.map(FieldNormReader::fieldnorm_to_id)
.collect::<Vec<u8>>();
let field_norms_data = OwnedBytes::new(field_norms_id);
FieldNormReader {
data: field_norms_data,
}
FieldNormReader::OneByte(field_norms_data)
}
}

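As an aside on the one-byte encoding described above, a sketch of the lossy round trip; `fieldnorm_to_id` appears in this diff, while `id_to_fieldnorm` as an associated function is assumed from the doc comment above it.

    use tantivy::fieldnorm::FieldNormReader;

    fn main() {
        // One byte cannot represent every u32 fieldnorm, so the encoding is
        // lossy: decoding yields a representative value no larger than the input.
        let id: u8 = FieldNormReader::fieldnorm_to_id(1_000);
        let representative: u32 = FieldNormReader::id_to_fieldnorm(id);
        assert!(representative <= 1_000);
    }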
48 changes: 24 additions & 24 deletions src/fieldnorm/writer.rs
@@ -4,7 +4,7 @@ use super::fieldnorm_to_id
use super::FieldNormsSerializer;
use crate::schema::Field;
use crate::schema::Schema;
use std::{io, iter};
use std::io;

/// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte
/// of each document for each field with field norms.
@@ -13,7 +13,7 @@ use std::{io, iter};
/// byte per document per field.
pub struct FieldNormsWriter {
fields: Vec<Field>,
fieldnorms_buffer: Vec<Vec<u8>>,
fieldnorms_buffer: Vec<Option<Vec<u8>>>,
}

impl FieldNormsWriter {
@@ -23,7 +23,7 @@ impl FieldNormsWriter {
schema
.fields()
.filter_map(|(field, field_entry)| {
if field_entry.is_indexed() {
if field_entry.has_fieldnorms() {
Some(field)
} else {
None
@@ -36,17 +36,14 @@
/// specified in the schema.
pub fn for_schema(schema: &Schema) -> FieldNormsWriter {
let fields = FieldNormsWriter::fields_with_fieldnorm(schema);
let max_field = fields
.iter()
.map(Field::field_id)
.max()
.map(|max_field_id| max_field_id as usize + 1)
.unwrap_or(0);
let num_fields = schema.num_fields();
let mut fieldnorms_buffer: Vec<Option<Vec<u8>>> = vec![None; num_fields];
for field in &fields {
fieldnorms_buffer[field.field_id() as usize] = Some(Vec::new());
}
FieldNormsWriter {
fields,
fieldnorms_buffer: iter::repeat_with(Vec::new)
.take(max_field)
.collect::<Vec<_>>(),
fieldnorms_buffer,
}
}

@@ -55,8 +52,10 @@
///
/// Will extend with 0-bytes for documents that have not been seen.
pub fn fill_up_to_max_doc(&mut self, max_doc: DocId) {
for field in self.fields.iter() {
self.fieldnorms_buffer[field.field_id() as usize].resize(max_doc as usize, 0u8);
for buffer_opt in self.fieldnorms_buffer.iter_mut() {
if let Some(buffer) = buffer_opt {
buffer.resize(max_doc as usize, 0u8);
}
}
}

@@ -69,21 +68,22 @@
/// * field - the field being set
/// * fieldnorm - the number of terms present in document `doc` in field `field`
pub fn record(&mut self, doc: DocId, field: Field, fieldnorm: u32) {
let fieldnorm_buffer: &mut Vec<u8> = &mut self.fieldnorms_buffer[field.field_id() as usize];
assert!(
fieldnorm_buffer.len() <= doc as usize,
"Cannot register a given fieldnorm twice"
);
// we fill intermediary `DocId` as having a fieldnorm of 0.
fieldnorm_buffer.resize(doc as usize + 1, 0u8);
fieldnorm_buffer[doc as usize] = fieldnorm_to_id(fieldnorm);
if let Some(fieldnorm_buffer) = self.fieldnorms_buffer[field.field_id() as usize].as_mut() {
assert!(
fieldnorm_buffer.len() <= doc as usize,
"Cannot register a given fieldnorm twice"
);
// we fill intermediary `DocId` as having a fieldnorm of 0.
fieldnorm_buffer.resize(doc as usize + 1, 0u8);
fieldnorm_buffer[doc as usize] = fieldnorm_to_id(fieldnorm);
}
}

/// Serialize the seen fieldnorm values to the serializer for all fields.
pub fn serialize(&self, mut fieldnorms_serializer: FieldNormsSerializer) -> io::Result<()> {
for &field in self.fields.iter() {
let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..];
fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
if let Some(buffer) = self.fieldnorms_buffer[field.field_id() as usize].as_ref() {
fieldnorms_serializer.serialize_field(field, &buffer[..])?;
}
}
fieldnorms_serializer.close()?;
Ok(())
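A standalone sketch of the buffer layout this file moves to: one slot per schema field, indexed by `field_id`, with `None` marking fields that keep no fieldnorms (plain Rust, no tantivy types involved).

    fn main() {
        // Three fields in the schema; only field 1 keeps fieldnorms.
        let mut buffers: Vec<Option<Vec<u8>>> = vec![None; 3];
        buffers[1] = Some(Vec::new());

        // Recording doc 2 with fieldnorm_id 7 pads docs 0 and 1 with 0.
        if let Some(buf) = buffers[1].as_mut() {
            buf.resize(2 + 1, 0u8);
            buf[2] = 7;
        }
        assert_eq!(buffers[1].as_deref(), Some(&[0u8, 0, 7][..]));

        // Fields left as `None` are simply skipped at serialization time.
        assert!(buffers[0].is_none());
    }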
9 changes: 8 additions & 1 deletion src/postings/serializer.rs
@@ -329,7 +329,7 @@ pub struct PostingsSerializer<W: Write> {

bm25_weight: Option<BM25Weight>,

num_docs: u32, // Number of docs in the segment
num_docs: u32,
avg_fieldnorm: Score, // Average number of term in the field for that segment.
// this value is used to compute the block wand information.
}
@@ -367,6 +367,13 @@ impl<W: Write> PostingsSerializer<W> {
}
}

/// Returns the number of documents in the segment currently being serialized.
/// This function may return `None` if there are no fieldnorms for that field.
fn num_docs_in_segment(&self) -> Option<u32> {
self.fieldnorm_reader
.as_ref()
.map(|reader| reader.num_docs())
}

pub fn new_term(&mut self, term_doc_freq: u32) {
if self.termfreq_enabled && self.num_docs > 0 {
let bm25_weight = BM25Weight::for_one_term(
3 changes: 3 additions & 0 deletions src/query/term_query/term_query.rs
@@ -92,6 +92,8 @@ impl TermQuery {
searcher: &Searcher,
scoring_enabled: bool,
) -> crate::Result<TermWeight> {
let has_fieldnorms = searcher
.schema()
.get_field_entry(self.term.field())
.has_fieldnorms();
let term = self.term.clone();
let bm25_weight = BM25Weight::for_terms(searcher, &[term])?;
let index_record_option = if scoring_enabled {
@@ -103,6 +105,7 @@ impl TermQuery {
self.term.clone(),
index_record_option,
bm25_weight,
has_fieldnorms
))
}
}
10 changes: 9 additions & 1 deletion src/query/term_query/term_weight.rs
@@ -1,6 +1,7 @@
use super::term_scorer::TermScorer;
use crate::core::SegmentReader;
use crate::docset::DocSet;
use crate::fieldnorm::FieldNormReader;
use crate::postings::SegmentPostings;
use crate::query::bm25::BM25Weight;
use crate::query::explanation::does_not_match;
@@ -15,6 +16,7 @@ pub struct TermWeight {
term: Term,
index_record_option: IndexRecordOption,
similarity_weight: BM25Weight,
has_fieldnorms: bool
}

impl Weight for TermWeight {
@@ -87,11 +89,13 @@ impl TermWeight {
term: Term,
index_record_option: IndexRecordOption,
similarity_weight: BM25Weight,
has_fieldnorms: bool
) -> TermWeight {
TermWeight {
term,
index_record_option,
similarity_weight,
has_fieldnorms
}
}

@@ -102,7 +106,11 @@
) -> crate::Result<TermScorer> {
let field = self.term.field();
let inverted_index = reader.inverted_index(field)?;
let fieldnorm_reader = reader.get_fieldnorms_reader(field)?;
let fieldnorm_reader = if self.has_fieldnorms {
reader.get_fieldnorms_reader(field)?
} else {
FieldNormReader::const_fieldnorm_id(1u8, reader.num_docs())
};
let similarity_weight = self.similarity_weight.boost_by(boost);
let postings_opt: Option<SegmentPostings> =
inverted_index.read_postings(&self.term, self.index_record_option)?;
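The effect of the fallback above: with no fieldnorm file, every document reports the same field length, so BM25 length normalization becomes uniform across the segment. A sketch using the constructor added in this commit:

    use tantivy::fieldnorm::FieldNormReader;

    fn main() {
        // The constant reader TermWeight substitutes when a field has no
        // fieldnorms: every doc reports fieldnorm_id 1.
        let reader = FieldNormReader::const_fieldnorm_id(1u8, 10_000);
        assert_eq!(reader.num_docs(), 10_000);
        assert_eq!(reader.fieldnorm(0), reader.fieldnorm(9_999));
    }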
39 changes: 37 additions & 2 deletions src/schema/field_entry.rs
@@ -112,6 +112,23 @@ impl FieldEntry {
}
}

/// Returns true iff the field stores fieldnorm information for its indexed values.
pub fn has_fieldnorms(&self) -> bool {
match self.field_type {
FieldType::Str(ref options) => options
.get_indexing_options()
.map(|options| options.fieldnorms())
.unwrap_or(false),
FieldType::U64(ref options)
| FieldType::I64(ref options)
| FieldType::F64(ref options)
| FieldType::Date(ref options) => options.index_option().has_fieldnorms(),
FieldType::HierarchicalFacet => false,
FieldType::Bytes(ref _options) => false,
}
}

/// Returns true iff the field is an int (signed or unsigned) fast field
pub fn is_fast(&self) -> bool {
match self.field_type {
@@ -272,7 +289,8 @@ impl<'de> Deserialize<'de> for FieldEntry {
#[cfg(test)]
mod tests {
use super::*;
use crate::schema::TEXT;
use crate::Index;
use crate::schema::{STRING, Schema, TEXT};
use serde_json;

#[test]
@@ -291,7 +309,8 @@ mod tests {
"options": {
"indexing": {
"record": "position",
"tokenizer": "default"
"tokenizer": "default",
"fieldnorms": true
},
"stored": false
}
@@ -309,4 +328,20 @@ mod tests {
_ => panic!("expected FieldType::Str"),
}
}

#[test]
fn test_fieldnorms() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", STRING);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text=>"abc"));
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let err = searcher.segment_reader(0u32).get_fieldnorms_reader(text);
assert!(matches!(err, Err(crate::TantivyError::SchemaError(_))));
Ok(())
}

}
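Complementing the test above, a sketch of checking the flag straight from the schema. That `TEXT` defaults to `fieldnorms: true` is read off the serialization test earlier in this file; that `STRING` carries no fieldnorms is inferred from `test_fieldnorms` and is an assumption about the defaults, not something stated outright.

    use tantivy::schema::{Schema, STRING, TEXT};

    fn main() {
        let mut builder = Schema::builder();
        let title = builder.add_text_field("title", TEXT);
        let id = builder.add_text_field("id", STRING);
        let schema = builder.build();
        assert!(schema.get_field_entry(title).has_fieldnorms());
        assert!(!schema.get_field_entry(id).has_fieldnorms());
    }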
