
Commit

doc fix (#1391)
Documentation fix.
saroh committed Jun 21, 2022
1 parent 83d0c13 commit 11e4225
Showing 30 changed files with 75 additions and 86 deletions.
4 changes: 2 additions & 2 deletions query-grammar/src/occur.rs
@@ -2,11 +2,11 @@ use std::fmt;
 use std::fmt::Write;
 
 /// Defines whether a term in a query must be present,
-/// should be present or must be not present.
+/// should be present or must not be present.
 #[derive(Debug, Clone, Hash, Copy, Eq, PartialEq)]
 pub enum Occur {
     /// For a given document to be considered for scoring,
-    /// at least one of the document with the Should or the Must
+    /// at least one of the terms with the Should or the Must
     /// Occur constraint must be within the document.
     Should,
     /// Documents without the term are excluded from the search.
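In practice, the two `Occur` constraints documented above combine inside a `BooleanQuery`. A minimal sketch, assuming the `BooleanQuery::from(Vec<(Occur, Box<dyn Query>)>)` conversion available around this release; the `body` field and both terms are illustrative:

```rust
use tantivy::query::{BooleanQuery, Occur, Query, TermQuery};
use tantivy::schema::{Field, IndexRecordOption};
use tantivy::Term;

// "diesel" must be present; "truck" only boosts documents that
// already match.
fn build_boolean_query(body: Field) -> BooleanQuery {
    let must: Box<dyn Query> = Box::new(TermQuery::new(
        Term::from_field_text(body, "diesel"),
        IndexRecordOption::Basic,
    ));
    let should: Box<dyn Query> = Box::new(TermQuery::new(
        Term::from_field_text(body, "truck"),
        IndexRecordOption::Basic,
    ));
    BooleanQuery::from(vec![(Occur::Must, must), (Occur::Should, should)])
}
```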
5 changes: 2 additions & 3 deletions src/core/index.rs
@@ -232,7 +232,7 @@ impl Index {
     }
 
     /// Replace the default single thread search executor pool
-    /// by a thread pool with a given number of threads.
+    /// by a thread pool with as many threads as there are CPUs on the system.
     pub fn set_default_multithread_executor(&mut self) -> crate::Result<()> {
         let default_num_threads = num_cpus::get();
         self.set_multithread_executor(default_num_threads)
@@ -366,8 +366,7 @@ impl Index {
     /// Create an `IndexReader` for the given index.
     ///
     /// Most projects should create at most one reader for a given index.
-    /// This method is typically called only once per `Index` instance,
-    /// over the lifetime of most problem.
+    /// This method is typically called only once per `Index` instance.
     pub fn reader_builder(&self) -> IndexReaderBuilder {
         IndexReaderBuilder::new(self.clone())
     }
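Putting both corrected comments together, a typical setup looks like this sketch (assuming the 0.18-era `Index`/`IndexReader` API; `ReloadPolicy::OnCommit` matches the examples of that era, and the schema is illustrative):

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::{Index, ReloadPolicy};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT);
    let mut index = Index::create_in_ram(schema_builder.build());

    // One thread per CPU, as the corrected doc comment now states.
    index.set_default_multithread_executor()?;

    // Typically done once per `Index`: the reader can be cloned and
    // shared; call `searcher()` whenever a fresh view is needed.
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()?;
    let _searcher = reader.searcher();
    Ok(())
}
```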
2 changes: 1 addition & 1 deletion src/core/index_meta.rs
@@ -270,7 +270,7 @@ impl Default for IndexSettings {
 
 /// Settings to presort the documents in an index
 ///
-/// Presorting documents can greatly performance
+/// Presorting documents can greatly improve performance
 /// in some scenarios, by applying top n
 /// optimizations.
 #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
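A sketch of how presorting is requested, assuming the `IndexSettings`/`IndexSortByField`/`Order` types and `Index::builder()` of this era; their exact paths and the `timestamp` field are assumptions:

```rust
use tantivy::schema::{Schema, FAST, STORED};
use tantivy::{Index, IndexSettings, IndexSortByField, Order};

fn build_sorted_index() -> tantivy::Result<Index> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_u64_field("timestamp", FAST | STORED);
    let schema = schema_builder.build();

    // Presort segments by `timestamp`, descending, so a
    // "top-N most recent" query can stop early.
    let settings = IndexSettings {
        sort_by_field: Some(IndexSortByField {
            field: "timestamp".to_string(),
            order: Order::Desc,
        }),
        ..Default::default()
    };
    Index::builder()
        .schema(schema)
        .settings(settings)
        .create_in_ram()
}
```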
2 changes: 1 addition & 1 deletion src/core/segment_reader.rs
@@ -215,7 +215,7 @@ impl SegmentReader {
     /// term dictionary associated to a specific field,
     /// and opening the posting list associated to any term.
     ///
-    /// If the field is marked as index, a warn is logged and an empty `InvertedIndexReader`
+    /// If the field is not marked as indexed, a warning is logged and an empty `InvertedIndexReader`
     /// is returned.
     /// Similarly, if the field is marked as indexed but no term has been indexed for the given
     /// index, an empty `InvertedIndexReader` is returned (but no warning is logged).
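The corrected contract means callers can loop over segments without special-casing unindexed fields: the empty reader simply reports no postings. A sketch, assuming `InvertedIndexReader::doc_freq` as in this era of tantivy:

```rust
use tantivy::schema::Field;
use tantivy::{Searcher, Term};

// Sum a term's document frequency across all segments. On a field
// that is not marked as indexed, `inverted_index` hands back an empty
// reader (after the warning described above), so the sum is just 0.
fn doc_freq(searcher: &Searcher, field: Field, text: &str) -> tantivy::Result<u64> {
    let term = Term::from_field_text(field, text);
    let mut total = 0u64;
    for segment_reader in searcher.segment_readers() {
        let inverted_index = segment_reader.inverted_index(field)?;
        total += u64::from(inverted_index.doc_freq(&term)?);
    }
    Ok(total)
}
```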
1 change: 0 additions & 1 deletion src/docset.rs
@@ -24,7 +24,6 @@ pub trait DocSet: Send {
     ///
     /// Calling `.advance()` on a terminated DocSet should be supported, and TERMINATED should
     /// be returned.
-    /// TODO Test existing docsets.
     fn advance(&mut self) -> DocId;
 
     /// Advances the DocSet forward until reaching the target, or going to the
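The contract spelled out above permits the standard drain loop; a minimal standalone sketch:

```rust
use tantivy::{DocId, DocSet, TERMINATED};

// Drain any DocSet. A freshly created DocSet is already positioned on
// its first document, and `advance()` past the end keeps returning
// TERMINATED instead of panicking, exactly as documented above.
fn collect_all<D: DocSet>(docset: &mut D) -> Vec<DocId> {
    let mut docs = Vec::new();
    let mut doc = docset.doc();
    while doc != TERMINATED {
        docs.push(doc);
        doc = docset.advance();
    }
    debug_assert_eq!(docset.advance(), TERMINATED);
    docs
}
```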
4 changes: 2 additions & 2 deletions src/fieldnorm/mod.rs
@@ -2,12 +2,12 @@
 //! a given Field of a given document.
 //!
 //! This metric is important to compute the score of a
-//! document : a document having a query word in one its short fields
+//! document: a document having a query word in one of its short fields
 //! (e.g. title) is likely to be more relevant than in one of its longer fields
 //! (e.g. body).
 //!
 //! It encodes `fieldnorm` on one byte with some precision loss,
-//! using the exact same scheme as Lucene. Each value is place on a log-scale
+//! using the exact same scheme as Lucene. Each value is placed on a log-scale
 //! that takes values from `0` to `255`.
 //!
 //! A value on this scale is identified by a `fieldnorm_id`.
17 changes: 2 additions & 15 deletions src/fieldnorm/reader.rs
@@ -43,22 +43,9 @@ impl FieldNormReaders {
 }
 
 /// Reads the fieldnorm associated to a document.
-/// The fieldnorm represents the length associated to
-/// a given Field of a given document.
-///
-/// This metric is important to compute the score of a
-/// document : a document having a query word in one its short fields
-/// (e.g. title) is likely to be more relevant than in one of its longer field
-/// (e.g. body).
-///
-/// tantivy encodes `fieldnorm` on one byte with some precision loss,
-/// using the same scheme as Lucene. Each value is place on a log-scale
-/// that takes values from `0` to `255`.
-///
-/// A value on this scale is identified by a `fieldnorm_id`.
-/// Apart from compression, this scale also makes it possible to
-/// precompute computationally expensive functions of the fieldnorm
-/// in a very short array.
+/// The [fieldnorm](FieldNormReader::fieldnorm) represents the length associated to
+/// a given Field of a given document.
 #[derive(Clone)]
 pub struct FieldNormReader(ReaderImplEnum);
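A sketch of reading a fieldnorm, assuming `SegmentReader::get_fieldnorms_reader` and the `fieldnorm`/`fieldnorm_id` accessors of this era:

```rust
use tantivy::fieldnorm::FieldNormReader;
use tantivy::schema::Field;
use tantivy::{DocId, SegmentReader};

// Decode the (lossily compressed) field length for one document.
fn field_length(
    segment_reader: &SegmentReader,
    field: Field,
    doc: DocId,
) -> tantivy::Result<u32> {
    let fieldnorms: FieldNormReader = segment_reader.get_fieldnorms_reader(field)?;
    // `fieldnorm_id` is the one-byte log-scale code; `fieldnorm`
    // decodes it back to an approximate token count.
    let _id: u8 = fieldnorms.fieldnorm_id(doc);
    Ok(fieldnorms.fieldnorm(doc))
}
```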
2 changes: 1 addition & 1 deletion src/indexer/merge_policy.rs
@@ -9,7 +9,7 @@ pub struct MergeCandidate(pub Vec<SegmentId>);
 
 /// The `MergePolicy` defines which segments should be merged.
 ///
-/// Every time a the list of segments changes, the segment updater
+/// Every time the list of segments changes, the segment updater
 /// asks the merge policy if some segments should be merged.
 pub trait MergePolicy: marker::Send + marker::Sync + Debug {
     /// Given the list of segment metas, returns the list of merge candidates.
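A sketch of swapping the policy on a writer, assuming `IndexWriter::set_merge_policy` and the public `merge_policy` module of this era; the heap budget is arbitrary:

```rust
use tantivy::merge_policy::LogMergePolicy;
use tantivy::Index;

fn configure_writer(index: &Index) -> tantivy::Result<()> {
    // 50 MB of indexing heap; the figure is illustrative.
    let index_writer = index.writer(50_000_000)?;

    // The segment updater consults this policy every time the segment
    // list changes; `LogMergePolicy` is tantivy's default.
    index_writer.set_merge_policy(Box::new(LogMergePolicy::default()));
    Ok(())
}
```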
14 changes: 7 additions & 7 deletions src/positions/mod.rs
@@ -1,15 +1,15 @@
 //! Tantivy can (if instructed to do so in the schema) store the term positions in a given field.
-//! This positions are expressed as token ordinal. For instance,
+//! This position is expressed as token ordinal. For instance,
 //! In "The beauty and the beast", the term "the" appears in position 0 and position 4.
 //! This information is useful to run phrase queries.
 //!
-//! The `SegmentComponent::POSITIONS` file contains all of the bitpacked positions delta,
-//! for all terms of a given field, one term after the other.
+//! The [position](../enum.SegmentComponent.html#variant.Positions) file contains all of the
+//! bitpacked positions delta, for all terms of a given field, one term after the other.
 //!
-//! Each terms is encoded independently.
-//! Like for positing lists, tantivy rely on simd bitpacking to encode the positions delta in blocks
-//! of 128 deltas. Because we rarely have a multiple of 128, a final block may encode the remaining
-//! values variable byte encoding.
+//! Each term is encoded independently.
+//! Like for posting lists, tantivy relies on SIMD bitpacking to encode the positions delta in
+//! blocks of 128 deltas. Because we rarely have a multiple of 128, a final block may encode the
+//! remaining values using variable-byte encoding.
 //!
 //! In order to make reading possible, the term delta positions first encodes the number of
 //! bitpacked blocks, then the bitwidth for each block, then the actual bitpacked block and finally
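This is not tantivy's actual code path (which SIMD-bitpacks blocks of 128 deltas), just the delta transformation the module doc describes, as a standalone sketch:

```rust
// Positions are token ordinals: in "The beauty and the beast",
// "the" occurs at positions [0, 4]. Before bitpacking, tantivy stores
// deltas between consecutive positions; this mirrors that step.
fn position_deltas(positions: &[u32]) -> Vec<u32> {
    let mut prev = 0u32;
    positions
        .iter()
        .map(|&pos| {
            let delta = pos - prev;
            prev = pos;
            delta
        })
        .collect()
}

fn main() {
    assert_eq!(position_deltas(&[0, 4]), vec![0, 4]);
    assert_eq!(position_deltas(&[3, 7, 9]), vec![3, 4, 2]);
}
```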
6 changes: 3 additions & 3 deletions src/positions/reader.rs
@@ -6,11 +6,11 @@ use crate::directory::OwnedBytes;
 use crate::positions::COMPRESSION_BLOCK_SIZE;
 use crate::postings::compression::{BlockDecoder, VIntDecoder};
 
-/// When accessing the position of a term, we get a positions_idx from the `Terminfo`.
-/// This means we need to skip to the `nth` positions efficiently.
+/// When accessing the positions of a term, we get a positions_idx from the `Terminfo`.
+/// This means we need to skip to the `nth` position efficiently.
 ///
 /// Blocks are compressed using bitpacking, so `skip_read` contains the number of bits
-/// (values can go from 0bit to 32 bits) required to decompress every block.
+/// (values can go from 0 to 32 bits) required to decompress every block.
 ///
 /// A given block obviously takes `(128 x num_bit_for_the_block / num_bits_in_a_byte)`,
 /// so skipping a block without decompressing it is just a matter of advancing that many
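The byte arithmetic from that doc comment, as a standalone sketch; the local `COMPRESSION_BLOCK_SIZE` mirrors the constant imported in the real module:

```rust
/// Size in bytes of one bitpacked block of 128 position deltas, given
/// the per-block bit width recorded in `skip_read`: 128 * num_bits / 8.
/// Skipping a block without decompressing it means advancing this many
/// bytes.
fn block_len_in_bytes(num_bits: u8) -> usize {
    const COMPRESSION_BLOCK_SIZE: usize = 128;
    COMPRESSION_BLOCK_SIZE * num_bits as usize / 8
}

fn main() {
    assert_eq!(block_len_in_bytes(0), 0); // all deltas in the block are 0
    assert_eq!(block_len_in_bytes(7), 112); // 128 * 7 bits = 896 bits
    assert_eq!(block_len_in_bytes(32), 512);
}
```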
4 changes: 2 additions & 2 deletions src/positions/serializer.rs
@@ -33,7 +33,7 @@ impl<W: io::Write> PositionSerializer<W> {
     /// at this point.
     /// When called before writing the positions of a term, this value is used as
     /// start offset.
-    /// When called after writing the positions of a term, this value is used as a
+    /// When called after writing the positions of a term, this value is used as
     /// end offset.
     pub fn written_bytes(&self) -> u64 {
         self.positions_wrt.written_bytes()
@@ -74,7 +74,7 @@ impl<W: io::Write> PositionSerializer<W> {
         self.block.clear();
     }
 
-    /// Close the positions for the given term.
+    /// Close the positions for the current term.
     pub fn close_term(&mut self) -> io::Result<()> {
         self.flush_block();
         VInt(self.bit_widths.len() as u64).serialize(&mut self.positions_wrt)?;
10 changes: 5 additions & 5 deletions src/postings/serializer.rs
@@ -43,7 +43,7 @@ use crate::{DocId, Score};
 /// * `close()`
 ///
 /// Terms have to be pushed in a lexicographically-sorted order.
-/// Within a term, document have to be pushed in increasing order.
+/// Within a term, documents have to be pushed in increasing order.
 ///
 /// A description of the serialization format is
 /// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html).
@@ -55,7 +55,7 @@ pub struct InvertedIndexSerializer {
 }
 
 impl InvertedIndexSerializer {
-    /// Open a new `PostingsSerializer` for the given segment
+    /// Open a new `InvertedIndexSerializer` for the given segment
    pub fn open(segment: &mut Segment) -> crate::Result<InvertedIndexSerializer> {
        use crate::SegmentComponent::{Positions, Postings, Terms};
        let inv_index_serializer = InvertedIndexSerializer {
@@ -187,7 +187,7 @@ impl<'a> FieldSerializer<'a> {
         Ok(term_ordinal)
     }
 
-    /// Serialize the information that a document contains the current term,
+    /// Serialize the information that a document contains for the current term:
     /// its term frequency, and the position deltas.
     ///
     /// At this point, the positions are already `delta-encoded`.
@@ -207,7 +207,7 @@
 
     /// Finish the serialization for this term postings.
     ///
-    /// If the current block is incomplete, it need to be encoded
+    /// If the current block is incomplete, it needs to be encoded
     /// using `VInt` encoding.
     pub fn close_term(&mut self) -> io::Result<()> {
         fail_point!("FieldSerializer::close_term", |msg: Option<String>| {
@@ -231,7 +231,7 @@ impl<'a> FieldSerializer<'a> {
         Ok(())
     }
 
-    /// Closes the current current field.
+    /// Closes the current field.
     pub fn close(mut self) -> io::Result<()> {
         self.close_term()?;
         if let Some(positions_serializer) = self.positions_serializer_opt {
6 changes: 6 additions & 0 deletions src/query/phrase_query/phrase_query.rs
@@ -17,6 +17,9 @@ use crate::schema::{Field, IndexRecordOption, Term};
 ///
 /// **This is my favorite part of the job.**
 ///
+/// [Slop](PhraseQuery::set_slop) allows leniency in term proximity
+/// for some performance tradeoff.
+///
 /// Using a `PhraseQuery` on a field requires positions
 /// to be indexed for this field.
 #[derive(Clone, Debug)]
@@ -59,6 +62,9 @@ impl PhraseQuery {
     }
 
     /// Slop allowed for the phrase.
+    ///
+    /// The query will match if its terms are separated by `slop` terms at most.
+    /// By default the slop is 0, meaning query terms need to be adjacent.
     pub fn set_slop(&mut self, value: u32) {
         self.slop = value;
     }
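A sketch of the slop behavior this hunk documents, assuming the `set_slop` setter it introduces; the field (which must have positions indexed) and the phrase are illustrative:

```rust
use tantivy::query::PhraseQuery;
use tantivy::schema::Field;
use tantivy::Term;

fn shakespeare_query(body: Field) -> PhraseQuery {
    let mut query = PhraseQuery::new(vec![
        Term::from_field_text(body, "brevity"),
        Term::from_field_text(body, "soul"),
    ]);
    // slop = 2 lets up to two other tokens sit between the terms, so
    // "brevity is the soul" still matches; with the default slop of 0,
    // the terms would have to be adjacent.
    query.set_slop(2);
    query
}
```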
2 changes: 1 addition & 1 deletion src/query/query.rs
@@ -16,7 +16,7 @@ use crate::{DocAddress, Term};
 /// - a set of documents
 /// - a way to score these documents
 ///
-/// When performing a [search](#method.search), these documents will then
+/// When performing a [search](Searcher::search), these documents will then
 /// be pushed to a [Collector](../collector/trait.Collector.html),
 /// which will in turn be in charge of deciding what to do with them.
 ///
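A sketch of that query-plus-collector split, assuming the `TopDocs` collector; the field and term are illustrative:

```rust
use tantivy::collector::TopDocs;
use tantivy::query::TermQuery;
use tantivy::schema::{Field, IndexRecordOption};
use tantivy::{DocAddress, Score, Searcher, Term};

// The query defines the doc set and its scores; the collector decides
// what to keep. `TopDocs` keeps the ten best (score, address) pairs.
fn top_ten(searcher: &Searcher, field: Field) -> tantivy::Result<Vec<(Score, DocAddress)>> {
    let query = TermQuery::new(
        Term::from_field_text(field, "diesel"),
        IndexRecordOption::Basic,
    );
    searcher.search(&query, &TopDocs::with_limit(10))
}
```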
2 changes: 1 addition & 1 deletion src/query/query_parser/query_parser.rs
@@ -254,7 +254,7 @@ impl QueryParser {
 
     /// Creates a `QueryParser`, given
     /// * an index
-    /// * a set of default - fields used to search if no field is specifically defined
+    /// * a set of default fields used to search if no field is specifically defined
     ///   in the query.
     pub fn for_index(index: &Index, default_fields: Vec<Field>) -> QueryParser {
         QueryParser::new(index.schema(), default_fields, index.tokenizers().clone())
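A sketch of `for_index` with two default fields; the schema and query string are illustrative:

```rust
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    // `title` and `body` are the default fields: bare terms search both.
    let query_parser = QueryParser::for_index(&index, vec![title, body]);
    // "whale" searches the default fields; "title:ahab" pins the field.
    let _query = query_parser
        .parse_query("whale title:ahab")
        .expect("syntactically valid query");
}
```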
2 changes: 1 addition & 1 deletion src/query/range_query.rs
@@ -23,7 +23,7 @@ fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
     }
 }
 
-/// `RangeQuery` match all documents that have at least one term within a defined range.
+/// `RangeQuery` matches all documents that have at least one term within a defined range.
 ///
 /// Matched documents will all get a constant `Score` of one.
 ///
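A sketch, assuming a `RangeQuery::new_u64(Field, Range<u64>)` constructor as in this era (later versions changed the signature); the `year` field is illustrative:

```rust
use tantivy::collector::Count;
use tantivy::query::RangeQuery;
use tantivy::schema::Field;
use tantivy::Searcher;

// Every document with at least one `year` term in 1960..1970 matches,
// each with a constant score of 1; here we only count them.
fn count_sixties(searcher: &Searcher, year_field: Field) -> tantivy::Result<usize> {
    let query = RangeQuery::new_u64(year_field, 1960..1970);
    searcher.search(&query, &Count)
}
```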
7 changes: 2 additions & 5 deletions src/schema/document.rs
@@ -11,10 +11,7 @@ use crate::DateTime;
 /// Tantivy's Document is the object that can
 /// be indexed and then searched for.
 ///
-/// Documents are fundamentally a collection of unordered couple `(field, value)`.
-/// In this list, one field may appear more than once.
-
-/// Documents are really just a list of couple `(field, value)`.
+/// Documents are fundamentally a collection of unordered couples `(field, value)`.
 /// In this list, one field may appear more than once.
 #[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Default)]
 pub struct Document {
@@ -125,7 +122,7 @@ impl Document {
         self.add_field_value(field, value.into());
     }
 
-    /// Add a bytes field
+    /// Add a JSON field
     pub fn add_json_object(
         &mut self,
         field: Field,
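A sketch of a repeated field plus `add_json_object`, assuming `serde_json` is available (tantivy's JSON fields take its `Map` type); the schema and values are illustrative:

```rust
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::Document;

fn build_doc() -> Document {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let attributes = schema_builder.add_json_field("attributes", STORED);
    let _schema = schema_builder.build();

    let mut doc = Document::default();
    // The same field may appear more than once in a document.
    doc.add_text(title, "The Diary of Muadib");
    doc.add_text(title, "Diary of a Young Girl");

    // `add_json_object` takes a serde_json object map.
    let json: serde_json::Map<String, serde_json::Value> =
        serde_json::from_str(r#"{ "author": "Frank", "year": 1965 }"#)
            .expect("valid JSON object");
    doc.add_json_object(attributes, json);
    doc
}
```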
7 changes: 3 additions & 4 deletions src/schema/facet.rs
@@ -74,15 +74,14 @@ impl Facet {
     /// Creates a `Facet` from its binary representation.
     pub fn from_encoded(encoded_bytes: Vec<u8>) -> Result<Facet, FromUtf8Error> {
         // facet bytes validation. `0u8` is used as a separator but that is still legal utf-8
-        // Ok(Facet(String::from_utf8(encoded_bytes)?))
         String::from_utf8(encoded_bytes).map(Facet)
     }
 
     /// Parse a text representation of a facet.
     ///
-    /// It is conceptually, if one of the steps of this path
-    /// contains a `/` or a `\`, it should be escaped
-    /// using an anti-slash `/`.
+    /// If one of the segments of this path
+    /// contains a `/`, it should be escaped
+    /// using an anti-slash `\`.
     pub fn from_text<T>(path: &T) -> Result<Facet, FacetParseError>
     where T: ?Sized + AsRef<str> {
         #[derive(Copy, Clone)]
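A sketch of the corrected escaping rule; the facet paths are illustrative:

```rust
use tantivy::schema::Facet;

fn main() {
    // Each `/` separates a segment of the facet path.
    let facet = Facet::from_text("/category/electronics/audio").unwrap();

    // A literal `/` inside a segment is escaped with a backslash.
    let tricky = Facet::from_text(r"/part/a\/c adapter").unwrap();

    println!("{} | {}", facet, tricky);
}
```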
2 changes: 1 addition & 1 deletion src/schema/field.rs
@@ -3,7 +3,7 @@ use std::io::{Read, Write};
 
 use common::BinarySerializable;
 
-/// `Field` is represented by an unsigned 32-bit integer type
+/// `Field` is represented by an unsigned 32-bit integer type.
 /// The schema holds the mapping between field names and `Field` objects.
 #[derive(
     Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash, serde::Serialize, serde::Deserialize,
2 changes: 1 addition & 1 deletion src/schema/field_entry.rs
@@ -96,7 +96,7 @@ impl FieldEntry {
         self.field_type.has_fieldnorms()
     }
 
-    /// Returns true if the field is a int (signed or unsigned) fast field
+    /// Returns true if the field is a fast field
     pub fn is_fast(&self) -> bool {
         self.field_type.is_fast()
     }
2 changes: 0 additions & 2 deletions src/schema/flags.rs
@@ -36,8 +36,6 @@ pub struct FastFlag;
 ///
 /// Fast fields can be random-accessed rapidly. Fields useful for scoring, filtering
 /// or collection should be marked as fast fields.
-/// The `FAST` flag can only be used when building `NumericOptions` (`u64`, `i64`, `f64` and `bool`
-/// fields)
 pub const FAST: SchemaFlagList<FastFlag, ()> = SchemaFlagList {
     head: FastFlag,
     tail: (),
28 changes: 17 additions & 11 deletions src/schema/mod.rs
@@ -71,17 +71,8 @@
 //! setting the field as stored defines whether the field will be
 //! returned when [`searcher.doc(doc_address)`](../struct.Searcher.html#method.doc) is called,
 //! and setting the field as indexed means that we will be able to perform queries such as
-//! `num_stars:10`. Note that unlike text fields, u64 can only be indexed in one way for the moment.
-//! This may change when we will start supporting range queries.
-//!
-//! The `fast` option on the other hand is specific to u64 fields, and is only relevant
-//! if you are implementing your own queries. This functionality is somewhat similar to Lucene's
-//! `DocValues`.
-//!
-//! u64 that are indexed as fast will be stored in a special data structure that will
-//! make it possible to access the u64 value given the doc id rapidly. This is useful if the value
-//! of the field is required during scoring or collection for instance.
-//!
+//! `num_stars:10`. Note that unlike text fields, numeric fields can only be indexed in one way for
+//! the moment.
 //!
 //! ### Shortcuts
 //!
@@ -99,6 +90,21 @@
 //! schema_builder.add_text_field("title", TEXT | STORED);
 //! let schema = schema_builder.build();
 //! ```
+//!
+//! ### Fast fields
+//! This functionality is somewhat similar to Lucene's `DocValues`.
+//!
+//! Fields that are indexed as [FAST] will be stored in a special data structure that will
+//! make it possible to access the value given the doc id rapidly. This is useful if the value
+//! of the field is required during scoring or collection for instance.
+//!
+//! ```
+//! use tantivy::schema::*;
+//! let mut schema_builder = Schema::builder();
+//! schema_builder.add_u64_field("population", STORED | FAST);
+//! schema_builder.add_text_field("zip_code", STRING | FAST);
+//! let schema = schema_builder.build();
+//! ```
 
 mod document;
 mod facet;
4 changes: 2 additions & 2 deletions src/schema/schema.rs
@@ -292,7 +292,7 @@ impl Schema {
         self.0.fields_map.get(field_name).cloned()
     }
 
-    /// Create a named document off the doc.
+    /// Create a document from a named doc.
     pub fn convert_named_doc(
         &self,
         named_doc: NamedFieldDocument,
@@ -308,7 +308,7 @@
         Ok(document)
     }
 
-    /// Create a named document off the doc.
+    /// Create a named document from the doc.
     pub fn to_named_doc(&self, doc: &Document) -> NamedFieldDocument {
         let mut field_map = BTreeMap::new();
         for (field, field_values) in doc.get_sorted_field_values() {
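A round-trip sketch of the two corrected methods; the schema and document are illustrative:

```rust
use tantivy::schema::{NamedFieldDocument, Schema, STORED, TEXT};
use tantivy::Document;

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();

    let mut doc = Document::default();
    doc.add_text(title, "Moby Dick");

    // Document -> field-name-keyed representation ...
    let named: NamedFieldDocument = schema.to_named_doc(&doc);
    // ... and back, resolving field names through the schema.
    let _doc_again: Document = schema
        .convert_named_doc(named)
        .expect("named doc conforms to the schema");
}
```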
