caio.co/de/cantine

Merge branch 'tique_docs'

Id
7ec9e0077f3cb60f4a10248a1370f22b44a3f8b7
Author
Caio
Commit time
2020-01-29T13:00:55+01:00

Modified Cargo.toml

@@ -1,7 +1,6
[workspace]

members = [
"cantine",
"tique",
- "tique_derive",
]

Modified tique/Cargo.toml

@@ -1,15 +1,20
[package]
name = "tique"
version = "0.1.0"
authors = ["Caio Romão <contact@caio.co>"]
edition = "2018"

+[features]
+default = []
+unstable = ["tique_derive", "serde", "byteorder", "nom"]
+
[dependencies]
-tique_derive = { path = "../tique_derive" }
tantivy = "0.11"
-serde = { version = "1", features = ["derive"] }
-byteorder = "1.3"
-nom = "5"
+
+tique_derive = { path = "./tique_derive", optional = true }
+serde = { version = "1", features = ["derive"], optional = true }
+byteorder = { version = "1.3", optional = true }
+nom = { version = "5", optional = true }

[dev-dependencies]
zerocopy = "0.2"

Modified tique/src/lib.rs

@@ -1,7 +1,45
+//! Utilities to drive a tantivy search index
+//!
+//! # Overview
+//!
+//! ## `conditional_collector`
+//!
+//! Collectors with built-in support for changing the ordering and
+//! cursor-based pagination (or rather: support for conditionally
+//! skipping documents that match the query).
+//!
+//! ```rust
+//! use tique::conditional_collector::{Ascending, TopCollector};
+//! # let f64_field = tantivy::schema::Field::from_field_id(0);
+//!
+//! let min_rank_collector =
+//! TopCollector::<f64, Ascending, _>::new(10, true).top_fast_field(f64_field);
+//! ```
+//!
+//! Check the module docs for more details.
+//!
+//! ## Unstable
+//!
+//! This crate also contains unpolished functionality that is made available
+//! through the `unstable` feature flag:
+//!
+//! * `tique::queryparser`: A very simple query parser that only knows
+//! about term and phrase queries (and their negation). Mostly an excuse
+//! to play with `nom`
+//!
+//! * `tique_derive::FilterAndAggregation`: Takes a struct of (possibly
+//! `Option`) primitives and generates a bunch of code to assist with
+//! indexing, filtering and aggregating.
pub mod conditional_collector;
+
+#[cfg(feature = "unstable")]
pub mod queryparser;

+#[cfg(feature = "unstable")]
mod derive;

+#[cfg(feature = "unstable")]
pub use derive::RangeStats;
+
+#[cfg(feature = "unstable")]
pub use tique_derive::FilterAndAggregation;

Modified tique/src/conditional_collector/custom_score.rs

@@ -1,18 +1,18
use std::marker::PhantomData;

use tantivy::{
- collector::{Collector, SegmentCollector},
+ collector::{Collector, CustomScorer, CustomSegmentScorer, SegmentCollector},
DocId, Result, Score, SegmentLocalId, SegmentReader,
};

use super::{
top_collector::TopSegmentCollector,
topk::{TopK, TopKProvider},
- traits::{CheckCondition, ConditionForSegment, DocScorer, ScorerForSegment},
+ traits::{CheckCondition, ConditionForSegment},
CollectionResult,
};

-pub struct CustomScoreTopCollector<T, P, C, S>
+pub(crate) struct CustomScoreTopCollector<T, P, C, S>
where
T: PartialOrd,
P: TopKProvider<T, DocId>,
@@ -47,10 +47,10
T: 'static + PartialOrd + Copy + Send + Sync,
P: 'static + Send + Sync + TopKProvider<T, DocId>,
C: Sync + ConditionForSegment<T>,
- S: ScorerForSegment<T>,
+ S: CustomScorer<T>,
{
type Fruit = CollectionResult<T>;
- type Child = CustomScoreTopSegmentCollector<T, C::Type, S::Type, P::Child>;
+ type Child = CustomScoreTopSegmentCollector<T, C::Type, S::Child, P::Child>;

fn requires_scoring(&self) -> bool {
false
@@ -65,7 +65,7
segment_id: SegmentLocalId,
reader: &SegmentReader,
) -> Result<Self::Child> {
- let scorer = self.scorer_for_segment.for_segment(reader);
+ let scorer = self.scorer_for_segment.segment_scorer(reader)?;
Ok(CustomScoreTopSegmentCollector::new(
segment_id,
P::new_topk(self.limit),
@@ -103,7 +103,7
T: 'static + PartialOrd + Copy + Send + Sync,
K: 'static + TopK<T, DocId>,
C: CheckCondition<T>,
- S: DocScorer<T>,
+ S: CustomSegmentScorer<T>,
{
type Fruit = CollectionResult<T>;

Modified tique/src/conditional_collector/mod.rs

@@ -1,9 +1,106
+//! Top-K Collectors, with ordering and condition support.
+//!
+//! This is a collection of collectors that provide top docs
+//! rank functionality very similar to `tantivy::TopDocs`, with
+//! added support for declaring the ordering (ascending or
+//! descending) and collection-time conditions.
+//!
+//! ```rust
+//! # use tique::conditional_collector::{Descending,TopCollector};
+//! # use tantivy::Score;
+//! # let condition_for_segment = true;
+//! let collector =
+//! TopCollector::<Score, Descending, _>::new(10, condition_for_segment);
+//! ```
+//!
+//! NOTE: Usually the score type (`Score` above, an `f32`) is inferred
+//! so there's no need to specify it.
+//!
+//! # Ordering Support
+//!
+//! When constructing a top collector you *must* specify how to
+//! actually order the items: in ascending or descending order.
+//!
+//! You simply choose `Ascending` or `Descending` and let the
+//! compiler know:
+//!
+//! ```rust
+//! # use tique::conditional_collector::{Ascending,TopCollector};
+//! # use tantivy::Score;
+//! # let limit = 10;
+//! # let condition_for_segment = true;
+//! let collector =
+//! TopCollector::<Score, Ascending, _>::new(limit, condition_for_segment);
+//! ```
+//!
+//! # Condition Support
+//!
+//! A "condition" is simply a way to tell the collector that
+//! a document is a valid candidate to the top. It behaves
+//! just like a query filter would, but does not limit the
+//! candidates before the collector sees them.
+//!
+//! This is a valid condition that accepts everything:
+//!
+//! ```rust
+//! let condition_for_segment = true;
+//! ```
+//!
+//! Generally speaking, a `condition` is anything that implements
+//! the `ConditionForSegment` trait and you can use closures as a
+//! shortcut:
+//!
+//! ```rust
+//! # use tantivy::{Score,SegmentReader};
+//! # use tique::conditional_collector::{TopCollector,Ascending};
+//! # let limit = 10;
+//! let condition_for_segment = move |reader: &SegmentReader| {
+//! // Fetch useful stuff from the `reader`, then:
+//! move |segment_id, doc_id, score, is_ascending| {
+//! // Express whatever logic you want
+//! true
+//! }
+//! };
+//!
+//! let collector =
+//! TopCollector::<Score, Ascending, _>::new(limit, condition_for_segment);
+//! ```
+//!
+//! ## Aside: Pagination with Constant Memory
+//!
+//! If you've been using `tantivy` for a while, you're probably
+//! used to seeing tuples like `(T, DocAddress)` (T is usually
+//! `tantivy::Score`, but changes if you customize the score
+//! somehow).
+//!
+//! You can also use these tuples as a condition and they act
+//! like a cursor for pagination, so when you do something like:
+//!
+//! ```rust
+//! # use tantivy::DocAddress;
+//! # use tique::conditional_collector::{TopCollector,Descending};
+//! let limit = 10;
+//! let condition_for_segment = (0.42, DocAddress(0, 1));
+//! let collector =
+//! TopCollector::<_, Descending, _>::new(limit, condition_for_segment);
+//! ```
+//!
+//! What you are asking for is the top `limit` documents that appear
+//! *after* (because you chose the `Descending` order) documents
+//! that scored `0.42` at whatever query you throw at it (and in
+//! case multiple docs score the same, the collector knows to
+//! break ties by the `DocAddress`).
+//!
+//! The results that you get after your search will contain more
+//! `(T, DocAddress)` tuples you can use to keep pagination
+//! going without ever having to increase `limit`.
+//!
+//! Check `examples/conditional_collector_tutorial.rs` for more details.
mod custom_score;
mod top_collector;
mod topk;
+mod traits;

-pub mod traits;
-
-pub use custom_score::CustomScoreTopCollector;
pub use top_collector::{CollectionResult, TopCollector};
pub use topk::{Ascending, Descending};
+pub use traits::*;

Modified tique/src/conditional_collector/top_collector.rs

@@ -1,16 +1,88
use std::marker::PhantomData;

use tantivy::{
- collector::{Collector, SegmentCollector},
+ collector::{Collector, CustomScorer, SegmentCollector},
DocAddress, DocId, Result, Score, SegmentLocalId, SegmentReader,
};

use super::{
+ custom_score::CustomScoreTopCollector,
topk::{TopK, TopKProvider},
traits::{CheckCondition, ConditionForSegment},
- CustomScoreTopCollector,
};

+/// A TopCollector like tantivy's, with added support for ordering
+/// and conditions.
+///
+/// # Examples
+///
+/// ## A top-collector that behaves like `tantivy`'s
+///
+/// The first `Score` type is usually inferred but we need to be
+/// explicit for this example.
+///
+/// ```rust
+/// # use tique::conditional_collector::{TopCollector,Descending};
+/// let collector =
+/// TopCollector::<tantivy::Score, Descending, _>::new(10, true);
+/// ```
+///
+/// ## Custom condition from a closure.
+///
+/// ```rust
+/// # use tantivy::{Score,SegmentReader};
+/// # use tique::conditional_collector::{TopCollector,Ascending};
+/// let condition_for_segment = |reader: &SegmentReader| {
+/// // Fetch useful stuff from the `reader`, then:
+/// |segment_id, doc_id, score, is_ascending| {
+/// // Express whatever logic you want
+/// true
+/// }
+/// };
+///
+/// let collector =
+/// TopCollector::<Score, Ascending, _>::new(20, condition_for_segment);
+/// ```
+///
+/// ## Customizing the Score
+///
+/// ```rust
+/// # use tique::conditional_collector::{TopCollector, Ascending};
+/// # use tantivy::{SegmentReader, DocId};
+/// # let limit = 10;
+/// # let condition = true;
+/// // Any `tantivy::collector::CustomScorer` is valid
+/// let scorer = |reader: &SegmentReader| {
+/// |doc_id: DocId| -720
+/// };
+///
+/// let custom_collector =
+/// TopCollector::<i64, Ascending, _>::new(limit, condition)
+/// .with_custom_scorer(scorer);
+/// ```
+///
+/// ## Using a fast field as the score
+///
+/// One typical use-case for customizing scores is sorting by a
+/// fast field, so we provide a helper for that.
+///
+/// *CAUTION*: Using a field that is not `FAST` or is of a different
+/// type than the one you specify will lead to a panic at runtime.
+///
+/// ```rust
+/// # use tique::conditional_collector::{Ascending, Descending, TopCollector};
+/// # let rank_field = tantivy::schema::Field::from_field_id(0);
+/// # let id_field = tantivy::schema::Field::from_field_id(1);
+/// # let limit = 10;
+/// # let condition = true;
+/// let min_rank_collector =
+/// TopCollector::<f64, Ascending, _>::new(limit, condition)
+/// .top_fast_field(rank_field);
+///
+/// let top_ids_collector =
+/// TopCollector::<u64, Descending, _>::new(limit, condition)
+/// .top_fast_field(id_field);
+/// ```
pub struct TopCollector<T, P, CF> {
limit: usize,
condition_for_segment: CF,
@@ -24,6 +96,9
P: TopKProvider<T, DocId>,
CF: ConditionForSegment<T>,
{
+ /// Creates a new TopCollector with capacity of `limit`
+ /// and respecting the given `ConditionForSegment`
+ /// implementation.
pub fn new(limit: usize, condition_for_segment: CF) -> Self {
if limit < 1 {
panic!("Limit must be greater than 0");
@@ -37,6 +112,26
}
}

+impl<T, P, CF> TopCollector<T, P, CF>
+where
+ T: 'static + Copy + Send + Sync + PartialOrd,
+ P: 'static + Send + Sync + TopKProvider<T, DocId>,
+ CF: Send + Sync + ConditionForSegment<T>,
+{
+ /// Transforms this collector into one that uses the given
+ /// scorer instead of the default scoring functionality.
+ pub fn with_custom_scorer<C: CustomScorer<T>>(
+ self,
+ custom_scorer: C,
+ ) -> impl Collector<Fruit = CollectionResult<T>> {
+ CustomScoreTopCollector::<T, P, _, _>::new(
+ self.limit,
+ self.condition_for_segment,
+ custom_scorer,
+ )
+ }
+}
+
macro_rules! impl_top_fast_field {
($type: ident, $err: literal) => {
impl<P, CF> TopCollector<$type, P, CF>
@@ -44,6 +139,9
P: 'static + Send + Sync + TopKProvider<$type, DocId>,
CF: Send + Sync + ConditionForSegment<$type>,
{
+ /// Transforms this collector into one that sorts by the given
+ /// fast field. Will panic if the field is not FAST or the wrong
+ /// type.
pub fn top_fast_field(
self,
field: tantivy::schema::Field,
@@ -170,14 +268,23
}
}

+/// The basic result type, containing the top selected items and
+/// additional metadata.
#[derive(Debug)]
pub struct CollectionResult<T> {
+ /// How many documents were seen. Analogous to the result of a
+ /// simple count collector.
pub total: usize,
+ /// How many of the documents we saw actually passed our
+ /// condition
pub visited: usize,
+ /// The top found items, as you would get from `tantivy::TopDocs`
pub items: Vec<(T, DocAddress)>,
}

impl<T> CollectionResult<T> {
+ /// Whether the same query that created this result would have
+ /// more results if we paginated (or increased the top-k limit)
pub fn has_next(&self) -> bool {
self.visited - self.items.len() > 0
}

Modified tique/src/conditional_collector/topk.rs

@@ -19,6 +19,7
fn merge_many(limit: usize, items: Vec<CollectionResult<T>>) -> CollectionResult<T>;
}

+/// Marker to create a TopCollector in *ascending* order
pub struct Ascending;

impl<T: PartialOrd, D: Ord> TopKProvider<T, D> for Ascending {
@@ -33,6 +34,7
}
}

+/// Marker to create a TopCollector in *descending* order
pub struct Descending;

impl<T: PartialOrd, D: Ord> TopKProvider<T, D> for Descending {

Modified tique/src/conditional_collector/traits.rs

@@ -1,9 +1,12
use std::cmp::Ordering;

use tantivy::{DocAddress, DocId, SegmentLocalId, SegmentReader};

use super::topk::Scored;

+/// A trait that allows defining arbitrary conditions to be checked
+/// before considering a matching document for inclusion in the
+/// top results.
pub trait ConditionForSegment<T>: Clone {
type Type: CheckCondition<T>;
fn for_segment(&self, reader: &SegmentReader) -> Self::Type;
@@ -27,6 +30,19
}
}

+impl<T> ConditionForSegment<T> for (T, DocAddress)
+where
+ T: 'static + PartialOrd + Copy,
+{
+ type Type = Self;
+ fn for_segment(&self, _reader: &SegmentReader) -> Self::Type {
+ *self
+ }
+}
+
+/// The condition that gets checked before collection. In order for
+/// a document to appear in the results it must first return true
+/// for `check`.
pub trait CheckCondition<T>: 'static + Clone {
fn check(&self, segment_id: SegmentLocalId, doc_id: DocId, score: T, ascending: bool) -> bool;
}
@@ -48,7 +64,7

impl<T> CheckCondition<T> for (T, DocAddress)
where
- T: 'static + PartialOrd + Clone + Copy,
+ T: 'static + PartialOrd + Copy,
{
fn check(&self, segment_id: SegmentLocalId, doc_id: DocId, score: T, ascending: bool) -> bool {
let wanted = if ascending {
@@ -59,35 +75,5

Scored::new(self.0, self.1).cmp(&Scored::new(score, DocAddress(segment_id, doc_id)))
== wanted
- }
-}
-
-pub trait ScorerForSegment<T>: Sync {
- type Type: DocScorer<T>;
- fn for_segment(&self, reader: &SegmentReader) -> Self::Type;
-}
-
-impl<T, C, F> ScorerForSegment<T> for F
-where
- F: 'static + Sync + Send + Fn(&SegmentReader) -> C,
- C: DocScorer<T>,
-{
- type Type = C;
-
- fn for_segment(&self, reader: &SegmentReader) -> Self::Type {
- (self)(reader)
- }
-}
-
-pub trait DocScorer<T>: 'static {
- fn score(&self, doc_id: DocId) -> T;
-}
-
-impl<F, T> DocScorer<T> for F
-where
- F: 'static + Sync + Send + Fn(DocId) -> T,
-{
- fn score(&self, doc_id: DocId) -> T {
- (self)(doc_id)
}
}

Renamed tique_derive/tests/basic.rs to tique/tique_derive/tests/basic.rs

No visible change

Renamed tique_derive/src/lib.rs to tique/tique_derive/src/lib.rs

No visible change

Renamed tique_derive/Cargo.toml to tique/tique_derive/Cargo.toml

@@ -13,6 +13,6
quote = "1.0"

[dev-dependencies]
-tique = { path = "../tique" }
+tique = { path = ".." }
serde = { version = "1.0", features = ["derive"] }
tantivy = "0.11"

Created tique/examples/conditional_collector_tutorial.rs

@@ -1,0 +1,194
+use std::{cmp::Ordering, ops::Neg};
+
+use tantivy::{
+ collector::TopDocs,
+ query::AllQuery,
+ schema::{SchemaBuilder, Value, FAST, STORED},
+ Document, Index, Result, SegmentReader,
+};
+
+use tique::conditional_collector::{Ascending, Descending, TopCollector};
+
+pub fn main() -> Result<()> {
+ // First, we create a test index with a couple of fields
+ // And with some documents already in.
+ let mut builder = SchemaBuilder::new();
+
+ let id_field = builder.add_u64_field("id", FAST | STORED);
+ let rank_field = builder.add_f64_field("rank", FAST);
+
+ let index = Index::create_in_ram(builder.build());
+ let mut writer = index.writer_with_num_threads(1, 3_000_000)?;
+
+ const NUM_DOCS: i32 = 100;
+ const PAGE_SIZE: usize = 10;
+
+ for i in 0..NUM_DOCS {
+ let mut doc = Document::new();
+ doc.add_f64(rank_field, f64::from(i.neg()));
+ doc.add_u64(id_field, i as u64);
+ writer.add_document(doc);
+ }
+
+ writer.commit()?;
+ let reader = index.reader()?;
+ let searcher = reader.searcher();
+
+ // Now that we have an index and a way to search it, let's
+ // create our collectors:
+
+ // Let's use one from tantivy to make sure things work as stated
+ let tantivy_collector = TopDocs::with_limit(PAGE_SIZE);
+
+ // Now create a conditional_collector that behaves like the one
+ // above. The first `_` is `tantivy::Score`, but it gets inferred.
+ let tique_collector = TopCollector::<_, Descending, _>::new(PAGE_SIZE, true);
+
+ let (tantivy_top, tique_top) =
+ searcher.search(&AllQuery, &(tantivy_collector, tique_collector))?;
+
+ assert_eq!(tantivy_top.len(), tique_top.items.len());
+ // Phew!
+
+ // Noticed that we checked against `tique_top.items`? It's because
+ // tique's collectors come with some extra metadata to make it more
+ // useful.
+
+ // We know how many documents matched the *query*, (not
+ // necessarily the range), just like a count collector would.
+ // So we expect it to be the number of documents in the index
+ // given our query.
+ assert_eq!(NUM_DOCS as usize, tique_top.total);
+
+ // We also know whether there would have been more items had
+ // we asked for them:
+ assert!(tique_top.has_next());
+
+ // This is useful information because it tells us that
+ // we can keep searching easily.
+
+ // One simple way to get the next page is to ask for more
+ // results and shift. It's a super fast way that can become
+ // problematic for digging deep into very large indices.
+ let tantivy_next_collector = TopDocs::with_limit(PAGE_SIZE * 2);
+
+ // Our conditional_collector types know how to paginate based
+ // on their own results, which allows you to keep memory stable
+ // while spending more CPU time doing comparisons:
+
+ let last_result = tique_top.items.into_iter().last().unwrap();
+ let tique_next_collector = TopCollector::<_, Descending, _>::new(PAGE_SIZE, last_result);
+
+ // One disadvantage of this approach is that you can't simply
+ // skip to an arbitrary page. When that's a requirement, the
+ // best idea is to use the "memory hungry" approach until a
+ // certain threshold, then switch to cursor-based.
+ // You can even use tantivy's result to paginate:
+
+ let last_tantivy_result = tantivy_top.into_iter().last().unwrap();
+ let tique_next_collector_via_tantivy =
+ TopCollector::<_, Descending, _>::new(PAGE_SIZE, last_tantivy_result);
+
+ let (tantivy_until_next, tique_next, tique_same_next) = searcher.search(
+ &AllQuery,
+ &(
+ tantivy_next_collector,
+ tique_next_collector,
+ tique_next_collector_via_tantivy,
+ ),
+ )?;
+
+ assert_eq!(tique_next.items, tique_same_next.items);
+ assert_eq!(tantivy_until_next[PAGE_SIZE..], tique_next.items[..]);
+
+ // We can also sort by the fast fields we indexed:
+
+ let min_rank_collector =
+ TopCollector::<f64, Ascending, _>::new(3, true).top_fast_field(rank_field);
+
+ let top_ids_collector =
+ TopCollector::<u64, Descending, _>::new(3, true).top_fast_field(id_field);
+
+ let (min_rank, top_ids) =
+ searcher.search(&AllQuery, &(min_rank_collector, top_ids_collector))?;
+
+ assert_eq!(
+ vec![99, 98, 97],
+ top_ids
+ .items
+ .into_iter()
+ .map(|(score, _addr)| score)
+ .collect::<Vec<u64>>()
+ );
+
+ assert_eq!(
+ vec![-99.0, -98.0, -97.0],
+ min_rank
+ .items
+ .into_iter()
+ .map(|(score, _addr)| score)
+ .collect::<Vec<f64>>()
+ );
+
+ // There's more to conditions than booleans and `(T, DocAddress)`,
+ // by the way. It's whatever implements the trait
+ // `tique::conditional_collector::ConditionForSegment`
+
+ // So let's say we decide to make a pagination feature public
+ // but very understandably don't want to expose DocAddress.
+ // We can always retrieve a STORED field via a DocAddress,
+ // so returning a public id from a search result is easy.
+
+ // For the search part we can do something like this:
+
+ let first_page_collector =
+ TopCollector::<f64, Descending, _>::new(PAGE_SIZE, true).top_fast_field(rank_field);
+
+ let page = searcher.search(&AllQuery, &first_page_collector)?;
+
+ let mut result = Vec::new();
+ for (score, addr) in page.items.iter() {
+ let doc = searcher.doc(*addr)?;
+ if let Some(Value::U64(public_id)) = doc.get_first(id_field) {
+ result.push((*score, *public_id));
+ }
+ }
+
+ assert!(page.has_next());
+ // So whenever `page.has_next()` is true, `result.last()` will
+ // contain the cursor for our next page.
+ let (ref_score, ref_id) = *result.last().unwrap();
+
+ // And you can keep paginating, breaking ties between equal scores via the
+ // public id as follows:
+ let paginator = move |reader: &SegmentReader| {
+ let id_reader = reader
+ .fast_fields()
+ .u64(id_field)
+ .expect("id field is u64 FAST");
+
+ move |_segment_id, doc_id, score, is_ascending: bool| {
+ let public_id = id_reader.get(doc_id);
+
+ match ref_score.partial_cmp(&score) {
+ Some(Ordering::Greater) => !is_ascending,
+ Some(Ordering::Less) => is_ascending,
+ Some(Ordering::Equal) => ref_id < public_id,
+ None => false,
+ }
+ }
+ };
+
+ let second_page_collector =
+ TopCollector::<f64, Descending, _>::new(PAGE_SIZE, paginator).top_fast_field(rank_field);
+
+ let two_pages_collector =
+ TopCollector::<f64, Descending, _>::new(PAGE_SIZE * 2, true).top_fast_field(rank_field);
+
+ let (two_pages, second_page) =
+ searcher.search(&AllQuery, &(two_pages_collector, second_page_collector))?;
+
+ assert_eq!(two_pages.items[PAGE_SIZE..], second_page.items[..]);
+
+ Ok(())
+}