More documentation work
- Id
- fb45893146f6b052396a591166f72384693e950e
- Author
- Caio
- Commit time
- 2020-01-25T14:43:19+01:00
Modified tique/src/conditional_collector/mod.rs
-//! Top-K Collector, with ordering and condition support.
+//! Top-K Collectors, with ordering and condition support.
//!
-//! # Tutorial
+//! This is a collection of collectors that provide top-docs
+//! ranking functionality very similar to `tantivy::TopDocs`, with
+//! added support for declaring the ordering (ascending or
+//! descending) and collection-time conditions.
//!
//! ```rust
-//! use std::{cmp::Ordering, ops::Neg};
-//!
-//! use tantivy::{
-//! collector::TopDocs,
-//! query::AllQuery,
-//! schema::{Field, Value, SchemaBuilder, FAST, STORED},
-//! Index, SegmentReader, Document, Score
-//! };
-//!
-//! use tique::conditional_collector::{
-//! TopCollector,
-//! Ascending, Descending
-//! };
-//!
-//! const NUM_DOCS: i32 = 100;
-//! const K: usize = 10;
-//!
-//! // Let's create a collector that behaves like tantivy's `TopCollector`
-//! // The first `_` is `Score`, but it can be inferred:
-//! let tique_collector =
-//! TopCollector::<_, Descending, _>::new(K, true);
-//!
-//! // Let's double check :)
-//! let tantivy_collector = TopDocs::with_limit(K);
-//!
-//! // Now let's create a simple test index
-//! let mut builder = SchemaBuilder::new();
-//! let rank_field = builder.add_f64_field("rank", FAST);
-//! let id_field = builder.add_u64_field("id_field", FAST | STORED);
-//!
-//! let index = Index::create_in_ram(builder.build());
-//! let mut writer = index.writer_with_num_threads(1, 3_000_000)?;
-//!
-//! for i in 0..NUM_DOCS {
-//! let mut doc = Document::new();
-//! doc.add_f64(rank_field, f64::from(i.neg()));
-//! doc.add_u64(id_field, i as u64);
-//! writer.add_document(doc);
-//! }
-//!
-//! writer.commit()?;
-//!
-//! // Now let's search our index
-//! let reader = index.reader()?;
-//! let searcher = reader.searcher();
-//!
-//! let (tantivy_top, tique_top) = searcher.search(
-//! &AllQuery, &(tantivy_collector, tique_collector))?;
-//!
-//! assert_eq!(tantivy_top.len(), tique_top.items.len());
-//! // Phew!
-//!
-//! // Noticed that we checked against `tique_top.items`? It's because
-//! // tique's collectors come with some extra metadata to make it more
-//! // useful.
-//!
-//! // We know how many documents matched the *query*, (not
-//! // necessarily the range), just like a count collector would.
-//! // So we expect it to be the number of documents in the index
-//! // given our query.
-//! assert_eq!(NUM_DOCS as usize, tique_top.total);
-//!
-//! // We also know if there would have been more items if we
-//! // asked for:
-//! assert!(tique_top.has_next());
-//!
-//! // This in useful information because it tells us that
-//! // we can keep searching easily.
-//!
-//! // One simple way to get the next page is to ask for more
-//! // results and shift. It's a super fast way that can become
-//! // problematic for digging deep into very large indices.
-//! let tantivy_next_collector = TopDocs::with_limit(K * 2);
-//!
-//! // The `tique::conditional_collector` collections know how
-//! // to paginate based on their own results, which allows you
-//! // to keep memory stable while spending more CPU time doing
-//! // comparisons:
-//!
-//! let last_result = tique_top.items.into_iter().last().unwrap();
-//! let tique_next_collector = TopCollector::<_, Descending, _>::new(K, last_result);
-//!
-//! // One disadvantage of this approach is that you can't simply
-//! // skip to an arbitrary page. When that's a requirement, the
-//! // best idea is to use the "memory hungry" approach until a
-//! // certain threshold, then switch to cursor-based.
-//! // You can even use tantivy's result to paginate:
-//!
-//! let last_tantivy_result = tantivy_top.into_iter().last().unwrap();
-//! let tique_next_collector_via_tantivy =
-//! TopCollector::<_, Descending, _>::new(K, last_tantivy_result);
-//!
-//! let (tantivy_until_next, tique_next, tique_same_next) = searcher.search(
-//! &AllQuery,
-//! &(tantivy_next_collector,tique_next_collector, tique_next_collector_via_tantivy))?;
-//!
-//! assert_eq!(tique_next.items, tique_same_next.items);
-//! assert_eq!(tantivy_until_next[K..], tique_next.items[..]);
-//!
-//! // We can also sort by the fast fields we indexed:
-//!
-//! let min_rank_collector =
-//! TopCollector::<f64, Ascending, _>::new(3, true)
-//! .top_fast_field(rank_field);
-//!
-//! let top_ids_collector =
-//! TopCollector::<u64, Descending, _>::new(3, true)
-//! .top_fast_field(id_field);
-//!
-//! let (min_rank, top_ids) =
-//! searcher.search(&AllQuery, &(min_rank_collector, top_ids_collector))?;
-//!
-//! assert_eq!(
-//! vec![99, 98, 97],
-//! top_ids.items.into_iter().map(|(score, _addr)| score).collect::<Vec<u64>>()
-//! );
-//!
-//! assert_eq!(
-//! vec![-99.0, -98.0, -97.0],
-//! min_rank.items.into_iter().map(|(score, _addr)| score).collect::<Vec<f64>>()
-//! );
-//!
-//! // There's more to conditions than booleans and `(T, DocAddress)`,
-//! // by the way. It's whatever implements the trait
-//! // `tique::conditional_collector::traits::ConditionForSegment`
-//!
-//! // So let's say we decide to make a pagination feature public
-//! // but very understandably don't want to expose DocAddress.
-//! // We can always retrieve a STORED field via a DocAddress,
-//! // so returning a public id from a search result is easy.
-//!
-//! // For the search part we can do something like this:
-//!
-//! const PAGE_SIZE: usize = 15;
-//! let first_page_collector =
-//! TopCollector::<f64, Descending, _>::new(PAGE_SIZE, true)
-//! .top_fast_field(rank_field);
-//!
-//! let page = searcher.search(&AllQuery, &first_page_collector)?;
-//!
-//! let mut result : Vec<(f64, u64)> = Vec::new();
-//! for (score, addr) in page.items.iter() {
-//! let doc = searcher.doc(*addr)?;
-//! if let Some(&Value::U64(public_id)) = doc.get_first(id_field) {
-//! result.push((*score, public_id));
-//! }
-//! }
-//!
-//! // So whenever `page.has_next()` is true, `result.last()` will
-//! // contain the cursor for our next page.
-//! assert!(page.has_next());
-//! let (ref_score, ref_id) = *result.last().unwrap();
-//!
-//! // And you can keep paginating beaking even via the
-//! // public id as follows:
-//!
-//! let paginator = move |reader: &SegmentReader| {
-//! let id_reader = reader.fast_fields().u64(id_field)
-//! .expect("id field is u64 FAST");
-//!
-//! move |_segment_id, doc_id, score, is_ascending: bool| {
-//! let public_id = id_reader.get(doc_id);
-//!
-//! match ref_score.partial_cmp(&score) {
-//! Some(Ordering::Greater) => !is_ascending,
-//! Some(Ordering::Less) => is_ascending,
-//! Some(Ordering::Equal) => ref_id < public_id,
-//! None => false,
-//! }
-//! }
-//! };
-//!
-//! let second_page_collector =
-//! TopCollector::<f64, Descending, _>::new(PAGE_SIZE, paginator)
-//! .top_fast_field(rank_field);
-//!
-//! let two_pages_collector =
-//! TopCollector::<f64, Descending, _>::new(PAGE_SIZE * 2, true)
-//! .top_fast_field(rank_field);
-//!
-//! let (two_pages, second_page) = searcher.search(
-//! &AllQuery,
-//! &(two_pages_collector, second_page_collector))?;
-//!
-//! assert_eq!(two_pages.items[PAGE_SIZE..], second_page.items[..]);
-//!
-//! # Ok::<(), tantivy::Error>(())
+//! # use tique::conditional_collector::{Descending,TopCollector};
+//! # use tantivy::Score;
+//! # let condition_for_segment = true;
+//! let collector =
+//! TopCollector::<Score, Descending, _>::new(10, condition_for_segment);
//! ```
+//!
+//! NOTE: Usually the score type (`Score` above, a `f32`) is inferred
+//! so there's no need to specify it.
+//!
+//! # Ordering Support
+//!
+//! When constructing a top collector you *must* specify how to
+//! actually order the items: in ascending or descending order.
+//!
+//! You simply choose `Ascending` or `Descending` and let the
+//! compiler know:
+//!
+//! ```rust
+//! # use tique::conditional_collector::{Ascending,TopCollector};
+//! # use tantivy::Score;
+//! # let limit = 10;
+//! # let condition_for_segment = true;
+//! let collector =
+//! TopCollector::<Score, Ascending, _>::new(limit, condition_for_segment);
+//! ```
+//!
+//! # Condition Support
+//!
+//! A "condition" is simply a way to tell the collector that
+//! a document is a valid candidate for the top. It behaves
+//! just like a query filter would, but does not limit the
+//! candidates before the collector sees them.
+//!
+//! This is a valid condition that accepts everything:
+//!
+//! ```rust
+//! let condition_for_segment = true;
+//! ```
+//!
+//! Generally speaking, a `condition` is anything that implements
+//! the `ConditionForSegment` trait and you can use closures as a
+//! shortcut:
+//!
+//! ```rust
+//! # use tantivy::{Score,SegmentReader};
+//! # use tique::conditional_collector::{TopCollector,Ascending};
+//! # let limit = 10;
+//! let condition_for_segment = move |reader: &SegmentReader| {
+//! // Fetch useful stuff from the `reader`, then:
+//! move |segment_id, doc_id, score, is_ascending| {
+//! // Express whatever logic you want
+//! true
+//! }
+//! };
+//!
+//! let collector =
+//! TopCollector::<Score, Ascending, _>::new(limit, condition_for_segment);
+//! ```
+//!
+//! ## Aside: Pagination with Constant Memory
+//!
+//! If you've been using `tantivy` for a while, you're probably
+//! used to seeing tuples like `(T, DocAddress)` (T is usually
+//! `tantivy::Score`, but changes if you customize the score
+//! somehow).
+//!
+//! You can also use these tuples as a condition and they act
+//! like a cursor for pagination, so when you do something like:
+//!
+//! ```rust
+//! # use tantivy::DocAddress;
+//! # use tique::conditional_collector::{TopCollector,Descending};
+//! let limit = 10;
+//! let condition_for_segment = (0.42, DocAddress(0, 1));
+//! let collector =
+//! TopCollector::<_, Descending, _>::new(limit, condition_for_segment);
+//! ```
+//!
+//! What you are asking for is the top `limit` documents that appear
+//! *after* (because you chose the `Descending` order) documents
+//! that scored `0.42` at whatever query you throw at it (and in
+//! case multiple docs have the same score, the collector knows to
+//! break ties using the `DocAddress`).
+//!
+//! The results that you get after your search will contain more
+//! `(T, DocAddress)` tuples you can use to keep pagination
+//! going without ever having to increase `limit`.
+//!
+//! Check `examples/conditional_collector_tutorial.rs` for more details.
mod custom_score;
mod top_collector;
mod topk;
Modified tique/src/conditional_collector/top_collector.rs
CustomScoreTopCollector,
};
+/// TopCollector is like tantivy's, with ordering and condition
+/// support.
+///
+/// # Examples
+///
+/// ## A top-collector that behaves like `tantivy`'s
+///
+/// The first `Score` type is usually inferred but we need to be
+/// explicit for this example.
+///
+/// ```rust
+/// # use tique::conditional_collector::{TopCollector,Descending};
+/// let collector =
+/// TopCollector::<tantivy::Score, Descending, _>::new(10, true);
+/// ```
+///
+/// ## Custom condition from a closure.
+///
+/// ```rust
+/// # use tantivy::{Score,SegmentReader};
+/// # use tique::conditional_collector::{TopCollector,Ascending};
+/// let condition_for_segment = |reader: &SegmentReader| {
+/// // Fetch useful stuff from the `reader`, then:
+/// |segment_id, doc_id, score, is_ascending| {
+/// // Express whatever logic you want
+/// true
+/// }
+/// };
+///
+/// let collector =
+/// TopCollector::<Score, Ascending, _>::new(20, condition_for_segment);
+/// ```
+///
+/// ## Using a fast field as the score
+///
+/// *CAUTION*: Using a field that is not `FAST` or is of a different
+/// type than the one you specify will lead to a panic at runtime.
+///
+/// ```rust
+/// # use tique::conditional_collector::{Ascending, Descending, TopCollector};
+/// # let rank_field = tantivy::schema::Field::from_field_id(0);
+/// # let id_field = tantivy::schema::Field::from_field_id(1);
+/// # let limit = 10;
+/// # let condition = true;
+/// let min_rank_collector =
+/// TopCollector::<f64, Ascending, _>::new(limit, condition)
+/// .top_fast_field(rank_field);
+///
+/// let top_ids_collector =
+/// TopCollector::<u64, Descending, _>::new(limit, condition)
+/// .top_fast_field(id_field);
+///
+/// ```
pub struct TopCollector<T, P, CF> {
limit: usize, // capacity of the collector (see `new`, which rejects values below 1)
condition_for_segment: CF, // the `ConditionForSegment` implementation handed to `new`
_score: PhantomData<T>, // zero-sized marker carrying the score type `T`
_provider: PhantomData<P>, // zero-sized marker carrying the `P` type parameter
}
-
-/// TopCollector is like tantivy's, with ordering and condition
-/// support.
impl<T, P, CF> TopCollector<T, P, CF>
where
T: PartialOrd,
P: TopKProvider<T, DocId>,
CF: ConditionForSegment<T>,
{
+ /// Creates a new TopCollector with capacity of `limit`
+ /// and respecting the given `ConditionForSegment`
+ /// implementation.
pub fn new(limit: usize, condition_for_segment: CF) -> Self {
if limit < 1 {
panic!("Limit must be greater than 0");
P: 'static + Send + Sync + TopKProvider<$type, DocId>,
CF: Send + Sync + ConditionForSegment<$type>,
{
+ /// Transforms this collector into one that sorts by the given
+ /// fast field. Will panic if the field is not FAST or is of the
+ /// wrong type.
pub fn top_fast_field(
self,
field: tantivy::schema::Field,
Created tique/examples/conditional_collector_tutorial.rs
+use std::{cmp::Ordering, ops::Neg};
+
+use tantivy::{
+ collector::TopDocs,
+ query::AllQuery,
+ schema::{SchemaBuilder, Value, FAST, STORED},
+ Document, Index, Result, SegmentReader,
+};
+
+use tique::conditional_collector::{Ascending, Descending, TopCollector};
+
+pub fn main() -> Result<()> {
+ // First, we create a test index with a couple of fields
+ // and with some documents already in it.
+ let mut builder = SchemaBuilder::new();
+
+ let id_field = builder.add_u64_field("id", FAST | STORED);
+ let rank_field = builder.add_f64_field("rank", FAST);
+
+ let index = Index::create_in_ram(builder.build());
+ let mut writer = index.writer_with_num_threads(1, 3_000_000)?;
+
+ const NUM_DOCS: i32 = 100;
+ const PAGE_SIZE: usize = 10;
+
+ for i in 0..NUM_DOCS {
+ let mut doc = Document::new();
+ doc.add_f64(rank_field, f64::from(i.neg()));
+ doc.add_u64(id_field, i as u64);
+ writer.add_document(doc);
+ }
+
+ writer.commit()?;
+ let reader = index.reader()?;
+ let searcher = reader.searcher();
+
+ // Now that we have an index and a way to search it, let's
+ // create our collectors:
+
+ // Let's use one from tantivy to make sure things work as stated
+ let tantivy_collector = TopDocs::with_limit(PAGE_SIZE);
+
+ // Now create a conditional_collector that behaves like the one
+ // above. The first `_` is `tantivy::Score`, but it gets inferred.
+ let tique_collector = TopCollector::<_, Descending, _>::new(PAGE_SIZE, true);
+
+ let (tantivy_top, tique_top) =
+ searcher.search(&AllQuery, &(tantivy_collector, tique_collector))?;
+
+ assert_eq!(tantivy_top.len(), tique_top.items.len());
+ // Phew!
+
+ // Noticed that we checked against `tique_top.items`? It's because
+ // tique's collectors come with some extra metadata to make it more
+ // useful.
+
+ // We know how many documents matched the *query*, (not
+ // necessarily the range), just like a count collector would.
+ // So we expect it to be the number of documents in the index
+ // given our query.
+ assert_eq!(NUM_DOCS as usize, tique_top.total);
+
+ // We also know whether there would have been more items had
+ // we asked for them:
+ assert!(tique_top.has_next());
+
+ // This is useful information because it tells us that
+ // we can keep searching easily.
+
+ // One simple way to get the next page is to ask for more
+ // results and shift. It's a super fast way that can become
+ // problematic for digging deep into very large indices.
+ let tantivy_next_collector = TopDocs::with_limit(PAGE_SIZE * 2);
+
+ // Our conditional_collector types know how to paginate based
+ // on their own results, which allows you to keep memory stable
+ // while spending more CPU time doing comparisons:
+
+ let last_result = tique_top.items.into_iter().last().unwrap();
+ let tique_next_collector = TopCollector::<_, Descending, _>::new(PAGE_SIZE, last_result);
+
+ // One disadvantage of this approach is that you can't simply
+ // skip to an arbitrary page. When that's a requirement, the
+ // best idea is to use the "memory hungry" approach until a
+ // certain threshold, then switch to cursor-based.
+ // You can even use tantivy's result to paginate:
+
+ let last_tantivy_result = tantivy_top.into_iter().last().unwrap();
+ let tique_next_collector_via_tantivy =
+ TopCollector::<_, Descending, _>::new(PAGE_SIZE, last_tantivy_result);
+
+ let (tantivy_until_next, tique_next, tique_same_next) = searcher.search(
+ &AllQuery,
+ &(
+ tantivy_next_collector,
+ tique_next_collector,
+ tique_next_collector_via_tantivy,
+ ),
+ )?;
+
+ assert_eq!(tique_next.items, tique_same_next.items);
+ assert_eq!(tantivy_until_next[PAGE_SIZE..], tique_next.items[..]);
+
+ // We can also sort by the fast fields we indexed:
+
+ let min_rank_collector =
+ TopCollector::<f64, Ascending, _>::new(3, true).top_fast_field(rank_field);
+
+ let top_ids_collector =
+ TopCollector::<u64, Descending, _>::new(3, true).top_fast_field(id_field);
+
+ let (min_rank, top_ids) =
+ searcher.search(&AllQuery, &(min_rank_collector, top_ids_collector))?;
+
+ assert_eq!(
+ vec![99, 98, 97],
+ top_ids
+ .items
+ .into_iter()
+ .map(|(score, _addr)| score)
+ .collect::<Vec<u64>>()
+ );
+
+ assert_eq!(
+ vec![-99.0, -98.0, -97.0],
+ min_rank
+ .items
+ .into_iter()
+ .map(|(score, _addr)| score)
+ .collect::<Vec<f64>>()
+ );
+
+ // There's more to conditions than booleans and `(T, DocAddress)`,
+ // by the way. It's whatever implements the trait
+ // `tique::conditional_collector::traits::ConditionForSegment`
+
+ // So let's say we decide to make a pagination feature public
+ // but very understandably don't want to expose DocAddress.
+ // We can always retrieve a STORED field via a DocAddress,
+ // so returning a public id from a search result is easy.
+
+ // For the search part we can do something like this:
+
+ let first_page_collector =
+ TopCollector::<f64, Descending, _>::new(PAGE_SIZE, true).top_fast_field(rank_field);
+
+ let page = searcher.search(&AllQuery, &first_page_collector)?;
+
+ let mut result = Vec::new();
+ for (score, addr) in page.items.iter() {
+ let doc = searcher.doc(*addr)?;
+ if let Some(Value::U64(public_id)) = doc.get_first(id_field) {
+ result.push((*score, *public_id));
+ }
+ }
+
+ assert!(page.has_next());
+ // So whenever `page.has_next()` is true, `result.last()` will
+ // contain the cursor for our next page.
+ let (ref_score, ref_id) = *result.last().unwrap();
+
+ // And you can keep paginating, breaking ties between equal
+ // scores via the public id, as follows:
+ let paginator = move |reader: &SegmentReader| {
+ let id_reader = reader
+ .fast_fields()
+ .u64(id_field)
+ .expect("id field is u64 FAST");
+
+ move |_segment_id, doc_id, score, is_ascending: bool| {
+ let public_id = id_reader.get(doc_id);
+
+ match ref_score.partial_cmp(&score) {
+ Some(Ordering::Greater) => !is_ascending,
+ Some(Ordering::Less) => is_ascending,
+ Some(Ordering::Equal) => ref_id < public_id,
+ None => false,
+ }
+ }
+ };
+
+ let second_page_collector =
+ TopCollector::<f64, Descending, _>::new(PAGE_SIZE, paginator).top_fast_field(rank_field);
+
+ let two_pages_collector =
+ TopCollector::<f64, Descending, _>::new(PAGE_SIZE * 2, true).top_fast_field(rank_field);
+
+ let (two_pages, second_page) =
+ searcher.search(&AllQuery, &(two_pages_collector, second_page_collector))?;
+
+ assert_eq!(two_pages.items[PAGE_SIZE..], second_page.items[..]);
+
+ Ok(())
+}