caio.co/de/cantine

Adapt tique to tantivy 0.15

Two simple changes:

 * SegmentLocalId is now SegmentOrdinal, so `segment_id` labels
   were also updated to `segment_ord`

 * DocAddress is now a struct with named members instead of
   a tuple struct, so the code is changed to construct
   it like `DocAddress{...}` instead of `DocAddress(...)`
Id
1994eee1924d67d411a3a2f59a154d1a80da22d8
Author
Caio
Commit time
2021-06-14T16:09:42+02:00

Modified tique/Cargo.toml

@@ -20,7 +20,7
queryparser = ["nom"]

[dependencies]
-tantivy = "0.14"
+tantivy = "0.15"
nom = { version = "6", optional = true }

[dev-dependencies]

Modified tique/examples/conditional_collector_tutorial.rs

@@ -1,7 +1,8
use std::{cmp::Ordering, ops::Neg};

use tantivy::{
collector::TopDocs,
+ fastfield::FastFieldReader,
query::AllQuery,
schema::{SchemaBuilder, Value, FAST, STORED},
Document, Index, Result, SegmentReader,

Modified tique/src/dismax.rs

@@ -333,7 +333,10

let dismax = DisMaxQuery::new(vec![Box::new(foo_query), Box::new(bar_query)], 0.0);

- let baz_doc = DocAddress(0, 3);
+ let baz_doc = DocAddress {
+ segment_ord: 0,
+ doc_id: 3,
+ };
assert!(
dismax.explain(&searcher, baz_doc).is_err(),
"Shouldn't be able to explain a non-matching doc"
@@ -341,7 +344,13

// Ensure every other doc can be explained
for doc_id in 0..3 {
- let explanation = dismax.explain(&searcher, DocAddress(0, doc_id))?;
+ let explanation = dismax.explain(
+ &searcher,
+ DocAddress {
+ segment_ord: 0,
+ doc_id,
+ },
+ )?;
assert!(explanation.to_pretty_json().contains("DisMaxQuery"));
}

Modified tique/src/topterms.rs

@@ -291,10 +291,13
where
F: FnMut(Term, u32),
{
- let DocAddress(seg_id, doc_id) = doc;
+ let DocAddress {
+ segment_ord,
+ doc_id,
+ } = doc;

- let reader = searcher.segment_reader(seg_id);
- let inverted_index = reader.inverted_index(field.clone())?;
+ let reader = searcher.segment_reader(segment_ord);
+ let inverted_index = reader.inverted_index(field)?;
let mut termstream = inverted_index.terms().stream()?;

while let Some((bytes, terminfo)) = termstream.next() {
@@ -374,12 +377,18
let text_termfreq = termfreq(&text, body, &index.tokenizer_for_field(body)?);

let reader = index.reader()?;
- assert!(
- termfreq_for_doc(&reader.searcher(), body, DocAddress(0, 0), |term, tf| {
+ assert!(termfreq_for_doc(
+ &reader.searcher(),
+ body,
+ DocAddress {
+ segment_ord: 0,
+ doc_id: 0
+ },
+ |term, tf| {
assert_eq!(Some(&tf), text_termfreq.get(&term));
- })
- .is_ok()
- );
+ }
+ )
+ .is_ok());

Ok(())
}
@@ -459,18 +468,36
&& doc_freq < num_docs
};

- let marley_keywords =
- topterms.extract_filtered_from_doc(5, DocAddress(0, 0), &keyword_filter);
+ let marley_keywords = topterms.extract_filtered_from_doc(
+ 5,
+ DocAddress {
+ segment_ord: 0,
+ doc_id: 0,
+ },
+ &keyword_filter,
+ );

assert_word_found("marley", marley_keywords);

- let holmes_keywords =
- topterms.extract_filtered_from_doc(5, DocAddress(0, 1), &keyword_filter);
+ let holmes_keywords = topterms.extract_filtered_from_doc(
+ 5,
+ DocAddress {
+ segment_ord: 0,
+ doc_id: 1,
+ },
+ &keyword_filter,
+ );

assert_word_found("dangerous", holmes_keywords);

- let groucho_keywords =
- topterms.extract_filtered_from_doc(5, DocAddress(0, 2), &keyword_filter);
+ let groucho_keywords = topterms.extract_filtered_from_doc(
+ 5,
+ DocAddress {
+ segment_ord: 0,
+ doc_id: 2,
+ },
+ &keyword_filter,
+ );

let reader = index.reader()?;
let searcher = reader.searcher();
@@ -480,7 +507,10
)?;

assert_eq!(
- Some(DocAddress(0, 2)),
+ Some(DocAddress {
+ segment_ord: 0,
+ doc_id: 2
+ }),
similar_to_groucho.first().map(|x| x.1),
"expected groucho's to be the most similar to its own keyword set"
);

Modified tique/src/conditional_collector/custom_score.rs

@@ -1,8 +1,8
use std::marker::PhantomData;

use tantivy::{
collector::{Collector, CustomScorer, CustomSegmentScorer, SegmentCollector},
- DocId, Result, Score, SegmentLocalId, SegmentReader,
+ DocId, Result, Score, SegmentOrdinal, SegmentReader,
};

use super::{
@@ -62,7 +62,7

fn for_segment(
&self,
- segment_id: SegmentLocalId,
+ segment_id: SegmentOrdinal,
reader: &SegmentReader,
) -> Result<Self::Child> {
let scorer = self.scorer_for_segment.segment_scorer(reader)?;
@@ -90,7 +90,7
C: CheckCondition<T>,
K: TopK<T, DocId>,
{
- pub fn new(segment_id: SegmentLocalId, topk: K, scorer: S, condition: C) -> Self {
+ pub fn new(segment_id: SegmentOrdinal, topk: K, scorer: S, condition: C) -> Self {
Self {
scorer,
collector: TopSegmentCollector::new(segment_id, topk, condition),
@@ -141,7 +141,7

let got = &res.items[0];
// Is disregarded and doc_id is used instead
- assert_eq!((got.1).1, got.0)
+ assert_eq!((got.1).doc_id, got.0)
}

#[test]

Modified tique/src/conditional_collector/mod.rs

@@ -56,7 +56,7
//! # let limit = 10;
//! let condition_for_segment = move |reader: &SegmentReader| {
//! // Fetch useful stuff from the `reader`, then:
-//! move |segment_id, doc_id, score, is_ascending| {
+//! move |segment_ord, doc_id, score, is_ascending| {
//! // Express whatever logic you want
//! true
//! }
@@ -80,7 +80,7
//! # use tantivy::DocAddress;
//! # use tique::conditional_collector::{TopCollector,Descending};
//! let limit = 10;
-//! let condition_for_segment = (0.42, DocAddress(0, 1));
+//! let condition_for_segment = (0.42, DocAddress{segment_ord: 0, doc_id: 1});
//! let collector =
//! TopCollector::<_, Descending, _>::new(limit, condition_for_segment);
//! ```

Modified tique/src/conditional_collector/top_collector.rs

@@ -1,8 +1,9
use std::marker::PhantomData;

use tantivy::{
collector::{Collector, CustomScorer, SegmentCollector},
- DocAddress, DocId, Result, Score, SegmentLocalId, SegmentReader,
+ fastfield::FastFieldReader,
+ DocAddress, DocId, Result, Score, SegmentOrdinal, SegmentReader,
};

use super::{
@@ -34,7 +35,7
/// # use tique::conditional_collector::{TopCollector,Ascending};
/// let condition_for_segment = |reader: &SegmentReader| {
/// // Fetch useful stuff from the `reader`, then:
-/// |segment_id, doc_id, score, is_ascending| {
+/// |segment_ord, doc_id, score, is_ascending| {
/// // Express whatever logic you want
/// true
/// }
@@ -182,11 +183,11

fn for_segment(
&self,
- segment_id: SegmentLocalId,
+ segment_ord: SegmentOrdinal,
reader: &SegmentReader,
) -> Result<Self::Child> {
Ok(TopSegmentCollector::new(
- segment_id,
+ segment_ord,
P::new_topk(self.limit),
self.condition_for_segment.for_segment(reader),
))
@@ -196,7 +197,7
pub struct TopSegmentCollector<T, K, C> {
total: usize,
visited: usize,
- segment_id: SegmentLocalId,
+ segment_ord: SegmentOrdinal,
topk: K,
condition: C,
_marker: PhantomData<T>,
@@ -208,11 +209,11
K: TopK<T, DocId>,
C: CheckCondition<T>,
{
- pub fn new(segment_id: SegmentLocalId, topk: K, condition: C) -> Self {
+ pub fn new(segment_ord: SegmentOrdinal, topk: K, condition: C) -> Self {
Self {
total: 0,
visited: 0,
- segment_id,
+ segment_ord,
topk,
condition,
_marker: PhantomData,
@@ -228,7 +229,7
self.total += 1;
if self
.condition
- .check(self.segment_id, doc, score, K::ASCENDING)
+ .check(self.segment_ord, doc, score, K::ASCENDING)
{
self.visited += 1;
self.topk.visit(doc, score);
@@ -236,12 +237,20
}

pub fn into_unsorted_collection_result(self) -> CollectionResult<T> {
- let segment_id = self.segment_id;
+ let segment_ord = self.segment_ord;
let items = self
.topk
.into_vec()
.into_iter()
- .map(|(doc, score)| (score, DocAddress(segment_id, doc)))
+ .map(|(doc_id, score)| {
+ (
+ score,
+ DocAddress {
+ segment_ord,
+ doc_id,
+ },
+ )
+ })
.collect();

CollectionResult {
@@ -354,8 +363,11
assert_eq!(4, result.total);
assert_eq!(2, result.items.len());
for (score, doc) in result.items {
- let DocAddress(seg_id, doc_id) = doc;
- assert!(condition(seg_id, doc_id, score, true))
+ let DocAddress {
+ segment_ord,
+ doc_id,
+ } = doc;
+ assert!(condition(segment_ord, doc_id, score, true))
}
}

@@ -392,7 +404,13
// then we pick an arbitrary position to pivot and
// expect the DescendingTopK to pick everything below
// and the AscendingTopK to pick everything above
- let marker = (0.5, DocAddress(0, 4));
+ let marker = (
+ 0.5,
+ DocAddress {
+ segment_ord: 0,
+ doc_id: 4,
+ },
+ );

check_segment_collector(
DescendingTopK::new(10),

Modified tique/src/conditional_collector/traits.rs

@@ -1,6 +1,6
use std::cmp::Ordering;

-use tantivy::{DocAddress, DocId, SegmentLocalId, SegmentReader};
+use tantivy::{DocAddress, DocId, SegmentOrdinal, SegmentReader};

use super::topk::Scored;

@@ -53,21 +53,21
///
/// The `ascending` parameter signals the ordering chosen via
/// `Ascending` or `Descending`
- fn check(&self, segment_id: SegmentLocalId, doc_id: DocId, score: T, ascending: bool) -> bool;
+ fn check(&self, segment_ord: SegmentOrdinal, doc_id: DocId, score: T, ascending: bool) -> bool;
}

impl<T> CheckCondition<T> for bool {
- fn check(&self, _: SegmentLocalId, _: DocId, _: T, _: bool) -> bool {
+ fn check(&self, _: SegmentOrdinal, _: DocId, _: T, _: bool) -> bool {
*self
}
}

impl<F, T> CheckCondition<T> for F
where
- F: 'static + Clone + Fn(SegmentLocalId, DocId, T, bool) -> bool,
+ F: 'static + Clone + Fn(SegmentOrdinal, DocId, T, bool) -> bool,
{
- fn check(&self, segment_id: SegmentLocalId, doc_id: DocId, score: T, ascending: bool) -> bool {
- (self)(segment_id, doc_id, score, ascending)
+ fn check(&self, segment_ord: SegmentOrdinal, doc_id: DocId, score: T, ascending: bool) -> bool {
+ (self)(segment_ord, doc_id, score, ascending)
}
}

@@ -75,14 +75,19
where
T: 'static + PartialOrd + Copy,
{
- fn check(&self, segment_id: SegmentLocalId, doc_id: DocId, score: T, ascending: bool) -> bool {
+ fn check(&self, segment_ord: SegmentOrdinal, doc_id: DocId, score: T, ascending: bool) -> bool {
let wanted = if ascending {
Ordering::Less
} else {
Ordering::Greater
};

- Scored::new(self.0, self.1).cmp(&Scored::new(score, DocAddress(segment_id, doc_id)))
- == wanted
+ Scored::new(self.0, self.1).cmp(&Scored::new(
+ score,
+ DocAddress {
+ segment_ord,
+ doc_id,
+ },
+ )) == wanted
}
}

Modified tique/src/queryparser/parser.rs

@@ -298,6 +298,12
use super::*;

use tantivy::tokenizer::TokenizerManager;
+ use tantivy::{
+ collector::TopDocs,
+ doc,
+ schema::{SchemaBuilder, TEXT},
+ DocAddress, SegmentOrdinal,
+ };

fn test_interpreter() -> Interpreter {
Interpreter {
@@ -357,12 +363,12
assert!(single_field_test_parser().parse("").is_none());
}

- use tantivy::{
- collector::TopDocs,
- doc,
- schema::{SchemaBuilder, TEXT},
- DocAddress,
- };
+ fn doc_addr(segment_ord: SegmentOrdinal, doc_id: u32) -> DocAddress {
+ DocAddress {
+ segment_ord,
+ doc_id,
+ }
+ }

#[test]
fn index_integration() -> Result<()> {
@@ -372,7 +378,7
let index = Index::create_in_ram(builder.build());
let mut writer = index.writer_with_num_threads(1, 3_000_000)?;

- let doc_across = DocAddress(0, 0);
+ let doc_across = doc_addr(0, 0);
writer.add_document(doc!(
title => "Across the Universe",
plot => "Musical based on The Beatles songbook and set in the 60s England, \
@@ -380,14 +386,14
with the anti-war movement and social protests of the 60s."
));

- let doc_moulin = DocAddress(0, 1);
+ let doc_moulin = doc_addr(0, 1);
writer.add_document(doc!(
title => "Moulin Rouge!",
plot => "A poet falls for a beautiful courtesan whom a jealous duke covets in \
this stylish musical, with music drawn from familiar 20th century sources."
));

- let doc_once = DocAddress(0, 2);
+ let doc_once = doc_addr(0, 2);
writer.add_document(doc!(
title => "Once",
plot => "A modern-day musical about a busker and an immigrant and their eventful\
@@ -464,7 +470,7
let found = searcher.search(&normal_query, &TopDocs::with_limit(3))?;
assert_eq!(3, found.len());
// the first doc matches perfectly on `field_b`
- assert_eq!(DocAddress(0, 0), found[0].1);
+ assert_eq!(doc_addr(0, 0), found[0].1);

parser.set_boost(field_a, Some(1.5));
let boosted_query = parser.parse(&input).unwrap();
@@ -473,7 +479,7
assert_eq!(3, found.len());
// the first doc matches perfectly on field_b
// but now matching on `field_a` is super important
- assert_eq!(DocAddress(0, 1), found[0].1);
+ assert_eq!(doc_addr(0, 1), found[0].1);

Ok(())
}