Adapt tique to tantivy 0.15
Two simple changes: * `SegmentLocalId` is now `SegmentOrdinal`, so `segment_id` labels were also updated to `segment_ord` * `DocAddress` is now a struct with named members instead of a tuple struct, so the code is changed to construct it like `DocAddress { ... }` instead of `DocAddress(...)`
- Id
- 1994eee1924d67d411a3a2f59a154d1a80da22d8
- Author
- Caio
- Commit time
- 2021-06-14T16:09:42+02:00
Modified tique/Cargo.toml
queryparser = ["nom"]
[dependencies]
-tantivy = "0.14"
+tantivy = "0.15"
nom = { version = "6", optional = true }
[dev-dependencies]
Modified tique/examples/conditional_collector_tutorial.rs
use std::{cmp::Ordering, ops::Neg};
use tantivy::{
collector::TopDocs,
+ fastfield::FastFieldReader,
query::AllQuery,
schema::{SchemaBuilder, Value, FAST, STORED},
Document, Index, Result, SegmentReader,
Modified tique/src/dismax.rs
let dismax = DisMaxQuery::new(vec![Box::new(foo_query), Box::new(bar_query)], 0.0);
- let baz_doc = DocAddress(0, 3);
+ let baz_doc = DocAddress {
+ segment_ord: 0,
+ doc_id: 3,
+ };
assert!(
dismax.explain(&searcher, baz_doc).is_err(),
"Shouldn't be able to explain a non-matching doc"
// Ensure every other doc can be explained
for doc_id in 0..3 {
- let explanation = dismax.explain(&searcher, DocAddress(0, doc_id))?;
+ let explanation = dismax.explain(
+ &searcher,
+ DocAddress {
+ segment_ord: 0,
+ doc_id,
+ },
+ )?;
assert!(explanation.to_pretty_json().contains("DisMaxQuery"));
}
Modified tique/src/topterms.rs
where
F: FnMut(Term, u32),
{
- let DocAddress(seg_id, doc_id) = doc;
+ let DocAddress {
+ segment_ord,
+ doc_id,
+ } = doc;
- let reader = searcher.segment_reader(seg_id);
- let inverted_index = reader.inverted_index(field.clone())?;
+ let reader = searcher.segment_reader(segment_ord);
+ let inverted_index = reader.inverted_index(field)?;
let mut termstream = inverted_index.terms().stream()?;
while let Some((bytes, terminfo)) = termstream.next() {
let text_termfreq = termfreq(&text, body, &index.tokenizer_for_field(body)?);
let reader = index.reader()?;
- assert!(
- termfreq_for_doc(&reader.searcher(), body, DocAddress(0, 0), |term, tf| {
+ assert!(termfreq_for_doc(
+ &reader.searcher(),
+ body,
+ DocAddress {
+ segment_ord: 0,
+ doc_id: 0
+ },
+ |term, tf| {
assert_eq!(Some(&tf), text_termfreq.get(&term));
- })
- .is_ok()
- );
+ }
+ )
+ .is_ok());
Ok(())
}
&& doc_freq < num_docs
};
- let marley_keywords =
- topterms.extract_filtered_from_doc(5, DocAddress(0, 0), &keyword_filter);
+ let marley_keywords = topterms.extract_filtered_from_doc(
+ 5,
+ DocAddress {
+ segment_ord: 0,
+ doc_id: 0,
+ },
+ &keyword_filter,
+ );
assert_word_found("marley", marley_keywords);
- let holmes_keywords =
- topterms.extract_filtered_from_doc(5, DocAddress(0, 1), &keyword_filter);
+ let holmes_keywords = topterms.extract_filtered_from_doc(
+ 5,
+ DocAddress {
+ segment_ord: 0,
+ doc_id: 1,
+ },
+ &keyword_filter,
+ );
assert_word_found("dangerous", holmes_keywords);
- let groucho_keywords =
- topterms.extract_filtered_from_doc(5, DocAddress(0, 2), &keyword_filter);
+ let groucho_keywords = topterms.extract_filtered_from_doc(
+ 5,
+ DocAddress {
+ segment_ord: 0,
+ doc_id: 2,
+ },
+ &keyword_filter,
+ );
let reader = index.reader()?;
let searcher = reader.searcher();
)?;
assert_eq!(
- Some(DocAddress(0, 2)),
+ Some(DocAddress {
+ segment_ord: 0,
+ doc_id: 2
+ }),
similar_to_groucho.first().map(|x| x.1),
"expected groucho's to be the most similar to its own keyword set"
);
Modified tique/src/conditional_collector/custom_score.rs
use std::marker::PhantomData;
use tantivy::{
collector::{Collector, CustomScorer, CustomSegmentScorer, SegmentCollector},
- DocId, Result, Score, SegmentLocalId, SegmentReader,
+ DocId, Result, Score, SegmentOrdinal, SegmentReader,
};
use super::{
fn for_segment(
&self,
- segment_id: SegmentLocalId,
+ segment_id: SegmentOrdinal,
reader: &SegmentReader,
) -> Result<Self::Child> {
let scorer = self.scorer_for_segment.segment_scorer(reader)?;
C: CheckCondition<T>,
K: TopK<T, DocId>,
{
- pub fn new(segment_id: SegmentLocalId, topk: K, scorer: S, condition: C) -> Self {
+ pub fn new(segment_id: SegmentOrdinal, topk: K, scorer: S, condition: C) -> Self {
Self {
scorer,
collector: TopSegmentCollector::new(segment_id, topk, condition),
let got = &res.items[0];
// Is disregarded and doc_id is used instead
- assert_eq!((got.1).1, got.0)
+ assert_eq!((got.1).doc_id, got.0)
}
#[test]
Modified tique/src/conditional_collector/mod.rs
//! # let limit = 10;
//! let condition_for_segment = move |reader: &SegmentReader| {
//! // Fetch useful stuff from the `reader`, then:
-//! move |segment_id, doc_id, score, is_ascending| {
+//! move |segment_ord, doc_id, score, is_ascending| {
//! // Express whatever logic you want
//! true
//! }
//! # use tantivy::DocAddress;
//! # use tique::conditional_collector::{TopCollector,Descending};
//! let limit = 10;
-//! let condition_for_segment = (0.42, DocAddress(0, 1));
+//! let condition_for_segment = (0.42, DocAddress{segment_ord: 0, doc_id: 1});
//! let collector =
//! TopCollector::<_, Descending, _>::new(limit, condition_for_segment);
//! ```
Modified tique/src/conditional_collector/top_collector.rs
use std::marker::PhantomData;
use tantivy::{
collector::{Collector, CustomScorer, SegmentCollector},
- DocAddress, DocId, Result, Score, SegmentLocalId, SegmentReader,
+ fastfield::FastFieldReader,
+ DocAddress, DocId, Result, Score, SegmentOrdinal, SegmentReader,
};
use super::{
/// # use tique::conditional_collector::{TopCollector,Ascending};
/// let condition_for_segment = |reader: &SegmentReader| {
/// // Fetch useful stuff from the `reader`, then:
-/// |segment_id, doc_id, score, is_ascending| {
+/// |segment_ord, doc_id, score, is_ascending| {
/// // Express whatever logic you want
/// true
/// }
fn for_segment(
&self,
- segment_id: SegmentLocalId,
+ segment_ord: SegmentOrdinal,
reader: &SegmentReader,
) -> Result<Self::Child> {
Ok(TopSegmentCollector::new(
- segment_id,
+ segment_ord,
P::new_topk(self.limit),
self.condition_for_segment.for_segment(reader),
))
pub struct TopSegmentCollector<T, K, C> {
total: usize,
visited: usize,
- segment_id: SegmentLocalId,
+ segment_ord: SegmentOrdinal,
topk: K,
condition: C,
_marker: PhantomData<T>,
K: TopK<T, DocId>,
C: CheckCondition<T>,
{
- pub fn new(segment_id: SegmentLocalId, topk: K, condition: C) -> Self {
+ pub fn new(segment_ord: SegmentOrdinal, topk: K, condition: C) -> Self {
Self {
total: 0,
visited: 0,
- segment_id,
+ segment_ord,
topk,
condition,
_marker: PhantomData,
self.total += 1;
if self
.condition
- .check(self.segment_id, doc, score, K::ASCENDING)
+ .check(self.segment_ord, doc, score, K::ASCENDING)
{
self.visited += 1;
self.topk.visit(doc, score);
}
pub fn into_unsorted_collection_result(self) -> CollectionResult<T> {
- let segment_id = self.segment_id;
+ let segment_ord = self.segment_ord;
let items = self
.topk
.into_vec()
.into_iter()
- .map(|(doc, score)| (score, DocAddress(segment_id, doc)))
+ .map(|(doc_id, score)| {
+ (
+ score,
+ DocAddress {
+ segment_ord,
+ doc_id,
+ },
+ )
+ })
.collect();
CollectionResult {
assert_eq!(4, result.total);
assert_eq!(2, result.items.len());
for (score, doc) in result.items {
- let DocAddress(seg_id, doc_id) = doc;
- assert!(condition(seg_id, doc_id, score, true))
+ let DocAddress {
+ segment_ord,
+ doc_id,
+ } = doc;
+ assert!(condition(segment_ord, doc_id, score, true))
}
}
// then we pick an arbitrary position to pivot and
// expect the DescendingTopK to pick everything below
// and the AscendingTopK to pick everything above
- let marker = (0.5, DocAddress(0, 4));
+ let marker = (
+ 0.5,
+ DocAddress {
+ segment_ord: 0,
+ doc_id: 4,
+ },
+ );
check_segment_collector(
DescendingTopK::new(10),
Modified tique/src/conditional_collector/traits.rs
use std::cmp::Ordering;
-use tantivy::{DocAddress, DocId, SegmentLocalId, SegmentReader};
+use tantivy::{DocAddress, DocId, SegmentOrdinal, SegmentReader};
use super::topk::Scored;
///
/// The `ascending` parameter signals the ordering chosen via
/// `Ascending` or `Descending`
- fn check(&self, segment_id: SegmentLocalId, doc_id: DocId, score: T, ascending: bool) -> bool;
+ fn check(&self, segment_ord: SegmentOrdinal, doc_id: DocId, score: T, ascending: bool) -> bool;
}
impl<T> CheckCondition<T> for bool {
- fn check(&self, _: SegmentLocalId, _: DocId, _: T, _: bool) -> bool {
+ fn check(&self, _: SegmentOrdinal, _: DocId, _: T, _: bool) -> bool {
*self
}
}
impl<F, T> CheckCondition<T> for F
where
- F: 'static + Clone + Fn(SegmentLocalId, DocId, T, bool) -> bool,
+ F: 'static + Clone + Fn(SegmentOrdinal, DocId, T, bool) -> bool,
{
- fn check(&self, segment_id: SegmentLocalId, doc_id: DocId, score: T, ascending: bool) -> bool {
- (self)(segment_id, doc_id, score, ascending)
+ fn check(&self, segment_ord: SegmentOrdinal, doc_id: DocId, score: T, ascending: bool) -> bool {
+ (self)(segment_ord, doc_id, score, ascending)
}
}
where
T: 'static + PartialOrd + Copy,
{
- fn check(&self, segment_id: SegmentLocalId, doc_id: DocId, score: T, ascending: bool) -> bool {
+ fn check(&self, segment_ord: SegmentOrdinal, doc_id: DocId, score: T, ascending: bool) -> bool {
let wanted = if ascending {
Ordering::Less
} else {
Ordering::Greater
};
- Scored::new(self.0, self.1).cmp(&Scored::new(score, DocAddress(segment_id, doc_id)))
- == wanted
+ Scored::new(self.0, self.1).cmp(&Scored::new(
+ score,
+ DocAddress {
+ segment_ord,
+ doc_id,
+ },
+ )) == wanted
}
}
Modified tique/src/queryparser/parser.rs
use super::*;
use tantivy::tokenizer::TokenizerManager;
+ use tantivy::{
+ collector::TopDocs,
+ doc,
+ schema::{SchemaBuilder, TEXT},
+ DocAddress, SegmentOrdinal,
+ };
fn test_interpreter() -> Interpreter {
Interpreter {
assert!(single_field_test_parser().parse("").is_none());
}
- use tantivy::{
- collector::TopDocs,
- doc,
- schema::{SchemaBuilder, TEXT},
- DocAddress,
- };
+ fn doc_addr(segment_ord: SegmentOrdinal, doc_id: u32) -> DocAddress {
+ DocAddress {
+ segment_ord,
+ doc_id,
+ }
+ }
#[test]
fn index_integration() -> Result<()> {
let index = Index::create_in_ram(builder.build());
let mut writer = index.writer_with_num_threads(1, 3_000_000)?;
- let doc_across = DocAddress(0, 0);
+ let doc_across = doc_addr(0, 0);
writer.add_document(doc!(
title => "Across the Universe",
plot => "Musical based on The Beatles songbook and set in the 60s England, \
with the anti-war movement and social protests of the 60s."
));
- let doc_moulin = DocAddress(0, 1);
+ let doc_moulin = doc_addr(0, 1);
writer.add_document(doc!(
title => "Moulin Rouge!",
plot => "A poet falls for a beautiful courtesan whom a jealous duke covets in \
this stylish musical, with music drawn from familiar 20th century sources."
));
- let doc_once = DocAddress(0, 2);
+ let doc_once = doc_addr(0, 2);
writer.add_document(doc!(
title => "Once",
plot => "A modern-day musical about a busker and an immigrant and their eventful\
let found = searcher.search(&normal_query, &TopDocs::with_limit(3))?;
assert_eq!(3, found.len());
// the first doc matches perfectly on `field_b`
- assert_eq!(DocAddress(0, 0), found[0].1);
+ assert_eq!(doc_addr(0, 0), found[0].1);
parser.set_boost(field_a, Some(1.5));
let boosted_query = parser.parse(&input).unwrap();
assert_eq!(3, found.len());
// the first doc matches perfectly on field_b
// but now matching on `field_a` is super important
- assert_eq!(DocAddress(0, 1), found[0].1);
+ assert_eq!(doc_addr(0, 1), found[0].1);
Ok(())
}