caio.co/de/cantine

Support for conversion into weighted queries

Id
41b86436f3692d3fc201dbd29b062d913d909fca
Author
Caio
Commit time
2020-02-19T15:58:19+01:00

Modified tique/src/lib.rs

@@ -33,6 +33,8
//! # let index = Index::create_in_ram(builder.build());
//! let topterms = TopTerms::new(&index, vec![body, title])?;
//! let keywords = topterms.extract(5, "the quick fox jumps over the lazy dog");
+//!
+//! let similarity_query = keywords.into_boosted_query(1.0);
//! # Ok::<(), tantivy::TantivyError>(())
//!```

Modified tique/src/topterms.rs

@@ -75,7 +75,7
use std::{collections::HashMap, str};

use tantivy::{
- query::BooleanQuery,
+ query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery},
schema::{Field, FieldType, IndexRecordOption, Schema},
tokenizer::TextAnalyzer,
DocAddress, DocSet, Index, IndexReader, Postings, Result, Searcher, SkipResult, Term,
@@ -226,6 +226,25
BooleanQuery::new_multiterms_query(self.0.into_iter().map(|(term, _score)| term).collect())
}

+ /// Same as `into_query`, but with terms boosted by their
+ /// relative importance. The boost for each term is computed
+ /// as `boost_factor * (score / max_score)`, where `max_score`
+ /// is the score of the first (most relevant) keyword.
+ ///
+ /// When `max_score` is not strictly positive the ratio is
+ /// undefined (it would evaluate to NaN or infinity), so every
+ /// term is boosted by exactly `boost_factor` instead.
+ ///
+ /// The `boost_factor` parameter is useful when building more
+ /// complex queries; `1.0` is a good default.
+ pub fn into_boosted_query(self, boost_factor: f32) -> BooleanQuery {
+ let max_score = self.0.first().map_or(0.0, |(_term, score)| *score);
+
+ // One `Should` clause per keyword, so the final size is known upfront.
+ let mut clauses: Vec<(Occur, Box<dyn Query>)> = Vec::with_capacity(self.0.len());
+
+ for (term, score) in self.0.into_iter() {
+ // Dividing by a zero (or NaN) top score would hand a NaN/inf
+ // boost to the query; fall back to treating all terms as equal.
+ let relative = if max_score > 0.0 {
+ score / max_score
+ } else {
+ 1.0
+ };
+ let boost = boost_factor * relative;
+ let tq = Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs));
+ clauses.push((Occur::Should, Box::new(BoostQuery::new(tq, boost))));
+ }
+
+ BooleanQuery::from(clauses)
+ }
+
/// Iterates over the terms of this keywords set, more relevant
/// terms appear first
pub fn terms(&self) -> impl Iterator<Item = &Term> {
@@ -238,8 +257,6
pub fn into_sorted_vec(self) -> Vec<(Term, f32)> {
// Consumes `self` and moves the inner vector of (term, score) pairs
// out as-is; nothing is copied or re-ordered here.
self.0
}
-
- // TODO into_boosted_query, using the scaled tf/idf scores scaled with
}

impl From<DescendingTopK<f32, Term>> for Keywords {