caio.co/de/cantine

Use multiple fields when indexing and querying

This patch splits the `fulltext` index field into its sources: name,
ingredients and instructions, making them available for individual
strict querying.

Unsurprisingly, using an OR-by-default, boolean-query-based parser with
multiple fields yields awful results for simple queries. To account for
that, we switch the query parser to use dismax with a 10% tiebreaking
increment.
Id
8ce2f7f9bddb2a4dce78b1ff17c6cb810a1503e8
Author
Caio
Commit time
2020-03-17T09:48:24+01:00

Modified cantine/src/index.rs

@@ -25,13 +25,19
#[derive(Clone)]
pub struct RecipeIndex {
pub id: Field,
- pub fulltext: Field,
+
+ pub name: Field,
+ pub ingredients: Field,
+ pub instructions: Field,
+
pub features_bincode: Field,
pub features: FeaturesFilterFields,
}

const FIELD_ID: &str = "id";
-const FIELD_FULLTEXT: &str = "fulltext";
+const FIELD_NAME: &str = "name";
+const FIELD_INGREDIENTS: &str = "ingredients";
+const FIELD_INSTRUCTIONS: &str = "instructions";
const FIELD_FEATURES_BINCODE: &str = "features_bincode";

impl RecipeIndex {
@@ -39,16 +45,17
let mut doc = Document::new();
doc.add_u64(self.id, recipe.recipe_id);

- let mut fulltext = Vec::new();
+ doc.add_text(self.name, recipe.name.as_str());

- fulltext.push(recipe.name.as_str());
- for ingredient in &recipe.ingredients {
- fulltext.push(ingredient.as_str());
- }
- for instruction in &recipe.instructions {
- fulltext.push(instruction.as_str());
- }
- doc.add_text(self.fulltext, fulltext.join("\n").as_str());
+ recipe
+ .ingredients
+ .iter()
+ .for_each(|i| doc.add_text(self.ingredients, i));
+
+ recipe
+ .instructions
+ .iter()
+ .for_each(|i| doc.add_text(self.instructions, i));

doc.add_bytes(
self.features_bincode,
@@ -184,7 +191,11
fn from(builder: &mut SchemaBuilder) -> Self {
RecipeIndex {
id: builder.add_u64_field(FIELD_ID, STORED | FAST),
- fulltext: builder.add_text_field(FIELD_FULLTEXT, TEXT),
+
+ name: builder.add_text_field(FIELD_NAME, TEXT),
+ ingredients: builder.add_text_field(FIELD_INGREDIENTS, TEXT),
+ instructions: builder.add_text_field(FIELD_INSTRUCTIONS, TEXT),
+
features_bincode: builder.add_bytes_field(FIELD_FEATURES_BINCODE),
features: Features::create_schema(builder, INDEXED | FAST),
}
@@ -195,22 +206,20
type Error = TantivyError;

fn try_from(schema: &Schema) -> Result<Self> {
- let id = schema
- .get_field(FIELD_ID)
- .ok_or_else(|| TantivyError::SchemaError(format!("Missing field {}", FIELD_ID)))?;
-
- let fulltext = schema.get_field(FIELD_FULLTEXT).ok_or_else(|| {
- TantivyError::SchemaError(format!("Missing field {}", FIELD_FULLTEXT))
- })?;
-
- let features_bincode = schema.get_field(FIELD_FEATURES_BINCODE).ok_or_else(|| {
- TantivyError::SchemaError(format!("Missing field {}", FIELD_FEATURES_BINCODE))
- })?;
+ let get_field = |name| {
+ schema
+ .get_field(name)
+ .ok_or_else(|| TantivyError::SchemaError(format!("Missing field {}", name)))
+ };

Ok(RecipeIndex {
- id,
- fulltext,
- features_bincode,
+ id: get_field(FIELD_ID)?,
+
+ name: get_field(FIELD_NAME)?,
+ ingredients: get_field(FIELD_INGREDIENTS)?,
+ instructions: get_field(FIELD_INSTRUCTIONS)?,
+
+ features_bincode: get_field(FIELD_FEATURES_BINCODE)?,
features: FeaturesFilterFields::try_from(schema)?,
})
}

Modified cantine/src/main.rs

@@ -154,7 +154,7
let mut subqueries: Vec<(Occur, Box<dyn Query>)> = Vec::new();

if let Some(fulltext) = &query.fulltext {
- if let Some(parsed) = self.query_parser.parse(fulltext.as_str()) {
+ if let Some(parsed) = self.query_parser.parse_dixmax(fulltext.as_str(), 0.1) {
subqueries.push((Occur::Must, parsed));
}
}
@@ -218,7 +218,20

let index = Index::open_in_dir(&index_path)?;
let recipe_index = RecipeIndex::try_from(&index.schema())?;
- let query_parser = QueryParser::new(&index, vec![recipe_index.fulltext])?;
+ let mut query_parser = QueryParser::new(
+ &index,
+ vec![
+ recipe_index.name,
+ recipe_index.ingredients,
+ recipe_index.instructions,
+ ],
+ )?;
+
+ // XXX This is as scientific as "4" is random
+ // Reduce importance of instructions match
+ query_parser.set_boost(recipe_index.instructions, Some(0.7));
+ // And make name matches slightly more important than ingredient
+ query_parser.set_boost(recipe_index.name, Some(1.15));

let reader = index.reader()?;
let search_state = Arc::new(SearchState {

Modified cantine/tests/index_integration.rs

@@ -295,7 +295,14
let reader = GLOBAL.index.reader()?;
let searcher = reader.searcher();

- let parser = QueryParser::new(&GLOBAL.index, vec![GLOBAL.cantine.fulltext])?;
+ let parser = QueryParser::new(
+ &GLOBAL.index,
+ vec![
+ GLOBAL.cantine.name,
+ GLOBAL.cantine.ingredients,
+ GLOBAL.cantine.instructions,
+ ],
+ )?;

let query = parser.parse("+potato +cheese").unwrap();

Modified cantine/src/bin/check_sim.rs

@@ -45,7 +45,14

let recipe_index = Arc::new(RecipeIndex::try_from(&index.schema())?);
let database = Arc::new(DatabaseReader::<Recipe>::open(&db_path)?);
- let topterms = Arc::new(TopTerms::new(&index, vec![recipe_index.fulltext])?);
+ let topterms = Arc::new(TopTerms::new(
+ &index,
+ vec![
+ recipe_index.name,
+ recipe_index.ingredients,
+ recipe_index.instructions,
+ ],
+ )?);

let (id_sender, id_receiver) = crossbeam_channel::unbounded();
let (checked_sender, checked_receiver) = mpsc::channel();