Use multiple fields when indexing and querying
This patch splits the `fulltext` index field into its sources (name, ingredients and instructions), making each of them available for individual strict querying. Unsurprisingly, using an OR-by-default, boolean-query-based parser across multiple fields yields awful results for simple queries. To account for that, we switch the query parser to use dismax with a 10% tiebreaking increment.
- Id
- 8ce2f7f9bddb2a4dce78b1ff17c6cb810a1503e8
- Author
- Caio
- Commit time
- 2020-03-17T09:48:24+01:00
Modified cantine/src/index.rs
#[derive(Clone)]
pub struct RecipeIndex {
pub id: Field,
- pub fulltext: Field,
+
+ pub name: Field,
+ pub ingredients: Field,
+ pub instructions: Field,
+
pub features_bincode: Field,
pub features: FeaturesFilterFields,
}
const FIELD_ID: &str = "id";
-const FIELD_FULLTEXT: &str = "fulltext";
+const FIELD_NAME: &str = "name";
+const FIELD_INGREDIENTS: &str = "ingredients";
+const FIELD_INSTRUCTIONS: &str = "instructions";
const FIELD_FEATURES_BINCODE: &str = "features_bincode";
impl RecipeIndex {
let mut doc = Document::new();
doc.add_u64(self.id, recipe.recipe_id);
- let mut fulltext = Vec::new();
+ doc.add_text(self.name, recipe.name.as_str());
- fulltext.push(recipe.name.as_str());
- for ingredient in &recipe.ingredients {
- fulltext.push(ingredient.as_str());
- }
- for instruction in &recipe.instructions {
- fulltext.push(instruction.as_str());
- }
- doc.add_text(self.fulltext, fulltext.join("\n").as_str());
+ recipe
+ .ingredients
+ .iter()
+ .for_each(|i| doc.add_text(self.ingredients, i));
+
+ recipe
+ .instructions
+ .iter()
+ .for_each(|i| doc.add_text(self.instructions, i));
doc.add_bytes(
self.features_bincode,
fn from(builder: &mut SchemaBuilder) -> Self {
RecipeIndex {
id: builder.add_u64_field(FIELD_ID, STORED | FAST),
- fulltext: builder.add_text_field(FIELD_FULLTEXT, TEXT),
+
+ name: builder.add_text_field(FIELD_NAME, TEXT),
+ ingredients: builder.add_text_field(FIELD_INGREDIENTS, TEXT),
+ instructions: builder.add_text_field(FIELD_INSTRUCTIONS, TEXT),
+
features_bincode: builder.add_bytes_field(FIELD_FEATURES_BINCODE),
features: Features::create_schema(builder, INDEXED | FAST),
}
type Error = TantivyError;
fn try_from(schema: &Schema) -> Result<Self> {
- let id = schema
- .get_field(FIELD_ID)
- .ok_or_else(|| TantivyError::SchemaError(format!("Missing field {}", FIELD_ID)))?;
-
- let fulltext = schema.get_field(FIELD_FULLTEXT).ok_or_else(|| {
- TantivyError::SchemaError(format!("Missing field {}", FIELD_FULLTEXT))
- })?;
-
- let features_bincode = schema.get_field(FIELD_FEATURES_BINCODE).ok_or_else(|| {
- TantivyError::SchemaError(format!("Missing field {}", FIELD_FEATURES_BINCODE))
- })?;
+ let get_field = |name| {
+ schema
+ .get_field(name)
+ .ok_or_else(|| TantivyError::SchemaError(format!("Missing field {}", name)))
+ };
Ok(RecipeIndex {
- id,
- fulltext,
- features_bincode,
+ id: get_field(FIELD_ID)?,
+
+ name: get_field(FIELD_NAME)?,
+ ingredients: get_field(FIELD_INGREDIENTS)?,
+ instructions: get_field(FIELD_INSTRUCTIONS)?,
+
+ features_bincode: get_field(FIELD_FEATURES_BINCODE)?,
features: FeaturesFilterFields::try_from(schema)?,
})
}
Modified cantine/src/main.rs
let mut subqueries: Vec<(Occur, Box<dyn Query>)> = Vec::new();
if let Some(fulltext) = &query.fulltext {
- if let Some(parsed) = self.query_parser.parse(fulltext.as_str()) {
+ if let Some(parsed) = self.query_parser.parse_dixmax(fulltext.as_str(), 0.1) {
subqueries.push((Occur::Must, parsed));
}
}
let index = Index::open_in_dir(&index_path)?;
let recipe_index = RecipeIndex::try_from(&index.schema())?;
- let query_parser = QueryParser::new(&index, vec![recipe_index.fulltext])?;
+ let mut query_parser = QueryParser::new(
+ &index,
+ vec![
+ recipe_index.name,
+ recipe_index.ingredients,
+ recipe_index.instructions,
+ ],
+ )?;
+
+ // XXX This is as scientific as "4" is random
+ // Reduce importance of instructions match
+ query_parser.set_boost(recipe_index.instructions, Some(0.7));
+ // And make name matches slightly more important than ingredient
+ query_parser.set_boost(recipe_index.name, Some(1.15));
let reader = index.reader()?;
let search_state = Arc::new(SearchState {
Modified cantine/tests/index_integration.rs
let reader = GLOBAL.index.reader()?;
let searcher = reader.searcher();
- let parser = QueryParser::new(&GLOBAL.index, vec![GLOBAL.cantine.fulltext])?;
+ let parser = QueryParser::new(
+ &GLOBAL.index,
+ vec![
+ GLOBAL.cantine.name,
+ GLOBAL.cantine.ingredients,
+ GLOBAL.cantine.instructions,
+ ],
+ )?;
let query = parser.parse("+potato +cheese").unwrap();
Modified cantine/src/bin/check_sim.rs
let recipe_index = Arc::new(RecipeIndex::try_from(&index.schema())?);
let database = Arc::new(DatabaseReader::<Recipe>::open(&db_path)?);
- let topterms = Arc::new(TopTerms::new(&index, vec![recipe_index.fulltext])?);
+ let topterms = Arc::new(TopTerms::new(
+ &index,
+ vec![
+ recipe_index.name,
+ recipe_index.ingredients,
+ recipe_index.instructions,
+ ],
+ )?);
let (id_sender, id_receiver) = crossbeam_channel::unbounded();
let (checked_sender, checked_receiver) = mpsc::channel();