
Read options solely from `std::env`

StructOpt + clap are great and all, but I already drive everything with
environment variables in production and am more comfortable with the
language now, so spending extra compilation time for the sake of a cute
CLI interface isn't worth it anymore.

My 2013 CPU is thankful. Now my biggest gripe with the build is that
`tantivy` pulls in `failure`, which implies a build dependency on GCC
for the sake of stack traces... so cross-building (`xbuild`) for ARM is
a bit annoying.
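
For reference, this is the pattern the diffs below apply, condensed into a self-contained sketch (the variable names match the commit; the bare `main` and the panic messages are just for illustration):

```rust
use std::{env, str::FromStr};

fn main() {
    // Required setting: fail fast with a readable message when unset.
    let base_dir = env::var("BASE_DIR").expect("BASE_DIR must be set");

    // Optional setting: parse when present, fall back to a default otherwise.
    let threshold = env::var("AGG_THRESHOLD")
        .ok()
        .map(|v| usize::from_str(&v).expect("AGG_THRESHOLD must be a valid usize"))
        .unwrap_or(std::usize::MAX);

    println!("base_dir={} agg_threshold={}", base_dir, threshold);
}
```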
Id: 486b5659d1b121a1d3c7bd94468cc4b8f8f4b169
Author: Caio
Commit time: 2020-02-02T12:04:04+01:00

Modified cantine/Cargo.toml

@@ -23,7 +23,6 @@
memmap = "0.7"
serde_json = "1.0"
serde = { version = "1.0", features = ["derive"] }
-structopt = { version = "0.3", default-features = false }
tantivy = "0.11"
uuid = { version = "0.8", features = ["serde"] }
zerocopy = "0.2"

Modified cantine/src/main.rs

@@ -1,8 +1,7 @@
-use std::{convert::TryFrom, path::PathBuf, sync::Arc};
+use std::{convert::TryFrom, env, io, path::Path, str::FromStr, sync::Arc};

use env_logger;
use serde::Serialize;
-use structopt::StructOpt;
use tique::queryparser::QueryParser;
use uuid::Uuid;

@@ -23,15 +22,6 @@
        RecipeInfo, SearchCursor, SearchQuery, SearchResult, Sort,
    },
};
-
-#[derive(Debug, StructOpt)]
-pub struct ApiOptions {
-    /// Path to the data directory
-    base_path: PathBuf,
-    /// Only aggregate when found less recipes than given threshold
-    #[structopt(short, long)]
-    agg_threshold: Option<usize>,
-}

type RecipeDatabase = Arc<DatabaseReader<Recipe>>;

@@ -200,13 +190,31 @@
    }
}

+const BASE_DIR: &str = "BASE_DIR";
+const AGG_THRESHOLD: &str = "AGG_THRESHOLD";
+
+fn get_env(key: &str) -> Result<String> {
+    env::var(key).map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, key).into())
+}
+
#[actix_rt::main]
async fn main() -> Result<()> {
    env_logger::init();
-    let options = ApiOptions::from_args();

-    let index_path = options.base_path.join("tantivy");
-    let db_path = options.base_path.join("database");
+    let base_dir = get_env(BASE_DIR)?;
+    let threshold = get_env(AGG_THRESHOLD)
+        .ok()
+        .map(|v| usize::from_str(&v).expect("valid usize"));
+
+    log::info!(
+        "Starting with base_dir={} agg_threshold={:?}",
+        base_dir,
+        threshold
+    );
+
+    let base_path = Path::new(&base_dir);
+    let index_path = base_path.join("tantivy");
+    let db_path = base_path.join("database");

    let index = Index::open_in_dir(&index_path)?;
    let recipe_index = RecipeIndex::try_from(&index.schema())?;
@@ -216,13 +224,12 @@
        true,
    );

-    let agg_threshold = options.agg_threshold.unwrap_or(std::usize::MAX);
    let reader = index.reader()?;
    let search_state = Arc::new(SearchState {
        reader,
        recipe_index,
        query_parser,
-        agg_threshold,
+        agg_threshold: threshold.unwrap_or(std::usize::MAX),
    });

    let database: RecipeDatabase = Arc::new(DatabaseReader::open(&db_path)?);
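
With this, starting the server is driven entirely by the environment, something like `BASE_DIR=/srv/cantine AGG_THRESHOLD=100 ./cantine` (path and threshold are made-up values): `BASE_DIR` is required, and `AGG_THRESHOLD` is optional, defaulting to `std::usize::MAX` so that aggregation always runs.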

Modified cantine/src/bin/load.rs

@@ -1,8 +1,8 @@
use std::{
+    env,
    io::{self, BufRead},
-    num::NonZeroUsize,
    path::Path,
-    result::Result as StdResult,
+    str::FromStr,
    sync::{mpsc::channel, Arc, RwLock},
    thread::spawn,
    time::Instant,
@@ -10,7 +10,6 @@

use crossbeam_channel::unbounded;
use serde_json;
-use structopt::StructOpt;

use tantivy::{self, directory::MmapDirectory, schema::SchemaBuilder, Index, Result};

@@ -19,33 +18,20 @@
use cantine::model::Recipe;

/// Loads recipes as json into cantine's database and index
-#[derive(Debug, StructOpt)]
-#[structopt(name = "load")]
+#[derive(Debug)]
pub struct LoadOptions {
    /// Size for tantivy's writer buffer in MBs
-    #[structopt(short, long, default_value = "1000")]
-    buffer_size: NonZeroUsize,
+    buffer_size: usize,
    /// How many recipes to ingest before committing
-    #[structopt(short, long, default_value = "300000")]
-    commit_every: NonZeroUsize,
+    commit_every: usize,
    /// Number of worker threads to start
-    #[structopt(short, long, default_value = "4")]
-    num_producers: NonZeroUsize,
+    num_producers: usize,
    /// Path to a non-existing directory
-    #[structopt(validator = does_not_exist)]
    output_dir: String,
}

-fn does_not_exist(dir_path: String) -> StdResult<(), String> {
-    if Path::new(dir_path.as_str()).exists() {
-        Err("Path already exists".to_owned())
-    } else {
-        Ok(())
-    }
-}
-
fn load(options: LoadOptions) -> Result<()> {
-    println!("Started with {:?}", &options);
+    log::info!("Started with {:?}", &options);

    let base_path = Path::new(options.output_dir.as_str());
    let db_path = base_path.join("database");
@@ -65,10 +51,10 @@
    // A MpSc channel to control index commit and write to db
    let (recipe_sender, recipe_receiver) = channel();

-    let buffer_size = options.buffer_size.get();
+    let buffer_size = options.buffer_size;
    let writer = Arc::new(RwLock::new(index.writer(buffer_size * 1_000_000)?));

-    let num_producers = options.num_producers.get();
+    let num_producers = options.num_producers;
    let mut workers = Vec::with_capacity(num_producers);
    for _ in 0..num_producers {
        let receiver = line_receiver.clone();
@@ -101,10 +87,10 @@
            num_recipes += 1;
            db.append(&recipe)?;

-            if num_recipes % options.commit_every.get() == 0 {
+            if num_recipes % options.commit_every == 0 {
                writer.write()?.commit()?;

-                println!(
+                log::info!(
                    "DiskWriter: {} Documents so far (@ {} secs).",
                    num_recipes,
                    cur.elapsed().as_secs()
@@ -114,7 +100,7 @@

        writer.write()?.commit()?;

-        println!(
+        log::info!(
            "DiskWriter: Wrote {} documents in {} seconds",
            num_recipes,
            cur.elapsed().as_secs()
@@ -137,11 +123,39 @@

    disk_writer.join().unwrap()?;

-    println!("Done!");
+    log::info!("Done!");

    Ok(())
}

+const BUFFER_SIZE: &str = "BUFFER_SIZE";
+const COMMIT_EVERY: &str = "COMMIT_EVERY";
+const NUM_PRODUCERS: &str = "NUM_PRODUCERS";
+
+fn get_usize_from_env_or(key: &str, default: usize) -> usize {
+    env::var(key)
+        .ok()
+        .map(|v| usize::from_str(&v).expect("valid usize"))
+        .unwrap_or(default)
+}
+
fn main() -> Result<()> {
-    load(LoadOptions::from_args())
+    let output_dir = env::args()
+        .nth(1)
+        .expect("First parameter must be the output directory");
+
+    let buffer_size = get_usize_from_env_or(BUFFER_SIZE, 1000);
+
+    let commit_every = get_usize_from_env_or(COMMIT_EVERY, 300_000);
+
+    let num_producers = get_usize_from_env_or(NUM_PRODUCERS, 4);
+
+    let options = LoadOptions {
+        output_dir,
+        buffer_size,
+        commit_every,
+        num_producers,
+    };
+
+    load(options)
}
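
The loader keeps the output directory as its first positional argument and takes the tunables from the environment, so an invocation looks something like `BUFFER_SIZE=500 NUM_PRODUCERS=8 ./load /tmp/cantine-data < recipes.jsonl` (values and paths are made-up); unset variables fall back to the defaults above: a 1000 MB buffer, a commit every 300_000 recipes and 4 producers.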