diff options
| author | Matthieu Pignolet <matthieu@puffer.fish> | 2025-05-19 17:06:20 +0400 |
|---|---|---|
| committer | Matthieu Pignolet <matthieu@puffer.fish> | 2025-05-19 17:06:20 +0400 |
| commit | aa197695f6832d63dfb10f5cbd53e4adb2579805 (patch) | |
| tree | 9718e0f94f2434f3d833c0b058b4331d1eedde64 | |
| parent | ca310bf1b62988fd0a5bf91531a8147a0f6a3343 (diff) | |
feat: add indexor code
| -rw-r--r-- | bin/index/Cargo.toml | 10 | ||||
| -rw-r--r-- | bin/index/src/main.rs | 51 |
2 files changed, 61 insertions, 0 deletions
diff --git a/bin/index/Cargo.toml b/bin/index/Cargo.toml new file mode 100644 index 0000000..098e60c --- /dev/null +++ b/bin/index/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "index" +version = "0.1.0" +edition = "2024" + +[dependencies] +bincode = { version = "2.0.1", features = ["serde"] } +csv = "1.3.1" +db = { path = "../../libs/db" } +serde = "1.0.219" diff --git a/bin/index/src/main.rs b/bin/index/src/main.rs new file mode 100644 index 0000000..31eb49f --- /dev/null +++ b/bin/index/src/main.rs @@ -0,0 +1,51 @@ +use std::fs; + +use db::{ + save::Save, + types::{GraphemeString, PhonemeString}, +}; + +fn main() { + let mut save = Save::default(); + + // Read the vocabulary from the CSV file at ./data/all.csv (no header row, backslash escapes, '#' comment lines, variable-length records) + let mut vocabulary = csv::ReaderBuilder::new() + .has_headers(false) + .double_quote(false) + .escape(Some(b'\\')) + .flexible(true) + .comment(Some(b'#')) + .from_path("./data/all.csv") + .unwrap(); + + let mut phonems = vec![]; + + // Fold every record into the save's reverse index: column 0 is the word, column 1 a comma-separated list of pronunciations, each inserted with that word + // the pronunciations are also collected so that every phoneme string represented in the csv can be inserted into the trie afterwards + for record in vocabulary.records() { + let record = record.unwrap(); + let word = GraphemeString(record.get(0).unwrap().to_string()); + let mut pron: Vec<PhonemeString> = record + .get(1) + .unwrap() + .split(',') + .map(|a| PhonemeString(a.to_string().trim().to_string())) + .collect(); + for a in &pron { + save.reverse_index.insert(a.clone(), word.clone()); + } + phonems.append(&mut pron); + } + + for phoneme in phonems.iter() { + save.trie.insert(&phoneme); + } + + fs::write( + "data/db.bin", + bincode::encode_to_vec(bincode::serde::Compat(&save), bincode::config::standard()).unwrap(), + ) + .unwrap(); + + println!("Generated to assets/db.bin"); +} |
