summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthieu Pignolet <matthieu@puffer.fish>2025-05-19 17:06:20 +0400
committerMatthieu Pignolet <matthieu@puffer.fish>2025-05-19 17:06:20 +0400
commitaa197695f6832d63dfb10f5cbd53e4adb2579805 (patch)
tree9718e0f94f2434f3d833c0b058b4331d1eedde64
parentca310bf1b62988fd0a5bf91531a8147a0f6a3343 (diff)
feat: add indexor code
-rw-r--r--bin/index/Cargo.toml10
-rw-r--r--bin/index/src/main.rs51
2 files changed, 61 insertions, 0 deletions
diff --git a/bin/index/Cargo.toml b/bin/index/Cargo.toml
new file mode 100644
index 0000000..098e60c
--- /dev/null
+++ b/bin/index/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "index"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+bincode = { version = "2.0.1", features = ["serde"] }
+csv = "1.3.1"
+db = { path = "../../libs/db" }
+serde = "1.0.219"
diff --git a/bin/index/src/main.rs b/bin/index/src/main.rs
new file mode 100644
index 0000000..31eb49f
--- /dev/null
+++ b/bin/index/src/main.rs
@@ -0,0 +1,51 @@
+use std::fs;
+
+use db::{
+ save::Save,
+ types::{GraphemeString, PhonemeString},
+};
+
+/// Builds the phoneme database from `./data/all.csv` and writes it to `data/db.bin`.
+///
+/// The first field of each CSV record is the word; the second field is a
+/// comma-separated list of pronunciations. Every trimmed pronunciation is
+/// inserted into `save.trie` and, together with its word, into `save.reverse_index`.
+///
+/// # Panics
+///
+/// Panics if the CSV cannot be opened or parsed, if a record has fewer than two
+/// fields, or if the database cannot be encoded or written.
+fn main() {
+    const INPUT_PATH: &str = "./data/all.csv";
+    const OUTPUT_PATH: &str = "data/db.bin";
+
+    let mut save = Save::default();
+
+    // The vocabulary has no header row, uses backslash escapes instead of doubled
+    // quotes, allows `#` comment lines, and may have rows of varying width.
+    let mut vocabulary = csv::ReaderBuilder::new()
+        .has_headers(false)
+        .double_quote(false)
+        .escape(Some(b'\\'))
+        .flexible(true)
+        .comment(Some(b'#'))
+        .from_path(INPUT_PATH)
+        .expect("failed to open the vocabulary CSV");
+
+    for record in vocabulary.records() {
+        let record = record.expect("malformed CSV record");
+        let word = GraphemeString(record.get(0).expect("record has no word field").to_string());
+        let pronunciations = record.get(1).expect("record has no pronunciation field");
+        // Trim the borrowed slice before allocating, so each phoneme is copied once.
+        for phoneme in pronunciations.split(',').map(|p| PhonemeString(p.trim().to_string())) {
+            save.trie.insert(&phoneme);
+            save.reverse_index.insert(phoneme, word.clone());
+        }
+    }
+
+    let encoded = bincode::encode_to_vec(bincode::serde::Compat(&save), bincode::config::standard())
+        .expect("failed to encode the database");
+    fs::write(OUTPUT_PATH, encoded).expect("failed to write the database");
+
+    println!("Generated to {OUTPUT_PATH}");
+}