diff options
| author | Matthieu Pignolet <matthieu@matthieu-dev.xyz> | 2025-01-21 14:17:26 +0400 |
|---|---|---|
| committer | Matthieu Pignolet <matthieu@matthieu-dev.xyz> | 2025-01-21 14:17:26 +0400 |
| commit | 3d88aa6b6244c73720e5067ffeb0e77d6edc8f8b (patch) | |
| tree | 68ca1e0bed6de6e46dfe11c03608dbd3f2c48e08 | |
| parent | c4c27a43bd32a3dcd9fe6c984b2b6b6521c5def9 (diff) | |
remove the fuzzy logic and replace it with proper logic based on syllables
| -rw-r--r-- | autofeur_db/Cargo.lock | 16 | ||||
| -rw-r--r-- | autofeur_db/Cargo.toml | 2 | ||||
| -rw-r--r-- | autofeur_db/src/inference.rs | 34 |
3 files changed, 26 insertions, 26 deletions
diff --git a/autofeur_db/Cargo.lock b/autofeur_db/Cargo.lock index 1a40894..d2cfdd1 100644 --- a/autofeur_db/Cargo.lock +++ b/autofeur_db/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "adler" @@ -57,9 +57,9 @@ dependencies = [ "bincode", "csv", "hyper", + "hypher", "itertools", "kdam", - "levenshtein", "querystring", "rand", "reqwest", @@ -543,6 +543,12 @@ dependencies = [ ] [[package]] +name = "hypher" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b24ad5637230df201ab1034d593f1d09bf7f2a9274f2e8897638078579f4265" + +[[package]] name = "idna" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -645,12 +651,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] -name = "levenshtein" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760" - -[[package]] name = "libc" version = "0.2.139" source = "registry+https://github.com/rust-lang/crates.io-index" diff --git a/autofeur_db/Cargo.toml b/autofeur_db/Cargo.toml index c305fb3..ac0fa13 100644 --- a/autofeur_db/Cargo.toml +++ b/autofeur_db/Cargo.toml @@ -23,7 +23,7 @@ kdam = { version = "0.3", features = ["gradient", "template"] } anyhow = "1.0.68" itertools = "0.10.5" querystring = "1.1.0" -levenshtein = "1.0.5" +hypher = { version = "0.1", features = ["english", "french"] } [[bin]] name = "generate" diff --git a/autofeur_db/src/inference.rs b/autofeur_db/src/inference.rs index b8f2f87..49192f3 100644 --- a/autofeur_db/src/inference.rs +++ b/autofeur_db/src/inference.rs @@ -1,9 +1,8 @@ -use std::{collections::VecDeque, env, ops::Add}; +use std::{env, ops::Add}; use anyhow::anyhow; +use hypher::hyphenate; use 
itertools::Itertools; -use levenshtein::levenshtein; -use unicode_segmentation::UnicodeSegmentation; use crate::save::Save; @@ -38,26 +37,27 @@ impl Save<'_> { println!("Matching {} by adding {}", word, completion); - // we finally just need to compute the end of the word which matches the sound - let mut found = None; + let mut completed_syllabes: Vec<&str> = hyphenate(word, hypher::Lang::French).into_iter().collect_vec(); + let source_word_syllabes: Vec<&str> = hyphenate(prefix, hypher::Lang::French).into_iter().collect_vec(); - let mut characters: VecDeque<&str> = word.graphemes(true).collect(); - while let Some(_) = characters.pop_front() { - let sub: String = characters.iter().join(""); - let inference = call_inference_service(&sub).await?; + // input: test + // output found: testames + // out syl: tes - tames + // output expect: tames + // we just need to remove the prefix - if levenshtein(&inference, &completion) < 5 { - found = Some(sub); - break; + + let mut i = 0; + for (index, syl) in source_word_syllabes.iter().enumerate() { + if *source_word_syllabes[index] == **syl { + i = index } else { - if found.is_none() { - found = Some(sub); - } - println!("did not match a={}, b={}", inference, completion) + break; } } - let found = found.ok_or_else(|| anyhow!("no prefix could be matched"))?; + completed_syllabes.drain(0..i); // we finally just need to compute the end of the word which matches the sound + let found = completed_syllabes.join(""); println!("{} is equivalent to {}", completion, found); Ok(format!("{} ({})", found, word)) |
