From 3d88aa6b6244c73720e5067ffeb0e77d6edc8f8b Mon Sep 17 00:00:00 2001 From: Matthieu Pignolet Date: Tue, 21 Jan 2025 14:17:26 +0400 Subject: [PATCH] remove the fuzzy logic and replace it with proper logic based on syllables --- autofeur_db/Cargo.lock | 16 ++++++++-------- autofeur_db/Cargo.toml | 2 +- autofeur_db/src/inference.rs | 34 +++++++++++++++++----------------- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/autofeur_db/Cargo.lock b/autofeur_db/Cargo.lock index 1a40894..d2cfdd1 100644 --- a/autofeur_db/Cargo.lock +++ b/autofeur_db/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "adler" @@ -57,9 +57,9 @@ dependencies = [ "bincode", "csv", "hyper", + "hypher", "itertools", "kdam", - "levenshtein", "querystring", "rand", "reqwest", @@ -542,6 +542,12 @@ dependencies = [ "tokio-native-tls", ] +[[package]] +name = "hypher" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b24ad5637230df201ab1034d593f1d09bf7f2a9274f2e8897638078579f4265" + [[package]] name = "idna" version = "0.3.0" @@ -644,12 +650,6 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" -[[package]] -name = "levenshtein" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760" - [[package]] name = "libc" version = "0.2.139" diff --git a/autofeur_db/Cargo.toml b/autofeur_db/Cargo.toml index c305fb3..ac0fa13 100644 --- a/autofeur_db/Cargo.toml +++ b/autofeur_db/Cargo.toml @@ -23,7 +23,7 @@ kdam = { version = "0.3", features = ["gradient", "template"] } anyhow = "1.0.68" itertools = "0.10.5" querystring = "1.1.0" -levenshtein = "1.0.5" +hypher = { version = "0.1", features = ["english", "french"] } [[bin]] name = 
"generate" diff --git a/autofeur_db/src/inference.rs b/autofeur_db/src/inference.rs index b8f2f87..49192f3 100644 --- a/autofeur_db/src/inference.rs +++ b/autofeur_db/src/inference.rs @@ -1,9 +1,8 @@ -use std::{collections::VecDeque, env, ops::Add}; +use std::{env, ops::Add}; use anyhow::anyhow; +use hypher::hyphenate; use itertools::Itertools; -use levenshtein::levenshtein; -use unicode_segmentation::UnicodeSegmentation; use crate::save::Save; @@ -38,26 +37,27 @@ impl Save<'_> { println!("Matching {} by adding {}", word, completion); - // we finally just need to compute the end of the word which matches the sound - let mut found = None; + let mut completed_syllabes: Vec<&str> = hyphenate(word, hypher::Lang::French).into_iter().collect_vec(); + let source_word_syllabes: Vec<&str> = hyphenate(prefix, hypher::Lang::French).into_iter().collect_vec(); - let mut characters: VecDeque<&str> = word.graphemes(true).collect(); - while let Some(_) = characters.pop_front() { - let sub: String = characters.iter().join(""); - let inference = call_inference_service(&sub).await?; + // input: test + // output found: testames + // out syl: tes - tames + // output expect: tames + // we just need to remove the prefix - if levenshtein(&inference, &completion) < 5 { - found = Some(sub); - break; + + let mut i = 0; + for (index, syl) in source_word_syllabes.iter().enumerate() { + if *source_word_syllabes[index] == **syl { + i = index } else { - if found.is_none() { - found = Some(sub); - } - println!("did not match a={}, b={}", inference, completion) + break; } } - let found = found.ok_or_else(|| anyhow!("no prefix could be matched"))?; + completed_syllabes.drain(0..i); // we finally just need to compute the end of the word which matches the sound + let found = completed_syllabes.join(""); println!("{} is equivalent to {}", completion, found); Ok(format!("{} ({})", found, word)) -- 2.39.5