]> git.puffer.fish Git - matthieu/gru.git/commitdiff
Remove the fuzzy logic and replace it with proper logic based on syllables
author Matthieu Pignolet <matthieu@matthieu-dev.xyz>
Tue, 21 Jan 2025 10:17:26 +0000 (14:17 +0400)
committer Matthieu Pignolet <matthieu@matthieu-dev.xyz>
Tue, 21 Jan 2025 10:17:26 +0000 (14:17 +0400)
autofeur_db/Cargo.lock
autofeur_db/Cargo.toml
autofeur_db/src/inference.rs

index 1a4089411bbcb59ad095790b444850f7c919f011..d2cfdd11f03f10c0a4df19ef844ec85de09548e3 100644 (file)
@@ -1,6 +1,6 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
-version = 3
+version = 4
 
 [[package]]
 name = "adler"
@@ -57,9 +57,9 @@ dependencies = [
  "bincode",
  "csv",
  "hyper",
+ "hypher",
  "itertools",
  "kdam",
- "levenshtein",
  "querystring",
  "rand",
  "reqwest",
@@ -542,6 +542,12 @@ dependencies = [
  "tokio-native-tls",
 ]
 
+[[package]]
+name = "hypher"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b24ad5637230df201ab1034d593f1d09bf7f2a9274f2e8897638078579f4265"
+
 [[package]]
 name = "idna"
 version = "0.3.0"
@@ -644,12 +650,6 @@ version = "1.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
 
-[[package]]
-name = "levenshtein"
-version = "1.0.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760"
-
 [[package]]
 name = "libc"
 version = "0.2.139"
index c305fb3ea32206488a156588474f134ed10dd015..ac0fa13f00ac284a42d125a8e4e138b7320c6863 100644 (file)
@@ -23,7 +23,7 @@ kdam = { version = "0.3", features = ["gradient", "template"] }
 anyhow = "1.0.68"
 itertools = "0.10.5"
 querystring = "1.1.0"
-levenshtein = "1.0.5"
+hypher = { version = "0.1", features = ["english", "french"] }
 
 [[bin]]
 name = "generate"
index b8f2f87ff6bf4f12012789ccbecd2d506f08abad..49192f3d4b4c806a733122e85f076ada951ada83 100644 (file)
@@ -1,9 +1,8 @@
-use std::{collections::VecDeque, env, ops::Add};
+use std::{env, ops::Add};
 
 use anyhow::anyhow;
+use hypher::hyphenate;
 use itertools::Itertools;
-use levenshtein::levenshtein;
-use unicode_segmentation::UnicodeSegmentation;
 
 use crate::save::Save;
 
@@ -38,26 +37,27 @@ impl Save<'_> {
 
         println!("Matching {} by adding {}", word, completion);
 
-        // we finally just need to compute the end of the word which matches the sound
-        let mut found = None;
+        let mut completed_syllabes: Vec<&str> = hyphenate(word, hypher::Lang::French).into_iter().collect_vec();
+        let source_word_syllabes: Vec<&str> = hyphenate(prefix, hypher::Lang::French).into_iter().collect_vec();
 
-        let mut characters: VecDeque<&str> = word.graphemes(true).collect();
-        while let Some(_) = characters.pop_front() {
-            let sub: String = characters.iter().join("");
-            let inference = call_inference_service(&sub).await?;
+        // input:           test
+        // output found:    testames
+        // out syl:         tes - tames
+        // output expect:   tames
+        // we just need to remove the prefix
 
-            if levenshtein(&inference, &completion) < 5 {
-                found = Some(sub);
-                break;
+
+        let mut i = 0;
+        for (index, syl) in source_word_syllabes.iter().enumerate() {
+            if *source_word_syllabes[index] == **syl {
+                i = index
             } else {
-                if found.is_none() {
-                    found = Some(sub);
-                }
-                println!("did not match a={}, b={}", inference, completion)
+                break;
             }
         }
 
-        let found = found.ok_or_else(|| anyhow!("no prefix could be matched"))?;
+        completed_syllabes.drain(0..i);        // we finally just need to compute the end of the word which matches the sound
+        let found = completed_syllabes.join("");
         println!("{} is equivalent to {}", completion, found);
 
         Ok(format!("{} ({})", found, word))