git.puffer.fish Git - matthieu/gru.git/commitdiff
proper utf8 support for input words & better "cutting" using syllable cuts
authorMatthieu Pignolet <m@mpgn.dev>
Wed, 29 Jan 2025 09:24:30 +0000 (13:24 +0400)
committerMatthieu Pignolet <m@mpgn.dev>
Wed, 29 Jan 2025 09:24:30 +0000 (13:24 +0400)
autofeur_db/Cargo.lock
autofeur_db/Cargo.toml
autofeur_db/src/bin/server.rs
autofeur_db/src/inference.rs
discordjs/src/index.mjs

index d2cfdd11f03f10c0a4df19ef844ec85de09548e3..ece64ce12174a31884a59ebeff4ce10bf5294ef9 100644 (file)
@@ -60,16 +60,19 @@ dependencies = [
  "hypher",
  "itertools",
  "kdam",
+ "levenshtein",
  "querystring",
  "rand",
  "reqwest",
  "serde",
  "serde_json",
+ "strsim",
  "tokio",
  "tower",
  "tower-http",
  "trie-rs",
  "unicode-segmentation",
+ "url",
 ]
 
 [[package]]
@@ -650,6 +653,12 @@ version = "1.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
 
+[[package]]
+name = "levenshtein"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760"
+
 [[package]]
 name = "libc"
 version = "0.2.139"
@@ -1239,6 +1248,12 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "strsim"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
+
 [[package]]
 name = "syn"
 version = "1.0.107"
index ac0fa13f00ac284a42d125a8e4e138b7320c6863..d70d03be75045c3ece7dd3e6d13a58d08e20c29c 100644 (file)
@@ -24,6 +24,9 @@ anyhow = "1.0.68"
 itertools = "0.10.5"
 querystring = "1.1.0"
 hypher = { version = "0.1", features = ["english", "french"] }
+levenshtein = "1.0.5"
+strsim = "0.11.1"
+url = "*"
 
 [[bin]]
 name = "generate"
index 8c350f9205d897d6b138fa0bddd5816f6e0da7a1..b5edb808e3ac4d94b72f144a9a7b026985cb84b2 100644 (file)
@@ -6,16 +6,7 @@ use std::collections::HashMap;
 use std::{fs, net::SocketAddr, sync::Arc};
 use tower::{make::Shared, ServiceBuilder};
 use tower_http::add_extension::AddExtensionLayer;
-
-fn parse_query(query: &str) -> HashMap<String, String> {
-    query
-        .split('&')
-        .filter_map(|s| {
-            s.split_once('=')
-                .and_then(|t| Some((t.0.to_owned(), t.1.to_owned())))
-        })
-        .collect()
-}
+use url::form_urlencoded::parse;
 
 fn anyhow_response(err: anyhow::Error) -> Response<Body> {
     Response::builder()
@@ -34,7 +25,18 @@ async fn handler(request: Request<Body>) -> Result<Response<Body>, hyper::Error>
         Ok(ok) => ok,
         Err(err) => return Ok(err),
     };
-    let data = match parse_query(query)
+
+    let params: HashMap<String, String> = request
+        .uri()
+        .query()
+        .map(|v| {
+            url::form_urlencoded::parse(v.as_bytes())
+                .into_owned()
+                .collect()
+        })
+        .unwrap_or_else(HashMap::new);
+
+    let data = match params
         .get("grapheme")
         .ok_or_else(|| anyhow_response(anyhow!("grapheme argument is not specified")))
     {
index 49192f3d4b4c806a733122e85f076ada951ada83..f681cd06a0287dc6617b44f2df8cdf7325921fad 100644 (file)
@@ -20,6 +20,10 @@ async fn call_inference_service(word: &str) -> anyhow::Result<String> {
 impl Save<'_> {
     pub async fn inference(&self, prefix: &str) -> anyhow::Result<String> {
         let phonemes = call_inference_service(prefix).await?;
+        let source_word_syllabes: Vec<&str> = hyphenate(prefix, hypher::Lang::French)
+            .into_iter()
+            .collect_vec();
+        println!("syl: [{}]", source_word_syllabes.join(","));
 
         let completion = self
             .trie
@@ -37,8 +41,9 @@ impl Save<'_> {
 
         println!("Matching {} by adding {}", word, completion);
 
-        let mut completed_syllabes: Vec<&str> = hyphenate(word, hypher::Lang::French).into_iter().collect_vec();
-        let source_word_syllabes: Vec<&str> = hyphenate(prefix, hypher::Lang::French).into_iter().collect_vec();
+        let mut completed_syllabes: Vec<&str> = hyphenate(word, hypher::Lang::French)
+            .into_iter()
+            .collect_vec();
 
         // input:           test
         // output found:    testames
@@ -46,18 +51,33 @@ impl Save<'_> {
         // output expect:   tames
         // we just need to remove the prefix
 
-
+        println!(
+            "[{}] cmp [{}]",
+            source_word_syllabes.join(","),
+            completed_syllabes.join(",")
+        );
         let mut i = 0;
-        for (index, syl) in source_word_syllabes.iter().enumerate() {
-            if *source_word_syllabes[index] == **syl {
+        let maxindex = source_word_syllabes.len() - 1;
+        for (index, syl) in completed_syllabes.iter().enumerate() {
+            if maxindex < index {
+                break;
+            }
+
+            let phon1 = &source_word_syllabes[index].to_lowercase();
+            let phon2 = &(**syl).to_lowercase();
+
+            println!("comparing syllab {} vs {}", phon1, phon2);
+
+            if strsim::levenshtein(phon1, phon2) < 2 {
                 i = index
             } else {
+                println!("found scyl break at {}", i);
                 break;
             }
         }
 
-        completed_syllabes.drain(0..i);        // we finally just need to compute the end of the word which matches the sound
-        let found = completed_syllabes.join("");
+        // we finally just need to compute the end of the word which matches the sound
+        let found = completed_syllabes.drain(i+1..).join("");
         println!("{} is equivalent to {}", completion, found);
 
         Ok(format!("{} ({})", found, word))
index 43ae9f2bec01570ac4de821b1c26bdbde184fe5b..f0a4bae73d814f05668c0d2b12d55e7f1234a60b 100644 (file)
@@ -43,8 +43,6 @@ const sanitizeWord = (sentence) => {
     .trim()
     .split(" ")
     .slice(-1)[0]
-    .normalize('NFKD')
-    .replace(/[\u0300-\u036f]/g, "")
     .replaceAll(/(?:https?|ftp):\/\/[\n\S]+/g, "")
     .replaceAll(/\:([a-z]|[A-Z])+\:/g, "")
     .replaceAll(/(\?|\!|\.|\,|\;)/g, "")