From 2827ce3e822245e0d88d385b6b8750f7f95dba2d Mon Sep 17 00:00:00 2001 From: Matthieu Pignolet Date: Wed, 29 Jan 2025 13:24:30 +0400 Subject: [PATCH] proper utf8 support for input words & better "cutting" using scyllabus cuts --- autofeur_db/Cargo.lock | 15 +++++++++++++++ autofeur_db/Cargo.toml | 3 +++ autofeur_db/src/bin/server.rs | 24 +++++++++++++----------- autofeur_db/src/inference.rs | 34 +++++++++++++++++++++++++++------- discordjs/src/index.mjs | 2 -- 5 files changed, 58 insertions(+), 20 deletions(-) diff --git a/autofeur_db/Cargo.lock b/autofeur_db/Cargo.lock index d2cfdd1..ece64ce 100644 --- a/autofeur_db/Cargo.lock +++ b/autofeur_db/Cargo.lock @@ -60,16 +60,19 @@ dependencies = [ "hypher", "itertools", "kdam", + "levenshtein", "querystring", "rand", "reqwest", "serde", "serde_json", + "strsim", "tokio", "tower", "tower-http", "trie-rs", "unicode-segmentation", + "url", ] [[package]] @@ -650,6 +653,12 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +[[package]] +name = "levenshtein" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760" + [[package]] name = "libc" version = "0.2.139" @@ -1239,6 +1248,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "syn" version = "1.0.107" diff --git a/autofeur_db/Cargo.toml b/autofeur_db/Cargo.toml index ac0fa13..d70d03b 100644 --- a/autofeur_db/Cargo.toml +++ b/autofeur_db/Cargo.toml @@ -24,6 +24,9 @@ anyhow = "1.0.68" itertools = "0.10.5" querystring = "1.1.0" hypher = { version = "0.1", features = ["english", "french"] } +levenshtein = "1.0.5" +strsim = "0.11.1" +url = "*" [[bin]] name = "generate" diff --git a/autofeur_db/src/bin/server.rs b/autofeur_db/src/bin/server.rs index 8c350f9..b5edb80 100644 --- a/autofeur_db/src/bin/server.rs +++ b/autofeur_db/src/bin/server.rs @@ -6,16 +6,7 @@ use std::collections::HashMap; use std::{fs, net::SocketAddr, sync::Arc}; use tower::{make::Shared, ServiceBuilder}; use tower_http::add_extension::AddExtensionLayer; - -fn parse_query(query: &str) -> HashMap { - query - .split('&') - .filter_map(|s| { - s.split_once('=') - .and_then(|t| Some((t.0.to_owned(), t.1.to_owned()))) - }) - .collect() -} +use url::form_urlencoded::parse; fn anyhow_response(err: anyhow::Error) -> Response { Response::builder() @@ -34,7 +25,18 @@ async fn handler(request: Request) -> Result, hyper::Error> Ok(ok) => ok, Err(err) => return Ok(err), }; - let data = match parse_query(query) + + let params: HashMap = request + .uri() + .query() + .map(|v| { + url::form_urlencoded::parse(v.as_bytes()) + .into_owned() + .collect() + }) + .unwrap_or_else(HashMap::new); + + let data = match params .get("grapheme") .ok_or_else(|| anyhow_response(anyhow!("grapheme argument is not specified"))) { diff --git a/autofeur_db/src/inference.rs b/autofeur_db/src/inference.rs index 49192f3..f681cd0 100644 --- a/autofeur_db/src/inference.rs +++ b/autofeur_db/src/inference.rs @@ -20,6 +20,10 @@ async fn call_inference_service(word: &str) -> anyhow::Result { impl Save<'_> { pub async fn inference(&self, prefix: &str) -> anyhow::Result { let phonemes = call_inference_service(prefix).await?; + let source_word_syllabes: Vec<&str> = hyphenate(prefix, hypher::Lang::French) + .into_iter() + .collect_vec(); + println!("syl: [{}]", source_word_syllabes.join(",")); let completion = self .trie @@ -37,8 +41,9 @@ impl Save<'_> { println!("Matching {} by adding {}", word, completion); - let mut completed_syllabes: Vec<&str> = hyphenate(word, hypher::Lang::French).into_iter().collect_vec(); - let source_word_syllabes: Vec<&str> = hyphenate(prefix, hypher::Lang::French).into_iter().collect_vec(); + let mut completed_syllabes: Vec<&str> = hyphenate(word, hypher::Lang::French) + .into_iter() + .collect_vec(); // input: test // output found: testames @@ -46,18 +51,33 @@ impl Save<'_> { // output expect: tames // we just need to remove the prefix - + println!( + "[{}] cmp [{}]", + source_word_syllabes.join(","), + completed_syllabes.join(",") + ); let mut i = 0; - for (index, syl) in source_word_syllabes.iter().enumerate() { - if *source_word_syllabes[index] == **syl { + let maxindex = source_word_syllabes.len() - 1; + for (index, syl) in completed_syllabes.iter().enumerate() { + if maxindex < index { + break; + } + + let phon1 = &source_word_syllabes[index].to_lowercase(); + let phon2 = &(**syl).to_lowercase(); + + println!("comparing syllab {} vs {}", phon1, phon2); + + if strsim::levenshtein(phon1, phon2) < 2 { i = index } else { + println!("found scyl break at {}", i); break; } } - completed_syllabes.drain(0..i); // we finally just need to compute the end of the word which matches the sound - let found = completed_syllabes.join(""); + // we finally just need to compute the end of the word which matches the sound + let found = completed_syllabes.drain(i+1..).join(""); println!("{} is equivalent to {}", completion, found); Ok(format!("{} ({})", found, word)) diff --git a/discordjs/src/index.mjs b/discordjs/src/index.mjs index 43ae9f2..f0a4bae 100644 --- a/discordjs/src/index.mjs +++ b/discordjs/src/index.mjs @@ -43,8 +43,6 @@ const sanitizeWord = (sentence) => { .trim() .split(" ") .slice(-1)[0] - .normalize('NFKD') - .replace(/[\u0300-\u036f]/g, "") .replaceAll(/(?:https?|ftp):\/\/[\n\S]+/g, "") .replaceAll(/\:([a-z]|[A-Z])+\:/g, "") .replaceAll(/(\?|\!|\.|\,|\;)/g, "") -- 2.39.5