diff options
| author | Matthieu Pignolet <m@mpgn.dev> | 2025-01-29 13:24:30 +0400 | 
|---|---|---|
| committer | Matthieu Pignolet <m@mpgn.dev> | 2025-01-29 13:24:30 +0400 | 
| commit | 2827ce3e822245e0d88d385b6b8750f7f95dba2d (patch) | |
| tree | 4ace59df5d87122f199109cb332fb6217415e7dc /autofeur_db | |
| parent | 3d88aa6b6244c73720e5067ffeb0e77d6edc8f8b (diff) | |
proper UTF-8 support for input words & better "cutting" using syllable cuts
Diffstat (limited to 'autofeur_db')
| -rw-r--r-- | autofeur_db/Cargo.lock | 15 | ||||
| -rw-r--r-- | autofeur_db/Cargo.toml | 3 | ||||
| -rw-r--r-- | autofeur_db/src/bin/server.rs | 24 | ||||
| -rw-r--r-- | autofeur_db/src/inference.rs | 34 | 
4 files changed, 58 insertions, 18 deletions
diff --git a/autofeur_db/Cargo.lock b/autofeur_db/Cargo.lock index d2cfdd1..ece64ce 100644 --- a/autofeur_db/Cargo.lock +++ b/autofeur_db/Cargo.lock @@ -60,16 +60,19 @@ dependencies = [   "hypher",   "itertools",   "kdam", + "levenshtein",   "querystring",   "rand",   "reqwest",   "serde",   "serde_json", + "strsim",   "tokio",   "tower",   "tower-http",   "trie-rs",   "unicode-segmentation", + "url",  ]  [[package]] @@ -651,6 +654,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"  checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"  [[package]] +name = "levenshtein" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760" + +[[package]]  name = "libc"  version = "0.2.139"  source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1240,6 +1249,12 @@ dependencies = [  ]  [[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]]  name = "syn"  version = "1.0.107"  source = "registry+https://github.com/rust-lang/crates.io-index" diff --git a/autofeur_db/Cargo.toml b/autofeur_db/Cargo.toml index ac0fa13..d70d03b 100644 --- a/autofeur_db/Cargo.toml +++ b/autofeur_db/Cargo.toml @@ -24,6 +24,9 @@ anyhow = "1.0.68"  itertools = "0.10.5"  querystring = "1.1.0"  hypher = { version = "0.1", features = ["english", "french"] } +levenshtein = "1.0.5" +strsim = "0.11.1" +url = "*"  [[bin]]  name = "generate" diff --git a/autofeur_db/src/bin/server.rs b/autofeur_db/src/bin/server.rs index 8c350f9..b5edb80 100644 --- a/autofeur_db/src/bin/server.rs +++ b/autofeur_db/src/bin/server.rs @@ -6,16 +6,7 @@ use std::collections::HashMap;  use std::{fs, net::SocketAddr, sync::Arc};  use tower::{make::Shared, ServiceBuilder};  use tower_http::add_extension::AddExtensionLayer; - 
-fn parse_query(query: &str) -> HashMap<String, String> { -    query -        .split('&') -        .filter_map(|s| { -            s.split_once('=') -                .and_then(|t| Some((t.0.to_owned(), t.1.to_owned()))) -        }) -        .collect() -} +use url::form_urlencoded::parse;  fn anyhow_response(err: anyhow::Error) -> Response<Body> {      Response::builder() @@ -34,7 +25,18 @@ async fn handler(request: Request<Body>) -> Result<Response<Body>, hyper::Error>          Ok(ok) => ok,          Err(err) => return Ok(err),      }; -    let data = match parse_query(query) + +    let params: HashMap<String, String> = request +        .uri() +        .query() +        .map(|v| { +            url::form_urlencoded::parse(v.as_bytes()) +                .into_owned() +                .collect() +        }) +        .unwrap_or_else(HashMap::new); + +    let data = match params          .get("grapheme")          .ok_or_else(|| anyhow_response(anyhow!("grapheme argument is not specified")))      { diff --git a/autofeur_db/src/inference.rs b/autofeur_db/src/inference.rs index 49192f3..f681cd0 100644 --- a/autofeur_db/src/inference.rs +++ b/autofeur_db/src/inference.rs @@ -20,6 +20,10 @@ async fn call_inference_service(word: &str) -> anyhow::Result<String> {  impl Save<'_> {      pub async fn inference(&self, prefix: &str) -> anyhow::Result<String> {          let phonemes = call_inference_service(prefix).await?; +        let source_word_syllabes: Vec<&str> = hyphenate(prefix, hypher::Lang::French) +            .into_iter() +            .collect_vec(); +        println!("syl: [{}]", source_word_syllabes.join(","));          let completion = self              .trie @@ -37,8 +41,9 @@ impl Save<'_> {          println!("Matching {} by adding {}", word, completion); -        let mut completed_syllabes: Vec<&str> = hyphenate(word, hypher::Lang::French).into_iter().collect_vec(); -        let source_word_syllabes: Vec<&str> = hyphenate(prefix, 
hypher::Lang::French).into_iter().collect_vec(); +        let mut completed_syllabes: Vec<&str> = hyphenate(word, hypher::Lang::French) +            .into_iter() +            .collect_vec();          // input:           test          // output found:    testames @@ -46,18 +51,33 @@ impl Save<'_> {          // output expect:   tames          // we just need to remove the prefix - +        println!( +            "[{}] cmp [{}]", +            source_word_syllabes.join(","), +            completed_syllabes.join(",") +        );          let mut i = 0; -        for (index, syl) in source_word_syllabes.iter().enumerate() { -            if *source_word_syllabes[index] == **syl { +        let maxindex = source_word_syllabes.len() - 1; +        for (index, syl) in completed_syllabes.iter().enumerate() { +            if maxindex < index { +                break; +            } + +            let phon1 = &source_word_syllabes[index].to_lowercase(); +            let phon2 = &(**syl).to_lowercase(); + +            println!("comparing syllab {} vs {}", phon1, phon2); + +            if strsim::levenshtein(phon1, phon2) < 2 {                  i = index              } else { +                println!("found scyl break at {}", i);                  break;              }          } -        completed_syllabes.drain(0..i);        // we finally just need to compute the end of the word which matches the sound -        let found = completed_syllabes.join(""); +        // we finally just need to compute the end of the word which matches the sound +        let found = completed_syllabes.drain(i+1..).join("");          println!("{} is equivalent to {}", completion, found);          Ok(format!("{} ({})", found, word))  | 
