diff options
| author | Matthieu Pignolet <m@mpgn.dev> | 2025-05-05 20:22:56 +0400 |
|---|---|---|
| committer | Matthieu Pignolet <m@mpgn.dev> | 2025-05-05 20:22:56 +0400 |
| commit | fdfb65f920ebc5c72bd84397489ff3a79c017e77 (patch) | |
| tree | e350724a9fd1c013a087288d7852665e61f5834e | |
| parent | 95454a978af4575dfea77a8955b285a3f845df1b (diff) | |
feat: initial aline implementation
| -rw-r--r-- | Cargo.lock | 110 | ||||
| -rw-r--r-- | src/lib.rs | 240 |
2 files changed, 341 insertions, 9 deletions
diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..165c3df --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,110 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "aline" +version = "0.1.0" +dependencies = [ + "array2d", + "once_cell", + "serde", + "serde_json", +] + +[[package]] +name = "array2d" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8b39cb2c1bf5a7c0dd097aa95ab859cf87dab5a4328900f5388942dc1889f74" + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + +[[package]] +name = "serde" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.140" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "syn" +version = "2.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" @@ -1,14 +1,236 @@ -pub fn add(left: u64, right: u64) -> u64 { - left + right +use array2d::Array2D; +use core::f32; +use std::collections::{HashMap, HashSet}; +mod constants; + +use constants::EXTRACTED; + +/// Compute the alignment of two phonetic strings. +/// +/// (Kondrak 2002: 51) +pub fn align(str1: String, str2: String, epsilon: Option<f32>) -> Vec<Vec<(String, String)>> { + let epsilon = epsilon + .or(Some(0f32)) + .expect("the default value of epsilon should be 0"); + + assert!( + 0f32 <= epsilon && epsilon <= 1f32, + "Epsilon must be between 0.0 and 1.0." + ); + + let m = str1.len(); + let n = str2.len(); + + let mut S = Array2D::filled_with(0f32, m + 1, n + 1); + + for i in 1..m + 1 { + for j in 1..n + 1 { + let mut edit: [f32; 5] = [ + S[(i - 1, j)] + sigma_skip(&str1[i - 1..i - 1]), + S[(i, j - 1)] + sigma_skip(&str2[j - 1..j - 1]), + S[(i - 1, j - 1)] + sigma_sub(&str1[i - 1..i - 1], &str2[j - 1..j - 1]), + f32::MIN, + f32::MIN, + ]; + + if i > 1 { + edit[3] = S[(i - 2, j - 1)] + sigma_exp(&str2[j - 1..j - 1], &str1[i - 2..i]); + } + + if j > 1 { + edit[4] = S[(i - 1, j - 2)] + sigma_exp(&str1[i - 1..i - 1], &str2[j - 2..j]); + } + + S[(i, j)] = edit + .into_iter() + .max_by(|x, y| x.abs().partial_cmp(&y.abs()).expect("a S row was empty")) + .expect("the S matrix is empty"); + } + } + + // T = (1 - epsilon) * np.amax(S) + let T = (1f32 - epsilon) + * S.rows_iter() + .map(|r| { + r.max_by(|x, y| x.abs().partial_cmp(&y.abs()).expect("a S row is empty")) + .expect("a S row is empty") + }) + .max_by(|x, y| x.abs().partial_cmp(&y.abs()).expect("S is empty")) + .expect("S is empty"); + + let mut alignments: Vec<Vec<(String, String)>> = vec![]; + for i in 1..m + 1 { + for j in 1..n + 1 { + if S[(i, j)] >= T { + let mut al = vec![]; + _retrieve(i, j, 0f32, &S, T, &str1, &str2, &mut al); + alignments.push(al); + } + } + } + + alignments +} + +/// Retrieve the path through the similarity matrix S starting at (i, j). +fn _retrieve( + i: usize, + j: usize, + s: f32, + S: &Array2D<f32>, + T: f32, + str1: &str, + str2: &str, + out: &mut Vec<(String, String)>, +) { + if S[(i, j)] == 0f32 { + return; + } + + if j > 1 && S[(i - 1, j - 2)] + sigma_exp(&str1[i - 1..i - 1], &str2[j - 2..j]) + s >= T { + out.insert( + 0, + (str1[i - 1..i - 1].to_string(), str2[j - 2..j].to_string()), + ); + + _retrieve( + i - 1, + j - 2, + s + sigma_exp(&str1[i - 1..i - 1], &str2[j - 2..j]), + S, + T, + str1, + str2, + out, + ); + } else if i > 1 && S[(i - 2, j - 1)] + sigma_exp(&str2[j - 1..j - 1], &str1[i - 2..i]) + s >= T + { + out.insert( + 0, + (str1[i - 2..i].to_string(), str2[j - 1..j - 1].to_string()), + ); + + _retrieve( + i - 2, + j - 1, + s + sigma_exp(&str2[j - 1..j - 1], &str1[i - 2..i]), + S, + T, + str1, + str2, + out, + ); + } else if S[(i, j - 1)] + sigma_skip(&str2[j - 1..j - 1]) + s >= T { + out.insert(0, ("-".to_string(), str2[j - 1..j - 1].to_string())); + + _retrieve( + i, + j - 1, + s + sigma_skip(&str2[j - 1..j - 1]), + S, + T, + str1, + str2, + out, + ); + } else if S[(i - 1, j)] + sigma_skip(&str1[i - 1..i - 1]) + s >= T { + out.insert(0, (str1[i - 1..i - 1].to_string(), "-".to_string())); + _retrieve( + i - 1, + j, + s + sigma_skip(&str1[i - 1..i - 1]), + S, + T, + str1, + str2, + out, + ); + } else if S[(i - 1, j - 1)] + sigma_sub(&str1[i - 1..i - 1], &str2[j - 1..j - 1]) + s >= T { + out.insert( + 0, + ( + str1[i - 1..i - 1].to_string(), + str2[j - 1..j - 1].to_string(), + ), + ); + _retrieve( + i - 1, + j - 1, + s + sigma_sub(&str1[i - 1..i - 1], &str2[j - 1..j - 1]), + S, + T, + str1, + str2, + out, + ); + } +} + +/// Returns score of an indel of P. +/// +/// (Kondrak 2002: 54) +fn sigma_skip(_p: &str) -> f32 { + return EXTRACTED.cskip; +} + +/// Returns score of a substitution of P with Q. +/// +/// (Kondrak 2002: 54) +fn sigma_sub(p: &str, q: &str) -> f32 { + return EXTRACTED.csub - delta(p, q) - V(p) - V(q); +} + +/// Returns score of an expansion/compression. +/// +/// (Kondrak 2002: 54) +fn sigma_exp(p: &str, q: &str) -> f32 { + let q1 = q.chars().nth(0).unwrap().to_string(); + let q2 = q.chars().nth(1).unwrap().to_string(); + + return EXTRACTED.cexp - delta(p, &q1) - delta(p, &q2) - V(p) - f32::max(V(&q1), V(&q2)); } -#[cfg(test)] -mod tests { - use super::*; +/// Return weighted sum of difference between P and Q. +/// +/// (Kondrak 2002: 54) +fn delta(p: &str, q: &str) -> f32 { + let features = R(p, q); + let mut total = 0f32; + + for f in features { + total += diff(p, q, f) * EXTRACTED.salience[f]; + } + + return total; +} +/// Returns difference between phonetic segments P and Q for feature F. +/// +/// (Kondrak 2002: 52, 54) +fn diff(p: &str, q: &str, f: &str) -> f32 { + let p_features: &HashMap<String, String> = &EXTRACTED.feature_matrix[p]; + let q_features: &HashMap<String, String> = &EXTRACTED.feature_matrix[q]; + + return (EXTRACTED.similarity_matrix[&p_features[f]] + - EXTRACTED.similarity_matrix[&q_features[f]]) + .abs(); +} + +/// Return relevant features for segment comparison. +/// +/// (Kondrak 2002: 54) +fn R(p: &str, q: &str) -> &'static HashSet<std::string::String> { + if EXTRACTED.consonants.contains(p) || EXTRACTED.consonants.contains(q) { + return &EXTRACTED.rc; + } + return &EXTRACTED.rc; +} - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); +/// Return vowel weight if P is vowel. +/// +/// (Kondrak 2002: 54) +fn V(p: &str) -> f32 { + if EXTRACTED.consonants.contains(p) { + return 0f32; } + return EXTRACTED.cvwl; } |
