summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthieu Pignolet <m@mpgn.dev>2025-05-05 20:22:56 +0400
committerMatthieu Pignolet <m@mpgn.dev>2025-05-05 20:22:56 +0400
commitfdfb65f920ebc5c72bd84397489ff3a79c017e77 (patch)
treee350724a9fd1c013a087288d7852665e61f5834e
parent95454a978af4575dfea77a8955b285a3f845df1b (diff)
feat: initial aline implementation
-rw-r--r--Cargo.lock110
-rw-r--r--src/lib.rs240
2 files changed, 341 insertions, 9 deletions
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..165c3df
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,110 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "aline"
+version = "0.1.0"
+dependencies = [
+ "array2d",
+ "once_cell",
+ "serde",
+ "serde_json",
+]
+
+[[package]]
+name = "array2d"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d8b39cb2c1bf5a7c0dd097aa95ab859cf87dab5a4328900f5388942dc1889f74"
+
+[[package]]
+name = "itoa"
+version = "1.0.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
+
+[[package]]
+name = "memchr"
+version = "2.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
+
+[[package]]
+name = "once_cell"
+version = "1.21.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.95"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.40"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "ryu"
+version = "1.0.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
+
+[[package]]
+name = "serde"
+version = "1.0.219"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.219"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "serde_json"
+version = "1.0.140"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373"
+dependencies = [
+ "itoa",
+ "memchr",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.101"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
diff --git a/src/lib.rs b/src/lib.rs
index b93cf3f..cfa4ab2 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,14 +1,236 @@
-pub fn add(left: u64, right: u64) -> u64 {
- left + right
+use array2d::Array2D;
+use core::f32;
+use std::collections::{HashMap, HashSet};
+mod constants;
+
+use constants::EXTRACTED;
+
+/// Compute the alignment of two phonetic strings.
+///
+/// (Kondrak 2002: 51)
+pub fn align(str1: String, str2: String, epsilon: Option<f32>) -> Vec<Vec<(String, String)>> {
+ let epsilon = epsilon
+ .or(Some(0f32))
+ .expect("the default value of epsilon should be 0");
+
+ assert!(
+ 0f32 <= epsilon && epsilon <= 1f32,
+ "Epsilon must be between 0.0 and 1.0."
+ );
+
+ let m = str1.len();
+ let n = str2.len();
+
+ let mut S = Array2D::filled_with(0f32, m + 1, n + 1);
+
+ for i in 1..m + 1 {
+ for j in 1..n + 1 {
+ let mut edit: [f32; 5] = [
+ S[(i - 1, j)] + sigma_skip(&str1[i - 1..i - 1]),
+ S[(i, j - 1)] + sigma_skip(&str2[j - 1..j - 1]),
+ S[(i - 1, j - 1)] + sigma_sub(&str1[i - 1..i - 1], &str2[j - 1..j - 1]),
+ f32::MIN,
+ f32::MIN,
+ ];
+
+ if i > 1 {
+ edit[3] = S[(i - 2, j - 1)] + sigma_exp(&str2[j - 1..j - 1], &str1[i - 2..i]);
+ }
+
+ if j > 1 {
+ edit[4] = S[(i - 1, j - 2)] + sigma_exp(&str1[i - 1..i - 1], &str2[j - 2..j]);
+ }
+
+ S[(i, j)] = edit
+ .into_iter()
+ .max_by(|x, y| x.abs().partial_cmp(&y.abs()).expect("a S row was empty"))
+ .expect("the S matrix is empty");
+ }
+ }
+
+ // T = (1 - epsilon) * np.amax(S)
+ let T = (1f32 - epsilon)
+ * S.rows_iter()
+ .map(|r| {
+ r.max_by(|x, y| x.abs().partial_cmp(&y.abs()).expect("a S row is empty"))
+ .expect("a S row is empty")
+ })
+ .max_by(|x, y| x.abs().partial_cmp(&y.abs()).expect("S is empty"))
+ .expect("S is empty");
+
+ let mut alignments: Vec<Vec<(String, String)>> = vec![];
+ for i in 1..m + 1 {
+ for j in 1..n + 1 {
+ if S[(i, j)] >= T {
+ let mut al = vec![];
+ _retrieve(i, j, 0f32, &S, T, &str1, &str2, &mut al);
+ alignments.push(al);
+ }
+ }
+ }
+
+ alignments
+}
+
+/// Retrieve the path through the similarity matrix S starting at (i, j).
+fn _retrieve(
+ i: usize,
+ j: usize,
+ s: f32,
+ S: &Array2D<f32>,
+ T: f32,
+ str1: &str,
+ str2: &str,
+ out: &mut Vec<(String, String)>,
+) {
+ if S[(i, j)] == 0f32 {
+ return;
+ }
+
+ if j > 1 && S[(i - 1, j - 2)] + sigma_exp(&str1[i - 1..i - 1], &str2[j - 2..j]) + s >= T {
+ out.insert(
+ 0,
+ (str1[i - 1..i - 1].to_string(), str2[j - 2..j].to_string()),
+ );
+
+ _retrieve(
+ i - 1,
+ j - 2,
+ s + sigma_exp(&str1[i - 1..i - 1], &str2[j - 2..j]),
+ S,
+ T,
+ str1,
+ str2,
+ out,
+ );
+ } else if i > 1 && S[(i - 2, j - 1)] + sigma_exp(&str2[j - 1..j - 1], &str1[i - 2..i]) + s >= T
+ {
+ out.insert(
+ 0,
+ (str1[i - 2..i].to_string(), str2[j - 1..j - 1].to_string()),
+ );
+
+ _retrieve(
+ i - 2,
+ j - 1,
+ s + sigma_exp(&str2[j - 1..j - 1], &str1[i - 2..i]),
+ S,
+ T,
+ str1,
+ str2,
+ out,
+ );
+ } else if S[(i, j - 1)] + sigma_skip(&str2[j - 1..j - 1]) + s >= T {
+ out.insert(0, ("-".to_string(), str2[j - 1..j - 1].to_string()));
+
+ _retrieve(
+ i,
+ j - 1,
+ s + sigma_skip(&str2[j - 1..j - 1]),
+ S,
+ T,
+ str1,
+ str2,
+ out,
+ );
+ } else if S[(i - 1, j)] + sigma_skip(&str1[i - 1..i - 1]) + s >= T {
+ out.insert(0, (str1[i - 1..i - 1].to_string(), "-".to_string()));
+ _retrieve(
+ i - 1,
+ j,
+ s + sigma_skip(&str1[i - 1..i - 1]),
+ S,
+ T,
+ str1,
+ str2,
+ out,
+ );
+ } else if S[(i - 1, j - 1)] + sigma_sub(&str1[i - 1..i - 1], &str2[j - 1..j - 1]) + s >= T {
+ out.insert(
+ 0,
+ (
+ str1[i - 1..i - 1].to_string(),
+ str2[j - 1..j - 1].to_string(),
+ ),
+ );
+ _retrieve(
+ i - 1,
+ j - 1,
+ s + sigma_sub(&str1[i - 1..i - 1], &str2[j - 1..j - 1]),
+ S,
+ T,
+ str1,
+ str2,
+ out,
+ );
+ }
+}
+
+/// Returns score of an indel of P.
+///
+/// (Kondrak 2002: 54)
+fn sigma_skip(_p: &str) -> f32 {
+ return EXTRACTED.cskip;
+}
+
+/// Returns score of a substitution of P with Q.
+///
+/// (Kondrak 2002: 54)
+fn sigma_sub(p: &str, q: &str) -> f32 {
+ return EXTRACTED.csub - delta(p, q) - V(p) - V(q);
+}
+
+/// Returns score of an expansion/compression.
+///
+/// (Kondrak 2002: 54)
+fn sigma_exp(p: &str, q: &str) -> f32 {
+ let q1 = q.chars().nth(0).unwrap().to_string();
+ let q2 = q.chars().nth(1).unwrap().to_string();
+
+ return EXTRACTED.cexp - delta(p, &q1) - delta(p, &q2) - V(p) - f32::max(V(&q1), V(&q2));
}
-#[cfg(test)]
-mod tests {
- use super::*;
+/// Return weighted sum of difference between P and Q.
+///
+/// (Kondrak 2002: 54)
+fn delta(p: &str, q: &str) -> f32 {
+ let features = R(p, q);
+ let mut total = 0f32;
+
+ for f in features {
+ total += diff(p, q, f) * EXTRACTED.salience[f];
+ }
+
+ return total;
+}
+/// Returns difference between phonetic segments P and Q for feature F.
+///
+/// (Kondrak 2002: 52, 54)
+fn diff(p: &str, q: &str, f: &str) -> f32 {
+ let p_features: &HashMap<String, String> = &EXTRACTED.feature_matrix[p];
+ let q_features: &HashMap<String, String> = &EXTRACTED.feature_matrix[q];
+
+ return (EXTRACTED.similarity_matrix[&p_features[f]]
+ - EXTRACTED.similarity_matrix[&q_features[f]])
+ .abs();
+}
+
+/// Return relevant features for segment comparison.
+///
+/// (Kondrak 2002: 54)
+fn R(p: &str, q: &str) -> &'static HashSet<std::string::String> {
+ if EXTRACTED.consonants.contains(p) || EXTRACTED.consonants.contains(q) {
+ return &EXTRACTED.rc;
+ }
+ return &EXTRACTED.rc;
+}
- #[test]
- fn it_works() {
- let result = add(2, 2);
- assert_eq!(result, 4);
+/// Return vowel weight if P is vowel.
+///
+/// (Kondrak 2002: 54)
+fn V(p: &str) -> f32 {
+ if EXTRACTED.consonants.contains(p) {
+ return 0f32;
}
+ return EXTRACTED.cvwl;
}