diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/lib.rs | 240 |
1 files changed, 231 insertions, 9 deletions
@@ -1,14 +1,236 @@ -pub fn add(left: u64, right: u64) -> u64 { - left + right +use array2d::Array2D; +use core::f32; +use std::collections::{HashMap, HashSet}; +mod constants; + +use constants::EXTRACTED; + +/// Compute the alignment of two phonetic strings. +/// +/// (Kondrak 2002: 51) +pub fn align(str1: String, str2: String, epsilon: Option<f32>) -> Vec<Vec<(String, String)>> { + let epsilon = epsilon + .or(Some(0f32)) + .expect("the default value of epsilon should be 0"); + + assert!( + 0f32 <= epsilon && epsilon <= 1f32, + "Epsilon must be between 0.0 and 1.0." + ); + + let m = str1.len(); + let n = str2.len(); + + let mut S = Array2D::filled_with(0f32, m + 1, n + 1); + + for i in 1..m + 1 { + for j in 1..n + 1 { + let mut edit: [f32; 5] = [ + S[(i - 1, j)] + sigma_skip(&str1[i - 1..i - 1]), + S[(i, j - 1)] + sigma_skip(&str2[j - 1..j - 1]), + S[(i - 1, j - 1)] + sigma_sub(&str1[i - 1..i - 1], &str2[j - 1..j - 1]), + f32::MIN, + f32::MIN, + ]; + + if i > 1 { + edit[3] = S[(i - 2, j - 1)] + sigma_exp(&str2[j - 1..j - 1], &str1[i - 2..i]); + } + + if j > 1 { + edit[4] = S[(i - 1, j - 2)] + sigma_exp(&str1[i - 1..i - 1], &str2[j - 2..j]); + } + + S[(i, j)] = edit + .into_iter() + .max_by(|x, y| x.abs().partial_cmp(&y.abs()).expect("a S row was empty")) + .expect("the S matrix is empty"); + } + } + + // T = (1 - epsilon) * np.amax(S) + let T = (1f32 - epsilon) + * S.rows_iter() + .map(|r| { + r.max_by(|x, y| x.abs().partial_cmp(&y.abs()).expect("a S row is empty")) + .expect("a S row is empty") + }) + .max_by(|x, y| x.abs().partial_cmp(&y.abs()).expect("S is empty")) + .expect("S is empty"); + + let mut alignments: Vec<Vec<(String, String)>> = vec![]; + for i in 1..m + 1 { + for j in 1..n + 1 { + if S[(i, j)] >= T { + let mut al = vec![]; + _retrieve(i, j, 0f32, &S, T, &str1, &str2, &mut al); + alignments.push(al); + } + } + } + + alignments +} + +/// Retrieve the path through the similarity matrix S starting at (i, j). +fn _retrieve( + i: usize, + j: usize, + s: f32, + S: &Array2D<f32>, + T: f32, + str1: &str, + str2: &str, + out: &mut Vec<(String, String)>, +) { + if S[(i, j)] == 0f32 { + return; + } + + if j > 1 && S[(i - 1, j - 2)] + sigma_exp(&str1[i - 1..i - 1], &str2[j - 2..j]) + s >= T { + out.insert( + 0, + (str1[i - 1..i - 1].to_string(), str2[j - 2..j].to_string()), + ); + + _retrieve( + i - 1, + j - 2, + s + sigma_exp(&str1[i - 1..i - 1], &str2[j - 2..j]), + S, + T, + str1, + str2, + out, + ); + } else if i > 1 && S[(i - 2, j - 1)] + sigma_exp(&str2[j - 1..j - 1], &str1[i - 2..i]) + s >= T + { + out.insert( + 0, + (str1[i - 2..i].to_string(), str2[j - 1..j - 1].to_string()), + ); + + _retrieve( + i - 2, + j - 1, + s + sigma_exp(&str2[j - 1..j - 1], &str1[i - 2..i]), + S, + T, + str1, + str2, + out, + ); + } else if S[(i, j - 1)] + sigma_skip(&str2[j - 1..j - 1]) + s >= T { + out.insert(0, ("-".to_string(), str2[j - 1..j - 1].to_string())); + + _retrieve( + i, + j - 1, + s + sigma_skip(&str2[j - 1..j - 1]), + S, + T, + str1, + str2, + out, + ); + } else if S[(i - 1, j)] + sigma_skip(&str1[i - 1..i - 1]) + s >= T { + out.insert(0, (str1[i - 1..i - 1].to_string(), "-".to_string())); + _retrieve( + i - 1, + j, + s + sigma_skip(&str1[i - 1..i - 1]), + S, + T, + str1, + str2, + out, + ); + } else if S[(i - 1, j - 1)] + sigma_sub(&str1[i - 1..i - 1], &str2[j - 1..j - 1]) + s >= T { + out.insert( + 0, + ( + str1[i - 1..i - 1].to_string(), + str2[j - 1..j - 1].to_string(), + ), + ); + _retrieve( + i - 1, + j - 1, + s + sigma_sub(&str1[i - 1..i - 1], &str2[j - 1..j - 1]), + S, + T, + str1, + str2, + out, + ); + } +} + +/// Returns score of an indel of P. +/// +/// (Kondrak 2002: 54) +fn sigma_skip(_p: &str) -> f32 { + return EXTRACTED.cskip; +} + +/// Returns score of a substitution of P with Q. +/// +/// (Kondrak 2002: 54) +fn sigma_sub(p: &str, q: &str) -> f32 { + return EXTRACTED.csub - delta(p, q) - V(p) - V(q); +} + +/// Returns score of an expansion/compression. +/// +/// (Kondrak 2002: 54) +fn sigma_exp(p: &str, q: &str) -> f32 { + let q1 = q.chars().nth(0).unwrap().to_string(); + let q2 = q.chars().nth(1).unwrap().to_string(); + + return EXTRACTED.cexp - delta(p, &q1) - delta(p, &q2) - V(p) - f32::max(V(&q1), V(&q2)); } -#[cfg(test)] -mod tests { - use super::*; +/// Return weighted sum of difference between P and Q. +/// +/// (Kondrak 2002: 54) +fn delta(p: &str, q: &str) -> f32 { + let features = R(p, q); + let mut total = 0f32; + + for f in features { + total += diff(p, q, f) * EXTRACTED.salience[f]; + } + + return total; +} +/// Returns difference between phonetic segments P and Q for feature F. +/// +/// (Kondrak 2002: 52, 54) +fn diff(p: &str, q: &str, f: &str) -> f32 { + let p_features: &HashMap<String, String> = &EXTRACTED.feature_matrix[p]; + let q_features: &HashMap<String, String> = &EXTRACTED.feature_matrix[q]; + + return (EXTRACTED.similarity_matrix[&p_features[f]] + - EXTRACTED.similarity_matrix[&q_features[f]]) + .abs(); +} + +/// Return relevant features for segment comparison. +/// +/// (Kondrak 2002: 54) +fn R(p: &str, q: &str) -> &'static HashSet<std::string::String> { + if EXTRACTED.consonants.contains(p) || EXTRACTED.consonants.contains(q) { + return &EXTRACTED.rc; + } + return &EXTRACTED.rc; +} - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); +/// Return vowel weight if P is vowel. +/// +/// (Kondrak 2002: 54) +fn V(p: &str) -> f32 { + if EXTRACTED.consonants.contains(p) { + return 0f32; } + return EXTRACTED.cvwl; } |
