use array2d::Array2D; use core::f32; use std::collections::{HashMap, HashSet}; mod constants; use constants::EXTRACTED; /// Compute the alignment of two phonetic strings. /// /// (Kondrak 2002: 51) pub fn align(str1: String, str2: String, epsilon: Option) -> Vec> { let epsilon = epsilon .or(Some(0f32)) .expect("the default value of epsilon should be 0"); assert!( 0f32 <= epsilon && epsilon <= 1f32, "Epsilon must be between 0.0 and 1.0." ); let m = str1.len(); let n = str2.len(); let mut S = Array2D::filled_with(0f32, m + 1, n + 1); for i in 1..m + 1 { for j in 1..n + 1 { let mut edit: [f32; 5] = [ S[(i - 1, j)] + sigma_skip(&str1[i - 1..i - 1]), S[(i, j - 1)] + sigma_skip(&str2[j - 1..j - 1]), S[(i - 1, j - 1)] + sigma_sub(&str1[i - 1..i - 1], &str2[j - 1..j - 1]), f32::MIN, f32::MIN, ]; if i > 1 { edit[3] = S[(i - 2, j - 1)] + sigma_exp(&str2[j - 1..j - 1], &str1[i - 2..i]); } if j > 1 { edit[4] = S[(i - 1, j - 2)] + sigma_exp(&str1[i - 1..i - 1], &str2[j - 2..j]); } S[(i, j)] = edit .into_iter() .max_by(|x, y| x.abs().partial_cmp(&y.abs()).expect("a S row was empty")) .expect("the S matrix is empty"); } } // T = (1 - epsilon) * np.amax(S) let T = (1f32 - epsilon) * S.rows_iter() .map(|r| { r.max_by(|x, y| x.abs().partial_cmp(&y.abs()).expect("a S row is empty")) .expect("a S row is empty") }) .max_by(|x, y| x.abs().partial_cmp(&y.abs()).expect("S is empty")) .expect("S is empty"); let mut alignments: Vec> = vec![]; for i in 1..m + 1 { for j in 1..n + 1 { if S[(i, j)] >= T { let mut al = vec![]; _retrieve(i, j, 0f32, &S, T, &str1, &str2, &mut al); alignments.push(al); } } } alignments } /// Retrieve the path through the similarity matrix S starting at (i, j). fn _retrieve( i: usize, j: usize, s: f32, S: &Array2D, T: f32, str1: &str, str2: &str, out: &mut Vec<(String, String)>, ) { if S[(i, j)] == 0f32 { return; } if j > 1 && S[(i - 1, j - 2)] + sigma_exp(&str1[i - 1..i - 1], &str2[j - 2..j]) + s >= T { out.insert( 0, (str1[i - 1..i - 1].to_string(), str2[j - 2..j].to_string()), ); _retrieve( i - 1, j - 2, s + sigma_exp(&str1[i - 1..i - 1], &str2[j - 2..j]), S, T, str1, str2, out, ); } else if i > 1 && S[(i - 2, j - 1)] + sigma_exp(&str2[j - 1..j - 1], &str1[i - 2..i]) + s >= T { out.insert( 0, (str1[i - 2..i].to_string(), str2[j - 1..j - 1].to_string()), ); _retrieve( i - 2, j - 1, s + sigma_exp(&str2[j - 1..j - 1], &str1[i - 2..i]), S, T, str1, str2, out, ); } else if S[(i, j - 1)] + sigma_skip(&str2[j - 1..j - 1]) + s >= T { out.insert(0, ("-".to_string(), str2[j - 1..j - 1].to_string())); _retrieve( i, j - 1, s + sigma_skip(&str2[j - 1..j - 1]), S, T, str1, str2, out, ); } else if S[(i - 1, j)] + sigma_skip(&str1[i - 1..i - 1]) + s >= T { out.insert(0, (str1[i - 1..i - 1].to_string(), "-".to_string())); _retrieve( i - 1, j, s + sigma_skip(&str1[i - 1..i - 1]), S, T, str1, str2, out, ); } else if S[(i - 1, j - 1)] + sigma_sub(&str1[i - 1..i - 1], &str2[j - 1..j - 1]) + s >= T { out.insert( 0, ( str1[i - 1..i - 1].to_string(), str2[j - 1..j - 1].to_string(), ), ); _retrieve( i - 1, j - 1, s + sigma_sub(&str1[i - 1..i - 1], &str2[j - 1..j - 1]), S, T, str1, str2, out, ); } } /// Returns score of an indel of P. /// /// (Kondrak 2002: 54) fn sigma_skip(_p: &str) -> f32 { return EXTRACTED.cskip; } /// Returns score of a substitution of P with Q. /// /// (Kondrak 2002: 54) fn sigma_sub(p: &str, q: &str) -> f32 { return EXTRACTED.csub - delta(p, q) - V(p) - V(q); } /// Returns score of an expansion/compression. /// /// (Kondrak 2002: 54) fn sigma_exp(p: &str, q: &str) -> f32 { let q1 = q.chars().nth(0).unwrap().to_string(); let q2 = q.chars().nth(1).unwrap().to_string(); return EXTRACTED.cexp - delta(p, &q1) - delta(p, &q2) - V(p) - f32::max(V(&q1), V(&q2)); } /// Return weighted sum of difference between P and Q. /// /// (Kondrak 2002: 54) fn delta(p: &str, q: &str) -> f32 { let features = R(p, q); let mut total = 0f32; for f in features { total += diff(p, q, f) * EXTRACTED.salience[f]; } return total; } /// Returns difference between phonetic segments P and Q for feature F. /// /// (Kondrak 2002: 52, 54) fn diff(p: &str, q: &str, f: &str) -> f32 { let p_features: &HashMap = &EXTRACTED.feature_matrix[p]; let q_features: &HashMap = &EXTRACTED.feature_matrix[q]; return (EXTRACTED.similarity_matrix[&p_features[f]] - EXTRACTED.similarity_matrix[&q_features[f]]) .abs(); } /// Return relevant features for segment comparison. /// /// (Kondrak 2002: 54) fn R(p: &str, q: &str) -> &'static HashSet { if EXTRACTED.consonants.contains(p) || EXTRACTED.consonants.contains(q) { return &EXTRACTED.rc; } return &EXTRACTED.rc; } /// Return vowel weight if P is vowel. /// /// (Kondrak 2002: 54) fn V(p: &str) -> f32 { if EXTRACTED.consonants.contains(p) { return 0f32; } return EXTRACTED.cvwl; }