diff options
Diffstat (limited to 'src/lib.rs')
| -rw-r--r-- | src/lib.rs | 236 |
1 files changed, 0 insertions, 236 deletions
diff --git a/src/lib.rs b/src/lib.rs deleted file mode 100644 index de0db50..0000000 --- a/src/lib.rs +++ /dev/null @@ -1,236 +0,0 @@ -use array2d::Array2D; -use core::f32; -use std::collections::{HashMap, HashSet}; -mod constants; - -use constants::EXTRACTED; - -/// Compute the alignment of two phonetic strings. -/// -/// (Kondrak 2002: 51) -pub fn align(str1: String, str2: String, epsilon: Option<f32>) -> Vec<Vec<(String, String)>> { - let epsilon = epsilon - .or(Some(0f32)) - .expect("the default value of epsilon should be 0"); - - assert!( - 0f32 <= epsilon && epsilon <= 1f32, - "Epsilon must be between 0.0 and 1.0." - ); - - let m = str1.len(); - let n = str2.len(); - - let mut S = Array2D::filled_with(0f32, m + 1, n + 1); - - for i in 1..m + 1 { - for j in 1..n + 1 { - let mut edit: [f32; 5] = [ - S[(i - 1, j)] + sigma_skip(&str1[i - 1..i - 1]), - S[(i, j - 1)] + sigma_skip(&str2[j - 1..j - 1]), - S[(i - 1, j - 1)] + sigma_sub(&str1[i - 1..i - 1], &str2[j - 1..j - 1]), - f32::MIN, - f32::MIN, - ]; - - if i > 1 { - edit[3] = S[(i - 2, j - 1)] + sigma_exp(&str2[j - 1..j - 1], &str1[i - 2..i]); - } - - if j > 1 { - edit[4] = S[(i - 1, j - 2)] + sigma_exp(&str1[i - 1..i - 1], &str2[j - 2..j]); - } - - S[(i, j)] = edit - .into_iter() - .max_by(|x, y| x.abs().partial_cmp(&y.abs()).expect("a S row was empty")) - .expect("the S matrix is empty"); - } - } - - // T = (1 - epsilon) * np.amax(S) - let T = (1f32 - epsilon) - * S.rows_iter() - .map(|r| { - r.max_by(|x, y| x.abs().partial_cmp(&y.abs()).expect("a S row is empty")) - .expect("a S row is empty") - }) - .max_by(|x, y| x.abs().partial_cmp(&y.abs()).expect("S is empty")) - .expect("S is empty"); - - let mut alignments: Vec<Vec<(String, String)>> = vec![]; - for i in 1..m + 1 { - for j in 1..n + 1 { - if S[(i, j)] >= T { - let mut al = vec![]; - _retrieve(i, j, 0f32, &S, T, &str1, &str2, &mut al); - alignments.push(al); - } - } - } - - alignments -} - -/// Retrieve the path through the similarity matrix S starting at (i, j). -fn _retrieve( - i: usize, - j: usize, - s: f32, - S: &Array2D<f32>, - T: f32, - str1: &str, - str2: &str, - out: &mut Vec<(String, String)>, -) { - if S[(i, j)] == 0f32 { - return; - } - - if j > 1 && S[(i - 1, j - 2)] + sigma_exp(&str1[i - 1..i - 1], &str2[j - 2..j]) + s >= T { - out.insert( - 0, - (str1[i - 1..i - 1].to_string(), str2[j - 2..j].to_string()), - ); - - _retrieve( - i - 1, - j - 2, - s + sigma_exp(&str1[i - 1..i - 1], &str2[j - 2..j]), - S, - T, - str1, - str2, - out, - ); - } else if i > 1 && S[(i - 2, j - 1)] + sigma_exp(&str2[j - 1..j - 1], &str1[i - 2..i]) + s >= T - { - out.insert( - 0, - (str1[i - 2..i].to_string(), str2[j - 1..j - 1].to_string()), - ); - - _retrieve( - i - 2, - j - 1, - s + sigma_exp(&str2[j - 1..j - 1], &str1[i - 2..i]), - S, - T, - str1, - str2, - out, - ); - } else if S[(i, j - 1)] + sigma_skip(&str2[j - 1..j - 1]) + s >= T { - out.insert(0, ("-".to_string(), str2[j - 1..j - 1].to_string())); - - _retrieve( - i, - j - 1, - s + sigma_skip(&str2[j - 1..j - 1]), - S, - T, - str1, - str2, - out, - ); - } else if S[(i - 1, j)] + sigma_skip(&str1[i - 1..i - 1]) + s >= T { - out.insert(0, (str1[i - 1..i - 1].to_string(), "-".to_string())); - _retrieve( - i - 1, - j, - s + sigma_skip(&str1[i - 1..i - 1]), - S, - T, - str1, - str2, - out, - ); - } else if S[(i - 1, j - 1)] + sigma_sub(&str1[i - 1..i - 1], &str2[j - 1..j - 1]) + s >= T { - out.insert( - 0, - ( - str1[i - 1..i - 1].to_string(), - str2[j - 1..j - 1].to_string(), - ), - ); - _retrieve( - i - 1, - j - 1, - s + sigma_sub(&str1[i - 1..i - 1], &str2[j - 1..j - 1]), - S, - T, - str1, - str2, - out, - ); - } -} - -/// Returns score of an indel of P. -/// -/// (Kondrak 2002: 54) -fn sigma_skip(_p: &str) -> f32 { - return EXTRACTED.cskip; -} - -/// Returns score of a substitution of P with Q. -/// -/// (Kondrak 2002: 54) -fn sigma_sub(p: &str, q: &str) -> f32 { - return EXTRACTED.csub - delta(p, q) - V(p) - V(q); -} - -/// Returns score of an expansion/compression. -/// -/// (Kondrak 2002: 54) -fn sigma_exp(p: &str, q: &str) -> f32 { - let q1 = &q[0..0]; - let q2 = &q[1..1]; - - return EXTRACTED.cexp - delta(p, q1) - delta(p, q2) - V(p) - f32::max(V(&q1), V(&q2)); -} - -/// Return weighted sum of difference between P and Q. -/// -/// (Kondrak 2002: 54) -fn delta(p: &str, q: &str) -> f32 { - let features = R(p, q); - let mut total = 0f32; - - for f in features { - total += diff(p, q, f) * EXTRACTED.salience[f]; - } - - return total; -} -/// Returns difference between phonetic segments P and Q for feature F. -/// -/// (Kondrak 2002: 52, 54) -fn diff(p: &str, q: &str, f: &str) -> f32 { - let p_features: &HashMap<String, String> = &EXTRACTED.feature_matrix[p]; - let q_features: &HashMap<String, String> = &EXTRACTED.feature_matrix[q]; - - return (EXTRACTED.similarity_matrix[&p_features[f]] - - EXTRACTED.similarity_matrix[&q_features[f]]) - .abs(); -} - -/// Return relevant features for segment comparison. -/// -/// (Kondrak 2002: 54) -fn R(p: &str, q: &str) -> &'static HashSet<std::string::String> { - if EXTRACTED.consonants.contains(p) || EXTRACTED.consonants.contains(q) { - return &EXTRACTED.rc; - } - return &EXTRACTED.rc; -} - -/// Return vowel weight if P is vowel. -/// -/// (Kondrak 2002: 54) -fn V(p: &str) -> f32 { - if EXTRACTED.consonants.contains(p) { - return 0f32; - } - return EXTRACTED.cvwl; -} |
