summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/lib.rs240
1 files changed, 231 insertions, 9 deletions
diff --git a/src/lib.rs b/src/lib.rs
index b93cf3f..cfa4ab2 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,14 +1,236 @@
-pub fn add(left: u64, right: u64) -> u64 {
- left + right
+use array2d::Array2D;
+use core::f32;
+use std::collections::{HashMap, HashSet};
+mod constants;
+
+use constants::EXTRACTED;
+
+/// Compute the alignment of two phonetic strings.
+///
+/// (Kondrak 2002: 51)
+pub fn align(str1: String, str2: String, epsilon: Option<f32>) -> Vec<Vec<(String, String)>> {
+ let epsilon = epsilon
+ .or(Some(0f32))
+ .expect("the default value of epsilon should be 0");
+
+ assert!(
+ 0f32 <= epsilon && epsilon <= 1f32,
+ "Epsilon must be between 0.0 and 1.0."
+ );
+
+ let m = str1.len();
+ let n = str2.len();
+
+ let mut S = Array2D::filled_with(0f32, m + 1, n + 1);
+
+ for i in 1..m + 1 {
+ for j in 1..n + 1 {
+ let mut edit: [f32; 5] = [
+ S[(i - 1, j)] + sigma_skip(&str1[i - 1..i - 1]),
+ S[(i, j - 1)] + sigma_skip(&str2[j - 1..j - 1]),
+ S[(i - 1, j - 1)] + sigma_sub(&str1[i - 1..i - 1], &str2[j - 1..j - 1]),
+ f32::MIN,
+ f32::MIN,
+ ];
+
+ if i > 1 {
+ edit[3] = S[(i - 2, j - 1)] + sigma_exp(&str2[j - 1..j - 1], &str1[i - 2..i]);
+ }
+
+ if j > 1 {
+ edit[4] = S[(i - 1, j - 2)] + sigma_exp(&str1[i - 1..i - 1], &str2[j - 2..j]);
+ }
+
+ S[(i, j)] = edit
+ .into_iter()
+ .max_by(|x, y| x.abs().partial_cmp(&y.abs()).expect("a S row was empty"))
+ .expect("the S matrix is empty");
+ }
+ }
+
+ // T = (1 - epsilon) * np.amax(S)
+ let T = (1f32 - epsilon)
+ * S.rows_iter()
+ .map(|r| {
+ r.max_by(|x, y| x.abs().partial_cmp(&y.abs()).expect("a S row is empty"))
+ .expect("a S row is empty")
+ })
+ .max_by(|x, y| x.abs().partial_cmp(&y.abs()).expect("S is empty"))
+ .expect("S is empty");
+
+ let mut alignments: Vec<Vec<(String, String)>> = vec![];
+ for i in 1..m + 1 {
+ for j in 1..n + 1 {
+ if S[(i, j)] >= T {
+ let mut al = vec![];
+ _retrieve(i, j, 0f32, &S, T, &str1, &str2, &mut al);
+ alignments.push(al);
+ }
+ }
+ }
+
+ alignments
+}
+
+/// Retrieve the path through the similarity matrix S starting at (i, j).
+fn _retrieve(
+ i: usize,
+ j: usize,
+ s: f32,
+ S: &Array2D<f32>,
+ T: f32,
+ str1: &str,
+ str2: &str,
+ out: &mut Vec<(String, String)>,
+) {
+ if S[(i, j)] == 0f32 {
+ return;
+ }
+
+ if j > 1 && S[(i - 1, j - 2)] + sigma_exp(&str1[i - 1..i - 1], &str2[j - 2..j]) + s >= T {
+ out.insert(
+ 0,
+ (str1[i - 1..i - 1].to_string(), str2[j - 2..j].to_string()),
+ );
+
+ _retrieve(
+ i - 1,
+ j - 2,
+ s + sigma_exp(&str1[i - 1..i - 1], &str2[j - 2..j]),
+ S,
+ T,
+ str1,
+ str2,
+ out,
+ );
+ } else if i > 1 && S[(i - 2, j - 1)] + sigma_exp(&str2[j - 1..j - 1], &str1[i - 2..i]) + s >= T
+ {
+ out.insert(
+ 0,
+ (str1[i - 2..i].to_string(), str2[j - 1..j - 1].to_string()),
+ );
+
+ _retrieve(
+ i - 2,
+ j - 1,
+ s + sigma_exp(&str2[j - 1..j - 1], &str1[i - 2..i]),
+ S,
+ T,
+ str1,
+ str2,
+ out,
+ );
+ } else if S[(i, j - 1)] + sigma_skip(&str2[j - 1..j - 1]) + s >= T {
+ out.insert(0, ("-".to_string(), str2[j - 1..j - 1].to_string()));
+
+ _retrieve(
+ i,
+ j - 1,
+ s + sigma_skip(&str2[j - 1..j - 1]),
+ S,
+ T,
+ str1,
+ str2,
+ out,
+ );
+ } else if S[(i - 1, j)] + sigma_skip(&str1[i - 1..i - 1]) + s >= T {
+ out.insert(0, (str1[i - 1..i - 1].to_string(), "-".to_string()));
+ _retrieve(
+ i - 1,
+ j,
+ s + sigma_skip(&str1[i - 1..i - 1]),
+ S,
+ T,
+ str1,
+ str2,
+ out,
+ );
+ } else if S[(i - 1, j - 1)] + sigma_sub(&str1[i - 1..i - 1], &str2[j - 1..j - 1]) + s >= T {
+ out.insert(
+ 0,
+ (
+ str1[i - 1..i - 1].to_string(),
+ str2[j - 1..j - 1].to_string(),
+ ),
+ );
+ _retrieve(
+ i - 1,
+ j - 1,
+ s + sigma_sub(&str1[i - 1..i - 1], &str2[j - 1..j - 1]),
+ S,
+ T,
+ str1,
+ str2,
+ out,
+ );
+ }
+}
+
+/// Returns score of an indel of P.
+///
+/// (Kondrak 2002: 54)
+fn sigma_skip(_p: &str) -> f32 {
+ return EXTRACTED.cskip;
+}
+
+/// Returns score of a substitution of P with Q.
+///
+/// (Kondrak 2002: 54)
+fn sigma_sub(p: &str, q: &str) -> f32 {
+ return EXTRACTED.csub - delta(p, q) - V(p) - V(q);
+}
+
+/// Returns score of an expansion/compression.
+///
+/// (Kondrak 2002: 54)
+fn sigma_exp(p: &str, q: &str) -> f32 {
+ let q1 = q.chars().nth(0).unwrap().to_string();
+ let q2 = q.chars().nth(1).unwrap().to_string();
+
+ return EXTRACTED.cexp - delta(p, &q1) - delta(p, &q2) - V(p) - f32::max(V(&q1), V(&q2));
}
-#[cfg(test)]
-mod tests {
- use super::*;
+/// Return weighted sum of difference between P and Q.
+///
+/// (Kondrak 2002: 54)
+fn delta(p: &str, q: &str) -> f32 {
+ let features = R(p, q);
+ let mut total = 0f32;
+
+ for f in features {
+ total += diff(p, q, f) * EXTRACTED.salience[f];
+ }
+
+ return total;
+}
+/// Returns difference between phonetic segments P and Q for feature F.
+///
+/// (Kondrak 2002: 52, 54)
+fn diff(p: &str, q: &str, f: &str) -> f32 {
+ let p_features: &HashMap<String, String> = &EXTRACTED.feature_matrix[p];
+ let q_features: &HashMap<String, String> = &EXTRACTED.feature_matrix[q];
+
+ return (EXTRACTED.similarity_matrix[&p_features[f]]
+ - EXTRACTED.similarity_matrix[&q_features[f]])
+ .abs();
+}
+
+/// Return relevant features for segment comparison.
+///
+/// (Kondrak 2002: 54)
+fn R(p: &str, q: &str) -> &'static HashSet<std::string::String> {
+ if EXTRACTED.consonants.contains(p) || EXTRACTED.consonants.contains(q) {
+ return &EXTRACTED.rc;
+ }
+ return &EXTRACTED.rc;
+}
- #[test]
- fn it_works() {
- let result = add(2, 2);
- assert_eq!(result, 4);
+/// Return vowel weight if P is vowel.
+///
+/// (Kondrak 2002: 54)
+fn V(p: &str) -> f32 {
+ if EXTRACTED.consonants.contains(p) {
+ return 0f32;
}
+ return EXTRACTED.cvwl;
}