1 files changed, 271 insertions, 0 deletions
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..e79e040
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,271 @@
+// ALINE phonetic sequence alignment in Rust
+// Port of NLTK's ALINE module (Greg Kondrak, 2002)
+
+/// ALINE
+/// https://webdocs.cs.ualberta.ca/~kondrak/
+/// Copyright 2002 by Grzegorz Kondrak.
+/// 
+/// ALINE is an algorithm for aligning phonetic sequences, described in [1].
+/// This module is a port of Kondrak's (2002) ALINE. It provides functions for
+/// phonetic sequence alignment and similarity analysis. These are useful in
+/// historical linguistics, sociolinguistics and synchronic phonology.
+/// 
+/// ALINE has parameters that can be tuned for desired output. These parameters are:
+/// - C_skip, C_sub, C_exp, C_vwl
+/// - Salience weights
+/// - Segmental features
+/// 
+/// In this implementation, some parameters have been changed from their default
+/// values as described in [1], in order to replicate published results. All changes
+/// are noted in comments.
+/// 
+/// # Get optimal alignment of two phonetic sequences
+/// 
+/// ```
+/// use aline::align;
+/// 
+/// let alignment = align("θin", "tenwis", 0.0);
+/// 
+/// assert_eq!(
+///     alignment,
+///     vec![
+///         vec![
+///             ("θ", "t"),
+///             ("i", "e"),
+///             ("n", "n")
+///         ].iter()
+///         .map(|(a, b)| (a.to_string(), b.to_string()))
+///         .collect::<Vec<(String, String)>>()
+///     ]
+/// );
+/// ```
+/// 
+/// [1] G. Kondrak. Algorithms for Language Reconstruction. PhD dissertation,
+/// University of Toronto.
+
+use std::{collections::HashSet, f64};
+
+use constants::EXTRACTED;
+use unicode_segmentation::UnicodeSegmentation;
+mod constants;
+
+#[cfg(test)]
+mod test;
+
+/// Compute the alignment of two phonetic strings.
+/// 
+/// (Kondrak 2002: 51)
+pub fn align(str1: &str, str2: &str, epsilon: f64) -> Vec<Vec<(String, String)>> {
+    assert!(
+        (0.0..=1.0).contains(&epsilon),
+        "Epsilon must be between 0.0 and 1.0."
+    );
+
+    let str1_chars: Vec<&str> = str1.graphemes(true).collect();
+    let str2_chars: Vec<&str> = str2.graphemes(true).collect();
+    let m = str1_chars.len();
+    let n = str2_chars.len();
+
+    // This includes Kondrak's initialization of row 0 and column 0 to all 0s.
+    let mut s = vec![vec![0.0; n + 1]; m + 1];
+    for i in 1..=m {
+        for j in 1..=n {
+            let edit1 = s[i - 1][j] + sigma_skip();
+            let edit2 = s[i][j - 1] + sigma_skip();
+
+            let edit3 = s[i - 1][j - 1] + sigma_sub(str1_chars[i - 1], str2_chars[j - 1]);
+
+            let edit4 = if i > 1 {
+                s[i - 2][j - 1] + sigma_exp(str2_chars[j - 1], str1_chars[i - 2], str1_chars[i - 1])
+            } else {
+                -f64::INFINITY
+            };
+
+            let edit5 = if j > 1 {
+                s[i - 1][j - 2] + sigma_exp(str1_chars[i - 1], str2_chars[j - 2], str2_chars[j - 1])
+            } else {
+                -f64::INFINITY
+            };
+
+            s[i][j] = [edit1, edit2, edit3, edit4, edit5]
+                .iter()
+                .fold(0f64, |prev, curr| f64::max(prev, *curr));
+        }
+    }
+
+    let t = (1.0 - epsilon)
+        * s.iter()
+            .flat_map(|row| row.iter())
+            .cloned()
+            .fold(f64::NAN, f64::max);
+
+    let mut aligns = Vec::new();
+    for i in 1..=m {
+        for j in 1..=n {
+            if s[i][j] >= t {
+                let mut out = Vec::new();
+                retrieve(i, j, 0.0, &s, t, &str1_chars, &str2_chars, &mut out);
+                aligns.push(out);
+            }
+        }
+    }
+    aligns
+}
+
+/// Retrieve the path through the similarity matrix S starting at (i, j).
+#[inline]
+fn retrieve<'a>(
+    i: usize,
+    j: usize,
+    score: f64,
+    s: &Vec<Vec<f64>>,
+    t: f64,
+    str1: &[&str],
+    str2: &[&str],
+    out: &'a mut Vec<(String, String)>,
+) -> &'a mut Vec<(String, String)> {
+    if s[i][j] == 0.0 {
+        return out;
+    }
+
+    if j > 1 && (s[i - 1][j - 2] + sigma_exp(str1[i - 1], str2[j - 2], str2[j - 1]) + score) >= t {
+        // j > 1 and S[i - 1, j - 2] + sigma_exp(str1[i - 1], str2[j - 2 : j]) + s >= T
+
+        let key = str2[j - 2..j].join("");
+        out.insert(0, (str1[i - 1].to_string(), key));
+
+        retrieve(
+            i - 1,
+            j - 2,
+            score + sigma_exp(str1[i - 1], str2[j - 2], str2[j - 1]),
+            s,
+            t,
+            str1,
+            str2,
+            out,
+        );
+    } else if i > 1
+        && (s[i - 2][j - 1] + sigma_exp(str2[j - 1], str1[i - 2], str1[i - 1]) + score) >= t
+    {
+        // i > 1 and S[i - 2, j - 1] + sigma_exp(str2[j - 1], str1[i - 2 : i]) + s >= T
+        let key = str1[i - 2..i].join("");
+        out.insert(0, (key, str2[j - 1].to_string()));
+
+        retrieve(
+            i - 2,
+            j - 1,
+            score + sigma_exp(str2[j - 1], str1[i - 2], str1[i - 1]),
+            s,
+            t,
+            str1,
+            str2,
+            out,
+        );
+    } else if (s[i][j - 1] + sigma_skip() + score) >= t {
+        // S[i, j - 1] + sigma_skip(str2[j - 1]) + s >= T
+
+        out.insert(0, ("-".to_string(), str2[j - 1].to_string()));
+        retrieve(i, j - 1, score + sigma_skip(), s, t, str1, str2, out);
+    } else if (s[i - 1][j] + sigma_skip() + score) >= t {
+        // S[i - 1, j] + sigma_skip(str1[i - 1]) + s >= T
+
+        out.insert(0, (str1[i - 1].to_string(), "-".to_string()));
+        retrieve(i - 1, j, score + sigma_skip(), s, t, str1, str2, out);
+    } else if (s[i - 1][j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + score) >= t {
+        // S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + s >= T
+        out.insert(0, (str1[i - 1].to_string(), str2[j - 1].to_string()));
+
+        retrieve(
+            i - 1,
+            j - 1,
+            score + sigma_sub(str1[i - 1], str2[j - 1]),
+            s,
+            t,
+            str1,
+            str2,
+            out,
+        );
+    }
+
+    return out;
+}
+
+/// Returns score of an indel of P.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn sigma_skip() -> f64 {
+    EXTRACTED.cskip
+}
+
+/// Returns score of a substitution of P with Q.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn sigma_sub(p: &str, q: &str) -> f64 {
+    EXTRACTED.csub - delta(p, q) - v(p) - v(q)
+}
+
+/// Returns score of an expansion/compression.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn sigma_exp(p: &str, q1: &str, q2: &str) -> f64 {
+    EXTRACTED.cexp - delta(p, q1) - delta(p, q2) - v(p) - f64::max(v(q1), v(q2))
+}
+
+/// Return weighted sum of difference between P and Q.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn delta(p: &str, q: &str) -> f64 {
+    let features = r(p, q);
+    features
+        .iter()
+        .map(|f| diff(p, q, f) * *EXTRACTED.salience.get(f).unwrap_or_else(|| unreachable!()))
+        .sum()
+}
+
+/// Returns difference between phonetic segments P and Q for feature F.
+///
+/// (Kondrak 2002: 52, 54)
+#[inline]
+fn diff(p: &str, q: &str, f: &str) -> f64 {
+    let p_features = &EXTRACTED.feature_matrix[&p.to_string()][f];
+    let q_features = &EXTRACTED.feature_matrix[&q.to_string()][f];
+    let p_similarity = *EXTRACTED
+        .similarity_matrix
+        .get(p_features)
+        .unwrap_or_else(|| unreachable!());
+    let q_similarity = *EXTRACTED
+        .similarity_matrix
+        .get(q_features)
+        .unwrap_or_else(|| unreachable!());
+    (p_similarity - q_similarity).abs()
+}
+
+/// Return relevant features for segment comparison.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn r<'a>(p: &str, q: &str) -> &'static HashSet<String> {
+    if EXTRACTED.consonants.contains(&p.to_string())
+        || EXTRACTED.consonants.contains(&q.to_string())
+    {
+        &EXTRACTED.rc
+    } else {
+        &EXTRACTED.rv
+    }
+}
+
+/// Return vowel weight if P is vowel.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn v(p: &str) -> f64 {
+    if !EXTRACTED.consonants.contains(&p.to_string()) {
+        EXTRACTED.cvwl
+    } else {
+        0.0
+    }
+}