summaryrefslogtreecommitdiff
path: root/src/lib.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib.rs')
-rw-r--r--src/lib.rs271
1 files changed, 271 insertions, 0 deletions
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..e79e040
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,271 @@
+// ALINE phonetic sequence alignment in Rust
+// Port of NLTK's ALINE module (Greg Kondrak, 2002)
+
+/// ALINE
+/// https://webdocs.cs.ualberta.ca/~kondrak/
+/// Copyright 2002 by Grzegorz Kondrak.
+///
+/// ALINE is an algorithm for aligning phonetic sequences, described in [1].
+/// This module is a port of Kondrak's (2002) ALINE. It provides functions for
+/// phonetic sequence alignment and similarity analysis. These are useful in
+/// historical linguistics, sociolinguistics and synchronic phonology.
+///
+/// ALINE has parameters that can be tuned for desired output. These parameters are:
+/// - C_skip, C_sub, C_exp, C_vwl
+/// - Salience weights
+/// - Segmental features
+///
+/// In this implementation, some parameters have been changed from their default
+/// values as described in [1], in order to replicate published results. All changes
+/// are noted in comments.
+///
+/// # Get optimal alignment of two phonetic sequences
+///
+/// ```
+/// use aline::align;
+///
+/// let alignment = align("θin", "tenwis", 0.0);
+///
+/// assert_eq!(
+/// alignment,
+/// vec![
+/// vec![
+/// ("θ", "t"),
+/// ("i", "e"),
+/// ("n", "n")
+/// ].iter()
+/// .map(|(a, b)| (a.to_string(), b.to_string()))
+/// .collect::<Vec<(String, String)>>()
+/// ]
+/// );
+/// ```
+///
+/// [1] G. Kondrak. Algorithms for Language Reconstruction. PhD dissertation,
+/// University of Toronto.
+
+use std::{collections::HashSet, f64};
+
+use constants::EXTRACTED;
+use unicode_segmentation::UnicodeSegmentation;
+mod constants;
+
+#[cfg(test)]
+mod test;
+
+/// Compute the alignment of two phonetic strings.
+///
+/// (Kondrak 2002: 51)
+pub fn align(str1: &str, str2: &str, epsilon: f64) -> Vec<Vec<(String, String)>> {
+ assert!(
+ (0.0..=1.0).contains(&epsilon),
+ "Epsilon must be between 0.0 and 1.0."
+ );
+
+ let str1_chars: Vec<&str> = str1.graphemes(true).collect();
+ let str2_chars: Vec<&str> = str2.graphemes(true).collect();
+ let m = str1_chars.len();
+ let n = str2_chars.len();
+
+ // This includes Kondrak's initialization of row 0 and column 0 to all 0s.
+ let mut s = vec![vec![0.0; n + 1]; m + 1];
+ for i in 1..=m {
+ for j in 1..=n {
+ let edit1 = s[i - 1][j] + sigma_skip();
+ let edit2 = s[i][j - 1] + sigma_skip();
+
+ let edit3 = s[i - 1][j - 1] + sigma_sub(str1_chars[i - 1], str2_chars[j - 1]);
+
+ let edit4 = if i > 1 {
+ s[i - 2][j - 1] + sigma_exp(str2_chars[j - 1], str1_chars[i - 2], str1_chars[i - 1])
+ } else {
+ -f64::INFINITY
+ };
+
+ let edit5 = if j > 1 {
+ s[i - 1][j - 2] + sigma_exp(str1_chars[i - 1], str2_chars[j - 2], str2_chars[j - 1])
+ } else {
+ -f64::INFINITY
+ };
+
+ s[i][j] = [edit1, edit2, edit3, edit4, edit5]
+ .iter()
+ .fold(0f64, |prev, curr| f64::max(prev, *curr));
+ }
+ }
+
+ let t = (1.0 - epsilon)
+ * s.iter()
+ .flat_map(|row| row.iter())
+ .cloned()
+ .fold(f64::NAN, f64::max);
+
+ let mut aligns = Vec::new();
+ for i in 1..=m {
+ for j in 1..=n {
+ if s[i][j] >= t {
+ let mut out = Vec::new();
+ retrieve(i, j, 0.0, &s, t, &str1_chars, &str2_chars, &mut out);
+ aligns.push(out);
+ }
+ }
+ }
+ aligns
+}
+
+/// Retrieve the path through the similarity matrix S starting at (i, j).
+#[inline]
+fn retrieve<'a>(
+ i: usize,
+ j: usize,
+ score: f64,
+ s: &Vec<Vec<f64>>,
+ t: f64,
+ str1: &[&str],
+ str2: &[&str],
+ out: &'a mut Vec<(String, String)>,
+) -> &'a mut Vec<(String, String)> {
+ if s[i][j] == 0.0 {
+ return out;
+ }
+
+ if j > 1 && (s[i - 1][j - 2] + sigma_exp(str1[i - 1], str2[j - 2], str2[j - 1]) + score) >= t {
+ // j > 1 and S[i - 1, j - 2] + sigma_exp(str1[i - 1], str2[j - 2 : j]) + s >= T
+
+ let key = str2[j - 2..j].join("");
+ out.insert(0, (str1[i - 1].to_string(), key));
+
+ retrieve(
+ i - 1,
+ j - 2,
+ score + sigma_exp(str1[i - 1], str2[j - 2], str2[j - 1]),
+ s,
+ t,
+ str1,
+ str2,
+ out,
+ );
+ } else if i > 1
+ && (s[i - 2][j - 1] + sigma_exp(str2[j - 1], str1[i - 2], str1[i - 1]) + score) >= t
+ {
+ // i > 1 and S[i - 2, j - 1] + sigma_exp(str2[j - 1], str1[i - 2 : i]) + s >= T
+ let key = str1[i - 2..i].join("");
+ out.insert(0, (key, str2[j - 1].to_string()));
+
+ retrieve(
+ i - 2,
+ j - 1,
+ score + sigma_exp(str2[j - 1], str1[i - 2], str1[i - 1]),
+ s,
+ t,
+ str1,
+ str2,
+ out,
+ );
+ } else if (s[i][j - 1] + sigma_skip() + score) >= t {
+ // S[i, j - 1] + sigma_skip(str2[j - 1]) + s >= T
+
+ out.insert(0, ("-".to_string(), str2[j - 1].to_string()));
+ retrieve(i, j - 1, score + sigma_skip(), s, t, str1, str2, out);
+ } else if (s[i - 1][j] + sigma_skip() + score) >= t {
+ // S[i - 1, j] + sigma_skip(str1[i - 1]) + s >= T
+
+ out.insert(0, (str1[i - 1].to_string(), "-".to_string()));
+ retrieve(i - 1, j, score + sigma_skip(), s, t, str1, str2, out);
+ } else if (s[i - 1][j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + score) >= t {
+ // S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + s >= T
+ out.insert(0, (str1[i - 1].to_string(), str2[j - 1].to_string()));
+
+ retrieve(
+ i - 1,
+ j - 1,
+ score + sigma_sub(str1[i - 1], str2[j - 1]),
+ s,
+ t,
+ str1,
+ str2,
+ out,
+ );
+ }
+
+ return out;
+}
+
+/// Returns score of an indel of P.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn sigma_skip() -> f64 {
+ EXTRACTED.cskip
+}
+
+/// Returns score of a substitution of P with Q.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn sigma_sub(p: &str, q: &str) -> f64 {
+ EXTRACTED.csub - delta(p, q) - v(p) - v(q)
+}
+
+/// Returns score of an expansion/compression.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn sigma_exp(p: &str, q1: &str, q2: &str) -> f64 {
+ EXTRACTED.cexp - delta(p, q1) - delta(p, q2) - v(p) - f64::max(v(q1), v(q2))
+}
+
+/// Return weighted sum of difference between P and Q.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn delta(p: &str, q: &str) -> f64 {
+ let features = r(p, q);
+ features
+ .iter()
+ .map(|f| diff(p, q, f) * *EXTRACTED.salience.get(f).unwrap_or_else(|| unreachable!()))
+ .sum()
+}
+
+/// Returns difference between phonetic segments P and Q for feature F.
+///
+/// (Kondrak 2002: 52, 54)
+#[inline]
+fn diff(p: &str, q: &str, f: &str) -> f64 {
+ let p_features = &EXTRACTED.feature_matrix[&p.to_string()][f];
+ let q_features = &EXTRACTED.feature_matrix[&q.to_string()][f];
+ let p_similarity = *EXTRACTED
+ .similarity_matrix
+ .get(p_features)
+ .unwrap_or_else(|| unreachable!());
+ let q_similarity = *EXTRACTED
+ .similarity_matrix
+ .get(q_features)
+ .unwrap_or_else(|| unreachable!());
+ (p_similarity - q_similarity).abs()
+}
+
+/// Return relevant features for segment comparison.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn r<'a>(p: &str, q: &str) -> &'static HashSet<String> {
+ if EXTRACTED.consonants.contains(&p.to_string())
+ || EXTRACTED.consonants.contains(&q.to_string())
+ {
+ &EXTRACTED.rc
+ } else {
+ &EXTRACTED.rv
+ }
+}
+
+/// Return vowel weight if P is vowel.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn v(p: &str) -> f64 {
+ if !EXTRACTED.consonants.contains(&p.to_string()) {
+ EXTRACTED.cvwl
+ } else {
+ 0.0
+ }
+}