//! ALINE
//!
//! Copyright 2002 by Grzegorz Kondrak.
//!
//! ALINE is an algorithm for aligning phonetic sequences, described in \[1\].
//! This module is a port of Kondrak's (2002) ALINE. It provides functions for
//! phonetic sequence alignment and similarity analysis. These are useful in
//! historical linguistics, sociolinguistics and synchronic phonology.
//!
//! ALINE has parameters that can be tuned for desired output. These parameters are:
//! - C_skip, C_sub, C_exp, C_vwl
//! - Salience weights
//! - Segmental features
//!
//! In this implementation, some parameters have been changed from their default
//! values as described in \[1\], in order to replicate published results. All changes
//! are noted in comments.
//!
//! # Get optimal alignment of two phonetic sequences
//!
//! ```
//! use aline::align;
//!
//! let alignment = align("θin", "tenwis", 0.0);
//!
//! assert_eq!(
//! alignment,
//! vec![
//! vec![
//! ("θ", "t"),
//! ("i", "e"),
//! ("n", "n")
//! ].iter()
//! .map(|(a, b)| (a.to_string(), b.to_string()))
//! .collect::>()
//! ]
//! );
//! ```
//!
//! \[1\] G. Kondrak. Algorithms for Language Reconstruction. PhD dissertation,
//! University of Toronto.
use std::{collections::HashSet, f64};
use constants::EXTRACTED;
use unicode_segmentation::UnicodeSegmentation;
mod constants;
#[cfg(test)]
mod test;
/// Compute the alignment of two phonetic strings.
///
/// (Kondrak 2002: 51)
pub fn align(str1: &str, str2: &str, epsilon: f64) -> Vec> {
assert!(
(0.0..=1.0).contains(&epsilon),
"Epsilon must be between 0.0 and 1.0."
);
let str1_chars: Vec<&str> = str1.graphemes(true).collect();
let str2_chars: Vec<&str> = str2.graphemes(true).collect();
let m = str1_chars.len();
let n = str2_chars.len();
// This includes Kondrak's initialization of row 0 and column 0 to all 0s.
let mut s = vec![vec![0.0; n + 1]; m + 1];
for i in 1..=m {
for j in 1..=n {
let edit1 = s[i - 1][j] + sigma_skip();
let edit2 = s[i][j - 1] + sigma_skip();
let edit3 = s[i - 1][j - 1] + sigma_sub(str1_chars[i - 1], str2_chars[j - 1]);
let edit4 = if i > 1 {
s[i - 2][j - 1] + sigma_exp(str2_chars[j - 1], str1_chars[i - 2], str1_chars[i - 1])
} else {
-f64::INFINITY
};
let edit5 = if j > 1 {
s[i - 1][j - 2] + sigma_exp(str1_chars[i - 1], str2_chars[j - 2], str2_chars[j - 1])
} else {
-f64::INFINITY
};
s[i][j] = [edit1, edit2, edit3, edit4, edit5]
.iter()
.fold(0f64, |prev, curr| f64::max(prev, *curr));
}
}
let t = (1.0 - epsilon)
* s.iter()
.flat_map(|row| row.iter())
.cloned()
.fold(f64::NAN, f64::max);
let mut aligns = Vec::new();
for i in 1..=m {
for j in 1..=n {
if s[i][j] >= t {
let mut out = Vec::new();
retrieve(i, j, 0.0, &s, t, &str1_chars, &str2_chars, &mut out);
aligns.push(out);
}
}
}
aligns
}
/// Retrieve the path through the similarity matrix S starting at (i, j).
#[inline]
fn retrieve<'a>(
i: usize,
j: usize,
score: f64,
s: &Vec>,
t: f64,
str1: &[&str],
str2: &[&str],
out: &'a mut Vec<(String, String)>,
) -> &'a mut Vec<(String, String)> {
if s[i][j] == 0.0 {
return out;
}
if j > 1 && (s[i - 1][j - 2] + sigma_exp(str1[i - 1], str2[j - 2], str2[j - 1]) + score) >= t {
let key = str2[j - 2..j].join("");
out.insert(0, (str1[i - 1].to_string(), key));
retrieve(
i - 1,
j - 2,
score + sigma_exp(str1[i - 1], str2[j - 2], str2[j - 1]),
s,
t,
str1,
str2,
out,
);
} else if i > 1
&& (s[i - 2][j - 1] + sigma_exp(str2[j - 1], str1[i - 2], str1[i - 1]) + score) >= t
{
let key = str1[i - 2..i].join("");
out.insert(0, (key, str2[j - 1].to_string()));
retrieve(
i - 2,
j - 1,
score + sigma_exp(str2[j - 1], str1[i - 2], str1[i - 1]),
s,
t,
str1,
str2,
out,
);
} else if (s[i][j - 1] + sigma_skip() + score) >= t {
out.insert(0, ("-".to_string(), str2[j - 1].to_string()));
retrieve(i, j - 1, score + sigma_skip(), s, t, str1, str2, out);
} else if (s[i - 1][j] + sigma_skip() + score) >= t {
out.insert(0, (str1[i - 1].to_string(), "-".to_string()));
retrieve(i - 1, j, score + sigma_skip(), s, t, str1, str2, out);
} else if (s[i - 1][j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + score) >= t {
out.insert(0, (str1[i - 1].to_string(), str2[j - 1].to_string()));
retrieve(
i - 1,
j - 1,
score + sigma_sub(str1[i - 1], str2[j - 1]),
s,
t,
str1,
str2,
out,
);
}
return out;
}
/// Returns score of an indel of P.
///
/// (Kondrak 2002: 54)
#[inline]
pub fn sigma_skip() -> f64 {
EXTRACTED.cskip
}
/// Returns score of a substitution of P with Q.
///
/// (Kondrak 2002: 54)
#[inline]
pub fn sigma_sub(p: &str, q: &str) -> f64 {
EXTRACTED.csub - delta(p, q) - v(p) - v(q)
}
/// Returns score of an expansion/compression.
///
/// (Kondrak 2002: 54)
#[inline]
pub fn sigma_exp(p: &str, q1: &str, q2: &str) -> f64 {
EXTRACTED.cexp - delta(p, q1) - delta(p, q2) - v(p) - f64::max(v(q1), v(q2))
}
/// Return weighted sum of difference between P and Q.
///
/// (Kondrak 2002: 54)
#[inline]
pub fn delta(p: &str, q: &str) -> f64 {
let features = r(p, q);
features
.iter()
.map(|f| diff(p, q, f) * *EXTRACTED.salience.get(f).unwrap_or_else(|| unreachable!()))
.sum()
}
/// Returns difference between phonetic segments P and Q for feature F.
///
/// (Kondrak 2002: 52, 54)
#[inline]
pub fn diff(p: &str, q: &str, f: &str) -> f64 {
let p_features = &EXTRACTED.feature_matrix[&p.to_string()][f];
let q_features = &EXTRACTED.feature_matrix[&q.to_string()][f];
let p_similarity = *EXTRACTED
.similarity_matrix
.get(p_features)
.unwrap_or_else(|| unreachable!());
let q_similarity = *EXTRACTED
.similarity_matrix
.get(q_features)
.unwrap_or_else(|| unreachable!());
(p_similarity - q_similarity).abs()
}
/// Return relevant features for segment comparison.
///
/// (Kondrak 2002: 54)
#[inline]
pub fn r<'a>(p: &str, q: &str) -> &'static HashSet {
if EXTRACTED.consonants.contains(&p.to_string())
|| EXTRACTED.consonants.contains(&q.to_string())
{
&EXTRACTED.rc
} else {
&EXTRACTED.rv
}
}
/// Return vowel weight if P is vowel.
///
/// (Kondrak 2002: 54)
#[inline]
pub fn v(p: &str) -> f64 {
if !EXTRACTED.consonants.contains(&p.to_string()) {
EXTRACTED.cvwl
} else {
0.0
}
}