//! ALINE //! //! Copyright 2002 by Grzegorz Kondrak. //! //! ALINE is an algorithm for aligning phonetic sequences, described in \[1\]. //! This module is a port of Kondrak's (2002) ALINE. It provides functions for //! phonetic sequence alignment and similarity analysis. These are useful in //! historical linguistics, sociolinguistics and synchronic phonology. //! //! ALINE has parameters that can be tuned for desired output. These parameters are: //! - C_skip, C_sub, C_exp, C_vwl //! - Salience weights //! - Segmental features //! //! In this implementation, some parameters have been changed from their default //! values as described in \[1\], in order to replicate published results. All changes //! are noted in comments. //! //! # Get optimal alignment of two phonetic sequences //! //! ``` //! use aline::align; //! //! let alignment = align("θin", "tenwis", 0.0); //! //! assert_eq!( //! alignment, //! vec![ //! vec![ //! ("θ", "t"), //! ("i", "e"), //! ("n", "n") //! ].iter() //! .map(|(a, b)| (a.to_string(), b.to_string())) //! .collect::>() //! ] //! ); //! ``` //! //! \[1\] G. Kondrak. Algorithms for Language Reconstruction. PhD dissertation, //! University of Toronto. use std::{collections::HashSet, f64}; use constants::EXTRACTED; use unicode_segmentation::UnicodeSegmentation; mod constants; #[cfg(test)] mod test; /// Compute the alignment of two phonetic strings. /// /// (Kondrak 2002: 51) pub fn align(str1: &str, str2: &str, epsilon: f64) -> Vec> { assert!( (0.0..=1.0).contains(&epsilon), "Epsilon must be between 0.0 and 1.0." ); let str1_chars: Vec<&str> = str1.graphemes(true).collect(); let str2_chars: Vec<&str> = str2.graphemes(true).collect(); let m = str1_chars.len(); let n = str2_chars.len(); // This includes Kondrak's initialization of row 0 and column 0 to all 0s. let mut s = vec![vec![0.0; n + 1]; m + 1]; for i in 1..=m { for j in 1..=n { let edit1 = s[i - 1][j] + sigma_skip(); let edit2 = s[i][j - 1] + sigma_skip(); let edit3 = s[i - 1][j - 1] + sigma_sub(str1_chars[i - 1], str2_chars[j - 1]); let edit4 = if i > 1 { s[i - 2][j - 1] + sigma_exp(str2_chars[j - 1], str1_chars[i - 2], str1_chars[i - 1]) } else { -f64::INFINITY }; let edit5 = if j > 1 { s[i - 1][j - 2] + sigma_exp(str1_chars[i - 1], str2_chars[j - 2], str2_chars[j - 1]) } else { -f64::INFINITY }; s[i][j] = [edit1, edit2, edit3, edit4, edit5] .iter() .fold(0f64, |prev, curr| f64::max(prev, *curr)); } } let t = (1.0 - epsilon) * s.iter() .flat_map(|row| row.iter()) .cloned() .fold(f64::NAN, f64::max); let mut aligns = Vec::new(); for i in 1..=m { for j in 1..=n { if s[i][j] >= t { let mut out = Vec::new(); retrieve(i, j, 0.0, &s, t, &str1_chars, &str2_chars, &mut out); aligns.push(out); } } } aligns } /// Retrieve the path through the similarity matrix S starting at (i, j). #[inline] fn retrieve<'a>( i: usize, j: usize, score: f64, s: &Vec>, t: f64, str1: &[&str], str2: &[&str], out: &'a mut Vec<(String, String)>, ) -> &'a mut Vec<(String, String)> { if s[i][j] == 0.0 { return out; } if j > 1 && (s[i - 1][j - 2] + sigma_exp(str1[i - 1], str2[j - 2], str2[j - 1]) + score) >= t { let key = str2[j - 2..j].join(""); out.insert(0, (str1[i - 1].to_string(), key)); retrieve( i - 1, j - 2, score + sigma_exp(str1[i - 1], str2[j - 2], str2[j - 1]), s, t, str1, str2, out, ); } else if i > 1 && (s[i - 2][j - 1] + sigma_exp(str2[j - 1], str1[i - 2], str1[i - 1]) + score) >= t { let key = str1[i - 2..i].join(""); out.insert(0, (key, str2[j - 1].to_string())); retrieve( i - 2, j - 1, score + sigma_exp(str2[j - 1], str1[i - 2], str1[i - 1]), s, t, str1, str2, out, ); } else if (s[i][j - 1] + sigma_skip() + score) >= t { out.insert(0, ("-".to_string(), str2[j - 1].to_string())); retrieve(i, j - 1, score + sigma_skip(), s, t, str1, str2, out); } else if (s[i - 1][j] + sigma_skip() + score) >= t { out.insert(0, (str1[i - 1].to_string(), "-".to_string())); retrieve(i - 1, j, score + sigma_skip(), s, t, str1, str2, out); } else if (s[i - 1][j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + score) >= t { out.insert(0, (str1[i - 1].to_string(), str2[j - 1].to_string())); retrieve( i - 1, j - 1, score + sigma_sub(str1[i - 1], str2[j - 1]), s, t, str1, str2, out, ); } return out; } /// Returns score of an indel of P. /// /// (Kondrak 2002: 54) #[inline] pub fn sigma_skip() -> f64 { EXTRACTED.cskip } /// Returns score of a substitution of P with Q. /// /// (Kondrak 2002: 54) #[inline] pub fn sigma_sub(p: &str, q: &str) -> f64 { EXTRACTED.csub - delta(p, q) - v(p) - v(q) } /// Returns score of an expansion/compression. /// /// (Kondrak 2002: 54) #[inline] pub fn sigma_exp(p: &str, q1: &str, q2: &str) -> f64 { EXTRACTED.cexp - delta(p, q1) - delta(p, q2) - v(p) - f64::max(v(q1), v(q2)) } /// Return weighted sum of difference between P and Q. /// /// (Kondrak 2002: 54) #[inline] pub fn delta(p: &str, q: &str) -> f64 { let features = r(p, q); features .iter() .map(|f| diff(p, q, f) * *EXTRACTED.salience.get(f).unwrap_or_else(|| unreachable!())) .sum() } /// Returns difference between phonetic segments P and Q for feature F. /// /// (Kondrak 2002: 52, 54) #[inline] pub fn diff(p: &str, q: &str, f: &str) -> f64 { let p_features = &EXTRACTED.feature_matrix[&p.to_string()][f]; let q_features = &EXTRACTED.feature_matrix[&q.to_string()][f]; let p_similarity = *EXTRACTED .similarity_matrix .get(p_features) .unwrap_or_else(|| unreachable!()); let q_similarity = *EXTRACTED .similarity_matrix .get(q_features) .unwrap_or_else(|| unreachable!()); (p_similarity - q_similarity).abs() } /// Return relevant features for segment comparison. /// /// (Kondrak 2002: 54) #[inline] pub fn r<'a>(p: &str, q: &str) -> &'static HashSet { if EXTRACTED.consonants.contains(&p.to_string()) || EXTRACTED.consonants.contains(&q.to_string()) { &EXTRACTED.rc } else { &EXTRACTED.rv } } /// Return vowel weight if P is vowel. /// /// (Kondrak 2002: 54) #[inline] pub fn v(p: &str) -> f64 { if !EXTRACTED.consonants.contains(&p.to_string()) { EXTRACTED.cvwl } else { 0.0 } }