diff options
Diffstat (limited to 'src/lib.rs')
| -rw-r--r-- | src/lib.rs | 271 | 
1 files changed, 271 insertions, 0 deletions
diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..e79e040 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,271 @@ +// ALINE phonetic sequence alignment in Rust +// Port of NLTK's ALINE module (Greg Kondrak, 2002) + +/// ALINE +/// https://webdocs.cs.ualberta.ca/~kondrak/ +/// Copyright 2002 by Grzegorz Kondrak. +///  +/// ALINE is an algorithm for aligning phonetic sequences, described in [1]. +/// This module is a port of Kondrak's (2002) ALINE. It provides functions for +/// phonetic sequence alignment and similarity analysis. These are useful in +/// historical linguistics, sociolinguistics and synchronic phonology. +///  +/// ALINE has parameters that can be tuned for desired output. These parameters are: +/// - C_skip, C_sub, C_exp, C_vwl +/// - Salience weights +/// - Segmental features +///  +/// In this implementation, some parameters have been changed from their default +/// values as described in [1], in order to replicate published results. All changes +/// are noted in comments. +///  +/// # Get optimal alignment of two phonetic sequences +///  +/// ``` +/// use aline::align; +///  +/// let alignment = align("θin", "tenwis", 0.0); +///  +/// assert_eq!( +///     alignment, +///     vec![ +///         vec![ +///             ("θ", "t"), +///             ("i", "e"), +///             ("n", "n") +///         ].iter() +///         .map(|(a, b)| (a.to_string(), b.to_string())) +///         .collect::<Vec<(String, String)>>() +///     ] +/// ); +/// ``` +///  +/// [1] G. Kondrak. Algorithms for Language Reconstruction. PhD dissertation, +/// University of Toronto. + +use std::{collections::HashSet, f64}; + +use constants::EXTRACTED; +use unicode_segmentation::UnicodeSegmentation; +mod constants; + +#[cfg(test)] +mod test; + +/// Compute the alignment of two phonetic strings. +///  +/// (Kondrak 2002: 51) +pub fn align(str1: &str, str2: &str, epsilon: f64) -> Vec<Vec<(String, String)>> { +    assert!( +        (0.0..=1.0).contains(&epsilon), +        "Epsilon must be between 0.0 and 1.0." +    ); + +    let str1_chars: Vec<&str> = str1.graphemes(true).collect(); +    let str2_chars: Vec<&str> = str2.graphemes(true).collect(); +    let m = str1_chars.len(); +    let n = str2_chars.len(); + +    // This includes Kondrak's initialization of row 0 and column 0 to all 0s. +    let mut s = vec![vec![0.0; n + 1]; m + 1]; +    for i in 1..=m { +        for j in 1..=n { +            let edit1 = s[i - 1][j] + sigma_skip(); +            let edit2 = s[i][j - 1] + sigma_skip(); + +            let edit3 = s[i - 1][j - 1] + sigma_sub(str1_chars[i - 1], str2_chars[j - 1]); + +            let edit4 = if i > 1 { +                s[i - 2][j - 1] + sigma_exp(str2_chars[j - 1], str1_chars[i - 2], str1_chars[i - 1]) +            } else { +                -f64::INFINITY +            }; + +            let edit5 = if j > 1 { +                s[i - 1][j - 2] + sigma_exp(str1_chars[i - 1], str2_chars[j - 2], str2_chars[j - 1]) +            } else { +                -f64::INFINITY +            }; + +            s[i][j] = [edit1, edit2, edit3, edit4, edit5] +                .iter() +                .fold(0f64, |prev, curr| f64::max(prev, *curr)); +        } +    } + +    let t = (1.0 - epsilon) +        * s.iter() +            .flat_map(|row| row.iter()) +            .cloned() +            .fold(f64::NAN, f64::max); + +    let mut aligns = Vec::new(); +    for i in 1..=m { +        for j in 1..=n { +            if s[i][j] >= t { +                let mut out = Vec::new(); +                retrieve(i, j, 0.0, &s, t, &str1_chars, &str2_chars, &mut out); +                aligns.push(out); +            } +        } +    } +    aligns +} + +/// Retrieve the path through the similarity matrix S starting at (i, j). +#[inline] +fn retrieve<'a>( +    i: usize, +    j: usize, +    score: f64, +    s: &Vec<Vec<f64>>, +    t: f64, +    str1: &[&str], +    str2: &[&str], +    out: &'a mut Vec<(String, String)>, +) -> &'a mut Vec<(String, String)> { +    if s[i][j] == 0.0 { +        return out; +    } + +    if j > 1 && (s[i - 1][j - 2] + sigma_exp(str1[i - 1], str2[j - 2], str2[j - 1]) + score) >= t { +        // j > 1 and S[i - 1, j - 2] + sigma_exp(str1[i - 1], str2[j - 2 : j]) + s >= T + +        let key = str2[j - 2..j].join(""); +        out.insert(0, (str1[i - 1].to_string(), key)); + +        retrieve( +            i - 1, +            j - 2, +            score + sigma_exp(str1[i - 1], str2[j - 2], str2[j - 1]), +            s, +            t, +            str1, +            str2, +            out, +        ); +    } else if i > 1 +        && (s[i - 2][j - 1] + sigma_exp(str2[j - 1], str1[i - 2], str1[i - 1]) + score) >= t +    { +        // i > 1 and S[i - 2, j - 1] + sigma_exp(str2[j - 1], str1[i - 2 : i]) + s >= T +        let key = str1[i - 2..i].join(""); +        out.insert(0, (key, str2[j - 1].to_string())); + +        retrieve( +            i - 2, +            j - 1, +            score + sigma_exp(str2[j - 1], str1[i - 2], str1[i - 1]), +            s, +            t, +            str1, +            str2, +            out, +        ); +    } else if (s[i][j - 1] + sigma_skip() + score) >= t { +        // S[i, j - 1] + sigma_skip(str2[j - 1]) + s >= T + +        out.insert(0, ("-".to_string(), str2[j - 1].to_string())); +        retrieve(i, j - 1, score + sigma_skip(), s, t, str1, str2, out); +    } else if (s[i - 1][j] + sigma_skip() + score) >= t { +        // S[i - 1, j] + sigma_skip(str1[i - 1]) + s >= T + +        out.insert(0, (str1[i - 1].to_string(), "-".to_string())); +        retrieve(i - 1, j, score + sigma_skip(), s, t, str1, str2, out); +    } else if (s[i - 1][j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + score) >= t { +        // S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + s >= T +        out.insert(0, (str1[i - 1].to_string(), str2[j - 1].to_string())); + +        retrieve( +            i - 1, +            j - 1, +            score + sigma_sub(str1[i - 1], str2[j - 1]), +            s, +            t, +            str1, +            str2, +            out, +        ); +    } + +    return out; +} + +/// Returns score of an indel of P. +/// +/// (Kondrak 2002: 54) +#[inline] +fn sigma_skip() -> f64 { +    EXTRACTED.cskip +} + +/// Returns score of a substitution of P with Q. +/// +/// (Kondrak 2002: 54) +#[inline] +fn sigma_sub(p: &str, q: &str) -> f64 { +    EXTRACTED.csub - delta(p, q) - v(p) - v(q) +} + +/// Returns score of an expansion/compression. +/// +/// (Kondrak 2002: 54) +#[inline] +fn sigma_exp(p: &str, q1: &str, q2: &str) -> f64 { +    EXTRACTED.cexp - delta(p, q1) - delta(p, q2) - v(p) - f64::max(v(q1), v(q2)) +} + +/// Return weighted sum of difference between P and Q. +/// +/// (Kondrak 2002: 54) +#[inline] +fn delta(p: &str, q: &str) -> f64 { +    let features = r(p, q); +    features +        .iter() +        .map(|f| diff(p, q, f) * *EXTRACTED.salience.get(f).unwrap_or_else(|| unreachable!())) +        .sum() +} + +/// Returns difference between phonetic segments P and Q for feature F. +/// +/// (Kondrak 2002: 52, 54) +#[inline] +fn diff(p: &str, q: &str, f: &str) -> f64 { +    let p_features = &EXTRACTED.feature_matrix[&p.to_string()][f]; +    let q_features = &EXTRACTED.feature_matrix[&q.to_string()][f]; +    let p_similarity = *EXTRACTED +        .similarity_matrix +        .get(p_features) +        .unwrap_or_else(|| unreachable!()); +    let q_similarity = *EXTRACTED +        .similarity_matrix +        .get(q_features) +        .unwrap_or_else(|| unreachable!()); +    (p_similarity - q_similarity).abs() +} + +/// Return relevant features for segment comparison. +/// +/// (Kondrak 2002: 54) +#[inline] +fn r<'a>(p: &str, q: &str) -> &'static HashSet<String> { +    if EXTRACTED.consonants.contains(&p.to_string()) +        || EXTRACTED.consonants.contains(&q.to_string()) +    { +        &EXTRACTED.rc +    } else { +        &EXTRACTED.rv +    } +} + +/// Return vowel weight if P is vowel. +/// +/// (Kondrak 2002: 54) +#[inline] +fn v(p: &str) -> f64 { +    if !EXTRACTED.consonants.contains(&p.to_string()) { +        EXTRACTED.cvwl +    } else { +        0.0 +    } +}  | 
