//! ALINE
//! <https://webdocs.cs.ualberta.ca/~kondrak/>
//! Copyright 2002 by Grzegorz Kondrak.
//! 
//! ALINE is an algorithm for aligning phonetic sequences, described in \[1\].
//! This module is a port of Kondrak's (2002) ALINE. It provides functions for
//! phonetic sequence alignment and similarity analysis. These are useful in
//! historical linguistics, sociolinguistics and synchronic phonology.
//! 
//! ALINE has parameters that can be tuned for desired output. These parameters are:
//! - C_skip, C_sub, C_exp, C_vwl
//! - Salience weights
//! - Segmental features
//! 
//! In this implementation, some parameters have been changed from their default
//! values as described in \[1\], in order to replicate published results. All changes
//! are noted in comments.
//! 
//! # Get optimal alignment of two phonetic sequences
//! 
//! ```
//! use aline::align;
//! 
//! let alignment = align("θin", "tenwis", 0.0);
//! 
//! assert_eq!(
//!     alignment,
//!     vec![
//!         vec![
//!             ("θ", "t"),
//!             ("i", "e"),
//!             ("n", "n")
//!         ].iter()
//!         .map(|(a, b)| (a.to_string(), b.to_string()))
//!         .collect::<Vec<(String, String)>>()
//!     ]
//! );
//! ```
//! 
//! \[1\] G. Kondrak. Algorithms for Language Reconstruction. PhD dissertation,
//! University of Toronto.

use std::{collections::HashSet, f64};

use constants::EXTRACTED;
use unicode_segmentation::UnicodeSegmentation;
mod constants;

#[cfg(test)]
mod test;

/// Compute the alignment of two phonetic strings.
/// 
/// (Kondrak 2002: 51)
pub fn align(str1: &str, str2: &str, epsilon: f64) -> Vec<Vec<(String, String)>> {
    assert!(
        (0.0..=1.0).contains(&epsilon),
        "Epsilon must be between 0.0 and 1.0."
    );

    let str1_chars: Vec<&str> = str1.graphemes(true).collect();
    let str2_chars: Vec<&str> = str2.graphemes(true).collect();
    let m = str1_chars.len();
    let n = str2_chars.len();

    // This includes Kondrak's initialization of row 0 and column 0 to all 0s.
    let mut s = vec![vec![0.0; n + 1]; m + 1];
    for i in 1..=m {
        for j in 1..=n {
            let edit1 = s[i - 1][j] + sigma_skip();
            let edit2 = s[i][j - 1] + sigma_skip();

            let edit3 = s[i - 1][j - 1] + sigma_sub(str1_chars[i - 1], str2_chars[j - 1]);

            let edit4 = if i > 1 {
                s[i - 2][j - 1] + sigma_exp(str2_chars[j - 1], str1_chars[i - 2], str1_chars[i - 1])
            } else {
                -f64::INFINITY
            };

            let edit5 = if j > 1 {
                s[i - 1][j - 2] + sigma_exp(str1_chars[i - 1], str2_chars[j - 2], str2_chars[j - 1])
            } else {
                -f64::INFINITY
            };

            s[i][j] = [edit1, edit2, edit3, edit4, edit5]
                .iter()
                .fold(0f64, |prev, curr| f64::max(prev, *curr));
        }
    }

    let t = (1.0 - epsilon)
        * s.iter()
            .flat_map(|row| row.iter())
            .cloned()
            .fold(f64::NAN, f64::max);

    let mut aligns = Vec::new();
    for i in 1..=m {
        for j in 1..=n {
            if s[i][j] >= t {
                let mut out = Vec::new();
                retrieve(i, j, 0.0, &s, t, &str1_chars, &str2_chars, &mut out);
                aligns.push(out);
            }
        }
    }
    aligns
}

/// Retrieve the path through the similarity matrix S starting at (i, j).
#[inline]
fn retrieve<'a>(
    i: usize,
    j: usize,
    score: f64,
    s: &Vec<Vec<f64>>,
    t: f64,
    str1: &[&str],
    str2: &[&str],
    out: &'a mut Vec<(String, String)>,
) -> &'a mut Vec<(String, String)> {
    if s[i][j] == 0.0 {
        return out;
    }

    if j > 1 && (s[i - 1][j - 2] + sigma_exp(str1[i - 1], str2[j - 2], str2[j - 1]) + score) >= t {
        let key = str2[j - 2..j].join("");
        out.insert(0, (str1[i - 1].to_string(), key));

        retrieve(
            i - 1,
            j - 2,
            score + sigma_exp(str1[i - 1], str2[j - 2], str2[j - 1]),
            s,
            t,
            str1,
            str2,
            out,
        );
    } else if i > 1
        && (s[i - 2][j - 1] + sigma_exp(str2[j - 1], str1[i - 2], str1[i - 1]) + score) >= t
    {
        let key = str1[i - 2..i].join("");
        out.insert(0, (key, str2[j - 1].to_string()));

        retrieve(
            i - 2,
            j - 1,
            score + sigma_exp(str2[j - 1], str1[i - 2], str1[i - 1]),
            s,
            t,
            str1,
            str2,
            out,
        );
    } else if (s[i][j - 1] + sigma_skip() + score) >= t {
        out.insert(0, ("-".to_string(), str2[j - 1].to_string()));
        retrieve(i, j - 1, score + sigma_skip(), s, t, str1, str2, out);
    } else if (s[i - 1][j] + sigma_skip() + score) >= t {
        out.insert(0, (str1[i - 1].to_string(), "-".to_string()));
        retrieve(i - 1, j, score + sigma_skip(), s, t, str1, str2, out);
    } else if (s[i - 1][j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + score) >= t {
        out.insert(0, (str1[i - 1].to_string(), str2[j - 1].to_string()));

        retrieve(
            i - 1,
            j - 1,
            score + sigma_sub(str1[i - 1], str2[j - 1]),
            s,
            t,
            str1,
            str2,
            out,
        );
    }

    return out;
}

/// Returns score of an indel of P.
///
/// (Kondrak 2002: 54)
#[inline]
pub fn sigma_skip() -> f64 {
    EXTRACTED.cskip
}

/// Returns score of a substitution of P with Q.
///
/// (Kondrak 2002: 54)
#[inline]
pub fn sigma_sub(p: &str, q: &str) -> f64 {
    EXTRACTED.csub - delta(p, q) - v(p) - v(q)
}

/// Returns score of an expansion/compression.
///
/// (Kondrak 2002: 54)
#[inline]
pub fn sigma_exp(p: &str, q1: &str, q2: &str) -> f64 {
    EXTRACTED.cexp - delta(p, q1) - delta(p, q2) - v(p) - f64::max(v(q1), v(q2))
}

/// Return weighted sum of difference between P and Q.
///
/// (Kondrak 2002: 54)
#[inline]
pub fn delta(p: &str, q: &str) -> f64 {
    let features = r(p, q);
    features
        .iter()
        .map(|f| diff(p, q, f) * *EXTRACTED.salience.get(f).unwrap_or_else(|| unreachable!()))
        .sum()
}

/// Returns difference between phonetic segments P and Q for feature F.
///
/// (Kondrak 2002: 52, 54)
#[inline]
pub fn diff(p: &str, q: &str, f: &str) -> f64 {
    let p_features = &EXTRACTED.feature_matrix[&p.to_string()][f];
    let q_features = &EXTRACTED.feature_matrix[&q.to_string()][f];
    let p_similarity = *EXTRACTED
        .similarity_matrix
        .get(p_features)
        .unwrap_or_else(|| unreachable!());
    let q_similarity = *EXTRACTED
        .similarity_matrix
        .get(q_features)
        .unwrap_or_else(|| unreachable!());
    (p_similarity - q_similarity).abs()
}

/// Return relevant features for segment comparison.
///
/// (Kondrak 2002: 54)
#[inline]
pub fn r<'a>(p: &str, q: &str) -> &'static HashSet<String> {
    if EXTRACTED.consonants.contains(&p.to_string())
        || EXTRACTED.consonants.contains(&q.to_string())
    {
        &EXTRACTED.rc
    } else {
        &EXTRACTED.rv
    }
}

/// Return vowel weight if P is vowel.
///
/// (Kondrak 2002: 54)
#[inline]
pub fn v(p: &str) -> f64 {
    if !EXTRACTED.consonants.contains(&p.to_string()) {
        EXTRACTED.cvwl
    } else {
        0.0
    }
}