diff options
| author | Matthieu Pignolet <matthieu@puffer.fish> | 2025-05-18 23:11:31 +0400 | 
|---|---|---|
| committer | Matthieu Pignolet <matthieu@puffer.fish> | 2025-05-18 23:11:31 +0400 | 
| commit | 59c004feb284b2ed36bef5961da5d8ff216aaefb (patch) | |
| tree | a59fcd1d0938b1567710b0927e2d710b4fe0b53a /src | |
| parent | 2d92468b5ee98bd624d2dee20bcf19eb8b8c5e16 (diff) | |
major: tidy the project before publishing
Diffstat (limited to 'src')
| -rw-r--r-- | src/bin/aline-demo/main.rs | 101 | ||||
| -rw-r--r-- | src/constants.rs | 32 | ||||
| -rw-r--r-- | src/extract.json | 1009 | ||||
| -rwxr-xr-x | src/extract.py | 20 | ||||
| -rw-r--r-- | src/lib.rs | 271 | ||||
| -rw-r--r-- | src/test.rs | 62 | 
6 files changed, 1495 insertions, 0 deletions
diff --git a/src/bin/aline-demo/main.rs b/src/bin/aline-demo/main.rs new file mode 100644 index 0000000..7844066 --- /dev/null +++ b/src/bin/aline-demo/main.rs @@ -0,0 +1,101 @@ +use aline; + +fn main() { +    let mut data: Vec<(String, String)> = vec![]; +    for line in COGNATE_DATE.split("\n") { +        let mut pair = line.split(","); + +        let a = pair.next().unwrap().to_string(); +        let b = pair.next().unwrap().to_string(); + +        data.push((a, b)); +    } + +    for (a, b) in data.into_iter() { +        let alignment = &aline::align(&a, &b, 0.0)[0]; + +        print!("{} ~ {} :", a, b); +        for alignment in alignment { +            let alignment = &alignment; +            print!(" ({}, {})", alignment.0, alignment.1); +        } +        print!("\n"); +    } +} + + +const COGNATE_DATE: &str = r"jo,ʒə +tu,ty +nosotros,nu +kjen,ki +ke,kwa +todos,tu +una,ən +dos,dø +tres,trwa +ombre,om +arbol,arbrə +pluma,plym +kabeθa,kap +boka,buʃ +pje,pje +koraθon,kœr +ber,vwar +benir,vənir +deθir,dir +pobre,povrə +ðis,dIzes +ðæt,das +wat,vas +nat,nixt +loŋ,laŋ +mæn,man +fleʃ,flajʃ +bləd,blyt +feðər,fEdər +hær,hAr +ir,Or +aj,awgə +nowz,nAzə +mawθ,munt +təŋ,tsuŋə +fut,fys +nij,knI +hænd,hant +hart,herts +livər,lEbər +ænd,ante +æt,ad +blow,flAre +ir,awris +ijt,edere +fiʃ,piʃkis +flow,fluere +staɾ,stella +ful,plenus +græs,gramen +hart,kordis +horn,korny +aj,ego +nij,genU +məðər,mAter +mawntən,mons +nejm,nomen +njuw,nowus +wən,unus +rawnd,rotundus +sow,suere +sit,sedere +θrij,tres +tuwθ,dentis +θin,tenwis +kinwawa,kenuaʔ +nina,nenah +napewa,napɛw +wapimini,wapemen +namesa,namɛʔs +okimawa,okemaw +ʃiʃipa,seʔsep +ahkohkwa,ahkɛh +pematesiweni,pematesewen +asenja,aʔsɛn";
\ No newline at end of file diff --git a/src/constants.rs b/src/constants.rs new file mode 100644 index 0000000..83e11a1 --- /dev/null +++ b/src/constants.rs @@ -0,0 +1,32 @@ + + +use std::collections::{HashMap, HashSet}; + +use once_cell::sync::Lazy; +use serde::{Serialize, Deserialize}; + +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct Extracted { +    #[serde(rename = "C_skip")] +    pub cskip: f64, +    #[serde(rename = "C_sub")] +    pub csub: f64, +    #[serde(rename = "C_exp")] +    pub cexp: f64, +    #[serde(rename = "C_vwl")] +    pub cvwl: f64, +    pub consonants: HashSet<String>, +    #[serde(rename = "R_c")] +    pub rc: HashSet<String>, +    #[serde(rename = "R_v")] +    pub rv: HashSet<String>, +    pub similarity_matrix: HashMap<String, f64>, +    pub salience: HashMap<String, f64>, +    pub feature_matrix: HashMap<String, HashMap<String, String>>, +} + +const EXTRACTED_JSON: &str = include_str!("extract.json"); +pub static EXTRACTED: Lazy<Extracted> = Lazy::new(|| { +    serde_json::from_str(EXTRACTED_JSON).unwrap() +}); + diff --git a/src/extract.json b/src/extract.json new file mode 100644 index 0000000..db9d2cf --- /dev/null +++ b/src/extract.json @@ -0,0 +1,1009 @@ +{ +  "C_skip": -10, +  "C_sub": 35, +  "C_exp": 45, +  "C_vwl": 5, +  "consonants": [ +    "B", +    "N", +    "R", +    "b", +    "c", +    "d", +    "f", +    "g", +    "h", +    "j", +    "k", +    "l", +    "m", +    "n", +    "p", +    "q", +    "r", +    "s", +    "t", +    "v", +    "x", +    "z", +    "ç", +    "ð", +    "ħ", +    "ŋ", +    "ɖ", +    "ɟ", +    "ɢ", +    "ɣ", +    "ɦ", +    "ɬ", +    "ɮ", +    "ɰ", +    "ɱ", +    "ɲ", +    "ɳ", +    "ɴ", +    "ɸ", +    "ɹ", +    "ɻ", +    "ɽ", +    "ɾ", +    "ʀ", +    "ʁ", +    "ʂ", +    "ʃ", +    "ʈ", +    "ʋ", +    "ʐ ", +    "ʒ", +    "ʔ", +    "ʕ", +    "ʙ", +    "ʝ", +    "β", +    "θ", +    "χ", +    "ʐ", +    "w" +  ], +  "R_c": [ +    "aspirated", +    "lateral", +    "manner", +    "nasal", +    "place", +    "retroflex", +    "syllabic", +    "voice" +  ], +  "R_v": [ +    "back", +    "lateral", +    "long", +    "manner", +    "nasal", +    "place", +    "retroflex", +    "round", +    "syllabic", +    "voice" +  ], +  "similarity_matrix": { +    "bilabial": 1.0, +    "labiodental": 0.95, +    "dental": 0.9, +    "alveolar": 0.85, +    "retroflex": 0.8, +    "palato-alveolar": 0.75, +    "palatal": 0.7, +    "velar": 0.6, +    "uvular": 0.5, +    "pharyngeal": 0.3, +    "glottal": 0.1, +    "labiovelar": 1.0, +    "vowel": -1.0, +    "stop": 1.0, +    "affricate": 0.9, +    "fricative": 0.85, +    "trill": 0.7, +    "tap": 0.65, +    "approximant": 0.6, +    "high vowel": 0.4, +    "mid vowel": 0.2, +    "low vowel": 0.0, +    "vowel2": 0.5, +    "high": 1.0, +    "mid": 0.5, +    "low": 0.0, +    "front": 1.0, +    "central": 0.5, +    "back": 0.0, +    "plus": 1.0, +    "minus": 0.0 +  }, +  "salience": { +    "syllabic": 5, +    "place": 40, +    "manner": 50, +    "voice": 5, +    "nasal": 20, +    "retroflex": 10, +    "lateral": 10, +    "aspirated": 5, +    "long": 0, +    "high": 3, +    "back": 2, +    "round": 2 +  }, +  "feature_matrix": { +    "p": { +      "place": "bilabial", +      "manner": "stop", +      "syllabic": "minus", +      "voice": "minus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "b": { +      "place": "bilabial", +      "manner": "stop", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "t": { +      "place": "alveolar", +      "manner": "stop", +      "syllabic": "minus", +      "voice": "minus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "d": { +      "place": "alveolar", +      "manner": "stop", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ʈ": { +      "place": "retroflex", +      "manner": "stop", +      "syllabic": "minus", +      "voice": "minus", +      "nasal": "minus", +      "retroflex": "plus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ɖ": { +      "place": "retroflex", +      "manner": "stop", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "plus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "c": { +      "place": "palatal", +      "manner": "stop", +      "syllabic": "minus", +      "voice": "minus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ɟ": { +      "place": "palatal", +      "manner": "stop", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "k": { +      "place": "velar", +      "manner": "stop", +      "syllabic": "minus", +      "voice": "minus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "g": { +      "place": "velar", +      "manner": "stop", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "q": { +      "place": "uvular", +      "manner": "stop", +      "syllabic": "minus", +      "voice": "minus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ɢ": { +      "place": "uvular", +      "manner": "stop", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ʔ": { +      "place": "glottal", +      "manner": "stop", +      "syllabic": "minus", +      "voice": "minus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "m": { +      "place": "bilabial", +      "manner": "stop", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "plus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ɱ": { +      "place": "labiodental", +      "manner": "stop", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "plus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "n": { +      "place": "alveolar", +      "manner": "stop", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "plus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ɳ": { +      "place": "retroflex", +      "manner": "stop", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "plus", +      "retroflex": "plus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ɲ": { +      "place": "palatal", +      "manner": "stop", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "plus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ŋ": { +      "place": "velar", +      "manner": "stop", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "plus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ɴ": { +      "place": "uvular", +      "manner": "stop", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "plus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "N": { +      "place": "uvular", +      "manner": "stop", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "plus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ʙ": { +      "place": "bilabial", +      "manner": "trill", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "B": { +      "place": "bilabial", +      "manner": "trill", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "r": { +      "place": "alveolar", +      "manner": "trill", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "plus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ʀ": { +      "place": "uvular", +      "manner": "trill", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "R": { +      "place": "uvular", +      "manner": "trill", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ɾ": { +      "place": "alveolar", +      "manner": "tap", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ɽ": { +      "place": "retroflex", +      "manner": "tap", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "plus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ɸ": { +      "place": "bilabial", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "minus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "β": { +      "place": "bilabial", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "f": { +      "place": "labiodental", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "minus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "v": { +      "place": "labiodental", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "θ": { +      "place": "dental", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "minus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ð": { +      "place": "dental", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "s": { +      "place": "alveolar", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "minus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "z": { +      "place": "alveolar", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ʃ": { +      "place": "palato-alveolar", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "minus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ʒ": { +      "place": "palato-alveolar", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ʂ": { +      "place": "retroflex", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "minus", +      "nasal": "minus", +      "retroflex": "plus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ʐ": { +      "place": "retroflex", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "plus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ç": { +      "place": "palatal", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "minus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ʝ": { +      "place": "palatal", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "x": { +      "place": "velar", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "minus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ɣ": { +      "place": "velar", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "χ": { +      "place": "uvular", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "minus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ʁ": { +      "place": "uvular", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ħ": { +      "place": "pharyngeal", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "minus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ʕ": { +      "place": "pharyngeal", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "h": { +      "place": "glottal", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "minus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ɦ": { +      "place": "glottal", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ɬ": { +      "place": "alveolar", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "minus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "plus", +      "aspirated": "minus" +    }, +    "ɮ": { +      "place": "alveolar", +      "manner": "fricative", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "plus", +      "aspirated": "minus" +    }, +    "ʋ": { +      "place": "labiodental", +      "manner": "approximant", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ɹ": { +      "place": "alveolar", +      "manner": "approximant", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ɻ": { +      "place": "retroflex", +      "manner": "approximant", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "plus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "j": { +      "place": "palatal", +      "manner": "approximant", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "ɰ": { +      "place": "velar", +      "manner": "approximant", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "l": { +      "place": "alveolar", +      "manner": "approximant", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "plus", +      "aspirated": "minus" +    }, +    "w": { +      "place": "labiovelar", +      "manner": "approximant", +      "syllabic": "minus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "aspirated": "minus" +    }, +    "i": { +      "place": "vowel", +      "manner": "vowel2", +      "syllabic": "plus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "high": "high", +      "back": "front", +      "round": "minus", +      "long": "minus", +      "aspirated": "minus" +    }, +    "y": { +      "place": "vowel", +      "manner": "vowel2", +      "syllabic": "plus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "high": "high", +      "back": "front", +      "round": "plus", +      "long": "minus", +      "aspirated": "minus" +    }, +    "e": { +      "place": "vowel", +      "manner": "vowel2", +      "syllabic": "plus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "high": "mid", +      "back": "front", +      "round": "minus", +      "long": "minus", +      "aspirated": "minus" +    }, +    "E": { +      "place": "vowel", +      "manner": "vowel2", +      "syllabic": "plus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "high": "mid", +      "back": "front", +      "round": "minus", +      "long": "plus", +      "aspirated": "minus" +    }, +    "ø": { +      "place": "vowel", +      "manner": "vowel2", +      "syllabic": "plus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "high": "mid", +      "back": "front", +      "round": "plus", +      "long": "minus", +      "aspirated": "minus" +    }, +    "ɛ": { +      "place": "vowel", +      "manner": "vowel2", +      "syllabic": "plus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "high": "mid", +      "back": "front", +      "round": "minus", +      "long": "minus", +      "aspirated": "minus" +    }, +    "œ": { +      "place": "vowel", +      "manner": "vowel2", +      "syllabic": "plus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "high": "mid", +      "back": "front", +      "round": "plus", +      "long": "minus", +      "aspirated": "minus" +    }, +    "æ": { +      "place": "vowel", +      "manner": "vowel2", +      "syllabic": "plus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "high": "low", +      "back": "front", +      "round": "minus", +      "long": "minus", +      "aspirated": "minus" +    }, +    "a": { +      "place": "vowel", +      "manner": "vowel2", +      "syllabic": "plus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "high": "low", +      "back": "front", +      "round": "minus", +      "long": "minus", +      "aspirated": "minus" +    }, +    "A": { +      "place": "vowel", +      "manner": "vowel2", +      "syllabic": "plus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "high": "low", +      "back": "front", +      "round": "minus", +      "long": "plus", +      "aspirated": "minus" +    }, +    "ɨ": { +      "place": "vowel", +      "manner": "vowel2", +      "syllabic": "plus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "high": "high", +      "back": "central", +      "round": "minus", +      "long": "minus", +      "aspirated": "minus" +    }, +    "ʉ": { +      "place": "vowel", +      "manner": "vowel2", +      "syllabic": "plus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "high": "high", +      "back": "central", +      "round": "plus", +      "long": "minus", +      "aspirated": "minus" +    }, +    "ə": { +      "place": "vowel", +      "manner": "vowel2", +      "syllabic": "plus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "high": "mid", +      "back": "central", +      "round": "minus", +      "long": "minus", +      "aspirated": "minus" +    }, +    "u": { +      "place": "vowel", +      "manner": "vowel2", +      "syllabic": "plus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "high": "high", +      "back": "back", +      "round": "plus", +      "long": "minus", +      "aspirated": "minus" +    }, +    "U": { +      "place": "vowel", +      "manner": "vowel2", +      "syllabic": "plus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "high": "high", +      "back": "back", +      "round": "plus", +      "long": "plus", +      "aspirated": "minus" +    }, +    "o": { +      "place": "vowel", +      "manner": "vowel2", +      "syllabic": "plus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "high": "mid", +      "back": "back", +      "round": "plus", +      "long": "minus", +      "aspirated": "minus" +    }, +    "O": { +      "place": "vowel", +      "manner": "vowel2", +      "syllabic": "plus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "high": "mid", +      "back": "back", +      "round": "plus", +      "long": "plus", +      "aspirated": "minus" +    }, +    "ɔ": { +      "place": "vowel", +      "manner": "vowel2", +      "syllabic": "plus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "high": "mid", +      "back": "back", +      "round": "plus", +      "long": "minus", +      "aspirated": "minus" +    }, +    "ɒ": { +      "place": "vowel", +      "manner": "vowel2", +      "syllabic": "plus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "high": "low", +      "back": "back", +      "round": "minus", +      "long": "minus", +      "aspirated": "minus" +    }, +    "I": { +      "place": "vowel", +      "manner": "vowel2", +      "syllabic": "plus", +      "voice": "plus", +      "nasal": "minus", +      "retroflex": "minus", +      "lateral": "minus", +      "high": "high", +      "back": "front", +      "round": "minus", +      "long": "plus", +      "aspirated": "minus" +    } +  } +} diff --git a/src/extract.py b/src/extract.py new file mode 100755 index 0000000..b2faba5 --- /dev/null +++ b/src/extract.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 + +from nltk.metrics.aline import * +import json + +extract = { +    'C_skip': C_skip, +    'C_sub': C_sub, +    'C_exp': C_exp, +    'C_vwl': C_vwl, +     +    'consonants': consonants, +    'R_c': R_c, +    'R_v': R_v, +    'similarity_matrix': similarity_matrix, +    'salience': salience, +    'feature_matrix': feature_matrix, +} + +print(json.dumps(extract, indent=2, ensure_ascii=False)) diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..e79e040 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,271 @@ +// ALINE phonetic sequence alignment in Rust +// Port of NLTK's ALINE module (Greg Kondrak, 2002) + +/// ALINE +/// https://webdocs.cs.ualberta.ca/~kondrak/ +/// Copyright 2002 by Grzegorz Kondrak. +///  +/// ALINE is an algorithm for aligning phonetic sequences, described in [1]. +/// This module is a port of Kondrak's (2002) ALINE. It provides functions for +/// phonetic sequence alignment and similarity analysis. These are useful in +/// historical linguistics, sociolinguistics and synchronic phonology. +///  +/// ALINE has parameters that can be tuned for desired output. These parameters are: +/// - C_skip, C_sub, C_exp, C_vwl +/// - Salience weights +/// - Segmental features +///  +/// In this implementation, some parameters have been changed from their default +/// values as described in [1], in order to replicate published results. All changes +/// are noted in comments. +///  +/// # Get optimal alignment of two phonetic sequences +///  +/// ``` +/// use aline::align; +///  +/// let alignment = align("θin", "tenwis", 0.0); +///  +/// assert_eq!( +///     alignment, +///     vec![ +///         vec![ +///             ("θ", "t"), +///             ("i", "e"), +///             ("n", "n") +///         ].iter() +///         .map(|(a, b)| (a.to_string(), b.to_string())) +///         .collect::<Vec<(String, String)>>() +///     ] +/// ); +/// ``` +///  +/// [1] G. Kondrak. Algorithms for Language Reconstruction. PhD dissertation, +/// University of Toronto. + +use std::{collections::HashSet, f64}; + +use constants::EXTRACTED; +use unicode_segmentation::UnicodeSegmentation; +mod constants; + +#[cfg(test)] +mod test; + +/// Compute the alignment of two phonetic strings. +///  +/// (Kondrak 2002: 51) +pub fn align(str1: &str, str2: &str, epsilon: f64) -> Vec<Vec<(String, String)>> { +    assert!( +        (0.0..=1.0).contains(&epsilon), +        "Epsilon must be between 0.0 and 1.0." +    ); + +    let str1_chars: Vec<&str> = str1.graphemes(true).collect(); +    let str2_chars: Vec<&str> = str2.graphemes(true).collect(); +    let m = str1_chars.len(); +    let n = str2_chars.len(); + +    // This includes Kondrak's initialization of row 0 and column 0 to all 0s. +    let mut s = vec![vec![0.0; n + 1]; m + 1]; +    for i in 1..=m { +        for j in 1..=n { +            let edit1 = s[i - 1][j] + sigma_skip(); +            let edit2 = s[i][j - 1] + sigma_skip(); + +            let edit3 = s[i - 1][j - 1] + sigma_sub(str1_chars[i - 1], str2_chars[j - 1]); + +            let edit4 = if i > 1 { +                s[i - 2][j - 1] + sigma_exp(str2_chars[j - 1], str1_chars[i - 2], str1_chars[i - 1]) +            } else { +                -f64::INFINITY +            }; + +            let edit5 = if j > 1 { +                s[i - 1][j - 2] + sigma_exp(str1_chars[i - 1], str2_chars[j - 2], str2_chars[j - 1]) +            } else { +                -f64::INFINITY +            }; + +            s[i][j] = [edit1, edit2, edit3, edit4, edit5] +                .iter() +                .fold(0f64, |prev, curr| f64::max(prev, *curr)); +        } +    } + +    let t = (1.0 - epsilon) +        * s.iter() +            .flat_map(|row| row.iter()) +            .cloned() +            .fold(f64::NAN, f64::max); + +    let mut aligns = Vec::new(); +    for i in 1..=m { +        for j in 1..=n { +            if s[i][j] >= t { +                let mut out = Vec::new(); +                retrieve(i, j, 0.0, &s, t, &str1_chars, &str2_chars, &mut out); +                aligns.push(out); +            } +        } +    } +    aligns +} + +/// Retrieve the path through the similarity matrix S starting at (i, j). +#[inline] +fn retrieve<'a>( +    i: usize, +    j: usize, +    score: f64, +    s: &Vec<Vec<f64>>, +    t: f64, +    str1: &[&str], +    str2: &[&str], +    out: &'a mut Vec<(String, String)>, +) -> &'a mut Vec<(String, String)> { +    if s[i][j] == 0.0 { +        return out; +    } + +    if j > 1 && (s[i - 1][j - 2] + sigma_exp(str1[i - 1], str2[j - 2], str2[j - 1]) + score) >= t { +        // j > 1 and S[i - 1, j - 2] + sigma_exp(str1[i - 1], str2[j - 2 : j]) + s >= T + +        let key = str2[j - 2..j].join(""); +        out.insert(0, (str1[i - 1].to_string(), key)); + +        retrieve( +            i - 1, +            j - 2, +            score + sigma_exp(str1[i - 1], str2[j - 2], str2[j - 1]), +            s, +            t, +            str1, +            str2, +            out, +        ); +    } else if i > 1 +        && (s[i - 2][j - 1] + sigma_exp(str2[j - 1], str1[i - 2], str1[i - 1]) + score) >= t +    { +        // i > 1 and S[i - 2, j - 1] + sigma_exp(str2[j - 1], str1[i - 2 : i]) + s >= T +        let key = str1[i - 2..i].join(""); +        out.insert(0, (key, str2[j - 1].to_string())); + +        retrieve( +            i - 2, +            j - 1, +            score + sigma_exp(str2[j - 1], str1[i - 2], str1[i - 1]), +            s, +            t, +            str1, +            str2, +            out, +        ); +    } else if (s[i][j - 1] + sigma_skip() + score) >= t { +        // S[i, j - 1] + sigma_skip(str2[j - 1]) + s >= T + +        out.insert(0, ("-".to_string(), str2[j - 1].to_string())); +        retrieve(i, j - 1, score + sigma_skip(), s, t, str1, str2, out); +    } else if (s[i - 1][j] + sigma_skip() + score) >= t { +        // S[i - 1, j] + sigma_skip(str1[i - 1]) + s >= T + +        out.insert(0, (str1[i - 1].to_string(), "-".to_string())); +        retrieve(i - 1, j, score + sigma_skip(), s, t, str1, str2, out); +    } else if (s[i - 1][j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + score) >= t { +        // S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + s >= T +        out.insert(0, (str1[i - 1].to_string(), str2[j - 1].to_string())); + +        retrieve( +            i - 1, +            j - 1, +            score + sigma_sub(str1[i - 1], str2[j - 1]), +            s, +            t, +            str1, +            str2, +            out, +        ); +    } + +    return out; +} + +/// Returns score of an indel of P. +/// +/// (Kondrak 2002: 54) +#[inline] +fn sigma_skip() -> f64 { +    EXTRACTED.cskip +} + +/// Returns score of a substitution of P with Q. +/// +/// (Kondrak 2002: 54) +#[inline] +fn sigma_sub(p: &str, q: &str) -> f64 { +    EXTRACTED.csub - delta(p, q) - v(p) - v(q) +} + +/// Returns score of an expansion/compression. +/// +/// (Kondrak 2002: 54) +#[inline] +fn sigma_exp(p: &str, q1: &str, q2: &str) -> f64 { +    EXTRACTED.cexp - delta(p, q1) - delta(p, q2) - v(p) - f64::max(v(q1), v(q2)) +} + +/// Return weighted sum of difference between P and Q. +/// +/// (Kondrak 2002: 54) +#[inline] +fn delta(p: &str, q: &str) -> f64 { +    let features = r(p, q); +    features +        .iter() +        .map(|f| diff(p, q, f) * *EXTRACTED.salience.get(f).unwrap_or_else(|| unreachable!())) +        .sum() +} + +/// Returns difference between phonetic segments P and Q for feature F. +/// +/// (Kondrak 2002: 52, 54) +#[inline] +fn diff(p: &str, q: &str, f: &str) -> f64 { +    let p_features = &EXTRACTED.feature_matrix[&p.to_string()][f]; +    let q_features = &EXTRACTED.feature_matrix[&q.to_string()][f]; +    let p_similarity = *EXTRACTED +        .similarity_matrix +        .get(p_features) +        .unwrap_or_else(|| unreachable!()); +    let q_similarity = *EXTRACTED +        .similarity_matrix +        .get(q_features) +        .unwrap_or_else(|| unreachable!()); +    (p_similarity - q_similarity).abs() +} + +/// Return relevant features for segment comparison. +/// +/// (Kondrak 2002: 54) +#[inline] +fn r<'a>(p: &str, q: &str) -> &'static HashSet<String> { +    if EXTRACTED.consonants.contains(&p.to_string()) +        || EXTRACTED.consonants.contains(&q.to_string()) +    { +        &EXTRACTED.rc +    } else { +        &EXTRACTED.rv +    } +} + +/// Return vowel weight if P is vowel. +/// +/// (Kondrak 2002: 54) +#[inline] +fn v(p: &str) -> f64 { +    if !EXTRACTED.consonants.contains(&p.to_string()) { +        EXTRACTED.cvwl +    } else { +        0.0 +    } +} diff --git a/src/test.rs b/src/test.rs new file mode 100644 index 0000000..633153a --- /dev/null +++ b/src/test.rs @@ -0,0 +1,62 @@ +use crate::{align, delta}; + +#[test] +fn test_aline() { +    assert_eq!( +        align("θin", "tenwis", 0.0), +        vec![ +            vec![("θ", "t"), ("i", "e"), ("n", "n")] +                .iter() +                .map(|(a, b)| (a.to_string(), b.to_string())) +                .collect::<Vec<(String, String)>>() +        ] +    ); + +    assert_eq!( +        align("jo", "ʒə", 0.0), +        vec![ +            vec![("j", "ʒ"), ("o", "ə")] +                .iter() +                .map(|(a, b)| (a.to_string(), b.to_string())) +                .collect::<Vec<(String, String)>>() +        ] +    ); + +    assert_eq!( +        align("pematesiweni", "pematesewen", 0.0), +        vec![ +            vec![ +                ("p", "p"), +                ("e", "e"), +                ("m", "m"), +                ("a", "a"), +                ("t", "t"), +                ("e", "e"), +                ("s", "s"), +                ("i", "e"), +                ("w", "w"), +                ("e", "e"), +                ("n", "n"), +            ] +            .iter() +            .map(|(a, b)| (a.to_string(), b.to_string())) +            .collect::<Vec<(String, String)>>() +        ] +    ); + +    assert_eq!( +        align("tuwθ", "dentis", 0.0), +        vec![ +            vec![("t", "t"), ("u", "i"), ("w", "-"), ("θ", "s")] +                .iter() +                .map(|(a, b)| (a.to_string(), b.to_string())) +                .collect::<Vec<(String, String)>>() +        ] +    ) +} + +#[test] +fn test_aline_deltas() { +    assert_eq!(delta("p", "q"), 20.0); +    assert_eq!(delta("a", "A"), 0.0); +}  | 
