major: tidy the project before publishing

author: Matthieu Pignolet <matthieu@puffer.fish> 2025-05-18 23:11:31 +0400
committer: Matthieu Pignolet <matthieu@puffer.fish> 2025-05-18 23:11:31 +0400
commit: 59c004feb284b2ed36bef5961da5d8ff216aaefb (patch)
tree: a59fcd1d0938b1567710b0927e2d710b4fe0b53a /src
parent: 2d92468b5ee98bd624d2dee20bcf19eb8b8c5e16 (diff)
6 files changed, 1495 insertions, 0 deletions
diff --git a/src/bin/aline-demo/main.rs b/src/bin/aline-demo/main.rs
new file mode 100644
index 0000000..7844066
--- /dev/null
+++ b/src/bin/aline-demo/main.rs
@@ -0,0 +1,101 @@
+use aline;
+
+fn main() {
+    // Split the embedded table into owned `(left, right)` word pairs up front.
+    let pairs: Vec<(String, String)> = COGNATE_DATE
+        .split('\n')
+        .map(|row| {
+            let mut fields = row.split(',');
+            let left = fields.next().unwrap().to_string();
+            let right = fields.next().unwrap().to_string();
+            (left, right)
+        })
+        .collect();
+
+    // Print the first alignment found for every pair, one pair per line.
+    for (left, right) in pairs {
+        let first = &aline::align(&left, &right, 0.0)[0];
+        print!("{} ~ {} :", left, right);
+        for (seg_a, seg_b) in first {
+            print!(" ({}, {})", seg_a, seg_b);
+        }
+        println!();
+    }
+}
+
+/// Demo input for `main`: one `left,right` word pair per line (presumably cognate pairs).
+const COGNATE_DATE: &str = r"jo,ʒə
+tu,ty
+nosotros,nu
+kjen,ki
+ke,kwa
+todos,tu
+una,ən
+dos,dø
+tres,trwa
+ombre,om
+arbol,arbrə
+pluma,plym
+kabeθa,kap
+boka,buʃ
+pje,pje
+koraθon,kœr
+ber,vwar
+benir,vənir
+deθir,dir
+pobre,povrə
+ðis,dIzes
+ðæt,das
+wat,vas
+nat,nixt
+loŋ,laŋ
+mæn,man
+fleʃ,flajʃ
+bləd,blyt
+feðər,fEdər
+hær,hAr
+ir,Or
+aj,awgə
+nowz,nAzə
+mawθ,munt
+təŋ,tsuŋə
+fut,fys
+nij,knI
+hænd,hant
+hart,herts
+livər,lEbər
+ænd,ante
+æt,ad
+blow,flAre
+ir,awris
+ijt,edere
+fiʃ,piʃkis
+flow,fluere
+staɾ,stella
+ful,plenus
+græs,gramen
+hart,kordis
+horn,korny
+aj,ego
+nij,genU
+məðər,mAter
+mawntən,mons
+nejm,nomen
+njuw,nowus
+wən,unus
+rawnd,rotundus
+sow,suere
+sit,sedere
+θrij,tres
+tuwθ,dentis
+θin,tenwis
+kinwawa,kenuaʔ
+nina,nenah
+napewa,napɛw
+wapimini,wapemen
+namesa,namɛʔs
+okimawa,okemaw
+ʃiʃipa,seʔsep
+ahkohkwa,ahkɛh
+pematesiweni,pematesewen
+asenja,aʔsɛn";
+\ No newline at end of file
diff --git a/src/constants.rs b/src/constants.rs
new file mode 100644
index 0000000..83e11a1
--- /dev/null
+++ b/src/constants.rs
@@ -0,0 +1,32 @@
+
+
+use std::collections::{HashMap, HashSet};
+
+use once_cell::sync::Lazy;
+use serde::{Serialize, Deserialize};
+/// Parameter tables deserialized from `extract.json`; the serde renames map JSON keys to fields.
+#[derive(Debug, Serialize, Deserialize)]
+pub(crate) struct Extracted {
+    #[serde(rename = "C_skip")]
+    pub cskip: f64, // indel score, read by `sigma_skip`
+    #[serde(rename = "C_sub")]
+    pub csub: f64, // base substitution score, read by `sigma_sub`
+    #[serde(rename = "C_exp")]
+    pub cexp: f64, // base expansion/compression score, read by `sigma_exp`
+    #[serde(rename = "C_vwl")]
+    pub cvwl: f64, // vowel weight, read by `v`
+    pub consonants: HashSet<String>, // segments treated as consonants
+    #[serde(rename = "R_c")]
+    pub rc: HashSet<String>, // features compared when either segment is a consonant
+    #[serde(rename = "R_v")]
+    pub rv: HashSet<String>, // features compared when neither segment is a consonant
+    pub similarity_matrix: HashMap<String, f64>, // feature value name -> numeric value
+    pub salience: HashMap<String, f64>, // feature name -> weight
+    pub feature_matrix: HashMap<String, HashMap<String, String>>, // segment -> feature -> value name
+}
+// `extract.json` is embedded at compile time and parsed once, on first access to `EXTRACTED`.
+const EXTRACTED_JSON: &str = include_str!("extract.json");
+pub static EXTRACTED: Lazy<Extracted> = Lazy::new(|| {
+    serde_json::from_str(EXTRACTED_JSON).expect("embedded extract.json must match `Extracted`")
+});
+
diff --git a/src/extract.json b/src/extract.json
new file mode 100644
index 0000000..db9d2cf
--- /dev/null
+++ b/src/extract.json
@@ -0,0 +1,1009 @@
+{
+  "C_skip": -10,
+  "C_sub": 35,
+  "C_exp": 45,
+  "C_vwl": 5,
+  "consonants": [
+    "B",
+    "N",
+    "R",
+    "b",
+    "c",
+    "d",
+    "f",
+    "g",
+    "h",
+    "j",
+    "k",
+    "l",
+    "m",
+    "n",
+    "p",
+    "q",
+    "r",
+    "s",
+    "t",
+    "v",
+    "x",
+    "z",
+    "ç",
+    "ð",
+    "ħ",
+    "ŋ",
+    "ɖ",
+    "ɟ",
+    "ɢ",
+    "ɣ",
+    "ɦ",
+    "ɬ",
+    "ɮ",
+    "ɰ",
+    "ɱ",
+    "ɲ",
+    "ɳ",
+    "ɴ",
+    "ɸ",
+    "ɹ",
+    "ɻ",
+    "ɽ",
+    "ɾ",
+    "ʀ",
+    "ʁ",
+    "ʂ",
+    "ʃ",
+    "ʈ",
+    "ʋ",
+    "ʐ ",
+    "ʒ",
+    "ʔ",
+    "ʕ",
+    "ʙ",
+    "ʝ",
+    "β",
+    "θ",
+    "χ",
+    "ʐ",
+    "w"
+  ],
+  "R_c": [
+    "aspirated",
+    "lateral",
+    "manner",
+    "nasal",
+    "place",
+    "retroflex",
+    "syllabic",
+    "voice"
+  ],
+  "R_v": [
+    "back",
+    "lateral",
+    "long",
+    "manner",
+    "nasal",
+    "place",
+    "retroflex",
+    "round",
+    "syllabic",
+    "voice"
+  ],
+  "similarity_matrix": {
+    "bilabial": 1.0,
+    "labiodental": 0.95,
+    "dental": 0.9,
+    "alveolar": 0.85,
+    "retroflex": 0.8,
+    "palato-alveolar": 0.75,
+    "palatal": 0.7,
+    "velar": 0.6,
+    "uvular": 0.5,
+    "pharyngeal": 0.3,
+    "glottal": 0.1,
+    "labiovelar": 1.0,
+    "vowel": -1.0,
+    "stop": 1.0,
+    "affricate": 0.9,
+    "fricative": 0.85,
+    "trill": 0.7,
+    "tap": 0.65,
+    "approximant": 0.6,
+    "high vowel": 0.4,
+    "mid vowel": 0.2,
+    "low vowel": 0.0,
+    "vowel2": 0.5,
+    "high": 1.0,
+    "mid": 0.5,
+    "low": 0.0,
+    "front": 1.0,
+    "central": 0.5,
+    "back": 0.0,
+    "plus": 1.0,
+    "minus": 0.0
+  },
+  "salience": {
+    "syllabic": 5,
+    "place": 40,
+    "manner": 50,
+    "voice": 5,
+    "nasal": 20,
+    "retroflex": 10,
+    "lateral": 10,
+    "aspirated": 5,
+    "long": 0,
+    "high": 3,
+    "back": 2,
+    "round": 2
+  },
+  "feature_matrix": {
+    "p": {
+      "place": "bilabial",
+      "manner": "stop",
+      "syllabic": "minus",
+      "voice": "minus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "b": {
+      "place": "bilabial",
+      "manner": "stop",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "t": {
+      "place": "alveolar",
+      "manner": "stop",
+      "syllabic": "minus",
+      "voice": "minus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "d": {
+      "place": "alveolar",
+      "manner": "stop",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ʈ": {
+      "place": "retroflex",
+      "manner": "stop",
+      "syllabic": "minus",
+      "voice": "minus",
+      "nasal": "minus",
+      "retroflex": "plus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ɖ": {
+      "place": "retroflex",
+      "manner": "stop",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "plus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "c": {
+      "place": "palatal",
+      "manner": "stop",
+      "syllabic": "minus",
+      "voice": "minus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ɟ": {
+      "place": "palatal",
+      "manner": "stop",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "k": {
+      "place": "velar",
+      "manner": "stop",
+      "syllabic": "minus",
+      "voice": "minus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "g": {
+      "place": "velar",
+      "manner": "stop",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "q": {
+      "place": "uvular",
+      "manner": "stop",
+      "syllabic": "minus",
+      "voice": "minus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ɢ": {
+      "place": "uvular",
+      "manner": "stop",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ʔ": {
+      "place": "glottal",
+      "manner": "stop",
+      "syllabic": "minus",
+      "voice": "minus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "m": {
+      "place": "bilabial",
+      "manner": "stop",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "plus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ɱ": {
+      "place": "labiodental",
+      "manner": "stop",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "plus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "n": {
+      "place": "alveolar",
+      "manner": "stop",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "plus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ɳ": {
+      "place": "retroflex",
+      "manner": "stop",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "plus",
+      "retroflex": "plus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ɲ": {
+      "place": "palatal",
+      "manner": "stop",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "plus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ŋ": {
+      "place": "velar",
+      "manner": "stop",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "plus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ɴ": {
+      "place": "uvular",
+      "manner": "stop",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "plus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "N": {
+      "place": "uvular",
+      "manner": "stop",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "plus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ʙ": {
+      "place": "bilabial",
+      "manner": "trill",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "B": {
+      "place": "bilabial",
+      "manner": "trill",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "r": {
+      "place": "alveolar",
+      "manner": "trill",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "plus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ʀ": {
+      "place": "uvular",
+      "manner": "trill",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "R": {
+      "place": "uvular",
+      "manner": "trill",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ɾ": {
+      "place": "alveolar",
+      "manner": "tap",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ɽ": {
+      "place": "retroflex",
+      "manner": "tap",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "plus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ɸ": {
+      "place": "bilabial",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "minus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "β": {
+      "place": "bilabial",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "f": {
+      "place": "labiodental",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "minus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "v": {
+      "place": "labiodental",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "θ": {
+      "place": "dental",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "minus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ð": {
+      "place": "dental",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "s": {
+      "place": "alveolar",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "minus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "z": {
+      "place": "alveolar",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ʃ": {
+      "place": "palato-alveolar",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "minus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ʒ": {
+      "place": "palato-alveolar",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ʂ": {
+      "place": "retroflex",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "minus",
+      "nasal": "minus",
+      "retroflex": "plus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ʐ": {
+      "place": "retroflex",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "plus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ç": {
+      "place": "palatal",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "minus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ʝ": {
+      "place": "palatal",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "x": {
+      "place": "velar",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "minus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ɣ": {
+      "place": "velar",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "χ": {
+      "place": "uvular",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "minus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ʁ": {
+      "place": "uvular",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ħ": {
+      "place": "pharyngeal",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "minus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ʕ": {
+      "place": "pharyngeal",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "h": {
+      "place": "glottal",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "minus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ɦ": {
+      "place": "glottal",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ɬ": {
+      "place": "alveolar",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "minus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "plus",
+      "aspirated": "minus"
+    },
+    "ɮ": {
+      "place": "alveolar",
+      "manner": "fricative",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "plus",
+      "aspirated": "minus"
+    },
+    "ʋ": {
+      "place": "labiodental",
+      "manner": "approximant",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ɹ": {
+      "place": "alveolar",
+      "manner": "approximant",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ɻ": {
+      "place": "retroflex",
+      "manner": "approximant",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "plus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "j": {
+      "place": "palatal",
+      "manner": "approximant",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "ɰ": {
+      "place": "velar",
+      "manner": "approximant",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "l": {
+      "place": "alveolar",
+      "manner": "approximant",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "plus",
+      "aspirated": "minus"
+    },
+    "w": {
+      "place": "labiovelar",
+      "manner": "approximant",
+      "syllabic": "minus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "aspirated": "minus"
+    },
+    "i": {
+      "place": "vowel",
+      "manner": "vowel2",
+      "syllabic": "plus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "high": "high",
+      "back": "front",
+      "round": "minus",
+      "long": "minus",
+      "aspirated": "minus"
+    },
+    "y": {
+      "place": "vowel",
+      "manner": "vowel2",
+      "syllabic": "plus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "high": "high",
+      "back": "front",
+      "round": "plus",
+      "long": "minus",
+      "aspirated": "minus"
+    },
+    "e": {
+      "place": "vowel",
+      "manner": "vowel2",
+      "syllabic": "plus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "high": "mid",
+      "back": "front",
+      "round": "minus",
+      "long": "minus",
+      "aspirated": "minus"
+    },
+    "E": {
+      "place": "vowel",
+      "manner": "vowel2",
+      "syllabic": "plus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "high": "mid",
+      "back": "front",
+      "round": "minus",
+      "long": "plus",
+      "aspirated": "minus"
+    },
+    "ø": {
+      "place": "vowel",
+      "manner": "vowel2",
+      "syllabic": "plus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "high": "mid",
+      "back": "front",
+      "round": "plus",
+      "long": "minus",
+      "aspirated": "minus"
+    },
+    "ɛ": {
+      "place": "vowel",
+      "manner": "vowel2",
+      "syllabic": "plus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "high": "mid",
+      "back": "front",
+      "round": "minus",
+      "long": "minus",
+      "aspirated": "minus"
+    },
+    "œ": {
+      "place": "vowel",
+      "manner": "vowel2",
+      "syllabic": "plus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "high": "mid",
+      "back": "front",
+      "round": "plus",
+      "long": "minus",
+      "aspirated": "minus"
+    },
+    "æ": {
+      "place": "vowel",
+      "manner": "vowel2",
+      "syllabic": "plus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "high": "low",
+      "back": "front",
+      "round": "minus",
+      "long": "minus",
+      "aspirated": "minus"
+    },
+    "a": {
+      "place": "vowel",
+      "manner": "vowel2",
+      "syllabic": "plus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "high": "low",
+      "back": "front",
+      "round": "minus",
+      "long": "minus",
+      "aspirated": "minus"
+    },
+    "A": {
+      "place": "vowel",
+      "manner": "vowel2",
+      "syllabic": "plus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "high": "low",
+      "back": "front",
+      "round": "minus",
+      "long": "plus",
+      "aspirated": "minus"
+    },
+    "ɨ": {
+      "place": "vowel",
+      "manner": "vowel2",
+      "syllabic": "plus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "high": "high",
+      "back": "central",
+      "round": "minus",
+      "long": "minus",
+      "aspirated": "minus"
+    },
+    "ʉ": {
+      "place": "vowel",
+      "manner": "vowel2",
+      "syllabic": "plus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "high": "high",
+      "back": "central",
+      "round": "plus",
+      "long": "minus",
+      "aspirated": "minus"
+    },
+    "ə": {
+      "place": "vowel",
+      "manner": "vowel2",
+      "syllabic": "plus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "high": "mid",
+      "back": "central",
+      "round": "minus",
+      "long": "minus",
+      "aspirated": "minus"
+    },
+    "u": {
+      "place": "vowel",
+      "manner": "vowel2",
+      "syllabic": "plus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "high": "high",
+      "back": "back",
+      "round": "plus",
+      "long": "minus",
+      "aspirated": "minus"
+    },
+    "U": {
+      "place": "vowel",
+      "manner": "vowel2",
+      "syllabic": "plus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "high": "high",
+      "back": "back",
+      "round": "plus",
+      "long": "plus",
+      "aspirated": "minus"
+    },
+    "o": {
+      "place": "vowel",
+      "manner": "vowel2",
+      "syllabic": "plus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "high": "mid",
+      "back": "back",
+      "round": "plus",
+      "long": "minus",
+      "aspirated": "minus"
+    },
+    "O": {
+      "place": "vowel",
+      "manner": "vowel2",
+      "syllabic": "plus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "high": "mid",
+      "back": "back",
+      "round": "plus",
+      "long": "plus",
+      "aspirated": "minus"
+    },
+    "ɔ": {
+      "place": "vowel",
+      "manner": "vowel2",
+      "syllabic": "plus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "high": "mid",
+      "back": "back",
+      "round": "plus",
+      "long": "minus",
+      "aspirated": "minus"
+    },
+    "ɒ": {
+      "place": "vowel",
+      "manner": "vowel2",
+      "syllabic": "plus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "high": "low",
+      "back": "back",
+      "round": "minus",
+      "long": "minus",
+      "aspirated": "minus"
+    },
+    "I": {
+      "place": "vowel",
+      "manner": "vowel2",
+      "syllabic": "plus",
+      "voice": "plus",
+      "nasal": "minus",
+      "retroflex": "minus",
+      "lateral": "minus",
+      "high": "high",
+      "back": "front",
+      "round": "minus",
+      "long": "plus",
+      "aspirated": "minus"
+    }
+  }
+}
diff --git a/src/extract.py b/src/extract.py
new file mode 100755
index 0000000..b2faba5
--- /dev/null
+++ b/src/extract.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+
+from nltk.metrics.aline import *
+import json
+
+extract = {
+    'C_skip': C_skip,
+    'C_sub': C_sub,
+    'C_exp': C_exp,
+    'C_vwl': C_vwl,
+    
+    'consonants': consonants,
+    'R_c': R_c,
+    'R_v': R_v,
+    'similarity_matrix': similarity_matrix,
+    'salience': salience,
+    'feature_matrix': feature_matrix,
+}
+
+print(json.dumps(extract, indent=2, ensure_ascii=False))
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..e79e040
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,271 @@
+// ALINE phonetic sequence alignment in Rust
+// Port of NLTK's ALINE module (Greg Kondrak, 2002)
+
+//! ALINE
+//! <https://webdocs.cs.ualberta.ca/~kondrak/>
+//! Copyright 2002 by Grzegorz Kondrak.
+//!
+//! ALINE is an algorithm for aligning phonetic sequences, described in [1].
+//! This module is a port of Kondrak's (2002) ALINE. It provides functions for
+//! phonetic sequence alignment and similarity analysis. These are useful in
+//! historical linguistics, sociolinguistics and synchronic phonology.
+//!
+//! ALINE has parameters that can be tuned for desired output. These parameters are:
+//! - C_skip, C_sub, C_exp, C_vwl
+//! - Salience weights
+//! - Segmental features
+//!
+//! In this implementation, some parameters have been changed from their default
+//! values as described in [1], in order to replicate published results. All changes
+//! are noted in comments.
+//!
+//! # Get optimal alignment of two phonetic sequences
+//!
+//! ```
+//! use aline::align;
+//!
+//! let alignment = align("θin", "tenwis", 0.0);
+//!
+//! assert_eq!(
+//!     alignment,
+//!     vec![
+//!         vec![
+//!             ("θ", "t"),
+//!             ("i", "e"),
+//!             ("n", "n")
+//!         ].iter()
+//!         .map(|(a, b)| (a.to_string(), b.to_string()))
+//!         .collect::<Vec<(String, String)>>()
+//!     ]
+//! );
+//! ```
+//!
+//! [1] G. Kondrak. Algorithms for Language Reconstruction. PhD dissertation,
+//! University of Toronto.
+
+use std::{collections::HashSet, f64};
+
+use constants::EXTRACTED;
+use unicode_segmentation::UnicodeSegmentation;
+mod constants;
+
+#[cfg(test)]
+mod test;
+
+/// Compute the alignments of two phonetic strings (Kondrak 2002: 51).
+///
+/// Both inputs are split into grapheme clusters. Every cell of the score matrix
+/// that reaches `(1 - epsilon) * best score` contributes one alignment: a list of
+/// `(str1 part, str2 part)` pairs in which `"-"` marks a skipped segment.
+///
+/// # Panics
+///
+/// Panics if `epsilon` is outside `0.0..=1.0`.
+pub fn align(str1: &str, str2: &str, epsilon: f64) -> Vec<Vec<(String, String)>> {
+    assert!(
+        (0.0..=1.0).contains(&epsilon),
+        "Epsilon must be between 0.0 and 1.0."
+    );
+
+    let seq1: Vec<&str> = str1.graphemes(true).collect();
+    let seq2: Vec<&str> = str2.graphemes(true).collect();
+    let (rows, cols) = (seq1.len(), seq2.len());
+
+    // Row 0 and column 0 keep their initial 0.0, as in Kondrak's formulation.
+    let mut scores = vec![vec![0.0_f64; cols + 1]; rows + 1];
+    for i in 1..=rows {
+        for j in 1..=cols {
+            let (p, q) = (seq1[i - 1], seq2[j - 1]);
+            let skip_p = scores[i - 1][j] + sigma_skip();
+            let skip_q = scores[i][j - 1] + sigma_skip();
+            let substitute = scores[i - 1][j - 1] + sigma_sub(p, q);
+            // Two-to-one operations need a second segment before the current one.
+            let two_of_str1 = if i > 1 {
+                scores[i - 2][j - 1] + sigma_exp(q, seq1[i - 2], p)
+            } else {
+                f64::NEG_INFINITY
+            };
+            let two_of_str2 = if j > 1 {
+                scores[i - 1][j - 2] + sigma_exp(p, seq2[j - 2], q)
+            } else {
+                f64::NEG_INFINITY
+            };
+            // Folding from 0.0 keeps every cell non-negative.
+            scores[i][j] = [skip_p, skip_q, substitute, two_of_str1, two_of_str2]
+                .iter()
+                .fold(0.0, |best, &candidate| f64::max(best, candidate));
+        }
+    }
+
+    let best = scores.iter().flatten().copied().fold(f64::NAN, f64::max);
+    let threshold = (1.0 - epsilon) * best;
+    let mut alignments = Vec::new();
+    for i in 1..=rows {
+        for j in 1..=cols {
+            if scores[i][j] >= threshold {
+                let mut path = Vec::new();
+                retrieve(i, j, 0.0, &scores, threshold, &seq1, &seq2, &mut path);
+                alignments.push(path);
+            }
+        }
+    }
+    alignments
+}
+
+/// Retrieve the path through the similarity matrix S starting at (i, j).
+///
+/// Walks backwards from cell `(i, j)` and prepends one aligned pair to `out` per
+/// step, stopping at the first cell whose value is exactly 0.
+///
+/// * `score` - sum of the operation scores already consumed on the way to `(i, j)`.
+/// * `s` - score matrix; row 0 and column 0 are assumed to be all zero, as built
+///   by `align`, so that a non-zero cell always has `i >= 1` and `j >= 1`.
+/// * `t` - acceptance threshold: a step is taken when the predecessor cell plus
+///   the operation score plus `score` is at least `t`.
+/// * `str1`, `str2` - the segmented input strings.
+///
+/// Operations are tried in a fixed order and the first one that qualifies wins:
+/// one `str1` segment against two of `str2`, two `str1` segments against one of
+/// `str2`, skipping a `str2` segment, skipping a `str1` segment, then substitution.
+///
+/// Returns `out`.
+fn retrieve<'a>(
+    i: usize,
+    j: usize,
+    score: f64,
+    s: &[Vec<f64>],
+    t: f64,
+    str1: &[&str],
+    str2: &[&str],
+    out: &'a mut Vec<(String, String)>,
+) -> &'a mut Vec<(String, String)> {
+    if s[i][j] == 0.0 {
+        return out;
+    }
+
+    // Past the guard, `i` and `j` are at least 1 (see the note on `s`).
+    let (p, q) = (str1[i - 1], str2[j - 1]);
+
+    // j > 1 and S[i - 1, j - 2] + sigma_exp(str1[i - 1], str2[j - 2 : j]) + s >= T
+    if j > 1 {
+        // Scored once; the value serves both the test and the recursive call.
+        let gain = sigma_exp(p, str2[j - 2], q);
+        if s[i - 1][j - 2] + gain + score >= t {
+            // Prepend: the walk runs from the end of the alignment to its start.
+            out.insert(0, (p.to_string(), str2[j - 2..j].join("")));
+            return retrieve(i - 1, j - 2, score + gain, s, t, str1, str2, out);
+        }
+    }
+
+    // i > 1 and S[i - 2, j - 1] + sigma_exp(str2[j - 1], str1[i - 2 : i]) + s >= T
+    if i > 1 {
+        let gain = sigma_exp(q, str1[i - 2], p);
+        if s[i - 2][j - 1] + gain + score >= t {
+            out.insert(0, (str1[i - 2..i].join(""), q.to_string()));
+            return retrieve(i - 2, j - 1, score + gain, s, t, str1, str2, out);
+        }
+    }
+
+    // S[i, j - 1] + sigma_skip(str2[j - 1]) + s >= T
+    let gap = sigma_skip();
+    if s[i][j - 1] + gap + score >= t {
+        out.insert(0, ("-".to_string(), q.to_string()));
+        return retrieve(i, j - 1, score + gap, s, t, str1, str2, out);
+    }
+
+    // S[i - 1, j] + sigma_skip(str1[i - 1]) + s >= T
+    if s[i - 1][j] + gap + score >= t {
+        out.insert(0, (p.to_string(), "-".to_string()));
+        return retrieve(i - 1, j, score + gap, s, t, str1, str2, out);
+    }
+
+    // S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + s >= T
+    let gain = sigma_sub(p, q);
+    if s[i - 1][j - 1] + gain + score >= t {
+        out.insert(0, (p.to_string(), q.to_string()));
+        return retrieve(i - 1, j - 1, score + gain, s, t, str1, str2, out);
+    }
+
+    // No predecessor reaches the threshold: the path ends here.
+    out
+}
+
+/// Returns the score of an indel, i.e. of leaving one segment unaligned (`C_skip`).
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn sigma_skip() -> f64 {
+    EXTRACTED.cskip
+}
+
+/// Returns the score of substituting P with Q: `C_sub - delta(P, Q) - V(P) - V(Q)`.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn sigma_sub(p: &str, q: &str) -> f64 {
+    EXTRACTED.csub - delta(p, q) - v(p) - v(q)
+}
+
+/// Returns the score of an expansion/compression, i.e. of aligning P with the pair Q1 Q2:
+/// `C_exp - delta(P, Q1) - delta(P, Q2) - V(P) - max(V(Q1), V(Q2))`.
+/// (Kondrak 2002: 54)
+#[inline]
+fn sigma_exp(p: &str, q1: &str, q2: &str) -> f64 {
+    EXTRACTED.cexp - delta(p, q1) - delta(p, q2) - v(p) - f64::max(v(q1), v(q2))
+}
+
+/// Return weighted sum of difference between P and Q.
+///
+/// Features are summed in sorted order: `HashSet` iteration order can change between
+/// runs, and floating-point addition is order-sensitive. (Kondrak 2002: 54)
+#[inline]
+fn delta(p: &str, q: &str) -> f64 {
+    let mut features: Vec<&String> = r(p, q).iter().collect();
+    // Same order as the (sorted) feature lists in `extract.json`.
+    features.sort_unstable();
+    features.into_iter().map(|f| diff(p, q, f) * EXTRACTED.salience[f.as_str()]).sum()
+}
+
+/// Returns difference between phonetic segments P and Q for feature F.
+///
+/// Each segment's value for `f` is read from the feature matrix and converted to
+/// its number in the similarity matrix; the result is the absolute gap between
+/// the two numbers. Panics if a segment, `f`, or a feature value has no entry.
+///
+/// (Kondrak 2002: 52, 54)
+#[inline]
+fn diff(p: &str, q: &str, f: &str) -> f64 {
+    // A `HashMap<String, _>` can be indexed with `&str`, so no `String` is built.
+    let numeric = |segment: &str| -> f64 {
+        let value = &EXTRACTED.feature_matrix[segment][f];
+        EXTRACTED.similarity_matrix[value.as_str()]
+    };
+
+    (numeric(p) - numeric(q)).abs()
+}
+
+/// Return relevant features for segment comparison.
+///
+/// Yields `R_c` when either segment is a consonant and `R_v` otherwise.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn r(p: &str, q: &str) -> &'static HashSet<String> {
+    if EXTRACTED.consonants.contains(p) || EXTRACTED.consonants.contains(q) {
+        &EXTRACTED.rc
+    } else {
+        &EXTRACTED.rv
+    }
+}
+
+/// Return vowel weight if P is vowel.
+///
+/// Any segment absent from the consonant set counts as a vowel. (Kondrak 2002: 54)
+#[inline]
+fn v(p: &str) -> f64 {
+    if EXTRACTED.consonants.contains(p) {
+        0.0
+    } else {
+        EXTRACTED.cvwl
+    }
+}
diff --git a/src/test.rs b/src/test.rs
new file mode 100644
index 0000000..633153a
--- /dev/null
+++ b/src/test.rs
@@ -0,0 +1,62 @@
+use crate::{align, delta};
+
+/// Alignments of four word pairs at `epsilon = 0.0`.
+///
+/// Each pair is expected to produce exactly one alignment.
+#[test]
+fn test_aline() {
+    // Converts borrowed literal pairs into the owned `(String, String)` pairs
+    // that `align` returns.
+    fn owned(pairs: &[(&str, &str)]) -> Vec<(String, String)> {
+        pairs
+            .iter()
+            .map(|(a, b)| (a.to_string(), b.to_string()))
+            .collect()
+    }
+
+    // Only the first three segments of "tenwis" take part; the trailing "wis"
+    // is left out of the alignment.
+    assert_eq!(
+        align("θin", "tenwis", 0.0),
+        vec![owned(&[("θ", "t"), ("i", "e"), ("n", "n")])]
+    );
+
+    // Two two-segment words: every segment of one is paired with the segment
+    // at the same position in the other.
+    assert_eq!(
+        align("jo", "ʒə", 0.0),
+        vec![owned(&[("j", "ʒ"), ("o", "ə")])]
+    );
+
+    // The first eleven segments pair up one to one; the final "i" of the longer
+    // word is left out of the alignment.
+    assert_eq!(
+        align("pematesiweni", "pematesewen", 0.0),
+        vec![owned(&[
+            ("p", "p"),
+            ("e", "e"),
+            ("m", "m"),
+            ("a", "a"),
+            ("t", "t"),
+            ("e", "e"),
+            ("s", "s"),
+            ("i", "e"),
+            ("w", "w"),
+            ("e", "e"),
+            ("n", "n"),
+        ])]
+    );
+
+    // "w" has no counterpart and is reported as a skip ("-"); the leading "den"
+    // of "dentis" is left out of the alignment.
+    assert_eq!(
+        align("tuwθ", "dentis", 0.0),
+        vec![owned(&[("t", "t"), ("u", "i"), ("w", "-"), ("θ", "s")])]
+    );
+}
+
+#[test]
+fn test_aline_deltas() {
+    // p/q differ only in place (|1.0 - 0.5| * 40); a/A only in length, whose weight is 0.
+    [("p", "q", 20.0), ("a", "A", 0.0)].iter().for_each(|&(p, q, d)| assert_eq!(delta(p, q), d));
+}
author	Matthieu Pignolet <matthieu@puffer.fish>	2025-05-18 23:11:31 +0400
committer	Matthieu Pignolet <matthieu@puffer.fish>	2025-05-18 23:11:31 +0400
commit	59c004feb284b2ed36bef5961da5d8ff216aaefb (patch)
tree	a59fcd1d0938b1567710b0927e2d710b4fe0b53a /src
parent	2d92468b5ee98bd624d2dee20bcf19eb8b8c5e16 (diff)