summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Matthieu Pignolet <matthieu@puffer.fish> 2025-05-18 23:11:31 +0400
committer Matthieu Pignolet <matthieu@puffer.fish> 2025-05-18 23:11:31 +0400
commit 59c004feb284b2ed36bef5961da5d8ff216aaefb (patch)
tree a59fcd1d0938b1567710b0927e2d710b4fe0b53a /src
parent 2d92468b5ee98bd624d2dee20bcf19eb8b8c5e16 (diff)
major: tidy the project before publishing
Diffstat (limited to 'src')
-rw-r--r-- src/bin/aline-demo/main.rs 101
-rw-r--r-- src/constants.rs 32
-rw-r--r-- src/extract.json 1009
-rwxr-xr-x src/extract.py 20
-rw-r--r-- src/lib.rs 271
-rw-r--r-- src/test.rs 62
6 files changed, 1495 insertions, 0 deletions
diff --git a/src/bin/aline-demo/main.rs b/src/bin/aline-demo/main.rs
new file mode 100644
index 0000000..7844066
--- /dev/null
+++ b/src/bin/aline-demo/main.rs
@@ -0,0 +1,101 @@
+use aline;
+
+/// Demo entry point: aligns every word pair listed in `COGNATE_DATE` and
+/// prints the first alignment returned for each pair.
+///
+/// Each output line has the form `a ~ b : (x, y) (x, y) ...`.
+fn main() {
+    // Every line of the embedded table is `first,second`; the whole table is
+    // parsed before anything is printed.
+    let pairs: Vec<(String, String)> = COGNATE_DATE
+        .split('\n')
+        .map(|row| {
+            let mut fields = row.split(',');
+            let left = fields.next().unwrap().to_string();
+            let right = fields.next().unwrap().to_string();
+            (left, right)
+        })
+        .collect();
+
+    for (left, right) in pairs {
+        // Only the first alignment in the returned list is displayed.
+        let segments = &aline::align(&left, &right, 0.0)[0];
+
+        print!("{} ~ {} :", left, right);
+        for (from_left, from_right) in segments {
+            print!(" ({}, {})", from_left, from_right);
+        }
+        print!("\n");
+    }
+}
+
+
+/// Word pairs fed to the demo: one `first,second` pair per line, separated by
+/// a single comma and written in the phonetic notation `aline::align` expects.
+///
+/// The literal deliberately has no trailing newline, because `main` splits on
+/// `'\n'` and requires two fields on every line.
+// NOTE(review): the name suggests these are cognate pairs and that `DATE` is a
+// typo for `DATA`; the identifier is kept because `main` refers to it.
+const COGNATE_DATE: &str = r"jo,ʒə
+tu,ty
+nosotros,nu
+kjen,ki
+ke,kwa
+todos,tu
+una,ən
+dos,dø
+tres,trwa
+ombre,om
+arbol,arbrə
+pluma,plym
+kabeθa,kap
+boka,buʃ
+pje,pje
+koraθon,kœr
+ber,vwar
+benir,vənir
+deθir,dir
+pobre,povrə
+ðis,dIzes
+ðæt,das
+wat,vas
+nat,nixt
+loŋ,laŋ
+mæn,man
+fleʃ,flajʃ
+bləd,blyt
+feðər,fEdər
+hær,hAr
+ir,Or
+aj,awgə
+nowz,nAzə
+mawθ,munt
+təŋ,tsuŋə
+fut,fys
+nij,knI
+hænd,hant
+hart,herts
+livər,lEbər
+ænd,ante
+æt,ad
+blow,flAre
+ir,awris
+ijt,edere
+fiʃ,piʃkis
+flow,fluere
+staɾ,stella
+ful,plenus
+græs,gramen
+hart,kordis
+horn,korny
+aj,ego
+nij,genU
+məðər,mAter
+mawntən,mons
+nejm,nomen
+njuw,nowus
+wən,unus
+rawnd,rotundus
+sow,suere
+sit,sedere
+θrij,tres
+tuwθ,dentis
+θin,tenwis
+kinwawa,kenuaʔ
+nina,nenah
+napewa,napɛw
+wapimini,wapemen
+namesa,namɛʔs
+okimawa,okemaw
+ʃiʃipa,seʔsep
+ahkohkwa,ahkɛh
+pematesiweni,pematesewen
+asenja,aʔsɛn";
\ No newline at end of file
diff --git a/src/constants.rs b/src/constants.rs
new file mode 100644
index 0000000..83e11a1
--- /dev/null
+++ b/src/constants.rs
@@ -0,0 +1,32 @@
+
+
+use std::collections::{HashMap, HashSet};
+
+use once_cell::sync::Lazy;
+use serde::{Serialize, Deserialize};
+
+#[derive(Debug, Serialize, Deserialize)]
+pub(crate) struct Extracted { // ALINE parameter tables, deserialized from extract.json
+ #[serde(rename = "C_skip")]
+ pub cskip: f64, // score returned by `sigma_skip` (JSON key "C_skip")
+ #[serde(rename = "C_sub")]
+ pub csub: f64, // base score that `sigma_sub` subtracts from (JSON key "C_sub")
+ #[serde(rename = "C_exp")]
+ pub cexp: f64, // base score that `sigma_exp` subtracts from (JSON key "C_exp")
+ #[serde(rename = "C_vwl")]
+ pub cvwl: f64, // weight `v` returns for a segment not listed in `consonants`
+ pub consonants: HashSet<String>, // segments treated as consonants by `r` and `v`
+ #[serde(rename = "R_c")]
+ pub rc: HashSet<String>, // feature names `r` selects when either segment is a consonant
+ #[serde(rename = "R_v")]
+ pub rv: HashSet<String>, // feature names `r` selects when neither segment is a consonant
+ pub similarity_matrix: HashMap<String, f64>, // feature value (e.g. "velar") -> numeric value, read by `diff`
+ pub salience: HashMap<String, f64>, // feature name -> weight, read by `delta`
+ pub feature_matrix: HashMap<String, HashMap<String, String>>, // segment -> (feature name -> feature value), read by `diff`
+}
+
+/// Raw text of `extract.json`, embedded into the binary at compile time.
+const EXTRACTED_JSON: &str = include_str!("extract.json");
+
+/// Parameter tables parsed from [`EXTRACTED_JSON`] the first time they are
+/// dereferenced.
+///
+/// # Panics
+///
+/// The first access panics if the embedded JSON does not deserialize into
+/// [`Extracted`]. The file is fixed at compile time, so that would be a
+/// packaging bug, not a runtime condition callers can recover from.
+pub static EXTRACTED: Lazy<Extracted> = Lazy::new(|| {
+    serde_json::from_str(EXTRACTED_JSON)
+        .expect("embedded extract.json must deserialize into `Extracted`")
+});
+
diff --git a/src/extract.json b/src/extract.json
new file mode 100644
index 0000000..db9d2cf
--- /dev/null
+++ b/src/extract.json
@@ -0,0 +1,1009 @@
+{
+ "C_skip": -10,
+ "C_sub": 35,
+ "C_exp": 45,
+ "C_vwl": 5,
+ "consonants": [
+ "B",
+ "N",
+ "R",
+ "b",
+ "c",
+ "d",
+ "f",
+ "g",
+ "h",
+ "j",
+ "k",
+ "l",
+ "m",
+ "n",
+ "p",
+ "q",
+ "r",
+ "s",
+ "t",
+ "v",
+ "x",
+ "z",
+ "ç",
+ "ð",
+ "ħ",
+ "ŋ",
+ "ɖ",
+ "ɟ",
+ "ɢ",
+ "ɣ",
+ "ɦ",
+ "ɬ",
+ "ɮ",
+ "ɰ",
+ "ɱ",
+ "ɲ",
+ "ɳ",
+ "ɴ",
+ "ɸ",
+ "ɹ",
+ "ɻ",
+ "ɽ",
+ "ɾ",
+ "ʀ",
+ "ʁ",
+ "ʂ",
+ "ʃ",
+ "ʈ",
+ "ʋ",
+ "ʐ ",
+ "ʒ",
+ "ʔ",
+ "ʕ",
+ "ʙ",
+ "ʝ",
+ "β",
+ "θ",
+ "χ",
+ "ʐ",
+ "w"
+ ],
+ "R_c": [
+ "aspirated",
+ "lateral",
+ "manner",
+ "nasal",
+ "place",
+ "retroflex",
+ "syllabic",
+ "voice"
+ ],
+ "R_v": [
+ "back",
+ "lateral",
+ "long",
+ "manner",
+ "nasal",
+ "place",
+ "retroflex",
+ "round",
+ "syllabic",
+ "voice"
+ ],
+ "similarity_matrix": {
+ "bilabial": 1.0,
+ "labiodental": 0.95,
+ "dental": 0.9,
+ "alveolar": 0.85,
+ "retroflex": 0.8,
+ "palato-alveolar": 0.75,
+ "palatal": 0.7,
+ "velar": 0.6,
+ "uvular": 0.5,
+ "pharyngeal": 0.3,
+ "glottal": 0.1,
+ "labiovelar": 1.0,
+ "vowel": -1.0,
+ "stop": 1.0,
+ "affricate": 0.9,
+ "fricative": 0.85,
+ "trill": 0.7,
+ "tap": 0.65,
+ "approximant": 0.6,
+ "high vowel": 0.4,
+ "mid vowel": 0.2,
+ "low vowel": 0.0,
+ "vowel2": 0.5,
+ "high": 1.0,
+ "mid": 0.5,
+ "low": 0.0,
+ "front": 1.0,
+ "central": 0.5,
+ "back": 0.0,
+ "plus": 1.0,
+ "minus": 0.0
+ },
+ "salience": {
+ "syllabic": 5,
+ "place": 40,
+ "manner": 50,
+ "voice": 5,
+ "nasal": 20,
+ "retroflex": 10,
+ "lateral": 10,
+ "aspirated": 5,
+ "long": 0,
+ "high": 3,
+ "back": 2,
+ "round": 2
+ },
+ "feature_matrix": {
+ "p": {
+ "place": "bilabial",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "b": {
+ "place": "bilabial",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "t": {
+ "place": "alveolar",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "d": {
+ "place": "alveolar",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ʈ": {
+ "place": "retroflex",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "plus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɖ": {
+ "place": "retroflex",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "plus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "c": {
+ "place": "palatal",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɟ": {
+ "place": "palatal",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "k": {
+ "place": "velar",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "g": {
+ "place": "velar",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "q": {
+ "place": "uvular",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɢ": {
+ "place": "uvular",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ʔ": {
+ "place": "glottal",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "m": {
+ "place": "bilabial",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "plus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɱ": {
+ "place": "labiodental",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "plus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "n": {
+ "place": "alveolar",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "plus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɳ": {
+ "place": "retroflex",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "plus",
+ "retroflex": "plus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɲ": {
+ "place": "palatal",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "plus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ŋ": {
+ "place": "velar",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "plus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɴ": {
+ "place": "uvular",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "plus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "N": {
+ "place": "uvular",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "plus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ʙ": {
+ "place": "bilabial",
+ "manner": "trill",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "B": {
+ "place": "bilabial",
+ "manner": "trill",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "r": {
+ "place": "alveolar",
+ "manner": "trill",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "plus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ʀ": {
+ "place": "uvular",
+ "manner": "trill",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "R": {
+ "place": "uvular",
+ "manner": "trill",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɾ": {
+ "place": "alveolar",
+ "manner": "tap",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɽ": {
+ "place": "retroflex",
+ "manner": "tap",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "plus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɸ": {
+ "place": "bilabial",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "β": {
+ "place": "bilabial",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "f": {
+ "place": "labiodental",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "v": {
+ "place": "labiodental",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "θ": {
+ "place": "dental",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ð": {
+ "place": "dental",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "s": {
+ "place": "alveolar",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "z": {
+ "place": "alveolar",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ʃ": {
+ "place": "palato-alveolar",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ʒ": {
+ "place": "palato-alveolar",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ʂ": {
+ "place": "retroflex",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "plus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ʐ": {
+ "place": "retroflex",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "plus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ç": {
+ "place": "palatal",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ʝ": {
+ "place": "palatal",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "x": {
+ "place": "velar",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɣ": {
+ "place": "velar",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "χ": {
+ "place": "uvular",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ʁ": {
+ "place": "uvular",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ħ": {
+ "place": "pharyngeal",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ʕ": {
+ "place": "pharyngeal",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "h": {
+ "place": "glottal",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɦ": {
+ "place": "glottal",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɬ": {
+ "place": "alveolar",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "plus",
+ "aspirated": "minus"
+ },
+ "ɮ": {
+ "place": "alveolar",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "plus",
+ "aspirated": "minus"
+ },
+ "ʋ": {
+ "place": "labiodental",
+ "manner": "approximant",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɹ": {
+ "place": "alveolar",
+ "manner": "approximant",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɻ": {
+ "place": "retroflex",
+ "manner": "approximant",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "plus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "j": {
+ "place": "palatal",
+ "manner": "approximant",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɰ": {
+ "place": "velar",
+ "manner": "approximant",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "l": {
+ "place": "alveolar",
+ "manner": "approximant",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "plus",
+ "aspirated": "minus"
+ },
+ "w": {
+ "place": "labiovelar",
+ "manner": "approximant",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "i": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "high",
+ "back": "front",
+ "round": "minus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "y": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "high",
+ "back": "front",
+ "round": "plus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "e": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "mid",
+ "back": "front",
+ "round": "minus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "E": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "mid",
+ "back": "front",
+ "round": "minus",
+ "long": "plus",
+ "aspirated": "minus"
+ },
+ "ø": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "mid",
+ "back": "front",
+ "round": "plus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "ɛ": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "mid",
+ "back": "front",
+ "round": "minus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "œ": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "mid",
+ "back": "front",
+ "round": "plus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "æ": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "low",
+ "back": "front",
+ "round": "minus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "a": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "low",
+ "back": "front",
+ "round": "minus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "A": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "low",
+ "back": "front",
+ "round": "minus",
+ "long": "plus",
+ "aspirated": "minus"
+ },
+ "ɨ": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "high",
+ "back": "central",
+ "round": "minus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "ʉ": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "high",
+ "back": "central",
+ "round": "plus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "ə": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "mid",
+ "back": "central",
+ "round": "minus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "u": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "high",
+ "back": "back",
+ "round": "plus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "U": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "high",
+ "back": "back",
+ "round": "plus",
+ "long": "plus",
+ "aspirated": "minus"
+ },
+ "o": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "mid",
+ "back": "back",
+ "round": "plus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "O": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "mid",
+ "back": "back",
+ "round": "plus",
+ "long": "plus",
+ "aspirated": "minus"
+ },
+ "ɔ": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "mid",
+ "back": "back",
+ "round": "plus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "ɒ": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "low",
+ "back": "back",
+ "round": "minus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "I": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "high",
+ "back": "front",
+ "round": "minus",
+ "long": "plus",
+ "aspirated": "minus"
+ }
+ }
+}
diff --git a/src/extract.py b/src/extract.py
new file mode 100755
index 0000000..b2faba5
--- /dev/null
+++ b/src/extract.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+
+from nltk.metrics.aline import *
+import json
+
+extract = {
+ 'C_skip': C_skip,
+ 'C_sub': C_sub,
+ 'C_exp': C_exp,
+ 'C_vwl': C_vwl,
+
+ 'consonants': consonants,
+ 'R_c': R_c,
+ 'R_v': R_v,
+ 'similarity_matrix': similarity_matrix,
+ 'salience': salience,
+ 'feature_matrix': feature_matrix,
+}
+
+print(json.dumps(extract, indent=2, ensure_ascii=False))
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..e79e040
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,271 @@
+// ALINE phonetic sequence alignment in Rust
+// Port of NLTK's ALINE module (Greg Kondrak, 2002)
+
+//! ALINE
+//! <https://webdocs.cs.ualberta.ca/~kondrak/>
+//! Copyright 2002 by Grzegorz Kondrak.
+//!
+//! ALINE is an algorithm for aligning phonetic sequences, described in \[1\].
+//! This module is a port of Kondrak's (2002) ALINE. It provides functions for
+//! phonetic sequence alignment and similarity analysis. These are useful in
+//! historical linguistics, sociolinguistics and synchronic phonology.
+//!
+//! ALINE has parameters that can be tuned for desired output. These parameters are:
+//! - C_skip, C_sub, C_exp, C_vwl
+//! - Salience weights
+//! - Segmental features
+//!
+//! In this implementation, some parameters have been changed from their default
+//! values as described in \[1\], in order to replicate published results. All changes
+//! are noted in comments.
+//!
+//! # Get optimal alignment of two phonetic sequences
+//!
+//! ```
+//! use aline::align;
+//!
+//! let alignment = align("θin", "tenwis", 0.0);
+//!
+//! assert_eq!(
+//!     alignment,
+//!     vec![
+//!         vec![
+//!             ("θ", "t"),
+//!             ("i", "e"),
+//!             ("n", "n")
+//!         ].iter()
+//!         .map(|(a, b)| (a.to_string(), b.to_string()))
+//!         .collect::<Vec<(String, String)>>()
+//!     ]
+//! );
+//! ```
+//!
+//! \[1\] G. Kondrak. Algorithms for Language Reconstruction. PhD dissertation,
+//! University of Toronto.
+
+use std::{collections::HashSet, f64};
+
+use constants::EXTRACTED;
+use unicode_segmentation::UnicodeSegmentation;
+mod constants;
+
+#[cfg(test)]
+mod test;
+
+/// Compute the alignment of two phonetic strings.
+///
+/// Both inputs are split into grapheme clusters, a score matrix is filled
+/// with the five edit operations (two skips, substitution, and the two
+/// expansion/compression directions), and one alignment is traced back from
+/// every cell whose score reaches `(1 - epsilon) * best_score`.
+///
+/// Returns one inner vector per such cell, in row-major cell order. Each
+/// element pairs a segment (or two joined segments, or `"-"` for a skip) of
+/// `str1` with its counterpart in `str2`. The result is empty when either
+/// input is empty.
+///
+/// # Panics
+///
+/// Panics if `epsilon` is outside `0.0..=1.0`, or if a segment of either
+/// string has no entry in the feature matrix.
+///
+/// (Kondrak 2002: 51)
+pub fn align(str1: &str, str2: &str, epsilon: f64) -> Vec<Vec<(String, String)>> {
+    assert!(
+        (0.0..=1.0).contains(&epsilon),
+        "Epsilon must be between 0.0 and 1.0."
+    );
+
+    let left: Vec<&str> = str1.graphemes(true).collect();
+    let right: Vec<&str> = str2.graphemes(true).collect();
+    let (rows, cols) = (left.len(), right.len());
+
+    // Row 0 and column 0 stay at zero, which is Kondrak's initialisation.
+    let mut scores = vec![vec![0.0; cols + 1]; rows + 1];
+    for i in 1..=rows {
+        for j in 1..=cols {
+            let p = left[i - 1];
+            let q = right[j - 1];
+
+            let skip_left = scores[i - 1][j] + sigma_skip();
+            let skip_right = scores[i][j - 1] + sigma_skip();
+            let substitute = scores[i - 1][j - 1] + sigma_sub(p, q);
+
+            // Two segments of `str1` matched against one segment of `str2`.
+            let compress = if i > 1 {
+                scores[i - 2][j - 1] + sigma_exp(q, left[i - 2], p)
+            } else {
+                f64::NEG_INFINITY
+            };
+
+            // One segment of `str1` matched against two segments of `str2`.
+            let expand = if j > 1 {
+                scores[i - 1][j - 2] + sigma_exp(p, right[j - 2], q)
+            } else {
+                f64::NEG_INFINITY
+            };
+
+            // Starting the maximum at zero keeps every cell non-negative.
+            let mut best = 0f64;
+            for candidate in [skip_left, skip_right, substitute, compress, expand] {
+                best = f64::max(best, candidate);
+            }
+            scores[i][j] = best;
+        }
+    }
+
+    // Largest score anywhere in the matrix, scaled down by `epsilon`.
+    let peak = scores
+        .iter()
+        .flatten()
+        .copied()
+        .fold(f64::NAN, f64::max);
+    let threshold = (1.0 - epsilon) * peak;
+
+    let mut alignments = Vec::new();
+    for i in 1..=rows {
+        for j in 1..=cols {
+            if scores[i][j] >= threshold {
+                let mut path = Vec::new();
+                retrieve(i, j, 0.0, &scores, threshold, &left, &right, &mut path);
+                alignments.push(path);
+            }
+        }
+    }
+    alignments
+}
+
+/// Retrieve the path through the similarity matrix S starting at (i, j).
+///
+/// Walks backwards from cell `(i, j)` until it reaches a cell whose score is
+/// zero or until no move keeps the running total at or above `t`. Moves are
+/// tried in a fixed order: expansion on `str2`, expansion on `str1`, skip in
+/// `str2`, skip in `str1`, substitution.
+///
+/// * `i`, `j` - 1-based position in `s`; `str1[i - 1]` and `str2[j - 1]` are
+///   the segments that cell refers to.
+/// * `score` - score already accumulated by the moves taken so far.
+/// * `s` - score matrix filled by `align`.
+/// * `t` - threshold a path must reach to be accepted.
+/// * `str1`, `str2` - the two segment sequences.
+/// * `out` - receives the aligned pairs in left-to-right order, placed before
+///   anything `out` already holds.
+///
+/// Returns `out` so calls can be chained.
+fn retrieve<'a>(
+    i: usize,
+    j: usize,
+    score: f64,
+    s: &[Vec<f64>],
+    t: f64,
+    str1: &[&str],
+    str2: &[&str],
+    out: &'a mut Vec<(String, String)>,
+) -> &'a mut Vec<(String, String)> {
+    let (mut i, mut j, mut score) = (i, j, score);
+
+    // Pairs are collected from the end of the alignment towards its start and
+    // reversed once at the end, instead of shifting the vector on every step.
+    let mut path: Vec<(String, String)> = Vec::new();
+
+    // Row 0 and column 0 of `s` are zero, so `i` and `j` are both at least 1
+    // inside the loop and the `- 1` index arithmetic cannot underflow.
+    while s[i][j] != 0.0 {
+        if j > 1 {
+            // j > 1 and S[i - 1, j - 2] + sigma_exp(str1[i - 1], str2[j - 2 : j]) + s >= T
+            let gain = sigma_exp(str1[i - 1], str2[j - 2], str2[j - 1]);
+            if s[i - 1][j - 2] + gain + score >= t {
+                path.push((str1[i - 1].to_string(), str2[j - 2..j].concat()));
+                i -= 1;
+                j -= 2;
+                score = score + gain;
+                continue;
+            }
+        }
+
+        if i > 1 {
+            // i > 1 and S[i - 2, j - 1] + sigma_exp(str2[j - 1], str1[i - 2 : i]) + s >= T
+            let gain = sigma_exp(str2[j - 1], str1[i - 2], str1[i - 1]);
+            if s[i - 2][j - 1] + gain + score >= t {
+                path.push((str1[i - 2..i].concat(), str2[j - 1].to_string()));
+                i -= 2;
+                j -= 1;
+                score = score + gain;
+                continue;
+            }
+        }
+
+        let skip = sigma_skip();
+        if s[i][j - 1] + skip + score >= t {
+            // S[i, j - 1] + sigma_skip(str2[j - 1]) + s >= T
+            path.push(("-".to_string(), str2[j - 1].to_string()));
+            j -= 1;
+            score = score + skip;
+            continue;
+        }
+        if s[i - 1][j] + skip + score >= t {
+            // S[i - 1, j] + sigma_skip(str1[i - 1]) + s >= T
+            path.push((str1[i - 1].to_string(), "-".to_string()));
+            i -= 1;
+            score = score + skip;
+            continue;
+        }
+
+        let gain = sigma_sub(str1[i - 1], str2[j - 1]);
+        if s[i - 1][j - 1] + gain + score >= t {
+            // S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + s >= T
+            path.push((str1[i - 1].to_string(), str2[j - 1].to_string()));
+            i -= 1;
+            j -= 1;
+            score = score + gain;
+            continue;
+        }
+
+        // No move reaches the threshold from here: the path ends.
+        break;
+    }
+
+    // Put the new pairs, in left-to-right order, in front of whatever the
+    // caller had already stored in `out`.
+    path.reverse();
+    path.append(out);
+    *out = path;
+    out
+}
+
+/// Returns the score of an indel (skipping one segment): the constant `C_skip` held in `EXTRACTED`.
+///
+/// The value does not depend on which segment is skipped. (Kondrak 2002: 54)
+#[inline]
+fn sigma_skip() -> f64 {
+ EXTRACTED.cskip
+}
+
+/// Score for substituting segment `p` with segment `q`.
+///
+/// Starts from `C_sub` and subtracts, in this order, the feature distance
+/// `delta(p, q)`, the vowel weight of `p` and the vowel weight of `q`.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn sigma_sub(p: &str, q: &str) -> f64 {
+    let after_distance = EXTRACTED.csub - delta(p, q);
+    let after_first_weight = after_distance - v(p);
+    after_first_weight - v(q)
+}
+
+/// Score for matching the single segment `p` against the pair `q1`, `q2`
+/// (an expansion, or a compression when seen from the other string).
+///
+/// Starts from `C_exp` and subtracts both feature distances, the vowel weight
+/// of `p`, and the larger of the vowel weights of `q1` and `q2`.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn sigma_exp(p: &str, q1: &str, q2: &str) -> f64 {
+    let distance_to_first = delta(p, q1);
+    let distance_to_second = delta(p, q2);
+    let weight_of_p = v(p);
+    let larger_pair_weight = v(q1).max(v(q2));
+
+    EXTRACTED.cexp - distance_to_first - distance_to_second - weight_of_p - larger_pair_weight
+}
+
+/// Return weighted sum of difference between P and Q.
+///
+/// For every feature that `r(p, q)` marks as relevant, the per-feature
+/// difference `diff(p, q, f)` is multiplied by the salience of `f`, and the
+/// products are added up.
+///
+/// # Panics
+///
+/// Panics if a relevant feature has no salience weight, or if `diff` cannot
+/// look up one of the segments.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn delta(p: &str, q: &str) -> f64 {
+    // `r` hands back a `HashSet`, whose iteration order is unspecified and can
+    // change from one process to the next. Floating-point addition is not
+    // associative, so summing in that order could change the low-order bits
+    // of the result between runs. Sorting the names fixes the order; it is
+    // also the order in which `extract.json` lists them.
+    let mut features: Vec<&str> = r(p, q).iter().map(String::as_str).collect();
+    features.sort_unstable();
+
+    features
+        .into_iter()
+        .map(|f| {
+            let weight = EXTRACTED
+                .salience
+                .get(f)
+                .copied()
+                .expect("every relevant feature must have a salience weight");
+            diff(p, q, f) * weight
+        })
+        .sum()
+}
+
+/// Returns difference between phonetic segments P and Q for feature F.
+///
+/// Each segment's value for `f` is read from the feature matrix, mapped to a
+/// number through the similarity matrix, and the absolute difference of the
+/// two numbers is returned.
+///
+/// # Panics
+///
+/// Panics, naming the offending key, if a segment is missing from the feature
+/// matrix, if it has no value for `f`, or if that value is missing from the
+/// similarity matrix.
+///
+/// (Kondrak 2002: 52, 54)
+#[inline]
+fn diff(p: &str, q: &str, f: &str) -> f64 {
+    // Both maps are keyed by `String` but can be queried with `&str`
+    // directly, so no temporary `String` is built per lookup.
+    let numeric_value = |segment: &str| -> f64 {
+        let features = EXTRACTED
+            .feature_matrix
+            .get(segment)
+            .unwrap_or_else(|| panic!("unknown phonetic segment {:?}", segment));
+        let value = features
+            .get(f)
+            .unwrap_or_else(|| panic!("segment {:?} has no feature {:?}", segment, f));
+        EXTRACTED
+            .similarity_matrix
+            .get(value)
+            .copied()
+            .unwrap_or_else(|| panic!("feature value {:?} has no numeric value", value))
+    };
+
+    (numeric_value(p) - numeric_value(q)).abs()
+}
+
+/// Return relevant features for segment comparison.
+///
+/// Yields the consonant feature set `R_c` when at least one of `p` and `q` is
+/// listed as a consonant, and the vowel feature set `R_v` otherwise.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn r(p: &str, q: &str) -> &'static HashSet<String> {
+    // A `HashSet<String>` can be queried with `&str`, so no allocation is needed.
+    let is_consonant = |segment: &str| EXTRACTED.consonants.contains(segment);
+
+    if is_consonant(p) || is_consonant(q) {
+        &EXTRACTED.rc
+    } else {
+        &EXTRACTED.rv
+    }
+}
+
+/// Return vowel weight if P is vowel.
+///
+/// Any segment that is not listed in the consonant set counts as a vowel and
+/// gets `C_vwl`; consonants get `0.0`.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn v(p: &str) -> f64 {
+    // Queried with `&str` directly; no temporary `String` is required.
+    if EXTRACTED.consonants.contains(p) {
+        0.0
+    } else {
+        EXTRACTED.cvwl
+    }
+}
diff --git a/src/test.rs b/src/test.rs
new file mode 100644
index 0000000..633153a
--- /dev/null
+++ b/src/test.rs
@@ -0,0 +1,62 @@
+use crate::{align, delta};
+
+/// Checks `align` with `epsilon = 0.0` against four known alignments; each
+/// call is expected to return exactly one alignment.
+#[test]
+fn test_aline() {
+    // Turns borrowed literal pairs into the owned pairs `align` returns.
+    let owned = |pairs: &[(&str, &str)]| -> Vec<(String, String)> {
+        pairs
+            .iter()
+            .map(|(a, b)| (a.to_string(), b.to_string()))
+            .collect::<Vec<(String, String)>>()
+    };
+
+    assert_eq!(
+        align("θin", "tenwis", 0.0),
+        vec![owned(&[("θ", "t"), ("i", "e"), ("n", "n")])]
+    );
+
+    assert_eq!(
+        align("jo", "ʒə", 0.0),
+        vec![owned(&[("j", "ʒ"), ("o", "ə")])]
+    );
+
+    assert_eq!(
+        align("pematesiweni", "pematesewen", 0.0),
+        vec![owned(&[
+            ("p", "p"),
+            ("e", "e"),
+            ("m", "m"),
+            ("a", "a"),
+            ("t", "t"),
+            ("e", "e"),
+            ("s", "s"),
+            ("i", "e"),
+            ("w", "w"),
+            ("e", "e"),
+            ("n", "n"),
+        ])]
+    );
+
+    assert_eq!(
+        align("tuwθ", "dentis", 0.0),
+        vec![owned(&[("t", "t"), ("u", "i"), ("w", "-"), ("θ", "s")])]
+    );
+}
+
+/// Checks `delta` on two segment pairs with known feature distances.
+#[test]
+fn test_aline_deltas() {
+    // (first segment, second segment, expected distance)
+    let cases = [("p", "q", 20.0), ("a", "A", 0.0)];
+
+    for (first, second, expected) in cases {
+        assert_eq!(delta(first, second), expected);
+    }
+}