summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthieu Pignolet <matthieu@puffer.fish>2025-05-18 19:00:51 +0400
committerMatthieu Pignolet <matthieu@puffer.fish>2025-05-18 19:00:51 +0400
commit20c7daa45e55063ff91e4a5a8ee09af53710a6d9 (patch)
treef3ea47cd3697a4bdd455d8991ba42d0b123fb1bb
parentbdefb4e5f084a1e468ff4ee8d42035caf15c8e8d (diff)
feat: adding the library implementation
-rw-r--r--aline/Cargo.lock117
-rw-r--r--aline/Cargo.toml10
-rw-r--r--aline/src/constants.rs32
-rw-r--r--aline/src/extract.json1009
-rwxr-xr-xaline/src/extract.py20
-rw-r--r--aline/src/lib.rs224
6 files changed, 1412 insertions, 0 deletions
diff --git a/aline/Cargo.lock b/aline/Cargo.lock
new file mode 100644
index 0000000..72e8a01
--- /dev/null
+++ b/aline/Cargo.lock
@@ -0,0 +1,117 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "aline"
+version = "0.1.0"
+dependencies = [
+ "array2d",
+ "once_cell",
+ "serde",
+ "serde_json",
+ "unicode-segmentation",
+]
+
+[[package]]
+name = "array2d"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d8b39cb2c1bf5a7c0dd097aa95ab859cf87dab5a4328900f5388942dc1889f74"
+
+[[package]]
+name = "itoa"
+version = "1.0.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
+
+[[package]]
+name = "memchr"
+version = "2.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
+
+[[package]]
+name = "once_cell"
+version = "1.21.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.95"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.40"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "ryu"
+version = "1.0.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
+
+[[package]]
+name = "serde"
+version = "1.0.219"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.219"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "serde_json"
+version = "1.0.140"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373"
+dependencies = [
+ "itoa",
+ "memchr",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.101"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
+
+[[package]]
+name = "unicode-segmentation"
+version = "1.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493"
diff --git a/aline/Cargo.toml b/aline/Cargo.toml
new file mode 100644
index 0000000..d829946
--- /dev/null
+++ b/aline/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "aline"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+once_cell = "1.21.3"
+serde = { version = "1.0.219", features = ["derive"] }
+serde_json = "1.0.140"
+unicode-segmentation = "1.12.0"
diff --git a/aline/src/constants.rs b/aline/src/constants.rs
new file mode 100644
index 0000000..83e11a1
--- /dev/null
+++ b/aline/src/constants.rs
@@ -0,0 +1,32 @@
+
+
+use std::collections::{HashMap, HashSet};
+
+use once_cell::sync::Lazy;
+use serde::{Serialize, Deserialize};
+
+/// Parameters and lookup tables of the ALINE algorithm, deserialized from the
+/// embedded `extract.json`.
+#[derive(Debug, Serialize, Deserialize)]
+pub(crate) struct Extracted {
+ /// Score of skipping one segment (JSON key `C_skip`); returned by `sigma_skip`.
+ #[serde(rename = "C_skip")]
+ pub cskip: f64,
+ /// Base score of a substitution (JSON key `C_sub`); `sigma_sub` subtracts
+ /// the feature distance and vowel weights from it.
+ #[serde(rename = "C_sub")]
+ pub csub: f64,
+ /// Base score of an expansion/compression (JSON key `C_exp`); used by `sigma_exp`.
+ #[serde(rename = "C_exp")]
+ pub cexp: f64,
+ /// Vowel weight (JSON key `C_vwl`); used by `v`.
+ #[serde(rename = "C_vwl")]
+ pub cvwl: f64,
+ /// Segments classified as consonants; membership is tested by `r` and `v`.
+ pub consonants: HashSet<String>,
+ /// Feature names compared when at least one of the two segments is a
+ /// consonant (JSON key `R_c`).
+ // NOTE(review): `delta` sums over this set in `HashSet` iteration order, which is
+ // unspecified; confirm that float rounding differences between orders are acceptable.
+ #[serde(rename = "R_c")]
+ pub rc: HashSet<String>,
+ /// Feature names compared when neither segment is a consonant (JSON key `R_v`).
+ #[serde(rename = "R_v")]
+ pub rv: HashSet<String>,
+ /// Numeric value of each feature value name (e.g. `bilabial`, `plus`).
+ pub similarity_matrix: HashMap<String, f64>,
+ /// Weight of each feature name; `delta` multiplies it into the per-feature difference.
+ pub salience: HashMap<String, f64>,
+ /// For each segment, a map from feature name to feature value name; the
+ /// value names are keys of `similarity_matrix`.
+ pub feature_matrix: HashMap<String, HashMap<String, String>>,
+}
+
+/// Raw contents of `extract.json`, embedded at compile time.
+const EXTRACTED_JSON: &str = include_str!("extract.json");
+
+/// ALINE parameters, parsed from [`EXTRACTED_JSON`] on first access.
+///
+/// # Panics
+///
+/// The first dereference panics if the embedded JSON does not deserialize into
+/// [`Extracted`]. The file is baked in at build time, so that is a packaging bug
+/// rather than a runtime condition callers could handle.
+pub static EXTRACTED: Lazy<Extracted> = Lazy::new(|| {
+    serde_json::from_str(EXTRACTED_JSON)
+        .expect("embedded extract.json must deserialize into `Extracted`")
+});
+
diff --git a/aline/src/extract.json b/aline/src/extract.json
new file mode 100644
index 0000000..db9d2cf
--- /dev/null
+++ b/aline/src/extract.json
@@ -0,0 +1,1009 @@
+{
+ "C_skip": -10,
+ "C_sub": 35,
+ "C_exp": 45,
+ "C_vwl": 5,
+ "consonants": [
+ "B",
+ "N",
+ "R",
+ "b",
+ "c",
+ "d",
+ "f",
+ "g",
+ "h",
+ "j",
+ "k",
+ "l",
+ "m",
+ "n",
+ "p",
+ "q",
+ "r",
+ "s",
+ "t",
+ "v",
+ "x",
+ "z",
+ "ç",
+ "ð",
+ "ħ",
+ "ŋ",
+ "ɖ",
+ "ɟ",
+ "ɢ",
+ "ɣ",
+ "ɦ",
+ "ɬ",
+ "ɮ",
+ "ɰ",
+ "ɱ",
+ "ɲ",
+ "ɳ",
+ "ɴ",
+ "ɸ",
+ "ɹ",
+ "ɻ",
+ "ɽ",
+ "ɾ",
+ "ʀ",
+ "ʁ",
+ "ʂ",
+ "ʃ",
+ "ʈ",
+ "ʋ",
+ "ʐ ",
+ "ʒ",
+ "ʔ",
+ "ʕ",
+ "ʙ",
+ "ʝ",
+ "β",
+ "θ",
+ "χ",
+ "ʐ",
+ "w"
+ ],
+ "R_c": [
+ "aspirated",
+ "lateral",
+ "manner",
+ "nasal",
+ "place",
+ "retroflex",
+ "syllabic",
+ "voice"
+ ],
+ "R_v": [
+ "back",
+ "lateral",
+ "long",
+ "manner",
+ "nasal",
+ "place",
+ "retroflex",
+ "round",
+ "syllabic",
+ "voice"
+ ],
+ "similarity_matrix": {
+ "bilabial": 1.0,
+ "labiodental": 0.95,
+ "dental": 0.9,
+ "alveolar": 0.85,
+ "retroflex": 0.8,
+ "palato-alveolar": 0.75,
+ "palatal": 0.7,
+ "velar": 0.6,
+ "uvular": 0.5,
+ "pharyngeal": 0.3,
+ "glottal": 0.1,
+ "labiovelar": 1.0,
+ "vowel": -1.0,
+ "stop": 1.0,
+ "affricate": 0.9,
+ "fricative": 0.85,
+ "trill": 0.7,
+ "tap": 0.65,
+ "approximant": 0.6,
+ "high vowel": 0.4,
+ "mid vowel": 0.2,
+ "low vowel": 0.0,
+ "vowel2": 0.5,
+ "high": 1.0,
+ "mid": 0.5,
+ "low": 0.0,
+ "front": 1.0,
+ "central": 0.5,
+ "back": 0.0,
+ "plus": 1.0,
+ "minus": 0.0
+ },
+ "salience": {
+ "syllabic": 5,
+ "place": 40,
+ "manner": 50,
+ "voice": 5,
+ "nasal": 20,
+ "retroflex": 10,
+ "lateral": 10,
+ "aspirated": 5,
+ "long": 0,
+ "high": 3,
+ "back": 2,
+ "round": 2
+ },
+ "feature_matrix": {
+ "p": {
+ "place": "bilabial",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "b": {
+ "place": "bilabial",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "t": {
+ "place": "alveolar",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "d": {
+ "place": "alveolar",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ʈ": {
+ "place": "retroflex",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "plus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɖ": {
+ "place": "retroflex",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "plus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "c": {
+ "place": "palatal",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɟ": {
+ "place": "palatal",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "k": {
+ "place": "velar",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "g": {
+ "place": "velar",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "q": {
+ "place": "uvular",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɢ": {
+ "place": "uvular",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ʔ": {
+ "place": "glottal",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "m": {
+ "place": "bilabial",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "plus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɱ": {
+ "place": "labiodental",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "plus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "n": {
+ "place": "alveolar",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "plus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɳ": {
+ "place": "retroflex",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "plus",
+ "retroflex": "plus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɲ": {
+ "place": "palatal",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "plus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ŋ": {
+ "place": "velar",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "plus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɴ": {
+ "place": "uvular",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "plus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "N": {
+ "place": "uvular",
+ "manner": "stop",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "plus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ʙ": {
+ "place": "bilabial",
+ "manner": "trill",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "B": {
+ "place": "bilabial",
+ "manner": "trill",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "r": {
+ "place": "alveolar",
+ "manner": "trill",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "plus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ʀ": {
+ "place": "uvular",
+ "manner": "trill",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "R": {
+ "place": "uvular",
+ "manner": "trill",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɾ": {
+ "place": "alveolar",
+ "manner": "tap",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɽ": {
+ "place": "retroflex",
+ "manner": "tap",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "plus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɸ": {
+ "place": "bilabial",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "β": {
+ "place": "bilabial",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "f": {
+ "place": "labiodental",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "v": {
+ "place": "labiodental",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "θ": {
+ "place": "dental",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ð": {
+ "place": "dental",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "s": {
+ "place": "alveolar",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "z": {
+ "place": "alveolar",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ʃ": {
+ "place": "palato-alveolar",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ʒ": {
+ "place": "palato-alveolar",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ʂ": {
+ "place": "retroflex",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "plus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ʐ": {
+ "place": "retroflex",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "plus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ç": {
+ "place": "palatal",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ʝ": {
+ "place": "palatal",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "x": {
+ "place": "velar",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɣ": {
+ "place": "velar",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "χ": {
+ "place": "uvular",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ʁ": {
+ "place": "uvular",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ħ": {
+ "place": "pharyngeal",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ʕ": {
+ "place": "pharyngeal",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "h": {
+ "place": "glottal",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɦ": {
+ "place": "glottal",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɬ": {
+ "place": "alveolar",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "minus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "plus",
+ "aspirated": "minus"
+ },
+ "ɮ": {
+ "place": "alveolar",
+ "manner": "fricative",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "plus",
+ "aspirated": "minus"
+ },
+ "ʋ": {
+ "place": "labiodental",
+ "manner": "approximant",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɹ": {
+ "place": "alveolar",
+ "manner": "approximant",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɻ": {
+ "place": "retroflex",
+ "manner": "approximant",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "plus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "j": {
+ "place": "palatal",
+ "manner": "approximant",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "ɰ": {
+ "place": "velar",
+ "manner": "approximant",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "l": {
+ "place": "alveolar",
+ "manner": "approximant",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "plus",
+ "aspirated": "minus"
+ },
+ "w": {
+ "place": "labiovelar",
+ "manner": "approximant",
+ "syllabic": "minus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "aspirated": "minus"
+ },
+ "i": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "high",
+ "back": "front",
+ "round": "minus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "y": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "high",
+ "back": "front",
+ "round": "plus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "e": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "mid",
+ "back": "front",
+ "round": "minus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "E": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "mid",
+ "back": "front",
+ "round": "minus",
+ "long": "plus",
+ "aspirated": "minus"
+ },
+ "ø": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "mid",
+ "back": "front",
+ "round": "plus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "ɛ": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "mid",
+ "back": "front",
+ "round": "minus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "œ": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "mid",
+ "back": "front",
+ "round": "plus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "æ": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "low",
+ "back": "front",
+ "round": "minus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "a": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "low",
+ "back": "front",
+ "round": "minus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "A": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "low",
+ "back": "front",
+ "round": "minus",
+ "long": "plus",
+ "aspirated": "minus"
+ },
+ "ɨ": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "high",
+ "back": "central",
+ "round": "minus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "ʉ": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "high",
+ "back": "central",
+ "round": "plus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "ə": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "mid",
+ "back": "central",
+ "round": "minus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "u": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "high",
+ "back": "back",
+ "round": "plus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "U": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "high",
+ "back": "back",
+ "round": "plus",
+ "long": "plus",
+ "aspirated": "minus"
+ },
+ "o": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "mid",
+ "back": "back",
+ "round": "plus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "O": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "mid",
+ "back": "back",
+ "round": "plus",
+ "long": "plus",
+ "aspirated": "minus"
+ },
+ "ɔ": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "mid",
+ "back": "back",
+ "round": "plus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "ɒ": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "low",
+ "back": "back",
+ "round": "minus",
+ "long": "minus",
+ "aspirated": "minus"
+ },
+ "I": {
+ "place": "vowel",
+ "manner": "vowel2",
+ "syllabic": "plus",
+ "voice": "plus",
+ "nasal": "minus",
+ "retroflex": "minus",
+ "lateral": "minus",
+ "high": "high",
+ "back": "front",
+ "round": "minus",
+ "long": "plus",
+ "aspirated": "minus"
+ }
+ }
+}
diff --git a/aline/src/extract.py b/aline/src/extract.py
new file mode 100755
index 0000000..b2faba5
--- /dev/null
+++ b/aline/src/extract.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+
+from nltk.metrics.aline import *
+import json
+
+extract = {
+ 'C_skip': C_skip,
+ 'C_sub': C_sub,
+ 'C_exp': C_exp,
+ 'C_vwl': C_vwl,
+
+ 'consonants': consonants,
+ 'R_c': R_c,
+ 'R_v': R_v,
+ 'similarity_matrix': similarity_matrix,
+ 'salience': salience,
+ 'feature_matrix': feature_matrix,
+}
+
+print(json.dumps(extract, indent=2, ensure_ascii=False))
diff --git a/aline/src/lib.rs b/aline/src/lib.rs
new file mode 100644
index 0000000..0e44705
--- /dev/null
+++ b/aline/src/lib.rs
@@ -0,0 +1,224 @@
+// ALINE phonetic sequence alignment in Rust
+// Port of NLTK's ALINE module (Greg Kondrak, 2002)
+
+use std::{collections::HashSet, f64};
+
+use constants::EXTRACTED;
+use unicode_segmentation::UnicodeSegmentation;
+mod constants;
+
+/// Computes the near-optimal local alignments of two phonetic strings with ALINE.
+///
+/// Both inputs are split into extended grapheme clusters and each cluster is
+/// treated as one segment. A score matrix is filled by dynamic programming over
+/// skips, substitutions and two-to-one expansions/compressions. Every cell whose
+/// score is at least `(1 - epsilon) * max_score` then seeds one traceback.
+///
+/// Returns one alignment per qualifying cell. An alignment is a list of
+/// `(segments of str1, segments of str2)` pairs, where `"-"` marks a skipped segment.
+///
+/// # Panics
+///
+/// Panics if `epsilon` is outside `0.0..=1.0`, or if scoring reaches a segment
+/// that has no entry in the feature matrix.
+pub fn align(str1: &str, str2: &str, epsilon: f64) -> Vec<Vec<(String, String)>> {
+    assert!(
+        (0.0..=1.0).contains(&epsilon),
+        "Epsilon must be between 0.0 and 1.0."
+    );
+
+    let left: Vec<&str> = str1.graphemes(true).collect();
+    let right: Vec<&str> = str2.graphemes(true).collect();
+    let (rows, cols) = (left.len(), right.len());
+
+    // Row 0 and column 0 stay at zero, which is Kondrak's initialization.
+    let mut scores = vec![vec![0.0; cols + 1]; rows + 1];
+    for i in 1..=rows {
+        for j in 1..=cols {
+            let (p, q) = (left[i - 1], right[j - 1]);
+
+            let skip_p = scores[i - 1][j] + sigma_skip();
+            let skip_q = scores[i][j - 1] + sigma_skip();
+            let substitute = scores[i - 1][j - 1] + sigma_sub(p, q);
+
+            // Two-to-one moves need a second segment before the current one.
+            let compress = if i < 2 {
+                f64::NEG_INFINITY
+            } else {
+                scores[i - 2][j - 1] + sigma_exp(q, left[i - 2], p)
+            };
+            let expand = if j < 2 {
+                f64::NEG_INFINITY
+            } else {
+                scores[i - 1][j - 2] + sigma_exp(p, right[j - 2], q)
+            };
+
+            // Local alignment: a cell never drops below zero.
+            scores[i][j] = [skip_p, skip_q, substitute, compress, expand]
+                .into_iter()
+                .fold(0.0, f64::max);
+        }
+    }
+
+    let best = scores.iter().flatten().copied().fold(f64::NAN, f64::max);
+    let threshold = (1.0 - epsilon) * best;
+
+    let mut alignments = Vec::new();
+    for i in 1..=rows {
+        for j in 1..=cols {
+            if scores[i][j] >= threshold {
+                let mut path = Vec::new();
+                retrieve(i, j, 0.0, &scores, threshold, &left, &right, &mut path);
+                alignments.push(path);
+            }
+        }
+    }
+    alignments
+}
+
+/// Traces one alignment backwards through the score matrix `s`, starting at cell `(i, j)`.
+///
+/// At each cell the first qualifying move is taken, tried in this order: expansion
+/// (one segment of `str1` against two of `str2`), compression (two of `str1` against
+/// one of `str2`), skip in `str2`, skip in `str1`, substitution. A move qualifies when
+/// the predecessor cell's score, plus the move's score, plus `score`, is at least `t`.
+/// The aligned pair is prepended to `out` and the walk continues from the predecessor.
+/// The walk ends at a zero cell or when no move qualifies.
+///
+/// * `score` - score accumulated by the moves already taken (`0.0` for the first call).
+/// * `s` - score matrix built by `align`; its row 0 and column 0 are all zero.
+/// * `t` - threshold the total path score must keep reaching.
+/// * `str1`, `str2` - the segmented input strings.
+///
+/// Returns `out`.
+fn retrieve<'a>(
+    i: usize,
+    j: usize,
+    score: f64,
+    s: &[Vec<f64>],
+    t: f64,
+    str1: &[&str],
+    str2: &[&str],
+    out: &'a mut Vec<(String, String)>,
+) -> &'a mut Vec<(String, String)> {
+    // A zero cell ends the local alignment. Row 0 and column 0 are zero in the
+    // matrix built by `align`, so past this check i >= 1 and j >= 1.
+    if s[i][j] == 0.0 {
+        return out;
+    }
+
+    let (p, q) = (str1[i - 1], str2[j - 1]);
+
+    // Expansion: p aligned with the two segments str2[j - 2..j].
+    if j > 1 {
+        let gain = sigma_exp(p, str2[j - 2], q);
+        if s[i - 1][j - 2] + gain + score >= t {
+            out.insert(0, (p.to_string(), str2[j - 2..j].concat()));
+            retrieve(i - 1, j - 2, score + gain, s, t, str1, str2, out);
+            return out;
+        }
+    }
+
+    // Compression: the two segments str1[i - 2..i] aligned with q.
+    if i > 1 {
+        let gain = sigma_exp(q, str1[i - 2], p);
+        if s[i - 2][j - 1] + gain + score >= t {
+            out.insert(0, (str1[i - 2..i].concat(), q.to_string()));
+            retrieve(i - 2, j - 1, score + gain, s, t, str1, str2, out);
+            return out;
+        }
+    }
+
+    let skip = sigma_skip();
+
+    // Skip q (a segment of str2 with no counterpart in str1).
+    if s[i][j - 1] + skip + score >= t {
+        out.insert(0, ("-".to_string(), q.to_string()));
+        retrieve(i, j - 1, score + skip, s, t, str1, str2, out);
+        return out;
+    }
+
+    // Skip p (a segment of str1 with no counterpart in str2).
+    if s[i - 1][j] + skip + score >= t {
+        out.insert(0, (p.to_string(), "-".to_string()));
+        retrieve(i - 1, j, score + skip, s, t, str1, str2, out);
+        return out;
+    }
+
+    // Substitution of p with q.
+    let gain = sigma_sub(p, q);
+    if s[i - 1][j - 1] + gain + score >= t {
+        out.insert(0, (p.to_string(), q.to_string()));
+        retrieve(i - 1, j - 1, score + gain, s, t, str1, str2, out);
+    }
+
+    out
+}
+
+/// Score of skipping (inserting or deleting) one segment.
+///
+/// This is the constant `C_skip`; it does not depend on which segment is skipped.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn sigma_skip() -> f64 {
+    let params = &*EXTRACTED;
+    params.cskip
+}
+
+/// Score of substituting segment `p` with segment `q`.
+///
+/// Starts from `C_sub` and subtracts, in order, the feature distance between the
+/// two segments and the vowel weight of each.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn sigma_sub(p: &str, q: &str) -> f64 {
+    let penalties = [delta(p, q), v(p), v(q)];
+    penalties
+        .into_iter()
+        .fold(EXTRACTED.csub, |score, penalty| score - penalty)
+}
+
+/// Score of aligning the single segment `p` with the segment pair `q1`, `q2`.
+///
+/// Starts from `C_exp` and subtracts, in order, the feature distance from `p` to
+/// each of `q1` and `q2`, the vowel weight of `p`, and the larger vowel weight of
+/// `q1` and `q2`.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn sigma_exp(p: &str, q1: &str, q2: &str) -> f64 {
+    let penalties = [delta(p, q1), delta(p, q2), v(p), v(q1).max(v(q2))];
+    penalties
+        .into_iter()
+        .fold(EXTRACTED.cexp, |score, penalty| score - penalty)
+}
+
+/// Salience-weighted feature distance between segments `p` and `q`.
+///
+/// For every feature selected by `r(p, q)`, the per-feature difference from `diff`
+/// is multiplied by that feature's salience, and the products are summed.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn delta(p: &str, q: &str) -> f64 {
+    r(p, q)
+        .iter()
+        .map(|feature| {
+            let gap = diff(p, q, feature);
+            // Every relevant feature is expected to have a salience entry.
+            let weight = match EXTRACTED.salience.get(feature) {
+                Some(weight) => *weight,
+                None => unreachable!(),
+            };
+            gap * weight
+        })
+        .sum()
+}
+
+/// Absolute difference between segments `p` and `q` on the single feature `f`.
+///
+/// Each segment's value name for `f` is read from the feature matrix and mapped
+/// to a number through the similarity matrix; the result is the absolute
+/// difference of the two numbers.
+///
+/// # Panics
+///
+/// Panics if `p` or `q` is not in the feature matrix, if either segment has no
+/// value for `f`, or if a value name is missing from the similarity matrix.
+///
+/// (Kondrak 2002: 52, 54)
+#[inline]
+fn diff(p: &str, q: &str, f: &str) -> f64 {
+    // The maps are keyed by `String` but can be indexed with `&str` directly,
+    // so no temporary `String` is allocated per lookup.
+    let numeric_value = |segment: &str| -> f64 {
+        let value_name = &EXTRACTED.feature_matrix[segment][f];
+        EXTRACTED.similarity_matrix[value_name]
+    };
+    (numeric_value(p) - numeric_value(q)).abs()
+}
+
+/// Selects the feature names to compare for the segment pair `p`, `q`.
+///
+/// Returns the consonant feature set `R_c` if at least one of the two segments
+/// is in the consonant set, and the vowel feature set `R_v` otherwise.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn r(p: &str, q: &str) -> &'static HashSet<String> {
+    // `HashSet<String>::contains` accepts `&str`, so no `String` is allocated.
+    let is_consonant = |segment: &str| EXTRACTED.consonants.contains(segment);
+    if is_consonant(p) || is_consonant(q) {
+        &EXTRACTED.rc
+    } else {
+        &EXTRACTED.rv
+    }
+}
+
+/// Returns the vowel weight `C_vwl` if `p` is a vowel, and `0.0` if it is a consonant.
+///
+/// A segment counts as a vowel when it is not in the consonant set, as in `V`
+/// of the NLTK module this crate ports.
+///
+/// (Kondrak 2002: 54)
+#[inline]
+fn v(p: &str) -> f64 {
+    // The weight applies to vowels only; consonants contribute nothing.
+    if EXTRACTED.consonants.contains(p) {
+        0.0
+    } else {
+        EXTRACTED.cvwl
+    }
+}