diff options
| author | Matthieu Pignolet <matthieu@puffer.fish> | 2025-05-18 19:00:51 +0400 |
|---|---|---|
| committer | Matthieu Pignolet <matthieu@puffer.fish> | 2025-05-18 19:00:51 +0400 |
| commit | 20c7daa45e55063ff91e4a5a8ee09af53710a6d9 (patch) | |
| tree | f3ea47cd3697a4bdd455d8991ba42d0b123fb1bb | |
| parent | bdefb4e5f084a1e468ff4ee8d42035caf15c8e8d (diff) | |
feat: adding the library implementation
| -rw-r--r-- | aline/Cargo.lock | 117 | ||||
| -rw-r--r-- | aline/Cargo.toml | 10 | ||||
| -rw-r--r-- | aline/src/constants.rs | 32 | ||||
| -rw-r--r-- | aline/src/extract.json | 1009 | ||||
| -rwxr-xr-x | aline/src/extract.py | 20 | ||||
| -rw-r--r-- | aline/src/lib.rs | 224 |
6 files changed, 1412 insertions, 0 deletions
diff --git a/aline/Cargo.lock b/aline/Cargo.lock new file mode 100644 index 0000000..72e8a01 --- /dev/null +++ b/aline/Cargo.lock @@ -0,0 +1,117 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "aline" +version = "0.1.0" +dependencies = [ + "array2d", + "once_cell", + "serde", + "serde_json", + "unicode-segmentation", +] + +[[package]] +name = "array2d" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8b39cb2c1bf5a7c0dd097aa95ab859cf87dab5a4328900f5388942dc1889f74" + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + +[[package]] +name = "serde" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.140" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "syn" +version = "2.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "unicode-segmentation" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" diff --git a/aline/Cargo.toml b/aline/Cargo.toml new file mode 100644 index 0000000..d829946 --- /dev/null +++ b/aline/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "aline" +version = "0.1.0" +edition = "2024" + +[dependencies] +once_cell = "1.21.3" +serde = { version = "1.0.219", features = ["derive"] } +serde_json = "1.0.140" +unicode-segmentation = "1.12.0" diff --git a/aline/src/constants.rs b/aline/src/constants.rs new file mode 100644 index 0000000..83e11a1 --- /dev/null +++ b/aline/src/constants.rs @@ -0,0 +1,32 @@ + + +use std::collections::{HashMap, HashSet}; + +use once_cell::sync::Lazy; +use serde::{Serialize, Deserialize}; + +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct Extracted { + #[serde(rename = "C_skip")] + pub cskip: f64, + #[serde(rename = "C_sub")] + pub csub: f64, + #[serde(rename = "C_exp")] + pub cexp: f64, + #[serde(rename = "C_vwl")] + pub cvwl: f64, + pub consonants: HashSet<String>, + #[serde(rename = "R_c")] + pub rc: HashSet<String>, + #[serde(rename = "R_v")] + pub rv: HashSet<String>, + pub similarity_matrix: HashMap<String, f64>, + pub salience: HashMap<String, f64>, + pub feature_matrix: HashMap<String, HashMap<String, String>>, +} + +const EXTRACTED_JSON: &str = include_str!("extract.json"); +pub static EXTRACTED: Lazy<Extracted> = Lazy::new(|| { + serde_json::from_str(EXTRACTED_JSON).unwrap() +}); + diff --git a/aline/src/extract.json b/aline/src/extract.json new file mode 100644 index 0000000..db9d2cf --- /dev/null +++ b/aline/src/extract.json @@ -0,0 +1,1009 @@ +{ + "C_skip": -10, + "C_sub": 35, + "C_exp": 45, + "C_vwl": 5, + "consonants": [ + "B", + "N", + "R", + "b", + "c", + "d", + "f", + "g", + "h", + "j", + "k", + "l", + "m", + "n", + "p", + "q", + "r", + "s", + "t", + "v", + "x", + "z", + "ç", + "ð", + "ħ", + "ŋ", + "ɖ", + "ɟ", + "ɢ", + "ɣ", + "ɦ", + "ɬ", + "ɮ", + "ɰ", + "ɱ", + "ɲ", + "ɳ", + "ɴ", + "ɸ", + "ɹ", + "ɻ", + "ɽ", + "ɾ", + "ʀ", + "ʁ", + "ʂ", + "ʃ", + "ʈ", + "ʋ", + "ʐ ", + "ʒ", + "ʔ", + "ʕ", + "ʙ", + "ʝ", + "β", + "θ", + "χ", + "ʐ", + "w" + ], + "R_c": [ + "aspirated", + "lateral", + "manner", + "nasal", + "place", + "retroflex", + "syllabic", + "voice" + ], + "R_v": [ + "back", + "lateral", + "long", + "manner", + "nasal", + "place", + "retroflex", + "round", + "syllabic", + "voice" + ], + "similarity_matrix": { + "bilabial": 1.0, + "labiodental": 0.95, + "dental": 0.9, + "alveolar": 0.85, + "retroflex": 0.8, + "palato-alveolar": 0.75, + "palatal": 0.7, + "velar": 0.6, + "uvular": 0.5, + "pharyngeal": 0.3, + "glottal": 0.1, + "labiovelar": 1.0, + "vowel": -1.0, + "stop": 1.0, + "affricate": 0.9, + "fricative": 0.85, + "trill": 0.7, + "tap": 0.65, + "approximant": 0.6, + "high vowel": 0.4, + "mid vowel": 0.2, + "low vowel": 0.0, + "vowel2": 0.5, + "high": 1.0, + "mid": 0.5, + "low": 0.0, + "front": 1.0, + "central": 0.5, + "back": 0.0, + "plus": 1.0, + "minus": 0.0 + }, + "salience": { + "syllabic": 5, + "place": 40, + "manner": 50, + "voice": 5, + "nasal": 20, + "retroflex": 10, + "lateral": 10, + "aspirated": 5, + "long": 0, + "high": 3, + "back": 2, + "round": 2 + }, + "feature_matrix": { + "p": { + "place": "bilabial", + "manner": "stop", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "b": { + "place": "bilabial", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "t": { + "place": "alveolar", + "manner": "stop", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "d": { + "place": "alveolar", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ʈ": { + "place": "retroflex", + "manner": "stop", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "plus", + "lateral": "minus", + "aspirated": "minus" + }, + "ɖ": { + "place": "retroflex", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "plus", + "lateral": "minus", + "aspirated": "minus" + }, + "c": { + "place": "palatal", + "manner": "stop", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ɟ": { + "place": "palatal", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "k": { + "place": "velar", + "manner": "stop", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "g": { + "place": "velar", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "q": { + "place": "uvular", + "manner": "stop", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ɢ": { + "place": "uvular", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ʔ": { + "place": "glottal", + "manner": "stop", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "m": { + "place": "bilabial", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "plus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ɱ": { + "place": "labiodental", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "plus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "n": { + "place": "alveolar", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "plus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ɳ": { + "place": "retroflex", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "plus", + "retroflex": "plus", + "lateral": "minus", + "aspirated": "minus" + }, + "ɲ": { + "place": "palatal", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "plus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ŋ": { + "place": "velar", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "plus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ɴ": { + "place": "uvular", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "plus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "N": { + "place": "uvular", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "plus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ʙ": { + "place": "bilabial", + "manner": "trill", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "B": { + "place": "bilabial", + "manner": "trill", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "r": { + "place": "alveolar", + "manner": "trill", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "plus", + "lateral": "minus", + "aspirated": "minus" + }, + "ʀ": { + "place": "uvular", + "manner": "trill", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "R": { + "place": "uvular", + "manner": "trill", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ɾ": { + "place": "alveolar", + "manner": "tap", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ɽ": { + "place": "retroflex", + "manner": "tap", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "plus", + "lateral": "minus", + "aspirated": "minus" + }, + "ɸ": { + "place": "bilabial", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "β": { + "place": "bilabial", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "f": { + "place": "labiodental", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "v": { + "place": "labiodental", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "θ": { + "place": "dental", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ð": { + "place": "dental", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "s": { + "place": "alveolar", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "z": { + "place": "alveolar", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ʃ": { + "place": "palato-alveolar", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ʒ": { + "place": "palato-alveolar", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ʂ": { + "place": "retroflex", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "plus", + "lateral": "minus", + "aspirated": "minus" + }, + "ʐ": { + "place": "retroflex", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "plus", + "lateral": "minus", + "aspirated": "minus" + }, + "ç": { + "place": "palatal", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ʝ": { + "place": "palatal", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "x": { + "place": "velar", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ɣ": { + "place": "velar", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "χ": { + "place": "uvular", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ʁ": { + "place": "uvular", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ħ": { + "place": "pharyngeal", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ʕ": { + "place": "pharyngeal", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "h": { + "place": "glottal", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ɦ": { + "place": "glottal", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ɬ": { + "place": "alveolar", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "plus", + "aspirated": "minus" + }, + "ɮ": { + "place": "alveolar", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "plus", + "aspirated": "minus" + }, + "ʋ": { + "place": "labiodental", + "manner": "approximant", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ɹ": { + "place": "alveolar", + "manner": "approximant", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ɻ": { + "place": "retroflex", + "manner": "approximant", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "plus", + "lateral": "minus", + "aspirated": "minus" + }, + "j": { + "place": "palatal", + "manner": "approximant", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "ɰ": { + "place": "velar", + "manner": "approximant", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "l": { + "place": "alveolar", + "manner": "approximant", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "plus", + "aspirated": "minus" + }, + "w": { + "place": "labiovelar", + "manner": "approximant", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus" + }, + "i": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "high", + "back": "front", + "round": "minus", + "long": "minus", + "aspirated": "minus" + }, + "y": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "high", + "back": "front", + "round": "plus", + "long": "minus", + "aspirated": "minus" + }, + "e": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "mid", + "back": "front", + "round": "minus", + "long": "minus", + "aspirated": "minus" + }, + "E": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "mid", + "back": "front", + "round": "minus", + "long": "plus", + "aspirated": "minus" + }, + "ø": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "mid", + "back": "front", + "round": "plus", + "long": "minus", + "aspirated": "minus" + }, + "ɛ": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "mid", + "back": "front", + "round": "minus", + "long": "minus", + "aspirated": "minus" + }, + "œ": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "mid", + "back": "front", + "round": "plus", + "long": "minus", + "aspirated": "minus" + }, + "æ": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "low", + "back": "front", + "round": "minus", + "long": "minus", + "aspirated": "minus" + }, + "a": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "low", + "back": "front", + "round": "minus", + "long": "minus", + "aspirated": "minus" + }, + "A": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "low", + "back": "front", + "round": "minus", + "long": "plus", + "aspirated": "minus" + }, + "ɨ": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "high", + "back": "central", + "round": "minus", + "long": "minus", + "aspirated": "minus" + }, + "ʉ": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "high", + "back": "central", + "round": "plus", + "long": "minus", + "aspirated": "minus" + }, + "ə": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "mid", + "back": "central", + "round": "minus", + "long": "minus", + "aspirated": "minus" + }, + "u": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "high", + "back": "back", + "round": "plus", + "long": "minus", + "aspirated": "minus" + }, + "U": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "high", + "back": "back", + "round": "plus", + "long": "plus", + "aspirated": "minus" + }, + "o": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "mid", + "back": "back", + "round": "plus", + "long": "minus", + "aspirated": "minus" + }, + "O": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "mid", + "back": "back", + "round": "plus", + "long": "plus", + "aspirated": "minus" + }, + "ɔ": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "mid", + "back": "back", + "round": "plus", + "long": "minus", + "aspirated": "minus" + }, + "ɒ": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "low", + "back": "back", + "round": "minus", + "long": "minus", + "aspirated": "minus" + }, + "I": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "high", + "back": "front", + "round": "minus", + "long": "plus", + "aspirated": "minus" + } + } +} diff --git a/aline/src/extract.py b/aline/src/extract.py new file mode 100755 index 0000000..b2faba5 --- /dev/null +++ b/aline/src/extract.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 + +from nltk.metrics.aline import * +import json + +extract = { + 'C_skip': C_skip, + 'C_sub': C_sub, + 'C_exp': C_exp, + 'C_vwl': C_vwl, + + 'consonants': consonants, + 'R_c': R_c, + 'R_v': R_v, + 'similarity_matrix': similarity_matrix, + 'salience': salience, + 'feature_matrix': feature_matrix, +} + +print(json.dumps(extract, indent=2, ensure_ascii=False)) diff --git a/aline/src/lib.rs b/aline/src/lib.rs new file mode 100644 index 0000000..0e44705 --- /dev/null +++ b/aline/src/lib.rs @@ -0,0 +1,224 @@ +// ALINE phonetic sequence alignment in Rust +// Port of NLTK's ALINE module (Greg Kondrak, 2002) + +use std::{collections::HashSet, f64}; + +use constants::EXTRACTED; +use unicode_segmentation::UnicodeSegmentation; +mod constants; + +pub fn align(str1: &str, str2: &str, epsilon: f64) -> Vec<Vec<(String, String)>> { + assert!( + (0.0..=1.0).contains(&epsilon), + "Epsilon must be between 0.0 and 1.0." + ); + + let str1_chars: Vec<&str> = str1.graphemes(true).collect(); + let str2_chars: Vec<&str> = str2.graphemes(true).collect(); + let m = str1_chars.len(); + let n = str2_chars.len(); + + // This includes Kondrak's initialization of row 0 and column 0 to all 0s. + let mut s = vec![vec![0.0; n + 1]; m + 1]; + for i in 1..=m { + for j in 1..=n { + let edit1 = s[i - 1][j] + sigma_skip(); + let edit2 = s[i][j - 1] + sigma_skip(); + + let edit3 = s[i - 1][j - 1] + sigma_sub(str1_chars[i - 1], str2_chars[j - 1]); + + let edit4 = if i > 1 { + s[i - 2][j - 1] + sigma_exp(str2_chars[j - 1], str1_chars[i - 2], str1_chars[i - 1]) + } else { + -f64::INFINITY + }; + + let edit5 = if j > 1 { + s[i - 1][j - 2] + sigma_exp(str1_chars[i - 1], str2_chars[j - 2], str2_chars[j - 1]) + } else { + -f64::INFINITY + }; + + s[i][j] = [edit1, edit2, edit3, edit4, edit5] + .iter() + .fold(0f64, |prev, curr| f64::max(prev, *curr)); + } + } + let amax = s + .iter() + .flat_map(|row| row.iter()) + .cloned() + .fold(f64::NAN, f64::max); + let t = (1.0 - epsilon) * amax; + + let mut aligns = Vec::new(); + for i in 1..=m { + for j in 1..=n { + if s[i][j] >= t { + let mut out = Vec::new(); + retrieve(i, j, 0.0, &s, t, &str1_chars, &str2_chars, &mut out); + aligns.push(out); + } + } + } + aligns +} + +/// Retrieve the path through the similarity matrix S starting at (i, j). +#[inline] +fn retrieve<'a>( + i: usize, + j: usize, + score: f64, + s: &Vec<Vec<f64>>, + t: f64, + str1: &[&str], + str2: &[&str], + out: &'a mut Vec<(String, String)>, +) -> &'a mut Vec<(String, String)> { + if s[i][j] == 0.0 { + return out; + } + + if j > 1 && (s[i - 1][j - 2] + sigma_exp(str1[i - 1], str2[j - 2], str2[j - 1]) + score) >= t { + // j > 1 and S[i - 1, j - 2] + sigma_exp(str1[i - 1], str2[j - 2 : j]) + s >= T + + let key = str2[j - 2..j].join(""); + out.insert(0, (str1[i - 1].to_string(), key)); + + retrieve( + i - 1, + j - 2, + score + sigma_exp(str1[i - 1], str2[j - 2], str2[j - 1]), + s, + t, + str1, + str2, + out, + ); + } else if i > 1 + && (s[i - 2][j - 1] + sigma_exp(str2[j - 1], str1[i - 2], str1[i - 1]) + score) >= t + { + // i > 1 and S[i - 2, j - 1] + sigma_exp(str2[j - 1], str1[i - 2 : i]) + s >= T + let key = str1[i - 2..i].join(""); + out.insert(0, (key, str2[j - 1].to_string())); + + retrieve( + i - 2, + j - 1, + score + sigma_exp(str2[j - 1], str1[i - 2], str1[i - 1]), + s, + t, + str1, + str2, + out, + ); + } else if (s[i][j - 1] + sigma_skip() + score) >= t { + // S[i, j - 1] + sigma_skip(str2[j - 1]) + s >= T + + out.insert(0, ("-".to_string(), str2[j - 1].to_string())); + retrieve(i, j - 1, score + sigma_skip(), s, t, str1, str2, out); + } else if (s[i - 1][j] + sigma_skip() + score) >= t { + // S[i - 1, j] + sigma_skip(str1[i - 1]) + s >= T + + out.insert(0, (str1[i - 1].to_string(), "-".to_string())); + retrieve(i - 1, j, score + sigma_skip(), s, t, str1, str2, out); + } else if (s[i - 1][j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + score) >= t { + // S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + s >= T + out.insert(0, (str1[i - 1].to_string(), str2[j - 1].to_string())); + + retrieve( + i - 1, + j - 1, + score + sigma_sub(str1[i - 1], str2[j - 1]), + s, + t, + str1, + str2, + out, + ); + } + + return out; +} + +/// Returns score of an indel of P. +/// +/// (Kondrak 2002: 54) +#[inline] +fn sigma_skip() -> f64 { + EXTRACTED.cskip +} + +/// Returns score of a substitution of P with Q. +/// +/// (Kondrak 2002: 54) +#[inline] +fn sigma_sub(p: &str, q: &str) -> f64 { + EXTRACTED.csub - delta(p, q) - v(p) - v(q) +} + +/// Returns score of an expansion/compression. +/// +/// (Kondrak 2002: 54) +#[inline] +fn sigma_exp(p: &str, q1: &str, q2: &str) -> f64 { + EXTRACTED.cexp - delta(p, q1) - delta(p, q2) - v(p) - f64::max(v(q1), v(q2)) +} + +/// Return weighted sum of difference between P and Q. +/// +/// (Kondrak 2002: 54) +#[inline] +fn delta(p: &str, q: &str) -> f64 { + let features = r(p, q); + features + .iter() + .map(|f| diff(p, q, f) * *EXTRACTED.salience.get(f).unwrap_or_else(|| unreachable!())) + .sum() +} + +/// Returns difference between phonetic segments P and Q for feature F. +/// +/// (Kondrak 2002: 52, 54) +#[inline] +fn diff(p: &str, q: &str, f: &str) -> f64 { + let p_features = &EXTRACTED.feature_matrix[&p.to_string()][f]; + let q_features = &EXTRACTED.feature_matrix[&q.to_string()][f]; + let p_similarity = *EXTRACTED + .similarity_matrix + .get(p_features) + .unwrap_or_else(|| unreachable!()); + let q_similarity = *EXTRACTED + .similarity_matrix + .get(q_features) + .unwrap_or_else(|| unreachable!()); + (p_similarity - q_similarity).abs() +} + +/// Return relevant features for segment comparison. +/// +/// (Kondrak 2002: 54) +#[inline] +fn r<'a>(p: &str, q: &str) -> &'static HashSet<String> { + if EXTRACTED.consonants.contains(&p.to_string()) + || EXTRACTED.consonants.contains(&q.to_string()) + { + &EXTRACTED.rc + } else { + &EXTRACTED.rv + } +} + +/// Return vowel weight if P is vowel. +/// +/// (Kondrak 2002: 54) +#[inline] +fn v(p: &str) -> f64 { + EXTRACTED + .consonants + .get(&p.to_string()) + .map(|_| EXTRACTED.cvwl) + .or(Some(0f64)) + .unwrap_or_else(|| unreachable!()) +} |
