autofeur-nova/src/preprocess.mts


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88

import { writeFile, writeFileSync } from "fs";
import { request } from "undici";

/**
 * Requests the transcription of `grapheme` from the HTTP service listening on
 * `http://localhost:5000`.
 *
 * The text is URL-encoded and passed in the `grapheme` query parameter; the
 * response body is returned as-is.
 *
 * @param grapheme - Text to send to the service.
 * @returns The response body decoded as text.
 * @throws Error when the service answers with a non-2xx status. undici's
 *   `request` does not reject on HTTP error statuses by itself, so without
 *   this check an error page would be recorded as if it were a phoneme.
 */
const phonemize = async (grapheme: string): Promise<string> => {
  const { statusCode, body } = await request(
    `http://localhost:5000?grapheme=${encodeURIComponent(grapheme)}`
  );

  // Always read the body, even on failure, so the response is fully consumed.
  const text = await body.text();

  if (statusCode < 200 || statusCode >= 300) {
    throw new Error(
      `phonemizer returned HTTP ${statusCode} for ${JSON.stringify(grapheme)}: ${text}`
    );
  }

  return text;
};

/**
 * Shape of one output record: the source word, the phonemizer response for
 * the whole word, and a map from each word suffix to its phonemizer response.
 */
interface PhonemeEntry {
  word: string;
  phoneme: string;
  partials: Record<string, string>;
}

// Records accumulated by the script; serialized to ./data.json at the end.
let jsonData: Array<PhonemeEntry> = [];

// Seed list of words and short phrases fed to the phonemizer. Each entry is
// phonemized as a whole and again for every suffix of length >= 3.
// NOTE(review): entries look French, and some are non-standard spellings
// (e.g. "uifi", "ouisky", "hein deux") — presumably deliberate phonetic
// variants; confirm before "correcting" any of them.
let words: string[] = [
  "ta mere",
  "tapis",
  "taper",
  "tare",
  "tabasser",
  "tabouret",
  "rigole",
  "amène",
  "atchoum",
  "abracadabra",
  "abeille",
  "alibaba",
  "arnaque",
  "maison",
  "nombril",
  "lapin",
  "ouistiti",
  "wifi",
  "uifi",
  "ouisky",
  "uisky",
  "renard",
  "requin",
  "repas",
  "retard",
  "coiffeur",
  "coiffeuse",
  "kirikou",
  "kiri",
  "western",
  "un deux",
  "hein deux",
  "deu trois",
  "yoplait",
  "avalanche",
  "moisissure",
  "moisson",
  "moineau",
  "école",
  "commentaire",
  "quantificateur",
  "commandant",
  "claire chazal",
  "tornade",
  "bottes",
  "bonsoir pariiiss",
  "courtois",
  "facteur",
  "gérard",
  "quoidrilatère",
  "pepe",
  "surfeur",
  "toilettes",
  "lebron james",
  "c'est de la merde"
];

// Entry point: phonemize every word and each of its suffixes of length >= 3,
// then write all collected records to ./data.json.
(async () => {
  for (const word of words) {
    const phoneme = await phonemize(word);
    const partials: Record<string, string> = {};

    // Requests are awaited one at a time, so the phonemizer only ever has a
    // single request in flight. Words shorter than 3 characters get no partials.
    for (let i = 3; i <= word.length; i++) {
      // The last `i` characters of the word (not of the phoneme); when
      // i === word.length this is the whole word.
      const suffix = word.slice(-i);
      partials[suffix] = await phonemize(suffix);
    }

    jsonData.push({ phoneme, word, partials });
  }

  // Only reached when every request succeeded, so a partial data set is never
  // written.
  writeFileSync("./data.json", JSON.stringify(jsonData));
})().catch((err: unknown) => {
  // Handle the rejection explicitly instead of leaving the promise floating:
  // report the failure and make the process exit with a non-zero status.
  console.error("preprocess failed:", err);
  process.exitCode = 1;
});