starting to work on new diacritics module

2021-05-06 23:28:03 +03:00 · 2021-05-06 23:28:03 +03:00 · 7b0e6d864f
parent 8feb614238
commit 7b0e6d864f
2 changed files with 528 additions and 0 deletions
--- a/src/lib/diacritics.test.ts
+++ b/src/lib/diacritics.test.ts
@ -0,0 +1,178 @@
+/**
+ * Copyright (c) 2021 lingdocs.com
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ */
+
+import {
+    addDiacritics,
+    splitFIntoPhonemes,
+} from "./diacritics";
+import * as T from "../types";
+
+const phonemeSplits: Array<{
+    in: string,
+    out: string[],
+}> = [
+    {
+        in: "kor",
+        out: ["k", "o", "r"],
+    },
+    {
+        in: "raaghey",
+        out: ["r", "aa", "gh", "ey"],
+    },
+    {
+        in: "hatsa",
+        out: ["h", "a", "ts", "a"],
+    },
+    {
+        in: "ba",
+        out: ["b", "a"],
+    },
+    {
+        in: "peydáa",
+        out: ["p", "ey", "d", "aa"],
+    },
+    {
+        in: "be kaar",
+        out: ["b", "e", "k", "aa", "r"],
+    },
+    {
+        in: "raadzeyy",
+        out: ["r", "aa", "dz", "eyy"],
+    },
+    {
+        in: "badanuy ??",
+        out: ["b", "a", "d", "a", "n", "uy"],
+    },
+    {
+        in: "tur ... pore",
+        out: ["t", "u", "r", "p", "o", "r", "e"],
+    },
+    {
+        in: "daar-Ul-iqaama",
+        out: ["d", "aa", "r", "-Ul-", "i", "q", "aa", "m", "a"],
+    },
+];
+
+const badPhonetics: Array<{
+    in: string,
+    problem: string,
+}> = [
+    {
+        in: "acar",
+        problem: "c",
+    },
+    {
+        in: "a7am",
+        problem: "7",
+    },
+];
+
+const diacriticsTest: Array<{
+    in: T.PsString,
+    out: string,
+}> = [
+    {
+        in: {
+            p: "کور",
+            f: "kor",
+        },
+        out: "کور",
+    },
+    {
+        in: {
+            p: "کور",
+            f: "koor",
+        },
+        out: "کُور",
+    },
+    {
+        in: {
+            p: "تب",
+            f: "tib",
+        },
+        out: "تِب",
+    },
+    {
+        in: {
+            p: "تب",
+            f: "tab",
+        },
+        out: "تَب",
+    },
+    {
+        in: {
+            p: "تب",
+            f: "tUb",
+        },
+        out: "تُب",
+    },
+    {
+        in: {
+            p: "تب",
+            f: "tub",
+        },
+        out: "تٙب",
+    },
+    {
+        in: {
+            p: "تب",
+            f: "tb",
+        },
+        out: "تْب",
+    },
+    {
+        in: {
+            p: "تلب",
+            f: "tilab",
+        },
+        out: "تِلَب",
+    },
+    {
+        in: {
+            p: "تشناب",
+            f: "tashnaab",
+        },
+        out: "تَشْناب",
+    },
+];
+
+const brokenDiacritics = [
+    {
+        p: "تشناب",
+        f: "peshnaab",
+    },
+];
+
+phonemeSplits.forEach((s) => {
+    test(`${s.in} should split properly`, () => {
+        const result = splitFIntoPhonemes(s.in);
+        expect(result).toEqual(s.out);
+    });
+});
+
+test("bad phonetic characters should throw an error", () => {
+    badPhonetics.forEach((s) => {
+        expect(() => {
+            splitFIntoPhonemes(s.in);
+        }).toThrow(`illegal phonetic character: ${s.problem}`);
+    });
+});
+
+test("adding diacritics should work", () => {
+    diacriticsTest.forEach((t) => {
+        expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f });
+    });
+});
+
+test("adding diacritics errors when phonetecs and pashto do not line up", () => {
+    brokenDiacritics.forEach((t) => {
+        expect(() => {
+            addDiacritics(t);
+        }).toThrow();
+    });
+});
--- a/src/lib/diacritics.ts
+++ b/src/lib/diacritics.ts
@ -0,0 +1,350 @@
+/**
+ * Copyright (c) 2021 lingdocs.com
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ */
+
+import * as T from "../types";
+import { removeAccents } from "./accent-helpers";
+import { firstPhonetics } from "./p-text-helpers";
+
+const zwar = "َ";
+const zwarakey = "ٙ";
+const zer = "ِ";
+const pesh = "ُ";
+const sukun = "ْ";
+const hamzaAbove = "ٔ";
+const tashdeed = "ّ";
+const wasla = "ٱ";
+const daggerAlif = "ٰ";
+const fathahan = "ً";
+
+type Consonant = "b" | "p" | "t" | "T" | "s" | "j" | "ch" | "kh" | "ts" | "dz" | "d" | "D" | "r" | "R" | "z" | "jz" | "G" | "sh" | "x" | "gh" | "f" | "q" | "k" | "g" | "l" | "m" | "n" | "N" | "h" | "w" | "y";
+type Ain = "'"
+type JoiningVowel = "-i-" | "-U-" | "-Ul-"; 
+type LongVowel = "aa" | "ee" | "e" | "oo" | "o" | "ey" | "uy" | "eyy";
+type ShortVowel = "a" | "i" | "u" | "U";
+type Phoneme = Consonant | Ain | LongVowel | ShortVowel | JoiningVowel;
+
+type PhonemeInfo = {
+    matches?: string[],
+    beginningMatches?: string[],
+    endingMatches?: string[],
+    consonant?: true,
+    diacritic?: string,
+    endingOnly?: true,
+    takesSukunOnEnding?: true,
+    addAlefOnBeginning?: true,
+    canStartWithAynBefore?: true,
+}
+
+const phonemeTable: Record<Phoneme, PhonemeInfo> = {
+    // Consonants
+    "b": {
+        matches: ["ب"],
+        consonant: true,
+    },
+    "p": {
+        matches: ["پ"],
+        consonant: true,
+    },
+    "t": {
+        matches: ["ت", "ط"],
+        consonant: true,
+    },
+    "T": {
+        matches: ["ټ"],
+        consonant: true,
+    },
+    "s": {
+        matches: ["س", "ص", "ث"],
+        consonant: true,
+    },
+    "j": {
+        matches: ["ج"],
+        consonant: true,
+    },
+    "ch": {
+        matches: ["چ"],
+        consonant: true,
+    },
+    "kh": {
+        matches: ["خ"],
+        consonant: true,
+    },
+    "ts": {
+        matches: ["څ"],
+        consonant: true,
+    },
+    "dz": {
+        matches: ["ځ"],
+        consonant: true,
+    },
+    "d": {
+        matches: ["د"],
+        consonant: true,
+    },
+    "D": {
+        matches: ["ډ"],
+        consonant: true,
+    },
+    "r": {
+        matches: ["ر"],
+        consonant: true,
+    },
+    "R": {
+        matches: ["ړ"],
+        consonant: true,
+    },
+    "z": {
+        matches: ["ز", "ذ", "ظ", "ض"],
+        consonant: true,
+    },
+    "jz": {
+        matches: ["ژ"],
+        consonant: true,
+    },
+    "G": {
+        matches: ["ږ"],
+        consonant: true,
+    },
+    "sh": {
+        matches: ["ش"],
+        consonant: true,
+    },
+    "x": {
+        matches: ["ښ"],
+        consonant: true,
+    },
+    "gh": {
+        matches: ["غ"],
+        consonant: true,
+    },
+    "f": {
+        matches: ["ف"],
+        consonant: true,
+    },
+    "q": {
+        matches: ["ق"],
+        consonant: true,
+    },
+    "k": {
+        matches: ["ک"],
+        consonant: true,
+    },
+    "g": {
+        matches: ["ګ"],
+        consonant: true,
+    },
+    "l": {
+        matches: ["ل"],
+        consonant: true,
+    },
+    "m": {
+        matches: ["م"],
+        consonant: true,
+    },
+    "n": {
+        matches: ["ن"],
+        consonant: true,
+    },
+    "N": {
+        matches: ["ڼ"],
+        consonant: true,
+    },
+    "h": {
+        matches: ["ه", "ح"],
+        consonant: true,
+        takesSukunOnEnding: true,
+    },
+    "w": {
+        matches: ["و"],
+        consonant: true,
+    },
+    "y": {
+        matches: ["ی"],
+        consonant: true,
+    },
+    // Ain
+    "'": {
+        matches: ["ع", "ئ"],
+        consonant: true,
+    },
+    // Joining Vowels
+    "-i-": {
+    },
+    "-U-": {
+        matches: [" و ", "و"],
+    },
+    "-Ul-": {
+        matches: ["ال"],
+    },
+    // Long Vowels
+    "aa": {
+        matches: ["ا"],
+        beginningMatches: ["آ", "ا"],
+        endingMatches: ["ا", "یٰ"],
+    },
+    "ee": {
+        matches: ["ی"],
+        addAlefOnBeginning: true,
+        endingMatches: ["ي"],
+        diacritic: zer,
+        canStartWithAynBefore: true
+    },
+    "e": {
+        matches: ["ې"],
+        addAlefOnBeginning: true,
+    },
+    "o": {
+        matches: ["و"],
+        addAlefOnBeginning: true,
+    },
+    "oo": {
+        matches: ["و"],
+        addAlefOnBeginning: true,
+        // alsoCanBePrefix: true,
+        diacritic: pesh,
+    },
+    "ey": {
+        matches: ["ی"],
+        addAlefOnBeginning: true,
+        endingMatches: ["ی"],
+    },
+    "uy": {
+        matches: ["ۍ"],
+        endingOnly: true,
+    },
+    "eyy": {
+        matches: ["ئ"],
+        endingOnly: true,
+    },
+    // Short Vowels
+    "a": {
+        diacritic: zwar,
+        endingMatches: ["ه"],
+        // canComeAfterHeyEnding: true,
+        // canBeFirstPartOfFathahanEnding: true,
+    },
+    "u": {
+        diacritic: zwarakey,
+        endingMatches: ["ه"],
+        // hamzaOnEnd: true,
+    },
+    "i": {
+        diacritic: zer,
+        endingMatches: ["ه"],
+        beginningMatches: ["ا", "ع"],
+        // takesDiacriticBeforeGurdaHeyEnding: true,
+        // canBeWasla: true,
+    },
+    "U": {
+        diacritic: pesh,
+        endingMatches: ["ه"],
+        // takesDiacriticBeforeGurdaHeyEnding: true,
+        beginningMatches: ["ا", "ع"],
+    },
+}
+
+
+export function splitFIntoPhonemes(fIn: string): Phoneme[] {
+    const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y"];
+    
+    const quadrigraphs: Phoneme[] = ["-Ul-"];
+    const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"];
+    const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"];
+    const endingDigraphs: Phoneme[] = ["uy"];
+    const willIgnore = ["?", " ", "`", ".", "…", ","];
+    
+    const result: Phoneme[] = [];
+    const f = removeAccents(fIn);
+    let index = 0;
+    while (index < f.length) {
+        const isLastTwoLetters = (index === f.length - 2 || f[index + 2] === " ");
+        const threeLetterChunk = f.slice(index, index + 3) as Phoneme;
+        const fourLetterChunk = f.slice(index, index + 4) as Phoneme;
+        if (quadrigraphs.includes(fourLetterChunk)) {
+            result.push(fourLetterChunk);
+            index += 4;
+            continue;
+        }
+        if (trigraphs.includes(threeLetterChunk)) {
+            result.push(threeLetterChunk);
+            index += 3;
+            continue;
+        }
+        const twoLetterChunk = f.slice(index, index + 2) as Phoneme;
+        if (
+            digraphs.includes(twoLetterChunk) ||
+            (isLastTwoLetters && endingDigraphs.includes(twoLetterChunk))
+        ) {
+            result.push(twoLetterChunk);
+            index += 2;
+            continue;
+        }
+        const singleLetter = f.slice(index, index + 1) as Phoneme;
+        if (!willIgnore.includes(singleLetter)) {
+            if (!singleLetterPhonemes.includes(singleLetter)) {
+                throw new Error(`illegal phonetic character: ${singleLetter}`);
+            }
+            result.push(singleLetter);
+        }
+        index++;
+    }
+    return result;
+}
+
+/**
+ * Adds phonetis to a given PsString.
+ * Errors if the phonetics and script don't line up.
+ * 
+ * @param ps a PsSTring without phonetics
+ */
+export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsString {
+    // TODO: 
+    const phonemes: Phoneme[] = splitFIntoPhonemes(!ignoreCommas ? firstPhonetics(f) : f);
+
+    const { pOut } = phonemes.reduce((acc, phoneme, i) => {
+        const isBeginningOfWord = acc.pOut === "" || last(acc.pOut) === " ";
+        const phonemeInfo = phonemeTable[phoneme];
+        const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]];
+        const currentPLetter = acc.pIn[0];
+        const needsSukun = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
+
+        if (phonemeInfo.matches?.includes(currentPLetter)) {
+            // TODO: Check if tashdeed or sukun is used
+            // const needsSukun = is consonant + previous phoneme was consonant + not beginning of word
+            return {
+                pOut: acc.pOut
+                    + (needsSukun ? sukun : phonemeInfo.diacritic ? phonemeInfo.diacritic : "")
+                    + currentPLetter,
+                pIn: acc.pIn.slice(1),
+            };
+        }
+
+        if (phonemeInfo.diacritic) {
+            return {
+                pOut: acc.pOut + phonemeInfo.diacritic,
+                pIn: acc.pIn,
+            }
+        }
+
+        throw new Error("phonetics error");
+    }, { pOut: "", pIn: p });
+
+    return {
+        p: pOut,
+        f,
+    };
+}
+
+/**
+ * returns the last character of a string
+ * 
+ * @param s 
+ */
+function last(s: string) {
+    return s[s.length - 1];
+}