more work on phonetics

2021-05-27 11:06:30 +04:30 · 2021-05-27 11:06:30 +04:30 · aad1b34e17
parent 0ff0548775
commit aad1b34e17
4 changed files with 160 additions and 81 deletions
--- a/src/lib/diacritics-helpers.test.ts
+++ b/src/lib/diacritics-helpers.test.ts
@@ -6,8 +6,7 @@ import {
    advanceP,
    reverseP,
    overwriteP,
-    advanceForAin,
-    advanceForAinOrHamza,
+    advanceForHamza,
    advanceForHamzaMid,
 } from "./diacritics-helpers";

@@ -23,6 +22,10 @@ const phonemeSplits: Array<{
        in: "raaghey",
        out: ["r", "aa", "gh", "ey"],
    },
+    {
+        in: "ist'imaal",
+        out: ["i", "s", "t", "'", "i", "m", "aa", "l"],
+    },
    {
        in: "hatsa",
        out: ["h", "a", "ts", "a"],
--- a/src/lib/diacritics-helpers.ts
+++ b/src/lib/diacritics-helpers.ts
@@ -28,6 +28,7 @@ type PhonemeInfo = {
    longVowel?: true,
    canStartWithAynBefore?: true,
    useEndingDiacritic?: true,
+    ainBlendDiacritic?: string,
 }

 export const zwar = "َ";
@@ -188,13 +189,15 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
        beginningMatches: ["آ", "ا"],
        endingMatches: ["ا", "یٰ"],
        longVowel: true,
+        ainBlendDiacritic: zwar,
    },
    "ee": {
        matches: ["ی"],
        longVowel: true,
        endingMatches: ["ي"],
        diacritic: zer,
-        canStartWithAynBefore: true
+        canStartWithAynBefore: true,
+        ainBlendDiacritic: zer,
    },
    "e": {
        matches: ["ې"],
@@ -210,6 +213,7 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
        // alsoCanBePrefix: true,
        diacritic: pesh,
        useEndingDiacritic: true,
+        ainBlendDiacritic: pesh,
    },
    "ey": {
        matches: ["ی"],
@@ -262,13 +266,13 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
 * @returns an array of phonemes
 */
 export function splitFIntoPhonemes(fIn: string): Phoneme[] {
-    const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y"];
+    const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y", "'"];
    
    const quadrigraphs: Phoneme[] = ["-Ul-"];
    const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"];
    const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"];
    const endingDigraphs: Phoneme[] = ["uy"];
-    const willIgnore = ["?", " ", "`", ".", "…", ",", "'"];
+    const willIgnore = ["?", " ", "`", ".", "…", ","];
    
    const result: Phoneme[] = [];
    const f = removeAccents(fIn);
@@ -372,10 +376,10 @@ export function getCurrentNext(state: DiacriticsAccumulator): { current: string,
    };
 }

-export function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator {
-    const { current } = getCurrentNext(state);
-    return (current === "ع") ? advanceP(state) : state;
-}
+// export function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator {
+//     const { current } = getCurrentNext(state);
+//     return (current === "ع") ? advanceP(state) : state;
+// }

 export function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccumulator {
    const { current, next } = getCurrentNext(state);
@@ -385,14 +389,14 @@ export function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccu
    return state;
 }

-export function advanceForAinOrHamza(state: DiacriticsAccumulator): DiacriticsAccumulator {
+export function advanceForHamza(state: DiacriticsAccumulator): DiacriticsAccumulator {
    const { current, next } = getCurrentNext(state);
    if (current === "ه" && (!next || next === " ")) {
        return advanceP(state);
    }
-    if (current === "ع") {
-        return advanceP(state);
-    }
+    // if (current === "ع") {
+    //     return advanceP(state);
+    // }
    return state;
 }

--- a/src/lib/diacritics.test.ts
+++ b/src/lib/diacritics.test.ts
@@ -485,34 +485,115 @@ const diacriticsSections: {
        tests: [
            {
                in: {
-                    p: "اعتصاب شکن",
-                    f: "itisaab shakan",
+                    p: "بعد",
+                    f: "ba'd",
                },
-                out: "اِعتِصاب شَکَن",
-            },
-            // starting with ع
-            {
-                in: {
-                    p: "عزت",
-                    f: "izzat",
-                },
-                out: "عِزَّت",
+                out: "بَعْد",
            },
            {
                in: {
-                    p: "عزت",
-                    f: "i'zzat",
+                    p: "بعد",
+                    f: "b'ad",
                },
-                out: "عِزَّت",
+                out: "بْعَد",
+            },
+            {
+                in: {
+                    p: "بعد",
+                    f: "ba'ad",
+                },
+                out: "بَعَد",
+            },
+            {
+                in: {
+                    p: "بعد",
+                    f: "baad",
+                },
+                out: "بَعَد",
+            },
+            {
+                in: {
+                    p: "بعد",
+                    f: "bad",
+                },
+                // TODO: Should this really be an error?
+                out: null,
+            },
+            {
+                in: {
+                    p: "معلوم",
+                    f: "maaloom",
+                },
+                out: "مَعَلُوم",
+            },
+            {
+                in: {
+                    p: "منبع",
+                    f: "manbi'",
+                },
+                out: "مَنْبِع",
+            },
+            {
+                in: {
+                    p: "منبع",
+                    f: "manb'i",
+                },
+                out: "مَنْبْعِ"
+            },
+            {
+                in: {
+                    p: "منبع",
+                    f: "manbee",
+                },
+                out: "مَنْبِعِ",
            },
-            // middle ع
            {
                in: {
                    p: "معنا",
-                    f: "ma'anaa",
+                    f: "ma'náa",
+                },
+                out: "مَعْنا",
+            },
+            {
+                in: {
+                    p: "معنا",
+                    f: "maanáa",
                },
                out: "مَعَنا",
            },
+            // TODO: Should be allowed to use a short vowel as well 
+            // طمع - tama // استعمال - istimaal
+            // TODO: Starting like عام اعتصاب etc.
+            // {
+            //     in: {
+            //         p: "اعتصاب شکن",
+            //         f: "itisaab shakan",
+            //     },
+            //     out: "اِعتِصاب شَکَن",
+            // },
+            // // starting with ع
+            // {
+            //     in: {
+            //         p: "عزت",
+            //         f: "izzat",
+            //     },
+            //     out: "عِزَّت",
+            // },
+            // {
+            //     in: {
+            //         p: "عزت",
+            //         f: "i'zzat",
+            //     },
+            //     out: "عِزَّت",
+            // },
+            // // middle ع
+            // {
+            //     in: {
+            //         p: "معنا",
+            //         f: "ma'anaa",
+            //     },
+            //     out: "مَعَنا",
+            // },
            // ending with ayn
            // {
            //     in: {
@@ -683,34 +764,34 @@ diacriticsSections.forEach((section) => {

 // ERRORS

-// const brokenDiacritics = [
-//     {
-//         p: "تشناب",
-//         f: "peshnaab",
-//     },
-//     {
-//         p: "وسېدل",
-//         f: "osedul",
-//     },
-// ];
+const brokenDiacritics = [
+    {
+        p: "تشناب",
+        f: "peshnaab",
+    },
+    {
+        p: "وسېدل",
+        f: "osedul",
+    },
+];

-// test("ending with left over Pashto script will throw an error", () => {
-//     expect(() => {
-//         addDiacritics({ p: "کور ته", f: "kor" });
-//     }).toThrow(`phonetics error - phonetics shorter than pashto script`);
-// });
+test("ending with left over Pashto script will throw an error", () => {
+    expect(() => {
+        addDiacritics({ p: "کور ته", f: "kor" });
+    }).toThrow(`phonetics error - phonetics shorter than pashto script`);
+});

-// test("ending with left over phonetics will throw an error", () => {
-//     expect(() => {
-//         addDiacritics({ p: "کار", f: "kaar kawul" });
-//     }).toThrow();
-// });
+test("ending with left over phonetics will throw an error", () => {
+    expect(() => {
+        addDiacritics({ p: "کار", f: "kaar kawul" });
+    }).toThrow();
+});

-// test("adding diacritics errors when phonetecs and pashto do not line up", () => {
-//     brokenDiacritics.forEach((t) => {
-//         expect(() => {
-//             addDiacritics(t);
-//         }).toThrow();
-//     });
-// });
+test("adding diacritics errors when phonetecs and pashto do not line up", () => {
+    brokenDiacritics.forEach((t) => {
+        expect(() => {
+            addDiacritics(t);
+        }).toThrow();
+    });
+});

--- a/src/lib/diacritics.ts
+++ b/src/lib/diacritics.ts
@@ -27,8 +27,7 @@ import {
    advanceP,
    reverseP,
    overwriteP,
-    advanceForAin,
-    advanceForAinOrHamza,
+    advanceForHamza,
    advanceForHamzaMid,
    DiacriticsAccumulator,
 } from "./diacritics-helpers";
@@ -61,14 +60,13 @@ enum PhonemeStatus {
    DirectMatchAfterSukun,
    EndingWithHeyHimFromSukun,
    ShortVowel,
-    ShortVowelBeforeAin,
-    ShortVowelAfterAin,
    PersianSilentWWithAa,
    ArabicWasla,
    Izafe,
    EndOfDuParticle,
    HaEndingWithHeem,
    AlefDaggarEnding,
+    LongAinVowelMissingComma,
 }

 function processPhoneme(
@@ -112,7 +110,6 @@ function processPhoneme(
            pipe(
                advanceP,
                addP(diacritic),
-                advanceForAin,
            )(state)
        : (phs === PhonemeStatus.DoubleConsonantTashdeed) ?
            pipe(
@@ -171,25 +168,19 @@ function processPhoneme(
                advanceP,
                advanceP,
            )(state)
-        : (phs === PhonemeStatus.ShortVowelBeforeAin) ?
+        : (phs === PhonemeStatus.LongAinVowelMissingComma) ?
            pipe(
-                // this is pretty messed up because for some reason the reverseP goes back one more step when it's an ain before it
-                reverseP,
-                advanceP,
                addP(diacritic),
-                // overwriteP(diacritic || ""),
-            )(state)
-        : (phs === PhonemeStatus.ShortVowelAfterAin) ?
-            pipe(
                advanceP,
-                addP(diacritic),
+                addP(diacritic)
            )(state)
        :
        // phs === PhonemeState.ShortVowel
            pipe(
                advanceForHamzaMid,
                addP(phonemeInfo.diacritic),
-                advanceForAinOrHamza,
+                // TODO THIS?
+                advanceForHamza,
            )(state);
 }

@@ -214,7 +205,11 @@ function stateInfo({ state, i, phonemes, phoneme }: {
    const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
    const needsTashdeed = !isBeginningOfWord && doubleConsonant && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter);
    const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter));
-    const diacritic = isEndOfWord ? ((!phonemeInfo.longVowel || phonemeInfo.useEndingDiacritic) ? phonemeInfo.diacritic : undefined) : phonemeInfo.diacritic;
+    const useAinBlendDiacritics = (!isBeginningOfWord && (phonemeInfo.ainBlendDiacritic && currentPLetter === "ع"));
+    const diacritic = useAinBlendDiacritics
+        ? phonemeInfo.ainBlendDiacritic
+        : isEndOfWord 
+        ? ((!phonemeInfo.longVowel || phonemeInfo.useEndingDiacritic) ? phonemeInfo.diacritic : undefined) : phonemeInfo.diacritic;

    function getPhonemeState(): PhonemeStatus {
        if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) {
@@ -243,6 +238,9 @@ function stateInfo({ state, i, phonemes, phoneme }: {
        if (phoneme === "-i-" && isBeginningOfWord) {
            return PhonemeStatus.Izafe;
        } 
+        if (useAinBlendDiacritics) {
+            return PhonemeStatus.LongAinVowelMissingComma;
+        }
        if (needsTashdeed) {
            return PhonemeStatus.DoubleConsonantTashdeed;
        }
@@ -258,15 +256,8 @@ function stateInfo({ state, i, phonemes, phoneme }: {
        if ((phonemeInfo.matches?.includes(currentPLetter) || (isEndOfWord && phonemeInfo.endingMatches?.includes(currentPLetter)) || (phoneme === "m" && currentPLetter === "ن" && nextPLetter === "ب"))) {
            return needsSukun ? PhonemeStatus.DirectMatchAfterSukun : PhonemeStatus.DirectMatch;
        }
-        if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {
-            // weird ayn behaviour because it automatically advances and ignores it at the beginning of the process
-            // console.log("looking prev", prevPLetter);
-            // console.log("looking next", currentPLetter);   
-            return prevPLetter === "ع" 
-                ? PhonemeStatus.ShortVowelBeforeAin
-                : currentPLetter === "ع"
-                ? PhonemeStatus.ShortVowelAfterAin
-                : PhonemeStatus.ShortVowel;
+        if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {  
+            return PhonemeStatus.ShortVowel;
        }
        // console.log("bad phoneme is ", phoneme);
        throw new Error("phonetics error - no status found for phoneme: " + phoneme);