more work on phonetics

2021-05-27 11:06:30 +04:30 · 2021-05-27 11:06:30 +04:30 · aad1b34e17
parent 0ff0548775
commit aad1b34e17
4 changed files with 160 additions and 81 deletions
--- a/src/lib/diacritics-helpers.test.ts
+++ b/src/lib/diacritics-helpers.test.ts
@ -6,8 +6,7 @@ import {
    advanceP,
    reverseP,
    overwriteP,
-    advanceForAin,
+    advanceForHamza,
    advanceForAinOrHamza,
    advanceForHamzaMid,
 } from "./diacritics-helpers";
@ -23,6 +22,10 @@ const phonemeSplits: Array<{
        in: "raaghey",
        out: ["r", "aa", "gh", "ey"],
    },
    {
        in: "ist'imaal",
        out: ["i", "s", "t", "'", "i", "m", "aa", "l"],
    },
    {
        in: "hatsa",
        out: ["h", "a", "ts", "a"],
--- a/src/lib/diacritics-helpers.ts
+++ b/src/lib/diacritics-helpers.ts
@ -28,6 +28,7 @@ type PhonemeInfo = {
    longVowel?: true,
    canStartWithAynBefore?: true,
    useEndingDiacritic?: true,
    ainBlendDiacritic?: string,
 }
 export const zwar = "َ";
@ -188,13 +189,15 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
        beginningMatches: ["آ", "ا"],
        endingMatches: ["ا", "یٰ"],
        longVowel: true,
        ainBlendDiacritic: zwar,
    },
    "ee": {
        matches: ["ی"],
        longVowel: true,
        endingMatches: ["ي"],
        diacritic: zer,
-        canStartWithAynBefore: true
+        canStartWithAynBefore: true,
        ainBlendDiacritic: zer,
    },
    "e": {
        matches: ["ې"],
@ -210,6 +213,7 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
        // alsoCanBePrefix: true,
        diacritic: pesh,
        useEndingDiacritic: true,
        ainBlendDiacritic: pesh,
    },
    "ey": {
        matches: ["ی"],
@ -262,13 +266,13 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
 * @returns an array of phonemes
 */
 export function splitFIntoPhonemes(fIn: string): Phoneme[] {
-    const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y"];
+    const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y", "'"];
    const quadrigraphs: Phoneme[] = ["-Ul-"];
    const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"];
    const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"];
    const endingDigraphs: Phoneme[] = ["uy"];
-    const willIgnore = ["?", " ", "`", ".", "…", ",", "'"];
+    const willIgnore = ["?", " ", "`", ".", "…", ","];
    const result: Phoneme[] = [];
    const f = removeAccents(fIn);
@ -372,10 +376,10 @@ export function getCurrentNext(state: DiacriticsAccumulator): { current: string,
    };
 }
-export function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator {
+// export function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator {
-    const { current } = getCurrentNext(state);
+//     const { current } = getCurrentNext(state);
-    return (current === "ع") ? advanceP(state) : state;
+//     return (current === "ع") ? advanceP(state) : state;
-}
+// }
 export function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccumulator {
    const { current, next } = getCurrentNext(state);
@ -385,14 +389,14 @@ export function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccu
    return state;
 }
-export function advanceForAinOrHamza(state: DiacriticsAccumulator): DiacriticsAccumulator {
+export function advanceForHamza(state: DiacriticsAccumulator): DiacriticsAccumulator {
    const { current, next } = getCurrentNext(state);
    if (current === "ه" && (!next || next === " ")) {
        return advanceP(state);
    }
-    if (current === "ع") {
+    // if (current === "ع") {
-        return advanceP(state);
+    //     return advanceP(state);
-    }
+    // }
    return state;
 }
--- a/src/lib/diacritics.test.ts
+++ b/src/lib/diacritics.test.ts
@ -485,34 +485,115 @@ const diacriticsSections: {
        tests: [
            {
                in: {
-                    p: "اعتصاب شکن",
+                    p: "بعد",
-                    f: "itisaab shakan",
+                    f: "ba'd",
                },
-                out: "اِعتِصاب شَکَن",
+                out: "بَعْد",
            },
            // starting with ع
            {
                in: {
                    p: "عزت",
                    f: "izzat",
                },
                out: "عِزَّت",
            },
            {
                in: {
-                    p: "عزت",
+                    p: "بعد",
-                    f: "i'zzat",
+                    f: "b'ad",
                },
-                out: "عِزَّت",
+                out: "بْعَد",
            },
            {
                in: {
                    p: "بعد",
                    f: "ba'ad",
                },
                out: "بَعَد",
            },
            {
                in: {
                    p: "بعد",
                    f: "baad",
                },
                out: "بَعَد",
            },
            {
                in: {
                    p: "بعد",
                    f: "bad",
                },
                // TODO: Should this really be an error?
                out: null,
            },
            {
                in: {
                    p: "معلوم",
                    f: "maaloom",
                },
                out: "مَعَلُوم",
            },
            {
                in: {
                    p: "منبع",
                    f: "manbi'",
                },
                out: "مَنْبِع",
            },
            {
                in: {
                    p: "منبع",
                    f: "manb'i",
                },
                out: "مَنْبْعِ"
            },
            {
                in: {
                    p: "منبع",
                    f: "manbee",
                },
                out: "مَنْبِعِ",
            },
            // middle ع
            {
                in: {
                    p: "معنا",
-                    f: "ma'anaa",
+                    f: "ma'náa",
                },
                out: "مَعْنا",
            },
            {
                in: {
                    p: "معنا",
                    f: "maanáa",
                },
                out: "مَعَنا",
            },
            // TODO: Should be allowed to use a short vowel as well 
            // طمع - tama // استعمال - istimaal
            // TODO: Starting like عام اعتصاب etc.
            // {
            //     in: {
            //         p: "اعتصاب شکن",
            //         f: "itisaab shakan",
            //     },
            //     out: "اِعتِصاب شَکَن",
            // },
            // // starting with ع
            // {
            //     in: {
            //         p: "عزت",
            //         f: "izzat",
            //     },
            //     out: "عِزَّت",
            // },
            // {
            //     in: {
            //         p: "عزت",
            //         f: "i'zzat",
            //     },
            //     out: "عِزَّت",
            // },
            // // middle ع
            // {
            //     in: {
            //         p: "معنا",
            //         f: "ma'anaa",
            //     },
            //     out: "مَعَنا",
            // },
            // ending with ayn
            // {
            //     in: {
@ -683,34 +764,34 @@ diacriticsSections.forEach((section) => {
 // ERRORS
-// const brokenDiacritics = [
+const brokenDiacritics = [
-//     {
+    {
-//         p: "تشناب",
+        p: "تشناب",
-//         f: "peshnaab",
+        f: "peshnaab",
-//     },
+    },
-//     {
+    {
-//         p: "وسېدل",
+        p: "وسېدل",
-//         f: "osedul",
+        f: "osedul",
-//     },
+    },
-// ];
+];
-// test("ending with left over Pashto script will throw an error", () => {
+test("ending with left over Pashto script will throw an error", () => {
-//     expect(() => {
+    expect(() => {
-//         addDiacritics({ p: "کور ته", f: "kor" });
+        addDiacritics({ p: "کور ته", f: "kor" });
-//     }).toThrow(`phonetics error - phonetics shorter than pashto script`);
+    }).toThrow(`phonetics error - phonetics shorter than pashto script`);
-// });
+});
-// test("ending with left over phonetics will throw an error", () => {
+test("ending with left over phonetics will throw an error", () => {
-//     expect(() => {
+    expect(() => {
-//         addDiacritics({ p: "کار", f: "kaar kawul" });
+        addDiacritics({ p: "کار", f: "kaar kawul" });
-//     }).toThrow();
+    }).toThrow();
-// });
+});
-// test("adding diacritics errors when phonetecs and pashto do not line up", () => {
+test("adding diacritics errors when phonetecs and pashto do not line up", () => {
-//     brokenDiacritics.forEach((t) => {
+    brokenDiacritics.forEach((t) => {
-//         expect(() => {
+        expect(() => {
-//             addDiacritics(t);
+            addDiacritics(t);
-//         }).toThrow();
+        }).toThrow();
-//     });
+    });
-// });
+});
--- a/src/lib/diacritics.ts
+++ b/src/lib/diacritics.ts
@ -27,8 +27,7 @@ import {
    advanceP,
    reverseP,
    overwriteP,
-    advanceForAin,
+    advanceForHamza,
    advanceForAinOrHamza,
    advanceForHamzaMid,
    DiacriticsAccumulator,
 } from "./diacritics-helpers";
@ -61,14 +60,13 @@ enum PhonemeStatus {
    DirectMatchAfterSukun,
    EndingWithHeyHimFromSukun,
    ShortVowel,
    ShortVowelBeforeAin,
    ShortVowelAfterAin,
    PersianSilentWWithAa,
    ArabicWasla,
    Izafe,
    EndOfDuParticle,
    HaEndingWithHeem,
    AlefDaggarEnding,
    LongAinVowelMissingComma,
 }
 function processPhoneme(
@ -112,7 +110,6 @@ function processPhoneme(
            pipe(
                advanceP,
                addP(diacritic),
                advanceForAin,
            )(state)
        : (phs === PhonemeStatus.DoubleConsonantTashdeed) ?
            pipe(
@ -171,25 +168,19 @@ function processPhoneme(
                advanceP,
                advanceP,
            )(state)
-        : (phs === PhonemeStatus.ShortVowelBeforeAin) ?
+        : (phs === PhonemeStatus.LongAinVowelMissingComma) ?
            pipe(
                // HACK: reverseP steps back one position too far when the letter before it is an ain (ع), so we reverse and then advance again before adding the diacritic
                reverseP,
                advanceP,
                addP(diacritic),
                // overwriteP(diacritic || ""),
            )(state)
        : (phs === PhonemeStatus.ShortVowelAfterAin) ?
            pipe(
                advanceP,
-                addP(diacritic),
+                addP(diacritic)
            )(state)
        :
        // phs === PhonemeStatus.ShortVowel
            pipe(
                advanceForHamzaMid,
                addP(phonemeInfo.diacritic),
-                advanceForAinOrHamza,
+                // TODO THIS?
                advanceForHamza,
            )(state);
 }
@ -214,7 +205,11 @@ function stateInfo({ state, i, phonemes, phoneme }: {
    const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
    const needsTashdeed = !isBeginningOfWord && doubleConsonant && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter);
    const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter));
-    const diacritic = isEndOfWord ? ((!phonemeInfo.longVowel || phonemeInfo.useEndingDiacritic) ? phonemeInfo.diacritic : undefined) : phonemeInfo.diacritic;
+    const useAinBlendDiacritics = (!isBeginningOfWord && (phonemeInfo.ainBlendDiacritic && currentPLetter === "ع"));
    const diacritic = useAinBlendDiacritics
        ? phonemeInfo.ainBlendDiacritic
        : isEndOfWord 
        ? ((!phonemeInfo.longVowel || phonemeInfo.useEndingDiacritic) ? phonemeInfo.diacritic : undefined) : phonemeInfo.diacritic;
    function getPhonemeState(): PhonemeStatus {
        if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) {
@ -243,6 +238,9 @@ function stateInfo({ state, i, phonemes, phoneme }: {
        if (phoneme === "-i-" && isBeginningOfWord) {
            return PhonemeStatus.Izafe;
        } 
        if (useAinBlendDiacritics) {
            return PhonemeStatus.LongAinVowelMissingComma;
        }
        if (needsTashdeed) {
            return PhonemeStatus.DoubleConsonantTashdeed;
        }
@ -259,14 +257,7 @@ function stateInfo({ state, i, phonemes, phoneme }: {
            return needsSukun ? PhonemeStatus.DirectMatchAfterSukun : PhonemeStatus.DirectMatch;
        }
        if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {  
-            // weird ayn behaviour because it automatically advances and ignores it at the beginning of the process
+            return PhonemeStatus.ShortVowel;
            // console.log("looking prev", prevPLetter);
            // console.log("looking next", currentPLetter);   
            return prevPLetter === "ع" 
                ? PhonemeStatus.ShortVowelBeforeAin
                : currentPLetter === "ع"
                ? PhonemeStatus.ShortVowelAfterAin
                : PhonemeStatus.ShortVowel;
        }
        // console.log("bad phoneme is ", phoneme);
        throw new Error("phonetics error - no status found for phoneme: " + phoneme);