more

2021-05-08 21:31:59 +03:00 · 2021-05-08 21:31:59 +03:00 · 2dea82c32b
parent 6053d11bc0
commit 2dea82c32b
2 changed files with 330 additions and 43 deletions
--- a/src/lib/diacritics.test.ts
+++ b/src/lib/diacritics.test.ts
@@ -242,6 +242,196 @@ const diacriticsTest: Array<{
        },
        out: "لِیک",
    },
+    {
+        in: {
+            p: "رغېدل",
+            f: "raghedul",
+        },
+        out: "رَغېد" + zwarakey + "ل",
+    },
+    {
+        in: {
+            p: "کارول",
+            f: "kaarawul",
+        },
+        out: "کارَو" + zwarakey + "ل",
+    },
+    {
+        in: {
+            p: "پېښېدل",
+            f: "pexedul",
+        },
+        out: "پېښېد" + zwarakey + "ل",
+    },
+    {
+        in: {
+            p: "مین",
+            f: "mayín",
+        },
+        out: "مَیِن",
+    },
+    {
+        in: {
+            p: "سړی",
+            f: "saRey",
+        },
+        out: "سَړی",
+    },
+    {
+        in: {
+            p: "سړي",
+            f: "saRee",
+        },
+        out: "سَړي",
+    },
+    {
+        in: {
+            p: "زه",
+            f: "zu",
+        },
+        out: "زهٔ",
+    },
+    {
+        in: {
+            p: "زه",
+            f: "za",
+        },
+        out: "زَه",
+    },
+    {
+        in: {
+            p: "پېشنهاد",
+            f: "peshniháad",
+        },
+        out: "پېشْنِهاد",
+    },
+    {
+        in: {
+            p: "ایستل",
+            f: "eestul",
+        },
+        out: "اِیسْت" + zwarakey + "ل",
+    },
+    {
+        in: {
+            p: "ایستل",
+            f: "eystul",
+        },
+        out: "ایسْت" + zwarakey + "ل",
+    },
+    {
+        in: {
+            p: "اېسېدل",
+            f: "esedul",
+        },
+        out: "اېسېد" + zwarakey + "ل",
+    },
+    {
+        in: {
+            p: "اوسېدل",
+            f: "osedul",
+        },
+        out: "اوسېد" + zwarakey + "ل",
+    },
+    {
+        in: {
+            p: "اواز",
+            f: "awaaz",
+        },
+        out: "اَواز",
+    },
+    {
+        in: {
+            p: "اسلام",
+            f: "islaam",
+        },
+        out: "اِسْلام",
+    },
+    {
+        in: {
+            p: "واردول",
+            f: "waaridawul",
+        },
+        out: "وارِدَو" + zwarakey + "ل",
+    },
+    {
+        in: {
+            p: "غاړه",
+            f: "ghaaRa",
+        },
+        out: "غاړَه",
+    },
+    {
+        in: {
+            p: "اوتر",
+            f: "awtár",
+        },
+        out: "اَوْتَر",
+    },
+    {
+        in: {
+            p: "اختیار",
+            f: "ikhtiyáar",
+        },
+        out: "اِخْتِیار",
+    },
+    {
+        in: {
+            p: "فریاد",
+            f: "faryáad",
+        },
+        out: "فَرْیاد",
+    },
+    {
+        in: {
+            p: "کارغه",
+            f: "kaarghu",
+        },
+        out: "کارْغهٔ",
+    },
+    {
+        in: {
+            p: "بې کار",
+            f: "be kaar",
+        },
+        out: "بې کار",
+    },
+    {
+        in: {
+            p: "بې کار",
+            f: "bekaar",
+        },
+        out: "بې کار",
+    },
+    // TODO: nb mb thing (a written "nb" that is pronounced "mb", as in the ambáar case below)
+    {
+        in: {
+            p: "انبار",
+            f: "ambáar",
+        },
+        out: "اَنْبار",
+    },
+    {
+        in: {
+            p: "ارغون",
+            f: "arghóon",
+        },
+        out: "اَرْغُون",
+    },
+    {
+        in: {
+            p: "ارمټه",
+            f: "armaTa",
+        },
+        out: "اَرْمَټَه",
+    },
+    {
+        in: {
+            p: "اروا پوه",
+            f: "arwaa poh",
+        },
+        out: "اَرْوا پوهْ",
+    },
    // starting alefs
    {
        in: {
@@ -295,21 +485,28 @@ const diacriticsTest: Array<{
        },
        out: "پَرْمَخْتْیا",
    },
-    // {
-    //     in: {
-    //         p: "پته",
-    //         f: "patta",
-    //     },
-    //     out: "پَتّه",
-    // },
+    {
+        in: {
+            p: "پته",
+            f: "patta",
+        },
+        out: "پَتَّه",
+    },
+    {
+        in: {
+            p: "پته تور",
+            f: "patta toor",
+        },
+        out: "پَتَّه تُور",      
+    },
    // get ayn stuff working
-    // {
-    //     in: {
-    //         p: "اعتصاب شکن",
-    //         f: "itisaabshikan",
-    //     },
-    //     out: "اِعتِصاب شِکَن",
-    // },
+    {
+        in: {
+            p: "اعتصاب شکن",
+            f: "itisaab shakan",
+        },
+        out: "اِعتِصاب شَکَن",
+    },
    // avoid false double consonant
    {
        in: {
@@ -318,6 +515,36 @@ const diacriticsTest: Array<{
        },
        out: "اَزَل لِیک",
    },
+    // starting with ع
+    {
+        in: {
+            p: "عزت",
+            f: "izzat",
+        },
+        out: "عِزَّت",
+    },
+    {
+        in: {
+            p: "عزت",
+            f: "i'zzat",
+        },
+        out: "عِزَّت",
+    },
+    // ئ in the middle
+    {
+        in: {
+            p: "برائت",
+            f: "baraa'at",
+        },
+        out: "بَرائَت",
+    },
+    {
+        in: {
+            p: "فائده",
+            f: "faaida",
+        },
+        out: "فائِدَه",
+    },
 ];

 phonemeSplits.forEach((s) => {
@@ -327,8 +554,9 @@ phonemeSplits.forEach((s) => {
    });
 });

-test("adding diacritics should work", () => {
+
 diacriticsTest.forEach((t) => {
+    test(`diacritics should work for ${t.in.p} - ${t.in.f}`, () => {
        expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f });
    });
 });
--- a/src/lib/diacritics.ts
+++ b/src/lib/diacritics.ts
@@ -41,6 +41,7 @@ type PhonemeInfo = {
    takesSukunOnEnding?: true,
    longVowel?: true,
    canStartWithAynBefore?: true,
+    useEndingDiacritic?: true,
 }

 const phonemeTable: Record<Phoneme, PhonemeInfo> = {
@@ -211,6 +212,7 @@ const phonemeTable: Record<Phoneme, PhonemeInfo> = {
        longVowel: true,
        // alsoCanBePrefix: true,
        diacritic: pesh,
+        useEndingDiacritic: true,
    },
    "ey": {
        matches: ["ی"],
@@ -231,14 +233,13 @@ const phonemeTable: Record<Phoneme, PhonemeInfo> = {
    "a": {
        diacritic: zwar,
        endingMatches: ["ه"],
-        beginningMatches: ["ا"],
+        beginningMatches: ["ا", "ع"],
        // canComeAfterHeyEnding: true,
        // canBeFirstPartOfFathahanEnding: true,
    },
    "u": {
        diacritic: zwarakey,
        endingMatches: ["ه"],
-        // hamzaOnEnd: true,
    },
    "i": {
        diacritic: zer,
@@ -270,7 +271,7 @@ export function splitFIntoPhonemes(fIn: string): Phoneme[] {
    const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"];
    const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"];
    const endingDigraphs: Phoneme[] = ["uy"];
-    const willIgnore = ["?", " ", "`", ".", "…", ","];
+    const willIgnore = ["?", " ", "`", ".", "…", ",", "'"];
    
    const result: Phoneme[] = [];
    const f = removeAccents(fIn);
@@ -334,28 +335,27 @@ function processPhoneme(
    i: number,
    phonemes: Phoneme[],
 ) {
+    // console.log("PHONEME", phoneme);
+    // console.log("space coming up", acc.pIn[0] === " ");
+    // console.log("state", acc);
    // Prep state
    const state = acc.pIn[0] === " " ? advanceP(acc) : acc;
+    // console.log("AFTER SPACE PREP", phoneme);
+    // console.log("state", state);
    // WARNING: Do not use acc after this point!

-    const prevPLetter = last(state.pOut);
-    const currentPLetter = state.pIn[0];
-    const nextPLetter = state.pIn[1];
-    const isBeginningOfWord = state.pOut === "" || prevPLetter === " ";
-    // const isEndOfWord = !nextPLetter || nextPLetter === " ";
-    const phonemeInfo = phonemeTable[phoneme];
-    const previousPhoneme = i > 0 && phonemes[i-1];
-    const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]];
-    // const nextPhoneme = (phonemes.length > (i + 1)) && phonemes[i+1];
-    // const nextPhonemeInfo = nextPhoneme ? phonemeTable[nextPhoneme] : undefined;
-    const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
-    const needsTashdeed = !isBeginningOfWord && doubleConsonant && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter);
-    const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter));
-    const sukunOrDiacritic = (needsSukun ? sukun : phonemeInfo.diacritic ? phonemeInfo.diacritic : "");
+    const {
+        phonemeInfo,
+        isBeginningOfWord,
+        currentPLetter,
+        needsTashdeed,
+        sukunOrDiacritic,
+        nextPLetter,
+        isEndOfWord,
+    } = stateInfo({ state, i, phoneme, phonemes });

    // if it's not an exception (TODO)
    // it must be one of the following 6 possibilities
-
    // 1. beginning a word with a long vowel
    if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) {
        if (phoneme !== "aa" && currentPLetter !== "ا" && !phonemeInfo.matches?.includes(nextPLetter)) {
@@ -371,29 +371,65 @@ function processPhoneme(
        return pipe(
            advanceP,
            addP(sukunOrDiacritic),
+            advanceForAin,
        )(state);
    // 3. double consonant to be marked with tashdeed
    } else if (needsTashdeed) {
-        return addP(tashdeed)(state);
-    // 4. direct match of phoneme / P letter
-    } else if (phonemeInfo.matches?.includes(currentPLetter)) {
+        return pipe(
+            addP(tashdeed)
+        )(state);
+    // 4. special ه ending
+    } else if (isEndOfWord && ((phoneme === "u" && currentPLetter === "ه") || (phoneme === "h" && ["ه", "ح"].includes(currentPLetter)))) {
+        return pipe(
+            advanceP,
+            addP(phoneme === "u" ? hamzaAbove : sukun),
+        )(state);
+    // 5. direct match of phoneme / P letter
+    } else if (phonemeInfo.matches?.includes(currentPLetter) || (isEndOfWord && phonemeInfo.endingMatches?.includes(currentPLetter)) || (phoneme === "m" && currentPLetter === "ن" && nextPLetter === "ب")) {
        return pipe(
            addP(sukunOrDiacritic),
            advanceP,
        )(state);
-    // 5. just a diacritic for short vowel
+    // 6. just a diacritic for short vowel
    } else if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {
        return pipe(
+            advanceForHamzaMid,
            addP(phonemeInfo.diacritic),
-            advanceIfReachedEndingHamza,
+            advanceForAinOrHamza,
        )(state);
    }
-
    // anything that gets to this point is a failure/error
-    // console.log(state);
    throw new Error("phonetics error");
 }

+
+
+function stateInfo({ state, i, phonemes, phoneme }: {
+    state: DiacriticsAccumulator,
+    i: number,
+    phonemes: Phoneme[],
+    phoneme: Phoneme,
+}) {
+    const prevPLetter = last(state.pOut);
+    const currentPLetter = state.pIn[0];
+    const nextPLetter = state.pIn[1];
+    const isBeginningOfWord = state.pOut === "" || prevPLetter === " ";
+    const isEndOfWord = !nextPLetter || nextPLetter === " ";
+    const phonemeInfo = phonemeTable[phoneme];
+    const previousPhoneme = i > 0 && phonemes[i-1];
+    const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]];
+    // const nextPhoneme = (phonemes.length > (i + 1)) && phonemes[i+1];
+    // const nextPhonemeInfo = nextPhoneme ? phonemeTable[nextPhoneme] : undefined;
+    const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
+    const needsTashdeed = !isBeginningOfWord && doubleConsonant && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter);
+    const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter));
+    const diacritic = isEndOfWord ? ((!phonemeInfo.longVowel || phonemeInfo.useEndingDiacritic) ? phonemeInfo.diacritic : undefined) : phonemeInfo.diacritic;
+    const sukunOrDiacritic = (needsSukun ? sukun : diacritic);
+    return {
+        phonemeInfo, isBeginningOfWord, currentPLetter, needsTashdeed, sukunOrDiacritic, nextPLetter, isEndOfWord,
+    };
+};
+
 /**
 * returns the last character of a string
 * 
@@ -417,8 +453,31 @@ const addP = (toAdd: string | undefined) => (state: DiacriticsAccumulator): Diac
    };
 }

-function advanceIfReachedEndingHamza(state: DiacriticsAccumulator): DiacriticsAccumulator {
-    if (state.pIn[0] === "ه" && (!state.pIn[1] || state.pIn[1] === " ")) {
+function getCurrentNext(state: DiacriticsAccumulator): { current: string, next: string} {
+    return {
+        current: state.pIn[0],
+        next: state.pIn[1],
+    };
+}
+
+function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator {
+    const { current } = getCurrentNext(state);
+    return (current === "ع") ? advanceP(state) : state;
+}
+
+function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccumulator {
+    const { current, next } = getCurrentNext(state);
+    if (current === "ئ" && next && next !== "ئ") {
+        return advanceP(state);
+    }
+    return state;
+}
+function advanceForAinOrHamza(state: DiacriticsAccumulator): DiacriticsAccumulator {
+    const { current, next } = getCurrentNext(state);
+    if (current === "ه" && (!next || next === " ")) {
+        return advanceP(state);
+    }
+    if (current === "ع") {
        return advanceP(state);
    }
    return state;