more work on diacritics

2021-05-06 23:48:53 +03:00 · 2021-05-06 23:48:53 +03:00 · a2b5626514
parent 7b0e6d864f
commit a2b5626514
2 changed files with 144 additions and 6 deletions
--- a/src/lib/diacritics.test.ts
+++ b/src/lib/diacritics.test.ts
@ -12,6 +12,17 @@ import {
 } from "./diacritics";
 import * as T from "../types";

+const zwar = "َ";
+const zwarakey = "ٙ";
+const zer = "ِ";
+const pesh = "ُ";
+const sukun = "ْ";
+const hamzaAbove = "ٔ";
+const tashdeed = "ّ";
+const wasla = "ٱ";
+const daggerAlif = "ٰ";
+const fathahan = "ً";
+
 const phonemeSplits: Array<{
    in: string,
    out: string[],
@ -139,6 +150,105 @@ const diacriticsTest: Array<{
        },
        out: "تَشْناب",
    },
+    // working with وs
+    {
+        in: {
+            p: "کول",
+            f: "kwal",
+        },
+        out: "کْوَل",
+    },
+    {
+        in: {
+            p: "تول",
+            f: "tool",
+        },
+        out: "تُول",
+    },
+    {
+        in: {
+            p: "مقبول",
+            f: "maqbool",
+        },
+        out: "مَقْبُول",
+    },
+    {
+        in: {
+            p: "کول",
+            f: "kawul",
+        },
+        out: "کَو" + zwarakey + "ل",
+    },
+    {
+        in: {
+            p: "کول",
+            f: "kiwul",
+        },
+        out: "کِو" + zwarakey + "ل",
+    },
+    {
+        in: {
+            p: "کول",
+            f: "kUwul",
+        },
+        out: "کُو" + zwarakey + "ل",
+    },
+    {
+        in: {
+            p: "کول",
+            f: "kuwul",
+        },
+        out: "ک" + zwarakey + "و" + zwarakey + "ل",
+    },
+    {
+        in: {
+            p: "کول",
+            f: "kawal",
+        },
+        out: "کَوَل",
+    },
+    {
+        in: {
+            p: "کول",
+            f: "kUwal",
+        },
+        out: "کُوَل",
+    },
+    {
+        in: {
+            p: "پشتګرد",
+            f: "pishtgird",
+        },
+        out: "پِشْتْګِرْد",
+    },
+    {
+        in: {
+            p: "سپین",
+            f: "speen",
+        },
+        out: "سْپِین",
+    },
+    {
+        in: {
+            p: "سپین",
+            f: "speyn",
+        },
+        out: "سْپین",
+    },
+    {
+        in: {
+            p: "پېش",
+            f: "pesh",
+        },
+        out: "پېش",
+    },
+    {
+        in: {
+            p: "بتن",
+            f: "battan",
+        },
+        out: "ب" + zwar + "ت" + tashdeed + zwar + "ن",
+    },
 ];

 const brokenDiacritics = [
@ -163,6 +273,12 @@ test("bad phonetic characters should throw an error", () => {
    });
 });

+test("ending with left over Pashto script will throw an error", () => {
+    expect(() => {
+        addDiacritics({ p: "کور ته", f: "kor" });
+    }).toThrow(`phonetics error - phonetics shorter than pashto script`);
+});
+
 test("adding diacritics should work", () => {
    diacriticsTest.forEach((t) => {
        expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f });
--- a/src/lib/diacritics.ts
+++ b/src/lib/diacritics.ts
@ -248,7 +248,14 @@ const phonemeTable: Record<Phoneme, PhonemeInfo> = {
    },
 }

-
+/**
+ * splits a phonetics string into an array of Phonemes
+ * 
+ * will error if there is an illegal phonetics character
+ * 
+ * @param fIn a phonetics string
+ * @returns an array of phonemes
+ */
 export function splitFIntoPhonemes(fIn: string): Phoneme[] {
    const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y"];
    
@ -306,16 +313,25 @@ export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsSt
    // TODO: 
    const phonemes: Phoneme[] = splitFIntoPhonemes(!ignoreCommas ? firstPhonetics(f) : f);

-    const { pOut } = phonemes.reduce((acc, phoneme, i) => {
-        const isBeginningOfWord = acc.pOut === "" || last(acc.pOut) === " ";
+    const { pIn, pOut } = phonemes.reduce((acc, phoneme, i) => {
+        const prevPLetter = last(acc.pOut);
+        const isBeginningOfWord = acc.pOut === "" || prevPLetter === " ";
        const phonemeInfo = phonemeTable[phoneme];
+        const previousPhoneme = i > 0 && phonemes[i-1];
        const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]];
        const currentPLetter = acc.pIn[0];
-        const needsSukun = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
+        const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
+        const needsTashdeed = doubleConsonant && (previousPhoneme === phoneme);
+        const needsSukun = doubleConsonant && (previousPhoneme !== phoneme);
+
+        if (needsTashdeed) {
+            return {
+                pOut: acc.pOut + tashdeed,
+                pIn: acc.pIn,
+            };
+        }

        if (phonemeInfo.matches?.includes(currentPLetter)) {
-            // TODO: Check if tashdeed or sukun is used
-            // const needsSukun = is consonant + previous phoneme was consonant + not beginning of word
            return {
                pOut: acc.pOut
                    + (needsSukun ? sukun : phonemeInfo.diacritic ? phonemeInfo.diacritic : "")
@ -331,9 +347,15 @@ export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsSt
            }
        }

+        // TODO: CHECK IF PASHTO IS SHORTER THAN PHONETICS
+
        throw new Error("phonetics error");
    }, { pOut: "", pIn: p });

+    if (pIn !== "") {
+        throw new Error("phonetics error - phonetics shorter than pashto script");
+    }
+
    return {
        p: pOut,
        f,