From 98c5eb745267679ef2f3c149963b39691bacfde3 Mon Sep 17 00:00:00 2001
From: Bill D <clay@mailbox.org>
Date: Fri, 7 May 2021 10:54:09 +0300
Subject: [PATCH] Extract processPhoneme; handle spaces and word-initial alef in addDiacritics

---
 src/lib/diacritics.test.ts |  85 +++++++++++++++++++++--------
 src/lib/diacritics.ts      | 108 ++++++++++++++++++++++---------------
 2 files changed, 127 insertions(+), 66 deletions(-)

diff --git a/src/lib/diacritics.test.ts b/src/lib/diacritics.test.ts
index de5ea1b..620e37f 100644
--- a/src/lib/diacritics.test.ts
+++ b/src/lib/diacritics.test.ts
@@ -69,20 +69,6 @@ const phonemeSplits: Array<{
     },
 ];
 
-const badPhonetics: Array<{
-    in: string,
-    problem: string,
-}> = [
-    {
-        in: "acar",
-        problem: "c",
-    },
-    {
-        in: "a7am",
-        problem: "7",
-    },
-];
-
 const diacriticsTest: Array<{
     in: T.PsString,
     out: string,
@@ -101,6 +87,13 @@ const diacriticsTest: Array<{
         },
         out: "کُور",
     },
+    {
+        in: {
+            p: "کور کور",
+            f: "kor koor",
+        },
+        out: "کور کُور",
+    },
     {
         in: {
             p: "تب",
@@ -242,6 +235,22 @@ const diacriticsTest: Array<{
         },
         out: "پېش",
     },
+    {
+        in: {
+            p: "لیک",
+            f: "leek",
+        },
+        out: "لِیک",
+    },
+    // word-initial alef that carries a short vowel
+    {
+        in: {
+            p: "اسلام",
+            f: "islaam",
+        },
+        out: "اِسْلام",
+    },
+    // double consonant
     {
         in: {
             p: "بتن",
@@ -249,12 +258,13 @@ const diacriticsTest: Array<{
         },
         out: "ب" + zwar + "ت" + tashdeed + zwar + "ن",
     },
-];
-
-const brokenDiacritics = [
+    // no tashdeed when the repeated consonant spans a word boundary
     {
-        p: "تشناب",
-        f: "peshnaab",
+        in: {
+            p: "ازل لیک",
+            f: "azalléek",
+        },
+        out: "اَزَل لِیک",
     },
 ];
 
@@ -265,6 +275,35 @@ phonemeSplits.forEach((s) => {
     });
 });
 
+test("adding diacritics should work", () => {
+    diacriticsTest.forEach((t) => {
+        expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f });
+    });
+});
+
+// ERRORS
+
+const brokenDiacritics = [
+    {
+        p: "تشناب",
+        f: "peshnaab",
+    },
+];
+
+const badPhonetics: Array<{
+    in: string,
+    problem: string,
+}> = [
+    {
+        in: "acar",
+        problem: "c",
+    },
+    {
+        in: "a7am",
+        problem: "7",
+    },
+];
+
 test("bad phonetic characters should throw an error", () => {
     badPhonetics.forEach((s) => {
         expect(() => {
@@ -279,10 +318,10 @@ test("ending with left over Pashto script will throw an error", () => {
     }).toThrow(`phonetics error - phonetics shorter than pashto script`);
 });
 
-test("adding diacritics should work", () => {
-    diacriticsTest.forEach((t) => {
-        expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f });
-    });
+test("ending with left over phonetics will throw an error", () => {
+    expect(() => {
+        addDiacritics({ p: "کار", f: "kaar kawul" });
+    }).toThrow();
 });
 
 test("adding diacritics errors when phonetecs and pashto do not line up", () => {
diff --git a/src/lib/diacritics.ts b/src/lib/diacritics.ts
index ba814cc..26e38ec 100644
--- a/src/lib/diacritics.ts
+++ b/src/lib/diacritics.ts
@@ -28,6 +28,8 @@ type LongVowel = "aa" | "ee" | "e" | "oo" | "o" | "ey" | "uy" | "eyy";
 type ShortVowel = "a" | "i" | "u" | "U";
 type Phoneme = Consonant | Ain | LongVowel | ShortVowel | JoiningVowel;
 
+type DiacriticsAccumulator = { pIn: string, pOut: string };
+
 type PhonemeInfo = {
     matches?: string[],
     beginningMatches?: string[],
@@ -225,6 +227,7 @@ const phonemeTable: Record<Phoneme, PhonemeInfo> = {
     "a": {
         diacritic: zwar,
         endingMatches: ["ه"],
+        beginningMatches: ["ا"],
         // canComeAfterHeyEnding: true,
         // canBeFirstPartOfFathahanEnding: true,
     },
@@ -304,64 +307,76 @@ export function splitFIntoPhonemes(fIn: string): Phoneme[] {
 }
 
 /**
- * Adds phonetis to a given PsString.
+ * Adds diacritics to a given PsString.
  * Errors if the phonetics and script don't line up.
  * 
  * @param ps a PsSTring without phonetics
  */
 export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsString {
-    // TODO: 
     const phonemes: Phoneme[] = splitFIntoPhonemes(!ignoreCommas ? firstPhonetics(f) : f);
-
-    const { pIn, pOut } = phonemes.reduce((acc, phoneme, i) => {
-        const prevPLetter = last(acc.pOut);
-        const isBeginningOfWord = acc.pOut === "" || prevPLetter === " ";
-        const phonemeInfo = phonemeTable[phoneme];
-        const previousPhoneme = i > 0 && phonemes[i-1];
-        const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]];
-        const currentPLetter = acc.pIn[0];
-        const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
-        const needsTashdeed = doubleConsonant && (previousPhoneme === phoneme);
-        const needsSukun = doubleConsonant && (previousPhoneme !== phoneme);
-
-        if (needsTashdeed) {
-            return {
-                pOut: acc.pOut + tashdeed,
-                pIn: acc.pIn,
-            };
-        }
-
-        if (phonemeInfo.matches?.includes(currentPLetter)) {
-            return {
-                pOut: acc.pOut
-                    + (needsSukun ? sukun : phonemeInfo.diacritic ? phonemeInfo.diacritic : "")
-                    + currentPLetter,
-                pIn: acc.pIn.slice(1),
-            };
-        }
-
-        if (phonemeInfo.diacritic) {
-            return {
-                pOut: acc.pOut + phonemeInfo.diacritic,
-                pIn: acc.pIn,
-            }
-        }
-
-        // TODO: CHECK IF PASHTO IS SHORTER THAN PHONETICS
-
-        throw new Error("phonetics error");
-    }, { pOut: "", pIn: p });
-
+    const { pIn, pOut } = phonemes.reduce(processPhoneme, { pOut: "", pIn: p });
     if (pIn !== "") {
         throw new Error("phonetics error - phonetics shorter than pashto script");
     }
-
     return {
         p: pOut,
         f,
     };
 }
 
+/**
+ * Reducer step for addDiacritics: consumes one phoneme and returns the next accumulator.
+ * `pIn` holds the Pashto script not yet consumed; `pOut` holds the output built so far.
+ * @param acc accumulator returned by the previous step
+ * @param phoneme phoneme being processed
+ * @param i index of `phoneme` within `phonemes`
+ * @param phonemes every phoneme being reduced, used to look back at the previous one
+ * @throws Error "phonetics error" if the phoneme neither matches the current letter nor has a diacritic
+ */
+function processPhoneme(
+    acc: DiacriticsAccumulator,
+    phoneme: Phoneme,
+    i: number,
+    phonemes: Phoneme[],
+): DiacriticsAccumulator {
+    // Step over a space in the Pashto input so matching resumes at the next word.
+    // WARNING: Do not use acc after this point!
+    const state = acc.pIn[0] === " " ? advanceP(acc) : acc;
+
+    const currentPLetter = state.pIn[0];
+    const isBeginningOfWord = state.pOut === "" || last(state.pOut) === " ";
+    const phonemeInfo = phonemeTable[phoneme];
+    const previousPhoneme = i > 0 && phonemes[i-1];
+    // Only look back at the previous phoneme when it belongs to the same word.
+    const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]];
+    const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
+    const needsTashdeed = doubleConsonant && (previousPhoneme === phoneme);
+    const needsSukun = doubleConsonant && (previousPhoneme !== phoneme);
+
+    // Same consonant twice in a row: append a tashdeed to the output, consume nothing.
+    if (needsTashdeed) {
+        return { ...state, pOut: state.pOut + tashdeed };
+    }
+
+    const matchesLetter = !!phonemeInfo.matches?.includes(currentPLetter);
+    // TODO: Beginning of word with long vowels and alef etc.
+    if (isBeginningOfWord && (matchesLetter || phonemeInfo.beginningMatches?.includes(currentPLetter))) {
+        // Word-initial: letter first, then its diacritic. needsSukun is always false here.
+        const ns = advanceP(state);
+        return { ...ns, pOut: ns.pOut + (phonemeInfo.diacritic ?? "") };
+    }
+    if (matchesLetter) {
+        // Mid-word: the sukun or diacritic goes before the letter being consumed.
+        const mark = needsSukun ? sukun : (phonemeInfo.diacritic ?? "");
+        return advanceP({ ...state, pOut: state.pOut + mark });
+    }
+    // No letter for this phoneme in the script: emit just its diacritic.
+    if (phonemeInfo.diacritic) {
+        return { ...state, pOut: state.pOut + phonemeInfo.diacritic };
+    }
+    throw new Error("phonetics error");
+}
+
 /**
  * returns the last character of a string
  * 
@@ -369,4 +384,11 @@ export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsSt
  */
 function last(s: string) {
     return s[s.length - 1];
+}
+
+/** Moves the first `n` characters of `pIn` onto the end of `pOut`. */
+function advanceP(state: DiacriticsAccumulator, n: number = 1): DiacriticsAccumulator {
+    const { pIn, pOut } = state;
+    const consumed = pIn.slice(0, n);
+    return { pOut: pOut + consumed, pIn: pIn.slice(n) };
 }
\ No newline at end of file