From c8da32764e7e8a27990307c9c2eac5ba2a2587f4 Mon Sep 17 00:00:00 2001
From: Bill D <clay@mailbox.org>
Date: Sat, 5 Jun 2021 20:59:35 +0430
Subject: [PATCH] =?UTF-8?q?new=20diacritics=20function=20done!=20?=
 =?UTF-8?q?=F0=9F=99=8C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 package.json                  |   2 +-
 src/lib/diacritics-helpers.ts |  29 +++--
 src/lib/diacritics.test.ts    | 204 ++++++++++++++++++++++++----------
 src/lib/diacritics.ts         |  45 +++-----
 src/library.ts                |   4 +
 5 files changed, 191 insertions(+), 93 deletions(-)

diff --git a/package.json b/package.json
index 41755c6..b5670d2 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@lingdocs/pashto-inflector",
-  "version": "0.4.2",
+  "version": "0.4.3",
   "author": "lingdocs.com",
   "description": "A Pashto inflection and verb conjugation engine, inculding React components for displaying Pashto text, inflections, and conjugations",
   "homepage": "https://verbs.lingdocs.com",
diff --git a/src/lib/diacritics-helpers.ts b/src/lib/diacritics-helpers.ts
index 0149ab2..cbb15fe 100644
--- a/src/lib/diacritics-helpers.ts
+++ b/src/lib/diacritics-helpers.ts
@@ -185,7 +185,7 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
     },
     // Long Vowels
     "aa": {
-        matches: ["ا"],
+        matches: ["ا", "أ"],
         beginningMatches: ["آ", "ا"],
         endingMatches: ["ا", "یٰ"],
         longVowel: true,
@@ -210,7 +210,6 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
     "oo": {
         matches: ["و"],
         longVowel: true,
-        // alsoCanBePrefix: true,
         diacritic: pesh,
         useEndingDiacritic: true,
         ainBlendDiacritic: pesh,
@@ -271,7 +270,7 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
     const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"];
     const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"];
     const endingDigraphs: Phoneme[] = ["uy"];
-    const willIgnore = ["?", " ", "`", ".", "…", ","];
+    const willIgnore = ["?", " ", "`", ".", "…", ",", "-"];
     
     const result: Phoneme[] = [];
     const f = removeAccents(fIn).replace(/ă/g, "a");
@@ -337,6 +336,10 @@ export enum PhonemeStatus {
     NOnFathatan,
     HamzaOnWow,
     ArabicDefiniteArticleUl,
+    OoPrefix,
+    AlefHamzaBeg,
+    GlottalStopBeforeOo,
+    OoAfterGlottalStopOo,
 }
 
 export function stateInfo({ state, i, phonemes, phoneme }: {
@@ -351,14 +354,14 @@ export function stateInfo({ state, i, phonemes, phoneme }: {
     const nextPLetter = state.pIn[1];
     const nextPhoneme = phonemes[i+1];
     const previousPhoneme = i > 0 && phonemes[i-1];
-    const isBeginningOfWord = (state.pOut === "" || prevPLetter === " ") || (previousPhoneme === "-Ul-" && prevPLetter === "ل");
+    const lastThreePLetters = last(state.pOut, 3) + last(state.pOut, 2) + prevPLetter;
+    const isBeginningOfWord = (state.pOut === "" || prevPLetter === " ") || (previousPhoneme === "-Ul-" && prevPLetter === "ل") || (["دَر", "وَر"].includes(lastThreePLetters) || (last(state.pOut, 2) + prevPLetter) === "را");
     const isEndOfWord = isOutOfWord(nextPLetter);
     const phonemeInfo = phonemeTable[phoneme];
     const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]];
     // const nextPhoneme = (phonemes.length > (i + 1)) && phonemes[i+1];
     // const nextPhonemeInfo = nextPhoneme ? phonemeTable[nextPhoneme] : undefined;
     const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
-    const needsTashdeed = !isBeginningOfWord && doubleConsonant && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter);
     const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter));
     const useAinBlendDiacritics = (!isBeginningOfWord && (phonemeInfo.ainBlendDiacritic && currentPLetter === "ع"));
     const diacritic = useAinBlendDiacritics
@@ -372,6 +375,9 @@ export function stateInfo({ state, i, phonemes, phoneme }: {
         if (isBeginningOfWord && phoneme === "aa" && phonemeInfo.beginningMatches?.includes(currentPLetter)) {
             return PhonemeStatus.DirectMatch;
         }
+        if (isBeginningOfWord && phoneme === "oo" && currentPLetter === "و") {
+            return PhonemeStatus.OoPrefix;
+        }
         if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) {
             if (phoneme !== "aa" && currentPLetter !== "ا" && !phonemeInfo.matches?.includes(nextPLetter)) {
                 throw Error("phonetics error - needs alef prefix");
@@ -395,12 +401,21 @@ export function stateInfo({ state, i, phonemes, phoneme }: {
         if (isBeginningOfWord && phoneme === "-Ul-" && currentPLetter === "ا" && nextPLetter === "ل") {
             return PhonemeStatus.ArabicDefiniteArticleUl;
         }
+        if (phoneme === "a" && nextPhoneme === "'" && phonemes[i+2] === "a" && currentPLetter === "أ") {
+            return PhonemeStatus.AlefHamzaBeg;
+        }
         if (phoneme === "a" && previousPhoneme === "U" && currentPLetter === "و") {
             return PhonemeStatus.HamzaOnWow;
         }
         if (phoneme === "a" && currentPLetter === "ا" && nextPLetter === fathahan) {
             return PhonemeStatus.ShortAForAlefBeforeFathatan;
         }
+        if (phoneme === "'" && currentPLetter === "و" && nextPLetter === "و") {
+            return PhonemeStatus.GlottalStopBeforeOo;
+        }
+        if (phoneme === "oo" && previousPhoneme === "'" && currentPLetter === "و" && prevPLetter === hamzaAbove) {
+            return PhonemeStatus.OoAfterGlottalStopOo;
+        }
         if (phoneme === "'" && last(state.pOut, 2) === "ع" && isOutOfWord(last(state.pOut, 3))) {
             return PhonemeStatus.AinBeginningAfterShortVowel;
         }
@@ -430,7 +445,7 @@ export function stateInfo({ state, i, phonemes, phoneme }: {
         if (useAinBlendDiacritics) {
             return PhonemeStatus.LongAinVowelMissingComma;
         }
-        if (needsTashdeed) {
+        if (((!isBeginningOfWord && doubleConsonant) || prevPLetter === " ") && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter)) {
             return PhonemeStatus.DoubleConsonantTashdeed;
         }
         if (phoneme === "aa" && currentPLetter === "ی" && nextPLetter === daggerAlif) {
@@ -454,7 +469,7 @@ export function stateInfo({ state, i, phonemes, phoneme }: {
         if (isEndOfWord && phoneme === "n" && currentPLetter === fathahan && prevPLetter === "ا") {
             return PhonemeStatus.NOnFathatan;
         }
-        console.log(state);
+        // console.log("errored", "current", phoneme, "next", nextPhoneme);
         // console.log("bad phoneme is ", phoneme);
         throw new Error("phonetics error - no status found for phoneme: " + phoneme);
     }
diff --git a/src/lib/diacritics.test.ts b/src/lib/diacritics.test.ts
index 653f03c..159d828 100644
--- a/src/lib/diacritics.test.ts
+++ b/src/lib/diacritics.test.ts
@@ -12,14 +12,8 @@ import {
 import {
     zwar,
     zwarakey,
-    zer,
-    pesh,
     sukun,
-    hamzaAbove,
     tashdeed,
-    wasla,
-    daggerAlif,
-    fathahan,
 } from "./diacritics-helpers";
 import * as T from "../types";
 
@@ -259,6 +253,20 @@ const diacriticsSections: {
                 },
                 out: null,
             },
+            {
+                in: {
+                    p: "تشناب",
+                    f: "peshnaab",
+                },
+                out: null,
+            },
+            {
+                in: {
+                    p: "وسېدل",
+                    f: "osedul",
+                },
+                out: null,
+            },
             {
                 in: {
                     p: "رغېدل",
@@ -593,6 +601,27 @@ const diacriticsSections: {
                 },
                 out: "سَخْتْسَری",
             },
+            {
+                in: {
+                    p: " سپین کړه",
+                    f: " speen kRu",
+                },
+                out: "سْپِین کْړهٔ",
+            },
+            {
+                in: {
+                    p: "اوب",
+                    f: "ob",
+                },
+                out: "اوب",
+            },
+            {
+                in: {
+                    p: "قطعه بازي",
+                    f: "qit'a baazee",
+                },
+                out: "قِطْعَه بازي",
+            },
         ],
     },
     {
@@ -1094,29 +1123,112 @@ const diacriticsSections: {
             },
         ],
     },
-    // {
-    //     describe: "double consonants on end of words",
-    //     tests: [
-    //         {
-    //             in: {
-    //                 p: "حق",
-    //                 f: "haqq",
-    //             },
-    //             out: "حَقّ",
-    //         },
-    //         {
-    //             in: {
-    //                 p: "حق پر",
-    //                 f: "haqq par",
-    //             },
-    //             out: "حَقّ پَر",
-    //         },
-    //     ],
-    // },
+    {
+        describe: "double consonants on end of words",
+        tests: [
+            {
+                in: {
+                    p: "حق",
+                    f: "haqq",
+                },
+                out: "حَقّ",
+            },
+            {
+                in: {
+                    p: "حق پر",
+                    f: "haqq par",
+                },
+                out: "حَقّ پَر",
+            },
+        ],
+    },
+    {
+        describe: "أ in the middle of the word",
+        tests: [
+            {
+                in: {
+                    p: "متأسف",
+                    f: "mUtaassif",
+                },
+                out: "مُتأسِّف",
+            },
+            {
+                in: {
+                    p: "متأسف",
+                    f: "mUta'assif",
+                },
+                out: "مُتأسِّف",
+            },
+        ],
+    },
+    {
+        describe: "ؤو in middle of the word",
+        tests: [
+            {
+                in: {
+                    p: "مسوول",
+                    f: "mas'ool",
+                },
+                out: "مَسؤول", // TODO: Is this best??
+            },
+        ],
+    },
+    {
+        describe: "allow for beginnings prefixed with ور در را",
+        tests: [
+            {
+                in: {
+                    p: "وراوږد",
+                    f: "wăr-ooGad",
+                },
+                out: "وَراُوږَد",
+            },
+            {
+                in: {
+                    p: "دراوږد",
+                    f: "dăr-ooGad",
+                },
+                out: "دَراُوږَد",
+            },
+            {
+                in: {
+                    p: "رااوږد",
+                    f: "raa-ooGad",
+                },
+                out: "رااُوږَد",
+            },
+        ],
+    },
+    {
+        describe: "allow oo at start with و prefix",
+        tests: [
+            {
+                in: {
+                    p: "وباسي",
+                    f: "oobaasee",
+                },
+                out: "وُباسي",
+            },
+            {
+                in: {
+                    p: "وځم",
+                    f: "oodzum",
+                },
+                out: "وُځ" + zwarakey + "م",
+            },
+            {
+                in: {
+                    p: "وځم",
+                    f: "wUdzum",
+                },
+                out: "وُځ" + zwarakey + "م",
+            },
+        ],
+    },
 ];
 
 diacriticsSections.forEach((section) => {
-    // if (!section.describe.includes("require fathatan")) return;
+    // if (!section.describe.includes("allow for beginnings")) return;
     describe(section.describe, () => {
         section.tests.forEach((t) => {
             if (t.out) {
@@ -1132,36 +1244,16 @@ diacriticsSections.forEach((section) => {
     });
 });
 
-// ERRORS
+test("ending with left over Pashto script will throw an error", () => { // mismatch case: more script than phonetics
+    expect(() => {
+        addDiacritics({ p: "کور ته", f: "kor" }); // p holds two words, f only supplies phonetics for one
+    }).toThrow(`phonetics error - phonetics shorter than pashto script`); // pins the exact error message
+});
 
-// const brokenDiacritics = [
-//     {
-//         p: "تشناب",
-//         f: "peshnaab",
-//     },
-//     {
-//         p: "وسېدل",
-//         f: "osedul",
-//     },
-// ];
+test("ending with left over phonetics will throw an error", () => { // mismatch case: more phonetics than script
+    expect(() => {
+        addDiacritics({ p: "کار", f: "kaar kawul" }); // f has a second word ("kawul") with no script in p to pair with
+    }).toThrow(); // only asserts that something is thrown; the message is not pinned here
+});
 
-// test("ending with left over Pashto script will throw an error", () => {
-//     expect(() => {
-//         addDiacritics({ p: "کور ته", f: "kor" });
-//     }).toThrow(`phonetics error - phonetics shorter than pashto script`);
-// });
-
-// test("ending with left over phonetics will throw an error", () => {
-//     expect(() => {
-//         addDiacritics({ p: "کار", f: "kaar kawul" });
-//     }).toThrow();
-// });
-
-// test("adding diacritics errors when phonetecs and pashto do not line up", () => {
-//     brokenDiacritics.forEach((t) => {
-//         expect(() => {
-//             addDiacritics(t);
-//         }).toThrow();
-//     });
-// });
 
diff --git a/src/lib/diacritics.ts b/src/lib/diacritics.ts
index 6ce4601..b2b8608 100644
--- a/src/lib/diacritics.ts
+++ b/src/lib/diacritics.ts
@@ -10,7 +10,6 @@ import * as T from "../types";
 import {
     splitFIntoPhonemes,
     Phoneme,
-    phonemeTable,
     zwar,
     zwarakey,
     zer,
@@ -19,8 +18,6 @@ import {
     hamzaAbove,
     tashdeed,
     wasla,
-    daggerAlif,
-    fathahan,
     addP,
     advanceP,
     reverseP,
@@ -41,7 +38,7 @@ import { pipe } from "rambda";
  */
  export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsString {
     const phonemes: Phoneme[] = splitFIntoPhonemes(!ignoreCommas ? firstPhonetics(f) : f);
-    const { pIn, pOut } = phonemes.reduce(processPhoneme, { pOut: "", pIn: p });
+    const { pIn, pOut } = phonemes.reduce(processPhoneme, { pOut: "", pIn: p.trim() });
     if (pIn !== "") {
         throw new Error("phonetics error - phonetics shorter than pashto script");
     }
@@ -57,19 +54,11 @@ function processPhoneme(
     i: number,
     phonemes: Phoneme[],
 ): DiacriticsAccumulator {
-    // console.log("PHONEME", phoneme);
-    // console.log("space coming up", acc.pIn[0] === " ");
-    // console.log("state", acc);
-    // Prep state
-    // TODO: CLEANER function jump to next char
     const state = acc.pIn.slice(0, 5) === " ... "
         ? advanceP(acc, 5)
         : acc.pIn[0] === " "
         ? advanceP(acc)
         : acc;
-    // console.log("AFTER SPACE PREP", phoneme);
-    // console.log("state", state);
-    // WARNING: Do not use acc after this point!
 
     const {
         phonemeInfo,
@@ -78,10 +67,6 @@ function processPhoneme(
         prevPLetter,
     } = stateInfo({ state, i, phoneme, phonemes });
 
-    // console.log("phoneme", phoneme);
-    // console.log("state", state);
-    // console.log(phs); 
-
     return (phs === PhonemeStatus.LeadingLongVowel) ?
             pipe(
                 advanceP,
@@ -95,6 +80,7 @@ function processPhoneme(
             )(state)
         : (phs === PhonemeStatus.DoubleConsonantTashdeed) ?
             pipe(
+                prevPLetter === " " ? reverseP : addP(""),
                 addP(tashdeed)
             )(state)
         : (phs === PhonemeStatus.EndingWithHeyHim) ?
@@ -201,18 +187,19 @@ function processPhoneme(
                 addP(pesh),
                 advanceP,
             )(state)
+        : (phs === PhonemeStatus.OoPrefix) ?
+            pipe(
+                advanceP,
+                addP(pesh),
+            )(state)
+        : (phs === PhonemeStatus.GlottalStopBeforeOo) ?
+            pipe(
+                advanceP,
+                addP(hamzaAbove),
+            )(state)
+        : (phs === PhonemeStatus.OoAfterGlottalStopOo) ?
+            pipe(
+                advanceP,
+            )(state)
         : state;
-
-
-        
-        
-        // (phs === PhonemeStatus.AlefWithHamzaWithGlottalStop) ?
-        //    state
-        // : (phs === PhonemeStatus.AinBeginningAfterShortVowel) ?
-        //    state
-        //: (phs === PhonemeStatus.WoEndingO) ?
-        //    state
-        // :
-        // 
-
 }
diff --git a/src/library.ts b/src/library.ts
index 06bf9a2..8b5a34f 100644
--- a/src/library.ts
+++ b/src/library.ts
@@ -63,6 +63,9 @@ import {
 import {
     translatePhonetics,
 } from "./lib/translate-phonetics";
+import {
+    addDiacritics,
+} from "./lib/diacritics";
 import defaultTextOptions from "./lib/default-text-options";
 import * as grammarUnits from "./lib/grammar-units";
 import * as Types from "./types";
@@ -83,6 +86,7 @@ export {
     isNounAdjOrVerb,
     simplifyPhonetics,
     phoneticsToDiacritics,
+    addDiacritics,
     translatePhonetics,
     // protobuf helpers
     readDictionary,