some more refactoring, getting stuck on the du behaviour 😒

2021-05-16 17:13:42 +03:00 · 2021-05-16 17:13:42 +03:00 · 73f786890e
parent c5c9ea86d2
commit 73f786890e
4 changed files with 1147 additions and 960 deletions
--- a/src/lib/diacritics-helpers.test.ts
+++ b/src/lib/diacritics-helpers.test.ts
@ -0,0 +1,104 @@
+import {
+    splitFIntoPhonemes,
+    last,
+    addP,
+    prev2Chars,
+    advanceP,
+    reverseP,
+    overwriteP,
+    advanceForAin,
+    advanceForAinOrHamza,
+    advanceForHamzaMid,
+} from "./diacritics-helpers";
+
+const phonemeSplits: Array<{
+    in: string,
+    out: string[],
+}> = [
+    {
+        in: "kor",
+        out: ["k", "o", "r"],
+    },
+    {
+        in: "raaghey",
+        out: ["r", "aa", "gh", "ey"],
+    },
+    {
+        in: "hatsa",
+        out: ["h", "a", "ts", "a"],
+    },
+    {
+        in: "ba",
+        out: ["b", "a"],
+    },
+    {
+        in: "peydáa",
+        out: ["p", "ey", "d", "aa"],
+    },
+    {
+        in: "be kaar",
+        out: ["b", "e", "k", "aa", "r"],
+    },
+    {
+        in: "raadzeyy",
+        out: ["r", "aa", "dz", "eyy"],
+    },
+    {
+        in: "badanuy ??",
+        out: ["b", "a", "d", "a", "n", "uy"],
+    },
+    {
+        in: "tur ... pore",
+        out: ["t", "u", "r", "p", "o", "r", "e"],
+    },
+    {
+        in: "daar-Ul-iqaama",
+        out: ["d", "aa", "r", "-Ul-", "i", "q", "aa", "m", "a"],
+    },
+];
+
+phonemeSplits.forEach((s) => {
+    test(`${s.in} should split properly`, () => {
+        const result = splitFIntoPhonemes(s.in);
+        expect(result).toEqual(s.out);
+    });
+});
+
+const badPhonetics: Array<{
+    in: string,
+    problem: string,
+}> = [
+    {
+        in: "acar",
+        problem: "c",
+    },
+    {
+        in: "a7am",
+        problem: "7",
+    },
+];
+
+test("bad phonetic characters should throw an error", () => {
+    badPhonetics.forEach((s) => {
+        expect(() => {
+            splitFIntoPhonemes(s.in);
+        }).toThrow(`illegal phonetic character: ${s.problem}`);
+    });
+});
+
+test("last should work", () => {
+    expect(last("this")).toBe("s");
+});
+
+test("addP should work", () => {
+    expect(addP("ت")({ pIn: "", pOut: "کر" })).toEqual({
+        pIn: "",
+        pOut: "کرت",
+    });
+});
+
+test("prev2Chars should work", () => {
+    expect(prev2Chars("تورن")).toBe("رن");
+    expect(prev2Chars("وست .. ")).toBe("ست");
+    expect(prev2Chars("دَ ... ")).toBe("دَ");
+});
--- a/src/lib/diacritics-helpers.ts
+++ b/src/lib/diacritics-helpers.ts
@ -0,0 +1,401 @@
+/**
+ * Copyright (c) 2021 lingdocs.com
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ */
+
+import { removeAccents } from "./accent-helpers";
+
+export type DiacriticsAccumulator = { pIn: string, pOut: string };
+
+type Consonant = "b" | "p" | "t" | "T" | "s" | "j" | "ch" | "kh" | "ts" | "dz" | "d" | "D" | "r" | "R" | "z" | "jz" | "G" | "sh" | "x" | "gh" | "f" | "q" | "k" | "g" | "l" | "m" | "n" | "N" | "h" | "w" | "y";
+type Ain = "'"
+type JoiningVowel = "-i-" | "-U-" | "-Ul-"; 
+type LongVowel = "aa" | "ee" | "e" | "oo" | "o" | "ey" | "uy" | "eyy";
+type ShortVowel = "a" | "i" | "u" | "U";
+export type Phoneme = Consonant | Ain | LongVowel | ShortVowel | JoiningVowel;
+
+type PhonemeInfo = {
+    matches?: string[],
+    beginningMatches?: string[],
+    endingMatches?: string[],
+    consonant?: true,
+    diacritic?: string,
+    endingOnly?: true,
+    takesSukunOnEnding?: true,
+    longVowel?: true,
+    canStartWithAynBefore?: true,
+    useEndingDiacritic?: true,
+}
+
+export const zwar = "َ";
+export const zwarakey = "ٙ";
+export const zer = "ِ";
+export const pesh = "ُ";
+export const sukun = "ْ";
+export const hamzaAbove = "ٔ";
+export const tashdeed = "ّ";
+export const wasla = "ٱ";
+export const daggerAlif = "ٰ";
+export const fathahan = "ً";
+
+export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
+    // Consonants
+    "b": {
+        matches: ["ب"],
+        consonant: true,
+    },
+    "p": {
+        matches: ["پ"],
+        consonant: true,
+    },
+    "t": {
+        matches: ["ت", "ط"],
+        consonant: true,
+    },
+    "T": {
+        matches: ["ټ"],
+        consonant: true,
+    },
+    "s": {
+        matches: ["س", "ص", "ث"],
+        consonant: true,
+    },
+    "j": {
+        matches: ["ج"],
+        consonant: true,
+    },
+    "ch": {
+        matches: ["چ"],
+        consonant: true,
+    },
+    "kh": {
+        matches: ["خ"],
+        consonant: true,
+    },
+    "ts": {
+        matches: ["څ"],
+        consonant: true,
+    },
+    "dz": {
+        matches: ["ځ"],
+        consonant: true,
+    },
+    "d": {
+        matches: ["د"],
+        consonant: true,
+    },
+    "D": {
+        matches: ["ډ"],
+        consonant: true,
+    },
+    "r": {
+        matches: ["ر"],
+        consonant: true,
+    },
+    "R": {
+        matches: ["ړ"],
+        consonant: true,
+    },
+    "z": {
+        matches: ["ز", "ذ", "ظ", "ض"],
+        consonant: true,
+    },
+    "jz": {
+        matches: ["ژ"],
+        consonant: true,
+    },
+    "G": {
+        matches: ["ږ"],
+        consonant: true,
+    },
+    "sh": {
+        matches: ["ش"],
+        consonant: true,
+    },
+    "x": {
+        matches: ["ښ"],
+        consonant: true,
+    },
+    "gh": {
+        matches: ["غ"],
+        consonant: true,
+    },
+    "f": {
+        matches: ["ف"],
+        consonant: true,
+    },
+    "q": {
+        matches: ["ق"],
+        consonant: true,
+    },
+    "k": {
+        matches: ["ک"],
+        consonant: true,
+    },
+    "g": {
+        matches: ["ګ"],
+        consonant: true,
+    },
+    "l": {
+        matches: ["ل"],
+        consonant: true,
+    },
+    "m": {
+        matches: ["م"],
+        consonant: true,
+    },
+    "n": {
+        matches: ["ن"],
+        consonant: true,
+    },
+    "N": {
+        matches: ["ڼ"],
+        consonant: true,
+    },
+    "h": {
+        matches: ["ه", "ح"],
+        consonant: true,
+        takesSukunOnEnding: true,
+    },
+    "w": {
+        matches: ["و"],
+        consonant: true,
+    },
+    "y": {
+        matches: ["ی"],
+        consonant: true,
+    },
+    // Ain
+    "'": {
+        matches: ["ع", "ئ"],
+        consonant: true,
+    },
+    // Joining Vowels
+    "-i-": {
+    },
+    "-U-": {
+        matches: [" و ", "و"],
+    },
+    "-Ul-": {
+        matches: ["ال"],
+    },
+    // Long Vowels
+    "aa": {
+        matches: ["ا"],
+        beginningMatches: ["آ", "ا"],
+        endingMatches: ["ا", "یٰ"],
+        longVowel: true,
+    },
+    "ee": {
+        matches: ["ی"],
+        longVowel: true,
+        endingMatches: ["ي"],
+        diacritic: zer,
+        canStartWithAynBefore: true
+    },
+    "e": {
+        matches: ["ې"],
+        longVowel: true,
+    },
+    "o": {
+        matches: ["و"],
+        longVowel: true,
+    },
+    "oo": {
+        matches: ["و"],
+        longVowel: true,
+        // alsoCanBePrefix: true,
+        diacritic: pesh,
+        useEndingDiacritic: true,
+    },
+    "ey": {
+        matches: ["ی"],
+        longVowel: true,
+        endingMatches: ["ی"],
+    },
+    "uy": {
+        matches: ["ۍ"],
+        longVowel: true,
+        endingOnly: true,
+    },
+    "eyy": {
+        matches: ["ئ"],
+        longVowel: true,
+        endingOnly: true,
+    },
+    // Short Vowels
+    "a": {
+        diacritic: zwar,
+        endingMatches: ["ه"],
+        beginningMatches: ["ا", "ع"],
+        // canComeAfterHeyEnding: true,
+        // canBeFirstPartOfFathahanEnding: true,
+    },
+    "u": {
+        diacritic: zwarakey,
+        endingMatches: ["ه"],
+    },
+    "i": {
+        diacritic: zer,
+        endingMatches: ["ه"],
+        beginningMatches: ["ا", "ع"],
+        // takesDiacriticBeforeGurdaHeyEnding: true,
+        // canBeWasla: true,
+    },
+    "U": {
+        diacritic: pesh,
+        endingMatches: ["ه"],
+        // takesDiacriticBeforeGurdaHeyEnding: true,
+        beginningMatches: ["ا", "ع"],
+    },
+}
+
+/**
+ * splits a phonetics string into an array of Phonemes
+ * 
+ * will error if there is an illeagal phonetics character
+ * 
+ * @param fIn a phonetics string
+ * @returns an array of phonemes
+ */
+ export function splitFIntoPhonemes(fIn: string): Phoneme[] {
+    const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y"];
+    
+    const quadrigraphs: Phoneme[] = ["-Ul-"];
+    const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"];
+    const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"];
+    const endingDigraphs: Phoneme[] = ["uy"];
+    const willIgnore = ["?", " ", "`", ".", "…", ",", "'"];
+    
+    const result: Phoneme[] = [];
+    const f = removeAccents(fIn);
+    let index = 0;
+    while (index < f.length) {
+        const isLastTwoLetters = (index === f.length - 2 || f[index + 2] === " ");
+        const threeLetterChunk = f.slice(index, index + 3) as Phoneme;
+        const fourLetterChunk = f.slice(index, index + 4) as Phoneme;
+        if (quadrigraphs.includes(fourLetterChunk)) {
+            result.push(fourLetterChunk);
+            index += 4;
+            continue;
+        }
+        if (trigraphs.includes(threeLetterChunk)) {
+            result.push(threeLetterChunk);
+            index += 3;
+            continue;
+        }
+        const twoLetterChunk = f.slice(index, index + 2) as Phoneme;
+        if (
+            digraphs.includes(twoLetterChunk) ||
+            (isLastTwoLetters && endingDigraphs.includes(twoLetterChunk))
+        ) {
+            result.push(twoLetterChunk);
+            index += 2;
+            continue;
+        }
+        const singleLetter = f.slice(index, index + 1) as Phoneme;
+        if (!willIgnore.includes(singleLetter)) {
+            if (!singleLetterPhonemes.includes(singleLetter)) {
+                throw new Error(`illegal phonetic character: ${singleLetter}`);
+            }
+            result.push(singleLetter);
+        }
+        index++;
+    }
+    return result;
+}
+/**
+ * returns the last character of a string
+ * 
+ * @param s 
+ */
+export function last(s: string) {
+    return s[s.length - 1];
+}
+
+export function advanceP(state: DiacriticsAccumulator, n: number = 1): DiacriticsAccumulator {
+    return {
+        pIn: state.pIn.slice(n),
+        pOut: state.pOut + state.pIn.slice(0, n),
+    };
+}
+
+/**
+ * moves back to the last character that wasn't a " " or "."
+ * 
+ * @param state 
+ * @returns 
+ */
+export function reverseP(state: DiacriticsAccumulator): DiacriticsAccumulator {
+    const reversed = [...state.pOut].reverse();
+    const howFar = reversed.findIndex((c) => ![" ", "."].includes(c));
+    return {
+        pIn: state.pOut.slice(-howFar) + state.pIn,
+        pOut: state.pOut.slice(0, -howFar),
+    };
+}
+
+export const addP = (toAdd: string | undefined) => (state: DiacriticsAccumulator): DiacriticsAccumulator => {
+    return {
+        ...state,
+        pOut: toAdd ? (state.pOut + toAdd) : state.pOut,
+    };
+};
+
+export const overwriteP = (toWrite: string) => (state: DiacriticsAccumulator): DiacriticsAccumulator => {
+    return {
+        pIn: state.pIn.slice(1),
+        pOut: state.pOut + toWrite,
+    };
+};
+
+/**
+ * returns the last two character in a string that was not a space or a dote
+ * 
+ * @param s 
+ * @returns 
+ */
+export function prev2Chars(s: string): string {
+    // console.log("looking at pOut", s);
+    const reversed = [...s].reverse();
+    // console.log(reversed.join("-"));
+    const lastIndex = reversed.findIndex((c) => ![" ", "."].includes(c));
+    const last2 = reversed[lastIndex + 1] + reversed[lastIndex];
+    // console.log("last2", last2);
+    return last2;
+}
+
+export function getCurrentNext(state: DiacriticsAccumulator): { current: string, next: string} {
+    return {
+        current: state.pIn[0],
+        next: state.pIn[1],
+    };
+}
+
+export function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator {
+    const { current } = getCurrentNext(state);
+    return (current === "ع") ? advanceP(state) : state;
+}
+
+export function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccumulator {
+    const { current, next } = getCurrentNext(state);
+    if (current === "ئ" && next && next !== "ئ") {
+        return advanceP(state);
+    }
+    return state;
+}
+
+export function advanceForAinOrHamza(state: DiacriticsAccumulator): DiacriticsAccumulator {
+    const { current, next } = getCurrentNext(state);
+    if (current === "ه" && (!next || next === " ")) {
+        return advanceP(state);
+    }
+    if (current === "ع") {
+        return advanceP(state);
+    }
+    return state;
+}
+
--- a/src/lib/diacritics.test.ts
+++ b/src/lib/diacritics.test.ts
@ -8,71 +8,31 @@

 import {
    addDiacritics,
-    splitFIntoPhonemes,
 } from "./diacritics";
+import {
+    zwar,
+    zwarakey,
+    zer,
+    pesh,
+    sukun,
+    hamzaAbove,
+    tashdeed,
+    wasla,
+    daggerAlif,
+    fathahan,
+} from "./diacritics-helpers";
 import * as T from "../types";

-const zwar = "َ";
-const zwarakey = "ٙ";
-const zer = "ِ";
-const pesh = "ُ";
-const sukun = "ْ";
-const hamzaAbove = "ٔ";
-const tashdeed = "ّ";
-const wasla = "ٱ";
-const daggerAlif = "ٰ";
-const fathahan = "ً";
-
-const phonemeSplits: Array<{
-    in: string,
-    out: string[],
-}> = [
-    {
-        in: "kor",
-        out: ["k", "o", "r"],
-    },
-    {
-        in: "raaghey",
-        out: ["r", "aa", "gh", "ey"],
-    },
-    {
-        in: "hatsa",
-        out: ["h", "a", "ts", "a"],
-    },
-    {
-        in: "ba",
-        out: ["b", "a"],
-    },
-    {
-        in: "peydáa",
-        out: ["p", "ey", "d", "aa"],
-    },
-    {
-        in: "be kaar",
-        out: ["b", "e", "k", "aa", "r"],
-    },
-    {
-        in: "raadzeyy",
-        out: ["r", "aa", "dz", "eyy"],
-    },
-    {
-        in: "badanuy ??",
-        out: ["b", "a", "d", "a", "n", "uy"],
-    },
-    {
-        in: "tur ... pore",
-        out: ["t", "u", "r", "p", "o", "r", "e"],
-    },
-    {
-        in: "daar-Ul-iqaama",
-        out: ["d", "aa", "r", "-Ul-", "i", "q", "aa", "m", "a"],
-    },
-];
-
-const diacriticsTest: Array<{
+const diacriticsSections: {
+    describe: string,
+    tests: {
        in: T.PsString,
-    out: string,
-}> = [
+        out: string | null,
+    }[],
+}[] = [
+    {
+        describe: "regular, native Pashto script/sounds",
+        tests: [
            {
                in: {
                p: "کور",
@ -403,14 +363,6 @@ const diacriticsTest: Array<{
                },
                out: "بې کار",
            },
-    // TODO: nb mb thing
-    {
-        in: {
-            p: "انبار",
-            f: "ambáar",
-        },
-        out: "اَنْبار",
-    },
            {
                in: {
                    p: "ارغون",
@ -499,14 +451,6 @@ const diacriticsTest: Array<{
                },
                out: "پَتَّه تُور",      
            },
-    // get ayn stuff working
-    {
-        in: {
-            p: "اعتصاب شکن",
-            f: "itisaab shakan",
-        },
-        out: "اِعتِصاب شَکَن",
-    },
            // avoid false double consonant
            {
                in: {
@ -515,6 +459,30 @@ const diacriticsTest: Array<{
                },
                out: "اَزَل لِیک",
            },
+        ],
+    },
+    {
+        describe: "nm - mb thing",
+        tests: [
+            {
+                in: {
+                    p: "انبار",
+                    f: "ambáar",
+                },
+                out: "اَنْبار",
+            },
+        ],
+    },
+    {
+        describe: "ayn stuff",
+        tests: [
+            {
+                in: {
+                    p: "اعتصاب شکن",
+                    f: "itisaab shakan",
+                },
+                out: "اِعتِصاب شَکَن",
+            },
            // starting with ع
            {
                in: {
@ -530,7 +498,26 @@ const diacriticsTest: Array<{
                },
                out: "عِزَّت",
            },
-    // ئ in the middle
+            // ending with ayn
+            {
+                in: {
+                    p: "طمع کېدل",
+                    f: "tama kedul",
+                },
+                out: "طَمَع کېد" + zwarakey + "ل",
+            },
+            {
+                in: {
+                    p: "منبع",
+                    f: "manbí",
+                },
+                out: "مَنْبِع",
+            },
+        ],
+    },
+    {
+        describe: "ئ in the middle",
+        tests: [
            {
                in: {
                    p: "برائت",
@ -545,7 +532,11 @@ const diacriticsTest: Array<{
                },
                out: "فائِدَه",
            },
-    // واخ being khaa in the middle of a word
+        ],
+    },
+    {
+        describe: "واخ being khaa in the middle of a word",
+        tests: [
            {
                in: {
                    p: "استخوان",
@ -553,7 +544,11 @@ const diacriticsTest: Array<{
                },
                out: "اُسْتُخ(و)ان",
            },
-    // Arabic wasla
+        ],
+    },
+    {
+        describe: "Arabic wasla",
+        tests: [
            {
                in: {
                    p: "بالکل",
@ -561,7 +556,11 @@ const diacriticsTest: Array<{
                },
                out: "بِٱلْکُل",
            },
-    // izafe
+        ],
+    },
+    {
+        describe: "izafe",
+        tests: [
            {
                in: {
                    p: "ایصال ثواب",
@ -569,20 +568,52 @@ const diacriticsTest: Array<{
                },
                out: "اِیصالِ ثَواب",
            },
+        ],
+    },
+    {
+        describe: "special behaviour with د",
+        tests: [
+            {
+                in: {
+                    p: "د",
+                    f: "du",
+                },
+                out: "د" + zwarakey,
+            },
+            {
+                in: {
+                    p: "د لاس",
+                    f: "du laas",
+                },
+                out: "د" + zwarakey + " لاس",
+            },
+            // {
+            //     in: {
+            //         p: "د ... په شان",
+            //         f: "du ... pu shaan",
+            //     },
+            //     out: "د" + zwarakey + "... پهٔ شان",
+            // },
+        ],
+    },
 ];

-phonemeSplits.forEach((s) => {
-    test(`${s.in} should split properly`, () => {
-        const result = splitFIntoPhonemes(s.in);
-        expect(result).toEqual(s.out);
-    });
-});
-
-
-diacriticsTest.forEach((t) => {
+diacriticsSections.forEach((section) => {
+    describe(section.describe, () => {
+        section.tests.forEach((t) => {
+            if (section.describe === "special behaviour with د") {
+                if (t.out) {
                    test(`diacritics should work for ${t.in.p} - ${t.in.f}`, () => {
                        expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f });
                    });
+                } else {
+                    expect(() => {
+                        expect(addDiacritics(t.in)).toThrowError();
+                    });
+                }
+            }
+        });
+    });
 });

 // ERRORS
@ -598,44 +629,23 @@ const brokenDiacritics = [
    },
 ];

-const badPhonetics: Array<{
-    in: string,
-    problem: string,
-}> = [
-    {
-        in: "acar",
-        problem: "c",
-    },
-    {
-        in: "a7am",
-        problem: "7",
-    },
-];
+// test("ending with left over Pashto script will throw an error", () => {
+//     expect(() => {
+//         addDiacritics({ p: "کور ته", f: "kor" });
+//     }).toThrow(`phonetics error - phonetics shorter than pashto script`);
+// });

-test("bad phonetic characters should throw an error", () => {
-    badPhonetics.forEach((s) => {
-        expect(() => {
-            splitFIntoPhonemes(s.in);
-        }).toThrow(`illegal phonetic character: ${s.problem}`);
-    });
-});
+// test("ending with left over phonetics will throw an error", () => {
+//     expect(() => {
+//         addDiacritics({ p: "کار", f: "kaar kawul" });
+//     }).toThrow();
+// });

-test("ending with left over Pashto script will throw an error", () => {
-    expect(() => {
-        addDiacritics({ p: "کور ته", f: "kor" });
-    }).toThrow(`phonetics error - phonetics shorter than pashto script`);
-});
+// test("adding diacritics errors when phonetecs and pashto do not line up", () => {
+//     brokenDiacritics.forEach((t) => {
+//         expect(() => {
+//             addDiacritics(t);
+//         }).toThrow();
+//     });
+// });

-test("ending with left over phonetics will throw an error", () => {
-    expect(() => {
-        addDiacritics({ p: "کار", f: "kaar kawul" });
-    }).toThrow();
-});
-
-test("adding diacritics errors when phonetecs and pashto do not line up", () => {
-    brokenDiacritics.forEach((t) => {
-        expect(() => {
-            addDiacritics(t);
-        }).toThrow();
-    });
-});
--- a/src/lib/diacritics.ts
+++ b/src/lib/diacritics.ts
@ -7,255 +7,35 @@
 */

 import * as T from "../types";
-import { removeAccents } from "./accent-helpers";
+import {
+    splitFIntoPhonemes,
+    Phoneme,
+    phonemeTable,
+    zwar,
+    zwarakey,
+    zer,
+    pesh,
+    sukun,
+    hamzaAbove,
+    tashdeed,
+    wasla,
+    daggerAlif,
+    fathahan,
+    prev2Chars,
+    addP,
+    last,
+    advanceP,
+    reverseP,
+    overwriteP,
+    advanceForAin,
+    advanceForAinOrHamza,
+    advanceForHamzaMid,
+    DiacriticsAccumulator,
+} from "./diacritics-helpers";
+
 import { firstPhonetics } from "./p-text-helpers";
 import { pipe } from "rambda";

-const zwar = "َ";
-const zwarakey = "ٙ";
-const zer = "ِ";
-const pesh = "ُ";
-const sukun = "ْ";
-const hamzaAbove = "ٔ";
-const tashdeed = "ّ";
-const wasla = "ٱ";
-const daggerAlif = "ٰ";
-const fathahan = "ً";
-
-type Consonant = "b" | "p" | "t" | "T" | "s" | "j" | "ch" | "kh" | "ts" | "dz" | "d" | "D" | "r" | "R" | "z" | "jz" | "G" | "sh" | "x" | "gh" | "f" | "q" | "k" | "g" | "l" | "m" | "n" | "N" | "h" | "w" | "y";
-type Ain = "'"
-type JoiningVowel = "-i-" | "-U-" | "-Ul-"; 
-type LongVowel = "aa" | "ee" | "e" | "oo" | "o" | "ey" | "uy" | "eyy";
-type ShortVowel = "a" | "i" | "u" | "U";
-type Phoneme = Consonant | Ain | LongVowel | ShortVowel | JoiningVowel;
-
-type DiacriticsAccumulator = { pIn: string, pOut: string };
-
-type PhonemeInfo = {
-    matches?: string[],
-    beginningMatches?: string[],
-    endingMatches?: string[],
-    consonant?: true,
-    diacritic?: string,
-    endingOnly?: true,
-    takesSukunOnEnding?: true,
-    longVowel?: true,
-    canStartWithAynBefore?: true,
-    useEndingDiacritic?: true,
-}
-
-const phonemeTable: Record<Phoneme, PhonemeInfo> = {
-    // Consonants
-    "b": {
-        matches: ["ب"],
-        consonant: true,
-    },
-    "p": {
-        matches: ["پ"],
-        consonant: true,
-    },
-    "t": {
-        matches: ["ت", "ط"],
-        consonant: true,
-    },
-    "T": {
-        matches: ["ټ"],
-        consonant: true,
-    },
-    "s": {
-        matches: ["س", "ص", "ث"],
-        consonant: true,
-    },
-    "j": {
-        matches: ["ج"],
-        consonant: true,
-    },
-    "ch": {
-        matches: ["چ"],
-        consonant: true,
-    },
-    "kh": {
-        matches: ["خ"],
-        consonant: true,
-    },
-    "ts": {
-        matches: ["څ"],
-        consonant: true,
-    },
-    "dz": {
-        matches: ["ځ"],
-        consonant: true,
-    },
-    "d": {
-        matches: ["د"],
-        consonant: true,
-    },
-    "D": {
-        matches: ["ډ"],
-        consonant: true,
-    },
-    "r": {
-        matches: ["ر"],
-        consonant: true,
-    },
-    "R": {
-        matches: ["ړ"],
-        consonant: true,
-    },
-    "z": {
-        matches: ["ز", "ذ", "ظ", "ض"],
-        consonant: true,
-    },
-    "jz": {
-        matches: ["ژ"],
-        consonant: true,
-    },
-    "G": {
-        matches: ["ږ"],
-        consonant: true,
-    },
-    "sh": {
-        matches: ["ش"],
-        consonant: true,
-    },
-    "x": {
-        matches: ["ښ"],
-        consonant: true,
-    },
-    "gh": {
-        matches: ["غ"],
-        consonant: true,
-    },
-    "f": {
-        matches: ["ف"],
-        consonant: true,
-    },
-    "q": {
-        matches: ["ق"],
-        consonant: true,
-    },
-    "k": {
-        matches: ["ک"],
-        consonant: true,
-    },
-    "g": {
-        matches: ["ګ"],
-        consonant: true,
-    },
-    "l": {
-        matches: ["ل"],
-        consonant: true,
-    },
-    "m": {
-        matches: ["م"],
-        consonant: true,
-    },
-    "n": {
-        matches: ["ن"],
-        consonant: true,
-    },
-    "N": {
-        matches: ["ڼ"],
-        consonant: true,
-    },
-    "h": {
-        matches: ["ه", "ح"],
-        consonant: true,
-        takesSukunOnEnding: true,
-    },
-    "w": {
-        matches: ["و"],
-        consonant: true,
-    },
-    "y": {
-        matches: ["ی"],
-        consonant: true,
-    },
-    // Ain
-    "'": {
-        matches: ["ع", "ئ"],
-        consonant: true,
-    },
-    // Joining Vowels
-    "-i-": {
-    },
-    "-U-": {
-        matches: [" و ", "و"],
-    },
-    "-Ul-": {
-        matches: ["ال"],
-    },
-    // Long Vowels
-    "aa": {
-        matches: ["ا"],
-        beginningMatches: ["آ", "ا"],
-        endingMatches: ["ا", "یٰ"],
-        longVowel: true,
-    },
-    "ee": {
-        matches: ["ی"],
-        longVowel: true,
-        endingMatches: ["ي"],
-        diacritic: zer,
-        canStartWithAynBefore: true
-    },
-    "e": {
-        matches: ["ې"],
-        longVowel: true,
-    },
-    "o": {
-        matches: ["و"],
-        longVowel: true,
-    },
-    "oo": {
-        matches: ["و"],
-        longVowel: true,
-        // alsoCanBePrefix: true,
-        diacritic: pesh,
-        useEndingDiacritic: true,
-    },
-    "ey": {
-        matches: ["ی"],
-        longVowel: true,
-        endingMatches: ["ی"],
-    },
-    "uy": {
-        matches: ["ۍ"],
-        longVowel: true,
-        endingOnly: true,
-    },
-    "eyy": {
-        matches: ["ئ"],
-        longVowel: true,
-        endingOnly: true,
-    },
-    // Short Vowels
-    "a": {
-        diacritic: zwar,
-        endingMatches: ["ه"],
-        beginningMatches: ["ا", "ع"],
-        // canComeAfterHeyEnding: true,
-        // canBeFirstPartOfFathahanEnding: true,
-    },
-    "u": {
-        diacritic: zwarakey,
-        endingMatches: ["ه"],
-    },
-    "i": {
-        diacritic: zer,
-        endingMatches: ["ه"],
-        beginningMatches: ["ا", "ع"],
-        // takesDiacriticBeforeGurdaHeyEnding: true,
-        // canBeWasla: true,
-    },
-    "U": {
-        diacritic: pesh,
-        endingMatches: ["ه"],
-        // takesDiacriticBeforeGurdaHeyEnding: true,
-        beginningMatches: ["ا", "ع"],
-    },
-}
-
 /**
 * Adds diacritics to a given PsString.
 * Errors if the phonetics and script don't line up.
@ -272,61 +52,6 @@ const phonemeTable: Record<Phoneme, PhonemeInfo> = {
    };
 }

-/**
- * splits a phonetics string into an array of Phonemes
- * 
- * will error if there is an illeagal phonetics character
- * 
- * @param fIn a phonetics string
- * @returns an array of phonemes
- */
-export function splitFIntoPhonemes(fIn: string): Phoneme[] {
-    const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y"];
-    
-    const quadrigraphs: Phoneme[] = ["-Ul-"];
-    const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"];
-    const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"];
-    const endingDigraphs: Phoneme[] = ["uy"];
-    const willIgnore = ["?", " ", "`", ".", "…", ",", "'"];
-    
-    const result: Phoneme[] = [];
-    const f = removeAccents(fIn);
-    let index = 0;
-    while (index < f.length) {
-        const isLastTwoLetters = (index === f.length - 2 || f[index + 2] === " ");
-        const threeLetterChunk = f.slice(index, index + 3) as Phoneme;
-        const fourLetterChunk = f.slice(index, index + 4) as Phoneme;
-        if (quadrigraphs.includes(fourLetterChunk)) {
-            result.push(fourLetterChunk);
-            index += 4;
-            continue;
-        }
-        if (trigraphs.includes(threeLetterChunk)) {
-            result.push(threeLetterChunk);
-            index += 3;
-            continue;
-        }
-        const twoLetterChunk = f.slice(index, index + 2) as Phoneme;
-        if (
-            digraphs.includes(twoLetterChunk) ||
-            (isLastTwoLetters && endingDigraphs.includes(twoLetterChunk))
-        ) {
-            result.push(twoLetterChunk);
-            index += 2;
-            continue;
-        }
-        const singleLetter = f.slice(index, index + 1) as Phoneme;
-        if (!willIgnore.includes(singleLetter)) {
-            if (!singleLetterPhonemes.includes(singleLetter)) {
-                throw new Error(`illegal phonetic character: ${singleLetter}`);
-            }
-            result.push(singleLetter);
-        }
-        index++;
-    }
-    return result;
-}
-
 enum PhonemeStatus {
    LeadingLongVowel,
    LeadingConsonantOrShortVowel,
@ -337,6 +62,7 @@ enum PhonemeStatus {
    PersianSilentWWithAa,
    ArabicWasla,
    Izafe,
+    EndOfDuParticle,
 }

 function processPhoneme(
@ -349,7 +75,9 @@ function processPhoneme(
    // console.log("space coming up", acc.pIn[0] === " ");
    // console.log("state", acc);
    // Prep state
-    const state = acc.pIn[0] === " " ? advanceP(acc) : acc;
+    const state = acc.pIn[0] === " "
+        ? advanceP(acc)
+        : acc;
    // console.log("AFTER SPACE PREP", phoneme);
    // console.log("state", state);
    // WARNING: Do not use acc after this point!
@ -403,6 +131,11 @@ function processPhoneme(
                reverseP,
                addP(zer),
            )(state)
+        : (phs === PhonemeStatus.EndOfDuParticle) ?
+            (console.log("here"), pipe(
+                reverseP,
+                addP(zwarakey),
+            )(state))
        :
        // phs === PhonemeState.ShortVowel
            pipe(
@ -444,6 +177,11 @@ function stateInfo({ state, i, phonemes, phoneme }: {
        if (isBeginningOfWord && (phonemeInfo.beginningMatches?.includes(currentPLetter) || phonemeInfo.matches?.includes(currentPLetter))) {
            return PhonemeStatus.LeadingConsonantOrShortVowel;
        }
+        console.log(phoneme, phonemes, prev2Chars(state.pOut))
+        if (isBeginningOfWord && phoneme === "u" && prevPLetter === " " && prev2Chars(state.pOut) === ("د" + zwarakey)) {
+            // console.log("du here", phoneme, phonemes);
+            return PhonemeStatus.EndOfDuParticle
+        }
        if (!isBeginningOfWord && phoneme === "aa" && currentPLetter === "و" && nextPLetter === "ا") {
            return PhonemeStatus.PersianSilentWWithAa;
        }
@ -465,6 +203,7 @@ function stateInfo({ state, i, phonemes, phoneme }: {
        if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {
            return PhonemeStatus.ShortVowel;
        }
+        // console.log("bad phoneme is ", phoneme);
        throw new Error("phonetics error - no status found for phoneme: " + phoneme);
    }

@ -474,70 +213,3 @@ function stateInfo({ state, i, phonemes, phoneme }: {
        phs, phonemeInfo, sukunOrDiacritic,
    };
 };
-
-/**
- * returns the last character of a string
- * 
- * @param s 
- */
-function last(s: string) {
-    return s[s.length - 1];
-}
-
-function advanceP(state: DiacriticsAccumulator, n: number = 1): DiacriticsAccumulator {
-    return {
-        pIn: state.pIn.slice(n),
-        pOut: state.pOut + state.pIn.slice(0, n),
-    };
-}
-
-function reverseP(state: DiacriticsAccumulator): DiacriticsAccumulator {
-    return {
-        pIn: state.pOut.slice(-1) + state.pIn,
-        pOut: state.pOut.slice(0, -1),
-    };
-}
-
-const addP = (toAdd: string | undefined) => (state: DiacriticsAccumulator): DiacriticsAccumulator => {
-    return {
-        ...state,
-        pOut: toAdd ? (state.pOut + toAdd) : state.pOut,
-    };
-};
-
-const overwriteP = (toWrite: string) => (state: DiacriticsAccumulator): DiacriticsAccumulator => {
-    return {
-        pIn: state.pIn.slice(1),
-        pOut: state.pOut + toWrite,
-    };
-};
-
-function getCurrentNext(state: DiacriticsAccumulator): { current: string, next: string} {
-    return {
-        current: state.pIn[0],
-        next: state.pIn[1],
-    };
-}
-
-function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator {
-    const { current } = getCurrentNext(state);
-    return (current === "ع") ? advanceP(state) : state;
-}
-
-function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccumulator {
-    const { current, next } = getCurrentNext(state);
-    if (current === "ئ" && next && next !== "ئ") {
-        return advanceP(state);
-    }
-    return state;
-}
-function advanceForAinOrHamza(state: DiacriticsAccumulator): DiacriticsAccumulator {
-    const { current, next } = getCurrentNext(state);
-    if (current === "ه" && (!next || next === " ")) {
-        return advanceP(state);
-    }
-    if (current === "ع") {
-        return advanceP(state);
-    }
-    return state;
-}