more work on diacritics

2021-05-06 23:48:53 +03:00 · 2021-05-06 23:48:53 +03:00 · a2b5626514
parent 7b0e6d864f
commit a2b5626514
2 changed files with 144 additions and 6 deletions
--- a/src/lib/diacritics.test.ts
+++ b/src/lib/diacritics.test.ts
@ -12,6 +12,17 @@ import {
 } from "./diacritics";
 import * as T from "../types";
 const zwar = "َ";
 const zwarakey = "ٙ";
 const zer = "ِ";
 const pesh = "ُ";
 const sukun = "ْ";
 const hamzaAbove = "ٔ";
 const tashdeed = "ّ";
 const wasla = "ٱ";
 const daggerAlif = "ٰ";
 const fathahan = "ً";
 const phonemeSplits: Array<{
    in: string,
    out: string[],
@ -139,6 +150,105 @@ const diacriticsTest: Array<{
        },
        out: "تَشْناب",
    },
    // working with وs
    {
        in: {
            p: "کول",
            f: "kwal",
        },
        out: "کْوَل",
    },
    {
        in: {
            p: "تول",
            f: "tool",
        },
        out: "تُول",
    },
    {
        in: {
            p: "مقبول",
            f: "maqbool",
        },
        out: "مَقْبُول",
    },
    {
        in: {
            p: "کول",
            f: "kawul",
        },
        out: "کَو" + zwarakey + "ل",
    },
    {
        in: {
            p: "کول",
            f: "kiwul",
        },
        out: "کِو" + zwarakey + "ل",
    },
    {
        in: {
            p: "کول",
            f: "kUwul",
        },
        out: "کُو" + zwarakey + "ل",
    },
    {
        in: {
            p: "کول",
            f: "kuwul",
        },
        out: "ک" + zwarakey + "و" + zwarakey + "ل",
    },
    {
        in: {
            p: "کول",
            f: "kawal",
        },
        out: "کَوَل",
    },
    {
        in: {
            p: "کول",
            f: "kUwal",
        },
        out: "کُوَل",
    },
    {
        in: {
            p: "پشتګرد",
            f: "pishtgird",
        },
        out: "پِشْتْګِرْد",
    },
    {
        in: {
            p: "سپین",
            f: "speen",
        },
        out: "سْپِین",
    },
    {
        in: {
            p: "سپین",
            f: "speyn",
        },
        out: "سْپین",
    },
    {
        in: {
            p: "پېش",
            f: "pesh",
        },
        out: "پېش",
    },
    {
        in: {
            p: "بتن",
            f: "battan",
        },
        out: "ب" + zwar + "ت" + tashdeed + zwar + "ن",
    },
 ];
 const brokenDiacritics = [
@ -163,6 +273,12 @@ test("bad phonetic characters should throw an error", () => {
    });
 });
 test("ending with left over Pashto script will throw an error", () => {
    expect(() => {
        addDiacritics({ p: "کور ته", f: "kor" });
    }).toThrow(`phonetics error - phonetics shorter than pashto script`);
 });
 test("adding diacritics should work", () => {
    diacriticsTest.forEach((t) => {
        expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f });
--- a/src/lib/diacritics.ts
+++ b/src/lib/diacritics.ts
@ -248,7 +248,14 @@ const phonemeTable: Record<Phoneme, PhonemeInfo> = {
    },
 }
-
+/**
 * splits a phonetics string into an array of Phonemes
 * 
 * will error if there is an illegal phonetics character
 * 
 * @param fIn a phonetics string
 * @returns an array of phonemes
 */
 export function splitFIntoPhonemes(fIn: string): Phoneme[] {
    const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y"];
@ -306,16 +313,25 @@ export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsSt
    // TODO: 
    const phonemes: Phoneme[] = splitFIntoPhonemes(!ignoreCommas ? firstPhonetics(f) : f);
-    const { pOut } = phonemes.reduce((acc, phoneme, i) => {
+    const { pIn, pOut } = phonemes.reduce((acc, phoneme, i) => {
-        const isBeginningOfWord = acc.pOut === "" || last(acc.pOut) === " ";
+        const prevPLetter = last(acc.pOut);
        const isBeginningOfWord = acc.pOut === "" || prevPLetter === " ";
        const phonemeInfo = phonemeTable[phoneme];
        const previousPhoneme = i > 0 && phonemes[i-1];
        const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]];
        const currentPLetter = acc.pIn[0];
-        const needsSukun = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
+        const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
        const needsTashdeed = doubleConsonant && (previousPhoneme === phoneme);
        const needsSukun = doubleConsonant && (previousPhoneme !== phoneme);
        if (needsTashdeed) {
            return {
                pOut: acc.pOut + tashdeed,
                pIn: acc.pIn,
            };
        }
        if (phonemeInfo.matches?.includes(currentPLetter)) {
            // TODO: Check if tashdeed or sukun is used
            // const needsSukun = is consonant + previous phoneme was consonant + not beginning of word
            return {
                pOut: acc.pOut
                    + (needsSukun ? sukun : phonemeInfo.diacritic ? phonemeInfo.diacritic : "")
@ -331,9 +347,15 @@ export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsSt
            }
        }
        // TODO: CHECK IF PASHTO IS SHORTER THAN PHONETICS
        throw new Error("phonetics error");
    }, { pOut: "", pIn: p });
    if (pIn !== "") {
        throw new Error("phonetics error - phonetics shorter than pashto script");
    }
    return {
        p: pOut,
        f,