From 98c5eb745267679ef2f3c149963b39691bacfde3 Mon Sep 17 00:00:00 2001 From: Bill D Date: Fri, 7 May 2021 10:54:09 +0300 Subject: [PATCH] more --- src/lib/diacritics.test.ts | 85 +++++++++++++++++++++-------- src/lib/diacritics.ts | 108 ++++++++++++++++++++++--------------- 2 files changed, 127 insertions(+), 66 deletions(-) diff --git a/src/lib/diacritics.test.ts b/src/lib/diacritics.test.ts index de5ea1b..620e37f 100644 --- a/src/lib/diacritics.test.ts +++ b/src/lib/diacritics.test.ts @@ -69,20 +69,6 @@ const phonemeSplits: Array<{ }, ]; -const badPhonetics: Array<{ - in: string, - problem: string, -}> = [ - { - in: "acar", - problem: "c", - }, - { - in: "a7am", - problem: "7", - }, -]; - const diacriticsTest: Array<{ in: T.PsString, out: string, @@ -101,6 +87,13 @@ const diacriticsTest: Array<{ }, out: "کُور", }, + { + in: { + p: "کور کور", + f: "kor koor", + }, + out: "کور کُور", + }, { in: { p: "تب", @@ -242,6 +235,22 @@ const diacriticsTest: Array<{ }, out: "پېش", }, + { + in: { + p: "لیک", + f: "leek", + }, + out: "لِیک", + }, + // starting alefs + { + in: { + p: "اسلام", + f: "islaam", + }, + out: "اِسْلام", + }, + // double consonant { in: { p: "بتن", @@ -249,12 +258,13 @@ const diacriticsTest: Array<{ }, out: "ب" + zwar + "ت" + tashdeed + zwar + "ن", }, -]; - -const brokenDiacritics = [ + // avoid false double consonant { - p: "تشناب", - f: "peshnaab", + in: { + p: "ازل لیک", + f: "azalléek", + }, + out: "اَزَل لِیک", }, ]; @@ -265,6 +275,35 @@ phonemeSplits.forEach((s) => { }); }); +test("adding diacritics should work", () => { + diacriticsTest.forEach((t) => { + expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f }); + }); +}); + +// ERRORS + +const brokenDiacritics = [ + { + p: "تشناب", + f: "peshnaab", + }, +]; + +const badPhonetics: Array<{ + in: string, + problem: string, +}> = [ + { + in: "acar", + problem: "c", + }, + { + in: "a7am", + problem: "7", + }, +]; + test("bad phonetic characters should throw an error", () => { badPhonetics.forEach((s) => { expect(() => { @@ -279,10 +318,10 @@ test("ending with left over Pashto script will throw an error", () => { }).toThrow(`phonetics error - phonetics shorter than pashto script`); }); -test("adding diacritics should work", () => { - diacriticsTest.forEach((t) => { - expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f }); - }); +test("ending with left over phonetics will throw an error", () => { + expect(() => { + addDiacritics({ p: "کار", f: "kaar kawul" }); + }).toThrow(); }); test("adding diacritics errors when phonetecs and pashto do not line up", () => { diff --git a/src/lib/diacritics.ts b/src/lib/diacritics.ts index ba814cc..26e38ec 100644 --- a/src/lib/diacritics.ts +++ b/src/lib/diacritics.ts @@ -28,6 +28,8 @@ type LongVowel = "aa" | "ee" | "e" | "oo" | "o" | "ey" | "uy" | "eyy"; type ShortVowel = "a" | "i" | "u" | "U"; type Phoneme = Consonant | Ain | LongVowel | ShortVowel | JoiningVowel; +type DiacriticsAccumulator = { pIn: string, pOut: string }; + type PhonemeInfo = { matches?: string[], beginningMatches?: string[], @@ -225,6 +227,7 @@ const phonemeTable: Record = { "a": { diacritic: zwar, endingMatches: ["ه"], + beginningMatches: ["ا"], // canComeAfterHeyEnding: true, // canBeFirstPartOfFathahanEnding: true, }, @@ -304,64 +307,76 @@ export function splitFIntoPhonemes(fIn: string): Phoneme[] { } /** - * Adds phonetis to a given PsString. + * Adds diacritics to a given PsString. * Errors if the phonetics and script don't line up. * * @param ps a PsSTring without phonetics */ export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsString { - // TODO: const phonemes: Phoneme[] = splitFIntoPhonemes(!ignoreCommas ? firstPhonetics(f) : f); - - const { pIn, pOut } = phonemes.reduce((acc, phoneme, i) => { - const prevPLetter = last(acc.pOut); - const isBeginningOfWord = acc.pOut === "" || prevPLetter === " "; - const phonemeInfo = phonemeTable[phoneme]; - const previousPhoneme = i > 0 && phonemes[i-1]; - const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]]; - const currentPLetter = acc.pIn[0]; - const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant); - const needsTashdeed = doubleConsonant && (previousPhoneme === phoneme); - const needsSukun = doubleConsonant && (previousPhoneme !== phoneme); - - if (needsTashdeed) { - return { - pOut: acc.pOut + tashdeed, - pIn: acc.pIn, - }; - } - - if (phonemeInfo.matches?.includes(currentPLetter)) { - return { - pOut: acc.pOut - + (needsSukun ? sukun : phonemeInfo.diacritic ? phonemeInfo.diacritic : "") - + currentPLetter, - pIn: acc.pIn.slice(1), - }; - } - - if (phonemeInfo.diacritic) { - return { - pOut: acc.pOut + phonemeInfo.diacritic, - pIn: acc.pIn, - } - } - - // TODO: CHECK IF PASHTO IS SHORTER THAN PHONETICS - - throw new Error("phonetics error"); - }, { pOut: "", pIn: p }); - + const { pIn, pOut } = phonemes.reduce(processPhoneme, { pOut: "", pIn: p }); if (pIn !== "") { throw new Error("phonetics error - phonetics shorter than pashto script"); } - return { p: pOut, f, }; } +function processPhoneme( + acc: DiacriticsAccumulator, + phoneme: Phoneme, + i: number, + phonemes: Phoneme[], +) { + // Prep state + const state = acc.pIn[0] === " " ? advanceP(acc) : acc; + // WARNING: Do not use acc after this point! + + const prevPLetter = last(state.pOut); + const currentPLetter = state.pIn[0]; + // const nextPLetter = state.pIn[1]; + const isBeginningOfWord = state.pOut === "" || prevPLetter === " "; + const phonemeInfo = phonemeTable[phoneme]; + const previousPhoneme = i > 0 && phonemes[i-1]; + const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]]; + const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant); + const needsTashdeed = doubleConsonant && (previousPhoneme === phoneme); + const needsSukun = doubleConsonant && (previousPhoneme !== phoneme); + + if (needsTashdeed) { + return { + pOut: state.pOut + tashdeed, + pIn: state.pIn, + }; + } + + // TODO: Beginning of word with long vowels and alef etc. + if (isBeginningOfWord && (phonemeInfo.beginningMatches?.includes(currentPLetter) || phonemeInfo.matches?.includes(currentPLetter))) { + const ns = advanceP(state); + return { + ...ns, + pOut: ns.pOut + (needsSukun ? sukun : phonemeInfo.diacritic ? phonemeInfo.diacritic : ""), + }; + } else if (phonemeInfo.matches?.includes(currentPLetter)) { + return advanceP({ + ...state, + pOut: state.pOut + + (needsSukun ? sukun : phonemeInfo.diacritic ? phonemeInfo.diacritic : ""), + }); + } + + if (phonemeInfo.diacritic) { + return { + ...state, + pOut: state.pOut + phonemeInfo.diacritic, + }; + } + + throw new Error("phonetics error"); +} + /** * returns the last character of a string * @@ -369,4 +384,11 @@ export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsSt */ function last(s: string) { return s[s.length - 1]; +} + +function advanceP(state: DiacriticsAccumulator, n: number = 1): DiacriticsAccumulator { + return { + pOut: state.pOut + state.pIn.slice(0, n), + pIn: state.pIn.slice(n), + } } \ No newline at end of file