From 2dea82c32b5be63678c54e23538b38d50752c5f3 Mon Sep 17 00:00:00 2001 From: Bill D Date: Sat, 8 May 2021 21:31:59 +0300 Subject: [PATCH] more --- src/lib/diacritics.test.ts | 260 ++++++++++++++++++++++++++++++++++--- src/lib/diacritics.ts | 113 ++++++++++++---- 2 files changed, 330 insertions(+), 43 deletions(-) diff --git a/src/lib/diacritics.test.ts b/src/lib/diacritics.test.ts index 512d5cb..155b978 100644 --- a/src/lib/diacritics.test.ts +++ b/src/lib/diacritics.test.ts @@ -242,6 +242,196 @@ const diacriticsTest: Array<{ }, out: "لِیک", }, + { + in: { + p: "رغېدل", + f: "raghedul", + }, + out: "رَغېد" + zwarakey + "ل", + }, + { + in: { + p: "کارول", + f: "kaarawul", + }, + out: "کارَو" + zwarakey + "ل", + }, + { + in: { + p: "پېښېدل", + f: "pexedul", + }, + out: "پېښېد" + zwarakey + "ل", + }, + { + in: { + p: "مین", + f: "mayín", + }, + out: "مَیِن", + }, + { + in: { + p: "سړی", + f: "saRey", + }, + out: "سَړی", + }, + { + in: { + p: "سړي", + f: "saRee", + }, + out: "سَړي", + }, + { + in: { + p: "زه", + f: "zu", + }, + out: "زهٔ", + }, + { + in: { + p: "زه", + f: "za", + }, + out: "زَه", + }, + { + in: { + p: "پېشنهاد", + f: "peshniháad", + }, + out: "پېشْنِهاد", + }, + { + in: { + p: "ایستل", + f: "eestul", + }, + out: "اِیسْت" + zwarakey + "ل", + }, + { + in: { + p: "ایستل", + f: "eystul", + }, + out: "ایسْت" + zwarakey + "ل", + }, + { + in: { + p: "اېسېدل", + f: "esedul", + }, + out: "اېسېد" + zwarakey + "ل", + }, + { + in: { + p: "اوسېدل", + f: "osedul", + }, + out: "اوسېد" + zwarakey + "ل", + }, + { + in: { + p: "اواز", + f: "awaaz", + }, + out: "اَواز", + }, + { + in: { + p: "اسلام", + f: "islaam", + }, + out: "اِسْلام", + }, + { + in: { + p: "واردول", + f: "waaridawul", + }, + out: "وارِدَو" + zwarakey + "ل", + }, + { + in: { + p: "غاړه", + f: "ghaaRa", + }, + out: "غاړَه", + }, + { + in: { + p: "اوتر", + f: "awtár", + }, + out: "اَوْتَر", + }, + { + in: { + p: "اختیار", + f: "ikhtiyáar", + }, + out: "اِخْتِیار", + }, + { + in: { + p: "فریاد", + f: "faryáad", + }, + out: "فَرْیاد", + }, + { + in: { + p: "کارغه", + f: "kaarghu", + }, + out: "کارْغهٔ", + }, + { + in: { + p: "بې کار", + f: "be kaar", + }, + out: "بې کار", + }, + { + in: { + p: "بې کار", + f: "bekaar", + }, + out: "بې کار", + }, + // TODO: nb mb thing + { + in: { + p: "انبار", + f: "ambáar", + }, + out: "اَنْبار", + }, + { + in: { + p: "ارغون", + f: "arghóon", + }, + out: "اَرْغُون", + }, + { + in: { + p: "ارمټه", + f: "armaTa", + }, + out: "اَرْمَټَه", + }, + { + in: { + p: "اروا پوه", + f: "arwaa poh", + }, + out: "اَرْوا پوهْ", + }, // starting alefs { in: { @@ -295,21 +485,28 @@ const diacriticsTest: Array<{ }, out: "پَرْمَخْتْیا", }, - // { - // in: { - // p: "پته", - // f: "patta", - // }, - // out: "پَتّه", - // }, + { + in: { + p: "پته", + f: "patta", + }, + out: "پَتَّه", + }, + { + in: { + p: "پته تور", + f: "patta toor", + }, + out: "پَتَّه تُور", + }, // get ayn stuff working - // { - // in: { - // p: "اعتصاب شکن", - // f: "itisaabshikan", - // }, - // out: "اِعتِصاب شِکَن", - // }, + { + in: { + p: "اعتصاب شکن", + f: "itisaab shakan", + }, + out: "اِعتِصاب شَکَن", + }, // avoid false double consonant { in: { @@ -318,6 +515,36 @@ const diacriticsTest: Array<{ }, out: "اَزَل لِیک", }, + // starting with ع + { + in: { + p: "عزت", + f: "izzat", + }, + out: "عِزَّت", + }, + { + in: { + p: "عزت", + f: "i'zzat", + }, + out: "عِزَّت", + }, + // ئ in the middle + { + in: { + p: "برائت", + f: "baraa'at", + }, + out: "بَرائَت", + }, + { + in: { + p: "فائده", + f: "faaida", + }, + out: "فائِدَه", + }, ]; phonemeSplits.forEach((s) => { @@ -327,8 +554,9 @@ phonemeSplits.forEach((s) => { }); }); -test("adding diacritics should work", () => { - diacriticsTest.forEach((t) => { + +diacriticsTest.forEach((t) => { + test(`diacritics should work for ${t.in.p} - ${t.in.f}`, () => { expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f }); }); }); diff --git a/src/lib/diacritics.ts b/src/lib/diacritics.ts index b80b00e..92af362 100644 --- a/src/lib/diacritics.ts +++ b/src/lib/diacritics.ts @@ -41,6 +41,7 @@ type PhonemeInfo = { takesSukunOnEnding?: true, longVowel?: true, canStartWithAynBefore?: true, + useEndingDiacritic?: true, } const phonemeTable: Record = { @@ -211,6 +212,7 @@ const phonemeTable: Record = { longVowel: true, // alsoCanBePrefix: true, diacritic: pesh, + useEndingDiacritic: true, }, "ey": { matches: ["ی"], @@ -231,14 +233,13 @@ const phonemeTable: Record = { "a": { diacritic: zwar, endingMatches: ["ه"], - beginningMatches: ["ا"], + beginningMatches: ["ا", "ع"], // canComeAfterHeyEnding: true, // canBeFirstPartOfFathahanEnding: true, }, "u": { diacritic: zwarakey, endingMatches: ["ه"], - // hamzaOnEnd: true, }, "i": { diacritic: zer, @@ -270,7 +271,7 @@ export function splitFIntoPhonemes(fIn: string): Phoneme[] { const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"]; const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"]; const endingDigraphs: Phoneme[] = ["uy"]; - const willIgnore = ["?", " ", "`", ".", "…", ","]; + const willIgnore = ["?", " ", "`", ".", "…", ",", "'"]; const result: Phoneme[] = []; const f = removeAccents(fIn); @@ -334,28 +335,27 @@ function processPhoneme( i: number, phonemes: Phoneme[], ) { + // console.log("PHONEME", phoneme); + // console.log("space coming up", acc.pIn[0] === " "); + // console.log("state", acc); // Prep state const state = acc.pIn[0] === " " ? advanceP(acc) : acc; + // console.log("AFTER SPACE PREP", phoneme); + // console.log("state", state); // WARNING: Do not use acc after this point! - const prevPLetter = last(state.pOut); - const currentPLetter = state.pIn[0]; - const nextPLetter = state.pIn[1]; - const isBeginningOfWord = state.pOut === "" || prevPLetter === " "; - // const isEndOfWord = !nextPLetter || nextPLetter === " "; - const phonemeInfo = phonemeTable[phoneme]; - const previousPhoneme = i > 0 && phonemes[i-1]; - const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]]; - // const nextPhoneme = (phonemes.length > (i + 1)) && phonemes[i+1]; - // const nextPhonemeInfo = nextPhoneme ? phonemeTable[nextPhoneme] : undefined; - const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant); - const needsTashdeed = !isBeginningOfWord && doubleConsonant && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter); - const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter)); - const sukunOrDiacritic = (needsSukun ? sukun : phonemeInfo.diacritic ? phonemeInfo.diacritic : ""); + const { + phonemeInfo, + isBeginningOfWord, + currentPLetter, + needsTashdeed, + sukunOrDiacritic, + nextPLetter, + isEndOfWord, + } = stateInfo({ state, i, phoneme, phonemes }); // if it's not an exception (TODO) // it must be one of the following 5 possibilities - // 1. beginning a word with a long vowel if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) { if (phoneme !== "aa" && currentPLetter !== "ا" && !phonemeInfo.matches?.includes(nextPLetter)) { @@ -371,29 +371,65 @@ function processPhoneme( return pipe( advanceP, addP(sukunOrDiacritic), + advanceForAin, )(state); // 3. double consonant to be marked with tashdeed } else if (needsTashdeed) { - return addP(tashdeed)(state); - // 4. direct match of phoneme / P letter - } else if (phonemeInfo.matches?.includes(currentPLetter)) { + return pipe( + addP(tashdeed) + )(state); + // 4. special ه ending + } else if (isEndOfWord && ((phoneme === "u" && currentPLetter === "ه") || (phoneme === "h" && ["ه", "ح"].includes(currentPLetter)))) { + return pipe( + advanceP, + addP(phoneme === "u" ? hamzaAbove : sukun), + )(state); + // 5. direct match of phoneme / P letter + } else if (phonemeInfo.matches?.includes(currentPLetter) || (isEndOfWord && phonemeInfo.endingMatches?.includes(currentPLetter)) || (phoneme === "m" && currentPLetter === "ن" && nextPLetter === "ب")) { return pipe( addP(sukunOrDiacritic), advanceP, )(state); - // 5. just a diacritic for short vowel + // 6. just a diacritic for short vowel } else if (phonemeInfo.diacritic && !phonemeInfo.longVowel) { return pipe( + advanceForHamzaMid, addP(phonemeInfo.diacritic), - advanceIfReachedEndingHamza, + advanceForAinOrHamza, )(state); } - // anything that gets to this point is a failure/error - // console.log(state); throw new Error("phonetics error"); } + + +function stateInfo({ state, i, phonemes, phoneme }: { + state: DiacriticsAccumulator, + i: number, + phonemes: Phoneme[], + phoneme: Phoneme, +}) { + const prevPLetter = last(state.pOut); + const currentPLetter = state.pIn[0]; + const nextPLetter = state.pIn[1]; + const isBeginningOfWord = state.pOut === "" || prevPLetter === " "; + const isEndOfWord = !nextPLetter || nextPLetter === " "; + const phonemeInfo = phonemeTable[phoneme]; + const previousPhoneme = i > 0 && phonemes[i-1]; + const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]]; + // const nextPhoneme = (phonemes.length > (i + 1)) && phonemes[i+1]; + // const nextPhonemeInfo = nextPhoneme ? phonemeTable[nextPhoneme] : undefined; + const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant); + const needsTashdeed = !isBeginningOfWord && doubleConsonant && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter); + const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter)); + const diacritic = isEndOfWord ? ((!phonemeInfo.longVowel || phonemeInfo.useEndingDiacritic) ? phonemeInfo.diacritic : undefined) : phonemeInfo.diacritic; + const sukunOrDiacritic = (needsSukun ? sukun : diacritic); + return { + phonemeInfo, isBeginningOfWord, currentPLetter, needsTashdeed, sukunOrDiacritic, nextPLetter, isEndOfWord, + }; +}; + /** * returns the last character of a string * @@ -417,8 +453,31 @@ const addP = (toAdd: string | undefined) => (state: DiacriticsAccumulator): Diac }; } -function advanceIfReachedEndingHamza(state: DiacriticsAccumulator): DiacriticsAccumulator { - if (state.pIn[0] === "ه" && (!state.pIn[1] || state.pIn[1] === " ")) { +function getCurrentNext(state: DiacriticsAccumulator): { current: string, next: string} { + return { + current: state.pIn[0], + next: state.pIn[1], + }; +} + +function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator { + const { current } = getCurrentNext(state); + return (current === "ع") ? advanceP(state) : state; +} + +function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccumulator { + const { current, next } = getCurrentNext(state); + if (current === "ئ" && next && next !== "ئ") { + return advanceP(state); + } + return state; +} +function advanceForAinOrHamza(state: DiacriticsAccumulator): DiacriticsAccumulator { + const { current, next } = getCurrentNext(state); + if (current === "ه" && (!next || next === " ")) { + return advanceP(state); + } + if (current === "ع") { return advanceP(state); } return state;