From aad1b34e1745dabb2a56f1143713f9464fa46cf8 Mon Sep 17 00:00:00 2001 From: Bill D Date: Thu, 27 May 2021 11:06:30 +0430 Subject: [PATCH] more work on phonetics --- src/lib/diacritics-helpers.test.ts | 7 +- src/lib/diacritics-helpers.ts | 26 +++-- src/lib/diacritics.test.ts | 167 +++++++++++++++++++++-------- src/lib/diacritics.ts | 41 +++---- 4 files changed, 160 insertions(+), 81 deletions(-) diff --git a/src/lib/diacritics-helpers.test.ts b/src/lib/diacritics-helpers.test.ts index 65c9052..a827ee0 100644 --- a/src/lib/diacritics-helpers.test.ts +++ b/src/lib/diacritics-helpers.test.ts @@ -6,8 +6,7 @@ import { advanceP, reverseP, overwriteP, - advanceForAin, - advanceForAinOrHamza, + advanceForHamza, advanceForHamzaMid, } from "./diacritics-helpers"; @@ -23,6 +22,10 @@ const phonemeSplits: Array<{ in: "raaghey", out: ["r", "aa", "gh", "ey"], }, + { + in: "ist'imaal", + out: ["i", "s", "t", "'", "i", "m", "aa", "l"], + }, { in: "hatsa", out: ["h", "a", "ts", "a"], diff --git a/src/lib/diacritics-helpers.ts b/src/lib/diacritics-helpers.ts index 3824f4d..debfbec 100644 --- a/src/lib/diacritics-helpers.ts +++ b/src/lib/diacritics-helpers.ts @@ -28,6 +28,7 @@ type PhonemeInfo = { longVowel?: true, canStartWithAynBefore?: true, useEndingDiacritic?: true, + ainBlendDiacritic?: string, } export const zwar = "َ"; @@ -188,13 +189,15 @@ export const phonemeTable: Record = { beginningMatches: ["آ", "ا"], endingMatches: ["ا", "یٰ"], longVowel: true, + ainBlendDiacritic: zwar, }, "ee": { matches: ["ی"], longVowel: true, endingMatches: ["ي"], diacritic: zer, - canStartWithAynBefore: true + canStartWithAynBefore: true, + ainBlendDiacritic: zer, }, "e": { matches: ["ې"], @@ -210,6 +213,7 @@ export const phonemeTable: Record = { // alsoCanBePrefix: true, diacritic: pesh, useEndingDiacritic: true, + ainBlendDiacritic: pesh, }, "ey": { matches: ["ی"], @@ -262,13 +266,13 @@ export const phonemeTable: Record = { * @returns an array of phonemes */ export function splitFIntoPhonemes(fIn: string): Phoneme[] { - const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y"]; + const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y", "'"]; const quadrigraphs: Phoneme[] = ["-Ul-"]; const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"]; const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"]; const endingDigraphs: Phoneme[] = ["uy"]; - const willIgnore = ["?", " ", "`", ".", "…", ",", "'"]; + const willIgnore = ["?", " ", "`", ".", "…", ","]; const result: Phoneme[] = []; const f = removeAccents(fIn); @@ -372,10 +376,10 @@ export function getCurrentNext(state: DiacriticsAccumulator): { current: string, }; } -export function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator { - const { current } = getCurrentNext(state); - return (current === "ع") ? advanceP(state) : state; -} +// export function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator { +// const { current } = getCurrentNext(state); +// return (current === "ع") ? advanceP(state) : state; +// } export function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccumulator { const { current, next } = getCurrentNext(state); @@ -385,14 +389,14 @@ export function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccu return state; } -export function advanceForAinOrHamza(state: DiacriticsAccumulator): DiacriticsAccumulator { +export function advanceForHamza(state: DiacriticsAccumulator): DiacriticsAccumulator { const { current, next } = getCurrentNext(state); if (current === "ه" && (!next || next === " ")) { return advanceP(state); } - if (current === "ع") { - return advanceP(state); - } + // if (current === "ع") { + // return advanceP(state); + // } return state; } diff --git a/src/lib/diacritics.test.ts b/src/lib/diacritics.test.ts index b9de3ff..9117fdd 100644 --- a/src/lib/diacritics.test.ts +++ b/src/lib/diacritics.test.ts @@ -485,34 +485,115 @@ const diacriticsSections: { tests: [ { in: { - p: "اعتصاب شکن", - f: "itisaab shakan", + p: "بعد", + f: "ba'd", }, - out: "اِعتِصاب شَکَن", - }, - // starting with ع - { - in: { - p: "عزت", - f: "izzat", - }, - out: "عِزَّت", + out: "بَعْد", }, { in: { - p: "عزت", - f: "i'zzat", + p: "بعد", + f: "b'ad", }, - out: "عِزَّت", + out: "بْعَد", + }, + { + in: { + p: "بعد", + f: "ba'ad", + }, + out: "بَعَد", + }, + { + in: { + p: "بعد", + f: "baad", + }, + out: "بَعَد", + }, + { + in: { + p: "بعد", + f: "bad", + }, + // TODO: Should this really be an error? + out: null, + }, + { + in: { + p: "معلوم", + f: "maaloom", + }, + out: "مَعَلُوم", + }, + { + in: { + p: "منبع", + f: "manbi'", + }, + out: "مَنْبِع", + }, + { + in: { + p: "منبع", + f: "manb'i", + }, + out: "مَنْبْعِ" + }, + { + in: { + p: "منبع", + f: "manbee", + }, + out: "مَنْبِعِ", }, - // middle ع { in: { p: "معنا", - f: "ma'anaa", + f: "ma'náa", + }, + out: "مَعْنا", + }, + { + in: { + p: "معنا", + f: "maanáa", }, out: "مَعَنا", }, + // TODO: Should be allowed to use a short vowel as well + // طمع - tama // استعمال - istimaal + // TODO: Starting like عام اعتصاب etc. + // { + // in: { + // p: "اعتصاب شکن", + // f: "itisaab shakan", + // }, + // out: "اِعتِصاب شَکَن", + // }, + // // starting with ع + // { + // in: { + // p: "عزت", + // f: "izzat", + // }, + // out: "عِزَّت", + // }, + // { + // in: { + // p: "عزت", + // f: "i'zzat", + // }, + // out: "عِزَّت", + // }, + // // middle ع + // { + // in: { + // p: "معنا", + // f: "ma'anaa", + // }, + // out: "مَعَنا", + // }, // ending with ayn // { // in: { @@ -683,34 +764,34 @@ diacriticsSections.forEach((section) => { // ERRORS -// const brokenDiacritics = [ -// { -// p: "تشناب", -// f: "peshnaab", -// }, -// { -// p: "وسېدل", -// f: "osedul", -// }, -// ]; +const brokenDiacritics = [ + { + p: "تشناب", + f: "peshnaab", + }, + { + p: "وسېدل", + f: "osedul", + }, +]; -// test("ending with left over Pashto script will throw an error", () => { -// expect(() => { -// addDiacritics({ p: "کور ته", f: "kor" }); -// }).toThrow(`phonetics error - phonetics shorter than pashto script`); -// }); +test("ending with left over Pashto script will throw an error", () => { + expect(() => { + addDiacritics({ p: "کور ته", f: "kor" }); + }).toThrow(`phonetics error - phonetics shorter than pashto script`); +}); -// test("ending with left over phonetics will throw an error", () => { -// expect(() => { -// addDiacritics({ p: "کار", f: "kaar kawul" }); -// }).toThrow(); -// }); +test("ending with left over phonetics will throw an error", () => { + expect(() => { + addDiacritics({ p: "کار", f: "kaar kawul" }); + }).toThrow(); +}); -// test("adding diacritics errors when phonetecs and pashto do not line up", () => { -// brokenDiacritics.forEach((t) => { -// expect(() => { -// addDiacritics(t); -// }).toThrow(); -// }); -// }); +test("adding diacritics errors when phonetecs and pashto do not line up", () => { + brokenDiacritics.forEach((t) => { + expect(() => { + addDiacritics(t); + }).toThrow(); + }); +}); diff --git a/src/lib/diacritics.ts b/src/lib/diacritics.ts index 25220f3..95148c9 100644 --- a/src/lib/diacritics.ts +++ b/src/lib/diacritics.ts @@ -27,8 +27,7 @@ import { advanceP, reverseP, overwriteP, - advanceForAin, - advanceForAinOrHamza, + advanceForHamza, advanceForHamzaMid, DiacriticsAccumulator, } from "./diacritics-helpers"; @@ -61,14 +60,13 @@ enum PhonemeStatus { DirectMatchAfterSukun, EndingWithHeyHimFromSukun, ShortVowel, - ShortVowelBeforeAin, - ShortVowelAfterAin, PersianSilentWWithAa, ArabicWasla, Izafe, EndOfDuParticle, HaEndingWithHeem, AlefDaggarEnding, + LongAinVowelMissingComma, } function processPhoneme( @@ -112,7 +110,6 @@ function processPhoneme( pipe( advanceP, addP(diacritic), - advanceForAin, )(state) : (phs === PhonemeStatus.DoubleConsonantTashdeed) ? pipe( @@ -171,25 +168,19 @@ function processPhoneme( advanceP, advanceP, )(state) - : (phs === PhonemeStatus.ShortVowelBeforeAin) ? + : (phs === PhonemeStatus.LongAinVowelMissingComma) ? pipe( - // this is pretty messed up because for some reason the reverseP goes back one more step when it's an ain before it - reverseP, - advanceP, addP(diacritic), - // overwriteP(diacritic || ""), - )(state) - : (phs === PhonemeStatus.ShortVowelAfterAin) ? - pipe( advanceP, - addP(diacritic), + addP(diacritic) )(state) : // phs === PhonemeState.ShortVowel pipe( advanceForHamzaMid, addP(phonemeInfo.diacritic), - advanceForAinOrHamza, + // TODO THIS? + advanceForHamza, )(state); } @@ -214,7 +205,11 @@ function stateInfo({ state, i, phonemes, phoneme }: { const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant); const needsTashdeed = !isBeginningOfWord && doubleConsonant && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter); const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter)); - const diacritic = isEndOfWord ? ((!phonemeInfo.longVowel || phonemeInfo.useEndingDiacritic) ? phonemeInfo.diacritic : undefined) : phonemeInfo.diacritic; + const useAinBlendDiacritics = (!isBeginningOfWord && (phonemeInfo.ainBlendDiacritic && currentPLetter === "ع")); + const diacritic = useAinBlendDiacritics + ? phonemeInfo.ainBlendDiacritic + : isEndOfWord + ? ((!phonemeInfo.longVowel || phonemeInfo.useEndingDiacritic) ? phonemeInfo.diacritic : undefined) : phonemeInfo.diacritic; function getPhonemeState(): PhonemeStatus { if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) { @@ -243,6 +238,9 @@ function stateInfo({ state, i, phonemes, phoneme }: { if (phoneme === "-i-" && isBeginningOfWord) { return PhonemeStatus.Izafe; } + if (useAinBlendDiacritics) { + return PhonemeStatus.LongAinVowelMissingComma; + } if (needsTashdeed) { return PhonemeStatus.DoubleConsonantTashdeed; } @@ -258,15 +256,8 @@ function stateInfo({ state, i, phonemes, phoneme }: { if ((phonemeInfo.matches?.includes(currentPLetter) || (isEndOfWord && phonemeInfo.endingMatches?.includes(currentPLetter)) || (phoneme === "m" && currentPLetter === "ن" && nextPLetter === "ب"))) { return needsSukun ? PhonemeStatus.DirectMatchAfterSukun : PhonemeStatus.DirectMatch; } - if (phonemeInfo.diacritic && !phonemeInfo.longVowel) { - // weird ayn behaviour because it automatically advances and ignores it at the beginning of the process - // console.log("looking prev", prevPLetter); - // console.log("looking next", currentPLetter); - return prevPLetter === "ع" - ? PhonemeStatus.ShortVowelBeforeAin - : currentPLetter === "ع" - ? PhonemeStatus.ShortVowelAfterAin - : PhonemeStatus.ShortVowel; + if (phonemeInfo.diacritic && !phonemeInfo.longVowel) { + return PhonemeStatus.ShortVowel; } // console.log("bad phoneme is ", phoneme); throw new Error("phonetics error - no status found for phoneme: " + phoneme);