From a62ab986baceaea04392a65cbd8c460beb509482 Mon Sep 17 00:00:00 2001 From: Bill D Date: Thu, 3 Jun 2021 20:42:07 +0430 Subject: [PATCH] more --- src/lib/diacritics-helpers.ts | 12 +++-- src/lib/diacritics.test.ts | 59 +++++++++++++++++++++++++ src/lib/diacritics.ts | 6 +++ src/lib/phonetics-to-diacritics.test.ts | 1 - 4 files changed, 73 insertions(+), 5 deletions(-) diff --git a/src/lib/diacritics-helpers.ts b/src/lib/diacritics-helpers.ts index 211037c..0149ab2 100644 --- a/src/lib/diacritics-helpers.ts +++ b/src/lib/diacritics-helpers.ts @@ -274,7 +274,7 @@ export const phonemeTable: Record = { const willIgnore = ["?", " ", "`", ".", "…", ","]; const result: Phoneme[] = []; - const f = removeAccents(fIn); + const f = removeAccents(fIn).replace(/ă/g, "a"); let index = 0; while (index < f.length) { const isLastTwoLetters = (index === f.length - 2 || f[index + 2] === " "); @@ -336,6 +336,7 @@ export enum PhonemeStatus { ShortAForAlefBeforeFathatan, NOnFathatan, HamzaOnWow, + ArabicDefiniteArticleUl, } export function stateInfo({ state, i, phonemes, phoneme }: { @@ -348,11 +349,11 @@ export function stateInfo({ state, i, phonemes, phoneme }: { const prevPLetter = last(state.pOut); const currentPLetter = state.pIn[0]; const nextPLetter = state.pIn[1]; - const isBeginningOfWord = state.pOut === "" || prevPLetter === " "; - const isEndOfWord = isOutOfWord(nextPLetter); - const phonemeInfo = phonemeTable[phoneme]; const nextPhoneme = phonemes[i+1]; const previousPhoneme = i > 0 && phonemes[i-1]; + const isBeginningOfWord = (state.pOut === "" || prevPLetter === " ") || (previousPhoneme === "-Ul-" && prevPLetter === "ل"); + const isEndOfWord = isOutOfWord(nextPLetter); + const phonemeInfo = phonemeTable[phoneme]; const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]]; // const nextPhoneme = (phonemes.length > (i + 1)) && phonemes[i+1]; // const nextPhonemeInfo = nextPhoneme ? phonemeTable[nextPhoneme] : undefined; @@ -391,6 +392,9 @@ export function stateInfo({ state, i, phonemes, phoneme }: { if (isBeginningOfWord && phoneme === "u" && prevPLetter === " " && lastNonWhitespace(state.pOut) === "د") { return PhonemeStatus.EndOfDuParticle } + if (isBeginningOfWord && phoneme === "-Ul-" && currentPLetter === "ا" && nextPLetter === "ل") { + return PhonemeStatus.ArabicDefiniteArticleUl; + } if (phoneme === "a" && previousPhoneme === "U" && currentPLetter === "و") { return PhonemeStatus.HamzaOnWow; } diff --git a/src/lib/diacritics.test.ts b/src/lib/diacritics.test.ts index 13c7e46..653f03c 100644 --- a/src/lib/diacritics.test.ts +++ b/src/lib/diacritics.test.ts @@ -586,6 +586,13 @@ const diacriticsSections: { }, out: null, }, + { + in: { + p: "سختسری", + f: "sakht sărey", + }, + out: "سَخْتْسَری", + }, ], }, { @@ -805,6 +812,20 @@ const diacriticsSections: { }, out: "تَوَقُّع", }, + { + in: { + p: "راجع کېدل", + f: "raaji kedul", + }, + out: "راجِع کېد" + zwarakey + "ل", + }, + { + in: { + p: "ربیع", + f: "rabee'", + }, + out: "رَبِیع", + }, ], }, { @@ -1054,6 +1075,44 @@ const diacriticsSections: { }, ], }, + { + describe: "With Arabic definate article -Ul- ال", + tests: [ + { + in: { + p: "حق الاجاره", + f: "haq-Ul-ijaara", + }, + out: "حَق اُلاِجارَه", + }, + { + in: { + p: "دار العلوم", + f: "daar-Ul-Ulóom", + }, + out: "دار اُلعُلُوم", + }, + ], + }, + // { + // describe: "double consonants on end of words", + // tests: [ + // { + // in: { + // p: "حق", + // f: "haqq", + // }, + // out: "حَقّ", + // }, + // { + // in: { + // p: "حق پر", + // f: "haqq par", + // }, + // out: "حَقّ پَر", + // }, + // ], + // }, ]; diacriticsSections.forEach((section) => { diff --git a/src/lib/diacritics.ts b/src/lib/diacritics.ts index 65a09cd..6ce4601 100644 --- a/src/lib/diacritics.ts +++ b/src/lib/diacritics.ts @@ -195,6 +195,12 @@ function processPhoneme( addP(hamzaAbove), addP(diacritic), )(state) + : (phs === PhonemeStatus.ArabicDefiniteArticleUl) ? + pipe( + advanceP, + addP(pesh), + advanceP, + )(state) : state; diff --git a/src/lib/phonetics-to-diacritics.test.ts b/src/lib/phonetics-to-diacritics.test.ts index b9d081f..0c07cea 100644 --- a/src/lib/phonetics-to-diacritics.test.ts +++ b/src/lib/phonetics-to-diacritics.test.ts @@ -970,7 +970,6 @@ const toTest: Array<{ }, out: "حَقّ پَر", }, - // TODO: Allow ' in there { in: { p: "راجع کېدل",