more coming
This commit is contained in:
parent
cf01df5c6d
commit
1a0480a9d3
|
@ -236,7 +236,6 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
|
||||||
endingMatches: ["ه"],
|
endingMatches: ["ه"],
|
||||||
beginningMatches: ["ا", "ع"],
|
beginningMatches: ["ا", "ع"],
|
||||||
// canComeAfterHeyEnding: true,
|
// canComeAfterHeyEnding: true,
|
||||||
// canBeFirstPartOfFathahanEnding: true,
|
|
||||||
},
|
},
|
||||||
"u": {
|
"u": {
|
||||||
diacritic: zwarakey,
|
diacritic: zwarakey,
|
||||||
|
@ -311,13 +310,162 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
/**
 * The situation a phoneme is found in while walking through the Pashto
 * script, as classified by `stateInfo`. Member order (and therefore the
 * numeric values) is kept exactly as declared.
 */
export enum PhonemeStatus {
    /** word-initial long vowel that is not flagged as ending-only */
    LeadingLongVowel,
    /** word-initial phoneme whose `beginningMatches` or `matches` contain the current letter */
    LeadingConsonantOrShortVowel,
    /** the same consonant phoneme twice in a row where the current letter does not match it */
    DoubleConsonantTashdeed,
    /** at the end of a word: "u" written on ه, or "h" written on ه / ح */
    EndingWithHeyHim,
    /** the current letter directly matches the phoneme */
    DirectMatch,
    /** a direct match for a consonant that comes right after another consonant */
    DirectMatchAfterSukun,
    /** like `EndingWithHeyHim`, but coming right after another consonant */
    EndingWithHeyHimFromSukun,
    /** a non-long vowel that has a diacritic and no matching letter */
    ShortVowel,
    /** non-initial "aa" written as و followed by ا */
    PersianSilentWWithAa,
    /** non-initial "i" on an ا that is followed by ل */
    ArabicWasla,
    /** the "-i-" phoneme found at the beginning of a word */
    Izafe,
    /** "u" found after a space when the last non-whitespace output letter is د */
    EndOfDuParticle,
    /** "a" when the word just written ended with ح */
    ShortAEndingAfterHeem,
    /** "aa" written as ی followed by a dagger alif */
    AlefDaggarEnding,
    /** word-initial "aa" written as ع followed by ا */
    AinWithLongAAtBeginning,
    /** non-initial phoneme with an `ainBlendDiacritic`, written on ع */
    LongAinVowelMissingComma,
    /** short vowel on ع where neither this phoneme nor the next one is "'" */
    ShortAinVowelMissingComma,
    /** on ع (no "'" here or next) when the second-last output character is a word-initial ا */
    ShortAinVowelMissingCommaAfterAlefStart,
    /** "'" when the second-last output character is a word-initial ع */
    AinBeginningAfterShortVowel,
    /** "a" written on أ */
    AlefWithHamza,
    /** "'" followed by "a", written on أ */
    AlefWithHamzaWithGlottalStop,
    /** "o" following "w" when the word just written ended with و */
    WoEndingO,
    /** "a" on an ا that is directly followed by the fathatan mark */
    ShortAForAlefBeforeFathatan,
    /** word-final "n" on the fathatan mark that follows an ا */
    NOnFathatan,
}
|
||||||
|
|
||||||
|
/**
 * Classifies the phoneme at position `i` against the current position in the
 * Pashto script and works out which diacritic (if any) goes with it.
 *
 * The first character of `state.pIn` is treated as the current letter, the
 * second as the next letter, and the last character of `state.pOut` as the
 * previous letter. A word boundary is a space or the absence of a character.
 *
 * @param state - accumulator holding the script still to process (`pIn`) and the script written so far (`pOut`)
 * @param i - index of `phoneme` within `phonemes`
 * @param phonemes - the full phoneme sequence being processed
 * @param phoneme - the phoneme to classify (looked up in `phonemeTable`)
 * @returns the detected status (`phs`), the table entry for the phoneme, the diacritic to use, and the previous letter
 * @throws Error "phonetics error - needs alef prefix" when a word-initial long vowel other than "aa" is neither written on ا nor followed by a matching letter
 * @throws Error "phonetics error - no status found for phoneme: ..." when no rule applies
 */
export function stateInfo({ state, i, phonemes, phoneme }: {
    state: DiacriticsAccumulator,
    i: number,
    phonemes: Phoneme[],
    phoneme: Phoneme,
}) {
    // a missing character (start/end of the text) counts as a word boundary too
    const isOutOfWord = (char: string) => !char || char === " ";
    const prevPLetter = last(state.pOut);
    const currentPLetter = state.pIn[0];
    const nextPLetter = state.pIn[1];
    const isBeginningOfWord = state.pOut === "" || prevPLetter === " ";
    const isEndOfWord = isOutOfWord(nextPLetter);
    const phonemeInfo = phonemeTable[phoneme];
    const nextPhoneme = phonemes[i+1];
    const previousPhoneme = i > 0 && phonemes[i-1];
    // the previous phoneme only counts if it belongs to the same word
    const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]];
    const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
    // same consonant twice with only one letter written for it
    const needsTashdeed = !isBeginningOfWord && doubleConsonant && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter);
    // two consonants in a row that each have their own letter
    const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter));
    const useAinBlendDiacritics = (!isBeginningOfWord && (phonemeInfo.ainBlendDiacritic && currentPLetter === "ع"));
    // at the end of a word a long vowel only keeps its diacritic when useEndingDiacritic is set
    const diacritic = useAinBlendDiacritics
        ? phonemeInfo.ainBlendDiacritic
        : isEndOfWord
        ? ((!phonemeInfo.longVowel || phonemeInfo.useEndingDiacritic) ? phonemeInfo.diacritic : undefined)
        : phonemeInfo.diacritic;

    // true when the word just written ended with `char`: either all of the script
    // has been consumed, or a space has already been written after that letter
    const lastWordEndedW = (char: string) => ((prevPLetter === char && !currentPLetter) || (prevPLetter === " " && last(state.pOut, 2) === char));

    // the checks below are order-dependent: the first one that applies wins
    function getPhonemeState(): PhonemeStatus {
        if (isBeginningOfWord && phoneme === "aa" && phonemeInfo.beginningMatches?.includes(currentPLetter)) {
            return PhonemeStatus.DirectMatch;
        }
        if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) {
            if (phoneme !== "aa" && currentPLetter !== "ا" && !phonemeInfo.matches?.includes(nextPLetter)) {
                throw new Error("phonetics error - needs alef prefix");
            }
            return PhonemeStatus.LeadingLongVowel;
        }
        if (isBeginningOfWord && (phonemeInfo.beginningMatches?.includes(currentPLetter) || phonemeInfo.matches?.includes(currentPLetter))) {
            return PhonemeStatus.LeadingConsonantOrShortVowel;
        }
        // NOTE(review): this comes after the LeadingLongVowel check above; if "aa" is
        // flagged as a longVowel (and not endingOnly) in phonemeTable this branch can
        // never be reached - confirm against the table
        if (isBeginningOfWord && phoneme === "aa" && currentPLetter === "ع" && nextPLetter === "ا") {
            return PhonemeStatus.AinWithLongAAtBeginning;
        }
        if (isBeginningOfWord && phoneme === "u" && prevPLetter === " " && lastNonWhitespace(state.pOut) === "د") {
            return PhonemeStatus.EndOfDuParticle;
        }
        if (phoneme === "a" && currentPLetter === "ا" && nextPLetter === fathahan) {
            return PhonemeStatus.ShortAForAlefBeforeFathatan;
        }
        if (phoneme === "'" && last(state.pOut, 2) === "ع" && isOutOfWord(last(state.pOut, 3))) {
            return PhonemeStatus.AinBeginningAfterShortVowel;
        }
        if (!isBeginningOfWord && phoneme === "aa" && currentPLetter === "و" && nextPLetter === "ا") {
            return PhonemeStatus.PersianSilentWWithAa;
        }
        if (!isBeginningOfWord && phoneme === "i" && currentPLetter === "ا" && nextPLetter === "ل") {
            return PhonemeStatus.ArabicWasla;
        }
        if (phoneme === "-i-" && isBeginningOfWord) {
            return PhonemeStatus.Izafe;
        }
        if (phoneme === "a" && currentPLetter === "أ") {
            return PhonemeStatus.AlefWithHamza;
        }
        if (phoneme === "'" && nextPhoneme === "a" && currentPLetter === "أ") {
            return PhonemeStatus.AlefWithHamzaWithGlottalStop;
        }
        // vowel carried by ع without an explicit "'" in the phonetics
        if (currentPLetter === "ع" && phoneme !== "'" && nextPhoneme !== "'") {
            if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {
                return PhonemeStatus.ShortAinVowelMissingComma;
            }
            if ((last(state.pOut, 2) === "ا") && isOutOfWord(last(state.pOut, 3))) {
                return PhonemeStatus.ShortAinVowelMissingCommaAfterAlefStart;
            }
        }
        if (useAinBlendDiacritics) {
            return PhonemeStatus.LongAinVowelMissingComma;
        }
        if (needsTashdeed) {
            return PhonemeStatus.DoubleConsonantTashdeed;
        }
        if (phoneme === "aa" && currentPLetter === "ی" && nextPLetter === daggerAlif) {
            return PhonemeStatus.AlefDaggarEnding;
        }
        if (phoneme === "a" && lastWordEndedW("ح")) {
            return PhonemeStatus.ShortAEndingAfterHeem;
        }
        if (isEndOfWord && ((phoneme === "u" && currentPLetter === "ه") || (phoneme === "h" && ["ه", "ح"].includes(currentPLetter)))) {
            return needsSukun ? PhonemeStatus.EndingWithHeyHimFromSukun : PhonemeStatus.EndingWithHeyHim;
        }
        // the last alternative accepts "m" written as ن when the next letter is ب
        if ((phonemeInfo.matches?.includes(currentPLetter) || (isEndOfWord && phonemeInfo.endingMatches?.includes(currentPLetter)) || (phoneme === "m" && currentPLetter === "ن" && nextPLetter === "ب"))) {
            return needsSukun ? PhonemeStatus.DirectMatchAfterSukun : PhonemeStatus.DirectMatch;
        }
        if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {
            return PhonemeStatus.ShortVowel;
        }
        if (phoneme === "o" && previousPhoneme === "w" && lastWordEndedW("و")) {
            return PhonemeStatus.WoEndingO;
        }
        if (isEndOfWord && phoneme === "n" && currentPLetter === fathahan && prevPLetter === "ا") {
            return PhonemeStatus.NOnFathatan;
        }
        // nothing applied: the phonetics and the script do not line up
        throw new Error("phonetics error - no status found for phoneme: " + phoneme);
    }

    const phs = getPhonemeState();

    return {
        phs, phonemeInfo, diacritic, prevPLetter,
    };
}
|
||||||
|
|
||||||
|
|
||||||
/**
 * returns the nth last character of a string (the very last one by default)
 *
 * @param s - the string to read from
 * @param n - position counted from the end, starting at 1; defaults to 1
 * @returns the character at that position; at runtime this is `undefined`
 * when the position falls outside the string (empty string, `n` larger than
 * the length, or `n` less than 1)
 */
export function last(s: string, n = 1) {
    return s[s.length - n];
}
|
||||||
|
|
||||||
export function advanceP(state: DiacriticsAccumulator, n: number = 1): DiacriticsAccumulator {
|
export function advanceP(state: DiacriticsAccumulator, n: number = 1): DiacriticsAccumulator {
|
||||||
|
|
|
@ -110,6 +110,21 @@ const diacriticsSections: {
|
||||||
},
|
},
|
||||||
out: "پَسْتَه",
|
out: "پَسْتَه",
|
||||||
},
|
},
|
||||||
|
// working with ئ as vowel at end
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "شئ",
|
||||||
|
f: "sheyy",
|
||||||
|
},
|
||||||
|
out: "شئ",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "کار کوئ چې لاړ شئ",
|
||||||
|
f: "kaar kawéyy che laaR sheyy",
|
||||||
|
},
|
||||||
|
out: "کار کَوئ چې لاړ شئ",
|
||||||
|
},
|
||||||
// working with وs
|
// working with وs
|
||||||
{
|
{
|
||||||
in: {
|
in: {
|
||||||
|
@ -209,6 +224,41 @@ const diacriticsSections: {
|
||||||
},
|
},
|
||||||
out: "لِیک",
|
out: "لِیک",
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "ماضی",
|
||||||
|
f: "maazee",
|
||||||
|
},
|
||||||
|
out: null,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "وسېدل",
|
||||||
|
f: "osedul",
|
||||||
|
},
|
||||||
|
out: null,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "يست",
|
||||||
|
f: "eest",
|
||||||
|
},
|
||||||
|
out: null,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "ست",
|
||||||
|
f: "ist",
|
||||||
|
},
|
||||||
|
out: null,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "haca",
|
||||||
|
f: "هځه",
|
||||||
|
},
|
||||||
|
out: null,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
in: {
|
in: {
|
||||||
p: "رغېدل",
|
p: "رغېدل",
|
||||||
|
@ -458,6 +508,13 @@ const diacriticsSections: {
|
||||||
},
|
},
|
||||||
out: "پَتَّه تُور",
|
out: "پَتَّه تُور",
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "لکۍ وال",
|
||||||
|
f: "lakuy waal",
|
||||||
|
},
|
||||||
|
out: "لَکۍ وال",
|
||||||
|
},
|
||||||
// avoid false double consonant
|
// avoid false double consonant
|
||||||
{
|
{
|
||||||
in: {
|
in: {
|
||||||
|
@ -466,6 +523,107 @@ const diacriticsSections: {
|
||||||
},
|
},
|
||||||
out: "اَزَل لِیک",
|
out: "اَزَل لِیک",
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "سه",
|
||||||
|
f: "si",
|
||||||
|
},
|
||||||
|
out: "سِه",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "سه شنبه",
|
||||||
|
f: "sishamba",
|
||||||
|
},
|
||||||
|
out: "سِه شَنْبَه",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "توجه",
|
||||||
|
f: "tawajÚ",
|
||||||
|
},
|
||||||
|
out: "تَوَجُه",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "توجه کول",
|
||||||
|
f: "tawajU kawul",
|
||||||
|
},
|
||||||
|
out: "تَوَجُه کَو" + zwarakey + "ل",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "با استعداد",
|
||||||
|
f: "baa isti'dáad",
|
||||||
|
},
|
||||||
|
out: "با اِسْتِعْداد",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "آدم",
|
||||||
|
f: "aadam",
|
||||||
|
},
|
||||||
|
out: "آدَم",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "آسان",
|
||||||
|
f: "aasáan",
|
||||||
|
},
|
||||||
|
out: "آسان",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "آسان",
|
||||||
|
f: "asáan",
|
||||||
|
},
|
||||||
|
out: null,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "یدام",
|
||||||
|
f: "aadam",
|
||||||
|
},
|
||||||
|
out: null,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
describe: "ې followed by ی - y needs to be written as e`y to be distinguished from ey - ی",
|
||||||
|
tests: [
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "پتېیل",
|
||||||
|
f: "pateyúl",
|
||||||
|
},
|
||||||
|
out: null,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "پتېیل",
|
||||||
|
f: "pate`yúl",
|
||||||
|
},
|
||||||
|
out: "پَتېی" + zwarakey + "ل",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "درېیم",
|
||||||
|
f: "dre`yum",
|
||||||
|
},
|
||||||
|
out: "دْرېی" + zwarakey + "م",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
describe: "handle circumpositions",
|
||||||
|
tests: [
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "تر ... پورې",
|
||||||
|
f: "tur ... pore",
|
||||||
|
},
|
||||||
|
out: "ت" + zwarakey + "ر ... پورې",
|
||||||
|
},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -480,6 +638,25 @@ const diacriticsSections: {
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
describe: "excetption for و - wo",
|
||||||
|
tests: [
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "و",
|
||||||
|
f: "wo",
|
||||||
|
},
|
||||||
|
out: "و",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "سړی و",
|
||||||
|
f: "saRey wo",
|
||||||
|
},
|
||||||
|
out: "سَړی و",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
{
|
{
|
||||||
describe: "alef with hamza above",
|
describe: "alef with hamza above",
|
||||||
tests: [
|
tests: [
|
||||||
|
@ -593,43 +770,105 @@ const diacriticsSections: {
|
||||||
},
|
},
|
||||||
out: "طَمَع اِسْتِعمال",
|
out: "طَمَع اِسْتِعمال",
|
||||||
},
|
},
|
||||||
// {
|
{
|
||||||
// in: {
|
in: {
|
||||||
// p: "اعتصاب شکن",
|
p: "مربع",
|
||||||
// f: "itisaab shakan",
|
f: "mUraba'",
|
||||||
// },
|
},
|
||||||
// out: "اِعتِصاب شَکَن",
|
out: "مُرَبَع",
|
||||||
// },
|
},
|
||||||
// {
|
{
|
||||||
// in: {
|
in: {
|
||||||
// p: "عادل",
|
p: "مربع جذر",
|
||||||
// f: "aadíl",
|
f: "mUraba' jazúr",
|
||||||
// },
|
},
|
||||||
// out: "عادل",
|
out: "مُرَبَع جَذ" + zwarakey + "ر",
|
||||||
// },
|
},
|
||||||
// // starting with ع
|
{
|
||||||
// {
|
in: {
|
||||||
// in: {
|
p: "عام",
|
||||||
// p: "عزت",
|
f: "'aam",
|
||||||
// f: "izzat",
|
},
|
||||||
// },
|
out: "عام",
|
||||||
// out: "عِزَّت",
|
},
|
||||||
// },
|
{
|
||||||
// {
|
in: {
|
||||||
// in: {
|
p: "قتل عام",
|
||||||
// p: "عزت",
|
f: "qatl-i-aam",
|
||||||
// f: "i'zzat",
|
},
|
||||||
// },
|
out: "قَتْلِ عام",
|
||||||
// out: "عِزَّت",
|
},
|
||||||
// },
|
{
|
||||||
// // middle ع
|
in: {
|
||||||
// {
|
p: "توقع",
|
||||||
// in: {
|
f: "tawaqqÚ",
|
||||||
// p: "معنا",
|
},
|
||||||
// f: "ma'anaa",
|
out: "تَوَقّعُ",
|
||||||
// },
|
},
|
||||||
// out: "مَعَنا",
|
],
|
||||||
// },
|
},
|
||||||
|
{
|
||||||
|
describe: "ayn at the beginning",
|
||||||
|
tests: [
|
||||||
|
// as a short vowel at the beginning
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "عزت",
|
||||||
|
f: "izzat",
|
||||||
|
},
|
||||||
|
out: "عِزَّت",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "عزت",
|
||||||
|
f: "i'zzat",
|
||||||
|
},
|
||||||
|
out: "عِْزَّت",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "عذر",
|
||||||
|
f: "Uzar",
|
||||||
|
},
|
||||||
|
out: "عُذَر",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "عذر",
|
||||||
|
f: "U'zar",
|
||||||
|
},
|
||||||
|
out: "عُْذَر",
|
||||||
|
},
|
||||||
|
// as a short i with an alef
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "اعتصاب شکن",
|
||||||
|
f: "itisaab shakan",
|
||||||
|
},
|
||||||
|
out: "اِعتِصاب شَکَن",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "اعتصاب شکن",
|
||||||
|
f: "i'tisaab shakan",
|
||||||
|
},
|
||||||
|
out: "اِعْتِصاب شَکَن",
|
||||||
|
},
|
||||||
|
// as a long aa at beginning
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "عادل",
|
||||||
|
f: "aadíl",
|
||||||
|
},
|
||||||
|
out: "عادِل",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "عید",
|
||||||
|
f: "eed",
|
||||||
|
},
|
||||||
|
out: "عِید",
|
||||||
|
},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -687,6 +926,25 @@ const diacriticsSections: {
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
describe: "joiner و",
|
||||||
|
tests: [
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "کار و بار",
|
||||||
|
f: "kaar-U-baar",
|
||||||
|
},
|
||||||
|
out: "کار و بار",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "کاروبار",
|
||||||
|
f: "kaar-U-baar",
|
||||||
|
},
|
||||||
|
out: "کاروبار",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
{
|
{
|
||||||
describe: "special behaviour with د",
|
describe: "special behaviour with د",
|
||||||
tests: [
|
tests: [
|
||||||
|
@ -716,13 +974,13 @@ const diacriticsSections: {
|
||||||
{
|
{
|
||||||
describe: "ha ending with ح",
|
describe: "ha ending with ح",
|
||||||
tests: [
|
tests: [
|
||||||
// {
|
{
|
||||||
// in: {
|
in: {
|
||||||
// p: "ذبح",
|
p: "ذبح",
|
||||||
// f: "zabha",
|
f: "zabha",
|
||||||
// },
|
},
|
||||||
// out: "ذَبْحَ",
|
out: "ذَبْحَ",
|
||||||
// },
|
},
|
||||||
{
|
{
|
||||||
in: {
|
in: {
|
||||||
p: "ذبح کول",
|
p: "ذبح کول",
|
||||||
|
@ -764,10 +1022,42 @@ const diacriticsSections: {
|
||||||
out: "مَعَنیٰ",
|
out: "مَعَنیٰ",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
}
|
},
|
||||||
|
{
|
||||||
|
describe: "require fathatan on words ending in اً ",
|
||||||
|
tests: [
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "دقیقا",
|
||||||
|
f: "daqeeqan",
|
||||||
|
},
|
||||||
|
out: null,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "دقیقاً",
|
||||||
|
f: "daqeeqan",
|
||||||
|
},
|
||||||
|
out: "دَقِیقاً",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
describe: "Ua ؤ",
|
||||||
|
tests: [
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "مودب",
|
||||||
|
f: "mUaddab",
|
||||||
|
},
|
||||||
|
out: "مؤدَّب",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
];
|
];
|
||||||
|
|
||||||
diacriticsSections.forEach((section) => {
|
diacriticsSections.forEach((section) => {
|
||||||
|
// if (!section.describe.includes("require fathatan")) return;
|
||||||
describe(section.describe, () => {
|
describe(section.describe, () => {
|
||||||
section.tests.forEach((t) => {
|
section.tests.forEach((t) => {
|
||||||
if (t.out) {
|
if (t.out) {
|
||||||
|
@ -785,34 +1075,34 @@ diacriticsSections.forEach((section) => {
|
||||||
|
|
||||||
// ERRORS
|
// ERRORS
|
||||||
|
|
||||||
const brokenDiacritics = [
|
// const brokenDiacritics = [
|
||||||
{
|
// {
|
||||||
p: "تشناب",
|
// p: "تشناب",
|
||||||
f: "peshnaab",
|
// f: "peshnaab",
|
||||||
},
|
// },
|
||||||
{
|
// {
|
||||||
p: "وسېدل",
|
// p: "وسېدل",
|
||||||
f: "osedul",
|
// f: "osedul",
|
||||||
},
|
// },
|
||||||
];
|
// ];
|
||||||
|
|
||||||
test("ending with left over Pashto script will throw an error", () => {
|
// test("ending with left over Pashto script will throw an error", () => {
|
||||||
expect(() => {
|
// expect(() => {
|
||||||
addDiacritics({ p: "کور ته", f: "kor" });
|
// addDiacritics({ p: "کور ته", f: "kor" });
|
||||||
}).toThrow(`phonetics error - phonetics shorter than pashto script`);
|
// }).toThrow(`phonetics error - phonetics shorter than pashto script`);
|
||||||
});
|
// });
|
||||||
|
|
||||||
test("ending with left over phonetics will throw an error", () => {
|
// test("ending with left over phonetics will throw an error", () => {
|
||||||
expect(() => {
|
// expect(() => {
|
||||||
addDiacritics({ p: "کار", f: "kaar kawul" });
|
// addDiacritics({ p: "کار", f: "kaar kawul" });
|
||||||
}).toThrow();
|
// }).toThrow();
|
||||||
});
|
// });
|
||||||
|
|
||||||
test("adding diacritics errors when phonetecs and pashto do not line up", () => {
|
// test("adding diacritics errors when phonetecs and pashto do not line up", () => {
|
||||||
brokenDiacritics.forEach((t) => {
|
// brokenDiacritics.forEach((t) => {
|
||||||
expect(() => {
|
// expect(() => {
|
||||||
addDiacritics(t);
|
// addDiacritics(t);
|
||||||
}).toThrow();
|
// }).toThrow();
|
||||||
});
|
// });
|
||||||
});
|
// });
|
||||||
|
|
||||||
|
|
|
@ -21,15 +21,15 @@ import {
|
||||||
wasla,
|
wasla,
|
||||||
daggerAlif,
|
daggerAlif,
|
||||||
fathahan,
|
fathahan,
|
||||||
lastNonWhitespace,
|
|
||||||
addP,
|
addP,
|
||||||
last,
|
|
||||||
advanceP,
|
advanceP,
|
||||||
reverseP,
|
reverseP,
|
||||||
overwriteP,
|
overwriteP,
|
||||||
advanceForHamza,
|
advanceForHamza,
|
||||||
advanceForHamzaMid,
|
advanceForHamzaMid,
|
||||||
DiacriticsAccumulator,
|
DiacriticsAccumulator,
|
||||||
|
stateInfo,
|
||||||
|
PhonemeStatus,
|
||||||
} from "./diacritics-helpers";
|
} from "./diacritics-helpers";
|
||||||
|
|
||||||
import { firstPhonetics } from "./p-text-helpers";
|
import { firstPhonetics } from "./p-text-helpers";
|
||||||
|
@ -51,27 +51,6 @@ import { pipe } from "rambda";
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
enum PhonemeStatus {
|
|
||||||
LeadingLongVowel,
|
|
||||||
LeadingConsonantOrShortVowel,
|
|
||||||
DoubleConsonantTashdeed,
|
|
||||||
EndingWithHeyHim,
|
|
||||||
DirectMatch,
|
|
||||||
DirectMatchAfterSukun,
|
|
||||||
EndingWithHeyHimFromSukun,
|
|
||||||
ShortVowel,
|
|
||||||
PersianSilentWWithAa,
|
|
||||||
ArabicWasla,
|
|
||||||
Izafe,
|
|
||||||
EndOfDuParticle,
|
|
||||||
HaEndingWithHeem,
|
|
||||||
AlefDaggarEnding,
|
|
||||||
LongAinVowelMissingComma,
|
|
||||||
ShortAinVowelMissingComma,
|
|
||||||
AlefWithHamza,
|
|
||||||
AlefWithHamzaWithGlottalStop,
|
|
||||||
}
|
|
||||||
|
|
||||||
function processPhoneme(
|
function processPhoneme(
|
||||||
acc: DiacriticsAccumulator,
|
acc: DiacriticsAccumulator,
|
||||||
phoneme: Phoneme,
|
phoneme: Phoneme,
|
||||||
|
@ -96,6 +75,7 @@ function processPhoneme(
|
||||||
phonemeInfo,
|
phonemeInfo,
|
||||||
diacritic,
|
diacritic,
|
||||||
phs,
|
phs,
|
||||||
|
prevPLetter,
|
||||||
} = stateInfo({ state, i, phoneme, phonemes });
|
} = stateInfo({ state, i, phoneme, phonemes });
|
||||||
|
|
||||||
// console.log("phoneme", phoneme);
|
// console.log("phoneme", phoneme);
|
||||||
|
@ -154,10 +134,9 @@ function processPhoneme(
|
||||||
reverseP,
|
reverseP,
|
||||||
addP(zwarakey),
|
addP(zwarakey),
|
||||||
)(state)
|
)(state)
|
||||||
: (phs === PhonemeStatus.HaEndingWithHeem) ?
|
: (phs === PhonemeStatus.ShortAEndingAfterHeem) ?
|
||||||
pipe(
|
pipe(
|
||||||
reverseP,
|
prevPLetter === " " ? reverseP : addP(""),
|
||||||
// prevPLetter === " " ? reverseP ,
|
|
||||||
addP(zwar),
|
addP(zwar),
|
||||||
)(state)
|
)(state)
|
||||||
: (phs === PhonemeStatus.EndingWithHeyHimFromSukun) ?
|
: (phs === PhonemeStatus.EndingWithHeyHimFromSukun) ?
|
||||||
|
@ -181,114 +160,44 @@ function processPhoneme(
|
||||||
addP(diacritic),
|
addP(diacritic),
|
||||||
advanceP,
|
advanceP,
|
||||||
)(state)
|
)(state)
|
||||||
|
: (phs === PhonemeStatus.ShortAinVowelMissingCommaAfterAlefStart) ?
|
||||||
|
pipe(
|
||||||
|
advanceP,
|
||||||
|
advanceP,
|
||||||
|
)(state)
|
||||||
|
: (phs === PhonemeStatus.AinWithLongAAtBeginning) ?
|
||||||
|
pipe(
|
||||||
|
advanceP,
|
||||||
|
advanceP,
|
||||||
|
)(state)
|
||||||
: (phs === PhonemeStatus.AlefWithHamza) ?
|
: (phs === PhonemeStatus.AlefWithHamza) ?
|
||||||
pipe(
|
pipe(
|
||||||
advanceP,
|
advanceP,
|
||||||
)(state)
|
)(state)
|
||||||
: (phs === PhonemeStatus.AlefWithHamzaWithGlottalStop) ?
|
: (phs === PhonemeStatus.ShortVowel) ?
|
||||||
state
|
|
||||||
:
|
|
||||||
// phs === PhonemeState.ShortVowel
|
|
||||||
pipe(
|
pipe(
|
||||||
advanceForHamzaMid,
|
advanceForHamzaMid,
|
||||||
addP(phonemeInfo.diacritic),
|
addP(phonemeInfo.diacritic),
|
||||||
// TODO THIS?
|
// TODO THIS?
|
||||||
advanceForHamza,
|
advanceForHamza,
|
||||||
)(state);
|
)(state)
|
||||||
}
|
: (phs === PhonemeStatus.ShortAForAlefBeforeFathatan) ?
|
||||||
|
pipe(
|
||||||
|
advanceP,
|
||||||
|
)(state)
|
||||||
|
: (phs === PhonemeStatus.NOnFathatan) ?
|
||||||
|
pipe(
|
||||||
|
advanceP,
|
||||||
|
)(state)
|
||||||
|
: state;
|
||||||
|
|
||||||
|
// (phs === PhonemeStatus.AlefWithHamzaWithGlottalStop) ?
|
||||||
|
// state
|
||||||
|
// : (phs === PhonemeStatus.AinBeginningAfterShortVowel) ?
|
||||||
|
// state
|
||||||
|
//: (phs === PhonemeStatus.WoEndingO) ?
|
||||||
|
// state
|
||||||
|
// :
|
||||||
|
//
|
||||||
|
|
||||||
|
|
||||||
function stateInfo({ state, i, phonemes, phoneme }: {
|
|
||||||
state: DiacriticsAccumulator,
|
|
||||||
i: number,
|
|
||||||
phonemes: Phoneme[],
|
|
||||||
phoneme: Phoneme,
|
|
||||||
}) {
|
|
||||||
const prevPLetter = last(state.pOut);
|
|
||||||
const currentPLetter = state.pIn[0];
|
|
||||||
const nextPLetter = state.pIn[1];
|
|
||||||
const isBeginningOfWord = state.pOut === "" || prevPLetter === " ";
|
|
||||||
const isEndOfWord = !nextPLetter || nextPLetter === " ";
|
|
||||||
const phonemeInfo = phonemeTable[phoneme];
|
|
||||||
const nextPhoneme = phonemes[i+1];
|
|
||||||
const previousPhoneme = i > 0 && phonemes[i-1];
|
|
||||||
const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]];
|
|
||||||
// const nextPhoneme = (phonemes.length > (i + 1)) && phonemes[i+1];
|
|
||||||
// const nextPhonemeInfo = nextPhoneme ? phonemeTable[nextPhoneme] : undefined;
|
|
||||||
const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
|
|
||||||
const needsTashdeed = !isBeginningOfWord && doubleConsonant && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter);
|
|
||||||
const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter));
|
|
||||||
const useAinBlendDiacritics = (!isBeginningOfWord && (phonemeInfo.ainBlendDiacritic && currentPLetter === "ع"));
|
|
||||||
const diacritic = useAinBlendDiacritics
|
|
||||||
? phonemeInfo.ainBlendDiacritic
|
|
||||||
: isEndOfWord
|
|
||||||
? ((!phonemeInfo.longVowel || phonemeInfo.useEndingDiacritic) ? phonemeInfo.diacritic : undefined) : phonemeInfo.diacritic;
|
|
||||||
|
|
||||||
function getPhonemeState(): PhonemeStatus {
|
|
||||||
if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) {
|
|
||||||
if (phoneme !== "aa" && currentPLetter !== "ا" && !phonemeInfo.matches?.includes(nextPLetter)) {
|
|
||||||
throw Error("phonetics error - needs alef prefix");
|
|
||||||
}
|
}
|
||||||
return PhonemeStatus.LeadingLongVowel;
|
|
||||||
}
|
|
||||||
if (isBeginningOfWord && (phonemeInfo.beginningMatches?.includes(currentPLetter) || phonemeInfo.matches?.includes(currentPLetter))) {
|
|
||||||
return PhonemeStatus.LeadingConsonantOrShortVowel;
|
|
||||||
}
|
|
||||||
// console.log("------");
|
|
||||||
// console.log("phoneme", phoneme);
|
|
||||||
// console.log("state", state);
|
|
||||||
// console.log("prevPLetter is space", prevPLetter === " ");
|
|
||||||
// console.log("------");
|
|
||||||
if (isBeginningOfWord && phoneme === "u" && prevPLetter === " " && lastNonWhitespace(state.pOut) === "د") {
|
|
||||||
return PhonemeStatus.EndOfDuParticle
|
|
||||||
}
|
|
||||||
if (!isBeginningOfWord && phoneme === "aa" && currentPLetter === "و" && nextPLetter === "ا") {
|
|
||||||
return PhonemeStatus.PersianSilentWWithAa;
|
|
||||||
}
|
|
||||||
if (!isBeginningOfWord && phoneme === "i" && currentPLetter === "ا" && nextPLetter === "ل") {
|
|
||||||
return PhonemeStatus.ArabicWasla;
|
|
||||||
}
|
|
||||||
if (phoneme === "-i-" && isBeginningOfWord) {
|
|
||||||
return PhonemeStatus.Izafe;
|
|
||||||
}
|
|
||||||
if (phoneme === "a" && currentPLetter === "أ") {
|
|
||||||
return PhonemeStatus.AlefWithHamza;
|
|
||||||
}
|
|
||||||
if (phoneme === "'" && nextPhoneme === "a" && currentPLetter === "أ") {
|
|
||||||
return PhonemeStatus.AlefWithHamzaWithGlottalStop;
|
|
||||||
}
|
|
||||||
if (currentPLetter === "ع" && phoneme !== "'" && nextPhoneme !== "'" && phonemeInfo.diacritic && !phonemeInfo.longVowel) {
|
|
||||||
return PhonemeStatus.ShortAinVowelMissingComma;
|
|
||||||
}
|
|
||||||
if (useAinBlendDiacritics) {
|
|
||||||
return PhonemeStatus.LongAinVowelMissingComma;
|
|
||||||
}
|
|
||||||
if (needsTashdeed) {
|
|
||||||
return PhonemeStatus.DoubleConsonantTashdeed;
|
|
||||||
}
|
|
||||||
if (phoneme === "aa" && currentPLetter === "ی" && nextPLetter === daggerAlif) {
|
|
||||||
return PhonemeStatus.AlefDaggarEnding;
|
|
||||||
}
|
|
||||||
if (((isEndOfWord && prevPLetter === "ح") || (prevPLetter === " " && state.pOut[state.pOut.length - 2])) && phoneme === "a") {
|
|
||||||
return PhonemeStatus.HaEndingWithHeem;
|
|
||||||
}
|
|
||||||
if (isEndOfWord && ((phoneme === "u" && currentPLetter === "ه") || (phoneme === "h" && ["ه", "ح"].includes(currentPLetter)))) {
|
|
||||||
return needsSukun ? PhonemeStatus.EndingWithHeyHimFromSukun : PhonemeStatus.EndingWithHeyHim;
|
|
||||||
}
|
|
||||||
if ((phonemeInfo.matches?.includes(currentPLetter) || (isEndOfWord && phonemeInfo.endingMatches?.includes(currentPLetter)) || (phoneme === "m" && currentPLetter === "ن" && nextPLetter === "ب"))) {
|
|
||||||
return needsSukun ? PhonemeStatus.DirectMatchAfterSukun : PhonemeStatus.DirectMatch;
|
|
||||||
}
|
|
||||||
if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {
|
|
||||||
return PhonemeStatus.ShortVowel;
|
|
||||||
}
|
|
||||||
// console.log("bad phoneme is ", phoneme);
|
|
||||||
throw new Error("phonetics error - no status found for phoneme: " + phoneme);
|
|
||||||
}
|
|
||||||
|
|
||||||
const phs = getPhonemeState();
|
|
||||||
|
|
||||||
return {
|
|
||||||
phs, phonemeInfo, diacritic,
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
Loading…
Reference in New Issue