This commit is contained in:
Bill D 2021-05-08 21:31:59 +03:00
parent 6053d11bc0
commit 2dea82c32b
2 changed files with 330 additions and 43 deletions

View File

@ -242,6 +242,196 @@ const diacriticsTest: Array<{
}, },
out: "لِیک", out: "لِیک",
}, },
{
in: {
p: "رغېدل",
f: "raghedul",
},
out: "رَغېد" + zwarakey + "ل",
},
{
in: {
p: "کارول",
f: "kaarawul",
},
out: "کارَو" + zwarakey + "ل",
},
{
in: {
p: "پېښېدل",
f: "pexedul",
},
out: "پېښېد" + zwarakey + "ل",
},
{
in: {
p: "مین",
f: "mayín",
},
out: "مَیِن",
},
{
in: {
p: "سړی",
f: "saRey",
},
out: "سَړی",
},
{
in: {
p: "سړي",
f: "saRee",
},
out: "سَړي",
},
{
in: {
p: "زه",
f: "zu",
},
out: "زهٔ",
},
{
in: {
p: "زه",
f: "za",
},
out: "زَه",
},
{
in: {
p: "پېشنهاد",
f: "peshniháad",
},
out: "پېشْنِهاد",
},
{
in: {
p: "ایستل",
f: "eestul",
},
out: "اِیسْت" + zwarakey + "ل",
},
{
in: {
p: "ایستل",
f: "eystul",
},
out: "ایسْت" + zwarakey + "ل",
},
{
in: {
p: "اېسېدل",
f: "esedul",
},
out: "اېسېد" + zwarakey + "ل",
},
{
in: {
p: "اوسېدل",
f: "osedul",
},
out: "اوسېد" + zwarakey + "ل",
},
{
in: {
p: "اواز",
f: "awaaz",
},
out: "اَواز",
},
{
in: {
p: "اسلام",
f: "islaam",
},
out: "اِسْلام",
},
{
in: {
p: "واردول",
f: "waaridawul",
},
out: "وارِدَو" + zwarakey + "ل",
},
{
in: {
p: "غاړه",
f: "ghaaRa",
},
out: "غاړَه",
},
{
in: {
p: "اوتر",
f: "awtár",
},
out: "اَوْتَر",
},
{
in: {
p: "اختیار",
f: "ikhtiyáar",
},
out: "اِخْتِیار",
},
{
in: {
p: "فریاد",
f: "faryáad",
},
out: "فَرْیاد",
},
{
in: {
p: "کارغه",
f: "kaarghu",
},
out: "کارْغهٔ",
},
{
in: {
p: "بې کار",
f: "be kaar",
},
out: "بې کار",
},
{
in: {
p: "بې کار",
f: "bekaar",
},
out: "بې کار",
},
// TODO: nb/mb thing - ن written before ب but pronounced "m" (e.g. انبار -> ambáar)
{
in: {
p: "انبار",
f: "ambáar",
},
out: "اَنْبار",
},
{
in: {
p: "ارغون",
f: "arghóon",
},
out: "اَرْغُون",
},
{
in: {
p: "ارمټه",
f: "armaTa",
},
out: "اَرْمَټَه",
},
{
in: {
p: "اروا پوه",
f: "arwaa poh",
},
out: "اَرْوا پوهْ",
},
// starting alefs // starting alefs
{ {
in: { in: {
@ -295,21 +485,28 @@ const diacriticsTest: Array<{
}, },
out: "پَرْمَخْتْیا", out: "پَرْمَخْتْیا",
}, },
// { {
// in: { in: {
// p: "پته", p: "پته",
// f: "patta", f: "patta",
// }, },
// out: "پَتّه", out: "پَتَّه",
// }, },
{
in: {
p: "پته تور",
f: "patta toor",
},
out: "پَتَّه تُور",
},
// get ayn stuff working // get ayn stuff working
// { {
// in: { in: {
// p: "اعتصاب شکن", p: "اعتصاب شکن",
// f: "itisaabshikan", f: "itisaab shakan",
// }, },
// out: "اِعتِصاب شِکَن", out: "اِعتِصاب شَکَن",
// }, },
// avoid false double consonant // avoid false double consonant
{ {
in: { in: {
@ -318,6 +515,36 @@ const diacriticsTest: Array<{
}, },
out: "اَزَل لِیک", out: "اَزَل لِیک",
}, },
// starting with ع
{
in: {
p: "عزت",
f: "izzat",
},
out: "عِزَّت",
},
{
in: {
p: "عزت",
f: "i'zzat",
},
out: "عِزَّت",
},
// ئ in the middle
{
in: {
p: "برائت",
f: "baraa'at",
},
out: "بَرائَت",
},
{
in: {
p: "فائده",
f: "faaida",
},
out: "فائِدَه",
},
]; ];
phonemeSplits.forEach((s) => { phonemeSplits.forEach((s) => {
@ -327,8 +554,9 @@ phonemeSplits.forEach((s) => {
}); });
}); });
test("adding diacritics should work", () => {
diacriticsTest.forEach((t) => { diacriticsTest.forEach((t) => {
test(`diacritics should work for ${t.in.p} - ${t.in.f}`, () => {
expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f }); expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f });
}); });
}); });

View File

@ -41,6 +41,7 @@ type PhonemeInfo = {
takesSukunOnEnding?: true, takesSukunOnEnding?: true,
longVowel?: true, longVowel?: true,
canStartWithAynBefore?: true, canStartWithAynBefore?: true,
useEndingDiacritic?: true,
} }
const phonemeTable: Record<Phoneme, PhonemeInfo> = { const phonemeTable: Record<Phoneme, PhonemeInfo> = {
@ -211,6 +212,7 @@ const phonemeTable: Record<Phoneme, PhonemeInfo> = {
longVowel: true, longVowel: true,
// alsoCanBePrefix: true, // alsoCanBePrefix: true,
diacritic: pesh, diacritic: pesh,
useEndingDiacritic: true,
}, },
"ey": { "ey": {
matches: ["ی"], matches: ["ی"],
@ -231,14 +233,13 @@ const phonemeTable: Record<Phoneme, PhonemeInfo> = {
"a": { "a": {
diacritic: zwar, diacritic: zwar,
endingMatches: ["ه"], endingMatches: ["ه"],
beginningMatches: ["ا"], beginningMatches: ["ا", "ع"],
// canComeAfterHeyEnding: true, // canComeAfterHeyEnding: true,
// canBeFirstPartOfFathahanEnding: true, // canBeFirstPartOfFathahanEnding: true,
}, },
"u": { "u": {
diacritic: zwarakey, diacritic: zwarakey,
endingMatches: ["ه"], endingMatches: ["ه"],
// hamzaOnEnd: true,
}, },
"i": { "i": {
diacritic: zer, diacritic: zer,
@ -270,7 +271,7 @@ export function splitFIntoPhonemes(fIn: string): Phoneme[] {
const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"]; const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"];
const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"]; const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"];
const endingDigraphs: Phoneme[] = ["uy"]; const endingDigraphs: Phoneme[] = ["uy"];
const willIgnore = ["?", " ", "`", ".", "…", ","]; const willIgnore = ["?", " ", "`", ".", "…", ",", "'"];
const result: Phoneme[] = []; const result: Phoneme[] = [];
const f = removeAccents(fIn); const f = removeAccents(fIn);
@ -334,28 +335,27 @@ function processPhoneme(
i: number, i: number,
phonemes: Phoneme[], phonemes: Phoneme[],
) { ) {
// console.log("PHONEME", phoneme);
// console.log("space coming up", acc.pIn[0] === " ");
// console.log("state", acc);
// Prep state // Prep state
const state = acc.pIn[0] === " " ? advanceP(acc) : acc; const state = acc.pIn[0] === " " ? advanceP(acc) : acc;
// console.log("AFTER SPACE PREP", phoneme);
// console.log("state", state);
// WARNING: Do not use acc after this point! // WARNING: Do not use acc after this point!
const prevPLetter = last(state.pOut); const {
const currentPLetter = state.pIn[0]; phonemeInfo,
const nextPLetter = state.pIn[1]; isBeginningOfWord,
const isBeginningOfWord = state.pOut === "" || prevPLetter === " "; currentPLetter,
// const isEndOfWord = !nextPLetter || nextPLetter === " "; needsTashdeed,
const phonemeInfo = phonemeTable[phoneme]; sukunOrDiacritic,
const previousPhoneme = i > 0 && phonemes[i-1]; nextPLetter,
const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]]; isEndOfWord,
// const nextPhoneme = (phonemes.length > (i + 1)) && phonemes[i+1]; } = stateInfo({ state, i, phoneme, phonemes });
// const nextPhonemeInfo = nextPhoneme ? phonemeTable[nextPhoneme] : undefined;
const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
const needsTashdeed = !isBeginningOfWord && doubleConsonant && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter);
const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter));
const sukunOrDiacritic = (needsSukun ? sukun : phonemeInfo.diacritic ? phonemeInfo.diacritic : "");
// if it's not an exception (TODO) // if it's not an exception (TODO)
// it must be one of the following 5 possibilities // it must be one of the following 5 possibilities
// 1. beginning a word with a long vowel // 1. beginning a word with a long vowel
if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) { if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) {
if (phoneme !== "aa" && currentPLetter !== "ا" && !phonemeInfo.matches?.includes(nextPLetter)) { if (phoneme !== "aa" && currentPLetter !== "ا" && !phonemeInfo.matches?.includes(nextPLetter)) {
@ -371,29 +371,65 @@ function processPhoneme(
return pipe( return pipe(
advanceP, advanceP,
addP(sukunOrDiacritic), addP(sukunOrDiacritic),
advanceForAin,
)(state); )(state);
// 3. double consonant to be marked with tashdeed // 3. double consonant to be marked with tashdeed
} else if (needsTashdeed) { } else if (needsTashdeed) {
return addP(tashdeed)(state); return pipe(
// 4. direct match of phoneme / P letter addP(tashdeed)
} else if (phonemeInfo.matches?.includes(currentPLetter)) { )(state);
// 4. special ه ending
} else if (isEndOfWord && ((phoneme === "u" && currentPLetter === "ه") || (phoneme === "h" && ["ه", "ح"].includes(currentPLetter)))) {
return pipe(
advanceP,
addP(phoneme === "u" ? hamzaAbove : sukun),
)(state);
// 5. direct match of phoneme / P letter
} else if (phonemeInfo.matches?.includes(currentPLetter) || (isEndOfWord && phonemeInfo.endingMatches?.includes(currentPLetter)) || (phoneme === "m" && currentPLetter === "ن" && nextPLetter === "ب")) {
return pipe( return pipe(
addP(sukunOrDiacritic), addP(sukunOrDiacritic),
advanceP, advanceP,
)(state); )(state);
// 5. just a diacritic for short vowel // 6. just a diacritic for short vowel
} else if (phonemeInfo.diacritic && !phonemeInfo.longVowel) { } else if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {
return pipe( return pipe(
advanceForHamzaMid,
addP(phonemeInfo.diacritic), addP(phonemeInfo.diacritic),
advanceIfReachedEndingHamza, advanceForAinOrHamza,
)(state); )(state);
} }
// anything that gets to this point is a failure/error // anything that gets to this point is a failure/error
// console.log(state);
throw new Error("phonetics error"); throw new Error("phonetics error");
} }
/**
 * Collects everything processPhoneme needs to know about the current step:
 * the letters at `state.pIn[0]` / `state.pIn[1]`, word-boundary flags, the
 * phoneme's entry in `phonemeTable`, and which mark should be written.
 *
 * @param state - the accumulator being processed (reads `pIn` and `pOut`)
 * @param i - index of `phoneme` inside `phonemes`
 * @param phonemes - the full list of phonemes being processed
 * @param phoneme - the phoneme at position `i`
 * @returns
 *  - `phonemeInfo`: the `phonemeTable` entry for `phoneme`
 *  - `isBeginningOfWord`: `pOut` is empty or its last character is a space
 *  - `isEndOfWord`: there is no letter after the current one, or it is a space
 *  - `currentPLetter` / `nextPLetter`: `pIn[0]` / `pIn[1]`
 *  - `needsTashdeed`: truthy for a repeated consonant phoneme (not at the
 *    start of a word) whose letter is NOT the current input letter
 *  - `sukunOrDiacritic`: `sukun` for a consonant following a consonant that
 *    is either a different phoneme or matches the current input letter;
 *    otherwise the phoneme's diacritic (possibly `undefined`)
 */
function stateInfo({ state, i, phonemes, phoneme }: {
  state: DiacriticsAccumulator,
  i: number,
  phonemes: Phoneme[],
  phoneme: Phoneme,
}) {
  const phonemeInfo = phonemeTable[phoneme];
  const currentPLetter = state.pIn[0];
  const nextPLetter = state.pIn[1];

  // word boundaries
  const lastWritten = last(state.pOut);
  const isBeginningOfWord = state.pOut === "" || lastWritten === " ";
  const isEndOfWord = !nextPLetter || nextPLetter === " ";

  // look back one phoneme; the table entry is only looked up mid-word
  const hasPrevious = i > 0;
  const previousPhoneme = hasPrevious ? phonemes[i - 1] : false;
  const previousInfo = (!isBeginningOfWord && hasPrevious)
    ? phonemeTable[phonemes[i - 1]]
    : false;

  // evaluated lazily, exactly where the flags below need it
  const matchesCurrentLetter = () => phonemeInfo.matches?.includes(currentPLetter);
  const samePhonemeAsPrevious = previousPhoneme === phoneme;
  const doubleConsonant = previousInfo && phonemeInfo.consonant && previousInfo.consonant;

  const needsTashdeed = !isBeginningOfWord
    && doubleConsonant
    && samePhonemeAsPrevious
    && !matchesCurrentLetter();
  const needsSukun = doubleConsonant
    && (!samePhonemeAsPrevious || matchesCurrentLetter());

  // at the end of a word a long vowel keeps its diacritic only when the
  // table entry opts in with useEndingDiacritic
  const dropsDiacriticAtEnd = isEndOfWord
    && phonemeInfo.longVowel
    && !phonemeInfo.useEndingDiacritic;
  const diacritic = dropsDiacriticAtEnd ? undefined : phonemeInfo.diacritic;
  const sukunOrDiacritic = needsSukun ? sukun : diacritic;

  return {
    phonemeInfo,
    isBeginningOfWord,
    currentPLetter,
    needsTashdeed,
    sukunOrDiacritic,
    nextPLetter,
    isEndOfWord,
  };
}
/** /**
* returns the last character of a string * returns the last character of a string
* *
@ -417,8 +453,31 @@ const addP = (toAdd: string | undefined) => (state: DiacriticsAccumulator): Diac
}; };
} }
function advanceIfReachedEndingHamza(state: DiacriticsAccumulator): DiacriticsAccumulator { function getCurrentNext(state: DiacriticsAccumulator): { current: string, next: string} {
if (state.pIn[0] === "ه" && (!state.pIn[1] || state.pIn[1] === " ")) { return {
current: state.pIn[0],
next: state.pIn[1],
};
}
/**
 * Applies `advanceP` when the current input letter is ع; any other letter
 * leaves the state untouched.
 *
 * @param state - the accumulator to inspect
 * @returns the result of `advanceP(state)` for ع, otherwise `state` itself
 */
function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator {
  const { current: letter } = getCurrentNext(state);
  if (letter !== "ع") {
    return state;
  }
  return advanceP(state);
}
/**
 * Applies `advanceP` when the current input letter is ئ and it is followed
 * by some letter other than another ئ; otherwise returns the state as-is.
 *
 * @param state - the accumulator to inspect
 * @returns the result of `advanceP(state)` in the case above, otherwise `state` itself
 */
function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccumulator {
  const { current, next: following } = getCurrentNext(state);
  const onHamza = current === "ئ";
  // a missing (or empty) following letter never counts
  const followedByOtherLetter = Boolean(following) && following !== "ئ";
  return onHamza && followedByOtherLetter ? advanceP(state) : state;
}
function advanceForAinOrHamza(state: DiacriticsAccumulator): DiacriticsAccumulator {
const { current, next } = getCurrentNext(state);
if (current === "ه" && (!next || next === " ")) {
return advanceP(state);
}
if (current === "ع") {
return advanceP(state); return advanceP(state);
} }
return state; return state;