This commit is contained in:
Bill D 2021-05-08 21:31:59 +03:00
parent 6053d11bc0
commit 2dea82c32b
2 changed files with 330 additions and 43 deletions

View File

@ -242,6 +242,196 @@ const diacriticsTest: Array<{
},
out: "لِیک",
},
{
in: {
p: "رغېدل",
f: "raghedul",
},
out: "رَغېد" + zwarakey + "ل",
},
{
in: {
p: "کارول",
f: "kaarawul",
},
out: "کارَو" + zwarakey + "ل",
},
{
in: {
p: "پېښېدل",
f: "pexedul",
},
out: "پېښېد" + zwarakey + "ل",
},
{
in: {
p: "مین",
f: "mayín",
},
out: "مَیِن",
},
{
in: {
p: "سړی",
f: "saRey",
},
out: "سَړی",
},
{
in: {
p: "سړي",
f: "saRee",
},
out: "سَړي",
},
{
in: {
p: "زه",
f: "zu",
},
out: "زهٔ",
},
{
in: {
p: "زه",
f: "za",
},
out: "زَه",
},
{
in: {
p: "پېشنهاد",
f: "peshniháad",
},
out: "پېشْنِهاد",
},
{
in: {
p: "ایستل",
f: "eestul",
},
out: "اِیسْت" + zwarakey + "ل",
},
{
in: {
p: "ایستل",
f: "eystul",
},
out: "ایسْت" + zwarakey + "ل",
},
{
in: {
p: "اېسېدل",
f: "esedul",
},
out: "اېسېد" + zwarakey + "ل",
},
{
in: {
p: "اوسېدل",
f: "osedul",
},
out: "اوسېد" + zwarakey + "ل",
},
{
in: {
p: "اواز",
f: "awaaz",
},
out: "اَواز",
},
{
in: {
p: "اسلام",
f: "islaam",
},
out: "اِسْلام",
},
{
in: {
p: "واردول",
f: "waaridawul",
},
out: "وارِدَو" + zwarakey + "ل",
},
{
in: {
p: "غاړه",
f: "ghaaRa",
},
out: "غاړَه",
},
{
in: {
p: "اوتر",
f: "awtár",
},
out: "اَوْتَر",
},
{
in: {
p: "اختیار",
f: "ikhtiyáar",
},
out: "اِخْتِیار",
},
{
in: {
p: "فریاد",
f: "faryáad",
},
out: "فَرْیاد",
},
{
in: {
p: "کارغه",
f: "kaarghu",
},
out: "کارْغهٔ",
},
{
in: {
p: "بې کار",
f: "be kaar",
},
out: "بې کار",
},
{
in: {
p: "بې کار",
f: "bekaar",
},
out: "بې کار",
},
// TODO: nb/mb thing - ن written before ب but transcribed as "m" in the phonetics (e.g. انبار - ambáar)
{
in: {
p: "انبار",
f: "ambáar",
},
out: "اَنْبار",
},
{
in: {
p: "ارغون",
f: "arghóon",
},
out: "اَرْغُون",
},
{
in: {
p: "ارمټه",
f: "armaTa",
},
out: "اَرْمَټَه",
},
{
in: {
p: "اروا پوه",
f: "arwaa poh",
},
out: "اَرْوا پوهْ",
},
// starting alefs
{
in: {
@ -295,21 +485,28 @@ const diacriticsTest: Array<{
},
out: "پَرْمَخْتْیا",
},
// {
// in: {
// p: "پته",
// f: "patta",
// },
// out: "پَتّه",
// },
{
in: {
p: "پته",
f: "patta",
},
out: "پَتَّه",
},
{
in: {
p: "پته تور",
f: "patta toor",
},
out: "پَتَّه تُور",
},
// get ayn stuff working
// {
// in: {
// p: "اعتصاب شکن",
// f: "itisaabshikan",
// },
// out: "اِعتِصاب شِکَن",
// },
{
in: {
p: "اعتصاب شکن",
f: "itisaab shakan",
},
out: "اِعتِصاب شَکَن",
},
// avoid false double consonant
{
in: {
@ -318,6 +515,36 @@ const diacriticsTest: Array<{
},
out: "اَزَل لِیک",
},
// starting with ع
{
in: {
p: "عزت",
f: "izzat",
},
out: "عِزَّت",
},
{
in: {
p: "عزت",
f: "i'zzat",
},
out: "عِزَّت",
},
// ئ in the middle
{
in: {
p: "برائت",
f: "baraa'at",
},
out: "بَرائَت",
},
{
in: {
p: "فائده",
f: "faaida",
},
out: "فائِدَه",
},
];
phonemeSplits.forEach((s) => {
@ -327,8 +554,9 @@ phonemeSplits.forEach((s) => {
});
});
test("adding diacritics should work", () => {
diacriticsTest.forEach((t) => {
test(`diacritics should work for ${t.in.p} - ${t.in.f}`, () => {
expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f });
});
});

View File

@ -41,6 +41,7 @@ type PhonemeInfo = {
takesSukunOnEnding?: true,
longVowel?: true,
canStartWithAynBefore?: true,
useEndingDiacritic?: true,
}
const phonemeTable: Record<Phoneme, PhonemeInfo> = {
@ -211,6 +212,7 @@ const phonemeTable: Record<Phoneme, PhonemeInfo> = {
longVowel: true,
// alsoCanBePrefix: true,
diacritic: pesh,
useEndingDiacritic: true,
},
"ey": {
matches: ["ی"],
@ -231,14 +233,13 @@ const phonemeTable: Record<Phoneme, PhonemeInfo> = {
"a": {
diacritic: zwar,
endingMatches: ["ه"],
beginningMatches: ["ا"],
beginningMatches: ["ا", "ع"],
// canComeAfterHeyEnding: true,
// canBeFirstPartOfFathahanEnding: true,
},
"u": {
diacritic: zwarakey,
endingMatches: ["ه"],
// hamzaOnEnd: true,
},
"i": {
diacritic: zer,
@ -270,7 +271,7 @@ export function splitFIntoPhonemes(fIn: string): Phoneme[] {
const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"];
const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"];
const endingDigraphs: Phoneme[] = ["uy"];
const willIgnore = ["?", " ", "`", ".", "…", ","];
const willIgnore = ["?", " ", "`", ".", "…", ",", "'"];
const result: Phoneme[] = [];
const f = removeAccents(fIn);
@ -334,28 +335,27 @@ function processPhoneme(
i: number,
phonemes: Phoneme[],
) {
// console.log("PHONEME", phoneme);
// console.log("space coming up", acc.pIn[0] === " ");
// console.log("state", acc);
// Prep state
const state = acc.pIn[0] === " " ? advanceP(acc) : acc;
// console.log("AFTER SPACE PREP", phoneme);
// console.log("state", state);
// WARNING: Do not use acc after this point!
const prevPLetter = last(state.pOut);
const currentPLetter = state.pIn[0];
const nextPLetter = state.pIn[1];
const isBeginningOfWord = state.pOut === "" || prevPLetter === " ";
// const isEndOfWord = !nextPLetter || nextPLetter === " ";
const phonemeInfo = phonemeTable[phoneme];
const previousPhoneme = i > 0 && phonemes[i-1];
const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]];
// const nextPhoneme = (phonemes.length > (i + 1)) && phonemes[i+1];
// const nextPhonemeInfo = nextPhoneme ? phonemeTable[nextPhoneme] : undefined;
const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
const needsTashdeed = !isBeginningOfWord && doubleConsonant && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter);
const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter));
const sukunOrDiacritic = (needsSukun ? sukun : phonemeInfo.diacritic ? phonemeInfo.diacritic : "");
const {
phonemeInfo,
isBeginningOfWord,
currentPLetter,
needsTashdeed,
sukunOrDiacritic,
nextPLetter,
isEndOfWord,
} = stateInfo({ state, i, phoneme, phonemes });
// if it's not an exception (TODO)
// it must be one of the following 6 possibilities
// 1. beginning a word with a long vowel
if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) {
if (phoneme !== "aa" && currentPLetter !== "ا" && !phonemeInfo.matches?.includes(nextPLetter)) {
@ -371,29 +371,65 @@ function processPhoneme(
return pipe(
advanceP,
addP(sukunOrDiacritic),
advanceForAin,
)(state);
// 3. double consonant to be marked with tashdeed
} else if (needsTashdeed) {
return addP(tashdeed)(state);
// 4. direct match of phoneme / P letter
} else if (phonemeInfo.matches?.includes(currentPLetter)) {
return pipe(
addP(tashdeed)
)(state);
// 4. special ه ending
} else if (isEndOfWord && ((phoneme === "u" && currentPLetter === "ه") || (phoneme === "h" && ["ه", "ح"].includes(currentPLetter)))) {
return pipe(
advanceP,
addP(phoneme === "u" ? hamzaAbove : sukun),
)(state);
// 5. direct match of phoneme / P letter
} else if (phonemeInfo.matches?.includes(currentPLetter) || (isEndOfWord && phonemeInfo.endingMatches?.includes(currentPLetter)) || (phoneme === "m" && currentPLetter === "ن" && nextPLetter === "ب")) {
return pipe(
addP(sukunOrDiacritic),
advanceP,
)(state);
// 5. just a diacritic for short vowel
// 6. just a diacritic for short vowel
} else if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {
return pipe(
advanceForHamzaMid,
addP(phonemeInfo.diacritic),
advanceIfReachedEndingHamza,
advanceForAinOrHamza,
)(state);
}
// anything that gets to this point is a failure/error
// console.log(state);
throw new Error("phonetics error");
}
/**
 * Collects the facts about the current position that the phoneme-processing
 * rules branch on.
 *
 * @param state - accumulator whose `pIn` is the script still to be consumed
 *   and whose `pOut` is the output written so far
 * @param i - index of `phoneme` within `phonemes`
 * @param phonemes - the full phoneme sequence being processed
 * @param phoneme - the phoneme currently being processed
 * @returns the phoneme's table entry, the current and next script letters,
 *   word-boundary flags, whether a tashdeed is needed, and the mark (sukun or
 *   the phoneme's diacritic, possibly `undefined`) to write for this phoneme
 */
function stateInfo({ state, i, phonemes, phoneme }: {
    state: DiacriticsAccumulator,
    i: number,
    phonemes: Phoneme[],
    phoneme: Phoneme,
}) {
    const lastOutputChar = last(state.pOut);
    const currentPLetter = state.pIn[0];
    const nextPLetter = state.pIn[1];

    // A word begins when nothing has been written yet or a space was just written.
    const isBeginningOfWord = state.pOut === "" || lastOutputChar === " ";
    // A word ends when the letter after the current one is missing or is a space.
    const isEndOfWord = !nextPLetter || nextPLetter === " ";

    const phonemeInfo = phonemeTable[phoneme];
    const hasPrevious = i > 0;
    const previousPhoneme = hasPrevious && phonemes[i - 1];
    // The previous phoneme's info is only considered inside a word.
    const previousPhonemeInfo = (!isBeginningOfWord && hasPrevious) && phonemeTable[phonemes[i - 1]];

    // Truthy only when this phoneme and the previous one are both consonants.
    const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
    const repeatsPrevious = previousPhoneme === phoneme;
    // Kept lazy so the lookup only runs where the && / || chains reach it.
    const letterMatchesPhoneme = () => phonemeInfo.matches?.includes(currentPLetter);

    // Same consonant twice with no separate letter written for the second one.
    const needsTashdeed = !isBeginningOfWord && doubleConsonant && repeatsPrevious && !letterMatchesPhoneme();
    // Consonant pair where this phoneme differs from the previous one or has its own letter.
    const needsSukun = doubleConsonant && (!repeatsPrevious || letterMatchesPhoneme());

    // At the end of a word a long vowel drops its diacritic unless it opts in
    // through useEndingDiacritic.
    const dropsEndingDiacritic = isEndOfWord && phonemeInfo.longVowel && !phonemeInfo.useEndingDiacritic;
    const diacritic = dropsEndingDiacritic ? undefined : phonemeInfo.diacritic;
    const sukunOrDiacritic = needsSukun ? sukun : diacritic;

    return {
        phonemeInfo,
        isBeginningOfWord,
        currentPLetter,
        needsTashdeed,
        sukunOrDiacritic,
        nextPLetter,
        isEndOfWord,
    };
}
/**
* returns the last character of a string
*
@ -417,8 +453,31 @@ const addP = (toAdd: string | undefined) => (state: DiacriticsAccumulator): Diac
};
}
function advanceIfReachedEndingHamza(state: DiacriticsAccumulator): DiacriticsAccumulator {
if (state.pIn[0] === "ه" && (!state.pIn[1] || state.pIn[1] === " ")) {
/**
 * Reads the first two entries of the remaining script input.
 *
 * @param state - accumulator whose `pIn` is the script still to be consumed
 * @returns `current` as `pIn[0]` and `next` as `pIn[1]`; either is `undefined`
 *   at runtime when the input is shorter than that
 */
function getCurrentNext(state: DiacriticsAccumulator): { current: string, next: string} {
    const { pIn } = state;
    const current = pIn[0];
    const next = pIn[1];
    return { current, next };
}
/**
 * Advances the state with `advanceP` when the current script letter is ع;
 * otherwise returns the state unchanged.
 *
 * @param state - accumulator to inspect
 * @returns the result of `advanceP(state)` if the current letter is ع, else `state`
 */
function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator {
    if (getCurrentNext(state).current !== "ع") {
        return state;
    }
    return advanceP(state);
}
/**
 * Advances the state with `advanceP` when the current script letter is ئ and
 * it is followed by a letter that is not another ئ; otherwise returns the
 * state unchanged.
 *
 * @param state - accumulator to inspect
 * @returns the result of `advanceP(state)` when the condition holds, else `state`
 */
function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccumulator {
    const { current, next } = getCurrentNext(state);
    // Requires a following letter, and that letter must not be a second ئ.
    const hamzaFollowedByOtherLetter = current === "ئ" && next && next !== "ئ";
    return hamzaFollowedByOtherLetter ? advanceP(state) : state;
}
function advanceForAinOrHamza(state: DiacriticsAccumulator): DiacriticsAccumulator {
const { current, next } = getCurrentNext(state);
if (current === "ه" && (!next || next === " ")) {
return advanceP(state);
}
if (current === "ع") {
return advanceP(state);
}
return state;