more work on phonetics

This commit is contained in:
Bill D 2021-05-27 11:06:30 +04:30
parent 0ff0548775
commit aad1b34e17
4 changed files with 160 additions and 81 deletions

View File

@ -6,8 +6,7 @@ import {
advanceP,
reverseP,
overwriteP,
advanceForAin,
advanceForAinOrHamza,
advanceForHamza,
advanceForHamzaMid,
} from "./diacritics-helpers";
@ -23,6 +22,10 @@ const phonemeSplits: Array<{
in: "raaghey",
out: ["r", "aa", "gh", "ey"],
},
{
in: "ist'imaal",
out: ["i", "s", "t", "'", "i", "m", "aa", "l"],
},
{
in: "hatsa",
out: ["h", "a", "ts", "a"],

View File

@ -28,6 +28,7 @@ type PhonemeInfo = {
longVowel?: true,
canStartWithAynBefore?: true,
useEndingDiacritic?: true,
ainBlendDiacritic?: string,
}
export const zwar = "َ";
@ -188,13 +189,15 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
beginningMatches: ["آ", "ا"],
endingMatches: ["ا", "یٰ"],
longVowel: true,
ainBlendDiacritic: zwar,
},
"ee": {
matches: ["ی"],
longVowel: true,
endingMatches: ["ي"],
diacritic: zer,
canStartWithAynBefore: true
canStartWithAynBefore: true,
ainBlendDiacritic: zer,
},
"e": {
matches: ["ې"],
@ -210,6 +213,7 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
// alsoCanBePrefix: true,
diacritic: pesh,
useEndingDiacritic: true,
ainBlendDiacritic: pesh,
},
"ey": {
matches: ["ی"],
@ -262,13 +266,13 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
* @returns an array of phonemes
*/
export function splitFIntoPhonemes(fIn: string): Phoneme[] {
const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y"];
const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y", "'"];
const quadrigraphs: Phoneme[] = ["-Ul-"];
const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"];
const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"];
const endingDigraphs: Phoneme[] = ["uy"];
const willIgnore = ["?", " ", "`", ".", "…", ",", "'"];
const willIgnore = ["?", " ", "`", ".", "…", ","];
const result: Phoneme[] = [];
const f = removeAccents(fIn);
@ -372,10 +376,10 @@ export function getCurrentNext(state: DiacriticsAccumulator): { current: string,
};
}
/**
 * Moves past an ain (ع) when one is sitting at the current position.
 *
 * @param state - the diacritics accumulator being processed
 * @returns the result of `advanceP(state)` when the `current` letter reported
 * by `getCurrentNext` is "ع"; otherwise the same `state`, untouched
 */
export function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator {
    const letterAtCursor = getCurrentNext(state).current;
    // Anything other than an ain leaves the accumulator as it is.
    if (letterAtCursor !== "ع") {
        return state;
    }
    return advanceP(state);
}
// export function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator {
// const { current } = getCurrentNext(state);
// return (current === "ع") ? advanceP(state) : state;
// }
export function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccumulator {
const { current, next } = getCurrentNext(state);
@ -385,14 +389,14 @@ export function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccu
return state;
}
export function advanceForAinOrHamza(state: DiacriticsAccumulator): DiacriticsAccumulator {
export function advanceForHamza(state: DiacriticsAccumulator): DiacriticsAccumulator {
const { current, next } = getCurrentNext(state);
if (current === "ه" && (!next || next === " ")) {
return advanceP(state);
}
if (current === "ع") {
return advanceP(state);
}
// if (current === "ع") {
// return advanceP(state);
// }
return state;
}

View File

@ -485,34 +485,115 @@ const diacriticsSections: {
tests: [
{
in: {
p: "اعتصاب شکن",
f: "itisaab shakan",
p: "بعد",
f: "ba'd",
},
out: "اِعتِصاب شَکَن",
},
// starting with ع
{
in: {
p: "عزت",
f: "izzat",
},
out: "عِزَّت",
out: "بَعْد",
},
{
in: {
p: "عزت",
f: "i'zzat",
p: "بعد",
f: "b'ad",
},
out: "عِزَّت",
out: "بْعَد",
},
{
in: {
p: "بعد",
f: "ba'ad",
},
out: "بَعَد",
},
{
in: {
p: "بعد",
f: "baad",
},
out: "بَعَد",
},
{
in: {
p: "بعد",
f: "bad",
},
// TODO: Should this really be an error?
out: null,
},
{
in: {
p: "معلوم",
f: "maaloom",
},
out: "مَعَلُوم",
},
{
in: {
p: "منبع",
f: "manbi'",
},
out: "مَنْبِع",
},
{
in: {
p: "منبع",
f: "manb'i",
},
out: "مَنْبْعِ"
},
{
in: {
p: "منبع",
f: "manbee",
},
out: "مَنْبِعِ",
},
// middle ع
{
in: {
p: "معنا",
f: "ma'anaa",
f: "ma'náa",
},
out: "مَعْنا",
},
{
in: {
p: "معنا",
f: "maanáa",
},
out: "مَعَنا",
},
// TODO: Should be allowed to use a short vowel as well
// طمع - tama // استعمال - istimaal
// TODO: Starting like عام اعتصاب etc.
// {
// in: {
// p: "اعتصاب شکن",
// f: "itisaab shakan",
// },
// out: "اِعتِصاب شَکَن",
// },
// // starting with ع
// {
// in: {
// p: "عزت",
// f: "izzat",
// },
// out: "عِزَّت",
// },
// {
// in: {
// p: "عزت",
// f: "i'zzat",
// },
// out: "عِزَّت",
// },
// // middle ع
// {
// in: {
// p: "معنا",
// f: "ma'anaa",
// },
// out: "مَعَنا",
// },
// ending with ayn
// {
// in: {
@ -683,34 +764,34 @@ diacriticsSections.forEach((section) => {
// ERRORS
// const brokenDiacritics = [
// {
// p: "تشناب",
// f: "peshnaab",
// },
// {
// p: "وسېدل",
// f: "osedul",
// },
// ];
// Pashto (p) / phonetics (f) pairs that are fed to addDiacritics further down
// in this file, where each one is expected to make it throw.
const brokenDiacritics = [
{
p: "تشناب",
f: "peshnaab",
},
{
p: "وسېدل",
f: "osedul",
},
];
// test("ending with left over Pashto script will throw an error", () => {
// expect(() => {
// addDiacritics({ p: "کور ته", f: "kor" });
// }).toThrow(`phonetics error - phonetics shorter than pashto script`);
// });
// The Pashto text has a second word ("ته") that the phonetics never reach,
// so addDiacritics must fail with this exact message.
test("ending with left over Pashto script will throw an error", () => {
    const addWithShortPhonetics = () => {
        addDiacritics({ p: "کور ته", f: "kor" });
    };
    expect(addWithShortPhonetics).toThrow(`phonetics error - phonetics shorter than pashto script`);
});
// test("ending with left over phonetics will throw an error", () => {
// expect(() => {
// addDiacritics({ p: "کار", f: "kaar kawul" });
// }).toThrow();
// });
// Here the phonetics carry an extra word ("kawul") that has no Pashto script
// to line up with; any thrown error satisfies the expectation.
test("ending with left over phonetics will throw an error", () => {
    const addWithExtraPhonetics = () => {
        addDiacritics({ p: "کار", f: "kaar kawul" });
    };
    expect(addWithExtraPhonetics).toThrow();
});
// test("adding diacritics errors when phonetecs and pashto do not line up", () => {
// brokenDiacritics.forEach((t) => {
// expect(() => {
// addDiacritics(t);
// }).toThrow();
// });
// });
// Every entry in brokenDiacritics must make addDiacritics throw.
test("adding diacritics errors when phonetecs and pashto do not line up", () => {
    for (const mismatchedPair of brokenDiacritics) {
        const attempt = () => {
            addDiacritics(mismatchedPair);
        };
        expect(attempt).toThrow();
    }
});

View File

@ -27,8 +27,7 @@ import {
advanceP,
reverseP,
overwriteP,
advanceForAin,
advanceForAinOrHamza,
advanceForHamza,
advanceForHamzaMid,
DiacriticsAccumulator,
} from "./diacritics-helpers";
@ -61,14 +60,13 @@ enum PhonemeStatus {
DirectMatchAfterSukun,
EndingWithHeyHimFromSukun,
ShortVowel,
ShortVowelBeforeAin,
ShortVowelAfterAin,
PersianSilentWWithAa,
ArabicWasla,
Izafe,
EndOfDuParticle,
HaEndingWithHeem,
AlefDaggarEnding,
LongAinVowelMissingComma,
}
function processPhoneme(
@ -112,7 +110,6 @@ function processPhoneme(
pipe(
advanceP,
addP(diacritic),
advanceForAin,
)(state)
: (phs === PhonemeStatus.DoubleConsonantTashdeed) ?
pipe(
@ -171,25 +168,19 @@ function processPhoneme(
advanceP,
advanceP,
)(state)
: (phs === PhonemeStatus.ShortVowelBeforeAin) ?
: (phs === PhonemeStatus.LongAinVowelMissingComma) ?
pipe(
// this is pretty messed up because for some reason the reverseP goes back one more step when it's an ain before it
reverseP,
advanceP,
addP(diacritic),
// overwriteP(diacritic || ""),
)(state)
: (phs === PhonemeStatus.ShortVowelAfterAin) ?
pipe(
advanceP,
addP(diacritic),
addP(diacritic)
)(state)
:
// phs === PhonemeState.ShortVowel
pipe(
advanceForHamzaMid,
addP(phonemeInfo.diacritic),
advanceForAinOrHamza,
// TODO THIS?
advanceForHamza,
)(state);
}
@ -214,7 +205,11 @@ function stateInfo({ state, i, phonemes, phoneme }: {
const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
const needsTashdeed = !isBeginningOfWord && doubleConsonant && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter);
const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter));
const diacritic = isEndOfWord ? ((!phonemeInfo.longVowel || phonemeInfo.useEndingDiacritic) ? phonemeInfo.diacritic : undefined) : phonemeInfo.diacritic;
const useAinBlendDiacritics = (!isBeginningOfWord && (phonemeInfo.ainBlendDiacritic && currentPLetter === "ع"));
const diacritic = useAinBlendDiacritics
? phonemeInfo.ainBlendDiacritic
: isEndOfWord
? ((!phonemeInfo.longVowel || phonemeInfo.useEndingDiacritic) ? phonemeInfo.diacritic : undefined) : phonemeInfo.diacritic;
function getPhonemeState(): PhonemeStatus {
if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) {
@ -243,6 +238,9 @@ function stateInfo({ state, i, phonemes, phoneme }: {
if (phoneme === "-i-" && isBeginningOfWord) {
return PhonemeStatus.Izafe;
}
if (useAinBlendDiacritics) {
return PhonemeStatus.LongAinVowelMissingComma;
}
if (needsTashdeed) {
return PhonemeStatus.DoubleConsonantTashdeed;
}
@ -259,14 +257,7 @@ function stateInfo({ state, i, phonemes, phoneme }: {
return needsSukun ? PhonemeStatus.DirectMatchAfterSukun : PhonemeStatus.DirectMatch;
}
if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {
// weird ayn behaviour because it automatically advances and ignores it at the beginning of the process
// console.log("looking prev", prevPLetter);
// console.log("looking next", currentPLetter);
return prevPLetter === "ع"
? PhonemeStatus.ShortVowelBeforeAin
: currentPLetter === "ع"
? PhonemeStatus.ShortVowelAfterAin
: PhonemeStatus.ShortVowel;
return PhonemeStatus.ShortVowel;
}
// console.log("bad phoneme is ", phoneme);
throw new Error("phonetics error - no status found for phoneme: " + phoneme);