more work on phonetics
This commit is contained in:
parent
0ff0548775
commit
aad1b34e17
|
@ -6,8 +6,7 @@ import {
|
||||||
advanceP,
|
advanceP,
|
||||||
reverseP,
|
reverseP,
|
||||||
overwriteP,
|
overwriteP,
|
||||||
advanceForAin,
|
advanceForHamza,
|
||||||
advanceForAinOrHamza,
|
|
||||||
advanceForHamzaMid,
|
advanceForHamzaMid,
|
||||||
} from "./diacritics-helpers";
|
} from "./diacritics-helpers";
|
||||||
|
|
||||||
|
@ -23,6 +22,10 @@ const phonemeSplits: Array<{
|
||||||
in: "raaghey",
|
in: "raaghey",
|
||||||
out: ["r", "aa", "gh", "ey"],
|
out: ["r", "aa", "gh", "ey"],
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
in: "ist'imaal",
|
||||||
|
out: ["i", "s", "t", "'", "i", "m", "aa", "l"],
|
||||||
|
},
|
||||||
{
|
{
|
||||||
in: "hatsa",
|
in: "hatsa",
|
||||||
out: ["h", "a", "ts", "a"],
|
out: ["h", "a", "ts", "a"],
|
||||||
|
|
|
@ -28,6 +28,7 @@ type PhonemeInfo = {
|
||||||
longVowel?: true,
|
longVowel?: true,
|
||||||
canStartWithAynBefore?: true,
|
canStartWithAynBefore?: true,
|
||||||
useEndingDiacritic?: true,
|
useEndingDiacritic?: true,
|
||||||
|
ainBlendDiacritic?: string,
|
||||||
}
|
}
|
||||||
|
|
||||||
export const zwar = "َ";
|
export const zwar = "َ";
|
||||||
|
@ -188,13 +189,15 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
|
||||||
beginningMatches: ["آ", "ا"],
|
beginningMatches: ["آ", "ا"],
|
||||||
endingMatches: ["ا", "یٰ"],
|
endingMatches: ["ا", "یٰ"],
|
||||||
longVowel: true,
|
longVowel: true,
|
||||||
|
ainBlendDiacritic: zwar,
|
||||||
},
|
},
|
||||||
"ee": {
|
"ee": {
|
||||||
matches: ["ی"],
|
matches: ["ی"],
|
||||||
longVowel: true,
|
longVowel: true,
|
||||||
endingMatches: ["ي"],
|
endingMatches: ["ي"],
|
||||||
diacritic: zer,
|
diacritic: zer,
|
||||||
canStartWithAynBefore: true
|
canStartWithAynBefore: true,
|
||||||
|
ainBlendDiacritic: zer,
|
||||||
},
|
},
|
||||||
"e": {
|
"e": {
|
||||||
matches: ["ې"],
|
matches: ["ې"],
|
||||||
|
@ -210,6 +213,7 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
|
||||||
// alsoCanBePrefix: true,
|
// alsoCanBePrefix: true,
|
||||||
diacritic: pesh,
|
diacritic: pesh,
|
||||||
useEndingDiacritic: true,
|
useEndingDiacritic: true,
|
||||||
|
ainBlendDiacritic: pesh,
|
||||||
},
|
},
|
||||||
"ey": {
|
"ey": {
|
||||||
matches: ["ی"],
|
matches: ["ی"],
|
||||||
|
@ -262,13 +266,13 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
|
||||||
* @returns an array of phonemes
|
* @returns an array of phonemes
|
||||||
*/
|
*/
|
||||||
export function splitFIntoPhonemes(fIn: string): Phoneme[] {
|
export function splitFIntoPhonemes(fIn: string): Phoneme[] {
|
||||||
const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y"];
|
const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y", "'"];
|
||||||
|
|
||||||
const quadrigraphs: Phoneme[] = ["-Ul-"];
|
const quadrigraphs: Phoneme[] = ["-Ul-"];
|
||||||
const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"];
|
const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"];
|
||||||
const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"];
|
const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"];
|
||||||
const endingDigraphs: Phoneme[] = ["uy"];
|
const endingDigraphs: Phoneme[] = ["uy"];
|
||||||
const willIgnore = ["?", " ", "`", ".", "…", ",", "'"];
|
const willIgnore = ["?", " ", "`", ".", "…", ","];
|
||||||
|
|
||||||
const result: Phoneme[] = [];
|
const result: Phoneme[] = [];
|
||||||
const f = removeAccents(fIn);
|
const f = removeAccents(fIn);
|
||||||
|
@ -372,10 +376,10 @@ export function getCurrentNext(state: DiacriticsAccumulator): { current: string,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
export function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator {
|
// export function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator {
|
||||||
const { current } = getCurrentNext(state);
|
// const { current } = getCurrentNext(state);
|
||||||
return (current === "ع") ? advanceP(state) : state;
|
// return (current === "ع") ? advanceP(state) : state;
|
||||||
}
|
// }
|
||||||
|
|
||||||
export function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccumulator {
|
export function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccumulator {
|
||||||
const { current, next } = getCurrentNext(state);
|
const { current, next } = getCurrentNext(state);
|
||||||
|
@ -385,14 +389,14 @@ export function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccu
|
||||||
return state;
|
return state;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function advanceForAinOrHamza(state: DiacriticsAccumulator): DiacriticsAccumulator {
|
export function advanceForHamza(state: DiacriticsAccumulator): DiacriticsAccumulator {
|
||||||
const { current, next } = getCurrentNext(state);
|
const { current, next } = getCurrentNext(state);
|
||||||
if (current === "ه" && (!next || next === " ")) {
|
if (current === "ه" && (!next || next === " ")) {
|
||||||
return advanceP(state);
|
return advanceP(state);
|
||||||
}
|
}
|
||||||
if (current === "ع") {
|
// if (current === "ع") {
|
||||||
return advanceP(state);
|
// return advanceP(state);
|
||||||
}
|
// }
|
||||||
return state;
|
return state;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -485,34 +485,115 @@ const diacriticsSections: {
|
||||||
tests: [
|
tests: [
|
||||||
{
|
{
|
||||||
in: {
|
in: {
|
||||||
p: "اعتصاب شکن",
|
p: "بعد",
|
||||||
f: "itisaab shakan",
|
f: "ba'd",
|
||||||
},
|
},
|
||||||
out: "اِعتِصاب شَکَن",
|
out: "بَعْد",
|
||||||
},
|
|
||||||
// starting with ع
|
|
||||||
{
|
|
||||||
in: {
|
|
||||||
p: "عزت",
|
|
||||||
f: "izzat",
|
|
||||||
},
|
|
||||||
out: "عِزَّت",
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
in: {
|
in: {
|
||||||
p: "عزت",
|
p: "بعد",
|
||||||
f: "i'zzat",
|
f: "b'ad",
|
||||||
},
|
},
|
||||||
out: "عِزَّت",
|
out: "بْعَد",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "بعد",
|
||||||
|
f: "ba'ad",
|
||||||
|
},
|
||||||
|
out: "بَعَد",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "بعد",
|
||||||
|
f: "baad",
|
||||||
|
},
|
||||||
|
out: "بَعَد",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "بعد",
|
||||||
|
f: "bad",
|
||||||
|
},
|
||||||
|
// TODO: Should this really be an error?
|
||||||
|
out: null,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "معلوم",
|
||||||
|
f: "maaloom",
|
||||||
|
},
|
||||||
|
out: "مَعَلُوم",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "منبع",
|
||||||
|
f: "manbi'",
|
||||||
|
},
|
||||||
|
out: "مَنْبِع",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "منبع",
|
||||||
|
f: "manb'i",
|
||||||
|
},
|
||||||
|
out: "مَنْبْعِ"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "منبع",
|
||||||
|
f: "manbee",
|
||||||
|
},
|
||||||
|
out: "مَنْبِعِ",
|
||||||
},
|
},
|
||||||
// middle ع
|
|
||||||
{
|
{
|
||||||
in: {
|
in: {
|
||||||
p: "معنا",
|
p: "معنا",
|
||||||
f: "ma'anaa",
|
f: "ma'náa",
|
||||||
|
},
|
||||||
|
out: "مَعْنا",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "معنا",
|
||||||
|
f: "maanáa",
|
||||||
},
|
},
|
||||||
out: "مَعَنا",
|
out: "مَعَنا",
|
||||||
},
|
},
|
||||||
|
// TODO: Should be allowed to use a short vowel as well
|
||||||
|
// طمع - tama // استعمال - istimaal
|
||||||
|
// TODO: Starting like عام اعتصاب etc.
|
||||||
|
// {
|
||||||
|
// in: {
|
||||||
|
// p: "اعتصاب شکن",
|
||||||
|
// f: "itisaab shakan",
|
||||||
|
// },
|
||||||
|
// out: "اِعتِصاب شَکَن",
|
||||||
|
// },
|
||||||
|
// // starting with ع
|
||||||
|
// {
|
||||||
|
// in: {
|
||||||
|
// p: "عزت",
|
||||||
|
// f: "izzat",
|
||||||
|
// },
|
||||||
|
// out: "عِزَّت",
|
||||||
|
// },
|
||||||
|
// {
|
||||||
|
// in: {
|
||||||
|
// p: "عزت",
|
||||||
|
// f: "i'zzat",
|
||||||
|
// },
|
||||||
|
// out: "عِزَّت",
|
||||||
|
// },
|
||||||
|
// // middle ع
|
||||||
|
// {
|
||||||
|
// in: {
|
||||||
|
// p: "معنا",
|
||||||
|
// f: "ma'anaa",
|
||||||
|
// },
|
||||||
|
// out: "مَعَنا",
|
||||||
|
// },
|
||||||
// ending with ayn
|
// ending with ayn
|
||||||
// {
|
// {
|
||||||
// in: {
|
// in: {
|
||||||
|
@ -683,34 +764,34 @@ diacriticsSections.forEach((section) => {
|
||||||
|
|
||||||
// ERRORS
|
// ERRORS
|
||||||
|
|
||||||
// const brokenDiacritics = [
|
const brokenDiacritics = [
|
||||||
// {
|
{
|
||||||
// p: "تشناب",
|
p: "تشناب",
|
||||||
// f: "peshnaab",
|
f: "peshnaab",
|
||||||
// },
|
},
|
||||||
// {
|
{
|
||||||
// p: "وسېدل",
|
p: "وسېدل",
|
||||||
// f: "osedul",
|
f: "osedul",
|
||||||
// },
|
},
|
||||||
// ];
|
];
|
||||||
|
|
||||||
// test("ending with left over Pashto script will throw an error", () => {
|
test("ending with left over Pashto script will throw an error", () => {
|
||||||
// expect(() => {
|
expect(() => {
|
||||||
// addDiacritics({ p: "کور ته", f: "kor" });
|
addDiacritics({ p: "کور ته", f: "kor" });
|
||||||
// }).toThrow(`phonetics error - phonetics shorter than pashto script`);
|
}).toThrow(`phonetics error - phonetics shorter than pashto script`);
|
||||||
// });
|
});
|
||||||
|
|
||||||
// test("ending with left over phonetics will throw an error", () => {
|
test("ending with left over phonetics will throw an error", () => {
|
||||||
// expect(() => {
|
expect(() => {
|
||||||
// addDiacritics({ p: "کار", f: "kaar kawul" });
|
addDiacritics({ p: "کار", f: "kaar kawul" });
|
||||||
// }).toThrow();
|
}).toThrow();
|
||||||
// });
|
});
|
||||||
|
|
||||||
// test("adding diacritics errors when phonetecs and pashto do not line up", () => {
|
test("adding diacritics errors when phonetecs and pashto do not line up", () => {
|
||||||
// brokenDiacritics.forEach((t) => {
|
brokenDiacritics.forEach((t) => {
|
||||||
// expect(() => {
|
expect(() => {
|
||||||
// addDiacritics(t);
|
addDiacritics(t);
|
||||||
// }).toThrow();
|
}).toThrow();
|
||||||
// });
|
});
|
||||||
// });
|
});
|
||||||
|
|
||||||
|
|
|
@ -27,8 +27,7 @@ import {
|
||||||
advanceP,
|
advanceP,
|
||||||
reverseP,
|
reverseP,
|
||||||
overwriteP,
|
overwriteP,
|
||||||
advanceForAin,
|
advanceForHamza,
|
||||||
advanceForAinOrHamza,
|
|
||||||
advanceForHamzaMid,
|
advanceForHamzaMid,
|
||||||
DiacriticsAccumulator,
|
DiacriticsAccumulator,
|
||||||
} from "./diacritics-helpers";
|
} from "./diacritics-helpers";
|
||||||
|
@ -61,14 +60,13 @@ enum PhonemeStatus {
|
||||||
DirectMatchAfterSukun,
|
DirectMatchAfterSukun,
|
||||||
EndingWithHeyHimFromSukun,
|
EndingWithHeyHimFromSukun,
|
||||||
ShortVowel,
|
ShortVowel,
|
||||||
ShortVowelBeforeAin,
|
|
||||||
ShortVowelAfterAin,
|
|
||||||
PersianSilentWWithAa,
|
PersianSilentWWithAa,
|
||||||
ArabicWasla,
|
ArabicWasla,
|
||||||
Izafe,
|
Izafe,
|
||||||
EndOfDuParticle,
|
EndOfDuParticle,
|
||||||
HaEndingWithHeem,
|
HaEndingWithHeem,
|
||||||
AlefDaggarEnding,
|
AlefDaggarEnding,
|
||||||
|
LongAinVowelMissingComma,
|
||||||
}
|
}
|
||||||
|
|
||||||
function processPhoneme(
|
function processPhoneme(
|
||||||
|
@ -112,7 +110,6 @@ function processPhoneme(
|
||||||
pipe(
|
pipe(
|
||||||
advanceP,
|
advanceP,
|
||||||
addP(diacritic),
|
addP(diacritic),
|
||||||
advanceForAin,
|
|
||||||
)(state)
|
)(state)
|
||||||
: (phs === PhonemeStatus.DoubleConsonantTashdeed) ?
|
: (phs === PhonemeStatus.DoubleConsonantTashdeed) ?
|
||||||
pipe(
|
pipe(
|
||||||
|
@ -171,25 +168,19 @@ function processPhoneme(
|
||||||
advanceP,
|
advanceP,
|
||||||
advanceP,
|
advanceP,
|
||||||
)(state)
|
)(state)
|
||||||
: (phs === PhonemeStatus.ShortVowelBeforeAin) ?
|
: (phs === PhonemeStatus.LongAinVowelMissingComma) ?
|
||||||
pipe(
|
pipe(
|
||||||
// this is pretty messed up because for some reason the reverseP goes back one more step when it's an ain before it
|
|
||||||
reverseP,
|
|
||||||
advanceP,
|
|
||||||
addP(diacritic),
|
addP(diacritic),
|
||||||
// overwriteP(diacritic || ""),
|
|
||||||
)(state)
|
|
||||||
: (phs === PhonemeStatus.ShortVowelAfterAin) ?
|
|
||||||
pipe(
|
|
||||||
advanceP,
|
advanceP,
|
||||||
addP(diacritic),
|
addP(diacritic)
|
||||||
)(state)
|
)(state)
|
||||||
:
|
:
|
||||||
// phs === PhonemeState.ShortVowel
|
// phs === PhonemeState.ShortVowel
|
||||||
pipe(
|
pipe(
|
||||||
advanceForHamzaMid,
|
advanceForHamzaMid,
|
||||||
addP(phonemeInfo.diacritic),
|
addP(phonemeInfo.diacritic),
|
||||||
advanceForAinOrHamza,
|
// TODO THIS?
|
||||||
|
advanceForHamza,
|
||||||
)(state);
|
)(state);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -214,7 +205,11 @@ function stateInfo({ state, i, phonemes, phoneme }: {
|
||||||
const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
|
const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
|
||||||
const needsTashdeed = !isBeginningOfWord && doubleConsonant && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter);
|
const needsTashdeed = !isBeginningOfWord && doubleConsonant && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter);
|
||||||
const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter));
|
const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter));
|
||||||
const diacritic = isEndOfWord ? ((!phonemeInfo.longVowel || phonemeInfo.useEndingDiacritic) ? phonemeInfo.diacritic : undefined) : phonemeInfo.diacritic;
|
const useAinBlendDiacritics = (!isBeginningOfWord && (phonemeInfo.ainBlendDiacritic && currentPLetter === "ع"));
|
||||||
|
const diacritic = useAinBlendDiacritics
|
||||||
|
? phonemeInfo.ainBlendDiacritic
|
||||||
|
: isEndOfWord
|
||||||
|
? ((!phonemeInfo.longVowel || phonemeInfo.useEndingDiacritic) ? phonemeInfo.diacritic : undefined) : phonemeInfo.diacritic;
|
||||||
|
|
||||||
function getPhonemeState(): PhonemeStatus {
|
function getPhonemeState(): PhonemeStatus {
|
||||||
if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) {
|
if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) {
|
||||||
|
@ -243,6 +238,9 @@ function stateInfo({ state, i, phonemes, phoneme }: {
|
||||||
if (phoneme === "-i-" && isBeginningOfWord) {
|
if (phoneme === "-i-" && isBeginningOfWord) {
|
||||||
return PhonemeStatus.Izafe;
|
return PhonemeStatus.Izafe;
|
||||||
}
|
}
|
||||||
|
if (useAinBlendDiacritics) {
|
||||||
|
return PhonemeStatus.LongAinVowelMissingComma;
|
||||||
|
}
|
||||||
if (needsTashdeed) {
|
if (needsTashdeed) {
|
||||||
return PhonemeStatus.DoubleConsonantTashdeed;
|
return PhonemeStatus.DoubleConsonantTashdeed;
|
||||||
}
|
}
|
||||||
|
@ -259,14 +257,7 @@ function stateInfo({ state, i, phonemes, phoneme }: {
|
||||||
return needsSukun ? PhonemeStatus.DirectMatchAfterSukun : PhonemeStatus.DirectMatch;
|
return needsSukun ? PhonemeStatus.DirectMatchAfterSukun : PhonemeStatus.DirectMatch;
|
||||||
}
|
}
|
||||||
if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {
|
if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {
|
||||||
// weird ayn behaviour because it automatically advances and ignores it at the beginning of the process
|
return PhonemeStatus.ShortVowel;
|
||||||
// console.log("looking prev", prevPLetter);
|
|
||||||
// console.log("looking next", currentPLetter);
|
|
||||||
return prevPLetter === "ع"
|
|
||||||
? PhonemeStatus.ShortVowelBeforeAin
|
|
||||||
: currentPLetter === "ع"
|
|
||||||
? PhonemeStatus.ShortVowelAfterAin
|
|
||||||
: PhonemeStatus.ShortVowel;
|
|
||||||
}
|
}
|
||||||
// console.log("bad phoneme is ", phoneme);
|
// console.log("bad phoneme is ", phoneme);
|
||||||
throw new Error("phonetics error - no status found for phoneme: " + phoneme);
|
throw new Error("phonetics error - no status found for phoneme: " + phoneme);
|
||||||
|
|
Loading…
Reference in New Issue