more work on phonetics

This commit is contained in:
Bill D 2021-05-27 11:06:30 +04:30
parent 0ff0548775
commit aad1b34e17
4 changed files with 160 additions and 81 deletions

View File

@ -6,8 +6,7 @@ import {
advanceP, advanceP,
reverseP, reverseP,
overwriteP, overwriteP,
advanceForAin, advanceForHamza,
advanceForAinOrHamza,
advanceForHamzaMid, advanceForHamzaMid,
} from "./diacritics-helpers"; } from "./diacritics-helpers";
@ -23,6 +22,10 @@ const phonemeSplits: Array<{
in: "raaghey", in: "raaghey",
out: ["r", "aa", "gh", "ey"], out: ["r", "aa", "gh", "ey"],
}, },
{
in: "ist'imaal",
out: ["i", "s", "t", "'", "i", "m", "aa", "l"],
},
{ {
in: "hatsa", in: "hatsa",
out: ["h", "a", "ts", "a"], out: ["h", "a", "ts", "a"],

View File

@ -28,6 +28,7 @@ type PhonemeInfo = {
longVowel?: true, longVowel?: true,
canStartWithAynBefore?: true, canStartWithAynBefore?: true,
useEndingDiacritic?: true, useEndingDiacritic?: true,
ainBlendDiacritic?: string,
} }
export const zwar = "َ"; export const zwar = "َ";
@ -188,13 +189,15 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
beginningMatches: ["آ", "ا"], beginningMatches: ["آ", "ا"],
endingMatches: ["ا", "یٰ"], endingMatches: ["ا", "یٰ"],
longVowel: true, longVowel: true,
ainBlendDiacritic: zwar,
}, },
"ee": { "ee": {
matches: ["ی"], matches: ["ی"],
longVowel: true, longVowel: true,
endingMatches: ["ي"], endingMatches: ["ي"],
diacritic: zer, diacritic: zer,
canStartWithAynBefore: true canStartWithAynBefore: true,
ainBlendDiacritic: zer,
}, },
"e": { "e": {
matches: ["ې"], matches: ["ې"],
@ -210,6 +213,7 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
// alsoCanBePrefix: true, // alsoCanBePrefix: true,
diacritic: pesh, diacritic: pesh,
useEndingDiacritic: true, useEndingDiacritic: true,
ainBlendDiacritic: pesh,
}, },
"ey": { "ey": {
matches: ["ی"], matches: ["ی"],
@ -262,13 +266,13 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
* @returns an array of phonemes * @returns an array of phonemes
*/ */
export function splitFIntoPhonemes(fIn: string): Phoneme[] { export function splitFIntoPhonemes(fIn: string): Phoneme[] {
const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y"]; const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y", "'"];
const quadrigraphs: Phoneme[] = ["-Ul-"]; const quadrigraphs: Phoneme[] = ["-Ul-"];
const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"]; const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"];
const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"]; const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"];
const endingDigraphs: Phoneme[] = ["uy"]; const endingDigraphs: Phoneme[] = ["uy"];
const willIgnore = ["?", " ", "`", ".", "…", ",", "'"]; const willIgnore = ["?", " ", "`", ".", "…", ","];
const result: Phoneme[] = []; const result: Phoneme[] = [];
const f = removeAccents(fIn); const f = removeAccents(fIn);
@ -372,10 +376,10 @@ export function getCurrentNext(state: DiacriticsAccumulator): { current: string,
}; };
} }
export function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator { // export function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator {
const { current } = getCurrentNext(state); // const { current } = getCurrentNext(state);
return (current === "ع") ? advanceP(state) : state; // return (current === "ع") ? advanceP(state) : state;
} // }
export function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccumulator { export function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccumulator {
const { current, next } = getCurrentNext(state); const { current, next } = getCurrentNext(state);
@ -385,14 +389,14 @@ export function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccu
return state; return state;
} }
export function advanceForAinOrHamza(state: DiacriticsAccumulator): DiacriticsAccumulator { export function advanceForHamza(state: DiacriticsAccumulator): DiacriticsAccumulator {
const { current, next } = getCurrentNext(state); const { current, next } = getCurrentNext(state);
if (current === "ه" && (!next || next === " ")) { if (current === "ه" && (!next || next === " ")) {
return advanceP(state); return advanceP(state);
} }
if (current === "ع") { // if (current === "ع") {
return advanceP(state); // return advanceP(state);
} // }
return state; return state;
} }

View File

@ -485,34 +485,115 @@ const diacriticsSections: {
tests: [ tests: [
{ {
in: { in: {
p: "اعتصاب شکن", p: "بعد",
f: "itisaab shakan", f: "ba'd",
}, },
out: "اِعتِصاب شَکَن", out: "بَعْد",
},
// starting with ع
{
in: {
p: "عزت",
f: "izzat",
},
out: "عِزَّت",
}, },
{ {
in: { in: {
p: "عزت", p: "بعد",
f: "i'zzat", f: "b'ad",
}, },
out: "عِزَّت", out: "بْعَد",
},
{
in: {
p: "بعد",
f: "ba'ad",
},
out: "بَعَد",
},
{
in: {
p: "بعد",
f: "baad",
},
out: "بَعَد",
},
{
in: {
p: "بعد",
f: "bad",
},
// TODO: Should this really be an error?
out: null,
},
{
in: {
p: "معلوم",
f: "maaloom",
},
out: "مَعَلُوم",
},
{
in: {
p: "منبع",
f: "manbi'",
},
out: "مَنْبِع",
},
{
in: {
p: "منبع",
f: "manb'i",
},
out: "مَنْبْعِ"
},
{
in: {
p: "منبع",
f: "manbee",
},
out: "مَنْبِعِ",
}, },
// middle ع
{ {
in: { in: {
p: "معنا", p: "معنا",
f: "ma'anaa", f: "ma'náa",
},
out: "مَعْنا",
},
{
in: {
p: "معنا",
f: "maanáa",
}, },
out: "مَعَنا", out: "مَعَنا",
}, },
// TODO: Should be allowed to use a short vowel as well
// طمع - tama // استعمال - istimaal
// TODO: Starting like عام اعتصاب etc.
// {
// in: {
// p: "اعتصاب شکن",
// f: "itisaab shakan",
// },
// out: "اِعتِصاب شَکَن",
// },
// // starting with ع
// {
// in: {
// p: "عزت",
// f: "izzat",
// },
// out: "عِزَّت",
// },
// {
// in: {
// p: "عزت",
// f: "i'zzat",
// },
// out: "عِزَّت",
// },
// // middle ع
// {
// in: {
// p: "معنا",
// f: "ma'anaa",
// },
// out: "مَعَنا",
// },
// ending with ayn // ending with ayn
// { // {
// in: { // in: {
@ -683,34 +764,34 @@ diacriticsSections.forEach((section) => {
// ERRORS // ERRORS
// const brokenDiacritics = [ const brokenDiacritics = [
// { {
// p: "تشناب", p: "تشناب",
// f: "peshnaab", f: "peshnaab",
// }, },
// { {
// p: "وسېدل", p: "وسېدل",
// f: "osedul", f: "osedul",
// }, },
// ]; ];
// test("ending with left over Pashto script will throw an error", () => { test("ending with left over Pashto script will throw an error", () => {
// expect(() => { expect(() => {
// addDiacritics({ p: "کور ته", f: "kor" }); addDiacritics({ p: "کور ته", f: "kor" });
// }).toThrow(`phonetics error - phonetics shorter than pashto script`); }).toThrow(`phonetics error - phonetics shorter than pashto script`);
// }); });
// test("ending with left over phonetics will throw an error", () => { test("ending with left over phonetics will throw an error", () => {
// expect(() => { expect(() => {
// addDiacritics({ p: "کار", f: "kaar kawul" }); addDiacritics({ p: "کار", f: "kaar kawul" });
// }).toThrow(); }).toThrow();
// }); });
// test("adding diacritics errors when phonetecs and pashto do not line up", () => { test("adding diacritics errors when phonetecs and pashto do not line up", () => {
// brokenDiacritics.forEach((t) => { brokenDiacritics.forEach((t) => {
// expect(() => { expect(() => {
// addDiacritics(t); addDiacritics(t);
// }).toThrow(); }).toThrow();
// }); });
// }); });

View File

@ -27,8 +27,7 @@ import {
advanceP, advanceP,
reverseP, reverseP,
overwriteP, overwriteP,
advanceForAin, advanceForHamza,
advanceForAinOrHamza,
advanceForHamzaMid, advanceForHamzaMid,
DiacriticsAccumulator, DiacriticsAccumulator,
} from "./diacritics-helpers"; } from "./diacritics-helpers";
@ -61,14 +60,13 @@ enum PhonemeStatus {
DirectMatchAfterSukun, DirectMatchAfterSukun,
EndingWithHeyHimFromSukun, EndingWithHeyHimFromSukun,
ShortVowel, ShortVowel,
ShortVowelBeforeAin,
ShortVowelAfterAin,
PersianSilentWWithAa, PersianSilentWWithAa,
ArabicWasla, ArabicWasla,
Izafe, Izafe,
EndOfDuParticle, EndOfDuParticle,
HaEndingWithHeem, HaEndingWithHeem,
AlefDaggarEnding, AlefDaggarEnding,
LongAinVowelMissingComma,
} }
function processPhoneme( function processPhoneme(
@ -112,7 +110,6 @@ function processPhoneme(
pipe( pipe(
advanceP, advanceP,
addP(diacritic), addP(diacritic),
advanceForAin,
)(state) )(state)
: (phs === PhonemeStatus.DoubleConsonantTashdeed) ? : (phs === PhonemeStatus.DoubleConsonantTashdeed) ?
pipe( pipe(
@ -171,25 +168,19 @@ function processPhoneme(
advanceP, advanceP,
advanceP, advanceP,
)(state) )(state)
: (phs === PhonemeStatus.ShortVowelBeforeAin) ? : (phs === PhonemeStatus.LongAinVowelMissingComma) ?
pipe( pipe(
// this is pretty messed up because for some reason the reverseP goes back one more step when it's an ain before it
reverseP,
advanceP,
addP(diacritic), addP(diacritic),
// overwriteP(diacritic || ""),
)(state)
: (phs === PhonemeStatus.ShortVowelAfterAin) ?
pipe(
advanceP, advanceP,
addP(diacritic), addP(diacritic)
)(state) )(state)
: :
// phs === PhonemeState.ShortVowel // phs === PhonemeState.ShortVowel
pipe( pipe(
advanceForHamzaMid, advanceForHamzaMid,
addP(phonemeInfo.diacritic), addP(phonemeInfo.diacritic),
advanceForAinOrHamza, // TODO THIS?
advanceForHamza,
)(state); )(state);
} }
@ -214,7 +205,11 @@ function stateInfo({ state, i, phonemes, phoneme }: {
const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant); const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
const needsTashdeed = !isBeginningOfWord && doubleConsonant && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter); const needsTashdeed = !isBeginningOfWord && doubleConsonant && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter);
const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter)); const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter));
const diacritic = isEndOfWord ? ((!phonemeInfo.longVowel || phonemeInfo.useEndingDiacritic) ? phonemeInfo.diacritic : undefined) : phonemeInfo.diacritic; const useAinBlendDiacritics = (!isBeginningOfWord && (phonemeInfo.ainBlendDiacritic && currentPLetter === "ع"));
const diacritic = useAinBlendDiacritics
? phonemeInfo.ainBlendDiacritic
: isEndOfWord
? ((!phonemeInfo.longVowel || phonemeInfo.useEndingDiacritic) ? phonemeInfo.diacritic : undefined) : phonemeInfo.diacritic;
function getPhonemeState(): PhonemeStatus { function getPhonemeState(): PhonemeStatus {
if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) { if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) {
@ -243,6 +238,9 @@ function stateInfo({ state, i, phonemes, phoneme }: {
if (phoneme === "-i-" && isBeginningOfWord) { if (phoneme === "-i-" && isBeginningOfWord) {
return PhonemeStatus.Izafe; return PhonemeStatus.Izafe;
} }
if (useAinBlendDiacritics) {
return PhonemeStatus.LongAinVowelMissingComma;
}
if (needsTashdeed) { if (needsTashdeed) {
return PhonemeStatus.DoubleConsonantTashdeed; return PhonemeStatus.DoubleConsonantTashdeed;
} }
@ -259,14 +257,7 @@ function stateInfo({ state, i, phonemes, phoneme }: {
return needsSukun ? PhonemeStatus.DirectMatchAfterSukun : PhonemeStatus.DirectMatch; return needsSukun ? PhonemeStatus.DirectMatchAfterSukun : PhonemeStatus.DirectMatch;
} }
if (phonemeInfo.diacritic && !phonemeInfo.longVowel) { if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {
// weird ayn behaviour because it automatically advances and ignores it at the beginning of the process return PhonemeStatus.ShortVowel;
// console.log("looking prev", prevPLetter);
// console.log("looking next", currentPLetter);
return prevPLetter === "ع"
? PhonemeStatus.ShortVowelBeforeAin
: currentPLetter === "ع"
? PhonemeStatus.ShortVowelAfterAin
: PhonemeStatus.ShortVowel;
} }
// console.log("bad phoneme is ", phoneme); // console.log("bad phoneme is ", phoneme);
throw new Error("phonetics error - no status found for phoneme: " + phoneme); throw new Error("phonetics error - no status found for phoneme: " + phoneme);