This commit is contained in:
Bill D 2021-05-07 10:54:09 +03:00
parent a2b5626514
commit 98c5eb7452
2 changed files with 127 additions and 66 deletions

View File

@ -69,20 +69,6 @@ const phonemeSplits: Array<{
}, },
]; ];
// Phonetic inputs that contain a character outside the phonetic alphabet.
// NOTE(review): `problem` is presumably the offending character the error
// is expected to point at — confirm against the test that consumes this.
const badPhonetics: { in: string; problem: string }[] = [
    { in: "acar", problem: "c" },
    { in: "a7am", problem: "7" },
];
const diacriticsTest: Array<{ const diacriticsTest: Array<{
in: T.PsString, in: T.PsString,
out: string, out: string,
@ -101,6 +87,13 @@ const diacriticsTest: Array<{
}, },
out: "کُور", out: "کُور",
}, },
{
in: {
p: "کور کور",
f: "kor koor",
},
out: "کور کُور",
},
{ {
in: { in: {
p: "تب", p: "تب",
@ -242,6 +235,22 @@ const diacriticsTest: Array<{
}, },
out: "پېش", out: "پېش",
}, },
{
in: {
p: "لیک",
f: "leek",
},
out: "لِیک",
},
// starting alefs
{
in: {
p: "اسلام",
f: "islaam",
},
out: "اِسْلام",
},
// double consonant
{ {
in: { in: {
p: "بتن", p: "بتن",
@ -249,12 +258,13 @@ const diacriticsTest: Array<{
}, },
out: "ب" + zwar + "ت" + tashdeed + zwar + "ن", out: "ب" + zwar + "ت" + tashdeed + zwar + "ن",
}, },
]; // avoid false double consonant
const brokenDiacritics = [
{ {
p: "تشناب", in: {
f: "peshnaab", p: "ازل لیک",
f: "azalléek",
},
out: "اَزَل لِیک",
}, },
]; ];
@ -265,6 +275,35 @@ phonemeSplits.forEach((s) => {
}); });
}); });
// Runs every fixture in diacriticsTest through addDiacritics: the Pashto text
// must come back as the fixture's `out`, and the phonetics must be returned unchanged.
test("adding diacritics should work", () => {
    for (const { in: input, out: expectedP } of diacriticsTest) {
        const result = addDiacritics(input);
        expect(result).toEqual({ p: expectedP, f: input.f });
    }
});
// ERRORS
// Script/phonetics pairs that do not match each other: here the script starts
// with "ت" while the phonetics start with "p".
// NOTE(review): presumably fed to the "do not line up" error test — confirm.
const brokenDiacritics = [
    { p: "تشناب", f: "peshnaab" },
];
// Phonetic inputs that contain a character outside the phonetic alphabet;
// iterated by the "bad phonetic characters should throw an error" test.
// NOTE(review): `problem` is presumably the offending character the error
// is expected to point at — confirm against that test's assertion.
const badPhonetics: { in: string; problem: string }[] = [
    { in: "acar", problem: "c" },
    { in: "a7am", problem: "7" },
];
test("bad phonetic characters should throw an error", () => { test("bad phonetic characters should throw an error", () => {
badPhonetics.forEach((s) => { badPhonetics.forEach((s) => {
expect(() => { expect(() => {
@ -279,10 +318,10 @@ test("ending with left over Pashto script will throw an error", () => {
}).toThrow(`phonetics error - phonetics shorter than pashto script`); }).toThrow(`phonetics error - phonetics shorter than pashto script`);
}); });
test("adding diacritics should work", () => { test("ending with left over phonetics will throw an error", () => {
diacriticsTest.forEach((t) => { expect(() => {
expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f }); addDiacritics({ p: "کار", f: "kaar kawul" });
}); }).toThrow();
}); });
test("adding diacritics errors when phonetecs and pashto do not line up", () => { test("adding diacritics errors when phonetecs and pashto do not line up", () => {

View File

@ -28,6 +28,8 @@ type LongVowel = "aa" | "ee" | "e" | "oo" | "o" | "ey" | "uy" | "eyy";
type ShortVowel = "a" | "i" | "u" | "U"; type ShortVowel = "a" | "i" | "u" | "U";
type Phoneme = Consonant | Ain | LongVowel | ShortVowel | JoiningVowel; type Phoneme = Consonant | Ain | LongVowel | ShortVowel | JoiningVowel;
/**
 * Running state threaded through the phoneme reducer while diacritics are added.
 * - pIn: the part of the Pashto script that has not been consumed yet
 * - pOut: the output text built up so far
 */
type DiacriticsAccumulator = { pIn: string, pOut: string };
type PhonemeInfo = { type PhonemeInfo = {
matches?: string[], matches?: string[],
beginningMatches?: string[], beginningMatches?: string[],
@ -225,6 +227,7 @@ const phonemeTable: Record<Phoneme, PhonemeInfo> = {
"a": { "a": {
diacritic: zwar, diacritic: zwar,
endingMatches: ["ه"], endingMatches: ["ه"],
beginningMatches: ["ا"],
// canComeAfterHeyEnding: true, // canComeAfterHeyEnding: true,
// canBeFirstPartOfFathahanEnding: true, // canBeFirstPartOfFathahanEnding: true,
}, },
@ -304,62 +307,74 @@ export function splitFIntoPhonemes(fIn: string): Phoneme[] {
} }
/** /**
* Adds phonetis to a given PsString. * Adds diacritics to a given PsString.
* Errors if the phonetics and script don't line up. * Errors if the phonetics and script don't line up.
* *
* @param ps a PsSTring without phonetics * @param ps a PsSTring without phonetics
*/ */
export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsString { export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsString {
// TODO:
const phonemes: Phoneme[] = splitFIntoPhonemes(!ignoreCommas ? firstPhonetics(f) : f); const phonemes: Phoneme[] = splitFIntoPhonemes(!ignoreCommas ? firstPhonetics(f) : f);
const { pIn, pOut } = phonemes.reduce(processPhoneme, { pOut: "", pIn: p });
if (pIn !== "") {
throw new Error("phonetics error - phonetics shorter than pashto script");
}
return {
p: pOut,
f,
};
}
const { pIn, pOut } = phonemes.reduce((acc, phoneme, i) => { function processPhoneme(
const prevPLetter = last(acc.pOut); acc: DiacriticsAccumulator,
const isBeginningOfWord = acc.pOut === "" || prevPLetter === " "; phoneme: Phoneme,
i: number,
phonemes: Phoneme[],
) {
// Prep state
const state = acc.pIn[0] === " " ? advanceP(acc) : acc;
// WARNING: Do not use acc after this point!
const prevPLetter = last(state.pOut);
const currentPLetter = state.pIn[0];
// const nextPLetter = state.pIn[1];
const isBeginningOfWord = state.pOut === "" || prevPLetter === " ";
const phonemeInfo = phonemeTable[phoneme]; const phonemeInfo = phonemeTable[phoneme];
const previousPhoneme = i > 0 && phonemes[i-1]; const previousPhoneme = i > 0 && phonemes[i-1];
const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]]; const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]];
const currentPLetter = acc.pIn[0];
const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant); const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
const needsTashdeed = doubleConsonant && (previousPhoneme === phoneme); const needsTashdeed = doubleConsonant && (previousPhoneme === phoneme);
const needsSukun = doubleConsonant && (previousPhoneme !== phoneme); const needsSukun = doubleConsonant && (previousPhoneme !== phoneme);
if (needsTashdeed) { if (needsTashdeed) {
return { return {
pOut: acc.pOut + tashdeed, pOut: state.pOut + tashdeed,
pIn: acc.pIn, pIn: state.pIn,
}; };
} }
if (phonemeInfo.matches?.includes(currentPLetter)) { // TODO: Beginning of word with long vowels and alef etc.
if (isBeginningOfWord && (phonemeInfo.beginningMatches?.includes(currentPLetter) || phonemeInfo.matches?.includes(currentPLetter))) {
const ns = advanceP(state);
return { return {
pOut: acc.pOut ...ns,
+ (needsSukun ? sukun : phonemeInfo.diacritic ? phonemeInfo.diacritic : "") pOut: ns.pOut + (needsSukun ? sukun : phonemeInfo.diacritic ? phonemeInfo.diacritic : ""),
+ currentPLetter,
pIn: acc.pIn.slice(1),
}; };
} else if (phonemeInfo.matches?.includes(currentPLetter)) {
return advanceP({
...state,
pOut: state.pOut
+ (needsSukun ? sukun : phonemeInfo.diacritic ? phonemeInfo.diacritic : ""),
});
} }
if (phonemeInfo.diacritic) { if (phonemeInfo.diacritic) {
return { return {
pOut: acc.pOut + phonemeInfo.diacritic, ...state,
pIn: acc.pIn, pOut: state.pOut + phonemeInfo.diacritic,
};
} }
}
// TODO: CHECK IF PASHTO IS SHORTER THAN PHONETICS
throw new Error("phonetics error"); throw new Error("phonetics error");
}, { pOut: "", pIn: p });
if (pIn !== "") {
throw new Error("phonetics error - phonetics shorter than pashto script");
}
return {
p: pOut,
f,
};
} }
/** /**
@ -370,3 +385,10 @@ export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsSt
function last(s: string) { function last(s: string) {
return s[s.length - 1]; return s[s.length - 1];
} }
/**
 * Moves the first `n` characters of the unprocessed script (`pIn`) onto the
 * end of the output (`pOut`), returning a new accumulator. The given state
 * object is not modified.
 *
 * @param state the current accumulator
 * @param n how many characters to carry over (defaults to 1)
 * @returns a new accumulator with the characters shifted from `pIn` to `pOut`
 */
function advanceP(state: DiacriticsAccumulator, n: number = 1): DiacriticsAccumulator {
    const { pIn, pOut } = state;
    const carried = pIn.slice(0, n);
    const remaining = pIn.slice(n);
    return { pOut: pOut + carried, pIn: remaining };
}