more
This commit is contained in:
parent
a2b5626514
commit
98c5eb7452
|
@ -69,20 +69,6 @@ const phonemeSplits: Array<{
|
||||||
},
|
},
|
||||||
];
|
];
|
||||||
|
|
||||||
const badPhonetics: Array<{
|
|
||||||
in: string,
|
|
||||||
problem: string,
|
|
||||||
}> = [
|
|
||||||
{
|
|
||||||
in: "acar",
|
|
||||||
problem: "c",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
in: "a7am",
|
|
||||||
problem: "7",
|
|
||||||
},
|
|
||||||
];
|
|
||||||
|
|
||||||
const diacriticsTest: Array<{
|
const diacriticsTest: Array<{
|
||||||
in: T.PsString,
|
in: T.PsString,
|
||||||
out: string,
|
out: string,
|
||||||
|
@ -101,6 +87,13 @@ const diacriticsTest: Array<{
|
||||||
},
|
},
|
||||||
out: "کُور",
|
out: "کُور",
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "کور کور",
|
||||||
|
f: "kor koor",
|
||||||
|
},
|
||||||
|
out: "کور کُور",
|
||||||
|
},
|
||||||
{
|
{
|
||||||
in: {
|
in: {
|
||||||
p: "تب",
|
p: "تب",
|
||||||
|
@ -242,6 +235,22 @@ const diacriticsTest: Array<{
|
||||||
},
|
},
|
||||||
out: "پېش",
|
out: "پېش",
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "لیک",
|
||||||
|
f: "leek",
|
||||||
|
},
|
||||||
|
out: "لِیک",
|
||||||
|
},
|
||||||
|
// starting alefs
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "اسلام",
|
||||||
|
f: "islaam",
|
||||||
|
},
|
||||||
|
out: "اِسْلام",
|
||||||
|
},
|
||||||
|
// double consonant
|
||||||
{
|
{
|
||||||
in: {
|
in: {
|
||||||
p: "بتن",
|
p: "بتن",
|
||||||
|
@ -249,12 +258,13 @@ const diacriticsTest: Array<{
|
||||||
},
|
},
|
||||||
out: "ب" + zwar + "ت" + tashdeed + zwar + "ن",
|
out: "ب" + zwar + "ت" + tashdeed + zwar + "ن",
|
||||||
},
|
},
|
||||||
];
|
// avoid false double consonant
|
||||||
|
|
||||||
const brokenDiacritics = [
|
|
||||||
{
|
{
|
||||||
p: "تشناب",
|
in: {
|
||||||
f: "peshnaab",
|
p: "ازل لیک",
|
||||||
|
f: "azalléek",
|
||||||
|
},
|
||||||
|
out: "اَزَل لِیک",
|
||||||
},
|
},
|
||||||
];
|
];
|
||||||
|
|
||||||
|
@ -265,6 +275,35 @@ phonemeSplits.forEach((s) => {
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("adding diacritics should work", () => {
|
||||||
|
diacriticsTest.forEach((t) => {
|
||||||
|
expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f });
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ERRORS
|
||||||
|
|
||||||
|
// Fixtures for the ERRORS section: script/phonetics pairs that do not line up
// (here `p` begins with "ت" while `f` begins with "p").
// NOTE(review): presumably consumed by the "do not line up" test — confirm.
const brokenDiacritics: Array<{
    p: string,
    f: string,
}> = [
    {
        p: "تشناب",
        f: "peshnaab",
    },
];
|
||||||
|
|
||||||
|
// Phonetic inputs containing a character that is not a valid phoneme.
// `in` is the phonetic string; `problem` is the offending character within it.
const badPhonetics: Array<{
    in: string,
    problem: string,
}> = [
    {
        in: "acar",
        problem: "c",
    },
    {
        in: "a7am",
        problem: "7",
    },
];
|
||||||
|
|
||||||
test("bad phonetic characters should throw an error", () => {
|
test("bad phonetic characters should throw an error", () => {
|
||||||
badPhonetics.forEach((s) => {
|
badPhonetics.forEach((s) => {
|
||||||
expect(() => {
|
expect(() => {
|
||||||
|
@ -279,10 +318,10 @@ test("ending with left over Pashto script will throw an error", () => {
|
||||||
}).toThrow(`phonetics error - phonetics shorter than pashto script`);
|
}).toThrow(`phonetics error - phonetics shorter than pashto script`);
|
||||||
});
|
});
|
||||||
|
|
||||||
test("adding diacritics should work", () => {
|
test("ending with left over phonetics will throw an error", () => {
|
||||||
diacriticsTest.forEach((t) => {
|
expect(() => {
|
||||||
expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f });
|
addDiacritics({ p: "کار", f: "kaar kawul" });
|
||||||
});
|
}).toThrow();
|
||||||
});
|
});
|
||||||
|
|
||||||
test("adding diacritics errors when phonetecs and pashto do not line up", () => {
|
test("adding diacritics errors when phonetecs and pashto do not line up", () => {
|
||||||
|
|
|
@ -28,6 +28,8 @@ type LongVowel = "aa" | "ee" | "e" | "oo" | "o" | "ey" | "uy" | "eyy";
|
||||||
type ShortVowel = "a" | "i" | "u" | "U";
|
type ShortVowel = "a" | "i" | "u" | "U";
|
||||||
type Phoneme = Consonant | Ain | LongVowel | ShortVowel | JoiningVowel;
|
type Phoneme = Consonant | Ain | LongVowel | ShortVowel | JoiningVowel;
|
||||||
|
|
||||||
|
/**
 * Running state threaded through the phoneme reducer while diacritics are added.
 *
 * - `pIn`: the part of the Pashto script that has not been consumed yet
 * - `pOut`: the script (with diacritics) produced so far
 */
type DiacriticsAccumulator = { pIn: string, pOut: string };
|
||||||
|
|
||||||
type PhonemeInfo = {
|
type PhonemeInfo = {
|
||||||
matches?: string[],
|
matches?: string[],
|
||||||
beginningMatches?: string[],
|
beginningMatches?: string[],
|
||||||
|
@ -225,6 +227,7 @@ const phonemeTable: Record<Phoneme, PhonemeInfo> = {
|
||||||
"a": {
|
"a": {
|
||||||
diacritic: zwar,
|
diacritic: zwar,
|
||||||
endingMatches: ["ه"],
|
endingMatches: ["ه"],
|
||||||
|
beginningMatches: ["ا"],
|
||||||
// canComeAfterHeyEnding: true,
|
// canComeAfterHeyEnding: true,
|
||||||
// canBeFirstPartOfFathahanEnding: true,
|
// canBeFirstPartOfFathahanEnding: true,
|
||||||
},
|
},
|
||||||
|
@ -304,62 +307,74 @@ export function splitFIntoPhonemes(fIn: string): Phoneme[] {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Adds diacritics to the Pashto script of a given PsString, using its phonetics as a guide.
 *
 * The phonetics are split into phonemes and folded over the script with
 * `processPhoneme`, starting from an empty output and the full script as input.
 *
 * @param ps a PsString whose `p` does not yet carry diacritics
 * @param ignoreCommas when `true`, `f` is split into phonemes as-is; otherwise `f`
 * is first passed through `firstPhonetics` (presumably to keep only the first
 * comma-separated variant — confirm against that helper)
 * @returns a PsString with the diacritized script in `p` and the original `f` untouched
 * @throws Error if any script is left unconsumed once the phonemes run out, or if
 * `processPhoneme` cannot line a phoneme up with the script
 */
export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsString {
    const phonemes: Phoneme[] = splitFIntoPhonemes(ignoreCommas ? f : firstPhonetics(f));
    const { pIn, pOut } = phonemes.reduce(processPhoneme, { pOut: "", pIn: p });
    // Anything still waiting in pIn means the phonetics ended before the script did.
    if (pIn !== "") {
        throw new Error("phonetics error - phonetics shorter than pashto script");
    }
    return {
        p: pOut,
        f,
    };
}
|
||||||
|
|
||||||
/**
 * Reducer step used by `addDiacritics`: lines one phoneme up with the script and
 * returns the advanced accumulator.
 *
 * Order of handling:
 * 1. If the next script character is a space, it is copied to the output first.
 * 2. A consonant identical to the previous phoneme (within the same word) adds a
 *    tashdeed and consumes no script letter.
 * 3. At the beginning of a word, a letter found in the phoneme's `beginningMatches`
 *    or `matches` is copied to the output and the mark is written after it.
 * 4. Elsewhere, a letter found in `matches` is copied with the mark written before it.
 * 5. Otherwise, a phoneme that has a diacritic contributes only that diacritic.
 *
 * The "mark" is a sukun when two different consonants meet inside a word,
 * otherwise the phoneme's own diacritic (if any).
 *
 * NOTE(review): once the script is exhausted, step 5 still accepts phonemes that
 * have a diacritic, so trailing vowel phonemes are not reported as an error here.
 *
 * @param acc accumulator from the previous step
 * @param phoneme the phoneme to process
 * @param i index of `phoneme` within `phonemes`
 * @param phonemes the full phoneme list being reduced
 * @returns the accumulator after this phoneme has been applied
 * @throws Error ("phonetics error") when none of the cases above applies
 */
function processPhoneme(
    acc: DiacriticsAccumulator,
    phoneme: Phoneme,
    i: number,
    phonemes: Phoneme[],
): DiacriticsAccumulator {
    // Prep state: a space in the script has no phoneme of its own, so carry it across.
    const state = acc.pIn[0] === " " ? advanceP(acc) : acc;
    // WARNING: Do not use acc after this point!

    const prevPLetter = last(state.pOut);
    const currentPLetter = state.pIn[0];
    const isBeginningOfWord = state.pOut === "" || prevPLetter === " ";
    const phonemeInfo = phonemeTable[phoneme];
    const previousPhoneme = i > 0 && phonemes[i - 1];
    // The previous phoneme only counts when it belongs to the same word.
    const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i - 1]];
    const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
    const needsTashdeed = doubleConsonant && (previousPhoneme === phoneme);
    const needsSukun = doubleConsonant && (previousPhoneme !== phoneme);

    if (needsTashdeed) {
        // The doubled consonant is written once in the script: mark it, consume nothing.
        return {
            pOut: state.pOut + tashdeed,
            pIn: state.pIn,
        };
    }

    const mark = needsSukun ? sukun : phonemeInfo.diacritic ? phonemeInfo.diacritic : "";

    // TODO: Beginning of word with long vowels and alef etc.
    if (isBeginningOfWord && (phonemeInfo.beginningMatches?.includes(currentPLetter) || phonemeInfo.matches?.includes(currentPLetter))) {
        // Word-initial: the letter goes out first, then its mark.
        const ns = advanceP(state);
        return {
            ...ns,
            pOut: ns.pOut + mark,
        };
    }
    if (phonemeInfo.matches?.includes(currentPLetter)) {
        // Mid-word: the mark is written first, then the matched letter is copied.
        return advanceP({
            ...state,
            pOut: state.pOut + mark,
        });
    }

    if (phonemeInfo.diacritic) {
        // No script letter for this phoneme: it is expressed by the diacritic alone.
        return {
            ...state,
            pOut: state.pOut + phonemeInfo.diacritic,
        };
    }

    throw new Error("phonetics error");
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -370,3 +385,10 @@ export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsSt
|
||||||
function last(s: string) {
    // Final UTF-16 code unit of `s`; an empty string yields `undefined`.
    return s[s.length - 1];
}
|
||||||
|
|
||||||
|
/**
 * Moves the first `n` characters of the unconsumed script (`pIn`) onto the end
 * of the output (`pOut`) and returns the result as a new accumulator.
 *
 * The input state is not mutated. If `n` exceeds the remaining script, everything
 * that is left is moved. A negative `n` is treated as 0.
 *
 * @param state current accumulator
 * @param n number of characters to move (defaults to 1)
 * @returns a new accumulator with `n` characters shifted from `pIn` to `pOut`
 */
function advanceP(state: DiacriticsAccumulator, n: number = 1): DiacriticsAccumulator {
    // slice() counts negative indices from the end of the string, which would
    // duplicate text across pOut and pIn; clamp so a negative n moves nothing.
    const count = Math.max(0, n);
    return {
        pOut: state.pOut + state.pIn.slice(0, count),
        pIn: state.pIn.slice(count),
    };
}
|
Loading…
Reference in New Issue