more work on diacritics
This commit is contained in:
parent
7b0e6d864f
commit
a2b5626514
|
@ -12,6 +12,17 @@ import {
|
||||||
} from "./diacritics";
|
} from "./diacritics";
|
||||||
import * as T from "../types";
|
import * as T from "../types";
|
||||||
|
|
||||||
|
const zwar = "َ";
|
||||||
|
const zwarakey = "ٙ";
|
||||||
|
const zer = "ِ";
|
||||||
|
const pesh = "ُ";
|
||||||
|
const sukun = "ْ";
|
||||||
|
const hamzaAbove = "ٔ";
|
||||||
|
const tashdeed = "ّ";
|
||||||
|
const wasla = "ٱ";
|
||||||
|
const daggerAlif = "ٰ";
|
||||||
|
const fathahan = "ً";
|
||||||
|
|
||||||
const phonemeSplits: Array<{
|
const phonemeSplits: Array<{
|
||||||
in: string,
|
in: string,
|
||||||
out: string[],
|
out: string[],
|
||||||
|
@ -139,6 +150,105 @@ const diacriticsTest: Array<{
|
||||||
},
|
},
|
||||||
out: "تَشْناب",
|
out: "تَشْناب",
|
||||||
},
|
},
|
||||||
|
// working with وs
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "کول",
|
||||||
|
f: "kwal",
|
||||||
|
},
|
||||||
|
out: "کْوَل",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "تول",
|
||||||
|
f: "tool",
|
||||||
|
},
|
||||||
|
out: "تُول",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "مقبول",
|
||||||
|
f: "maqbool",
|
||||||
|
},
|
||||||
|
out: "مَقْبُول",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "کول",
|
||||||
|
f: "kawul",
|
||||||
|
},
|
||||||
|
out: "کَو" + zwarakey + "ل",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "کول",
|
||||||
|
f: "kiwul",
|
||||||
|
},
|
||||||
|
out: "کِو" + zwarakey + "ل",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "کول",
|
||||||
|
f: "kUwul",
|
||||||
|
},
|
||||||
|
out: "کُو" + zwarakey + "ل",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "کول",
|
||||||
|
f: "kuwul",
|
||||||
|
},
|
||||||
|
out: "ک" + zwarakey + "و" + zwarakey + "ل",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "کول",
|
||||||
|
f: "kawal",
|
||||||
|
},
|
||||||
|
out: "کَوَل",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "کول",
|
||||||
|
f: "kUwal",
|
||||||
|
},
|
||||||
|
out: "کُوَل",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "پشتګرد",
|
||||||
|
f: "pishtgird",
|
||||||
|
},
|
||||||
|
out: "پِشْتْګِرْد",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "سپین",
|
||||||
|
f: "speen",
|
||||||
|
},
|
||||||
|
out: "سْپِین",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "سپین",
|
||||||
|
f: "speyn",
|
||||||
|
},
|
||||||
|
out: "سْپین",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "پېش",
|
||||||
|
f: "pesh",
|
||||||
|
},
|
||||||
|
out: "پېش",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
p: "بتن",
|
||||||
|
f: "battan",
|
||||||
|
},
|
||||||
|
out: "ب" + zwar + "ت" + tashdeed + zwar + "ن",
|
||||||
|
},
|
||||||
];
|
];
|
||||||
|
|
||||||
const brokenDiacritics = [
|
const brokenDiacritics = [
|
||||||
|
@ -163,6 +273,12 @@ test("bad phonetic characters should throw an error", () => {
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("ending with left over Pashto script will throw an error", () => {
|
||||||
|
expect(() => {
|
||||||
|
addDiacritics({ p: "کور ته", f: "kor" });
|
||||||
|
}).toThrow(`phonetics error - phonetics shorter than pashto script`);
|
||||||
|
});
|
||||||
|
|
||||||
test("adding diacritics should work", () => {
|
test("adding diacritics should work", () => {
|
||||||
diacriticsTest.forEach((t) => {
|
diacriticsTest.forEach((t) => {
|
||||||
expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f });
|
expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f });
|
||||||
|
|
|
@ -248,7 +248,14 @@ const phonemeTable: Record<Phoneme, PhonemeInfo> = {
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* splits a phonetics string into an array of Phonemes
|
||||||
|
*
|
||||||
|
* will error if there is an illegal phonetics character
|
||||||
|
*
|
||||||
|
* @param fIn a phonetics string
|
||||||
|
* @returns an array of phonemes
|
||||||
|
*/
|
||||||
export function splitFIntoPhonemes(fIn: string): Phoneme[] {
|
export function splitFIntoPhonemes(fIn: string): Phoneme[] {
|
||||||
const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y"];
|
const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y"];
|
||||||
|
|
||||||
|
@ -306,16 +313,25 @@ export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsSt
|
||||||
// TODO:
|
// TODO:
|
||||||
const phonemes: Phoneme[] = splitFIntoPhonemes(!ignoreCommas ? firstPhonetics(f) : f);
|
const phonemes: Phoneme[] = splitFIntoPhonemes(!ignoreCommas ? firstPhonetics(f) : f);
|
||||||
|
|
||||||
const { pOut } = phonemes.reduce((acc, phoneme, i) => {
|
const { pIn, pOut } = phonemes.reduce((acc, phoneme, i) => {
|
||||||
const isBeginningOfWord = acc.pOut === "" || last(acc.pOut) === " ";
|
const prevPLetter = last(acc.pOut);
|
||||||
|
const isBeginningOfWord = acc.pOut === "" || prevPLetter === " ";
|
||||||
const phonemeInfo = phonemeTable[phoneme];
|
const phonemeInfo = phonemeTable[phoneme];
|
||||||
|
const previousPhoneme = i > 0 && phonemes[i-1];
|
||||||
const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]];
|
const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]];
|
||||||
const currentPLetter = acc.pIn[0];
|
const currentPLetter = acc.pIn[0];
|
||||||
const needsSukun = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
|
const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
|
||||||
|
const needsTashdeed = doubleConsonant && (previousPhoneme === phoneme);
|
||||||
|
const needsSukun = doubleConsonant && (previousPhoneme !== phoneme);
|
||||||
|
|
||||||
|
if (needsTashdeed) {
|
||||||
|
return {
|
||||||
|
pOut: acc.pOut + tashdeed,
|
||||||
|
pIn: acc.pIn,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
if (phonemeInfo.matches?.includes(currentPLetter)) {
|
if (phonemeInfo.matches?.includes(currentPLetter)) {
|
||||||
// TODO: Check if tashdeed or sukun is used
|
|
||||||
// const needsSukun = is consonant + previous phoneme was consonant + not beginning of word
|
|
||||||
return {
|
return {
|
||||||
pOut: acc.pOut
|
pOut: acc.pOut
|
||||||
+ (needsSukun ? sukun : phonemeInfo.diacritic ? phonemeInfo.diacritic : "")
|
+ (needsSukun ? sukun : phonemeInfo.diacritic ? phonemeInfo.diacritic : "")
|
||||||
|
@ -331,9 +347,15 @@ export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsSt
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: CHECK IF PASHTO IS SHORTER THAN PHONETICS
|
||||||
|
|
||||||
throw new Error("phonetics error");
|
throw new Error("phonetics error");
|
||||||
}, { pOut: "", pIn: p });
|
}, { pOut: "", pIn: p });
|
||||||
|
|
||||||
|
if (pIn !== "") {
|
||||||
|
throw new Error("phonetics error - phonetics shorter than pashto script");
|
||||||
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
p: pOut,
|
p: pOut,
|
||||||
f,
|
f,
|
||||||
|
|
Loading…
Reference in New Issue