some more refactoring, getting stuck on the du behaviour 😒
This commit is contained in:
parent
c5c9ea86d2
commit
73f786890e
|
@ -0,0 +1,104 @@
|
|||
import {
|
||||
splitFIntoPhonemes,
|
||||
last,
|
||||
addP,
|
||||
prev2Chars,
|
||||
advanceP,
|
||||
reverseP,
|
||||
overwriteP,
|
||||
advanceForAin,
|
||||
advanceForAinOrHamza,
|
||||
advanceForHamzaMid,
|
||||
} from "./diacritics-helpers";
|
||||
|
||||
// Fixtures: a phonetic string paired with the phoneme sequence it is expected to split into.
const phonemeSplits: Array<{
    in: string,
    out: string[],
}> = [
    { in: "kor", out: ["k", "o", "r"] },
    { in: "raaghey", out: ["r", "aa", "gh", "ey"] },
    { in: "hatsa", out: ["h", "a", "ts", "a"] },
    { in: "ba", out: ["b", "a"] },
    { in: "peydáa", out: ["p", "ey", "d", "aa"] },
    { in: "be kaar", out: ["b", "e", "k", "aa", "r"] },
    { in: "raadzeyy", out: ["r", "aa", "dz", "eyy"] },
    { in: "badanuy ??", out: ["b", "a", "d", "a", "n", "uy"] },
    { in: "tur ... pore", out: ["t", "u", "r", "p", "o", "r", "e"] },
    { in: "daar-Ul-iqaama", out: ["d", "aa", "r", "-Ul-", "i", "q", "aa", "m", "a"] },
];

// One test case per fixture so that a failure names the offending input.
for (const { in: phonetics, out: expected } of phonemeSplits) {
    test(`${phonetics} should split properly`, () => {
        expect(splitFIntoPhonemes(phonetics)).toEqual(expected);
    });
}
|
||||
|
||||
// Fixtures: phonetic strings containing a character the splitter rejects,
// together with the character expected to appear in the error message.
const badPhonetics: Array<{
    in: string,
    problem: string,
}> = [
    { in: "acar", problem: "c" },
    { in: "a7am", problem: "7" },
];

test("bad phonetic characters should throw an error", () => {
    for (const { in: phonetics, problem } of badPhonetics) {
        const attempt = () => {
            splitFIntoPhonemes(phonetics);
        };
        expect(attempt).toThrow(`illegal phonetic character: ${problem}`);
    }
});
|
||||
|
||||
test("last should work", () => {
    const finalChar = last("this");
    expect(finalChar).toBe("s");
});

test("addP should work", () => {
    // addP is curried: first the text to append, then the accumulator state.
    const appendTe = addP("ت");
    const after = appendTe({ pIn: "", pOut: "کر" });
    expect(after).toEqual({
        pIn: "",
        pOut: "کرت",
    });
});

test("prev2Chars should work", () => {
    // Trailing spaces and dots are skipped before the two characters are taken.
    const cases: Array<[string, string]> = [
        ["تورن", "رن"],
        ["وست .. ", "ست"],
        ["دَ ... ", "دَ"],
    ];
    for (const [input, expected] of cases) {
        expect(prev2Chars(input)).toBe(expected);
    }
});
|
|
@ -0,0 +1,401 @@
|
|||
/**
|
||||
* Copyright (c) 2021 lingdocs.com
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*
|
||||
*/
|
||||
|
||||
import { removeAccents } from "./accent-helpers";
|
||||
|
||||
/**
 * Working state threaded through the helper functions in this file:
 * `pIn` is the script text not yet consumed, `pOut` is the text produced so far.
 */
export type DiacriticsAccumulator = {
    pIn: string;
    pOut: string;
};

type Consonant =
    | "b" | "p" | "t" | "T" | "s" | "j" | "ch" | "kh" | "ts" | "dz"
    | "d" | "D" | "r" | "R" | "z" | "jz" | "G" | "sh" | "x" | "gh"
    | "f" | "q" | "k" | "g" | "l" | "m" | "n" | "N" | "h" | "w" | "y";

type Ain = "'";

type JoiningVowel =
    | "-i-"
    | "-U-"
    | "-Ul-";

type LongVowel =
    | "aa"
    | "ee"
    | "e"
    | "oo"
    | "o"
    | "ey"
    | "uy"
    | "eyy";

type ShortVowel =
    | "a"
    | "i"
    | "u"
    | "U";

/** Every phonetic unit that a phonetics string can be split into. */
export type Phoneme =
    | Consonant
    | Ain
    | LongVowel
    | ShortVowel
    | JoiningVowel;

/**
 * Per-phoneme lookup data stored in the phoneme table.
 * All fields are optional; the boolean-like flags are either `true` or absent.
 */
type PhonemeInfo = {
    /** script strings associated with the phoneme */
    matches?: string[];
    /** script strings listed separately for the beginning position */
    beginningMatches?: string[];
    /** script strings listed separately for the ending position */
    endingMatches?: string[];
    consonant?: true;
    /** diacritic mark associated with the phoneme */
    diacritic?: string;
    endingOnly?: true;
    takesSukunOnEnding?: true;
    longVowel?: true;
    canStartWithAynBefore?: true;
    useEndingDiacritic?: true;
};
|
||||
|
||||
// Diacritic marks and special letters, each exported as a one-character string.

/** mark used by the short vowel "a" in the phoneme table */
export const zwar = "َ";
/** mark used by the short vowel "u" in the phoneme table */
export const zwarakey = "ٙ";
/** mark used by "i" and "ee" in the phoneme table */
export const zer = "ِ";
/** mark used by "U" and "oo" in the phoneme table */
export const pesh = "ُ";

export const sukun = "ْ";
export const hamzaAbove = "ٔ";
export const tashdeed = "ّ";
export const wasla = "ٱ";
export const daggerAlif = "ٰ";
export const fathahan = "ً";
|
||||
|
||||
/**
 * Lookup table giving, for every phoneme, the script strings it can correspond
 * to and the flags / diacritic associated with it.
 */
export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
    // Consonants
    b: { matches: ["ب"], consonant: true },
    p: { matches: ["پ"], consonant: true },
    t: { matches: ["ت", "ط"], consonant: true },
    T: { matches: ["ټ"], consonant: true },
    s: { matches: ["س", "ص", "ث"], consonant: true },
    j: { matches: ["ج"], consonant: true },
    ch: { matches: ["چ"], consonant: true },
    kh: { matches: ["خ"], consonant: true },
    ts: { matches: ["څ"], consonant: true },
    dz: { matches: ["ځ"], consonant: true },
    d: { matches: ["د"], consonant: true },
    D: { matches: ["ډ"], consonant: true },
    r: { matches: ["ر"], consonant: true },
    R: { matches: ["ړ"], consonant: true },
    z: { matches: ["ز", "ذ", "ظ", "ض"], consonant: true },
    jz: { matches: ["ژ"], consonant: true },
    G: { matches: ["ږ"], consonant: true },
    sh: { matches: ["ش"], consonant: true },
    x: { matches: ["ښ"], consonant: true },
    gh: { matches: ["غ"], consonant: true },
    f: { matches: ["ف"], consonant: true },
    q: { matches: ["ق"], consonant: true },
    k: { matches: ["ک"], consonant: true },
    g: { matches: ["ګ"], consonant: true },
    l: { matches: ["ل"], consonant: true },
    m: { matches: ["م"], consonant: true },
    n: { matches: ["ن"], consonant: true },
    N: { matches: ["ڼ"], consonant: true },
    h: { matches: ["ه", "ح"], consonant: true, takesSukunOnEnding: true },
    w: { matches: ["و"], consonant: true },
    y: { matches: ["ی"], consonant: true },
    // Ain
    "'": { matches: ["ع", "ئ"], consonant: true },
    // Joining Vowels
    "-i-": {},
    "-U-": { matches: [" و ", "و"] },
    "-Ul-": { matches: ["ال"] },
    // Long Vowels
    aa: {
        matches: ["ا"],
        beginningMatches: ["آ", "ا"],
        endingMatches: ["ا", "یٰ"],
        longVowel: true,
    },
    ee: {
        matches: ["ی"],
        longVowel: true,
        endingMatches: ["ي"],
        diacritic: zer,
        canStartWithAynBefore: true,
    },
    e: { matches: ["ې"], longVowel: true },
    o: { matches: ["و"], longVowel: true },
    oo: {
        matches: ["و"],
        longVowel: true,
        // alsoCanBePrefix: true,
        diacritic: pesh,
        useEndingDiacritic: true,
    },
    ey: { matches: ["ی"], longVowel: true, endingMatches: ["ی"] },
    uy: { matches: ["ۍ"], longVowel: true, endingOnly: true },
    eyy: { matches: ["ئ"], longVowel: true, endingOnly: true },
    // Short Vowels
    a: {
        diacritic: zwar,
        endingMatches: ["ه"],
        beginningMatches: ["ا", "ع"],
        // canComeAfterHeyEnding: true,
        // canBeFirstPartOfFathahanEnding: true,
    },
    u: { diacritic: zwarakey, endingMatches: ["ه"] },
    i: {
        diacritic: zer,
        endingMatches: ["ه"],
        beginningMatches: ["ا", "ع"],
        // takesDiacriticBeforeGurdaHeyEnding: true,
        // canBeWasla: true,
    },
    U: {
        diacritic: pesh,
        endingMatches: ["ه"],
        // takesDiacriticBeforeGurdaHeyEnding: true,
        beginningMatches: ["ا", "ع"],
    },
};
|
||||
|
||||
/**
 * Breaks a phonetics string into its sequence of phonemes.
 *
 * Accents are stripped first (via removeAccents). At each position the longest
 * candidate is tried first: four-letter, then three-letter, then two-letter
 * phonemes, and finally a single letter. Punctuation and spaces listed in
 * `willIgnore` are skipped.
 *
 * @param fIn a phonetics string
 * @returns an array of phonemes
 * @throws Error if a character is neither ignorable nor a legal phonetic character
 */
export function splitFIntoPhonemes(fIn: string): Phoneme[] {
    const quadrigraphs: Phoneme[] = ["-Ul-"];
    const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"];
    const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"];
    // only counted as a digraph when it is the last two letters of the string or is followed by a space
    const endingDigraphs: Phoneme[] = ["uy"];
    const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y"];
    const willIgnore = ["?", " ", "`", ".", "…", ",", "'"];

    const f = removeAccents(fIn);

    // Finds the multi-letter phoneme starting at position `at`, if there is one.
    const multiLetterAt = (at: number): Phoneme | undefined => {
        const four = f.slice(at, at + 4) as Phoneme;
        if (quadrigraphs.includes(four)) {
            return four;
        }
        const three = f.slice(at, at + 3) as Phoneme;
        if (trigraphs.includes(three)) {
            return three;
        }
        const two = f.slice(at, at + 2) as Phoneme;
        if (digraphs.includes(two)) {
            return two;
        }
        const atEndOfWord = at === f.length - 2 || f[at + 2] === " ";
        if (atEndOfWord && endingDigraphs.includes(two)) {
            return two;
        }
        return undefined;
    };

    const phonemes: Phoneme[] = [];
    let position = 0;
    while (position < f.length) {
        const multi = multiLetterAt(position);
        if (multi !== undefined) {
            phonemes.push(multi);
            position += multi.length;
            continue;
        }
        const letter = f.slice(position, position + 1) as Phoneme;
        position++;
        if (willIgnore.includes(letter)) {
            continue;
        }
        if (!singleLetterPhonemes.includes(letter)) {
            throw new Error(`illegal phonetic character: ${letter}`);
        }
        phonemes.push(letter);
    }
    return phonemes;
}
|
||||
/**
 * Gets the final character of a string
 * (the result is undefined when the string is empty).
 *
 * @param s the string to inspect
 */
export function last(s: string) {
    const finalIndex = s.length - 1;
    return s[finalIndex];
}
|
||||
|
||||
/**
 * Consumes the first `n` characters of `pIn`, appending them unchanged to `pOut`.
 *
 * @param state the current accumulator
 * @param n how many characters to move across (defaults to 1)
 * @returns a new accumulator; the given state is not modified
 */
export function advanceP(state: DiacriticsAccumulator, n: number = 1): DiacriticsAccumulator {
    const { pIn, pOut } = state;
    const consumed = pIn.slice(0, n);
    const remaining = pIn.slice(n);
    return { pIn: remaining, pOut: pOut + consumed };
}
|
||||
|
||||
/**
 * Moves back to the last character that wasn't a " " or ".":
 * any trailing spaces and dots at the end of `pOut` are taken off `pOut`
 * and put back on the front of `pIn`.
 *
 * If `pOut` does not end with a space or dot the state is returned unchanged
 * (as a new object). If `pOut` consists only of spaces and dots, all of it is
 * moved back to `pIn`.
 *
 * @param state the current accumulator
 * @returns a new accumulator; the given state is not modified
 */
export function reverseP(state: DiacriticsAccumulator): DiacriticsAccumulator {
    const { pIn, pOut } = state;
    // Index just past the last character that is neither a space nor a dot.
    // An explicit index is used rather than slice(-howFar) because slice(-0)
    // is slice(0), which would move the whole of pOut back instead of nothing.
    let keep = pOut.length;
    while (keep > 0 && (pOut[keep - 1] === " " || pOut[keep - 1] === ".")) {
        keep--;
    }
    return {
        pIn: pOut.slice(keep) + pIn,
        pOut: pOut.slice(0, keep),
    };
}
|
||||
|
||||
/**
 * Curried helper: given some text, returns a function that appends that text
 * to `pOut` of an accumulator. When `toAdd` is undefined or empty the returned
 * state is an unchanged copy.
 *
 * @param toAdd the text to append, if any
 */
export const addP = (toAdd: string | undefined) => (state: DiacriticsAccumulator): DiacriticsAccumulator => {
    if (!toAdd) {
        return { ...state };
    }
    return { ...state, pOut: state.pOut + toAdd };
};
|
||||
|
||||
/**
 * Curried helper: returns a function that drops the first character of `pIn`
 * and appends `toWrite` to `pOut` in its place.
 *
 * @param toWrite the text written to the output instead of the dropped character
 */
export const overwriteP = (toWrite: string) => (state: DiacriticsAccumulator): DiacriticsAccumulator => {
    const { pIn, pOut } = state;
    const rest = pIn.slice(1);
    return { pIn: rest, pOut: pOut + toWrite };
};
|
||||
|
||||
/**
 * Returns the last two characters of a string, ignoring any trailing
 * spaces and dots.
 *
 * If only one character precedes the trailing spaces/dots, just that one
 * character is returned; if the string is empty or made up only of spaces and
 * dots, an empty string is returned.
 *
 * @param s the string to inspect
 * @returns up to two characters ending at the last character that is not a space or a dot
 */
export function prev2Chars(s: string): string {
    const chars = [...s];
    // walk back over trailing spaces and dots
    let end = chars.length - 1;
    while (end >= 0 && (chars[end] === " " || chars[end] === ".")) {
        end--;
    }
    if (end < 0) {
        // nothing but separators (or nothing at all)
        return "";
    }
    // clamp the start so a lone character does not pick up an undefined neighbour
    const start = Math.max(0, end - 1);
    return chars.slice(start, end + 1).join("");
}
|
||||
|
||||
/**
 * Peeks at the first two characters of the remaining input.
 *
 * @param state the current accumulator
 * @returns `current` = first character of `pIn`, `next` = second character;
 * either is undefined at runtime when `pIn` is too short
 */
export function getCurrentNext(state: DiacriticsAccumulator): { current: string, next: string} {
    const { pIn } = state;
    const current = pIn[0];
    const next = pIn[1];
    return { current, next };
}
|
||||
|
||||
/**
 * Steps past the next input character when it is "ع";
 * otherwise hands back the state untouched.
 *
 * @param state the current accumulator
 */
export function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator {
    const { current } = getCurrentNext(state);
    if (current !== "ع") {
        return state;
    }
    return advanceP(state);
}
|
||||
|
||||
/**
 * Steps past a "ئ" that is followed by another character which is not itself
 * "ئ"; in every other case the state is handed back untouched.
 *
 * @param state the current accumulator
 */
export function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccumulator {
    const { current, next } = getCurrentNext(state);
    const hamzaBeforeOtherLetter = current === "ئ" && next && next !== "ئ";
    return hamzaBeforeOtherLetter ? advanceP(state) : state;
}
|
||||
|
||||
/**
 * Steps past the next input character when it is either
 *  - a "ه" that is the last character of the input or is followed by a space, or
 *  - a "ع".
 * In every other case the state is handed back untouched.
 *
 * NOTE(review): the name mentions hamza but the check is on "ه" — confirm this is intended.
 *
 * @param state the current accumulator
 */
export function advanceForAinOrHamza(state: DiacriticsAccumulator): DiacriticsAccumulator {
    const { current, next } = getCurrentNext(state);
    const heAtWordEnd = current === "ه" && (!next || next === " ");
    if (heAtWordEnd || current === "ع") {
        return advanceP(state);
    }
    return state;
}
|
||||
|
|
@ -8,71 +8,31 @@
|
|||
|
||||
import {
|
||||
addDiacritics,
|
||||
splitFIntoPhonemes,
|
||||
} from "./diacritics";
|
||||
import {
|
||||
zwar,
|
||||
zwarakey,
|
||||
zer,
|
||||
pesh,
|
||||
sukun,
|
||||
hamzaAbove,
|
||||
tashdeed,
|
||||
wasla,
|
||||
daggerAlif,
|
||||
fathahan,
|
||||
} from "./diacritics-helpers";
|
||||
import * as T from "../types";
|
||||
|
||||
const zwar = "َ";
|
||||
const zwarakey = "ٙ";
|
||||
const zer = "ِ";
|
||||
const pesh = "ُ";
|
||||
const sukun = "ْ";
|
||||
const hamzaAbove = "ٔ";
|
||||
const tashdeed = "ّ";
|
||||
const wasla = "ٱ";
|
||||
const daggerAlif = "ٰ";
|
||||
const fathahan = "ً";
|
||||
|
||||
const phonemeSplits: Array<{
|
||||
in: string,
|
||||
out: string[],
|
||||
}> = [
|
||||
{
|
||||
in: "kor",
|
||||
out: ["k", "o", "r"],
|
||||
},
|
||||
{
|
||||
in: "raaghey",
|
||||
out: ["r", "aa", "gh", "ey"],
|
||||
},
|
||||
{
|
||||
in: "hatsa",
|
||||
out: ["h", "a", "ts", "a"],
|
||||
},
|
||||
{
|
||||
in: "ba",
|
||||
out: ["b", "a"],
|
||||
},
|
||||
{
|
||||
in: "peydáa",
|
||||
out: ["p", "ey", "d", "aa"],
|
||||
},
|
||||
{
|
||||
in: "be kaar",
|
||||
out: ["b", "e", "k", "aa", "r"],
|
||||
},
|
||||
{
|
||||
in: "raadzeyy",
|
||||
out: ["r", "aa", "dz", "eyy"],
|
||||
},
|
||||
{
|
||||
in: "badanuy ??",
|
||||
out: ["b", "a", "d", "a", "n", "uy"],
|
||||
},
|
||||
{
|
||||
in: "tur ... pore",
|
||||
out: ["t", "u", "r", "p", "o", "r", "e"],
|
||||
},
|
||||
{
|
||||
in: "daar-Ul-iqaama",
|
||||
out: ["d", "aa", "r", "-Ul-", "i", "q", "aa", "m", "a"],
|
||||
},
|
||||
];
|
||||
|
||||
const diacriticsTest: Array<{
|
||||
const diacriticsSections: {
|
||||
describe: string,
|
||||
tests: {
|
||||
in: T.PsString,
|
||||
out: string,
|
||||
}> = [
|
||||
out: string | null,
|
||||
}[],
|
||||
}[] = [
|
||||
{
|
||||
describe: "regular, native Pashto script/sounds",
|
||||
tests: [
|
||||
{
|
||||
in: {
|
||||
p: "کور",
|
||||
|
@ -403,14 +363,6 @@ const diacriticsTest: Array<{
|
|||
},
|
||||
out: "بې کار",
|
||||
},
|
||||
// TODO: nb mb thing
|
||||
{
|
||||
in: {
|
||||
p: "انبار",
|
||||
f: "ambáar",
|
||||
},
|
||||
out: "اَنْبار",
|
||||
},
|
||||
{
|
||||
in: {
|
||||
p: "ارغون",
|
||||
|
@ -499,14 +451,6 @@ const diacriticsTest: Array<{
|
|||
},
|
||||
out: "پَتَّه تُور",
|
||||
},
|
||||
// get ayn stuff working
|
||||
{
|
||||
in: {
|
||||
p: "اعتصاب شکن",
|
||||
f: "itisaab shakan",
|
||||
},
|
||||
out: "اِعتِصاب شَکَن",
|
||||
},
|
||||
// avoid false double consonant
|
||||
{
|
||||
in: {
|
||||
|
@ -515,6 +459,30 @@ const diacriticsTest: Array<{
|
|||
},
|
||||
out: "اَزَل لِیک",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
describe: "nm - mb thing",
|
||||
tests: [
|
||||
{
|
||||
in: {
|
||||
p: "انبار",
|
||||
f: "ambáar",
|
||||
},
|
||||
out: "اَنْبار",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
describe: "ayn stuff",
|
||||
tests: [
|
||||
{
|
||||
in: {
|
||||
p: "اعتصاب شکن",
|
||||
f: "itisaab shakan",
|
||||
},
|
||||
out: "اِعتِصاب شَکَن",
|
||||
},
|
||||
// starting with ع
|
||||
{
|
||||
in: {
|
||||
|
@ -530,7 +498,26 @@ const diacriticsTest: Array<{
|
|||
},
|
||||
out: "عِزَّت",
|
||||
},
|
||||
// ئ in the middle
|
||||
// ending with ayn
|
||||
{
|
||||
in: {
|
||||
p: "طمع کېدل",
|
||||
f: "tama kedul",
|
||||
},
|
||||
out: "طَمَع کېد" + zwarakey + "ل",
|
||||
},
|
||||
{
|
||||
in: {
|
||||
p: "منبع",
|
||||
f: "manbí",
|
||||
},
|
||||
out: "مَنْبِع",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
describe: "ئ in the middle",
|
||||
tests: [
|
||||
{
|
||||
in: {
|
||||
p: "برائت",
|
||||
|
@ -545,7 +532,11 @@ const diacriticsTest: Array<{
|
|||
},
|
||||
out: "فائِدَه",
|
||||
},
|
||||
// واخ being khaa in the middle of a word
|
||||
],
|
||||
},
|
||||
{
|
||||
describe: "واخ being khaa in the middle of a word",
|
||||
tests: [
|
||||
{
|
||||
in: {
|
||||
p: "استخوان",
|
||||
|
@ -553,7 +544,11 @@ const diacriticsTest: Array<{
|
|||
},
|
||||
out: "اُسْتُخ(و)ان",
|
||||
},
|
||||
// Arabic wasla
|
||||
],
|
||||
},
|
||||
{
|
||||
describe: "Arabic wasla",
|
||||
tests: [
|
||||
{
|
||||
in: {
|
||||
p: "بالکل",
|
||||
|
@ -561,7 +556,11 @@ const diacriticsTest: Array<{
|
|||
},
|
||||
out: "بِٱلْکُل",
|
||||
},
|
||||
// izafe
|
||||
],
|
||||
},
|
||||
{
|
||||
describe: "izafe",
|
||||
tests: [
|
||||
{
|
||||
in: {
|
||||
p: "ایصال ثواب",
|
||||
|
@ -569,20 +568,52 @@ const diacriticsTest: Array<{
|
|||
},
|
||||
out: "اِیصالِ ثَواب",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
describe: "special behaviour with د",
|
||||
tests: [
|
||||
{
|
||||
in: {
|
||||
p: "د",
|
||||
f: "du",
|
||||
},
|
||||
out: "د" + zwarakey,
|
||||
},
|
||||
{
|
||||
in: {
|
||||
p: "د لاس",
|
||||
f: "du laas",
|
||||
},
|
||||
out: "د" + zwarakey + " لاس",
|
||||
},
|
||||
// {
|
||||
// in: {
|
||||
// p: "د ... په شان",
|
||||
// f: "du ... pu shaan",
|
||||
// },
|
||||
// out: "د" + zwarakey + "... پهٔ شان",
|
||||
// },
|
||||
],
|
||||
},
|
||||
];
|
||||
|
||||
phonemeSplits.forEach((s) => {
|
||||
test(`${s.in} should split properly`, () => {
|
||||
const result = splitFIntoPhonemes(s.in);
|
||||
expect(result).toEqual(s.out);
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
diacriticsTest.forEach((t) => {
|
||||
// Registers one jest test per fixture, grouped in a describe block per section.
// A fixture whose `out` is falsy is expected to make addDiacritics throw.
diacriticsSections.forEach((section) => {
    describe(section.describe, () => {
        section.tests.forEach((t) => {
            // NOTE(review): only the "special behaviour with د" section is registered;
            // every other section is skipped. This looks like a temporary debugging
            // filter — confirm and remove when the other sections should run again.
            if (section.describe !== "special behaviour with د") {
                return;
            }
            if (t.out) {
                test(`diacritics should work for ${t.in.p} - ${t.in.f}`, () => {
                    expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f });
                });
            } else {
                // The assertion must live inside a test and must call a matcher on a
                // function wrapper, otherwise nothing is ever checked.
                test(`diacritics should throw for ${t.in.p} - ${t.in.f}`, () => {
                    expect(() => {
                        addDiacritics(t.in);
                    }).toThrow();
                });
            }
        });
    });
});
|
||||
|
||||
// ERRORS
|
||||
|
@ -598,44 +629,23 @@ const brokenDiacritics = [
|
|||
},
|
||||
];
|
||||
|
||||
const badPhonetics: Array<{
|
||||
in: string,
|
||||
problem: string,
|
||||
}> = [
|
||||
{
|
||||
in: "acar",
|
||||
problem: "c",
|
||||
},
|
||||
{
|
||||
in: "a7am",
|
||||
problem: "7",
|
||||
},
|
||||
];
|
||||
// test("ending with left over Pashto script will throw an error", () => {
|
||||
// expect(() => {
|
||||
// addDiacritics({ p: "کور ته", f: "kor" });
|
||||
// }).toThrow(`phonetics error - phonetics shorter than pashto script`);
|
||||
// });
|
||||
|
||||
test("bad phonetic characters should throw an error", () => {
|
||||
badPhonetics.forEach((s) => {
|
||||
expect(() => {
|
||||
splitFIntoPhonemes(s.in);
|
||||
}).toThrow(`illegal phonetic character: ${s.problem}`);
|
||||
});
|
||||
});
|
||||
// test("ending with left over phonetics will throw an error", () => {
|
||||
// expect(() => {
|
||||
// addDiacritics({ p: "کار", f: "kaar kawul" });
|
||||
// }).toThrow();
|
||||
// });
|
||||
|
||||
test("ending with left over Pashto script will throw an error", () => {
|
||||
expect(() => {
|
||||
addDiacritics({ p: "کور ته", f: "kor" });
|
||||
}).toThrow(`phonetics error - phonetics shorter than pashto script`);
|
||||
});
|
||||
// test("adding diacritics errors when phonetecs and pashto do not line up", () => {
|
||||
// brokenDiacritics.forEach((t) => {
|
||||
// expect(() => {
|
||||
// addDiacritics(t);
|
||||
// }).toThrow();
|
||||
// });
|
||||
// });
|
||||
|
||||
test("ending with left over phonetics will throw an error", () => {
|
||||
expect(() => {
|
||||
addDiacritics({ p: "کار", f: "kaar kawul" });
|
||||
}).toThrow();
|
||||
});
|
||||
|
||||
test("adding diacritics errors when phonetecs and pashto do not line up", () => {
|
||||
brokenDiacritics.forEach((t) => {
|
||||
expect(() => {
|
||||
addDiacritics(t);
|
||||
}).toThrow();
|
||||
});
|
||||
});
|
||||
|
|
|
@ -7,255 +7,35 @@
|
|||
*/
|
||||
|
||||
import * as T from "../types";
|
||||
import { removeAccents } from "./accent-helpers";
|
||||
import {
|
||||
splitFIntoPhonemes,
|
||||
Phoneme,
|
||||
phonemeTable,
|
||||
zwar,
|
||||
zwarakey,
|
||||
zer,
|
||||
pesh,
|
||||
sukun,
|
||||
hamzaAbove,
|
||||
tashdeed,
|
||||
wasla,
|
||||
daggerAlif,
|
||||
fathahan,
|
||||
prev2Chars,
|
||||
addP,
|
||||
last,
|
||||
advanceP,
|
||||
reverseP,
|
||||
overwriteP,
|
||||
advanceForAin,
|
||||
advanceForAinOrHamza,
|
||||
advanceForHamzaMid,
|
||||
DiacriticsAccumulator,
|
||||
} from "./diacritics-helpers";
|
||||
|
||||
import { firstPhonetics } from "./p-text-helpers";
|
||||
import { pipe } from "rambda";
|
||||
|
||||
const zwar = "َ";
|
||||
const zwarakey = "ٙ";
|
||||
const zer = "ِ";
|
||||
const pesh = "ُ";
|
||||
const sukun = "ْ";
|
||||
const hamzaAbove = "ٔ";
|
||||
const tashdeed = "ّ";
|
||||
const wasla = "ٱ";
|
||||
const daggerAlif = "ٰ";
|
||||
const fathahan = "ً";
|
||||
|
||||
type Consonant = "b" | "p" | "t" | "T" | "s" | "j" | "ch" | "kh" | "ts" | "dz" | "d" | "D" | "r" | "R" | "z" | "jz" | "G" | "sh" | "x" | "gh" | "f" | "q" | "k" | "g" | "l" | "m" | "n" | "N" | "h" | "w" | "y";
|
||||
type Ain = "'"
|
||||
type JoiningVowel = "-i-" | "-U-" | "-Ul-";
|
||||
type LongVowel = "aa" | "ee" | "e" | "oo" | "o" | "ey" | "uy" | "eyy";
|
||||
type ShortVowel = "a" | "i" | "u" | "U";
|
||||
type Phoneme = Consonant | Ain | LongVowel | ShortVowel | JoiningVowel;
|
||||
|
||||
type DiacriticsAccumulator = { pIn: string, pOut: string };
|
||||
|
||||
type PhonemeInfo = {
|
||||
matches?: string[],
|
||||
beginningMatches?: string[],
|
||||
endingMatches?: string[],
|
||||
consonant?: true,
|
||||
diacritic?: string,
|
||||
endingOnly?: true,
|
||||
takesSukunOnEnding?: true,
|
||||
longVowel?: true,
|
||||
canStartWithAynBefore?: true,
|
||||
useEndingDiacritic?: true,
|
||||
}
|
||||
|
||||
const phonemeTable: Record<Phoneme, PhonemeInfo> = {
|
||||
// Consonants
|
||||
"b": {
|
||||
matches: ["ب"],
|
||||
consonant: true,
|
||||
},
|
||||
"p": {
|
||||
matches: ["پ"],
|
||||
consonant: true,
|
||||
},
|
||||
"t": {
|
||||
matches: ["ت", "ط"],
|
||||
consonant: true,
|
||||
},
|
||||
"T": {
|
||||
matches: ["ټ"],
|
||||
consonant: true,
|
||||
},
|
||||
"s": {
|
||||
matches: ["س", "ص", "ث"],
|
||||
consonant: true,
|
||||
},
|
||||
"j": {
|
||||
matches: ["ج"],
|
||||
consonant: true,
|
||||
},
|
||||
"ch": {
|
||||
matches: ["چ"],
|
||||
consonant: true,
|
||||
},
|
||||
"kh": {
|
||||
matches: ["خ"],
|
||||
consonant: true,
|
||||
},
|
||||
"ts": {
|
||||
matches: ["څ"],
|
||||
consonant: true,
|
||||
},
|
||||
"dz": {
|
||||
matches: ["ځ"],
|
||||
consonant: true,
|
||||
},
|
||||
"d": {
|
||||
matches: ["د"],
|
||||
consonant: true,
|
||||
},
|
||||
"D": {
|
||||
matches: ["ډ"],
|
||||
consonant: true,
|
||||
},
|
||||
"r": {
|
||||
matches: ["ر"],
|
||||
consonant: true,
|
||||
},
|
||||
"R": {
|
||||
matches: ["ړ"],
|
||||
consonant: true,
|
||||
},
|
||||
"z": {
|
||||
matches: ["ز", "ذ", "ظ", "ض"],
|
||||
consonant: true,
|
||||
},
|
||||
"jz": {
|
||||
matches: ["ژ"],
|
||||
consonant: true,
|
||||
},
|
||||
"G": {
|
||||
matches: ["ږ"],
|
||||
consonant: true,
|
||||
},
|
||||
"sh": {
|
||||
matches: ["ش"],
|
||||
consonant: true,
|
||||
},
|
||||
"x": {
|
||||
matches: ["ښ"],
|
||||
consonant: true,
|
||||
},
|
||||
"gh": {
|
||||
matches: ["غ"],
|
||||
consonant: true,
|
||||
},
|
||||
"f": {
|
||||
matches: ["ف"],
|
||||
consonant: true,
|
||||
},
|
||||
"q": {
|
||||
matches: ["ق"],
|
||||
consonant: true,
|
||||
},
|
||||
"k": {
|
||||
matches: ["ک"],
|
||||
consonant: true,
|
||||
},
|
||||
"g": {
|
||||
matches: ["ګ"],
|
||||
consonant: true,
|
||||
},
|
||||
"l": {
|
||||
matches: ["ل"],
|
||||
consonant: true,
|
||||
},
|
||||
"m": {
|
||||
matches: ["م"],
|
||||
consonant: true,
|
||||
},
|
||||
"n": {
|
||||
matches: ["ن"],
|
||||
consonant: true,
|
||||
},
|
||||
"N": {
|
||||
matches: ["ڼ"],
|
||||
consonant: true,
|
||||
},
|
||||
"h": {
|
||||
matches: ["ه", "ح"],
|
||||
consonant: true,
|
||||
takesSukunOnEnding: true,
|
||||
},
|
||||
"w": {
|
||||
matches: ["و"],
|
||||
consonant: true,
|
||||
},
|
||||
"y": {
|
||||
matches: ["ی"],
|
||||
consonant: true,
|
||||
},
|
||||
// Ain
|
||||
"'": {
|
||||
matches: ["ع", "ئ"],
|
||||
consonant: true,
|
||||
},
|
||||
// Joining Vowels
|
||||
"-i-": {
|
||||
},
|
||||
"-U-": {
|
||||
matches: [" و ", "و"],
|
||||
},
|
||||
"-Ul-": {
|
||||
matches: ["ال"],
|
||||
},
|
||||
// Long Vowels
|
||||
"aa": {
|
||||
matches: ["ا"],
|
||||
beginningMatches: ["آ", "ا"],
|
||||
endingMatches: ["ا", "یٰ"],
|
||||
longVowel: true,
|
||||
},
|
||||
"ee": {
|
||||
matches: ["ی"],
|
||||
longVowel: true,
|
||||
endingMatches: ["ي"],
|
||||
diacritic: zer,
|
||||
canStartWithAynBefore: true
|
||||
},
|
||||
"e": {
|
||||
matches: ["ې"],
|
||||
longVowel: true,
|
||||
},
|
||||
"o": {
|
||||
matches: ["و"],
|
||||
longVowel: true,
|
||||
},
|
||||
"oo": {
|
||||
matches: ["و"],
|
||||
longVowel: true,
|
||||
// alsoCanBePrefix: true,
|
||||
diacritic: pesh,
|
||||
useEndingDiacritic: true,
|
||||
},
|
||||
"ey": {
|
||||
matches: ["ی"],
|
||||
longVowel: true,
|
||||
endingMatches: ["ی"],
|
||||
},
|
||||
"uy": {
|
||||
matches: ["ۍ"],
|
||||
longVowel: true,
|
||||
endingOnly: true,
|
||||
},
|
||||
"eyy": {
|
||||
matches: ["ئ"],
|
||||
longVowel: true,
|
||||
endingOnly: true,
|
||||
},
|
||||
// Short Vowels
|
||||
"a": {
|
||||
diacritic: zwar,
|
||||
endingMatches: ["ه"],
|
||||
beginningMatches: ["ا", "ع"],
|
||||
// canComeAfterHeyEnding: true,
|
||||
// canBeFirstPartOfFathahanEnding: true,
|
||||
},
|
||||
"u": {
|
||||
diacritic: zwarakey,
|
||||
endingMatches: ["ه"],
|
||||
},
|
||||
"i": {
|
||||
diacritic: zer,
|
||||
endingMatches: ["ه"],
|
||||
beginningMatches: ["ا", "ع"],
|
||||
// takesDiacriticBeforeGurdaHeyEnding: true,
|
||||
// canBeWasla: true,
|
||||
},
|
||||
"U": {
|
||||
diacritic: pesh,
|
||||
endingMatches: ["ه"],
|
||||
// takesDiacriticBeforeGurdaHeyEnding: true,
|
||||
beginningMatches: ["ا", "ع"],
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds diacritics to a given PsString.
|
||||
* Errors if the phonetics and script don't line up.
|
||||
|
@ -272,61 +52,6 @@ const phonemeTable: Record<Phoneme, PhonemeInfo> = {
|
|||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* splits a phonetics string into an array of Phonemes
|
||||
*
|
||||
* will error if there is an illeagal phonetics character
|
||||
*
|
||||
* @param fIn a phonetics string
|
||||
* @returns an array of phonemes
|
||||
*/
|
||||
export function splitFIntoPhonemes(fIn: string): Phoneme[] {
|
||||
const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y"];
|
||||
|
||||
const quadrigraphs: Phoneme[] = ["-Ul-"];
|
||||
const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"];
|
||||
const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"];
|
||||
const endingDigraphs: Phoneme[] = ["uy"];
|
||||
const willIgnore = ["?", " ", "`", ".", "…", ",", "'"];
|
||||
|
||||
const result: Phoneme[] = [];
|
||||
const f = removeAccents(fIn);
|
||||
let index = 0;
|
||||
while (index < f.length) {
|
||||
const isLastTwoLetters = (index === f.length - 2 || f[index + 2] === " ");
|
||||
const threeLetterChunk = f.slice(index, index + 3) as Phoneme;
|
||||
const fourLetterChunk = f.slice(index, index + 4) as Phoneme;
|
||||
if (quadrigraphs.includes(fourLetterChunk)) {
|
||||
result.push(fourLetterChunk);
|
||||
index += 4;
|
||||
continue;
|
||||
}
|
||||
if (trigraphs.includes(threeLetterChunk)) {
|
||||
result.push(threeLetterChunk);
|
||||
index += 3;
|
||||
continue;
|
||||
}
|
||||
const twoLetterChunk = f.slice(index, index + 2) as Phoneme;
|
||||
if (
|
||||
digraphs.includes(twoLetterChunk) ||
|
||||
(isLastTwoLetters && endingDigraphs.includes(twoLetterChunk))
|
||||
) {
|
||||
result.push(twoLetterChunk);
|
||||
index += 2;
|
||||
continue;
|
||||
}
|
||||
const singleLetter = f.slice(index, index + 1) as Phoneme;
|
||||
if (!willIgnore.includes(singleLetter)) {
|
||||
if (!singleLetterPhonemes.includes(singleLetter)) {
|
||||
throw new Error(`illegal phonetic character: ${singleLetter}`);
|
||||
}
|
||||
result.push(singleLetter);
|
||||
}
|
||||
index++;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
enum PhonemeStatus {
|
||||
LeadingLongVowel,
|
||||
LeadingConsonantOrShortVowel,
|
||||
|
@ -337,6 +62,7 @@ enum PhonemeStatus {
|
|||
PersianSilentWWithAa,
|
||||
ArabicWasla,
|
||||
Izafe,
|
||||
EndOfDuParticle,
|
||||
}
|
||||
|
||||
function processPhoneme(
|
||||
|
@ -349,7 +75,9 @@ function processPhoneme(
|
|||
// console.log("space coming up", acc.pIn[0] === " ");
|
||||
// console.log("state", acc);
|
||||
// Prep state
|
||||
const state = acc.pIn[0] === " " ? advanceP(acc) : acc;
|
||||
const state = acc.pIn[0] === " "
|
||||
? advanceP(acc)
|
||||
: acc;
|
||||
// console.log("AFTER SPACE PREP", phoneme);
|
||||
// console.log("state", state);
|
||||
// WARNING: Do not use acc after this point!
|
||||
|
@ -403,6 +131,11 @@ function processPhoneme(
|
|||
reverseP,
|
||||
addP(zer),
|
||||
)(state)
|
||||
: (phs === PhonemeStatus.EndOfDuParticle) ?
|
||||
(console.log("here"), pipe(
|
||||
reverseP,
|
||||
addP(zwarakey),
|
||||
)(state))
|
||||
:
|
||||
// phs === PhonemeState.ShortVowel
|
||||
pipe(
|
||||
|
@ -444,6 +177,11 @@ function stateInfo({ state, i, phonemes, phoneme }: {
|
|||
if (isBeginningOfWord && (phonemeInfo.beginningMatches?.includes(currentPLetter) || phonemeInfo.matches?.includes(currentPLetter))) {
|
||||
return PhonemeStatus.LeadingConsonantOrShortVowel;
|
||||
}
|
||||
console.log(phoneme, phonemes, prev2Chars(state.pOut))
|
||||
if (isBeginningOfWord && phoneme === "u" && prevPLetter === " " && prev2Chars(state.pOut) === ("د" + zwarakey)) {
|
||||
// console.log("du here", phoneme, phonemes);
|
||||
return PhonemeStatus.EndOfDuParticle
|
||||
}
|
||||
if (!isBeginningOfWord && phoneme === "aa" && currentPLetter === "و" && nextPLetter === "ا") {
|
||||
return PhonemeStatus.PersianSilentWWithAa;
|
||||
}
|
||||
|
@ -465,6 +203,7 @@ function stateInfo({ state, i, phonemes, phoneme }: {
|
|||
if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {
|
||||
return PhonemeStatus.ShortVowel;
|
||||
}
|
||||
// console.log("bad phoneme is ", phoneme);
|
||||
throw new Error("phonetics error - no status found for phoneme: " + phoneme);
|
||||
}
|
||||
|
||||
|
@ -474,70 +213,3 @@ function stateInfo({ state, i, phonemes, phoneme }: {
|
|||
phs, phonemeInfo, sukunOrDiacritic,
|
||||
};
|
||||
};
|
||||
|
||||
/**
|
||||
* returns the last character of a string
|
||||
*
|
||||
* @param s
|
||||
*/
|
||||
function last(s: string) {
|
||||
return s[s.length - 1];
|
||||
}
|
||||
|
||||
function advanceP(state: DiacriticsAccumulator, n: number = 1): DiacriticsAccumulator {
|
||||
return {
|
||||
pIn: state.pIn.slice(n),
|
||||
pOut: state.pOut + state.pIn.slice(0, n),
|
||||
};
|
||||
}
|
||||
|
||||
function reverseP(state: DiacriticsAccumulator): DiacriticsAccumulator {
|
||||
return {
|
||||
pIn: state.pOut.slice(-1) + state.pIn,
|
||||
pOut: state.pOut.slice(0, -1),
|
||||
};
|
||||
}
|
||||
|
||||
const addP = (toAdd: string | undefined) => (state: DiacriticsAccumulator): DiacriticsAccumulator => {
|
||||
return {
|
||||
...state,
|
||||
pOut: toAdd ? (state.pOut + toAdd) : state.pOut,
|
||||
};
|
||||
};
|
||||
|
||||
const overwriteP = (toWrite: string) => (state: DiacriticsAccumulator): DiacriticsAccumulator => {
|
||||
return {
|
||||
pIn: state.pIn.slice(1),
|
||||
pOut: state.pOut + toWrite,
|
||||
};
|
||||
};
|
||||
|
||||
function getCurrentNext(state: DiacriticsAccumulator): { current: string, next: string} {
|
||||
return {
|
||||
current: state.pIn[0],
|
||||
next: state.pIn[1],
|
||||
};
|
||||
}
|
||||
|
||||
function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator {
|
||||
const { current } = getCurrentNext(state);
|
||||
return (current === "ع") ? advanceP(state) : state;
|
||||
}
|
||||
|
||||
function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccumulator {
|
||||
const { current, next } = getCurrentNext(state);
|
||||
if (current === "ئ" && next && next !== "ئ") {
|
||||
return advanceP(state);
|
||||
}
|
||||
return state;
|
||||
}
|
||||
function advanceForAinOrHamza(state: DiacriticsAccumulator): DiacriticsAccumulator {
|
||||
const { current, next } = getCurrentNext(state);
|
||||
if (current === "ه" && (!next || next === " ")) {
|
||||
return advanceP(state);
|
||||
}
|
||||
if (current === "ع") {
|
||||
return advanceP(state);
|
||||
}
|
||||
return state;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue