pashto-inflector/src/lib/diacritics-helpers.ts

574 lines
18 KiB
TypeScript
Raw Normal View History

/**
* Copyright (c) 2021 lingdocs.com
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*
*/
import { removeAccents } from "./accent-helpers";
export type DiacriticsAccumulator = { pIn: string, pOut: string };
type Consonant = "b" | "p" | "t" | "T" | "s" | "j" | "ch" | "kh" | "ts" | "dz" | "d" | "D" | "r" | "R" | "z" | "jz" | "G" | "sh" | "x" | "gh" | "f" | "q" | "k" | "g" | "l" | "m" | "n" | "N" | "h" | "w" | "y";
type Ain = "'"
type JoiningVowel = "-i-" | "-U-" | "-Ul-";
type LongVowel = "aa" | "ee" | "e" | "oo" | "o" | "ey" | "uy" | "eyy";
type ShortVowel = "a" | "i" | "u" | "U";
export type Phoneme = Consonant | Ain | LongVowel | ShortVowel | JoiningVowel;
type PhonemeInfo = {
matches?: string[],
beginningMatches?: string[],
endingMatches?: string[],
consonant?: true,
diacritic?: string,
endingOnly?: true,
takesSukunOnEnding?: true,
longVowel?: true,
canStartWithAynBefore?: true,
useEndingDiacritic?: true,
2021-05-27 06:36:30 +00:00
ainBlendDiacritic?: string,
}
export const zwar = "َ";
export const zwarakey = "ٙ";
export const zer = "ِ";
export const pesh = "ُ";
export const sukun = "ْ";
export const hamzaAbove = "ٔ";
export const tashdeed = "ّ";
export const wasla = "ٱ";
export const daggerAlif = "ٰ";
export const fathahan = "ً";
export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
// Consonants
"b": {
matches: ["ب"],
consonant: true,
},
"p": {
matches: ["پ"],
consonant: true,
},
"t": {
matches: ["ت", "ط"],
consonant: true,
},
"T": {
matches: ["ټ"],
consonant: true,
},
"s": {
matches: ["س", "ص", "ث"],
consonant: true,
},
"j": {
matches: ["ج"],
consonant: true,
},
"ch": {
matches: ["چ"],
consonant: true,
},
"kh": {
matches: ["خ"],
consonant: true,
},
"ts": {
matches: ["څ"],
consonant: true,
},
"dz": {
matches: ["ځ"],
consonant: true,
},
"d": {
matches: ["د"],
consonant: true,
},
"D": {
matches: ["ډ"],
consonant: true,
},
"r": {
matches: ["ر"],
consonant: true,
},
"R": {
matches: ["ړ"],
consonant: true,
},
"z": {
matches: ["ز", "ذ", "ظ", "ض"],
consonant: true,
},
"jz": {
matches: ["ژ"],
consonant: true,
},
"G": {
matches: ["ږ"],
consonant: true,
},
"sh": {
matches: ["ش"],
consonant: true,
},
"x": {
matches: ["ښ"],
consonant: true,
},
"gh": {
matches: ["غ"],
consonant: true,
},
"f": {
matches: ["ف"],
consonant: true,
},
"q": {
matches: ["ق"],
consonant: true,
},
"k": {
matches: ["ک"],
consonant: true,
},
"g": {
matches: ["ګ"],
consonant: true,
},
"l": {
matches: ["ل"],
consonant: true,
},
"m": {
matches: ["م"],
consonant: true,
},
"n": {
matches: ["ن"],
consonant: true,
},
"N": {
matches: ["ڼ"],
consonant: true,
},
"h": {
matches: ["ه", "ح"],
consonant: true,
takesSukunOnEnding: true,
},
"w": {
matches: ["و"],
consonant: true,
},
"y": {
matches: ["ی"],
consonant: true,
},
// Ain
"'": {
matches: ["ع", "ئ"],
consonant: true,
},
// Joining Vowels
"-i-": {
},
"-U-": {
matches: [" و ", "و"],
},
"-Ul-": {
matches: ["ال"],
},
// Long Vowels
"aa": {
2021-06-05 16:29:35 +00:00
matches: ["ا", "أ"],
beginningMatches: ["آ", "ا"],
endingMatches: ["ا", "یٰ"],
longVowel: true,
2021-05-27 06:36:30 +00:00
ainBlendDiacritic: zwar,
},
"ee": {
matches: ["ی"],
longVowel: true,
endingMatches: ["ي"],
diacritic: zer,
2021-05-27 06:36:30 +00:00
canStartWithAynBefore: true,
ainBlendDiacritic: zer,
},
"e": {
matches: ["ې"],
longVowel: true,
},
"o": {
matches: ["و"],
longVowel: true,
},
"oo": {
matches: ["و"],
longVowel: true,
diacritic: pesh,
useEndingDiacritic: true,
2021-05-27 06:36:30 +00:00
ainBlendDiacritic: pesh,
},
"ey": {
matches: ["ی"],
longVowel: true,
endingMatches: ["ی"],
},
"uy": {
matches: ["ۍ"],
longVowel: true,
endingOnly: true,
},
"eyy": {
matches: ["ئ"],
longVowel: true,
endingOnly: true,
},
// Short Vowels
"a": {
diacritic: zwar,
endingMatches: ["ه"],
beginningMatches: ["ا", "ع"],
// canComeAfterHeyEnding: true,
},
"u": {
diacritic: zwarakey,
endingMatches: ["ه"],
},
"i": {
diacritic: zer,
endingMatches: ["ه"],
beginningMatches: ["ا", "ع"],
// takesDiacriticBeforeGurdaHeyEnding: true,
// canBeWasla: true,
},
"U": {
diacritic: pesh,
endingMatches: ["ه"],
// takesDiacriticBeforeGurdaHeyEnding: true,
beginningMatches: ["ا", "ع"],
},
}
/**
* splits a phonetics string into an array of Phonemes
*
* will error if there is an illeagal phonetics character
*
* @param fIn a phonetics string
* @returns an array of phonemes
*/
export function splitFIntoPhonemes(fIn: string): Phoneme[] {
2021-05-27 06:36:30 +00:00
const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y", "'"];
const quadrigraphs: Phoneme[] = ["-Ul-"];
const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"];
const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"];
const endingDigraphs: Phoneme[] = ["uy"];
2021-06-05 16:29:35 +00:00
const willIgnore = ["?", " ", "`", ".", "…", ",", "-"];
const result: Phoneme[] = [];
2021-06-03 16:12:07 +00:00
const f = removeAccents(fIn).replace(/ă/g, "a");
let index = 0;
while (index < f.length) {
const isLastTwoLetters = (index === f.length - 2 || f[index + 2] === " ");
const threeLetterChunk = f.slice(index, index + 3) as Phoneme;
const fourLetterChunk = f.slice(index, index + 4) as Phoneme;
if (quadrigraphs.includes(fourLetterChunk)) {
result.push(fourLetterChunk);
index += 4;
continue;
}
if (trigraphs.includes(threeLetterChunk)) {
result.push(threeLetterChunk);
index += 3;
continue;
}
const twoLetterChunk = f.slice(index, index + 2) as Phoneme;
if (
digraphs.includes(twoLetterChunk) ||
(isLastTwoLetters && endingDigraphs.includes(twoLetterChunk))
) {
result.push(twoLetterChunk);
index += 2;
continue;
}
const singleLetter = f.slice(index, index + 1) as Phoneme;
if (!willIgnore.includes(singleLetter)) {
if (!singleLetterPhonemes.includes(singleLetter)) {
throw new Error(`illegal phonetic character: ${singleLetter}`);
}
result.push(singleLetter);
}
index++;
}
return result;
}
2021-06-03 13:52:14 +00:00
export enum PhonemeStatus {
LeadingLongVowel,
LeadingConsonantOrShortVowel,
DoubleConsonantTashdeed,
EndingWithHeyHim,
DirectMatch,
DirectMatchAfterSukun,
EndingWithHeyHimFromSukun,
ShortVowel,
PersianSilentWWithAa,
ArabicWasla,
Izafe,
EndOfDuParticle,
ShortAEndingAfterHeem,
AlefDaggarEnding,
AinWithLongAAtBeginning,
LongAinVowelMissingComma,
ShortAinVowelMissingComma,
ShortAinVowelMissingCommaAfterAlefStart,
AinBeginningAfterShortVowel,
AlefWithHamza,
AlefWithHamzaWithGlottalStop,
WoEndingO,
ShortAForAlefBeforeFathatan,
NOnFathatan,
2021-06-03 15:56:08 +00:00
HamzaOnWow,
2021-06-03 16:12:07 +00:00
ArabicDefiniteArticleUl,
2021-06-05 16:29:35 +00:00
OoPrefix,
AlefHamzaBeg,
GlottalStopBeforeOo,
OoAfterGlottalStopOo,
2021-06-03 13:52:14 +00:00
}
export function stateInfo({ state, i, phonemes, phoneme }: {
state: DiacriticsAccumulator,
i: number,
phonemes: Phoneme[],
phoneme: Phoneme,
}) {
const isOutOfWord = (char: string) => !char || char === " ";
const prevPLetter = last(state.pOut);
const currentPLetter = state.pIn[0];
const nextPLetter = state.pIn[1];
const nextPhoneme = phonemes[i+1];
const previousPhoneme = i > 0 && phonemes[i-1];
2021-06-05 16:29:35 +00:00
const lastThreePLetters = last(state.pOut, 3) + last(state.pOut, 2) + prevPLetter;
const isBeginningOfWord = (state.pOut === "" || prevPLetter === " ") || (previousPhoneme === "-Ul-" && prevPLetter === "ل") || (["دَر", "وَر"].includes(lastThreePLetters) || (last(state.pOut, 2) + prevPLetter) === "را");
2021-06-03 16:12:07 +00:00
const isEndOfWord = isOutOfWord(nextPLetter);
const phonemeInfo = phonemeTable[phoneme];
2021-06-03 13:52:14 +00:00
const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]];
// const nextPhoneme = (phonemes.length > (i + 1)) && phonemes[i+1];
// const nextPhonemeInfo = nextPhoneme ? phonemeTable[nextPhoneme] : undefined;
const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter));
const useAinBlendDiacritics = (!isBeginningOfWord && (phonemeInfo.ainBlendDiacritic && currentPLetter === "ع"));
const diacritic = useAinBlendDiacritics
? phonemeInfo.ainBlendDiacritic
: isEndOfWord
? ((!phonemeInfo.longVowel || phonemeInfo.useEndingDiacritic) ? phonemeInfo.diacritic : undefined) : phonemeInfo.diacritic;
const lastWordEndedW = (char: string) => ((prevPLetter === char && !currentPLetter) || (prevPLetter === " " && last(state.pOut, 2) === char));
function getPhonemeState(): PhonemeStatus {
if (isBeginningOfWord && phoneme === "aa" && phonemeInfo.beginningMatches?.includes(currentPLetter)) {
return PhonemeStatus.DirectMatch;
}
2021-06-05 16:29:35 +00:00
if (isBeginningOfWord && phoneme === "oo" && currentPLetter === "و") {
return PhonemeStatus.OoPrefix;
}
2021-06-03 13:52:14 +00:00
if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) {
if (phoneme !== "aa" && currentPLetter !== "ا" && !phonemeInfo.matches?.includes(nextPLetter)) {
throw Error("phonetics error - needs alef prefix");
}
return PhonemeStatus.LeadingLongVowel;
}
if (isBeginningOfWord && (phonemeInfo.beginningMatches?.includes(currentPLetter) || phonemeInfo.matches?.includes(currentPLetter))) {
return PhonemeStatus.LeadingConsonantOrShortVowel;
}
if (isBeginningOfWord && phoneme === "aa" && currentPLetter === "ع" && nextPLetter === "ا") {
return PhonemeStatus.AinWithLongAAtBeginning;
}
// console.log("------");
// console.log("phoneme", phoneme);
// console.log("state", state);
// console.log("prevPLetter is space", prevPLetter === " ");
// console.log("------");
if (isBeginningOfWord && phoneme === "u" && prevPLetter === " " && lastNonWhitespace(state.pOut) === "د") {
return PhonemeStatus.EndOfDuParticle
}
2021-06-03 16:12:07 +00:00
if (isBeginningOfWord && phoneme === "-Ul-" && currentPLetter === "ا" && nextPLetter === "ل") {
return PhonemeStatus.ArabicDefiniteArticleUl;
}
2021-06-05 16:29:35 +00:00
if (phoneme === "a" && nextPhoneme === "'" && phonemes[i+2] === "a" && currentPLetter === "أ") {
return PhonemeStatus.AlefHamzaBeg;
}
2021-06-03 15:56:08 +00:00
if (phoneme === "a" && previousPhoneme === "U" && currentPLetter === "و") {
return PhonemeStatus.HamzaOnWow;
}
2021-06-03 13:52:14 +00:00
if (phoneme === "a" && currentPLetter === "ا" && nextPLetter === fathahan) {
return PhonemeStatus.ShortAForAlefBeforeFathatan;
}
2021-06-05 16:29:35 +00:00
if (phoneme === "'" && currentPLetter === "و" && nextPLetter === "و") {
return PhonemeStatus.GlottalStopBeforeOo;
}
if (phoneme === "oo" && previousPhoneme === "'" && currentPLetter === "و" && prevPLetter === hamzaAbove) {
return PhonemeStatus.OoAfterGlottalStopOo;
}
2021-06-03 13:52:14 +00:00
if (phoneme === "'" && last(state.pOut, 2) === "ع" && isOutOfWord(last(state.pOut, 3))) {
return PhonemeStatus.AinBeginningAfterShortVowel;
}
if (!isBeginningOfWord && phoneme === "aa" && currentPLetter === "و" && nextPLetter === "ا") {
return PhonemeStatus.PersianSilentWWithAa;
}
if (!isBeginningOfWord && phoneme === "i" && currentPLetter === "ا" && nextPLetter === "ل") {
return PhonemeStatus.ArabicWasla;
}
if (phoneme === "-i-" && isBeginningOfWord) {
return PhonemeStatus.Izafe;
}
if (phoneme === "a" && currentPLetter === "أ") {
return PhonemeStatus.AlefWithHamza;
}
if (phoneme === "'" && nextPhoneme === "a" && currentPLetter === "أ") {
return PhonemeStatus.AlefWithHamzaWithGlottalStop;
}
if (currentPLetter === "ع" && phoneme !== "'" && nextPhoneme !== "'") {
if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {
return PhonemeStatus.ShortAinVowelMissingComma;
}
if ((last(state.pOut, 2) === "ا") && isOutOfWord(last(state.pOut, 3))) {
return PhonemeStatus.ShortAinVowelMissingCommaAfterAlefStart;
}
}
if (useAinBlendDiacritics) {
return PhonemeStatus.LongAinVowelMissingComma;
}
2021-06-05 16:29:35 +00:00
if (((!isBeginningOfWord && doubleConsonant) || prevPLetter === " ") && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter)) {
2021-06-03 13:52:14 +00:00
return PhonemeStatus.DoubleConsonantTashdeed;
}
if (phoneme === "aa" && currentPLetter === "ی" && nextPLetter === daggerAlif) {
return PhonemeStatus.AlefDaggarEnding;
}
if (phoneme === "a" && lastWordEndedW("ح")) {
return PhonemeStatus.ShortAEndingAfterHeem;
}
if (isEndOfWord && ((phoneme === "u" && currentPLetter === "ه") || (phoneme === "h" && ["ه", "ح"].includes(currentPLetter)))) {
return needsSukun ? PhonemeStatus.EndingWithHeyHimFromSukun : PhonemeStatus.EndingWithHeyHim;
}
if ((phonemeInfo.matches?.includes(currentPLetter) || (isEndOfWord && phonemeInfo.endingMatches?.includes(currentPLetter)) || (phoneme === "m" && currentPLetter === "ن" && nextPLetter === "ب"))) {
return needsSukun ? PhonemeStatus.DirectMatchAfterSukun : PhonemeStatus.DirectMatch;
}
if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {
return PhonemeStatus.ShortVowel;
}
if (phoneme === "o" && previousPhoneme === "w" && lastWordEndedW("و")) {
return PhonemeStatus.WoEndingO;
}
if (isEndOfWord && phoneme === "n" && currentPLetter === fathahan && prevPLetter === "ا") {
return PhonemeStatus.NOnFathatan;
}
2021-06-05 16:29:35 +00:00
// console.log("errored", "current", phoneme, "next", nextPhoneme);
2021-06-03 13:52:14 +00:00
// console.log("bad phoneme is ", phoneme);
throw new Error("phonetics error - no status found for phoneme: " + phoneme);
}
const phs = getPhonemeState();
return {
phs, phonemeInfo, diacritic, prevPLetter,
};
};
/**
2021-06-03 13:52:14 +00:00
* returns the nth last character of a string
*
* @param s
*/
2021-06-03 13:52:14 +00:00
export function last(s: string, n = 1) {
return s[s.length - n];
}
export function advanceP(state: DiacriticsAccumulator, n: number = 1): DiacriticsAccumulator {
return {
pIn: state.pIn.slice(n),
pOut: state.pOut + state.pIn.slice(0, n),
};
}
/**
* moves back to the last character that wasn't a " " or "."
*
* @param state
* @returns
*/
export function reverseP(state: DiacriticsAccumulator): DiacriticsAccumulator {
const reversed = [...state.pOut].reverse();
const howFar = reversed.findIndex((c) => ![" ", "."].includes(c));
return {
pIn: state.pOut.slice(-howFar) + state.pIn,
pOut: state.pOut.slice(0, -howFar),
};
}
export const addP = (toAdd: string | undefined) => (state: DiacriticsAccumulator): DiacriticsAccumulator => {
return {
...state,
pOut: toAdd ? (state.pOut + toAdd) : state.pOut,
};
};
export const overwriteP = (toWrite: string) => (state: DiacriticsAccumulator): DiacriticsAccumulator => {
return {
pIn: state.pIn.slice(1),
pOut: state.pOut + toWrite,
};
};
/**
2021-05-16 15:00:05 +00:00
* returns the last letter before any whitespace (" " / ".")
*
* @param s
* @returns
*/
2021-05-16 15:00:05 +00:00
export function lastNonWhitespace(s: string): string {
const reversed = [...s].reverse();
const lastIndex = reversed.findIndex((c) => ![" ", "."].includes(c));
2021-05-16 15:00:05 +00:00
const penultimateChar = reversed[lastIndex];
return penultimateChar;
}
export function getCurrentNext(state: DiacriticsAccumulator): { current: string, next: string} {
return {
current: state.pIn[0],
next: state.pIn[1],
};
}
2021-05-27 06:36:30 +00:00
// export function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator {
// const { current } = getCurrentNext(state);
// return (current === "ع") ? advanceP(state) : state;
// }
export function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccumulator {
const { current, next } = getCurrentNext(state);
if (current === "ئ" && next && next !== "ئ") {
return advanceP(state);
}
return state;
}
2021-05-27 06:36:30 +00:00
export function advanceForHamza(state: DiacriticsAccumulator): DiacriticsAccumulator {
const { current, next } = getCurrentNext(state);
if (current === "ه" && (!next || next === " ")) {
return advanceP(state);
}
2021-05-27 06:36:30 +00:00
// if (current === "ع") {
// return advanceP(state);
// }
return state;
}