/**
 * Copyright (c) 2021 lingdocs.com
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 *
 */

import * as T from "../types";
import { removeAccents } from "./accent-helpers";
import { firstPhonetics } from "./p-text-helpers";
import { pipe } from "rambda";

// Diacritic marks and special letters written into the output script.
const zwar = "َ";
const zwarakey = "ٙ";
const zer = "ِ";
const pesh = "ُ";
const sukun = "ْ";
const hamzaAbove = "ٔ";
const tashdeed = "ّ";
const wasla = "ٱ";
// NOTE(review): daggerAlif and fathahan are not read anywhere in this file.
const daggerAlif = "ٰ";
const fathahan = "ً";

type Consonant = "b" | "p" | "t" | "T" | "s" | "j" | "ch" | "kh" | "ts" | "dz" | "d" | "D" | "r" | "R" | "z" | "jz" | "G" | "sh" | "x" | "gh" | "f" | "q" | "k" | "g" | "l" | "m" | "n" | "N" | "h" | "w" | "y";
type Ain = "'"
type JoiningVowel = "-i-" | "-U-" | "-Ul-";
type LongVowel = "aa" | "ee" | "e" | "oo" | "o" | "ey" | "uy" | "eyy";
type ShortVowel = "a" | "i" | "u" | "U";
type Phoneme = Consonant | Ain | LongVowel | ShortVowel | JoiningVowel;

/**
 * State threaded through the phoneme reducer:
 * `pIn` is the script still to be consumed, `pOut` is the script (with diacritics) produced so far.
 */
type DiacriticsAccumulator = { pIn: string, pOut: string };

/**
 * Per-phoneme lookup data used when lining phonetics up with the script.
 */
type PhonemeInfo = {
    /** script letters this phoneme can correspond to anywhere in a word */
    matches?: string[],
    /** script letters this phoneme can correspond to at the start of a word */
    beginningMatches?: string[],
    /** script letters this phoneme can correspond to at the end of a word */
    endingMatches?: string[],
    /** marks the phoneme as a consonant (used for sukun / tashdeed detection) */
    consonant?: true,
    /** the diacritic written for this phoneme */
    diacritic?: string,
    /** long vowels flagged with this are not treated as word-leading long vowels */
    endingOnly?: true,
    // NOTE(review): not read anywhere in this file
    takesSukunOnEnding?: true,
    /** marks the phoneme as a long vowel */
    longVowel?: true,
    // NOTE(review): not read anywhere in this file
    canStartWithAynBefore?: true,
    /** a long vowel with this flag keeps its diacritic at the end of a word */
    useEndingDiacritic?: true,
}

const phonemeTable: Record<Phoneme, PhonemeInfo> = {
    // Consonants
    "b": {
        matches: ["ب"],
        consonant: true,
    },
    "p": {
        matches: ["پ"],
        consonant: true,
    },
    "t": {
        matches: ["ت", "ط"],
        consonant: true,
    },
    "T": {
        matches: ["ټ"],
        consonant: true,
    },
    "s": {
        matches: ["س", "ص", "ث"],
        consonant: true,
    },
    "j": {
        matches: ["ج"],
        consonant: true,
    },
    "ch": {
        matches: ["چ"],
        consonant: true,
    },
    "kh": {
        matches: ["خ"],
        consonant: true,
    },
    "ts": {
        matches: ["څ"],
        consonant: true,
    },
    "dz": {
        matches: ["ځ"],
        consonant: true,
    },
    "d": {
        matches: ["د"],
        consonant: true,
    },
    "D": {
        matches: ["ډ"],
        consonant: true,
    },
    "r": {
        matches: ["ر"],
        consonant: true,
    },
    "R": {
        matches: ["ړ"],
        consonant: true,
    },
    "z": {
        matches: ["ز", "ذ", "ظ", "ض"],
        consonant: true,
    },
    "jz": {
        matches: ["ژ"],
        consonant: true,
    },
    "G": {
        matches: ["ږ"],
        consonant: true,
    },
    "sh": {
        matches: ["ش"],
        consonant: true,
    },
    "x": {
        matches: ["ښ"],
        consonant: true,
    },
    "gh": {
        matches: ["غ"],
        consonant: true,
    },
    "f": {
        matches: ["ف"],
        consonant: true,
    },
    "q": {
        matches: ["ق"],
        consonant: true,
    },
    "k": {
        matches: ["ک"],
        consonant: true,
    },
    "g": {
        matches: ["ګ"],
        consonant: true,
    },
    "l": {
        matches: ["ل"],
        consonant: true,
    },
    "m": {
        matches: ["م"],
        consonant: true,
    },
    "n": {
        matches: ["ن"],
        consonant: true,
    },
    "N": {
        matches: ["ڼ"],
        consonant: true,
    },
    "h": {
        matches: ["ه", "ح"],
        consonant: true,
        takesSukunOnEnding: true,
    },
    "w": {
        matches: ["و"],
        consonant: true,
    },
    "y": {
        matches: ["ی"],
        consonant: true,
    },
    // Ain
    "'": {
        matches: ["ع", "ئ"],
        consonant: true,
    },
    // Joining Vowels
    "-i-": {
    },
    "-U-": {
        matches: [" و ", "و"],
    },
    "-Ul-": {
        matches: ["ال"],
    },
    // Long Vowels
    "aa": {
        matches: ["ا"],
        beginningMatches: ["آ", "ا"],
        endingMatches: ["ا", "یٰ"],
        longVowel: true,
    },
    "ee": {
        matches: ["ی"],
        longVowel: true,
        endingMatches: ["ي"],
        diacritic: zer,
        canStartWithAynBefore: true
    },
    "e": {
        matches: ["ې"],
        longVowel: true,
    },
    "o": {
        matches: ["و"],
        longVowel: true,
    },
    "oo": {
        matches: ["و"],
        longVowel: true,
        // alsoCanBePrefix: true,
        diacritic: pesh,
        useEndingDiacritic: true,
    },
    "ey": {
        matches: ["ی"],
        longVowel: true,
        endingMatches: ["ی"],
    },
    "uy": {
        matches: ["ۍ"],
        longVowel: true,
        endingOnly: true,
    },
    "eyy": {
        matches: ["ئ"],
        longVowel: true,
        endingOnly: true,
    },
    // Short Vowels
    "a": {
        diacritic: zwar,
        endingMatches: ["ه"],
        beginningMatches: ["ا", "ع"],
        // canComeAfterHeyEnding: true,
        // canBeFirstPartOfFathahanEnding: true,
    },
    "u": {
        diacritic: zwarakey,
        endingMatches: ["ه"],
    },
    "i": {
        diacritic: zer,
        endingMatches: ["ه"],
        beginningMatches: ["ا", "ع"],
        // takesDiacriticBeforeGurdaHeyEnding: true,
        // canBeWasla: true,
    },
    "U": {
        diacritic: pesh,
        endingMatches: ["ه"],
        // takesDiacriticBeforeGurdaHeyEnding: true,
        beginningMatches: ["ا", "ع"],
    },
}

/**
 * Adds diacritics to a given PsString.
 * Errors if the phonetics and script don't line up.
 *
 * @param param0 the script (`p`) and phonetics (`f`) to line up
 * @param ignoreCommas when set, `f` is used as-is; otherwise it is first passed
 * through `firstPhonetics` (presumably to keep only the first of several
 * comma-separated variants - confirm against that helper)
 * @returns a PsString whose `p` carries the diacritics and whose `f` is the original `f`
 * @throws if script is left over once every phoneme has been processed, or if
 * a phoneme cannot be lined up with the script
 */
export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsString {
    const phonemes: Phoneme[] = splitFIntoPhonemes(!ignoreCommas ? firstPhonetics(f) : f);
    const { pIn, pOut } = phonemes.reduce(processPhoneme, { pOut: "", pIn: p });
    // every script letter must have been consumed by some phoneme
    if (pIn !== "") {
        throw new Error("phonetics error - phonetics shorter than pashto script");
    }
    return {
        p: pOut,
        f,
    };
}

/**
 * splits a phonetics string into an array of Phonemes
 *
 * will error if there is an illegal phonetics character
 *
 * @param fIn a phonetics string
 * @returns an array of phonemes
 * @throws if a character is neither a known phoneme nor in the ignore list
 */
export function splitFIntoPhonemes(fIn: string): Phoneme[] {
    const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y"];

    const quadrigraphs: Phoneme[] = ["-Ul-"];
    const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"];
    const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"];
    // only recognised as a digraph when it closes a word
    const endingDigraphs: Phoneme[] = ["uy"];
    // characters silently skipped; note that "'" is skipped here, so it never appears in the result
    const willIgnore = ["?", " ", "`", ".", "…", ",", "'"];

    const result: Phoneme[] = [];
    const f = removeAccents(fIn);
    let index = 0;
    // greedy scan: always try the longest chunk first
    while (index < f.length) {
        const isLastTwoLetters = (index === f.length - 2 || f[index + 2] === " ");
        const threeLetterChunk = f.slice(index, index + 3) as Phoneme;
        const fourLetterChunk = f.slice(index, index + 4) as Phoneme;
        if (quadrigraphs.includes(fourLetterChunk)) {
            result.push(fourLetterChunk);
            index += 4;
            continue;
        }
        if (trigraphs.includes(threeLetterChunk)) {
            result.push(threeLetterChunk);
            index += 3;
            continue;
        }
        const twoLetterChunk = f.slice(index, index + 2) as Phoneme;
        if (
            digraphs.includes(twoLetterChunk) ||
            (isLastTwoLetters && endingDigraphs.includes(twoLetterChunk))
        ) {
            result.push(twoLetterChunk);
            index += 2;
            continue;
        }
        const singleLetter = f.slice(index, index + 1) as Phoneme;
        if (!willIgnore.includes(singleLetter)) {
            if (!singleLetterPhonemes.includes(singleLetter)) {
                throw new Error(`illegal phonetic character: ${singleLetter}`);
            }
            result.push(singleLetter);
        }
        index++;
    }
    return result;
}

/**
 * The situations a phoneme can be in relative to the script at the current
 * position; each one maps to a different sequence of accumulator steps in processPhoneme.
 */
enum PhonemeStatus {
    LeadingLongVowel,
    LeadingConsonantOrShortVowel,
    DoubleConsonantTashdeed,
    EndingWithHeyHim,
    DirectMatch,
    ShortVowel,
    PersianSilentWWithAa,
    ArabicWasla,
    Izafe,
}

/**
 * Reducer step: consumes the script that corresponds to one phoneme and
 * appends it, together with any diacritic, to the output.
 *
 * @param acc the accumulator produced by the previous phoneme
 * @param phoneme the phoneme being processed
 * @param i index of `phoneme` within `phonemes`
 * @param phonemes the full phoneme list
 * @returns the accumulator after this phoneme
 * @throws if no status can be determined for the phoneme (see stateInfo)
 */
function processPhoneme(
    acc: DiacriticsAccumulator,
    phoneme: Phoneme,
    i: number,
    phonemes: Phoneme[],
): DiacriticsAccumulator {
    // Prep state: a space waiting in the input is copied straight to the output
    const state = acc.pIn[0] === " "
        ? advanceP(acc)
        : acc;
    // WARNING: Do not use acc after this point!

    const {
        phonemeInfo,
        sukunOrDiacritic,
        phs,
    } = stateInfo({ state, i, phoneme, phonemes });

    switch (phs) {
        case PhonemeStatus.LeadingLongVowel:
            // two script letters are consumed, with the diacritic placed between them
            return pipe(
                advanceP,
                addP(phonemeInfo.diacritic),
                advanceP,
            )(state);
        case PhonemeStatus.LeadingConsonantOrShortVowel:
            return pipe(
                advanceP,
                addP(sukunOrDiacritic),
                advanceForAin,
            )(state);
        case PhonemeStatus.DoubleConsonantTashdeed:
            // no script letter is consumed; only the tashdeed mark is appended
            return addP(tashdeed)(state);
        case PhonemeStatus.EndingWithHeyHim:
            return pipe(
                advanceP,
                addP(phoneme === "u" ? hamzaAbove : sukun),
            )(state);
        case PhonemeStatus.DirectMatch:
            return pipe(
                addP(sukunOrDiacritic),
                advanceP,
            )(state);
        case PhonemeStatus.PersianSilentWWithAa:
            // the first of the two letters is wrapped in parentheses in the output
            return pipe(
                addP("("),
                advanceP,
                addP(")"),
                advanceP,
            )(state);
        case PhonemeStatus.ArabicWasla:
            // zer is appended, then the current letter is replaced by the wasla
            return pipe(
                addP(zer),
                overwriteP(wasla),
            )(state);
        case PhonemeStatus.Izafe:
            // step back one output character (pushing it back onto the input) and add zer
            return pipe(
                reverseP,
                addP(zer),
            )(state);
        case PhonemeStatus.ShortVowel:
        default:
            return pipe(
                advanceForHamzaMid,
                addP(phonemeInfo.diacritic),
                advanceForAinOrHamza,
            )(state);
    }
}

/**
 * Inspects the accumulator and the surrounding phonemes to work out how the
 * current phoneme relates to the script at the current position.
 *
 * @returns the phoneme's table entry, the mark to write for it (sukun, its
 * diacritic, or undefined), and its PhonemeStatus
 * @throws if a word-leading long vowel lacks the expected alef prefix, or if
 * no status fits the phoneme
 */
function stateInfo({ state, i, phonemes, phoneme }: {
    state: DiacriticsAccumulator,
    i: number,
    phonemes: Phoneme[],
    phoneme: Phoneme,
}) {
    const prevPLetter = last(state.pOut);
    const currentPLetter = state.pIn[0];
    const nextPLetter = state.pIn[1];
    const isBeginningOfWord = state.pOut === "" || prevPLetter === " ";
    const isEndOfWord = !nextPLetter || nextPLetter === " ";
    const phonemeInfo = phonemeTable[phoneme];
    const previousPhoneme = i > 0 && phonemes[i-1];
    // the previous phoneme only counts when it belongs to the same word
    const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]];
    // const nextPhoneme = (phonemes.length > (i + 1)) && phonemes[i+1];
    // const nextPhonemeInfo = nextPhoneme ? phonemeTable[nextPhoneme] : undefined;
    const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
    // same consonant twice in the phonetics but not written again in the script
    const needsTashdeed = !isBeginningOfWord && doubleConsonant && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter);
    const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter));
    // at the end of a word a long vowel drops its diacritic unless flagged useEndingDiacritic
    const diacritic = isEndOfWord
        ? ((!phonemeInfo.longVowel || phonemeInfo.useEndingDiacritic) ? phonemeInfo.diacritic : undefined)
        : phonemeInfo.diacritic;
    const sukunOrDiacritic = (needsSukun ? sukun : diacritic);

    // the checks below are order-dependent: the first one that applies wins
    function getPhonemeState(): PhonemeStatus {
        if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) {
            if (phoneme !== "aa" && currentPLetter !== "ا" && !phonemeInfo.matches?.includes(nextPLetter)) {
                throw new Error("phonetics error - needs alef prefix");
            }
            return PhonemeStatus.LeadingLongVowel;
        }
        if (isBeginningOfWord && (phonemeInfo.beginningMatches?.includes(currentPLetter) || phonemeInfo.matches?.includes(currentPLetter))) {
            return PhonemeStatus.LeadingConsonantOrShortVowel;
        }
        if (!isBeginningOfWord && phoneme === "aa" && currentPLetter === "و" && nextPLetter === "ا") {
            return PhonemeStatus.PersianSilentWWithAa;
        }
        if (!isBeginningOfWord && phoneme === "i" && currentPLetter === "ا" && nextPLetter === "ل") {
            return PhonemeStatus.ArabicWasla;
        }
        if (phoneme === "-i-" && isBeginningOfWord) {
            return PhonemeStatus.Izafe;
        }
        if (needsTashdeed) {
            return PhonemeStatus.DoubleConsonantTashdeed;
        }
        if (isEndOfWord && ((phoneme === "u" && currentPLetter === "ه") || (phoneme === "h" && ["ه", "ح"].includes(currentPLetter)))) {
            return PhonemeStatus.EndingWithHeyHim;
        }
        if ((phonemeInfo.matches?.includes(currentPLetter) || (isEndOfWord && phonemeInfo.endingMatches?.includes(currentPLetter)) || (phoneme === "m" && currentPLetter === "ن" && nextPLetter === "ب"))) {
            return PhonemeStatus.DirectMatch;
        }
        if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {
            return PhonemeStatus.ShortVowel;
        }
        throw new Error("phonetics error - no status found for phoneme: " + phoneme);
    }

    const phs = getPhonemeState();

    return {
        phs, phonemeInfo, sukunOrDiacritic,
    };
}

/**
 * returns the last character of a string (undefined for an empty string)
 *
 * @param s
 */
function last(s: string): string | undefined {
    return s[s.length - 1];
}

/**
 * Moves the first `n` characters of the remaining input onto the end of the output.
 *
 * @param state the accumulator to advance
 * @param n how many characters to move (defaults to 1)
 */
function advanceP(state: DiacriticsAccumulator, n: number = 1): DiacriticsAccumulator {
    return {
        pIn: state.pIn.slice(n),
        pOut: state.pOut + state.pIn.slice(0, n),
    };
}

/**
 * Moves the last character of the output back onto the front of the input.
 */
function reverseP(state: DiacriticsAccumulator): DiacriticsAccumulator {
    return {
        pIn: state.pOut.slice(-1) + state.pIn,
        pOut: state.pOut.slice(0, -1),
    };
}

/**
 * Builds a step that appends `toAdd` to the output without consuming any
 * input; an undefined or empty `toAdd` leaves the output as it was.
 */
const addP = (toAdd: string | undefined) => (state: DiacriticsAccumulator): DiacriticsAccumulator => {
    return {
        ...state,
        pOut: toAdd ? (state.pOut + toAdd) : state.pOut,
    };
};

/**
 * Builds a step that drops the first input character and appends `toWrite`
 * to the output in its place.
 */
const overwriteP = (toWrite: string) => (state: DiacriticsAccumulator): DiacriticsAccumulator => {
    return {
        pIn: state.pIn.slice(1),
        pOut: state.pOut + toWrite,
    };
};

/**
 * Returns the first two characters of the remaining input; either is
 * undefined when the input is too short.
 */
function getCurrentNext(state: DiacriticsAccumulator): { current: string | undefined, next: string | undefined } {
    return {
        current: state.pIn[0],
        next: state.pIn[1],
    };
}

/**
 * Advances past the current input letter when it is "ع"; otherwise returns the state unchanged.
 */
function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator {
    const { current } = getCurrentNext(state);
    return (current === "ع") ? advanceP(state) : state;
}

/**
 * Advances past a "ئ" when it is followed by a letter other than another "ئ";
 * otherwise returns the state unchanged.
 */
function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccumulator {
    const { current, next } = getCurrentNext(state);
    if (current === "ئ" && next && next !== "ئ") {
        return advanceP(state);
    }
    return state;
}

/**
 * Advances past the current input letter when it is a word-final "ه" (nothing
 * or a space follows it) or a "ع"; otherwise returns the state unchanged.
 */
function advanceForAinOrHamza(state: DiacriticsAccumulator): DiacriticsAccumulator {
    const { current, next } = getCurrentNext(state);
    if (current === "ه" && (!next || next === " ")) {
        return advanceP(state);
    }
    if (current === "ع") {
        return advanceP(state);
    }
    return state;
}