/** A phonetic string and the phonemes it is expected to split into. */
type SplitCase = { in: string, out: string[] };

/** A phonetic string containing a character that must be rejected. */
type BadPhoneticsCase = { in: string, problem: string };

/** A script/phonetics pair and the script expected once diacritics are added. */
type DiacriticsCase = { in: T.PsString, out: string };

const phonemeSplits: SplitCase[] = [
    { in: "kor", out: ["k", "o", "r"] },
    { in: "raaghey", out: ["r", "aa", "gh", "ey"] },
    { in: "hatsa", out: ["h", "a", "ts", "a"] },
    { in: "ba", out: ["b", "a"] },
    { in: "peydáa", out: ["p", "ey", "d", "aa"] },
    { in: "be kaar", out: ["b", "e", "k", "aa", "r"] },
    { in: "raadzeyy", out: ["r", "aa", "dz", "eyy"] },
    { in: "badanuy ??", out: ["b", "a", "d", "a", "n", "uy"] },
    { in: "tur ... pore", out: ["t", "u", "r", "p", "o", "r", "e"] },
    { in: "daar-Ul-iqaama", out: ["d", "aa", "r", "-Ul-", "i", "q", "aa", "m", "a"] },
];

const badPhonetics: BadPhoneticsCase[] = [
    { in: "acar", problem: "c" },
    { in: "a7am", problem: "7" },
];

const diacriticsTest: DiacriticsCase[] = [
    { in: { p: "کور", f: "kor" }, out: "کور" },
    { in: { p: "کور", f: "koor" }, out: "کُور" },
    { in: { p: "تب", f: "tib" }, out: "تِب" },
    { in: { p: "تب", f: "tab" }, out: "تَب" },
    { in: { p: "تب", f: "tUb" }, out: "تُب" },
    { in: { p: "تب", f: "tub" }, out: "تٙب" },
    { in: { p: "تب", f: "tb" }, out: "تْب" },
    { in: { p: "تلب", f: "tilab" }, out: "تِلَب" },
    { in: { p: "تشناب", f: "tashnaab" }, out: "تَشْناب" },
];

// Pairs whose phonetics do not correspond to the script at all.
const brokenDiacritics = [
    { p: "تشناب", f: "peshnaab" },
];

// One jest test per split case, so a failure names the offending input.
for (const splitCase of phonemeSplits) {
    test(`${splitCase.in} should split properly`, () => {
        expect(splitFIntoPhonemes(splitCase.in)).toEqual(splitCase.out);
    });
}

test("bad phonetic characters should throw an error", () => {
    for (const badCase of badPhonetics) {
        const attempt = () => {
            splitFIntoPhonemes(badCase.in);
        };
        expect(attempt).toThrow(`illegal phonetic character: ${badCase.problem}`);
    }
});

test("adding diacritics should work", () => {
    for (const diacriticsCase of diacriticsTest) {
        const expected = { p: diacriticsCase.out, f: diacriticsCase.in.f };
        expect(addDiacritics(diacriticsCase.in)).toEqual(expected);
    }
});

test("adding diacritics errors when phonetecs and pashto do not line up", () => {
    for (const mismatched of brokenDiacritics) {
        const attempt = () => {
            addDiacritics(mismatched);
        };
        expect(attempt).toThrow();
    }
});
// Diacritic characters that can be written on Pashto script, named by their
// usual Pashto/Arabic names. Not all of them are used by the code yet.
const zwar = "َ";
const zwarakey = "ٙ";
const zer = "ِ";
const pesh = "ُ";
const sukun = "ْ";
const hamzaAbove = "ٔ";
const tashdeed = "ّ";
const wasla = "ٱ";
const daggerAlif = "ٰ";
const fathahan = "ً";

type Consonant = "b" | "p" | "t" | "T" | "s" | "j" | "ch" | "kh" | "ts" | "dz" | "d" | "D" | "r" | "R" | "z" | "jz" | "G" | "sh" | "x" | "gh" | "f" | "q" | "k" | "g" | "l" | "m" | "n" | "N" | "h" | "w" | "y";
type Ain = "'";
type JoiningVowel = "-i-" | "-U-" | "-Ul-";
type LongVowel = "aa" | "ee" | "e" | "oo" | "o" | "ey" | "uy" | "eyy";
type ShortVowel = "a" | "i" | "u" | "U";
type Phoneme = Consonant | Ain | LongVowel | ShortVowel | JoiningVowel;

/**
 * What is known about how a single phoneme shows up in Pashto script.
 *
 * Only `matches`, `consonant` and `diacritic` are read by the code in this
 * module so far; the remaining flags are recorded for later use.
 */
type PhonemeInfo = {
    /** script letters this phoneme may correspond to */
    matches?: string[],
    beginningMatches?: string[],
    endingMatches?: string[],
    /** set on consonants; two consonants in a row get a sukun between them */
    consonant?: true,
    /** diacritic written for this phoneme */
    diacritic?: string,
    endingOnly?: true,
    takesSukunOnEnding?: true,
    addAlefOnBeginning?: true,
    canStartWithAynBefore?: true,
}

// The explicit type arguments make the compiler check that every Phoneme has an
// entry (a bare `Record` is not valid TypeScript).
const phonemeTable: Record<Phoneme, PhonemeInfo> = {
    // Consonants
    "b": { matches: ["ب"], consonant: true },
    "p": { matches: ["پ"], consonant: true },
    "t": { matches: ["ت", "ط"], consonant: true },
    "T": { matches: ["ټ"], consonant: true },
    "s": { matches: ["س", "ص", "ث"], consonant: true },
    "j": { matches: ["ج"], consonant: true },
    "ch": { matches: ["چ"], consonant: true },
    "kh": { matches: ["خ"], consonant: true },
    "ts": { matches: ["څ"], consonant: true },
    "dz": { matches: ["ځ"], consonant: true },
    "d": { matches: ["د"], consonant: true },
    "D": { matches: ["ډ"], consonant: true },
    "r": { matches: ["ر"], consonant: true },
    "R": { matches: ["ړ"], consonant: true },
    "z": { matches: ["ز", "ذ", "ظ", "ض"], consonant: true },
    "jz": { matches: ["ژ"], consonant: true },
    "G": { matches: ["ږ"], consonant: true },
    "sh": { matches: ["ش"], consonant: true },
    "x": { matches: ["ښ"], consonant: true },
    "gh": { matches: ["غ"], consonant: true },
    "f": { matches: ["ف"], consonant: true },
    "q": { matches: ["ق"], consonant: true },
    "k": { matches: ["ک"], consonant: true },
    "g": { matches: ["ګ"], consonant: true },
    "l": { matches: ["ل"], consonant: true },
    "m": { matches: ["م"], consonant: true },
    "n": { matches: ["ن"], consonant: true },
    "N": { matches: ["ڼ"], consonant: true },
    "h": { matches: ["ه", "ح"], consonant: true, takesSukunOnEnding: true },
    "w": { matches: ["و"], consonant: true },
    "y": { matches: ["ی"], consonant: true },
    // Ain
    "'": { matches: ["ع", "ئ"], consonant: true },
    // Joining Vowels
    "-i-": {},
    "-U-": { matches: [" و ", "و"] },
    "-Ul-": { matches: ["ال"] },
    // Long Vowels
    "aa": {
        matches: ["ا"],
        beginningMatches: ["آ", "ا"],
        endingMatches: ["ا", "یٰ"],
    },
    "ee": {
        matches: ["ی"],
        addAlefOnBeginning: true,
        endingMatches: ["ي"],
        diacritic: zer,
        canStartWithAynBefore: true,
    },
    "e": { matches: ["ې"], addAlefOnBeginning: true },
    "o": { matches: ["و"], addAlefOnBeginning: true },
    "oo": {
        matches: ["و"],
        addAlefOnBeginning: true,
        // alsoCanBePrefix: true,
        diacritic: pesh,
    },
    "ey": {
        matches: ["ی"],
        addAlefOnBeginning: true,
        endingMatches: ["ی"],
    },
    "uy": { matches: ["ۍ"], endingOnly: true },
    "eyy": { matches: ["ئ"], endingOnly: true },
    // Short Vowels
    "a": {
        diacritic: zwar,
        endingMatches: ["ه"],
        // canComeAfterHeyEnding: true,
        // canBeFirstPartOfFathahanEnding: true,
    },
    "u": {
        diacritic: zwarakey,
        endingMatches: ["ه"],
        // hamzaOnEnd: true,
    },
    "i": {
        diacritic: zer,
        endingMatches: ["ه"],
        beginningMatches: ["ا", "ع"],
        // takesDiacriticBeforeGurdaHeyEnding: true,
        // canBeWasla: true,
    },
    "U": {
        diacritic: pesh,
        endingMatches: ["ه"],
        // takesDiacriticBeforeGurdaHeyEnding: true,
        beginningMatches: ["ا", "ع"],
    },
}

// Lookup tables for the splitter, grouped by how many characters a phoneme
// takes up in the phonetic string. Kept at module level so they are built once.
const singleLetterPhonemes: readonly Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y"];
const digraphPhonemes: readonly Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"];
// digraphs that only count as one phoneme at the very end of a word
const endingDigraphPhonemes: readonly Phoneme[] = ["uy"];
const trigraphPhonemes: readonly Phoneme[] = ["eyy", "-i-", "-U-"];
const quadrigraphPhonemes: readonly Phoneme[] = ["-Ul-"];
// characters that close a word: a space or sentence punctuation
const wordEndMarkers: readonly string[] = [" ", "?", ".", "…", ","];
// characters that are skipped over without producing a phoneme
const ignoredCharacters: readonly string[] = [...wordEndMarkers, "`"];

/**
 * Checks whether a chunk of phonetic text is one of the given phonemes,
 * narrowing its type when it is.
 */
function isOneOf(candidates: readonly Phoneme[], chunk: string): chunk is Phoneme {
    return (candidates as readonly string[]).includes(chunk);
}

/**
 * Splits a phonetic string into its phonemes.
 *
 * Accents are stripped first (via removeAccents). At each position the longest
 * phoneme is tried first: four letters, then three, then two, then one. Spaces
 * and punctuation are skipped.
 *
 * @param fIn the phonetic text to split
 * @returns the phonemes in the order they occur
 * @throws Error `illegal phonetic character: X` on a character that is neither
 * part of a phoneme nor ignorable
 */
export function splitFIntoPhonemes(fIn: string): Phoneme[] {
    const f = removeAccents(fIn);
    const result: Phoneme[] = [];
    let index = 0;
    while (index < f.length) {
        const fourLetterChunk = f.slice(index, index + 4);
        if (isOneOf(quadrigraphPhonemes, fourLetterChunk)) {
            result.push(fourLetterChunk);
            index += 4;
            continue;
        }
        const threeLetterChunk = f.slice(index, index + 3);
        if (isOneOf(trigraphPhonemes, threeLetterChunk)) {
            result.push(threeLetterChunk);
            index += 3;
            continue;
        }
        const twoLetterChunk = f.slice(index, index + 2);
        // charAt gives "" past the end of the string, which also ends the word
        const following = f.charAt(index + 2);
        const endsWord = following === "" || wordEndMarkers.includes(following);
        if (
            isOneOf(digraphPhonemes, twoLetterChunk) ||
            (endsWord && isOneOf(endingDigraphPhonemes, twoLetterChunk))
        ) {
            result.push(twoLetterChunk);
            index += 2;
            continue;
        }
        const singleLetter = f.charAt(index);
        if (!ignoredCharacters.includes(singleLetter)) {
            if (!isOneOf(singleLetterPhonemes, singleLetter)) {
                throw new Error(`illegal phonetic character: ${singleLetter}`);
            }
            result.push(singleLetter);
        }
        index++;
    }
    return result;
}
/**
 * Adds diacritics to the Pashto script of a PsString, working them out from
 * its phonetics.
 *
 * The phonetics are split into phonemes, which are then walked in step with
 * the script letters: a phoneme that matches the current letter consumes it
 * (preceded by a sukun when it is the second of two consonants in a row, or
 * otherwise by the phoneme's own diacritic, if it has one); a phoneme with no
 * matching letter but with a diacritic contributes only that diacritic.
 *
 * @param ps a PsString without diacritics
 * @param ignoreCommas when set, `f` is split as given; otherwise it is first
 * passed through firstPhonetics (presumably to take the first of several
 * comma-separated variants; confirm against p-text-helpers)
 * @returns a PsString with the diacritics added to `p` and `f` left as given
 * @throws Error if the phonetics and script don't line up, including when
 * script letters remain after all phonemes have been used
 */
export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsString {
    const phonemes: Phoneme[] = splitFIntoPhonemes(!ignoreCommas ? firstPhonetics(f) : f);

    let pOut = "";
    let pIn = p;
    for (let i = 0; i < phonemes.length; i++) {
        const phonemeInfo = phonemeTable[phonemes[i]];
        const isBeginningOfWord = pOut === "" || last(pOut) === " ";
        // the previous phoneme only matters within the same word
        const previousPhonemeInfo = (!isBeginningOfWord && i > 0)
            ? phonemeTable[phonemes[i - 1]]
            : undefined;
        const needsSukun = Boolean(phonemeInfo.consonant && previousPhonemeInfo?.consonant);
        // charAt gives "" once the script is used up, which matches no letter
        const currentPLetter = pIn.charAt(0);

        if (phonemeInfo.matches?.includes(currentPLetter)) {
            // TODO: Check if tashdeed or sukun is used
            pOut += (needsSukun ? sukun : (phonemeInfo.diacritic ?? "")) + currentPLetter;
            pIn = pIn.slice(1);
            continue;
        }

        if (phonemeInfo.diacritic) {
            // no letter of its own in the script: only the diacritic is written
            pOut += phonemeInfo.diacritic;
            continue;
        }

        throw new Error("phonetics error");
    }

    if (pIn !== "") {
        // the script is longer than the phonetics account for; returning now
        // would silently drop the remaining letters
        throw new Error(`phonetics error: script letters left over: ${pIn}`);
    }

    return {
        p: pOut,
        f,
    };
}

/**
 * returns the last character of a string
 * (undefined at runtime when the string is empty)
 *
 * @param s
 */
function last(s: string) {
    return s[s.length - 1];
}