From a2b5626514e7a6b93252ffbc2ea1ba580be90302 Mon Sep 17 00:00:00 2001 From: Bill D Date: Thu, 6 May 2021 23:48:53 +0300 Subject: [PATCH] more work on diacritics --- src/lib/diacritics.test.ts | 116 +++++++++++++++++++++++++++++++++++++ src/lib/diacritics.ts | 34 +++++++++-- 2 files changed, 144 insertions(+), 6 deletions(-) diff --git a/src/lib/diacritics.test.ts b/src/lib/diacritics.test.ts index 2c51fad..de5ea1b 100644 --- a/src/lib/diacritics.test.ts +++ b/src/lib/diacritics.test.ts @@ -12,6 +12,17 @@ import { } from "./diacritics"; import * as T from "../types"; +const zwar = "َ"; +const zwarakey = "ٙ"; +const zer = "ِ"; +const pesh = "ُ"; +const sukun = "ْ"; +const hamzaAbove = "ٔ"; +const tashdeed = "ّ"; +const wasla = "ٱ"; +const daggerAlif = "ٰ"; +const fathahan = "ً"; + const phonemeSplits: Array<{ in: string, out: string[], @@ -139,6 +150,105 @@ const diacriticsTest: Array<{ }, out: "تَشْناب", }, + // working with وs + { + in: { + p: "کول", + f: "kwal", + }, + out: "کْوَل", + }, + { + in: { + p: "تول", + f: "tool", + }, + out: "تُول", + }, + { + in: { + p: "مقبول", + f: "maqbool", + }, + out: "مَقْبُول", + }, + { + in: { + p: "کول", + f: "kawul", + }, + out: "کَو" + zwarakey + "ل", + }, + { + in: { + p: "کول", + f: "kiwul", + }, + out: "کِو" + zwarakey + "ل", + }, + { + in: { + p: "کول", + f: "kUwul", + }, + out: "کُو" + zwarakey + "ل", + }, + { + in: { + p: "کول", + f: "kuwul", + }, + out: "ک" + zwarakey + "و" + zwarakey + "ل", + }, + { + in: { + p: "کول", + f: "kawal", + }, + out: "کَوَل", + }, + { + in: { + p: "کول", + f: "kUwal", + }, + out: "کُوَل", + }, + { + in: { + p: "پشتګرد", + f: "pishtgird", + }, + out: "پِشْتْګِرْد", + }, + { + in: { + p: "سپین", + f: "speen", + }, + out: "سْپِین", + }, + { + in: { + p: "سپین", + f: "speyn", + }, + out: "سْپین", + }, + { + in: { + p: "پېش", + f: "pesh", + }, + out: "پېش", + }, + { + in: { + p: "بتن", + f: "battan", + }, + out: "ب" + zwar + "ت" + tashdeed + zwar + "ن", + }, ]; const brokenDiacritics = [ @@ 
-163,6 +273,12 @@ test("bad phonetic characters should throw an error", () => { }); }); +test("ending with left over Pashto script will throw an error", () => { + expect(() => { + addDiacritics({ p: "کور ته", f: "kor" }); + }).toThrow(`phonetics error - phonetics shorter than pashto script`); +}); + test("adding diacritics should work", () => { diacriticsTest.forEach((t) => { expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f }); diff --git a/src/lib/diacritics.ts b/src/lib/diacritics.ts index 2ec6dd6..ba814cc 100644 --- a/src/lib/diacritics.ts +++ b/src/lib/diacritics.ts @@ -248,7 +248,14 @@ const phonemeTable: Record = { }, } - +/** + * splits a phonetics string into an array of Phonemes + * + * will error if there is an illegal phonetics character + * + * @param fIn a phonetics string + * @returns an array of phonemes + */ export function splitFIntoPhonemes(fIn: string): Phoneme[] { const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y"]; @@ -306,16 +313,25 @@ export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsSt // TODO: const phonemes: Phoneme[] = splitFIntoPhonemes(!ignoreCommas ? 
firstPhonetics(f) : f); - const { pOut } = phonemes.reduce((acc, phoneme, i) => { - const isBeginningOfWord = acc.pOut === "" || last(acc.pOut) === " "; + const { pIn, pOut } = phonemes.reduce((acc, phoneme, i) => { + const prevPLetter = last(acc.pOut); + const isBeginningOfWord = acc.pOut === "" || prevPLetter === " "; const phonemeInfo = phonemeTable[phoneme]; + const previousPhoneme = i > 0 && phonemes[i-1]; const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]]; const currentPLetter = acc.pIn[0]; - const needsSukun = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant); + const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant); + const needsTashdeed = doubleConsonant && (previousPhoneme === phoneme); + const needsSukun = doubleConsonant && (previousPhoneme !== phoneme); + + if (needsTashdeed) { + return { + pOut: acc.pOut + tashdeed, + pIn: acc.pIn, + }; + } if (phonemeInfo.matches?.includes(currentPLetter)) { - // TODO: Check if tashdeed or sukun is used - // const needsSukun = is consonant + previous phoneme was consonant + not beginning of word return { pOut: acc.pOut + (needsSukun ? sukun : phonemeInfo.diacritic ? phonemeInfo.diacritic : "") @@ -331,9 +347,15 @@ export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsSt } } + // TODO: CHECK IF PASHTO IS SHORTER THAN PHONETICS + throw new Error("phonetics error"); }, { pOut: "", pIn: p }); + if (pIn !== "") { + throw new Error("phonetics error - phonetics shorter than pashto script"); + } + return { p: pOut, f,