From c8da32764e7e8a27990307c9c2eac5ba2a2587f4 Mon Sep 17 00:00:00 2001 From: Bill D Date: Sat, 5 Jun 2021 20:59:35 +0430 Subject: [PATCH] =?UTF-8?q?new=20diacritics=20function=20done!=20?= =?UTF-8?q?=F0=9F=99=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- package.json | 2 +- src/lib/diacritics-helpers.ts | 29 +++-- src/lib/diacritics.test.ts | 204 ++++++++++++++++++++++++---------- src/lib/diacritics.ts | 45 +++----- src/library.ts | 4 + 5 files changed, 191 insertions(+), 93 deletions(-) diff --git a/package.json b/package.json index 41755c6..b5670d2 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@lingdocs/pashto-inflector", - "version": "0.4.2", + "version": "0.4.3", "author": "lingdocs.com", "description": "A Pashto inflection and verb conjugation engine, inculding React components for displaying Pashto text, inflections, and conjugations", "homepage": "https://verbs.lingdocs.com", diff --git a/src/lib/diacritics-helpers.ts b/src/lib/diacritics-helpers.ts index 0149ab2..cbb15fe 100644 --- a/src/lib/diacritics-helpers.ts +++ b/src/lib/diacritics-helpers.ts @@ -185,7 +185,7 @@ export const phonemeTable: Record = { }, // Long Vowels "aa": { - matches: ["ا"], + matches: ["ا", "أ"], beginningMatches: ["آ", "ا"], endingMatches: ["ا", "یٰ"], longVowel: true, @@ -210,7 +210,6 @@ export const phonemeTable: Record = { "oo": { matches: ["و"], longVowel: true, - // alsoCanBePrefix: true, diacritic: pesh, useEndingDiacritic: true, ainBlendDiacritic: pesh, @@ -271,7 +270,7 @@ export const phonemeTable: Record = { const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"]; const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"]; const endingDigraphs: Phoneme[] = ["uy"]; - const willIgnore = ["?", " ", "`", ".", "…", ","]; + const willIgnore = ["?", " ", "`", ".", "…", ",", "-"]; const result: Phoneme[] = []; const f = removeAccents(fIn).replace(/ă/g, "a"); @@ -337,6 +336,10 @@ export enum PhonemeStatus { NOnFathatan, HamzaOnWow, ArabicDefiniteArticleUl, + OoPrefix, + AlefHamzaBeg, + GlottalStopBeforeOo, + OoAfterGlottalStopOo, } export function stateInfo({ state, i, phonemes, phoneme }: { @@ -351,14 +354,14 @@ export function stateInfo({ state, i, phonemes, phoneme }: { const nextPLetter = state.pIn[1]; const nextPhoneme = phonemes[i+1]; const previousPhoneme = i > 0 && phonemes[i-1]; - const isBeginningOfWord = (state.pOut === "" || prevPLetter === " ") || (previousPhoneme === "-Ul-" && prevPLetter === "ل"); + const lastThreePLetters = last(state.pOut, 3) + last(state.pOut, 2) + prevPLetter; + const isBeginningOfWord = (state.pOut === "" || prevPLetter === " ") || (previousPhoneme === "-Ul-" && prevPLetter === "ل") || (["دَر", "وَر"].includes(lastThreePLetters) || (last(state.pOut, 2) + prevPLetter) === "را"); const isEndOfWord = isOutOfWord(nextPLetter); const phonemeInfo = phonemeTable[phoneme]; const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]]; // const nextPhoneme = (phonemes.length > (i + 1)) && phonemes[i+1]; // const nextPhonemeInfo = nextPhoneme ? phonemeTable[nextPhoneme] : undefined; const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant); - const needsTashdeed = !isBeginningOfWord && doubleConsonant && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter); const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter)); const useAinBlendDiacritics = (!isBeginningOfWord && (phonemeInfo.ainBlendDiacritic && currentPLetter === "ع")); const diacritic = useAinBlendDiacritics @@ -372,6 +375,9 @@ export function stateInfo({ state, i, phonemes, phoneme }: { if (isBeginningOfWord && phoneme === "aa" && phonemeInfo.beginningMatches?.includes(currentPLetter)) { return PhonemeStatus.DirectMatch; } + if (isBeginningOfWord && phoneme === "oo" && currentPLetter === "و") { + return PhonemeStatus.OoPrefix; + } if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) { if (phoneme !== "aa" && currentPLetter !== "ا" && !phonemeInfo.matches?.includes(nextPLetter)) { throw Error("phonetics error - needs alef prefix"); @@ -395,12 +401,21 @@ export function stateInfo({ state, i, phonemes, phoneme }: { if (isBeginningOfWord && phoneme === "-Ul-" && currentPLetter === "ا" && nextPLetter === "ل") { return PhonemeStatus.ArabicDefiniteArticleUl; } + if (phoneme === "a" && nextPhoneme === "'" && phonemes[i+2] === "a" && currentPLetter === "أ") { + return PhonemeStatus.AlefHamzaBeg; + } if (phoneme === "a" && previousPhoneme === "U" && currentPLetter === "و") { return PhonemeStatus.HamzaOnWow; } if (phoneme === "a" && currentPLetter === "ا" && nextPLetter === fathahan) { return PhonemeStatus.ShortAForAlefBeforeFathatan; } + if (phoneme === "'" && currentPLetter === "و" && nextPLetter === "و") { + return PhonemeStatus.GlottalStopBeforeOo; + } + if (phoneme === "oo" && previousPhoneme === "'" && currentPLetter === "و" && prevPLetter === hamzaAbove) { + return PhonemeStatus.OoAfterGlottalStopOo; + } if (phoneme === "'" && last(state.pOut, 2) === "ع" && isOutOfWord(last(state.pOut, 3))) { return PhonemeStatus.AinBeginningAfterShortVowel; } @@ -430,7 +445,7 @@ export function stateInfo({ state, i, phonemes, phoneme }: { if (useAinBlendDiacritics) { return PhonemeStatus.LongAinVowelMissingComma; } - if (needsTashdeed) { + if (((!isBeginningOfWord && doubleConsonant) || prevPLetter === " ") && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter)) { return PhonemeStatus.DoubleConsonantTashdeed; } if (phoneme === "aa" && currentPLetter === "ی" && nextPLetter === daggerAlif) { @@ -454,7 +469,7 @@ export function stateInfo({ state, i, phonemes, phoneme }: { if (isEndOfWord && phoneme === "n" && currentPLetter === fathahan && prevPLetter === "ا") { return PhonemeStatus.NOnFathatan; } - console.log(state); + // console.log("errored", "current", phoneme, "next", nextPhoneme); // console.log("bad phoneme is ", phoneme); throw new Error("phonetics error - no status found for phoneme: " + phoneme); } diff --git a/src/lib/diacritics.test.ts b/src/lib/diacritics.test.ts index 653f03c..159d828 100644 --- a/src/lib/diacritics.test.ts +++ b/src/lib/diacritics.test.ts @@ -12,14 +12,8 @@ import { import { zwar, zwarakey, - zer, - pesh, sukun, - hamzaAbove, tashdeed, - wasla, - daggerAlif, - fathahan, } from "./diacritics-helpers"; import * as T from "../types"; @@ -259,6 +253,20 @@ const diacriticsSections: { }, out: null, }, + { + in: { + p: "تشناب", + f: "peshnaab", + }, + out: null, + }, + { + in: { + p: "وسېدل", + f: "osedul", + }, + out: null, + }, { in: { p: "رغېدل", @@ -593,6 +601,27 @@ const diacriticsSections: { }, out: "سَخْتْسَری", }, + { + in: { + p: " سپین کړه", + f: " speen kRu", + }, + out: "سْپِین کْړهٔ", + }, + { + in: { + p: "اوب", + f: "ob", + }, + out: "اوب", + }, + { + in: { + p: "قطعه بازي", + f: "qit'a baazee", + }, + out: "قِطْعَه بازي", + }, ], }, { @@ -1094,29 +1123,112 @@ const diacriticsSections: { }, ], }, - // { - // describe: "double consonants on end of words", - // tests: [ - // { - // in: { - // p: "حق", - // f: "haqq", - // }, - // out: "حَقّ", - // }, - // { - // in: { - // p: "حق پر", - // f: "haqq par", - // }, - // out: "حَقّ پَر", - // }, - // ], - // }, + { + describe: "double consonants on end of words", + tests: [ + { + in: { + p: "حق", + f: "haqq", + }, + out: "حَقّ", + }, + { + in: { + p: "حق پر", + f: "haqq par", + }, + out: "حَقّ پَر", + }, + ], + }, + { + describe: "أ in the middle of the word", + tests: [ + { + in: { + p: "متأسف", + f: "mUtaassif", + }, + out: "مُتأسِّف", + }, + { + in: { + p: "متأسف", + f: "mUta'assif", + }, + out: "مُتأسِّف", + }, + ], + }, + { + describe: "ؤو in middle of the word", + tests: [ + { + in: { + p: "مسوول", + f: "mas'ool", + }, + out: "مَسؤول", // TODO: Is this best?? + }, + ], + }, + { + describe: "allow for beginnings prefixed with ور در را", + tests: [ + { + in: { + p: "وراوږد", + f: "wăr-ooGad", + }, + out: "وَراُوږَد", + }, + { + in: { + p: "دراوږد", + f: "dăr-ooGad", + }, + out: "دَراُوږَد", + }, + { + in: { + p: "رااوږد", + f: "raa-ooGad", + }, + out: "رااُوږَد", + }, + ], + }, + { + describe: "allow oo at start with و prefix", + tests: [ + { + in: { + p: "وباسي", + f: "oobaasee", + }, + out: "وُباسي", + }, + { + in: { + p: "وځم", + f: "oodzum", + }, + out: "وُځ" + zwarakey + "م", + }, + { + in: { + p: "وځم", + f: "wUdzum", + }, + out: "وُځ" + zwarakey + "م", + }, + ], + }, ]; diacriticsSections.forEach((section) => { - // if (!section.describe.includes("require fathatan")) return; + // if (!section.describe.includes("allow for beginnings")) return; describe(section.describe, () => { section.tests.forEach((t) => { if (t.out) { @@ -1132,36 +1244,16 @@ diacriticsSections.forEach((section) => { }); }); -// ERRORS +test("ending with left over Pashto script will throw an error", () => { + expect(() => { + addDiacritics({ p: "کور ته", f: "kor" }); + }).toThrow(`phonetics error - phonetics shorter than pashto script`); +}); -// const brokenDiacritics = [ -// { -// p: "تشناب", -// f: "peshnaab", -// }, -// { -// p: "وسېدل", -// f: "osedul", -// }, -// ]; +test("ending with left over phonetics will throw an error", () => { + expect(() => { + addDiacritics({ p: "کار", f: "kaar kawul" }); + }).toThrow(); +}); -// test("ending with left over Pashto script will throw an error", () => { -// expect(() => { -// addDiacritics({ p: "کور ته", f: "kor" }); -// }).toThrow(`phonetics error - phonetics shorter than pashto script`); -// }); - -// test("ending with left over phonetics will throw an error", () => { -// expect(() => { -// addDiacritics({ p: "کار", f: "kaar kawul" }); -// }).toThrow(); -// }); - -// test("adding diacritics errors when phonetecs and pashto do not line up", () => { -// brokenDiacritics.forEach((t) => { -// expect(() => { -// addDiacritics(t); -// }).toThrow(); -// }); -// }); diff --git a/src/lib/diacritics.ts b/src/lib/diacritics.ts index 6ce4601..b2b8608 100644 --- a/src/lib/diacritics.ts +++ b/src/lib/diacritics.ts @@ -10,7 +10,6 @@ import * as T from "../types"; import { splitFIntoPhonemes, Phoneme, - phonemeTable, zwar, zwarakey, zer, @@ -19,8 +18,6 @@ import { hamzaAbove, tashdeed, wasla, - daggerAlif, - fathahan, addP, advanceP, reverseP, @@ -41,7 +38,7 @@ import { pipe } from "rambda"; */ export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsString { const phonemes: Phoneme[] = splitFIntoPhonemes(!ignoreCommas ? firstPhonetics(f) : f); - const { pIn, pOut } = phonemes.reduce(processPhoneme, { pOut: "", pIn: p }); + const { pIn, pOut } = phonemes.reduce(processPhoneme, { pOut: "", pIn: p.trim() }); if (pIn !== "") { throw new Error("phonetics error - phonetics shorter than pashto script"); } @@ -57,19 +54,11 @@ function processPhoneme( i: number, phonemes: Phoneme[], ): DiacriticsAccumulator { - // console.log("PHONEME", phoneme); - // console.log("space coming up", acc.pIn[0] === " "); - // console.log("state", acc); - // Prep state - // TODO: CLEANER function jump to next char const state = acc.pIn.slice(0, 5) === " ... " ? advanceP(acc, 5) : acc.pIn[0] === " " ? advanceP(acc) : acc; - // console.log("AFTER SPACE PREP", phoneme); - // console.log("state", state); - // WARNING: Do not use acc after this point! const { phonemeInfo, @@ -78,10 +67,6 @@ function processPhoneme( prevPLetter, } = stateInfo({ state, i, phoneme, phonemes }); - // console.log("phoneme", phoneme); - // console.log("state", state); - // console.log(phs); - return (phs === PhonemeStatus.LeadingLongVowel) ? pipe( advanceP, @@ -95,6 +80,7 @@ function processPhoneme( )(state) : (phs === PhonemeStatus.DoubleConsonantTashdeed) ? pipe( + prevPLetter === " " ? reverseP : addP(""), addP(tashdeed) )(state) : (phs === PhonemeStatus.EndingWithHeyHim) ? @@ -201,18 +187,19 @@ function processPhoneme( addP(pesh), advanceP, )(state) + : (phs === PhonemeStatus.OoPrefix) ? + pipe( + advanceP, + addP(pesh), + )(state) + : (phs === PhonemeStatus.GlottalStopBeforeOo) ? + pipe( + advanceP, + addP(hamzaAbove), + )(state) + : (phs === PhonemeStatus.OoAfterGlottalStopOo) ? + pipe( + advanceP, + )(state) : state; - - - - - // (phs === PhonemeStatus.AlefWithHamzaWithGlottalStop) ? - // state - // : (phs === PhonemeStatus.AinBeginningAfterShortVowel) ? - // state - //: (phs === PhonemeStatus.WoEndingO) ? - // state - // : - // - } diff --git a/src/library.ts b/src/library.ts index 06bf9a2..8b5a34f 100644 --- a/src/library.ts +++ b/src/library.ts @@ -63,6 +63,9 @@ import { import { translatePhonetics, } from "./lib/translate-phonetics"; +import { + addDiacritics, +} from "./lib/diacritics"; import defaultTextOptions from "./lib/default-text-options"; import * as grammarUnits from "./lib/grammar-units"; import * as Types from "./types"; @@ -83,6 +86,7 @@ export { isNounAdjOrVerb, simplifyPhonetics, phoneticsToDiacritics, + addDiacritics, translatePhonetics, // protobuf helpers readDictionary,