new diacritics function done! 🙌

This commit is contained in:
Bill D 2021-06-05 20:59:35 +04:30
parent a62ab986ba
commit c8da32764e
5 changed files with 191 additions and 93 deletions

View File

@ -1,6 +1,6 @@
{ {
"name": "@lingdocs/pashto-inflector", "name": "@lingdocs/pashto-inflector",
"version": "0.4.2", "version": "0.4.3",
"author": "lingdocs.com", "author": "lingdocs.com",
"description": "A Pashto inflection and verb conjugation engine, inculding React components for displaying Pashto text, inflections, and conjugations", "description": "A Pashto inflection and verb conjugation engine, inculding React components for displaying Pashto text, inflections, and conjugations",
"homepage": "https://verbs.lingdocs.com", "homepage": "https://verbs.lingdocs.com",

View File

@ -185,7 +185,7 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
}, },
// Long Vowels // Long Vowels
"aa": { "aa": {
matches: ["ا"], matches: ["ا", "أ"],
beginningMatches: ["آ", "ا"], beginningMatches: ["آ", "ا"],
endingMatches: ["ا", "یٰ"], endingMatches: ["ا", "یٰ"],
longVowel: true, longVowel: true,
@ -210,7 +210,6 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
"oo": { "oo": {
matches: ["و"], matches: ["و"],
longVowel: true, longVowel: true,
// alsoCanBePrefix: true,
diacritic: pesh, diacritic: pesh,
useEndingDiacritic: true, useEndingDiacritic: true,
ainBlendDiacritic: pesh, ainBlendDiacritic: pesh,
@ -271,7 +270,7 @@ export const phonemeTable: Record<Phoneme, PhonemeInfo> = {
const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"]; const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"];
const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"]; const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"];
const endingDigraphs: Phoneme[] = ["uy"]; const endingDigraphs: Phoneme[] = ["uy"];
const willIgnore = ["?", " ", "`", ".", "…", ","]; const willIgnore = ["?", " ", "`", ".", "…", ",", "-"];
const result: Phoneme[] = []; const result: Phoneme[] = [];
const f = removeAccents(fIn).replace(/ă/g, "a"); const f = removeAccents(fIn).replace(/ă/g, "a");
@ -337,6 +336,10 @@ export enum PhonemeStatus {
NOnFathatan, NOnFathatan,
HamzaOnWow, HamzaOnWow,
ArabicDefiniteArticleUl, ArabicDefiniteArticleUl,
OoPrefix,
AlefHamzaBeg,
GlottalStopBeforeOo,
OoAfterGlottalStopOo,
} }
export function stateInfo({ state, i, phonemes, phoneme }: { export function stateInfo({ state, i, phonemes, phoneme }: {
@ -351,14 +354,14 @@ export function stateInfo({ state, i, phonemes, phoneme }: {
const nextPLetter = state.pIn[1]; const nextPLetter = state.pIn[1];
const nextPhoneme = phonemes[i+1]; const nextPhoneme = phonemes[i+1];
const previousPhoneme = i > 0 && phonemes[i-1]; const previousPhoneme = i > 0 && phonemes[i-1];
const isBeginningOfWord = (state.pOut === "" || prevPLetter === " ") || (previousPhoneme === "-Ul-" && prevPLetter === "ل"); const lastThreePLetters = last(state.pOut, 3) + last(state.pOut, 2) + prevPLetter;
const isBeginningOfWord = (state.pOut === "" || prevPLetter === " ") || (previousPhoneme === "-Ul-" && prevPLetter === "ل") || (["دَر", "وَر"].includes(lastThreePLetters) || (last(state.pOut, 2) + prevPLetter) === "را");
const isEndOfWord = isOutOfWord(nextPLetter); const isEndOfWord = isOutOfWord(nextPLetter);
const phonemeInfo = phonemeTable[phoneme]; const phonemeInfo = phonemeTable[phoneme];
const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]]; const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]];
// const nextPhoneme = (phonemes.length > (i + 1)) && phonemes[i+1]; // const nextPhoneme = (phonemes.length > (i + 1)) && phonemes[i+1];
// const nextPhonemeInfo = nextPhoneme ? phonemeTable[nextPhoneme] : undefined; // const nextPhonemeInfo = nextPhoneme ? phonemeTable[nextPhoneme] : undefined;
const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant); const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
const needsTashdeed = !isBeginningOfWord && doubleConsonant && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter);
const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter)); const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter));
const useAinBlendDiacritics = (!isBeginningOfWord && (phonemeInfo.ainBlendDiacritic && currentPLetter === "ع")); const useAinBlendDiacritics = (!isBeginningOfWord && (phonemeInfo.ainBlendDiacritic && currentPLetter === "ع"));
const diacritic = useAinBlendDiacritics const diacritic = useAinBlendDiacritics
@ -372,6 +375,9 @@ export function stateInfo({ state, i, phonemes, phoneme }: {
if (isBeginningOfWord && phoneme === "aa" && phonemeInfo.beginningMatches?.includes(currentPLetter)) { if (isBeginningOfWord && phoneme === "aa" && phonemeInfo.beginningMatches?.includes(currentPLetter)) {
return PhonemeStatus.DirectMatch; return PhonemeStatus.DirectMatch;
} }
if (isBeginningOfWord && phoneme === "oo" && currentPLetter === "و") {
return PhonemeStatus.OoPrefix;
}
if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) { if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) {
if (phoneme !== "aa" && currentPLetter !== "ا" && !phonemeInfo.matches?.includes(nextPLetter)) { if (phoneme !== "aa" && currentPLetter !== "ا" && !phonemeInfo.matches?.includes(nextPLetter)) {
throw Error("phonetics error - needs alef prefix"); throw Error("phonetics error - needs alef prefix");
@ -395,12 +401,21 @@ export function stateInfo({ state, i, phonemes, phoneme }: {
if (isBeginningOfWord && phoneme === "-Ul-" && currentPLetter === "ا" && nextPLetter === "ل") { if (isBeginningOfWord && phoneme === "-Ul-" && currentPLetter === "ا" && nextPLetter === "ل") {
return PhonemeStatus.ArabicDefiniteArticleUl; return PhonemeStatus.ArabicDefiniteArticleUl;
} }
if (phoneme === "a" && nextPhoneme === "'" && phonemes[i+2] === "a" && currentPLetter === "أ") {
return PhonemeStatus.AlefHamzaBeg;
}
if (phoneme === "a" && previousPhoneme === "U" && currentPLetter === "و") { if (phoneme === "a" && previousPhoneme === "U" && currentPLetter === "و") {
return PhonemeStatus.HamzaOnWow; return PhonemeStatus.HamzaOnWow;
} }
if (phoneme === "a" && currentPLetter === "ا" && nextPLetter === fathahan) { if (phoneme === "a" && currentPLetter === "ا" && nextPLetter === fathahan) {
return PhonemeStatus.ShortAForAlefBeforeFathatan; return PhonemeStatus.ShortAForAlefBeforeFathatan;
} }
if (phoneme === "'" && currentPLetter === "و" && nextPLetter === "و") {
return PhonemeStatus.GlottalStopBeforeOo;
}
if (phoneme === "oo" && previousPhoneme === "'" && currentPLetter === "و" && prevPLetter === hamzaAbove) {
return PhonemeStatus.OoAfterGlottalStopOo;
}
if (phoneme === "'" && last(state.pOut, 2) === "ع" && isOutOfWord(last(state.pOut, 3))) { if (phoneme === "'" && last(state.pOut, 2) === "ع" && isOutOfWord(last(state.pOut, 3))) {
return PhonemeStatus.AinBeginningAfterShortVowel; return PhonemeStatus.AinBeginningAfterShortVowel;
} }
@ -430,7 +445,7 @@ export function stateInfo({ state, i, phonemes, phoneme }: {
if (useAinBlendDiacritics) { if (useAinBlendDiacritics) {
return PhonemeStatus.LongAinVowelMissingComma; return PhonemeStatus.LongAinVowelMissingComma;
} }
if (needsTashdeed) { if (((!isBeginningOfWord && doubleConsonant) || prevPLetter === " ") && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter)) {
return PhonemeStatus.DoubleConsonantTashdeed; return PhonemeStatus.DoubleConsonantTashdeed;
} }
if (phoneme === "aa" && currentPLetter === "ی" && nextPLetter === daggerAlif) { if (phoneme === "aa" && currentPLetter === "ی" && nextPLetter === daggerAlif) {
@ -454,7 +469,7 @@ export function stateInfo({ state, i, phonemes, phoneme }: {
if (isEndOfWord && phoneme === "n" && currentPLetter === fathahan && prevPLetter === "ا") { if (isEndOfWord && phoneme === "n" && currentPLetter === fathahan && prevPLetter === "ا") {
return PhonemeStatus.NOnFathatan; return PhonemeStatus.NOnFathatan;
} }
console.log(state); // console.log("errored", "current", phoneme, "next", nextPhoneme);
// console.log("bad phoneme is ", phoneme); // console.log("bad phoneme is ", phoneme);
throw new Error("phonetics error - no status found for phoneme: " + phoneme); throw new Error("phonetics error - no status found for phoneme: " + phoneme);
} }

View File

@ -12,14 +12,8 @@ import {
import { import {
zwar, zwar,
zwarakey, zwarakey,
zer,
pesh,
sukun, sukun,
hamzaAbove,
tashdeed, tashdeed,
wasla,
daggerAlif,
fathahan,
} from "./diacritics-helpers"; } from "./diacritics-helpers";
import * as T from "../types"; import * as T from "../types";
@ -259,6 +253,20 @@ const diacriticsSections: {
}, },
out: null, out: null,
}, },
{
in: {
p: "تشناب",
f: "peshnaab",
},
out: null,
},
{
in: {
p: "وسېدل",
f: "osedul",
},
out: null,
},
{ {
in: { in: {
p: "رغېدل", p: "رغېدل",
@ -593,6 +601,27 @@ const diacriticsSections: {
}, },
out: "سَخْتْسَری", out: "سَخْتْسَری",
}, },
{
in: {
p: " سپین کړه",
f: " speen kRu",
},
out: "سْپِین کْړهٔ",
},
{
in: {
p: "اوب",
f: "ob",
},
out: "اوب",
},
{
in: {
p: "قطعه بازي",
f: "qit'a baazee",
},
out: "قِطْعَه بازي",
},
], ],
}, },
{ {
@ -1094,29 +1123,112 @@ const diacriticsSections: {
}, },
], ],
}, },
// { {
// describe: "double consonants on end of words", describe: "double consonants on end of words",
// tests: [ tests: [
// { {
// in: { in: {
// p: "حق", p: "حق",
// f: "haqq", f: "haqq",
// }, },
// out: "حَقّ", out: "حَقّ",
// }, },
// { {
// in: { in: {
// p: "حق پر", p: "حق پر",
// f: "haqq par", f: "haqq par",
// }, },
// out: "حَقّ پَر", out: "حَقّ پَر",
// }, },
// ], ],
// }, },
{
describe: "أ in the middle of the word",
tests: [
{
in: {
p: "متأسف",
f: "mUtaassif",
},
out: "مُتأسِّف",
},
{
in: {
p: "متأسف",
f: "mUta'assif",
},
out: "مُتأسِّف",
},
],
},
{
describe: "ؤو in middle of the word",
tests: [
{
in: {
p: "مسوول",
f: "mas'ool",
},
out: "مَسؤول", // TODO: Is this best??
},
],
},
{
describe: "allow for beginnings prefixed with ور در را",
tests: [
{
in: {
p: "وراوږد",
f: "wăr-ooGad",
},
out: "وَراُوږَد",
},
{
in: {
p: "دراوږد",
f: "dăr-ooGad",
},
out: "دَراُوږَد",
},
{
in: {
p: "رااوږد",
f: "raa-ooGad",
},
out: "رااُوږَد",
},
],
},
{
describe: "allow oo at start with و prefix",
tests: [
{
in: {
p: "وباسي",
f: "oobaasee",
},
out: "وُباسي",
},
{
in: {
p: "وځم",
f: "oodzum",
},
out: "وُځ" + zwarakey + "م",
},
{
in: {
p: "وځم",
f: "wUdzum",
},
out: "وُځ" + zwarakey + "م",
},
],
},
]; ];
diacriticsSections.forEach((section) => { diacriticsSections.forEach((section) => {
// if (!section.describe.includes("require fathatan")) return; // if (!section.describe.includes("allow for beginnings")) return;
describe(section.describe, () => { describe(section.describe, () => {
section.tests.forEach((t) => { section.tests.forEach((t) => {
if (t.out) { if (t.out) {
@ -1132,36 +1244,16 @@ diacriticsSections.forEach((section) => {
}); });
}); });
// ERRORS test("ending with left over Pashto script will throw an error", () => {
expect(() => {
addDiacritics({ p: "کور ته", f: "kor" });
}).toThrow(`phonetics error - phonetics shorter than pashto script`);
});
// const brokenDiacritics = [ test("ending with left over phonetics will throw an error", () => {
// { expect(() => {
// p: "تشناب", addDiacritics({ p: "کار", f: "kaar kawul" });
// f: "peshnaab", }).toThrow();
// }, });
// {
// p: "وسېدل",
// f: "osedul",
// },
// ];
// test("ending with left over Pashto script will throw an error", () => {
// expect(() => {
// addDiacritics({ p: "کور ته", f: "kor" });
// }).toThrow(`phonetics error - phonetics shorter than pashto script`);
// });
// test("ending with left over phonetics will throw an error", () => {
// expect(() => {
// addDiacritics({ p: "کار", f: "kaar kawul" });
// }).toThrow();
// });
// test("adding diacritics errors when phonetecs and pashto do not line up", () => {
// brokenDiacritics.forEach((t) => {
// expect(() => {
// addDiacritics(t);
// }).toThrow();
// });
// });

View File

@ -10,7 +10,6 @@ import * as T from "../types";
import { import {
splitFIntoPhonemes, splitFIntoPhonemes,
Phoneme, Phoneme,
phonemeTable,
zwar, zwar,
zwarakey, zwarakey,
zer, zer,
@ -19,8 +18,6 @@ import {
hamzaAbove, hamzaAbove,
tashdeed, tashdeed,
wasla, wasla,
daggerAlif,
fathahan,
addP, addP,
advanceP, advanceP,
reverseP, reverseP,
@ -41,7 +38,7 @@ import { pipe } from "rambda";
*/ */
export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsString { export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsString {
const phonemes: Phoneme[] = splitFIntoPhonemes(!ignoreCommas ? firstPhonetics(f) : f); const phonemes: Phoneme[] = splitFIntoPhonemes(!ignoreCommas ? firstPhonetics(f) : f);
const { pIn, pOut } = phonemes.reduce(processPhoneme, { pOut: "", pIn: p }); const { pIn, pOut } = phonemes.reduce(processPhoneme, { pOut: "", pIn: p.trim() });
if (pIn !== "") { if (pIn !== "") {
throw new Error("phonetics error - phonetics shorter than pashto script"); throw new Error("phonetics error - phonetics shorter than pashto script");
} }
@ -57,19 +54,11 @@ function processPhoneme(
i: number, i: number,
phonemes: Phoneme[], phonemes: Phoneme[],
): DiacriticsAccumulator { ): DiacriticsAccumulator {
// console.log("PHONEME", phoneme);
// console.log("space coming up", acc.pIn[0] === " ");
// console.log("state", acc);
// Prep state
// TODO: CLEANER function jump to next char
const state = acc.pIn.slice(0, 5) === " ... " const state = acc.pIn.slice(0, 5) === " ... "
? advanceP(acc, 5) ? advanceP(acc, 5)
: acc.pIn[0] === " " : acc.pIn[0] === " "
? advanceP(acc) ? advanceP(acc)
: acc; : acc;
// console.log("AFTER SPACE PREP", phoneme);
// console.log("state", state);
// WARNING: Do not use acc after this point!
const { const {
phonemeInfo, phonemeInfo,
@ -78,10 +67,6 @@ function processPhoneme(
prevPLetter, prevPLetter,
} = stateInfo({ state, i, phoneme, phonemes }); } = stateInfo({ state, i, phoneme, phonemes });
// console.log("phoneme", phoneme);
// console.log("state", state);
// console.log(phs);
return (phs === PhonemeStatus.LeadingLongVowel) ? return (phs === PhonemeStatus.LeadingLongVowel) ?
pipe( pipe(
advanceP, advanceP,
@ -95,6 +80,7 @@ function processPhoneme(
)(state) )(state)
: (phs === PhonemeStatus.DoubleConsonantTashdeed) ? : (phs === PhonemeStatus.DoubleConsonantTashdeed) ?
pipe( pipe(
prevPLetter === " " ? reverseP : addP(""),
addP(tashdeed) addP(tashdeed)
)(state) )(state)
: (phs === PhonemeStatus.EndingWithHeyHim) ? : (phs === PhonemeStatus.EndingWithHeyHim) ?
@ -201,18 +187,19 @@ function processPhoneme(
addP(pesh), addP(pesh),
advanceP, advanceP,
)(state) )(state)
: (phs === PhonemeStatus.OoPrefix) ?
pipe(
advanceP,
addP(pesh),
)(state)
: (phs === PhonemeStatus.GlottalStopBeforeOo) ?
pipe(
advanceP,
addP(hamzaAbove),
)(state)
: (phs === PhonemeStatus.OoAfterGlottalStopOo) ?
pipe(
advanceP,
)(state)
: state; : state;
// (phs === PhonemeStatus.AlefWithHamzaWithGlottalStop) ?
// state
// : (phs === PhonemeStatus.AinBeginningAfterShortVowel) ?
// state
//: (phs === PhonemeStatus.WoEndingO) ?
// state
// :
//
} }

View File

@ -63,6 +63,9 @@ import {
import { import {
translatePhonetics, translatePhonetics,
} from "./lib/translate-phonetics"; } from "./lib/translate-phonetics";
import {
addDiacritics,
} from "./lib/diacritics";
import defaultTextOptions from "./lib/default-text-options"; import defaultTextOptions from "./lib/default-text-options";
import * as grammarUnits from "./lib/grammar-units"; import * as grammarUnits from "./lib/grammar-units";
import * as Types from "./types"; import * as Types from "./types";
@ -83,6 +86,7 @@ export {
isNounAdjOrVerb, isNounAdjOrVerb,
simplifyPhonetics, simplifyPhonetics,
phoneticsToDiacritics, phoneticsToDiacritics,
addDiacritics,
translatePhonetics, translatePhonetics,
// protobuf helpers // protobuf helpers
readDictionary, readDictionary,