From 916bc244874e97e8b5b6409cf07859ba5728a435 Mon Sep 17 00:00:00 2001 From: lingdocs <71590811+lingdocs@users.noreply.github.com> Date: Tue, 14 Sep 2021 18:25:04 +0400 Subject: [PATCH] inflector with plurals beta working! --- src/lib/accent-helpers.ts | 3 +- src/lib/p-text-helpers.test.ts | 96 +++++++++++++- src/lib/p-text-helpers.ts | 115 +++++++++++++++-- src/lib/pashto-inflector.test.ts | 196 +++++++++++++++++++++++------ src/lib/pashto-inflector.ts | 180 ++++++++++++++++++-------- src/lib/standardize-pashto.test.ts | 13 +- src/lib/standardize-pashto.ts | 5 + src/types.ts | 5 + 8 files changed, 505 insertions(+), 108 deletions(-) diff --git a/src/lib/accent-helpers.ts b/src/lib/accent-helpers.ts index d92223c..83a7b5c 100644 --- a/src/lib/accent-helpers.ts +++ b/src/lib/accent-helpers.ts @@ -118,6 +118,7 @@ export function removeAccents(s: T.PsString | string): T.PsString | string { * * @param s a string of Pashto phonetics */ -export function hasAccents(s: string): boolean { +export function hasAccents(s: string | T.PsString): boolean { + if (typeof s !== "string") return hasAccents(s.f); return accentReplacer.some((x) => s.includes(x.accented)); } diff --git a/src/lib/p-text-helpers.test.ts b/src/lib/p-text-helpers.test.ts index 7bb69f8..5c95b76 100644 --- a/src/lib/p-text-helpers.test.ts +++ b/src/lib/p-text-helpers.test.ts @@ -8,7 +8,6 @@ import { concatPsString, - firstPhonetics, makePsString, removeEndingL, yulEndingInfinitive, @@ -22,6 +21,11 @@ import { removeRetroflexR, splitDoubleWord, endsInConsonant, + addOEnding, + removeFVarients, + endsInShwa, + removeAynEnding, + splitPsByVarients, } from "./p-text-helpers"; import * as T from "../types"; import { @@ -617,9 +621,11 @@ test(`complementInflects`, () => { })).toBe(false); }); -test(`firstPhonetics should work`, () => { - expect(firstPhonetics("ist'imaal, istimaal")).toBe("ist'imaal"); - expect(firstPhonetics("kor")).toBe("kor"); +test(`removeFVarients`, () => { + expect(removeFVarients("ist'imaal, istimaal")).toBe("ist'imaal"); + expect(removeFVarients({ p: "معالوم", f: "ma'aalóom, maalóom" })) + .toEqual({ p: "معالوم", f: "ma'aalóom" }); + expect(removeFVarients("kor")).toBe("kor"); }); test(`makePsString should work`, () => { @@ -1034,4 +1040,86 @@ test("endsInAConsonant", () => { ]; does.forEach((x) => expect(endsInConsonant(x)).toBe(true)); doesnt.forEach((x) => expect(endsInConsonant(x)).toBe(false)); +}) + +test("addOEnding", () => { + const tests: { in: T.PsString, out: T.PsString[] }[] = [ + { + in: { p: "کتابونه", f: "kitaabóona" }, + out: [{ p: "کتابونو", f: "kitaabóono" }], + }, + { + in: { p: "کارغان", f: "kaargháan" }, + out: [{ p: "کارغانو", f: "kaargháano" }], + }, + { + in: { p: "کارغانې", f: "kaargháane" }, + out: [{ p: "کارغانو", f: "kaargháano" }], + }, + { + in: { p: "ښځې", f: "xúdze" }, + out: [{ p: "ښځو", f: "xúdzo" }], + }, + // TODO: Make this last thing accented?? + { + in: { p: "کور", f: "kor" }, + out: [{ p: "کورو", f: "koro" }], + }, + { + in: { p: "سړی", f: "saRéy" }, + out: [{ p: "سړیو", f: "saRíyo" }, { p: "سړو", f: "saRó"}], + }, + { + in: { p: "افغانۍ", f: "afghaanúy" }, + out: [{ p: "افغانیو", f: "afghaanúyo" }], + }, + { + in: { p: "اوبه", f: "oobú" }, + out: [{ p: "اوبو", f: "oobó" }], + }, + { + in: { p: "شودې", f: "shoodé" }, + out: [{ p: "شودو", f: "shoodó" }], + }, + { + in: { p: "منابع", f: "manaabí" }, + out: [{ p: "منابو", f: "manaabó" }], + }, + { + in: { p: "انبیا", f: "ambiyáa" }, + out: [{ p: "انبیاوو", f: "ambiyáawo" }], + }, + { + in: { p: "مراجع", f: "maraají'" }, + out: [{ p: "مراجو", f: "maraajó" }], + }, + { + in: { p: "اتباع", f: "atbaa" }, + out: [{ p: "اتباعوو", f: "atbaawo" }], + }, + { + in: { p: "اتباع", "f": "atbáa'" }, + out: [{ p: "اتباعوو", f: "atbáawo" }], + }, + ]; + tests.forEach((t) => { + expect(addOEnding(t.in)).toEqual(t.out); + }); +}); + +test("endsInShwa", () => { + expect(endsInShwa({ p: "ښایسته", f: "xaaystú" })).toBe(true); + expect(endsInShwa({ p: "ښایسته", f: "xaaystu" })).toBe(true); + expect(endsInShwa({ p: "ښایسته", f: "xaaysta" })).toBe(false); + expect(endsInShwa({ p: "کور", f: "kor" })).toBe(false); +}); + +test("splitPsByVarients", () => { + expect(splitPsByVarients({ p: "حوادث, حادثات", f: "hawáadis, haadisáat" })) + .toEqual([{ p: "حوادث", f: "hawáadis" }, { p: "حادثات", f: "haadisáat" }]); + // should work with Pashto comma too + expect(splitPsByVarients({ p: "حوادث، حادثات", f: "hawáadis, haadisáat" })) + .toEqual([{ p: "حوادث", f: "hawáadis" }, { p: "حادثات", f: "haadisáat" }]); + expect(splitPsByVarients({ p: "کور", f: "kor" })) + .toEqual([{ p: "کور", f: "kor" }]); }) \ No newline at end of file diff --git a/src/lib/p-text-helpers.ts b/src/lib/p-text-helpers.ts index b205f0a..6e88176 100644 --- a/src/lib/p-text-helpers.ts +++ b/src/lib/p-text-helpers.ts @@ -15,8 +15,8 @@ import { getPersonInflectionsKey, } from "./misc-helpers"; import * as T from "../types"; -import { removeAccents } from "./accent-helpers"; -import { pashtoConsonants, phoneticsConsonants } from "./pashto-consonants"; +import { hasAccents, removeAccents } from "./accent-helpers"; +import { phoneticsConsonants } from "./pashto-consonants"; import { simplifyPhonetics } from "./simplify-phonetics"; // export function concatPsStringWithVars(...items: Array): T.PsString[] { @@ -190,14 +190,12 @@ export function removeFVarients(x: string | T.PsString | T.DictionaryEntry): T.F return { ...x, f: removeFVarients(x.f), - __brand: "name for a dictionary entry with all the phonetics variations removed", - } as T.DictionaryEntryNoFVars; + } as unknown as T.DictionaryEntryNoFVars; } return { ...x, f: removeFVarients(x.f), - __brand: "name for a ps string with all the phonetics variations removed", - } as T.PsStringNoFVars; + } as unknown as T.PsStringNoFVars; } /** @@ -796,7 +794,7 @@ export function ensureUnisexInflections(infs: T.InflectorOutput, w: T.Dictionary export function endsInAaOrOo(w: T.PsString): boolean { const fEnd = simplifyPhonetics(w.f).slice(-2); - const pEnd = w.p.slice(-1); + const pEnd = w.p.slice(-1) === "ع" ? w.p.slice(-2, -1) : w.p.slice(-1); return ( pEnd === "و" && fEnd.endsWith("o") || @@ -804,7 +802,6 @@ export function endsInAaOrOo(w: T.PsString): boolean { ); } - export function endsInConsonant(w: T.PsString): boolean { // TODO: Add reporting back that the plural ending will need a space? @@ -823,4 +820,106 @@ export function endsInConsonant(w: T.PsString): boolean { // const pCons = pashtoConsonants.includes(w.p.slice(-1)); const fCons = phoneticsConsonants.includes(simplifyPhonetics(w.f).slice(-1)); return fCons; +} + +/** + * adds a و - o ending (used in plurals 2nd inflection) to a given PsString + * It will wipe out a ه - a / u or ې - e and will preserve the accent + * + * @param w + * @returns + */ +export function addOEnding(ps: T.PsString): T.ArrayOneOrMore { + const w = removeEndTick(ps); + const lastLetter = makePsString( + w.p.slice(-1), + w.f.slice(-1), + ); + const hasEyEnding = (lastLetter.p === "ی") && ["ey", "éy"].includes(w.f.slice(-2)); + if (hasEyEnding) { + const base = makePsString(w.p.slice(0, -1), w.f.slice(0, -2)); + const endHadAccent = w.f.slice(-2) === "éy"; + return [ + concatPsString(base, { p: "یو", f: endHadAccent ? "íyo" : "iyo" }), + concatPsString(base, { p: "و", f: endHadAccent ? "ó" : "o" }), + ]; + } + if (lastLetter.p === "ۍ") { + const base = makePsString(w.p.slice(0, -1), w.f.slice(0, -2)); + const endHadAccent = w.f.slice(-2) === "úy"; + return [ + concatPsString(base, { p: "یو", f: endHadAccent ? "úyo" : "uyo" }), + ]; + } + if (lastLetter.p === "ا" || (w.p.slice(-2) === "اع")) { + return [concatPsString(w, { p: "وو", f: "wo" })]; + } + const base = ( + (["ه", "ع"].includes(lastLetter.p) && lastLetter.f.match(/[a|u|i|U|á|ú|í|Ú]/)) || + (lastLetter.p === "ې" && ["e", "é"].includes(lastLetter.f)) + ) ? makePsString( + w.p.slice(0, -1), + w.f.slice(0, -1), + ) : w; + return [concatPsString( + base, + makePsString( + "و", + hasAccents(lastLetter.f) ? "ó" : "o", + ), + )]; +} + +/** + * Determines whether a string ends in a shwa or not + * + * @param w + */ +export function endsInShwa(w: T.PsString): boolean { + const p = w.p.slice(-1); + const f = w.f.slice(-1); + return p === "ه" && ["u", "ú"].includes(f); +} + +/** + * applies f function to both the p and f in a PsString + * + */ +export function mapPsString(ps: T.PsString, f: (s: string) => T): { p: T, f: T } { + return { + p: f(ps.p), + f: f(ps.f), + }; +} + +/** + * splits up a given PsString by comma-seperated varients + * + * @param w + * @returns + */ +export function splitPsByVarients(w: T.PsString): T.ArrayOneOrMore { + function cut(s: string) { + return s.split(/[,|،]/).map((s) => s.trim()); + } + const ps = mapPsString(w, cut); + return ps.p.map((p, i) => { + if (!ps.f[i]) throw new Error("uneven comma seperated ps varients: " + JSON.stringify(w)) + return makePsString( + p, + ps.f[i], + ); + }) as T.ArrayOneOrMore; +} + + +export function removeEndTick(w: T.PsString): T.PsString; +export function removeEndTick(w: string): string; +export function removeEndTick(w: T.PsString | string): T.PsString | string { + if (typeof w !== "string") { + return makePsString(w.p, removeEndTick(w.f)); + } + return (w.slice(-1) === "'") + ? w.slice(0, -1) + : w; } \ No newline at end of file diff --git a/src/lib/pashto-inflector.test.ts b/src/lib/pashto-inflector.test.ts index ae3aafe..5fdce00 100644 --- a/src/lib/pashto-inflector.test.ts +++ b/src/lib/pashto-inflector.test.ts @@ -442,19 +442,7 @@ const nouns: Array<{ }, // Masculine irregular { - in: { - ts: 1527813809, - p: "لمونځ", - f: "lamoondz", - g: "", - e: "Muslim ritual prayers (namaz, salah, salat)", - c: "n. m. irreg.", - i: 9835, - infap: "لمانځه", - infaf: "lamaandzu", - infbp: "لمنځ", - infbf: "lamandz", - }, + in: {"ts":1527813809,"i":11318,"p":"لمونځ","f":"lamoondz","g":"lamoondz","e":"Muslim ritual prayers (namaz, salah, salat)","c":"n. m. irreg.","infap":"لمانځه","infaf":"lamaandzu","infbp":"لمنځ","infbf":"lamandz","ppp":"لمونځونه","ppf":"lamoondzóona"}, out: { inflections: { masc: [ @@ -463,17 +451,17 @@ const nouns: Array<{ [{p: "لمنځو", f: "lamandzo"}], ], }, - // plural: { - // masc: [ - // [{ p: "لمونځونه", f: "lamoondzóona" }], - // [{ p: "لمونځونو", f: "lamoondzóono" }], - // ], - // }, + plural: { + masc: [ + [{ p: "لمونځونه", f: "lamoondzóona" }], + [{ p: "لمونځونو", f: "lamoondzóono" }], + ], + }, }, }, // Masculine short squish { - in: {"i":9049,"ts":1527813593,"p":"غر","f":"ghar, ghur","g":"ghar,ghur","e":"mountain","c":"n. m.","infap":"غره","infaf":"ghru","infbp":"غرو","infbf":"ghro"}, + in: {"i":9049,"ts":1527813593,"p":"غر","f":"ghar, ghur","g":"ghar,ghur","e":"mountain","c":"n. m.","infap":"غره","infaf":"ghru","infbp":"غر","infbf":"ghr"}, out: { inflections: { masc: [ @@ -500,6 +488,11 @@ const nouns: Array<{ [{ p: "خره", f: "khru" }], [{ p: "خرو", f: "khro" }], ], + fem: [ + [{ p: "خره", f: "khra" }], + [{ p: "خرې", f: "khre" }], + [{ p: "خرو", f: "khro" }], + ], }, plural: { masc: [ @@ -587,12 +580,12 @@ const nouns: Array<{ }, }, { - in: {"ts":1527815394,"i":13991,"p":"واده","f":"waadú","g":"waadu","e":"wedding, marriage","c":"n. m."}, + in: {"ts":1527815394,"i":13991,"p":"واده","f":"waadú","g":"waadu","e":"wedding, marriage","c":"n. m.","ppp":"ودونه","ppf":"wadóona"}, out: { plural: { masc: [ - [{ p: "وادونه", f: "waadóona" }], - [{ p: "وادونو", f: "waadóono" }], + [{ p: "ودونه", f: "wadóona" }], + [{ p: "ودونو", f: "wadóono" }], ], }, }, @@ -655,8 +648,8 @@ const nouns: Array<{ inflections: { fem: [ [{p: "اره", f: "ará"}], - [{p: "ارې", f: "are"}], - [{p: "ارو", f: "aro"}], + [{p: "ارې", f: "aré"}], + [{p: "ارو", f: "aró"}], ], }, }, @@ -672,7 +665,7 @@ const nouns: Array<{ c: "n. f.", i: 10661, app: "مراجع", - apf: "maraají’", + apf: "maraají'", }, out: { inflections: { @@ -682,6 +675,12 @@ const nouns: Array<{ [{p: "مرجعو", f: "marjo"}], ], }, + arabicPlural: { + fem: [ + [{ p: "مراجع", f: "maraají'" }], + [{ p: "مراجو", f: "maraajó" }], + ], + }, }, }, { @@ -700,8 +699,128 @@ const nouns: Array<{ inflections: { fem: [ [{p: "منبع", f: "manbá"}], - [{p: "منبعې", f: "manbe"}], - [{p: "منبعو", f: "manbo"}], + [{p: "منبعې", f: "manbé"}], + [{p: "منبعو", f: "manbó"}], + ], + }, + arabicPlural: { + fem: [ + [{ p: "منابع", f: "manaabí" }], + [{ p: "منابو", f: "manaabó" }], + ], + }, + }, + }, + { + in: {"ts":1527823093,"i":13207,"p":"نبي","f":"nabee","g":"nabee","e":"prophet","c":"n. m. anim.","app":"انبیا","apf":"ambiyáa"}, + out: { + arabicPlural: { + masc: [ + [{ p: "انبیا", f: "ambiyáa" }], + [{ p: "انبیاوو", f: "ambiyáawo" }], + ], + }, + } + }, + { + in: {"ts":1527819536,"i":3063,"p":"تبع","f":"taba'","g":"taba","e":"follower, adherent, supporter, subject, national","c":"n. m. unisex anim.","app":"اتباع","apf":"atbaa"}, + out: { + arabicPlural: { + masc: [ + [{ p: "اتباع", f: "atbaa" }], + [{ p: "اتباعوو", f: "atbaawo" }], + ], + }, + }, + }, + { + in: {"ts":1527816113,"i":3072,"p":"تبلیغ","f":"tableegh","g":"tableegh","e":"propaganda; preaching, evangelism","c":"n. m.","app":"تبلیغات","apf":"tableegháat"}, + out: { + plural: { + masc: [ + [{ p: "تبلیغونه", f: "tableeghóona" }], + [{ p: "تبلیغونو", f: "tableeghóono" }], + ], + }, + arabicPlural: { + masc: [ + [{ p: "تبلیغات", f: "tableegháat" }], + [{ p: "تبلیغاتو", f: "tableegháato" }], + ], + }, + }, + }, + { + in: {"ts":1527815921,"i":3844,"p":"توقع","f":"tawaqqU","g":"tawakkU","e":"expectation, hope, anticipation","c":"n. f.","app":"توقعات","apf":"tawaqqUaat"}, + out: { + arabicPlural: { + masc: [ + [{ p: "توقعات", f: "tawaqqUaat" }], + [{ p: "توقعاتو", f: "tawaqqUaato" }], + ], + }, + }, + }, + { + in: {"ts":1527815820,"i":5177,"p":"حادثه","f":"haadisá","g":"haadisa","e":"accident, event","c":"n. f.","app":"حوادث, حادثات","apf":"hawaadis, haadisaat"}, + out: { + inflections: { + fem: [ + [{ p: "حادثه", f: "haadisá" }], + [{ p: "حادثې", f: "haadisé" }], + [{ p: "حادثو", f: "haadisó" }], + ], + }, + arabicPlural: { + masc: [ + [{ p: "حوادث", f: "hawaadis"}, { p: "حادثات", f: "haadisaat" }], + [{ p: "حوادثو", f: "hawaadiso"}, { p: "حادثاتو", f: "haadisaato" }], + ], + }, + }, + }, + { + in: {"ts":1527815329,"i":3097,"p":"تجربه","f":"tajrabá, tajribá","g":"tajraba,tajriba","e":"experience","c":"n. f.","app":"تجارب","apf":"tajaarib"}, + out: { + inflections: { + fem: [ + [{ p: "تجربه", f: "tajrabá" }], + [{ p: "تجربې", f: "tajrabé" }], + [{ p: "تجربو", f: "tajrabó" }], + ], + }, + arabicPlural: { + masc: [ + [{ p: "تجارب", f: "tajaarib"}], + [{ p: "تجاربو", f: "tajaaribo"}], + ], + }, + }, + }, + { + in: {"ts":1527814069,"i":5194,"p":"حال","f":"haal","g":"haal","e":"state, condition, circumstance","c":"n. m.","app":"احوال","apf":"ahwáal"}, + out: { + plural: { + masc: [ + [{ p: "حالونه", f: "haalóona" }], + [{ p: "حالونو", f: "haalóono" }], + ], + }, + arabicPlural: { + masc: [ + [{ p: "احوال", f: "ahwáal" }], + [{ p: "احوالو", f: "ahwáalo" }], + ], + }, + }, + }, + { + in: {"ts":1527819536,"i":3063,"p":"تبع","f":"taba'","g":"taba","e":"follower, adherent, supporter, subject, national","c":"n. m. unisex anim.","app":"اتباع","apf":"atbáa'"}, + out: { + arabicPlural: { + masc: [ + [{ p: "اتباع", f: "atbáa'" }], + [{ p: "اتباعوو", f: "atbáawo" }], ], }, }, @@ -856,15 +975,14 @@ const nouns: Array<{ c: "n. f.", i: 12205, }, - out: false, - // out: { - // plural: { - // fem: [ - // [{p: "وداع وې", f: "widáawe"}, {p: "وداع ګانې", f: "widaagáane"}], - // [{p: "وداع وو", f: "widáawo"}, {p: "وداع ګانو", f: "widaagáano"}], - // ], - // }, - // }, + out: { + plural: { + fem: [ + [{p: "وداع وې", f: "widáawe"}, {p: "وداع ګانې", f: "widaagáane"}], + [{p: "وداع وو", f: "widáawo"}, {p: "وداع ګانو", f: "widaagáano"}], + ], + }, + }, }, // TODO: Plaar plaroona paaraan - wrooNa // Word with no inflections @@ -881,7 +999,6 @@ const nouns: Array<{ }, out: false, }, - // TODO: WORDS THAT ARE ALREADY PLURAL! ]; const others: T.DictionaryEntry[] = [ @@ -912,6 +1029,7 @@ adjectives.forEach((word) => { }); nouns.forEach((word) => { + // if (word.in.p !== "نبي") return; test(`${word.in.p} should inflect properly`, () => { expect(inflectWord(word.in)).toEqual(word.out); }); @@ -936,4 +1054,4 @@ test(`inflectRegularYeyUnisex should work`, () => { [{p: "لیدونکو", f: "leedóonko"}], ], }); -}) +}); diff --git a/src/lib/pashto-inflector.ts b/src/lib/pashto-inflector.ts index f9cea50..7e01b54 100644 --- a/src/lib/pashto-inflector.ts +++ b/src/lib/pashto-inflector.ts @@ -16,8 +16,13 @@ import { concatPsString, endsInConsonant, endsInAaOrOo, + addOEnding, + endsInShwa, + splitPsByVarients, + removeEndTick, } from "./p-text-helpers"; import { + hasAccents, removeAccents, } from "./accent-helpers"; import * as T from "../types"; @@ -43,6 +48,9 @@ export function inflectWord(word: T.DictionaryEntry): T.InflectorOutput { ) as T.UnisexInflections, }; } + if (w.c && w.c.includes("pl.")) { + return handlePluralNoun(w); + } if (w.c && (w.c.includes("adj.") || w.c.includes("unisex"))) { return handleUnisexWord(w); } @@ -61,24 +69,24 @@ function handleUnisexWord(word: T.DictionaryEntryNoFVars): T.InflectorOutput { // Get last letter of Pashto and last two letters of phonetics // TODO: !!! Handle weird endings / symbols ' etc. const pEnd = word.p.slice(-1); - const plural = makePlural(word); + const plurals = makePlural(word); if (word.infap && word.infaf && word.infbp && word.infbf) { return { inflections: inflectIrregularUnisex(word.p, word.f, [ {p: word.infap, f: word.infaf}, {p: word.infbp, f: word.infbf}, ]), - plural, + ...plurals, }; } if (pEnd === "ی" && word.f.slice(-2) === "ey") { - return { inflections: inflectRegularYeyUnisex(word.p, word.f), plural }; + return { inflections: inflectRegularYeyUnisex(word.p, word.f), ...plurals }; } if (pEnd === "ه" && word.g.slice(-1) === "u") { - return { inflections: inflectRegularShwaEndingUnisex(word.p, word.f), plural }; + return { inflections: inflectRegularShwaEndingUnisex(word.p, word.f), ...plurals }; } if (pEnd === "ی" && word.f.slice(-2) === "éy") { - return { inflections: inflectEmphasizedYeyUnisex(word.p, word.f), plural }; + return { inflections: inflectEmphasizedYeyUnisex(word.p, word.f), ...plurals }; } if ( pashtoConsonants.includes(pEnd) || @@ -86,15 +94,23 @@ function handleUnisexWord(word: T.DictionaryEntryNoFVars): T.InflectorOutput { word.p.slice(-2) === "ای" || (word.p.slice(-1) === "ه" && word.f.slice(-1) === "h") ) { - return { inflections: inflectConsonantEndingUnisex(word.p, word.f), plural }; + return { inflections: inflectConsonantEndingUnisex(word.p, word.f), ...plurals }; } + if (plurals) return plurals; return false; } +function handlePluralNoun(w: T.DictionaryEntryNoFVars): T.InflectorOutput { + if (!w.c || !w.c.includes("n.")) return false; + const plurals = makePlural(w); + if (!plurals) return false; + return { ...plurals }; +} + function handleMascNoun(w: T.DictionaryEntryNoFVars): T.InflectorOutput { // Get last letter of Pashto and last two letters of phonetics // TODO: !!! Handle weird endings / symbols ' etc. - const plural = makePlural(w); + const plurals = makePlural(w); const pEnd = w.p.slice(-1); const fEnd = w.f.slice(-2); if (w.infap && w.infaf && w.infbp && w.infbf) { @@ -103,20 +119,20 @@ function handleMascNoun(w: T.DictionaryEntryNoFVars): T.InflectorOutput { {p: w.infap, f: w.infaf}, {p: w.infbp, f: w.infbf}, ]), - plural, + ...plurals, }; } const isTobEnding = (w.p.slice(-3) === "توب" && ["tób", "tob"].includes(w.f.slice(-3)) && w.p.length > 3); if (isTobEnding) { - return { inflections: inflectTobMasc(w.p, w.f), plural }; + return { inflections: inflectTobMasc(w.p, w.f), ...plurals }; } if (pEnd === "ی" && fEnd === "ey") { - return { inflections: inflectRegularYeyMasc(w.p, w.f), plural }; + return { inflections: inflectRegularYeyMasc(w.p, w.f), ...plurals }; } if (pEnd === "ی" && fEnd === "éy") { - return { inflections: inflectRegularEmphasizedYeyMasc(w.p, w.f), plural }; + return { inflections: inflectRegularEmphasizedYeyMasc(w.p, w.f), ...plurals }; } - return plural ? { plural } : false + return plurals ? { ...plurals } : false } function handleFemNoun(word: T.DictionaryEntryNoFVars): T.InflectorOutput { @@ -126,27 +142,27 @@ function handleFemNoun(word: T.DictionaryEntryNoFVars): T.InflectorOutput { const animate = c.includes("anim."); const pEnd = word.p.slice(-1); - const plural = makePlural(word); + const plurals = makePlural(word); if (endingInHeyOrAynRegex.test(word.p) && endingInSingleARegex.test(word.f)) { - return { inflections: inflectRegularAFem(word.p, word.f), plural }; + return { inflections: inflectRegularAFem(word.p, word.f), ...plurals }; } if (word.p.slice(-1) === "ح" && endingInSingleARegex.test(word.f)) { - return { inflections: inflectRegularAWithHimPEnding(word.p, word.f), plural }; + return { inflections: inflectRegularAWithHimPEnding(word.p, word.f), ...plurals }; } if (pashtoConsonants.includes(pEnd) && !animate) { - return { inflections: inflectRegularInanMissingAFem(word.p, word.f), plural }; + return { inflections: inflectRegularInanMissingAFem(word.p, word.f), ...plurals }; } if (pEnd === "ي" && (!animate)) { - return { inflections: inflectRegularInanEeFem(word.p, word.f), plural }; + return { inflections: inflectRegularInanEeFem(word.p, word.f), ...plurals }; } if (pEnd === "ۍ") { - return { inflections: inflectRegularUyFem(word.p, word.f), plural }; + return { inflections: inflectRegularUyFem(word.p, word.f), ...plurals }; } // if (endingInAlefRegex.test(word.p)) { // return { inflections: inflectRegularAaFem(word.p, f) }; // } - return plural ? { plural } : false; + return plurals ? { ...plurals } : false; } // LEVEL 3 FUNCTIONS @@ -294,13 +310,15 @@ function inflectIrregularMasc(p: string, f: string, inflections: Array<{p: strin } function inflectRegularAFem(p: string, f: string): T.Inflections { - const baseF = ["'", "’"].includes(f.slice(-1)) ? f.slice(0, -2) : f.slice(0, -1); + const withoutTrailingComma = ["'", "’"].includes(f.slice(-1)) ? f.slice(0, -1) : f; + const accentLast = hasAccents(withoutTrailingComma.slice(-1)); + const baseF = withoutTrailingComma.slice(0, -1); const baseP = p.slice(-1) === "ع" ? p : p.slice(0, -1); return { fem: [ [{p, f}], - [{p: `${baseP}ې`, f: `${baseF}e`}], - [{p: `${baseP}و`, f: `${baseF}o`}], + [{p: `${baseP}ې`, f: `${baseF}${accentLast ? "é" : "e"}`}], + [{p: `${baseP}و`, f: `${baseF}${accentLast ? "ó" : "o"}`}], ], }; } @@ -356,53 +374,91 @@ function inflectRegularUyFem(p: string, f: string): T.Inflections { function makePashtoPlural(word: T.DictionaryEntryNoFVars): T.PluralInflections | undefined { if (!(word.ppp && word.ppf)) return undefined; const base = makePsString(word.ppp, word.ppf); - // TODO: Add male Pashto plural + function getBaseAndO(): T.PluralInflectionSet { + return [[base], addOEnding(base)]; + } + if (word.c?.includes("n. m.")) { + return { masc: getBaseAndO() }; + } if (word.c?.includes("n. f.")) { - return { - fem: [ - [base], - // todo: function to add و ending automatically - [concatPsString( - makePsString(base.p.slice(0, -1), base.f.slice(0, -1)), - { p: "و", f: "o" }, - )], - ], - } + return { fem: getBaseAndO() }; } // TODO: handle masculine and unisex return undefined; } -function makePlural(w: T.DictionaryEntryNoFVars): T.PluralInflections | undefined { - // TODO: Include the Pashto plural thing here - const pashtoPlural = makePashtoPlural(w); - if (pashtoPlural) return pashtoPlural; - function addMascPluralSuffix(animate?: boolean): T.PluralInflectionSet { - const base = removeAccents(w); +function makeArabicPlural(word: T.DictionaryEntryNoFVars): T.PluralInflections | undefined { + if (!(word.apf && word.app)) return undefined; + const w = makePsString(word.app, word.apf); + const plural = splitPsByVarients(w); + const end = removeAccents(removeEndTick(word.apf).slice(-1)); + // again typescript being dumb and not letting me use a typed key here + const value = [ + plural, + plural.flatMap(addOEnding) as T.ArrayOneOrMore, + ] as T.PluralInflectionSet; + // feminine words that have arabic plurals stay feminine with the plural - ie مرجع - مراجع + // but masculine words that appear feminine in the plural aren't femening with the Arabic plural - ie. نبي - انبیا + if (["i", "e", "a"].includes(end) && word.c?.includes("n. f.")) { + return { fem: value }; + } + return { masc: value }; +} + +function makePlural(w: T.DictionaryEntryNoFVars): { plural: T.PluralInflections } | { arabicPlural: T.PluralInflections } | undefined { + function addSecondInf(plur: T.ArrayOneOrMore | T.PsString): T.PluralInflectionSet { + if (!Array.isArray(plur)) { + return addSecondInf([plur]); + } return [ - [concatPsString(base, animate ? { p: "ان", f: "áan" } : { p: "ونه", f: "óona" })], - [concatPsString(base, animate ? { p: "انو", f: "áano" } : { p: "ونو", f: "óono" })], + plur, + plur.flatMap(addOEnding) as T.ArrayOneOrMore, ]; + } + if (w.c && w.c.includes("pl.")) { + const plural = addSecondInf(makePsString(w.p, w.f)); + // Typescript being dumb and not letting me do a typed variable for the key + // could try refactoring with an updated TypeScript dependency + if (w.c.includes("n. m.")) return { plural: { masc: plural }}; + if (w.c.includes("n. f.")) return { plural: { fem: plural }}; + } + // TODO: MAKE ARABIC PLURAL HERE IF THERE IS ARABIC PLURAL + const arabicPlural = makeArabicPlural(w); + const pashtoPlural = makePashtoPlural(w); + if (pashtoPlural) return { plural: pashtoPlural, arabicPlural }; + function addMascPluralSuffix(animate?: boolean, shortSquish?: boolean): T.PluralInflectionSet { + if (shortSquish && (w.infap == undefined || w.infaf === undefined)) { + throw new Error(`no irregular inflection info for ${w.p} - ${w.ts}`); + } + const b = removeAccents(shortSquish + ? makePsString((w.infap as string).slice(0, -1), (w.infaf as string).slice(0, -1)) + : w + ); + const base = endsInShwa(b) + ? makePsString(b.p.slice(0, -1), b.f.slice(0, -1)) + : b; + return addSecondInf( + concatPsString(base, (animate && !shortSquish) ? { p: "ان", f: "áan" } : { p: "ونه", f: "óona" }), + ); } function addAnimUnisexPluralSuffix(): T.UnisexSet { const base = removeAccents(w); return { masc: addMascPluralSuffix(true), - fem: [ - [concatPsString(base, { p: "انې", f: "áane" })], - [concatPsString(base, { p: "انو", f: "áano" })], - ], + fem: addSecondInf(concatPsString(base, { p: "انې", f: "áane" })), }; } function addFemLongVowelSuffix(): T.PluralInflectionSet { - const base = makePsString(w.p, w.f); + const base = removeEndTick(makePsString(w.p, w.f)); const baseWOutAccents = removeAccents(base); - return [ - [concatPsString(base, { p: "وې", f: "we" }), concatPsString(baseWOutAccents, { p: "ګانې", f: "gáane" })], - [concatPsString(base, { p: "وو", f: "wo" }), concatPsString(baseWOutAccents, { p: "ګانو", f: "gáano" })], - ]; + const space = (w.p.slice(-1) === "ع" || w.p.slice(-1) === "ه") ? { p: " ", f: "" } : ""; + return addSecondInf([ + concatPsString(base, space, { p: "وې", f: "we" }), + concatPsString(baseWOutAccents, space, { p: "ګانې", f: "gáane" }) + ]); } + const shortSquish = !!w.infap && !w.infap.includes("ا"); const anim = w.c?.includes("anim."); const type = (w.c?.includes("unisex")) ? "unisex noun" @@ -411,19 +467,33 @@ function makePlural(w: T.DictionaryEntryNoFVars): T.PluralInflections | undefine : (w.c?.includes("n. f.")) ? "fem noun" : "other"; - if (type === "unisex noun" && endsInConsonant(w) && (!w.infap) && anim) { - return addAnimUnisexPluralSuffix(); + if (type === "unisex noun") { + if (endsInConsonant(w) && (!w.infap) && anim) { + return { arabicPlural, plural: addAnimUnisexPluralSuffix() }; + } + if (shortSquish) { + return { arabicPlural, plural: { masc: addMascPluralSuffix(anim, shortSquish) }}; + } } - if (type === "masc noun" && endsInConsonant(w) && (!w.infap) && (w.p.slice(-3) !== "توب")) { + if (type === "masc noun" && (shortSquish || (endsInConsonant(w) || endsInShwa(w) && (!w.infap))) && (w.p.slice(-3) !== "توب")) { return { - masc: addMascPluralSuffix(anim), + arabicPlural, + plural: { + masc: addMascPluralSuffix(anim, shortSquish), + }, }; } // TODO: What about endings in long ee / animate at inanimate if (type === "fem noun" && endsInAaOrOo(w) && (!w.infap)) { return { - fem: addFemLongVowelSuffix(), + arabicPlural, + plural: { + fem: addFemLongVowelSuffix(), + }, }; } + if (arabicPlural) { + return { arabicPlural, plural: pashtoPlural }; + } return undefined; } diff --git a/src/lib/standardize-pashto.test.ts b/src/lib/standardize-pashto.test.ts index 483c25f..193ecd9 100644 --- a/src/lib/standardize-pashto.test.ts +++ b/src/lib/standardize-pashto.test.ts @@ -6,7 +6,7 @@ * */ -import { standardizePashto } from "./standardize-pashto"; +import { standardizePashto, standardizePhonetics } from "./standardize-pashto"; const testPairs = [ ["گوگل", "ګوګل"], @@ -31,3 +31,14 @@ testPairs.forEach((pair) => { expect(result).toBe(pair[1]); }); }); + +test("standardizePashto", () => { + const pairs = [ + ["ma’aaloom", "ma'aaloom"], + ["ma‘aaloom", "ma'aaloom"], + ["ma'aaloom", "ma'aaloom"], + ]; + pairs.forEach((x) => { + expect(standardizePhonetics(x[0])).toBe(x[1]) + }); +}) diff --git a/src/lib/standardize-pashto.ts b/src/lib/standardize-pashto.ts index 35aed3f..5952285 100644 --- a/src/lib/standardize-pashto.ts +++ b/src/lib/standardize-pashto.ts @@ -20,3 +20,8 @@ export function standardizePashto(input: string): string { // Replace آ two character version with combined آ character .replace(/آ/g, "آ"); } + +export function standardizePhonetics(input: string): string { + // TODO: check that these are the only kinds of smart comments + return input.replace(/[‘|’]/g, "'"); +} diff --git a/src/types.ts b/src/types.ts index 8702713..2e52ed8 100644 --- a/src/types.ts +++ b/src/types.ts @@ -352,7 +352,12 @@ export type Inflections = GenderedSet; export type PluralInflections = GenderedSet; export type InflectorOutput = { + arabicPlural: PluralInflections, + plural?: PluralInflections, + inflections?: Inflections, +} | { plural: PluralInflections, + arabicPlural?: PluralInflections, inflections?: Inflections, } | { inflections: Inflections,