inflector with plurals beta working!

This commit is contained in:
lingdocs 2021-09-14 18:25:04 +04:00
parent 9baa2d5e58
commit 916bc24487
8 changed files with 505 additions and 108 deletions

View File

@ -118,6 +118,7 @@ export function removeAccents(s: T.PsString | string): T.PsString | string {
* *
* @param s a string of Pashto phonetics * @param s a string of Pashto phonetics
*/ */
export function hasAccents(s: string): boolean { export function hasAccents(s: string | T.PsString): boolean {
if (typeof s !== "string") return hasAccents(s.f);
return accentReplacer.some((x) => s.includes(x.accented)); return accentReplacer.some((x) => s.includes(x.accented));
} }

View File

@ -8,7 +8,6 @@
import { import {
concatPsString, concatPsString,
firstPhonetics,
makePsString, makePsString,
removeEndingL, removeEndingL,
yulEndingInfinitive, yulEndingInfinitive,
@ -22,6 +21,11 @@ import {
removeRetroflexR, removeRetroflexR,
splitDoubleWord, splitDoubleWord,
endsInConsonant, endsInConsonant,
addOEnding,
removeFVarients,
endsInShwa,
removeAynEnding,
splitPsByVarients,
} from "./p-text-helpers"; } from "./p-text-helpers";
import * as T from "../types"; import * as T from "../types";
import { import {
@ -617,9 +621,11 @@ test(`complementInflects`, () => {
})).toBe(false); })).toBe(false);
}); });
test(`firstPhonetics should work`, () => { test(`removeFVarients`, () => {
expect(firstPhonetics("ist'imaal, istimaal")).toBe("ist'imaal"); expect(removeFVarients("ist'imaal, istimaal")).toBe("ist'imaal");
expect(firstPhonetics("kor")).toBe("kor"); expect(removeFVarients({ p: "معالوم", f: "ma'aalóom, maalóom" }))
.toEqual({ p: "معالوم", f: "ma'aalóom" });
expect(removeFVarients("kor")).toBe("kor");
}); });
test(`makePsString should work`, () => { test(`makePsString should work`, () => {
@ -1035,3 +1041,85 @@ test("endsInAConsonant", () => {
does.forEach((x) => expect(endsInConsonant(x)).toBe(true)); does.forEach((x) => expect(endsInConsonant(x)).toBe(true));
doesnt.forEach((x) => expect(endsInConsonant(x)).toBe(false)); doesnt.forEach((x) => expect(endsInConsonant(x)).toBe(false));
}) })
// Table-driven check of addOEnding: every case pairs an input PsString with
// the full array of PsStrings that addOEnding is expected to return for it.
test("addOEnding", () => {
const tests: { in: T.PsString, out: T.PsString[] }[] = [
// plural forms ending in a / consonant / e take a plain (unaccented) o
{
in: { p: "کتابونه", f: "kitaabóona" },
out: [{ p: "کتابونو", f: "kitaabóono" }],
},
{
in: { p: "کارغان", f: "kaargháan" },
out: [{ p: "کارغانو", f: "kaargháano" }],
},
{
in: { p: "کارغانې", f: "kaargháane" },
out: [{ p: "کارغانو", f: "kaargháano" }],
},
{
in: { p: "ښځې", f: "xúdze" },
out: [{ p: "ښځو", f: "xúdzo" }],
},
// TODO: Make this last thing accented??
{
in: { p: "کور", f: "kor" },
out: [{ p: "کورو", f: "koro" }],
},
// ی - éy ending: two variants are expected (long یو form and short و form)
{
in: { p: "سړی", f: "saRéy" },
out: [{ p: "سړیو", f: "saRíyo" }, { p: "سړو", f: "saRó"}],
},
// ۍ - úy ending: a single یو variant is expected
{
in: { p: "افغانۍ", f: "afghaanúy" },
out: [{ p: "افغانیو", f: "afghaanúyo" }],
},
// accented final vowel is replaced by an accented ó
{
in: { p: "اوبه", f: "oobú" },
out: [{ p: "اوبو", f: "oobó" }],
},
{
in: { p: "شودې", f: "shoodé" },
out: [{ p: "شودو", f: "shoodó" }],
},
// final ع with a vowel sound: the ع is dropped along with the vowel
{
in: { p: "منابع", f: "manaabí" },
out: [{ p: "منابو", f: "manaabó" }],
},
// final ا: وو - wo is appended to the unchanged word
{
in: { p: "انبیا", f: "ambiyáa" },
out: [{ p: "انبیاوو", f: "ambiyáawo" }],
},
// a trailing ' in the phonetics is not carried into the output
{
in: { p: "مراجع", f: "maraají'" },
out: [{ p: "مراجو", f: "maraajó" }],
},
// final اع: وو - wo is appended, with or without a trailing ' in the phonetics
{
in: { p: "اتباع", f: "atbaa" },
out: [{ p: "اتباعوو", f: "atbaawo" }],
},
{
in: { p: "اتباع", "f": "atbáa'" },
out: [{ p: "اتباعوو", f: "atbáawo" }],
},
];
// run every case through addOEnding and compare the complete output array
tests.forEach((t) => {
expect(addOEnding(t.in)).toEqual(t.out);
});
});
// endsInShwa should only be true for a final ه whose phonetic ending is u / ú
test("endsInShwa", () => {
    const cases: [T.PsString, boolean][] = [
        [{ p: "ښایسته", f: "xaaystú" }, true],
        [{ p: "ښایسته", f: "xaaystu" }, true],
        [{ p: "ښایسته", f: "xaaysta" }, false],
        [{ p: "کور", f: "kor" }, false],
    ];
    cases.forEach(([word, expected]) => {
        expect(endsInShwa(word)).toBe(expected);
    });
});
// splitPsByVarients should break a comma-separated PsString into one PsString per variant
test("splitPsByVarients", () => {
    const bothVariants = [{ p: "حوادث", f: "hawáadis" }, { p: "حادثات", f: "haadisáat" }];

    const fromLatinComma = splitPsByVarients({ p: "حوادث, حادثات", f: "hawáadis, haadisáat" });
    expect(fromLatinComma).toEqual(bothVariants);

    // should work with Pashto comma too
    const fromPashtoComma = splitPsByVarients({ p: "حوادث، حادثات", f: "hawáadis, haadisáat" });
    expect(fromPashtoComma).toEqual(bothVariants);

    // a word without variants comes back as a one-element array
    const single = splitPsByVarients({ p: "کور", f: "kor" });
    expect(single).toEqual([{ p: "کور", f: "kor" }]);
});

View File

@ -15,8 +15,8 @@ import {
getPersonInflectionsKey, getPersonInflectionsKey,
} from "./misc-helpers"; } from "./misc-helpers";
import * as T from "../types"; import * as T from "../types";
import { removeAccents } from "./accent-helpers"; import { hasAccents, removeAccents } from "./accent-helpers";
import { pashtoConsonants, phoneticsConsonants } from "./pashto-consonants"; import { phoneticsConsonants } from "./pashto-consonants";
import { simplifyPhonetics } from "./simplify-phonetics"; import { simplifyPhonetics } from "./simplify-phonetics";
// export function concatPsStringWithVars(...items: Array<T.PsString | " " | "">): T.PsString[] { // export function concatPsStringWithVars(...items: Array<T.PsString | " " | "">): T.PsString[] {
@ -190,14 +190,12 @@ export function removeFVarients(x: string | T.PsString | T.DictionaryEntry): T.F
return { return {
...x, ...x,
f: removeFVarients(x.f), f: removeFVarients(x.f),
__brand: "name for a dictionary entry with all the phonetics variations removed", } as unknown as T.DictionaryEntryNoFVars;
} as T.DictionaryEntryNoFVars;
} }
return { return {
...x, ...x,
f: removeFVarients(x.f), f: removeFVarients(x.f),
__brand: "name for a ps string with all the phonetics variations removed", } as unknown as T.PsStringNoFVars;
} as T.PsStringNoFVars;
} }
/** /**
@ -796,7 +794,7 @@ export function ensureUnisexInflections(infs: T.InflectorOutput, w: T.Dictionary
export function endsInAaOrOo(w: T.PsString): boolean { export function endsInAaOrOo(w: T.PsString): boolean {
const fEnd = simplifyPhonetics(w.f).slice(-2); const fEnd = simplifyPhonetics(w.f).slice(-2);
const pEnd = w.p.slice(-1); const pEnd = w.p.slice(-1) === "ع" ? w.p.slice(-2, -1) : w.p.slice(-1);
return ( return (
pEnd === "و" && fEnd.endsWith("o") pEnd === "و" && fEnd.endsWith("o")
|| ||
@ -804,7 +802,6 @@ export function endsInAaOrOo(w: T.PsString): boolean {
); );
} }
export function endsInConsonant(w: T.PsString): boolean { export function endsInConsonant(w: T.PsString): boolean {
// TODO: Add reporting back that the plural ending will need a space? // TODO: Add reporting back that the plural ending will need a space?
@ -824,3 +821,105 @@ export function endsInConsonant(w: T.PsString): boolean {
const fCons = phoneticsConsonants.includes(simplifyPhonetics(w.f).slice(-1)); const fCons = phoneticsConsonants.includes(simplifyPhonetics(w.f).slice(-1));
return fCons; return fCons;
} }
/**
 * Adds a و - o ending (used in the 2nd inflection of plurals) to a given PsString.
 *
 * A trailing ' on the phonetics is removed first. Then, depending on the ending:
 *  - ی with ey / éy: returns two variants, یو (iyo / íyo) and و (o / ó)
 *  - ۍ: returns one variant with یو (uyo, or úyo when the ending was úy)
 *  - ا or اع: appends وو - wo to the unchanged word
 *  - ه / ع sounding as a, u, i, U (accented or not), or ې sounding as e / é:
 *    the final letter is dropped before adding و
 *  - anything else: و is appended to the whole word
 * In the last two cases the o is accented (ó) only when the last phonetic
 * letter of the input was accented.
 *
 * @param ps the word to add the ending to
 * @returns one or more PsStrings carrying the و - o ending
 */
export function addOEnding(ps: T.PsString): T.ArrayOneOrMore<T.PsString> {
    const w = removeEndTick(ps);
    const lastLetter = makePsString(
        w.p.slice(-1),
        w.f.slice(-1),
    );
    const fLastTwo = w.f.slice(-2);
    const hasEyEnding = (lastLetter.p === "ی") && ["ey", "éy"].includes(fLastTwo);
    if (hasEyEnding) {
        const base = makePsString(w.p.slice(0, -1), w.f.slice(0, -2));
        const endHadAccent = fLastTwo === "éy";
        return [
            concatPsString(base, { p: "یو", f: endHadAccent ? "íyo" : "iyo" }),
            concatPsString(base, { p: "و", f: endHadAccent ? "ó" : "o" }),
        ];
    }
    if (lastLetter.p === "ۍ") {
        const base = makePsString(w.p.slice(0, -1), w.f.slice(0, -2));
        const endHadAccent = fLastTwo === "úy";
        return [
            concatPsString(base, { p: "یو", f: endHadAccent ? "úyo" : "uyo" }),
        ];
    }
    if (lastLetter.p === "ا" || (w.p.slice(-2) === "اع")) {
        return [concatPsString(w, { p: "وو", f: "wo" })];
    }
    // Inside a character class `|` is a literal pipe, not alternation, so the
    // vowels are listed bare; otherwise a stray "|" would count as a vowel.
    const endsInDroppableVowel = (
        (["ه", "ع"].includes(lastLetter.p) && /[auiUáúíÚ]/.test(lastLetter.f)) ||
        (lastLetter.p === "ې" && ["e", "é"].includes(lastLetter.f))
    );
    const base = endsInDroppableVowel
        ? makePsString(
            w.p.slice(0, -1),
            w.f.slice(0, -1),
        )
        : w;
    return [concatPsString(
        base,
        makePsString(
            "و",
            hasAccents(lastLetter.f) ? "ó" : "o",
        ),
    )];
}
/**
 * Tells whether a word ends in a shwa: the Pashto script must end in ه
 * and the phonetics must end in u or ú
 *
 * @param w the word to check
 * @returns true only when both the script and the phonetics endings match
 */
export function endsInShwa(w: T.PsString): boolean {
    if (w.p.slice(-1) !== "ه") {
        return false;
    }
    const lastSound = w.f.slice(-1);
    return lastSound === "u" || lastSound === "ú";
}
/**
 * Applies the function f to both the p and the f of a PsString
 *
 * The type parameter is named U (not T) so it cannot be confused with the
 * `T` types namespace used in the same signature.
 *
 * @param ps the PsString whose p and f are transformed
 * @param f the function applied to each of the two strings
 * @returns an object holding the transformed p and f
 */
export function mapPsString<U>(ps: T.PsString, f: (s: string) => U): { p: U, f: U } {
    return {
        p: f(ps.p),
        f: f(ps.f),
    };
}
/**
 * Splits up a given PsString by comma-separated variants
 *
 * Both the Latin comma (,) and the Pashto comma (،) act as separators, and
 * each piece is trimmed. The n-th piece of p is paired with the n-th piece of f.
 * Extra pieces in f beyond the number of pieces in p are ignored.
 *
 * @param w a PsString whose p and f may each hold several comma-separated variants
 * @returns one PsString per variant found in p
 * @throws Error when a variant in p has no (non-empty) counterpart in f
 */
export function splitPsByVarients(w: T.PsString): T.ArrayOneOrMore<T.PsString> {
    function cut(s: string): string[] {
        // `|` inside a character class is a literal pipe, so only the two commas are listed
        return s.split(/[,،]/).map((piece) => piece.trim());
    }
    const ps = mapPsString(w, cut);
    return ps.p.map((p, i) => {
        if (!ps.f[i]) throw new Error("uneven comma seperated ps varients: " + JSON.stringify(w));
        return makePsString(
            p,
            ps.f[i],
        );
    }) as T.ArrayOneOrMore<T.PsString>;
}
/**
 * Strips a single trailing ' from a string, or from the phonetics (f) of a PsString
 *
 * @param w a plain string or a PsString
 * @returns the same kind of value, without the trailing ' if there was one
 */
export function removeEndTick(w: T.PsString): T.PsString;
export function removeEndTick(w: string): string;
export function removeEndTick(w: T.PsString | string): T.PsString | string {
    if (typeof w === "string") {
        return w.endsWith("'") ? w.slice(0, -1) : w;
    }
    // for a PsString only the phonetics are touched
    return makePsString(w.p, removeEndTick(w.f));
}

View File

@ -442,19 +442,7 @@ const nouns: Array<{
}, },
// Masculine irregular // Masculine irregular
{ {
in: { in: {"ts":1527813809,"i":11318,"p":"لمونځ","f":"lamoondz","g":"lamoondz","e":"Muslim ritual prayers (namaz, salah, salat)","c":"n. m. irreg.","infap":"لمانځه","infaf":"lamaandzu","infbp":"لمنځ","infbf":"lamandz","ppp":"لمونځونه","ppf":"lamoondzóona"},
ts: 1527813809,
p: "لمونځ",
f: "lamoondz",
g: "",
e: "Muslim ritual prayers (namaz, salah, salat)",
c: "n. m. irreg.",
i: 9835,
infap: "لمانځه",
infaf: "lamaandzu",
infbp: "لمنځ",
infbf: "lamandz",
},
out: { out: {
inflections: { inflections: {
masc: [ masc: [
@ -463,17 +451,17 @@ const nouns: Array<{
[{p: "لمنځو", f: "lamandzo"}], [{p: "لمنځو", f: "lamandzo"}],
], ],
}, },
// plural: { plural: {
// masc: [ masc: [
// [{ p: "لمونځونه", f: "lamoondzóona" }], [{ p: "لمونځونه", f: "lamoondzóona" }],
// [{ p: "لمونځونو", f: "lamoondzóono" }], [{ p: "لمونځونو", f: "lamoondzóono" }],
// ], ],
// }, },
}, },
}, },
// Masculine short squish // Masculine short squish
{ {
in: {"i":9049,"ts":1527813593,"p":"غر","f":"ghar, ghur","g":"ghar,ghur","e":"mountain","c":"n. m.","infap":"غره","infaf":"ghru","infbp":"غرو","infbf":"ghro"}, in: {"i":9049,"ts":1527813593,"p":"غر","f":"ghar, ghur","g":"ghar,ghur","e":"mountain","c":"n. m.","infap":"غره","infaf":"ghru","infbp":"غر","infbf":"ghr"},
out: { out: {
inflections: { inflections: {
masc: [ masc: [
@ -500,6 +488,11 @@ const nouns: Array<{
[{ p: "خره", f: "khru" }], [{ p: "خره", f: "khru" }],
[{ p: "خرو", f: "khro" }], [{ p: "خرو", f: "khro" }],
], ],
fem: [
[{ p: "خره", f: "khra" }],
[{ p: "خرې", f: "khre" }],
[{ p: "خرو", f: "khro" }],
],
}, },
plural: { plural: {
masc: [ masc: [
@ -587,12 +580,12 @@ const nouns: Array<{
}, },
}, },
{ {
in: {"ts":1527815394,"i":13991,"p":"واده","f":"waadú","g":"waadu","e":"wedding, marriage","c":"n. m."}, in: {"ts":1527815394,"i":13991,"p":"واده","f":"waadú","g":"waadu","e":"wedding, marriage","c":"n. m.","ppp":"ودونه","ppf":"wadóona"},
out: { out: {
plural: { plural: {
masc: [ masc: [
[{ p: ادونه", f: "waadóona" }], [{ p: دونه", f: "wadóona" }],
[{ p: ادونو", f: "waadóono" }], [{ p: دونو", f: "wadóono" }],
], ],
}, },
}, },
@ -655,8 +648,8 @@ const nouns: Array<{
inflections: { inflections: {
fem: [ fem: [
[{p: "اره", f: "ará"}], [{p: "اره", f: "ará"}],
[{p: "ارې", f: "are"}], [{p: "ارې", f: "aré"}],
[{p: "ارو", f: "aro"}], [{p: "ارو", f: "aró"}],
], ],
}, },
}, },
@ -672,7 +665,7 @@ const nouns: Array<{
c: "n. f.", c: "n. f.",
i: 10661, i: 10661,
app: "مراجع", app: "مراجع",
apf: "maraají", apf: "maraají'",
}, },
out: { out: {
inflections: { inflections: {
@ -682,6 +675,12 @@ const nouns: Array<{
[{p: "مرجعو", f: "marjo"}], [{p: "مرجعو", f: "marjo"}],
], ],
}, },
arabicPlural: {
fem: [
[{ p: "مراجع", f: "maraají'" }],
[{ p: "مراجو", f: "maraajó" }],
],
},
}, },
}, },
{ {
@ -700,8 +699,128 @@ const nouns: Array<{
inflections: { inflections: {
fem: [ fem: [
[{p: "منبع", f: "manbá"}], [{p: "منبع", f: "manbá"}],
[{p: "منبعې", f: "manbe"}], [{p: "منبعې", f: "manbé"}],
[{p: "منبعو", f: "manbo"}], [{p: "منبعو", f: "manbó"}],
],
},
arabicPlural: {
fem: [
[{ p: "منابع", f: "manaabí" }],
[{ p: "منابو", f: "manaabó" }],
],
},
},
},
{
in: {"ts":1527823093,"i":13207,"p":"نبي","f":"nabee","g":"nabee","e":"prophet","c":"n. m. anim.","app":"انبیا","apf":"ambiyáa"},
out: {
arabicPlural: {
masc: [
[{ p: "انبیا", f: "ambiyáa" }],
[{ p: "انبیاوو", f: "ambiyáawo" }],
],
},
}
},
{
in: {"ts":1527819536,"i":3063,"p":"تبع","f":"taba'","g":"taba","e":"follower, adherent, supporter, subject, national","c":"n. m. unisex anim.","app":"اتباع","apf":"atbaa"},
out: {
arabicPlural: {
masc: [
[{ p: "اتباع", f: "atbaa" }],
[{ p: "اتباعوو", f: "atbaawo" }],
],
},
},
},
{
in: {"ts":1527816113,"i":3072,"p":"تبلیغ","f":"tableegh","g":"tableegh","e":"propaganda; preaching, evangelism","c":"n. m.","app":"تبلیغات","apf":"tableegháat"},
out: {
plural: {
masc: [
[{ p: "تبلیغونه", f: "tableeghóona" }],
[{ p: "تبلیغونو", f: "tableeghóono" }],
],
},
arabicPlural: {
masc: [
[{ p: "تبلیغات", f: "tableegháat" }],
[{ p: "تبلیغاتو", f: "tableegháato" }],
],
},
},
},
{
in: {"ts":1527815921,"i":3844,"p":"توقع","f":"tawaqqU","g":"tawakkU","e":"expectation, hope, anticipation","c":"n. f.","app":"توقعات","apf":"tawaqqUaat"},
out: {
arabicPlural: {
masc: [
[{ p: "توقعات", f: "tawaqqUaat" }],
[{ p: "توقعاتو", f: "tawaqqUaato" }],
],
},
},
},
{
in: {"ts":1527815820,"i":5177,"p":"حادثه","f":"haadisá","g":"haadisa","e":"accident, event","c":"n. f.","app":"حوادث, حادثات","apf":"hawaadis, haadisaat"},
out: {
inflections: {
fem: [
[{ p: "حادثه", f: "haadisá" }],
[{ p: "حادثې", f: "haadisé" }],
[{ p: "حادثو", f: "haadisó" }],
],
},
arabicPlural: {
masc: [
[{ p: "حوادث", f: "hawaadis"}, { p: "حادثات", f: "haadisaat" }],
[{ p: "حوادثو", f: "hawaadiso"}, { p: "حادثاتو", f: "haadisaato" }],
],
},
},
},
{
in: {"ts":1527815329,"i":3097,"p":"تجربه","f":"tajrabá, tajribá","g":"tajraba,tajriba","e":"experience","c":"n. f.","app":"تجارب","apf":"tajaarib"},
out: {
inflections: {
fem: [
[{ p: "تجربه", f: "tajrabá" }],
[{ p: "تجربې", f: "tajrabé" }],
[{ p: "تجربو", f: "tajrabó" }],
],
},
arabicPlural: {
masc: [
[{ p: "تجارب", f: "tajaarib"}],
[{ p: "تجاربو", f: "tajaaribo"}],
],
},
},
},
{
in: {"ts":1527814069,"i":5194,"p":"حال","f":"haal","g":"haal","e":"state, condition, circumstance","c":"n. m.","app":"احوال","apf":"ahwáal"},
out: {
plural: {
masc: [
[{ p: "حالونه", f: "haalóona" }],
[{ p: "حالونو", f: "haalóono" }],
],
},
arabicPlural: {
masc: [
[{ p: "احوال", f: "ahwáal" }],
[{ p: "احوالو", f: "ahwáalo" }],
],
},
},
},
{
in: {"ts":1527819536,"i":3063,"p":"تبع","f":"taba'","g":"taba","e":"follower, adherent, supporter, subject, national","c":"n. m. unisex anim.","app":"اتباع","apf":"atbáa'"},
out: {
arabicPlural: {
masc: [
[{ p: "اتباع", f: "atbáa'" }],
[{ p: "اتباعوو", f: "atbáawo" }],
], ],
}, },
}, },
@ -856,15 +975,14 @@ const nouns: Array<{
c: "n. f.", c: "n. f.",
i: 12205, i: 12205,
}, },
out: false, out: {
// out: { plural: {
// plural: { fem: [
// fem: [ [{p: "وداع وې", f: "widáawe"}, {p: "وداع ګانې", f: "widaagáane"}],
// [{p: "وداع وې", f: "widáawe"}, {p: "وداع ګانې", f: "widaagáane"}], [{p: "وداع وو", f: "widáawo"}, {p: "وداع ګانو", f: "widaagáano"}],
// [{p: "وداع وو", f: "widáawo"}, {p: "وداع ګانو", f: "widaagáano"}], ],
// ], },
// }, },
// },
}, },
// TODO: Plaar plaroona paaraan - wrooNa // TODO: Plaar plaroona paaraan - wrooNa
// Word with no inflections // Word with no inflections
@ -881,7 +999,6 @@ const nouns: Array<{
}, },
out: false, out: false,
}, },
// TODO: WORDS THAT ARE ALREADY PLURAL!
]; ];
const others: T.DictionaryEntry[] = [ const others: T.DictionaryEntry[] = [
@ -912,6 +1029,7 @@ adjectives.forEach((word) => {
}); });
nouns.forEach((word) => { nouns.forEach((word) => {
// if (word.in.p !== "نبي") return;
test(`${word.in.p} should inflect properly`, () => { test(`${word.in.p} should inflect properly`, () => {
expect(inflectWord(word.in)).toEqual(word.out); expect(inflectWord(word.in)).toEqual(word.out);
}); });
@ -936,4 +1054,4 @@ test(`inflectRegularYeyUnisex should work`, () => {
[{p: "لیدونکو", f: "leedóonko"}], [{p: "لیدونکو", f: "leedóonko"}],
], ],
}); });
}) });

View File

@ -16,8 +16,13 @@ import {
concatPsString, concatPsString,
endsInConsonant, endsInConsonant,
endsInAaOrOo, endsInAaOrOo,
addOEnding,
endsInShwa,
splitPsByVarients,
removeEndTick,
} from "./p-text-helpers"; } from "./p-text-helpers";
import { import {
hasAccents,
removeAccents, removeAccents,
} from "./accent-helpers"; } from "./accent-helpers";
import * as T from "../types"; import * as T from "../types";
@ -43,6 +48,9 @@ export function inflectWord(word: T.DictionaryEntry): T.InflectorOutput {
) as T.UnisexInflections, ) as T.UnisexInflections,
}; };
} }
if (w.c && w.c.includes("pl.")) {
return handlePluralNoun(w);
}
if (w.c && (w.c.includes("adj.") || w.c.includes("unisex"))) { if (w.c && (w.c.includes("adj.") || w.c.includes("unisex"))) {
return handleUnisexWord(w); return handleUnisexWord(w);
} }
@ -61,24 +69,24 @@ function handleUnisexWord(word: T.DictionaryEntryNoFVars): T.InflectorOutput {
// Get last letter of Pashto and last two letters of phonetics // Get last letter of Pashto and last two letters of phonetics
// TODO: !!! Handle weird endings / symbols ' etc. // TODO: !!! Handle weird endings / symbols ' etc.
const pEnd = word.p.slice(-1); const pEnd = word.p.slice(-1);
const plural = makePlural(word); const plurals = makePlural(word);
if (word.infap && word.infaf && word.infbp && word.infbf) { if (word.infap && word.infaf && word.infbp && word.infbf) {
return { return {
inflections: inflectIrregularUnisex(word.p, word.f, [ inflections: inflectIrregularUnisex(word.p, word.f, [
{p: word.infap, f: word.infaf}, {p: word.infap, f: word.infaf},
{p: word.infbp, f: word.infbf}, {p: word.infbp, f: word.infbf},
]), ]),
plural, ...plurals,
}; };
} }
if (pEnd === "ی" && word.f.slice(-2) === "ey") { if (pEnd === "ی" && word.f.slice(-2) === "ey") {
return { inflections: inflectRegularYeyUnisex(word.p, word.f), plural }; return { inflections: inflectRegularYeyUnisex(word.p, word.f), ...plurals };
} }
if (pEnd === "ه" && word.g.slice(-1) === "u") { if (pEnd === "ه" && word.g.slice(-1) === "u") {
return { inflections: inflectRegularShwaEndingUnisex(word.p, word.f), plural }; return { inflections: inflectRegularShwaEndingUnisex(word.p, word.f), ...plurals };
} }
if (pEnd === "ی" && word.f.slice(-2) === "éy") { if (pEnd === "ی" && word.f.slice(-2) === "éy") {
return { inflections: inflectEmphasizedYeyUnisex(word.p, word.f), plural }; return { inflections: inflectEmphasizedYeyUnisex(word.p, word.f), ...plurals };
} }
if ( if (
pashtoConsonants.includes(pEnd) || pashtoConsonants.includes(pEnd) ||
@ -86,15 +94,23 @@ function handleUnisexWord(word: T.DictionaryEntryNoFVars): T.InflectorOutput {
word.p.slice(-2) === "ای" || word.p.slice(-2) === "ای" ||
(word.p.slice(-1) === "ه" && word.f.slice(-1) === "h") (word.p.slice(-1) === "ه" && word.f.slice(-1) === "h")
) { ) {
return { inflections: inflectConsonantEndingUnisex(word.p, word.f), plural }; return { inflections: inflectConsonantEndingUnisex(word.p, word.f), ...plurals };
} }
if (plurals) return plurals;
return false; return false;
} }
/**
 * Produces the inflector output for a word that is already plural
 *
 * Returns false when the word's category (c) is missing or does not contain
 * "n.", or when makePlural yields nothing; otherwise returns a copy of
 * whatever makePlural produced.
 */
function handlePluralNoun(w: T.DictionaryEntryNoFVars): T.InflectorOutput {
    const isNoun = !!w.c && w.c.includes("n.");
    if (!isNoun) {
        return false;
    }
    const plurals = makePlural(w);
    return plurals ? { ...plurals } : false;
}
function handleMascNoun(w: T.DictionaryEntryNoFVars): T.InflectorOutput { function handleMascNoun(w: T.DictionaryEntryNoFVars): T.InflectorOutput {
// Get last letter of Pashto and last two letters of phonetics // Get last letter of Pashto and last two letters of phonetics
// TODO: !!! Handle weird endings / symbols ' etc. // TODO: !!! Handle weird endings / symbols ' etc.
const plural = makePlural(w); const plurals = makePlural(w);
const pEnd = w.p.slice(-1); const pEnd = w.p.slice(-1);
const fEnd = w.f.slice(-2); const fEnd = w.f.slice(-2);
if (w.infap && w.infaf && w.infbp && w.infbf) { if (w.infap && w.infaf && w.infbp && w.infbf) {
@ -103,20 +119,20 @@ function handleMascNoun(w: T.DictionaryEntryNoFVars): T.InflectorOutput {
{p: w.infap, f: w.infaf}, {p: w.infap, f: w.infaf},
{p: w.infbp, f: w.infbf}, {p: w.infbp, f: w.infbf},
]), ]),
plural, ...plurals,
}; };
} }
const isTobEnding = (w.p.slice(-3) === "توب" && ["tób", "tob"].includes(w.f.slice(-3)) && w.p.length > 3); const isTobEnding = (w.p.slice(-3) === "توب" && ["tób", "tob"].includes(w.f.slice(-3)) && w.p.length > 3);
if (isTobEnding) { if (isTobEnding) {
return { inflections: inflectTobMasc(w.p, w.f), plural }; return { inflections: inflectTobMasc(w.p, w.f), ...plurals };
} }
if (pEnd === "ی" && fEnd === "ey") { if (pEnd === "ی" && fEnd === "ey") {
return { inflections: inflectRegularYeyMasc(w.p, w.f), plural }; return { inflections: inflectRegularYeyMasc(w.p, w.f), ...plurals };
} }
if (pEnd === "ی" && fEnd === "éy") { if (pEnd === "ی" && fEnd === "éy") {
return { inflections: inflectRegularEmphasizedYeyMasc(w.p, w.f), plural }; return { inflections: inflectRegularEmphasizedYeyMasc(w.p, w.f), ...plurals };
} }
return plural ? { plural } : false return plurals ? { ...plurals } : false
} }
function handleFemNoun(word: T.DictionaryEntryNoFVars): T.InflectorOutput { function handleFemNoun(word: T.DictionaryEntryNoFVars): T.InflectorOutput {
@ -126,27 +142,27 @@ function handleFemNoun(word: T.DictionaryEntryNoFVars): T.InflectorOutput {
const animate = c.includes("anim."); const animate = c.includes("anim.");
const pEnd = word.p.slice(-1); const pEnd = word.p.slice(-1);
const plural = makePlural(word); const plurals = makePlural(word);
if (endingInHeyOrAynRegex.test(word.p) && endingInSingleARegex.test(word.f)) { if (endingInHeyOrAynRegex.test(word.p) && endingInSingleARegex.test(word.f)) {
return { inflections: inflectRegularAFem(word.p, word.f), plural }; return { inflections: inflectRegularAFem(word.p, word.f), ...plurals };
} }
if (word.p.slice(-1) === "ح" && endingInSingleARegex.test(word.f)) { if (word.p.slice(-1) === "ح" && endingInSingleARegex.test(word.f)) {
return { inflections: inflectRegularAWithHimPEnding(word.p, word.f), plural }; return { inflections: inflectRegularAWithHimPEnding(word.p, word.f), ...plurals };
} }
if (pashtoConsonants.includes(pEnd) && !animate) { if (pashtoConsonants.includes(pEnd) && !animate) {
return { inflections: inflectRegularInanMissingAFem(word.p, word.f), plural }; return { inflections: inflectRegularInanMissingAFem(word.p, word.f), ...plurals };
} }
if (pEnd === "ي" && (!animate)) { if (pEnd === "ي" && (!animate)) {
return { inflections: inflectRegularInanEeFem(word.p, word.f), plural }; return { inflections: inflectRegularInanEeFem(word.p, word.f), ...plurals };
} }
if (pEnd === "ۍ") { if (pEnd === "ۍ") {
return { inflections: inflectRegularUyFem(word.p, word.f), plural }; return { inflections: inflectRegularUyFem(word.p, word.f), ...plurals };
} }
// if (endingInAlefRegex.test(word.p)) { // if (endingInAlefRegex.test(word.p)) {
// return { inflections: inflectRegularAaFem(word.p, f) }; // return { inflections: inflectRegularAaFem(word.p, f) };
// } // }
return plural ? { plural } : false; return plurals ? { ...plurals } : false;
} }
// LEVEL 3 FUNCTIONS // LEVEL 3 FUNCTIONS
@ -294,13 +310,15 @@ function inflectIrregularMasc(p: string, f: string, inflections: Array<{p: strin
} }
function inflectRegularAFem(p: string, f: string): T.Inflections { function inflectRegularAFem(p: string, f: string): T.Inflections {
const baseF = ["'", ""].includes(f.slice(-1)) ? f.slice(0, -2) : f.slice(0, -1); const withoutTrailingComma = ["'", ""].includes(f.slice(-1)) ? f.slice(0, -1) : f;
const accentLast = hasAccents(withoutTrailingComma.slice(-1));
const baseF = withoutTrailingComma.slice(0, -1);
const baseP = p.slice(-1) === "ع" ? p : p.slice(0, -1); const baseP = p.slice(-1) === "ع" ? p : p.slice(0, -1);
return { return {
fem: [ fem: [
[{p, f}], [{p, f}],
[{p: `${baseP}ې`, f: `${baseF}e`}], [{p: `${baseP}ې`, f: `${baseF}${accentLast ? "é" : "e"}`}],
[{p: `${baseP}و`, f: `${baseF}o`}], [{p: `${baseP}و`, f: `${baseF}${accentLast ? "ó" : "o"}`}],
], ],
}; };
} }
@ -356,53 +374,91 @@ function inflectRegularUyFem(p: string, f: string): T.Inflections {
function makePashtoPlural(word: T.DictionaryEntryNoFVars): T.PluralInflections | undefined { function makePashtoPlural(word: T.DictionaryEntryNoFVars): T.PluralInflections | undefined {
if (!(word.ppp && word.ppf)) return undefined; if (!(word.ppp && word.ppf)) return undefined;
const base = makePsString(word.ppp, word.ppf); const base = makePsString(word.ppp, word.ppf);
// TODO: Add male Pashto plural function getBaseAndO(): T.PluralInflectionSet {
return [[base], addOEnding(base)];
}
if (word.c?.includes("n. m.")) {
return { masc: getBaseAndO() };
}
if (word.c?.includes("n. f.")) { if (word.c?.includes("n. f.")) {
return { return { fem: getBaseAndO() };
fem: [
[base],
// todo: function to add و ending automatically
[concatPsString(
makePsString(base.p.slice(0, -1), base.f.slice(0, -1)),
{ p: "و", f: "o" },
)],
],
}
} }
// TODO: handle masculine and unisex // TODO: handle masculine and unisex
return undefined; return undefined;
} }
function makePlural(w: T.DictionaryEntryNoFVars): T.PluralInflections | undefined { function makeArabicPlural(word: T.DictionaryEntryNoFVars): T.PluralInflections | undefined {
// TODO: Include the Pashto plural thing here if (!(word.apf && word.app)) return undefined;
const pashtoPlural = makePashtoPlural(w); const w = makePsString(word.app, word.apf);
if (pashtoPlural) return pashtoPlural; const plural = splitPsByVarients(w);
function addMascPluralSuffix(animate?: boolean): T.PluralInflectionSet { const end = removeAccents(removeEndTick(word.apf).slice(-1));
const base = removeAccents(w); // again typescript being dumb and not letting me use a typed key here
const value = [
plural,
plural.flatMap(addOEnding) as T.ArrayOneOrMore<T.PsString>,
] as T.PluralInflectionSet;
// feminine words that have arabic plurals stay feminine with the plural - ie مرجع - مراجع
// but masculine words that appear feminine in the plural aren't femening with the Arabic plural - ie. نبي - انبیا
if (["i", "e", "a"].includes(end) && word.c?.includes("n. f.")) {
return { fem: value };
}
return { masc: value };
}
function makePlural(w: T.DictionaryEntryNoFVars): { plural: T.PluralInflections } | { arabicPlural: T.PluralInflections } | undefined {
function addSecondInf(plur: T.ArrayOneOrMore<T.PsString> | T.PsString): T.PluralInflectionSet {
if (!Array.isArray(plur)) {
return addSecondInf([plur]);
}
return [ return [
[concatPsString(base, animate ? { p: "ان", f: "áan" } : { p: "ونه", f: "óona" })], plur,
[concatPsString(base, animate ? { p: "انو", f: "áano" } : { p: "ونو", f: "óono" })], plur.flatMap(addOEnding) as T.ArrayOneOrMore<T.PsString>,
]; ];
} }
if (w.c && w.c.includes("pl.")) {
const plural = addSecondInf(makePsString(w.p, w.f));
// Typescript being dumb and not letting me do a typed variable for the key
// could try refactoring with an updated TypeScript dependency
if (w.c.includes("n. m.")) return { plural: { masc: plural }};
if (w.c.includes("n. f.")) return { plural: { fem: plural }};
}
// TODO: MAKE ARABIC PLURAL HERE IF THERE IS ARABIC PLURAL
const arabicPlural = makeArabicPlural(w);
const pashtoPlural = makePashtoPlural(w);
if (pashtoPlural) return { plural: pashtoPlural, arabicPlural };
function addMascPluralSuffix(animate?: boolean, shortSquish?: boolean): T.PluralInflectionSet {
if (shortSquish && (w.infap == undefined || w.infaf === undefined)) {
throw new Error(`no irregular inflection info for ${w.p} - ${w.ts}`);
}
const b = removeAccents(shortSquish
? makePsString((w.infap as string).slice(0, -1), (w.infaf as string).slice(0, -1))
: w
);
const base = endsInShwa(b)
? makePsString(b.p.slice(0, -1), b.f.slice(0, -1))
: b;
return addSecondInf(
concatPsString(base, (animate && !shortSquish) ? { p: "ان", f: "áan" } : { p: "ونه", f: "óona" }),
);
}
function addAnimUnisexPluralSuffix(): T.UnisexSet<T.PluralInflectionSet> { function addAnimUnisexPluralSuffix(): T.UnisexSet<T.PluralInflectionSet> {
const base = removeAccents(w); const base = removeAccents(w);
return { return {
masc: addMascPluralSuffix(true), masc: addMascPluralSuffix(true),
fem: [ fem: addSecondInf(concatPsString(base, { p: "انې", f: "áane" })),
[concatPsString(base, { p: "انې", f: "áane" })],
[concatPsString(base, { p: "انو", f: "áano" })],
],
}; };
} }
function addFemLongVowelSuffix(): T.PluralInflectionSet { function addFemLongVowelSuffix(): T.PluralInflectionSet {
const base = makePsString(w.p, w.f); const base = removeEndTick(makePsString(w.p, w.f));
const baseWOutAccents = removeAccents(base); const baseWOutAccents = removeAccents(base);
return [ const space = (w.p.slice(-1) === "ع" || w.p.slice(-1) === "ه") ? { p: " ", f: "" } : "";
[concatPsString(base, { p: "وې", f: "we" }), concatPsString(baseWOutAccents, { p: "ګانې", f: "gáane" })], return addSecondInf([
[concatPsString(base, { p: "وو", f: "wo" }), concatPsString(baseWOutAccents, { p: "ګانو", f: "gáano" })], concatPsString(base, space, { p: "وې", f: "we" }),
]; concatPsString(baseWOutAccents, space, { p: "ګانې", f: "gáane" })
]);
} }
const shortSquish = !!w.infap && !w.infap.includes("ا");
const anim = w.c?.includes("anim."); const anim = w.c?.includes("anim.");
const type = (w.c?.includes("unisex")) const type = (w.c?.includes("unisex"))
? "unisex noun" ? "unisex noun"
@ -411,19 +467,33 @@ function makePlural(w: T.DictionaryEntryNoFVars): T.PluralInflections | undefine
: (w.c?.includes("n. f.")) : (w.c?.includes("n. f."))
? "fem noun" ? "fem noun"
: "other"; : "other";
if (type === "unisex noun" && endsInConsonant(w) && (!w.infap) && anim) { if (type === "unisex noun") {
return addAnimUnisexPluralSuffix(); if (endsInConsonant(w) && (!w.infap) && anim) {
return { arabicPlural, plural: addAnimUnisexPluralSuffix() };
}
if (shortSquish) {
return { arabicPlural, plural: { masc: addMascPluralSuffix(anim, shortSquish) }};
}
} }
if (type === "masc noun" && endsInConsonant(w) && (!w.infap) && (w.p.slice(-3) !== "توب")) { if (type === "masc noun" && (shortSquish || (endsInConsonant(w) || endsInShwa(w) && (!w.infap))) && (w.p.slice(-3) !== "توب")) {
return { return {
masc: addMascPluralSuffix(anim), arabicPlural,
plural: {
masc: addMascPluralSuffix(anim, shortSquish),
},
}; };
} }
// TODO: What about endings in long ee / animate at inanimate // TODO: What about endings in long ee / animate at inanimate
if (type === "fem noun" && endsInAaOrOo(w) && (!w.infap)) { if (type === "fem noun" && endsInAaOrOo(w) && (!w.infap)) {
return { return {
fem: addFemLongVowelSuffix(), arabicPlural,
plural: {
fem: addFemLongVowelSuffix(),
},
}; };
} }
if (arabicPlural) {
return { arabicPlural, plural: pashtoPlural };
}
return undefined; return undefined;
} }

View File

@ -6,7 +6,7 @@
* *
*/ */
import { standardizePashto } from "./standardize-pashto"; import { standardizePashto, standardizePhonetics } from "./standardize-pashto";
const testPairs = [ const testPairs = [
["گوگل", "ګوګل"], ["گوگل", "ګوګل"],
@ -31,3 +31,14 @@ testPairs.forEach((pair) => {
expect(result).toBe(pair[1]); expect(result).toBe(pair[1]);
}); });
}); });
// standardizePhonetics should turn smart single quotes into a plain apostrophe
// and leave an existing plain apostrophe alone.
// The smart quotes are written as unicode escapes so they stay visible.
test("standardizePhonetics", () => {
    const pairs: [string, string][] = [
        ["ma\u2018aaloom", "ma'aaloom"], // left single quotation mark
        ["ma\u2019aaloom", "ma'aaloom"], // right single quotation mark
        ["ma'aaloom", "ma'aaloom"],
    ];
    pairs.forEach(([input, expected]) => {
        expect(standardizePhonetics(input)).toBe(expected);
    });
});

View File

@ -20,3 +20,8 @@ export function standardizePashto(input: string): string {
// Replace آ two character version with combined آ character // Replace آ two character version with combined آ character
.replace(/آ/g, "آ"); .replace(/آ/g, "آ");
} }
/**
 * Normalizes phonetics text by replacing smart single quotes
 * (U+2018 and U+2019) with a plain apostrophe (')
 *
 * @param input phonetics text that may contain smart quotes
 * @returns the text with every smart single quote replaced by '
 */
export function standardizePhonetics(input: string): string {
    // TODO: check that these are the only kinds of smart quotes
    // Written as escapes so the characters stay visible; no `|` in the class,
    // since inside [...] it would match a literal pipe.
    return input.replace(/[\u2018\u2019]/g, "'");
}

View File

@ -352,7 +352,12 @@ export type Inflections = GenderedSet<InflectionSet>;
export type PluralInflections = GenderedSet<PluralInflectionSet>; export type PluralInflections = GenderedSet<PluralInflectionSet>;
export type InflectorOutput = { export type InflectorOutput = {
arabicPlural: PluralInflections,
plural?: PluralInflections,
inflections?: Inflections,
} | {
plural: PluralInflections, plural: PluralInflections,
arabicPlural?: PluralInflections,
inflections?: Inflections, inflections?: Inflections,
} | { } | {
inflections: Inflections, inflections: Inflections,