pashto-inflector/src/lib/pashto-inflector.ts

509 lines
15 KiB
TypeScript
Raw Normal View History

2021-03-09 12:39:13 +00:00
/**
* Copyright (c) 2021 lingdocs.com
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*
*/
import { pashtoConsonants } from "./pashto-consonants";
2021-05-25 09:47:02 +00:00
import {
concatInflections,
splitDoubleWord,
ensureUnisexInflections,
2021-08-31 09:34:18 +00:00
makePsString,
2021-09-07 11:49:57 +00:00
removeFVarients,
concatPsString,
endsInConsonant,
endsInAaOrOo,
2021-09-14 14:25:04 +00:00
addOEnding,
endsInShwa,
splitPsByVarients,
removeEndTick,
2021-05-25 09:47:02 +00:00
} from "./p-text-helpers";
2021-08-31 09:34:18 +00:00
import {
2021-09-14 14:25:04 +00:00
hasAccents,
2021-08-31 09:34:18 +00:00
removeAccents,
} from "./accent-helpers";
2021-03-09 12:39:13 +00:00
import * as T from "../types";
// Matches phonetics ending in a single "a" or "á" (the character before the
// vowel must not be "a"), with an optional "'" before and/or after the vowel.
// NOTE(review): `'??` is a lazy optional quantifier, which behaves exactly like
// `'?` under `.test()`. It may originally have been two different apostrophe
// characters, one of which was lost — confirm against the upstream repository.
const endingInSingleARegex = /[^a]'??[aá]'??$/;
// Matches Pashto script ending in ه or ع where the preceding character is not ا.
const endingInHeyOrAynRegex = /[^ا][هع]$/;
2021-09-07 11:49:57 +00:00
// const endingInAlefRegex = /اع?$/;
2021-03-09 12:39:13 +00:00
2021-09-07 11:49:57 +00:00
/**
 * Entry point: produces the inflections (and any plural forms) for a
 * dictionary entry.
 *
 * The entry is first passed through `removeFVarients`. The `c` field of the
 * result then decides which handler runs, checked in this order:
 * "doub." -> "pl." -> "adj." / "unisex" -> "n. m." -> "n. f.".
 *
 * @param word the dictionary entry to inflect
 * @returns the inflector output, or `false` when the entry is flagged `noInf`
 *          or its `c` field matches none of the categories above
 */
export function inflectWord(word: T.DictionaryEntry): T.InflectorOutput {
    // TODO: What about n. f. / adj. that end in ي ??
    const w = removeFVarients(word);
    if (w.noInf) {
        return false;
    }
    const category = w.c;
    if (category?.includes("doub.")) {
        // inflect each half of the doubled word on its own, then join the results
        const [first, second] = splitDoubleWord(w).map(
            (part) => ensureUnisexInflections(inflectWord(part), part),
        );
        return {
            inflections: concatInflections(
                first.inflections,
                second.inflections,
            ) as T.UnisexInflections,
        };
    }
    if (!category) {
        // no category info, so it can't be a noun/adj
        return false;
    }
    if (category.includes("pl.")) {
        return handlePluralNoun(w);
    }
    if (category.includes("adj.") || category.includes("unisex")) {
        return handleUnisexWord(w);
    }
    if (category.includes("n. m.")) {
        return handleMascNoun(w);
    }
    if (category.includes("n. f.")) {
        return handleFemNoun(w);
    }
    // It's not a noun/adj
    return false;
}
// LEVEL 2 FUNCTIONS
2021-09-07 11:49:57 +00:00
/**
 * Picks the inflection pattern for an adjective / unisex word from the ending
 * of its Pashto script (`p`) and phonetics (`f`), and merges in whatever
 * `makePlural` produced for the word.
 *
 * Patterns are tried in this order: irregular (all four `inf*` fields present),
 * unaccented "ey", ه with `g` ending in "u", accented "éy", consonant-like
 * endings. If none match, the plurals alone are returned, or `false` when
 * there are none.
 */
function handleUnisexWord(word: T.DictionaryEntryNoFVars): T.InflectorOutput {
    // TODO: !!! Handle weird endings / symbols ' etc.
    const lastLetter = word.p.slice(-1);
    const plurals = makePlural(word);
    const { p, f, infap, infaf, infbp, infbf } = word;
    if (infap && infaf && infbp && infbf) {
        return {
            inflections: inflectIrregularUnisex(p, f, [
                { p: infap, f: infaf },
                { p: infbp, f: infbf },
            ]),
            ...plurals,
        };
    }
    const endsInYey = lastLetter === "ی";
    if (endsInYey && f.slice(-2) === "ey") {
        return { inflections: inflectRegularYeyUnisex(p, f), ...plurals };
    }
    // NOTE(review): this reads `g` rather than `f` — presumably an accent-free
    // form of the phonetics; confirm against the DictionaryEntry type.
    if (lastLetter === "ه" && word.g.slice(-1) === "u") {
        return { inflections: inflectRegularShwaEndingUnisex(p, f), ...plurals };
    }
    if (endsInYey && f.slice(-2) === "éy") {
        return { inflections: inflectEmphasizedYeyUnisex(p, f), ...plurals };
    }
    const lastTwoLetters = p.slice(-2);
    const treatAsConsonantEnding =
        pashtoConsonants.includes(lastLetter)
        || lastTwoLetters === "وی"
        || lastTwoLetters === "ای"
        || (lastLetter === "ه" && f.slice(-1) === "h");
    if (treatAsConsonantEnding) {
        return { inflections: inflectConsonantEndingUnisex(p, f), ...plurals };
    }
    return plurals ? plurals : false;
}
2021-09-14 14:25:04 +00:00
/**
 * Handles an entry whose category was already found to contain "pl.".
 * Only entries whose category also contains "n." are processed; the result is
 * a copy of what `makePlural` returns, or `false` if it returns nothing.
 */
function handlePluralNoun(w: T.DictionaryEntryNoFVars): T.InflectorOutput {
    if (!w.c?.includes("n.")) {
        return false;
    }
    const plurals = makePlural(w);
    return plurals ? { ...plurals } : false;
}
2021-09-07 11:49:57 +00:00
/**
 * Picks the inflection pattern for a masculine noun and merges in whatever
 * `makePlural` produced for it.
 *
 * Patterns are tried in this order: irregular (all four `inf*` fields present),
 * the توب ending, unaccented "ey", accented "éy". If none match, the plurals
 * alone are returned, or `false` when there are none.
 */
function handleMascNoun(w: T.DictionaryEntryNoFVars): T.InflectorOutput {
    // TODO: !!! Handle weird endings / symbols ' etc.
    const plurals = makePlural(w);
    const { p, f } = w;
    if (w.infap && w.infaf && w.infbp && w.infbf) {
        return {
            inflections: inflectIrregularMasc(p, f, [
                { p: w.infap, f: w.infaf },
                { p: w.infbp, f: w.infbf },
            ]),
            ...plurals,
        };
    }
    // the word must be longer than the توب ending itself
    const hasTobEnding =
        p.length > 3
        && p.slice(-3) === "توب"
        && ["tób", "tob"].includes(f.slice(-3));
    if (hasTobEnding) {
        return { inflections: inflectTobMasc(p, f), ...plurals };
    }
    if (p.slice(-1) === "ی") {
        const phoneticEnding = f.slice(-2);
        if (phoneticEnding === "ey") {
            return { inflections: inflectRegularYeyMasc(p, f), ...plurals };
        }
        if (phoneticEnding === "éy") {
            return { inflections: inflectRegularEmphasizedYeyMasc(p, f), ...plurals };
        }
    }
    return plurals ? { ...plurals } : false;
}
2021-09-07 11:49:57 +00:00
/**
 * Picks the inflection pattern for a feminine noun and merges in whatever
 * `makePlural` produced for it.
 *
 * Patterns are tried in this order: ه/ع ending with a single "a" in the
 * phonetics, ح ending with a single "a", consonant ending (inanimate only),
 * ي ending (inanimate only), ۍ ending. If none match, the plurals alone are
 * returned, or `false` when there are none.
 */
function handleFemNoun(word: T.DictionaryEntryNoFVars): T.InflectorOutput {
    /* istanbul ignore next */ // will always have word.c at this point
    const c = word.c || "";
    const inanimate = !c.includes("anim.");
    const { p, f } = word;
    const lastLetter = p.slice(-1);
    const plurals = makePlural(word);

    if (endingInHeyOrAynRegex.test(p) && endingInSingleARegex.test(f)) {
        return { inflections: inflectRegularAFem(p, f), ...plurals };
    }
    if (lastLetter === "ح" && endingInSingleARegex.test(f)) {
        return { inflections: inflectRegularAWithHimPEnding(p, f), ...plurals };
    }
    if (inanimate && pashtoConsonants.includes(lastLetter)) {
        return { inflections: inflectRegularInanMissingAFem(p, f), ...plurals };
    }
    if (inanimate && lastLetter === "ي") {
        return { inflections: inflectRegularInanEeFem(p, f), ...plurals };
    }
    if (lastLetter === "ۍ") {
        return { inflections: inflectRegularUyFem(p, f), ...plurals };
    }
    // (disabled) handling for words ending in ا:
    // if (endingInAlefRegex.test(word.p)) {
    //     return { inflections: inflectRegularAaFem(word.p, f) };
    // }
    return plurals ? { ...plurals } : false;
}
// LEVEL 3 FUNCTIONS
/**
 * Builds masculine and feminine inflections for an irregular unisex word.
 *
 * @param p plain form, Pashto script
 * @param f plain form, phonetics
 * @param inflections two stems: `inflections[0]` is used unchanged as the
 *        second masculine form; `inflections[1]` is the stem every suffixed
 *        form (و/o, ه/a, ې/e) is attached to
 */
function inflectIrregularUnisex(p: string, f: string, inflections: Array<{p: string, f: string}>): T.Inflections {
    const [first, second] = inflections;
    // attaches an ending to the second stem
    const onSecondStem = (endP: string, endF: string) => ({
        p: second.p + endP,
        f: second.f + endF,
    });
    return {
        masc: [
            [{ p, f }],
            [{ p: first.p, f: first.f }],
            [onSecondStem("و", "o")],
        ],
        fem: [
            [onSecondStem("ه", "a")],
            [onSecondStem("ې", "e")],
            [onSecondStem("و", "o")],
        ],
    };
}
/**
 * Builds masculine and feminine inflections for a unisex word that ends in
 * ی / unaccented "ey".
 *
 * The last Pashto letter and the last two phonetic characters are removed to
 * get the stem, and the endings below are attached to it.
 */
export function inflectRegularYeyUnisex(p: string, f: string): T.UnisexInflections {
    const stemP = p.slice(0, -1);
    const stemF = f.slice(0, -2);
    const withEnding = (endP: string, endF: string) => ({
        p: stemP + endP,
        f: stemF + endF,
    });
    return {
        masc: [
            [{ p, f }],
            [withEnding("ي", "ee")],
            [
                withEnding("یو", "iyo"),
                withEnding("و", "o"),
            ],
        ],
        fem: [
            [withEnding("ې", "e")],
            [withEnding("ې", "e")],
            [withEnding("و", "o")],
        ],
    };
}
2021-08-31 09:34:18 +00:00
/**
 * Builds masculine and feminine inflections for a unisex word that ends in
 * ه / "u".
 *
 * Accents are removed from the input first; the last character of both the
 * script and the phonetics is then replaced by the (accented) endings below.
 *
 * @param pr the word in Pashto script
 * @param fr the word in phonetics
 */
export function inflectRegularShwaEndingUnisex(pr: string, fr: string): T.UnisexInflections {
    const unaccented = removeAccents(makePsString(pr, fr));
    const stemP = unaccented.p.slice(0, -1);
    const stemF = unaccented.f.slice(0, -1);
    const withEnding = (endP: string, endF: string) => ({
        p: stemP + endP,
        f: stemF + endF,
    });
    return {
        masc: [
            [withEnding("ه", "ú")],
            [withEnding("ه", "ú")],
            [withEnding("و", "ó")],
        ],
        fem: [
            [withEnding("ه", "á")],
            [withEnding("ې", "é")],
            [withEnding("و", "ó")],
        ],
    };
}
2021-03-09 12:39:13 +00:00
/**
 * Builds masculine and feminine inflections for a unisex word that ends in
 * ی / accented "éy".
 *
 * The last Pashto letter and the last two phonetic characters are removed to
 * get the stem, and the endings below are attached to it.
 */
function inflectEmphasizedYeyUnisex(p: string, f: string): T.UnisexInflections {
    const stemP = p.slice(0, -1);
    const stemF = f.slice(0, -2);
    const withEnding = (endP: string, endF: string) => ({
        p: stemP + endP,
        f: stemF + endF,
    });
    return {
        masc: [
            [{ p, f }],
            [withEnding("ي", "ée")],
            [
                withEnding("یو", "iyo"),
                withEnding("و", "ó"),
            ],
        ],
        fem: [
            [withEnding("ۍ", "úy")],
            [withEnding("ۍ", "úy")],
            [
                withEnding("یو", "úyo"),
                withEnding("و", "ó"),
            ],
        ],
    };
}
/**
 * Builds masculine and feminine inflections for a unisex word handled as
 * consonant-ending: nothing is removed from the word, endings are simply
 * appended (the first two masculine forms are the word itself).
 */
function inflectConsonantEndingUnisex(p: string, f: string): T.UnisexInflections {
    const withEnding = (endP: string, endF: string) => ({
        p: p + endP,
        f: f + endF,
    });
    return {
        masc: [
            [{ p, f }],
            [{ p, f }],
            [withEnding("و", "o")],
        ],
        fem: [
            [withEnding("ه", "a")],
            [withEnding("ې", "e")],
            [withEnding("و", "o")],
        ],
    };
}
/**
 * Builds the masculine inflections for a noun that ends in ی / unaccented
 * "ey". The last Pashto letter and the last two phonetic characters are
 * removed to get the stem, and the endings below are attached to it.
 */
function inflectRegularYeyMasc(p: string, f: string): T.Inflections {
    const stemP = p.slice(0, -1);
    const stemF = f.slice(0, -2);
    const withEnding = (endP: string, endF: string) => ({
        p: stemP + endP,
        f: stemF + endF,
    });
    return {
        masc: [
            [{ p, f }],
            [withEnding("ي", "ee")],
            [
                withEnding("یو", "iyo"),
                withEnding("و", "o"),
            ],
        ],
    };
}
/**
 * Builds the masculine inflections for a noun with the توب ending: the last
 * three characters of both script and phonetics are replaced with
 * تابه / "taabu" and تبو / "tabo".
 */
function inflectTobMasc(p: string, f: string): T.Inflections {
    const stemP = p.slice(0, -3);
    const stemF = f.slice(0, -3);
    const withEnding = (endP: string, endF: string) => ({
        p: stemP + endP,
        f: stemF + endF,
    });
    return {
        masc: [
            [{ p, f }],
            [withEnding("تابه", "taabu")],
            [withEnding("تبو", "tabo")],
        ],
    };
}
/**
 * Builds the masculine inflections for a noun that ends in ی / accented "éy".
 *
 * The last Pashto letter and the last two phonetic characters are removed to
 * get the stem, and the endings below are attached to it.
 *
 * The short form of the last set carries the accent ("ó"), matching the
 * masculine forms produced by inflectEmphasizedYeyUnisex for the same kind of
 * ending. The stress sat on the removed "éy", so an unaccented "o" would
 * drop it.
 * NOTE(review): the long form "iyo" is left unaccented here because the
 * unisex counterpart also leaves it unaccented — confirm whether both should
 * carry an accent.
 */
function inflectRegularEmphasizedYeyMasc(p: string, f: string): T.Inflections {
    const baseP = p.slice(0, -1);
    const baseF = f.slice(0, -2);
    return {
        masc: [
            [{ p, f }],
            [{ p: `${baseP}ي`, f: `${baseF}ée` }],
            [
                { p: `${baseP}یو`, f: `${baseF}iyo` },
                { p: `${baseP}و`, f: `${baseF}ó` },
            ],
        ],
    };
}
/**
 * Builds the masculine inflections for an irregular masculine noun.
 *
 * @param p plain form, Pashto script
 * @param f plain form, phonetics
 * @param inflections two stems: `inflections[0]` is used unchanged as the
 *        second form; `inflections[1]` gets و / "o" appended for the third
 */
function inflectIrregularMasc(p: string, f: string, inflections: Array<{p: string, f: string}>): T.Inflections {
    const [first, second] = inflections;
    return {
        masc: [
            [{ p, f }],
            [{ p: first.p, f: first.f }],
            [{ p: second.p + "و", f: second.f + "o" }],
        ],
    };
}
/**
 * Builds the feminine inflections for a noun whose phonetics end in a single
 * "a" / "á" (optionally followed by "'").
 *
 * A trailing "'" is dropped from the phonetics, then the final vowel is
 * replaced by e / o, accented when the replaced vowel was accented. In the
 * Pashto script the last letter is replaced, unless it is ع, in which case
 * the endings are appended to the whole word.
 */
function inflectRegularAFem(p: string, f: string): T.Inflections {
    const lastChar = f.slice(-1);
    // NOTE(review): the comparison with "" only matches an empty `f` and then
    // changes nothing; it looks like a second apostrophe character that was
    // lost at some point — confirm against the upstream repository.
    const withoutTick = (lastChar === "'" || lastChar === "") ? f.slice(0, -1) : f;
    const stressed = hasAccents(withoutTick.slice(-1));
    const stemF = withoutTick.slice(0, -1);
    const stemP = p.slice(-1) === "ع" ? p : p.slice(0, -1);
    const endingE = stressed ? "é" : "e";
    const endingO = stressed ? "ó" : "o";
    return {
        fem: [
            [{ p, f }],
            [{ p: stemP + "ې", f: stemF + endingE }],
            [{ p: stemP + "و", f: stemF + endingO }],
        ],
    };
}
/**
 * Builds the feminine inflections for a noun written with a final ح whose
 * phonetics end in a single "a" / "á" (optionally followed by "'").
 *
 * The Pashto script keeps its last letter and simply gets ې / و appended.
 * In the phonetics a trailing "'" is dropped and the final vowel is replaced
 * by e / o; when that vowel was the accented "á" the replacement is accented
 * too, so the stress is not lost. This mirrors what inflectRegularAFem does
 * for the ه / ع endings.
 */
function inflectRegularAWithHimPEnding(p: string, f: string): T.Inflections {
    // the guarding regex allows a "'" after the final vowel; strip it so that
    // the vowel itself is what gets replaced
    const withoutTick = f.slice(-1) === "'" ? f.slice(0, -1) : f;
    const accentLast = withoutTick.slice(-1) === "á";
    const baseF = withoutTick.slice(0, -1);
    return {
        fem: [
            [{ p, f }],
            [{ p: `${p}ې`, f: `${baseF}${accentLast ? "é" : "e"}` }],
            [{ p: `${p}و`, f: `${baseF}${accentLast ? "ó" : "o"}` }],
        ],
    };
}
/**
 * Builds the feminine inflections for a noun handled as consonant-ending:
 * nothing is removed, ې / "e" and و / "o" are appended to the whole word.
 */
function inflectRegularInanMissingAFem(p: string, f: string): T.Inflections {
    const withEnding = (endP: string, endF: string) => ({
        p: p + endP,
        f: f + endF,
    });
    return {
        fem: [
            [{ p, f }],
            [withEnding("ې", "e")],
            [withEnding("و", "o")],
        ],
    };
}
/**
 * Builds the feminine inflections for a noun that ends in ي (two-character
 * phonetic ending such as "ee" / "ée").
 *
 * The last Pashto letter and the last two phonetic characters are replaced by
 * ۍ / "uy" and یو / "uyo". When the removed phonetic ending carried an accent,
 * the replacement is accented as well ("úy" / "úyo") so the stress is not
 * silently dropped; unaccented input produces the same output as before.
 */
function inflectRegularInanEeFem(p: string, f: string): T.Inflections {
    const baseP = p.slice(0, -1);
    const baseF = f.slice(0, -2);
    // did the ending we are about to replace carry the word's accent?
    const accentLast = /[áéíóú]/.test(f.slice(-2));
    const uy = accentLast ? "úy" : "uy";
    return {
        fem: [
            [{ p, f }],
            [{ p: `${baseP}ۍ`, f: `${baseF}${uy}` }],
            [{ p: `${baseP}یو`, f: `${baseF}${uy}o` }],
        ],
    };
}
/**
 * Builds the feminine inflections for a noun that ends in ۍ (two-character
 * phonetic ending such as "uy" / "úy").
 *
 * The first two forms are the word itself. For the last set the final Pashto
 * letter and the last two phonetic characters are replaced by یو / "uyo" and
 * و / "o". When the removed phonetic ending carried an accent the replacement
 * is accented too ("úyo" / "ó"), which matches the feminine forms built by
 * inflectEmphasizedYeyUnisex; unaccented input produces the same output as
 * before.
 */
function inflectRegularUyFem(p: string, f: string): T.Inflections {
    const baseP = p.slice(0, -1);
    const baseF = f.slice(0, -2);
    // did the ending we are about to replace carry the word's accent?
    const accentLast = /[áéíóú]/.test(f.slice(-2));
    return {
        fem: [
            [{ p, f }],
            [{ p, f }],
            [
                { p: `${baseP}یو`, f: `${baseF}${accentLast ? "úyo" : "uyo"}` },
                { p: `${baseP}و`, f: `${baseF}${accentLast ? "ó" : "o"}` },
            ],
        ],
    };
}
2021-09-07 11:49:57 +00:00
/**
 * Builds plural inflections from the `ppp` / `ppf` fields of an entry.
 *
 * The plural is split into its variants, and paired with a second set made by
 * passing every variant through `addOEnding`. The pair is filed under `masc`
 * for "n. m." entries and under `fem` for "n. f." entries.
 *
 * @returns the plural inflections, or `undefined` when either field is
 *          missing or the entry is neither "n. m." nor "n. f."
 */
function makePashtoPlural(word: T.DictionaryEntryNoFVars): T.PluralInflections | undefined {
    const { ppp, ppf, c } = word;
    if (!ppp || !ppf) {
        return undefined;
    }
    const base = splitPsByVarients(makePsString(ppp, ppf));
    const baseAndO = (): T.PluralInflectionSet => [
        base,
        base.flatMap(addOEnding) as T.ArrayOneOrMore<T.PsString>,
    ];
    if (c?.includes("n. m.")) {
        return { masc: baseAndO() };
    }
    if (c?.includes("n. f.")) {
        return { fem: baseAndO() };
    }
    // TODO: handle masculine and unisex
    return undefined;
}
2021-09-14 14:25:04 +00:00
/**
 * Builds plural inflections from the `app` / `apf` fields of an entry.
 *
 * The plural is split into its variants, and paired with a second set made by
 * passing every variant through `addOEnding`. The pair is filed under `fem`
 * only when the entry is "n. f." AND the plural's phonetics (end tick and
 * accent removed) end in "i", "e" or "a"; otherwise it is filed under `masc`.
 *
 * @returns the plural inflections, or `undefined` when either field is missing
 */
function makeArabicPlural(word: T.DictionaryEntryNoFVars): T.PluralInflections | undefined {
    const { app, apf } = word;
    if (!apf || !app) {
        return undefined;
    }
    const plural = splitPsByVarients(makePsString(app, apf));
    const lastSound = removeAccents(removeEndTick(apf).slice(-1));
    // cast needed: TypeScript won't accept a typed (computed) key below
    const value = [
        plural,
        plural.flatMap(addOEnding) as T.ArrayOneOrMore<T.PsString>,
    ] as T.PluralInflectionSet;
    // feminine words that have Arabic plurals stay feminine in the plural - e.g. مرجع - مراجع
    // but masculine words that appear feminine in the plural aren't feminine with the Arabic plural - e.g. نبي - انبیا
    const staysFeminine = ["i", "e", "a"].includes(lastSound) && word.c?.includes("n. f.");
    if (staysFeminine) {
        return { fem: value };
    }
    return { masc: value };
}
/**
 * Works out the plural forms for an entry.
 *
 * Order of precedence:
 *  1. an entry that is itself plural ("pl." with "n. m." / "n. f.") is its
 *     own plural;
 *  2. an explicit Pashto plural (from `makePashtoPlural`), returned together
 *     with the Arabic plural if there is one;
 *  3. a plural built with a suffix, chosen from the category (unisex / masc /
 *     fem noun), the word's ending and its irregular-inflection info;
 *  4. the Arabic plural alone.
 *
 * @returns an object with `plural` and/or `arabicPlural`, or `undefined` when
 *          no plural can be made
 * @throws Error when a plural must be built from `infap` / `infaf` but one of
 *         them is missing
 */
function makePlural(w: T.DictionaryEntryNoFVars): { plural: T.PluralInflections } | { arabicPlural: T.PluralInflections } | undefined {
    // Pairs a plural (a single form or several variants) with a second set
    // made by passing every variant through addOEnding.
    function addSecondInf(plur: T.ArrayOneOrMore<T.PsString> | T.PsString): T.PluralInflectionSet {
        if (Array.isArray(plur)) {
            return [
                plur,
                plur.flatMap(addOEnding) as T.ArrayOneOrMore<T.PsString>,
            ];
        }
        return addSecondInf([plur]);
    }
    // Masculine plural by suffix: ان / "áan" for animate words that are not
    // "squished", ونه / "óona" otherwise. With `squish` the base is taken from
    // infap / infaf minus their last character, otherwise from the word
    // itself; accents are removed and a final shwa is dropped.
    function addMascPluralSuffix(animate?: boolean, squish?: boolean): T.PluralInflectionSet {
        if (squish && (w.infap === undefined || w.infaf === undefined)) {
            throw new Error(`no irregular inflection info for ${w.p} - ${w.ts}`);
        }
        const unaccented = removeAccents(squish
            ? makePsString((w.infap as string).slice(0, -1), (w.infaf as string).slice(0, -1))
            : w
        );
        const base = endsInShwa(unaccented)
            ? makePsString(unaccented.p.slice(0, -1), unaccented.f.slice(0, -1))
            : unaccented;
        const suffix = (animate && !squish)
            ? { p: "ان", f: "áan" }
            : { p: "ونه", f: "óona" };
        return addSecondInf(concatPsString(base, suffix));
    }
    // Animate unisex plural: the animate masculine suffix plus انې / "áane"
    // for the feminine.
    function addAnimUnisexPluralSuffix(): T.UnisexSet<T.PluralInflectionSet> {
        const base = removeAccents(w);
        return {
            masc: addMascPluralSuffix(true),
            fem: addSecondInf(concatPsString(base, { p: "انې", f: "áane" })),
        };
    }
    // Feminine plural for long-vowel endings: وې / "we" and ګانې / "gáane".
    // When the script ends in ع or ه a space is put before the suffix in the
    // Pashto script (nothing is added to the phonetics).
    function addFemLongVowelSuffix(): T.PluralInflectionSet {
        const base = removeEndTick(makePsString(w.p, w.f));
        const baseWOutAccents = removeAccents(base);
        const lastLetter = w.p.slice(-1);
        const space = (lastLetter === "ع" || lastLetter === "ه") ? { p: " ", f: "" } : "";
        return addSecondInf([
            concatPsString(base, space, { p: "وې", f: "we" }),
            concatPsString(baseWOutAccents, space, { p: "ګانې", f: "gáane" })
        ]);
    }

    const category = w.c;
    if (category && category.includes("pl.")) {
        // the entry is already a plural: use the word itself
        const plural = addSecondInf(makePsString(w.p, w.f));
        // Typescript won't allow a typed variable for the key here;
        // could try refactoring with an updated TypeScript dependency
        if (category.includes("n. m.")) return { plural: { masc: plural } };
        if (category.includes("n. f.")) return { plural: { fem: plural } };
    }
    const arabicPlural = makeArabicPlural(w);
    const pashtoPlural = makePashtoPlural(w);
    if (pashtoPlural) {
        return { plural: pashtoPlural, arabicPlural };
    }

    // "short squish": there is an irregular inflection and it contains no ا
    const shortSquish = !!w.infap && !w.infap.includes("ا");
    const anim = category?.includes("anim.");
    // "unisex" wins over "n. m.", which wins over "n. f."
    const isUnisexNoun = !!category?.includes("unisex");
    const isMascNoun = !isUnisexNoun && !!category?.includes("n. m.");
    const isFemNoun = !isUnisexNoun && !isMascNoun && !!category?.includes("n. f.");

    if (isUnisexNoun) {
        if (endsInConsonant(w) && (!w.infap) && anim) {
            return { arabicPlural, plural: addAnimUnisexPluralSuffix() };
        }
        if (shortSquish) {
            return { arabicPlural, plural: { masc: addMascPluralSuffix(anim, shortSquish) } };
        }
    }
    const mascTakesSuffix =
        isMascNoun
        && (shortSquish || ((endsInConsonant(w) || endsInShwa(w)) && (!w.infap)))
        && (w.p.slice(-3) !== "توب");
    if (mascTakesSuffix) {
        return {
            arabicPlural,
            plural: {
                masc: addMascPluralSuffix(anim, shortSquish),
            },
        };
    }
    // TODO: What about endings in long ee / animate at inanimate
    if (isFemNoun && endsInAaOrOo(w) && (!w.infap)) {
        return {
            arabicPlural,
            plural: {
                fem: addFemLongVowelSuffix(),
            },
        };
    }
    if (arabicPlural) {
        // pashtoPlural is always undefined at this point (see the early return above)
        return { arabicPlural, plural: pashtoPlural };
    }
    return undefined;
}