pashto-inflector/src/lib/pashto-inflector.ts

430 lines
12 KiB
TypeScript
Raw Normal View History

2021-03-09 12:39:13 +00:00
/**
* Copyright (c) 2021 lingdocs.com
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*
*/
import { pashtoConsonants } from "./pashto-consonants";
2021-05-25 09:47:02 +00:00
import {
concatInflections,
splitDoubleWord,
ensureUnisexInflections,
2021-08-31 09:34:18 +00:00
makePsString,
2021-09-07 11:49:57 +00:00
removeFVarients,
concatPsString,
endsInConsonant,
endsInAaOrOo,
2021-05-25 09:47:02 +00:00
} from "./p-text-helpers";
2021-08-31 09:34:18 +00:00
import {
removeAccents,
} from "./accent-helpers";
2021-03-09 12:39:13 +00:00
import * as T from "../types";
// Matches phonetics that end in a single "a" or "á": the character before the final
// vowel must not be "a", and an apostrophe may optionally sit before and/or after that vowel.
const endingInSingleARegex = /[^a]'??[aá]'??$/;
// Matches Pashto script that ends in ه or ع where the letter before it is not ا.
const endingInHeyOrAynRegex = /[^ا][هع]$/;
2021-09-07 11:49:57 +00:00
// const endingInAlefRegex = /اع?$/;
2021-03-09 12:39:13 +00:00
2021-09-07 11:49:57 +00:00
/**
 * Entry point of the inflector: works out which kind of word a dictionary entry is
 * (from its `c` field) and delegates to the matching handler.
 *
 * @param word - the dictionary entry to inflect
 * @returns the handler's output, or `false` when the entry is flagged `noInf` or its
 * `c` field names neither an adjective/unisex word nor a masculine/feminine noun
 */
export function inflectWord(word: T.DictionaryEntry): T.InflectorOutput {
  // TODO: What about n. f. / adj. that end in ي ??
  const entry = removeFVarients(word);
  if (entry.noInf) {
    return false;
  }

  const category = entry.c;

  // Doubled words: inflect each half on its own, then join the two results together
  if (category?.includes("doub.")) {
    const [first, second] = splitDoubleWord(entry).map(
      (half) => ensureUnisexInflections(inflectWord(half), half),
    );
    return {
      inflections: concatInflections(
        first.inflections,
        second.inflections,
      ) as T.UnisexInflections,
    };
  }

  // Without a (non-empty) category there is nothing to tell us it is a noun/adj
  if (!category) {
    return false;
  }
  if (category.includes("adj.") || category.includes("unisex")) {
    return handleUnisexWord(entry);
  }
  if (category.includes("n. m.")) {
    return handleMascNoun(entry);
  }
  if (category.includes("n. f.")) {
    return handleFemNoun(entry);
  }
  // It's not a noun/adj
  return false;
}
// LEVEL 2 FUNCTIONS
2021-09-07 11:49:57 +00:00
/**
 * Inflects a word that takes both masculine and feminine forms (an adjective or a
 * unisex noun), choosing the pattern from the endings of its Pashto script (`p`) and
 * phonetics (`f` / `g`).
 *
 * Patterns are tried in this order: irregular (all four `inf*` fields present),
 * unstressed "ey", "u" ending written with ه, stressed "éy", and finally the
 * consonant-style ending.
 *
 * @param word - the entry to inflect
 * @returns the inflections together with any plural found by `makePlural`; when no
 * inflection pattern applies, the plural alone if there is one, otherwise `false`
 */
function handleUnisexWord(word: T.DictionaryEntryNoFVars): T.InflectorOutput {
  // TODO: !!! Handle weird endings / symbols ' etc.
  const pEnd = word.p.slice(-1);
  const plural = makePlural(word);
  // Irregular words carry their own first and second inflection stems
  if (word.infap && word.infaf && word.infbp && word.infbf) {
    return {
      inflections: inflectIrregularUnisex(word.p, word.f, [
        { p: word.infap, f: word.infaf },
        { p: word.infbp, f: word.infbf },
      ]),
      plural,
    };
  }
  if (pEnd === "ی" && word.f.slice(-2) === "ey") {
    return { inflections: inflectRegularYeyUnisex(word.p, word.f), plural };
  }
  // NOTE(review): this test reads `g` rather than `f` — presumably `g` is an
  // accent-free form of the phonetics; confirm against the entry type.
  if (pEnd === "ه" && word.g.slice(-1) === "u") {
    return { inflections: inflectRegularShwaEndingUnisex(word.p, word.f), plural };
  }
  if (pEnd === "ی" && word.f.slice(-2) === "éy") {
    return { inflections: inflectEmphasizedYeyUnisex(word.p, word.f), plural };
  }
  if (
    pashtoConsonants.includes(pEnd) ||
    word.p.slice(-2) === "وی" ||
    word.p.slice(-2) === "ای" ||
    (word.p.slice(-1) === "ه" && word.f.slice(-1) === "h")
  ) {
    return { inflections: inflectConsonantEndingUnisex(word.p, word.f), plural };
  }
  // No inflection pattern matched. Still hand back a plural if one was found, the
  // same way the masculine and feminine noun handlers do, instead of discarding it.
  return plural ? { plural } : false;
}
2021-09-07 11:49:57 +00:00
/**
 * Inflects a masculine noun, choosing the pattern from the endings of its Pashto
 * script (`p`) and phonetics (`f`).
 *
 * @param w - the entry to inflect
 * @returns the inflections together with any plural found by `makePlural`; when no
 * inflection pattern applies, the plural alone if there is one, otherwise `false`
 */
function handleMascNoun(w: T.DictionaryEntryNoFVars): T.InflectorOutput {
  // TODO: !!! Handle weird endings / symbols ' etc.
  const plural = makePlural(w);
  const { p, f } = w;

  // Irregular nouns carry their own first and second inflection stems
  if (w.infap && w.infaf && w.infbp && w.infbf) {
    return {
      inflections: inflectIrregularMasc(p, f, [
        { p: w.infap, f: w.infaf },
        { p: w.infbp, f: w.infbf },
      ]),
      plural,
    };
  }

  // The توب ending only counts as a suffix when something comes before it
  const fTail = f.slice(-3);
  const hasTobSuffix = p.length > 3
    && p.slice(-3) === "توب"
    && (fTail === "tób" || fTail === "tob");
  if (hasTobSuffix) {
    return { inflections: inflectTobMasc(p, f), plural };
  }

  if (p.slice(-1) === "ی") {
    const fEnding = f.slice(-2);
    if (fEnding === "ey") {
      return { inflections: inflectRegularYeyMasc(p, f), plural };
    }
    if (fEnding === "éy") {
      return { inflections: inflectRegularEmphasizedYeyMasc(p, f), plural };
    }
  }

  if (plural) {
    return { plural };
  }
  return false;
}
2021-09-07 11:49:57 +00:00
/**
 * Inflects a feminine noun, choosing the pattern from the endings of its Pashto
 * script (`p`) and phonetics (`f`), and from whether it is marked animate ("anim.").
 *
 * @param word - the entry to inflect
 * @returns the inflections together with any plural found by `makePlural`; when no
 * inflection pattern applies, the plural alone if there is one, otherwise `false`
 */
function handleFemNoun(word: T.DictionaryEntryNoFVars): T.InflectorOutput {
  /* istanbul ignore next */ // will always have word.c at this point
  const category = word.c || "";
  const isAnimate = category.includes("anim.");
  const { p, f } = word;
  const lastLetter = p.slice(-1);
  const plural = makePlural(word);
  const withPlural = (inflections: T.Inflections) => ({ inflections, plural });

  // Words whose phonetics end in a single "a"
  if (endingInSingleARegex.test(f)) {
    if (endingInHeyOrAynRegex.test(p)) {
      return withPlural(inflectRegularAFem(p, f));
    }
    if (lastLetter === "ح") {
      return withPlural(inflectRegularAWithHimPEnding(p, f));
    }
  }

  // These two patterns are only used for inanimate nouns
  if (!isAnimate) {
    if (pashtoConsonants.includes(lastLetter)) {
      return withPlural(inflectRegularInanMissingAFem(p, f));
    }
    if (lastLetter === "ي") {
      return withPlural(inflectRegularInanEeFem(p, f));
    }
  }

  if (lastLetter === "ۍ") {
    return withPlural(inflectRegularUyFem(p, f));
  }

  // (A pattern for words ending in alef is not handled here yet.)
  if (plural) {
    return { plural };
  }
  return false;
}
// LEVEL 3 FUNCTIONS
/**
 * Builds masculine and feminine inflections for an irregular unisex word.
 *
 * @param p - Pashto script of the plain form
 * @param f - phonetics of the plain form
 * @param inflections - two stems: the first is used as-is for the masculine first
 * inflection; the second is the stem every other form is built on
 * @returns masculine and feminine inflection sets (plain, first, second inflection)
 */
function inflectIrregularUnisex(p: string, f: string, inflections: Array<{p: string, f: string}>): T.Inflections {
  const [firstStem, secondStem] = inflections;
  // Attach an ending to the second stem
  const onSecondStem = (pEnding: string, fEnding: string) => ({
    p: `${secondStem.p}${pEnding}`,
    f: `${secondStem.f}${fEnding}`,
  });
  return {
    masc: [
      [{ p, f }],
      [{ p: firstStem.p, f: firstStem.f }],
      [onSecondStem("و", "o")],
    ],
    fem: [
      [onSecondStem("ه", "a")],
      [onSecondStem("ې", "e")],
      [onSecondStem("و", "o")],
    ],
  };
}
/**
 * Builds masculine and feminine inflections for a unisex word with the unstressed
 * "ey" ending. The last Pashto letter and the last two phonetic characters are
 * dropped to get the stem.
 *
 * @param p - Pashto script of the plain form
 * @param f - phonetics of the plain form
 * @returns masculine and feminine inflection sets (plain, first, second inflection)
 */
export function inflectRegularYeyUnisex(p: string, f: string): T.UnisexInflections {
  const stemP = p.slice(0, -1);
  const stemF = f.slice(0, -2);
  const form = (pEnding: string, fEnding: string) => ({
    p: stemP + pEnding,
    f: stemF + fEnding,
  });
  return {
    masc: [
      [{ p, f }],
      [form("ي", "ee")],
      [form("یو", "iyo"), form("و", "o")],
    ],
    fem: [
      [form("ې", "e")],
      [form("ې", "e")],
      [form("و", "o")],
    ],
  };
}
2021-08-31 09:34:18 +00:00
/**
 * Builds masculine and feminine inflections for a unisex word handled by the
 * "shwa ending" pattern. Accents are removed from the input first, then the last
 * character of both script and phonetics is dropped and the accented endings attached.
 *
 * @param pr - Pashto script of the plain form
 * @param fr - phonetics of the plain form
 * @returns masculine and feminine inflection sets (plain, first, second inflection)
 */
export function inflectRegularShwaEndingUnisex(pr: string, fr: string): T.UnisexInflections {
  const unaccented = removeAccents(makePsString(pr, fr));
  const stemP = unaccented.p.slice(0, -1);
  const stemF = unaccented.f.slice(0, -1);
  const form = (pEnding: string, fEnding: string) => ({
    p: stemP + pEnding,
    f: stemF + fEnding,
  });
  return {
    masc: [
      [form("ه", "ú")],
      [form("ه", "ú")],
      [form("و", "ó")],
    ],
    fem: [
      [form("ه", "á")],
      [form("ې", "é")],
      [form("و", "ó")],
    ],
  };
}
2021-03-09 12:39:13 +00:00
/**
 * Builds masculine and feminine inflections for a unisex word with the stressed
 * "éy" ending. The last Pashto letter and the last two phonetic characters are
 * dropped to get the stem.
 *
 * @param p - Pashto script of the plain form
 * @param f - phonetics of the plain form
 * @returns masculine and feminine inflection sets (plain, first, second inflection)
 */
function inflectEmphasizedYeyUnisex(p: string, f: string): T.UnisexInflections {
  const stemP = p.slice(0, -1);
  const stemF = f.slice(0, -2);
  const form = (pEnding: string, fEnding: string) => ({
    p: stemP + pEnding,
    f: stemF + fEnding,
  });
  return {
    masc: [
      [{ p, f }],
      [form("ي", "ée")],
      [form("یو", "iyo"), form("و", "ó")],
    ],
    fem: [
      [form("ۍ", "úy")],
      [form("ۍ", "úy")],
      [form("یو", "úyo"), form("و", "ó")],
    ],
  };
}
/**
 * Builds masculine and feminine inflections for a unisex word handled by the
 * consonant-ending pattern: endings are appended to the unchanged word.
 *
 * @param p - Pashto script of the plain form
 * @param f - phonetics of the plain form
 * @returns masculine and feminine inflection sets (plain, first, second inflection)
 */
function inflectConsonantEndingUnisex(p: string, f: string): T.UnisexInflections {
  const extended = (pEnding: string, fEnding: string) => ({
    p: p + pEnding,
    f: f + fEnding,
  });
  return {
    masc: [
      // Plain and first inflection are both the bare word
      [{ p, f }],
      [{ p, f }],
      [extended("و", "o")],
    ],
    fem: [
      [extended("ه", "a")],
      [extended("ې", "e")],
      [extended("و", "o")],
    ],
  };
}
/**
 * Builds the inflections of a masculine noun with the unstressed "ey" ending.
 * The last Pashto letter and the last two phonetic characters are dropped to get
 * the stem.
 *
 * @param p - Pashto script of the plain form
 * @param f - phonetics of the plain form
 * @returns the masculine inflection set (plain, first, second inflection)
 */
function inflectRegularYeyMasc(p: string, f: string): T.Inflections {
  const stemP = p.slice(0, -1);
  const stemF = f.slice(0, -2);
  const form = (pEnding: string, fEnding: string) => ({
    p: stemP + pEnding,
    f: stemF + fEnding,
  });
  return {
    masc: [
      [{ p, f }],
      [form("ي", "ee")],
      [form("یو", "iyo"), form("و", "o")],
    ],
  };
}
/**
 * Builds the inflections of a masculine noun ending in the توب suffix. The last
 * three characters of both script and phonetics are replaced by the inflected
 * forms of the suffix.
 *
 * @param p - Pashto script of the plain form
 * @param f - phonetics of the plain form
 * @returns the masculine inflection set (plain, first, second inflection)
 */
function inflectTobMasc(p: string, f: string): T.Inflections {
  const stemP = p.slice(0, -3);
  const stemF = f.slice(0, -3);
  const firstInflection = { p: stemP + "تابه", f: stemF + "taabu" };
  const secondInflection = { p: stemP + "تبو", f: stemF + "tabo" };
  return {
    masc: [
      [{ p, f }],
      [firstInflection],
      [secondInflection],
    ],
  };
}
/**
 * Builds the inflections of a masculine noun with the stressed "éy" ending.
 * The last Pashto letter and the last two phonetic characters are dropped to get
 * the stem.
 *
 * @param p - Pashto script of the plain form
 * @param f - phonetics of the plain form
 * @returns the masculine inflection set (plain, first, second inflection)
 */
function inflectRegularEmphasizedYeyMasc(p: string, f: string): T.Inflections {
  const baseP = p.slice(0, -1);
  const baseF = f.slice(0, -2);
  return {
    masc: [
      [{ p, f }],
      [{ p: `${baseP}ي`, f: `${baseF}ée` }],
      [
        { p: `${baseP}یو`, f: `${baseF}iyo` },
        // The stressed ending was stripped with "éy", so the short form carries
        // the accent itself, matching the masculine forms of the unisex variant.
        { p: `${baseP}و`, f: `${baseF}ó` },
      ],
    ],
  };
}
/**
 * Builds the inflections of an irregular masculine noun.
 *
 * @param p - Pashto script of the plain form
 * @param f - phonetics of the plain form
 * @param inflections - two stems: the first is used as-is for the first inflection;
 * the second gets the و / "o" ending for the second inflection
 * @returns the masculine inflection set (plain, first, second inflection)
 */
function inflectIrregularMasc(p: string, f: string, inflections: Array<{p: string, f: string}>): T.Inflections {
  const [firstStem, secondStem] = inflections;
  return {
    masc: [
      [{ p, f }],
      [{ p: firstStem.p, f: firstStem.f }],
      [{ p: secondStem.p + "و", f: secondStem.f + "o" }],
    ],
  };
}
/**
 * Builds the inflections of a feminine noun whose phonetics end in "a".
 *
 * The phonetic stem drops the final vowel, plus a trailing apostrophe when there is
 * one. The script stem drops the final letter unless that letter is ع, which is kept.
 *
 * @param p - Pashto script of the plain form
 * @param f - phonetics of the plain form
 * @returns the feminine inflection set (plain, first, second inflection)
 */
function inflectRegularAFem(p: string, f: string): T.Inflections {
  const lastF = f.slice(-1);
  const charsToDrop = (lastF === "'" || lastF === "") ? 2 : 1;
  const stemF = f.slice(0, -charsToDrop);
  const stemP = p.slice(-1) === "ع" ? p : p.slice(0, -1);
  return {
    fem: [
      [{ p, f }],
      [{ p: stemP + "ې", f: stemF + "e" }],
      [{ p: stemP + "و", f: stemF + "o" }],
    ],
  };
}
/**
 * Builds the inflections of a feminine noun whose script ends in ح and whose
 * phonetics end in "a". The script keeps its final letter; the phonetics drop their
 * last character before the endings are attached.
 *
 * @param p - Pashto script of the plain form
 * @param f - phonetics of the plain form
 * @returns the feminine inflection set (plain, first, second inflection)
 */
function inflectRegularAWithHimPEnding(p: string, f: string): T.Inflections {
  const stemF = f.slice(0, -1);
  const form = (pEnding: string, fEnding: string) => ({
    p: p + pEnding,
    f: stemF + fEnding,
  });
  return {
    fem: [
      [{ p, f }],
      [form("ې", "e")],
      [form("و", "o")],
    ],
  };
}
/**
 * Builds the inflections of an inanimate feminine noun ending in a consonant:
 * endings are appended to the unchanged word.
 *
 * @param p - Pashto script of the plain form
 * @param f - phonetics of the plain form
 * @returns the feminine inflection set (plain, first, second inflection)
 */
function inflectRegularInanMissingAFem(p: string, f: string): T.Inflections {
  const extended = (pEnding: string, fEnding: string) => ({
    p: p + pEnding,
    f: f + fEnding,
  });
  return {
    fem: [
      [{ p, f }],
      [extended("ې", "e")],
      [extended("و", "o")],
    ],
  };
}
/**
 * Builds the inflections of an inanimate feminine noun ending in ي. The last Pashto
 * letter and the last two phonetic characters are dropped to get the stem.
 *
 * @param p - Pashto script of the plain form
 * @param f - phonetics of the plain form
 * @returns the feminine inflection set (plain, first, second inflection)
 */
function inflectRegularInanEeFem(p: string, f: string): T.Inflections {
  const stemP = p.slice(0, -1);
  const stemF = f.slice(0, -2);
  const form = (pEnding: string, fEnding: string) => ({
    p: stemP + pEnding,
    f: stemF + fEnding,
  });
  return {
    fem: [
      [{ p, f }],
      [form("ۍ", "uy")],
      [form("یو", "uyo")],
    ],
  };
}
/**
 * Builds the inflections of a feminine noun ending in ۍ. The first inflection is
 * identical to the plain form; the second inflection has a long and a short variant
 * built on the stem (last Pashto letter and last two phonetic characters removed).
 *
 * @param p - Pashto script of the plain form
 * @param f - phonetics of the plain form
 * @returns the feminine inflection set (plain, first, second inflection)
 */
function inflectRegularUyFem(p: string, f: string): T.Inflections {
  const stemP = p.slice(0, -1);
  const stemF = f.slice(0, -2);
  const form = (pEnding: string, fEnding: string) => ({
    p: stemP + pEnding,
    f: stemF + fEnding,
  });
  return {
    fem: [
      [{ p, f }],
      [{ p, f }],
      [form("یو", "uyo"), form("و", "o")],
    ],
  };
}
2021-09-07 11:49:57 +00:00
/**
 * Builds plural inflections from the plural recorded on the entry itself
 * (`ppp` / `ppf`). Only feminine nouns ("n. f.") are supported so far.
 *
 * @param word - the entry to read the plural from
 * @returns the feminine plural set (the plural, and the plural with its last
 * character replaced by the و / "o" ending), or `undefined` when the entry has no
 * complete plural or is not a feminine noun
 */
function makePashtoPlural(word: T.DictionaryEntryNoFVars): T.PluralInflections | undefined {
  const { ppp, ppf } = word;
  if (!ppp || !ppf) {
    return undefined;
  }
  const plural = makePsString(ppp, ppf);
  // TODO: Add male Pashto plural
  // TODO: handle masculine and unisex
  if (!word.c?.includes("n. f.")) {
    return undefined;
  }
  // todo: function to add و ending automatically
  const pluralStem = makePsString(plural.p.slice(0, -1), plural.f.slice(0, -1));
  return {
    fem: [
      [plural],
      [concatPsString(pluralStem, { p: "و", f: "o" })],
    ],
  };
}
/**
 * Works out the plural forms of an entry.
 *
 * A plural recorded on the entry (see `makePashtoPlural`) wins. Otherwise a plural is
 * generated by suffix for three cases, all requiring that `infap` is not set:
 *  - animate unisex nouns ending in a consonant (masculine and feminine sets)
 *  - masculine nouns ending in a consonant, other than those ending in توب
 *  - feminine nouns for which `endsInAaOrOo` holds
 *
 * @param w - the entry to pluralize
 * @returns the plural inflections, or `undefined` when none of the cases applies
 */
function makePlural(w: T.DictionaryEntryNoFVars): T.PluralInflections | undefined {
  // TODO: Include the Pashto plural thing here
  const recordedPlural = makePashtoPlural(w);
  if (recordedPlural) {
    return recordedPlural;
  }

  // Masculine plural: animate and inanimate nouns take different suffixes
  const mascSuffixed = (animate?: boolean): T.PluralInflectionSet => {
    const stem = removeAccents(w);
    return [
      [concatPsString(stem, animate ? { p: "ان", f: "áan" } : { p: "ونه", f: "óona" })],
      [concatPsString(stem, animate ? { p: "انو", f: "áano" } : { p: "ونو", f: "óono" })],
    ];
  };

  // Animate unisex plural: the animate masculine set plus a feminine set
  const animUnisexSuffixed = (): T.UnisexSet<T.PluralInflectionSet> => {
    const stem = removeAccents(w);
    return {
      masc: mascSuffixed(true),
      fem: [
        [concatPsString(stem, { p: "انې", f: "áane" })],
        [concatPsString(stem, { p: "انو", f: "áano" })],
      ],
    };
  };

  // Feminine plural with two variants per row: the first is built on the word as
  // given, the second on the word with its accents removed
  const femLongVowelSuffixed = (): T.PluralInflectionSet => {
    const asGiven = makePsString(w.p, w.f);
    const unaccented = removeAccents(asGiven);
    return [
      [
        concatPsString(asGiven, { p: "وې", f: "we" }),
        concatPsString(unaccented, { p: "ګانې", f: "gáane" }),
      ],
      [
        concatPsString(asGiven, { p: "وو", f: "wo" }),
        concatPsString(unaccented, { p: "ګانو", f: "gáano" }),
      ],
    ];
  };

  const category = w.c;
  const isAnimate = category?.includes("anim.");

  // The first matching word type decides; "unisex" takes precedence over the
  // masculine and feminine noun markers
  if (category?.includes("unisex")) {
    if (endsInConsonant(w) && !w.infap && isAnimate) {
      return animUnisexSuffixed();
    }
    return undefined;
  }
  if (category?.includes("n. m.")) {
    if (endsInConsonant(w) && !w.infap && w.p.slice(-3) !== "توب") {
      return { masc: mascSuffixed(isAnimate) };
    }
    return undefined;
  }
  if (category?.includes("n. f.")) {
    // TODO: What about endings in long ee / animate at inanimate
    if (endsInAaOrOo(w) && !w.infap) {
      return { fem: femLongVowelSuffixed() };
    }
    return undefined;
  }
  return undefined;
}