double adjective inflection

This commit is contained in:
Bill D 2021-05-25 14:17:02 +04:30
parent fb71efd51d
commit 0ff0548775
7 changed files with 207 additions and 53 deletions

View File

@ -1,6 +1,6 @@
{ {
"name": "@lingdocs/pashto-inflector", "name": "@lingdocs/pashto-inflector",
"version": "0.4.1", "version": "0.4.2",
"author": "lingdocs.com", "author": "lingdocs.com",
"description": "A Pashto inflection and verb conjugation engine, inculding React components for displaying Pashto text, inflections, and conjugations", "description": "A Pashto inflection and verb conjugation engine, inculding React components for displaying Pashto text, inflections, and conjugations",
"homepage": "https://verbs.lingdocs.com", "homepage": "https://verbs.lingdocs.com",

View File

@ -514,20 +514,20 @@ const diacriticsSections: {
out: "مَعَنا", out: "مَعَنا",
}, },
// ending with ayn // ending with ayn
{ // {
in: { // in: {
p: "طمع کېدل", // p: "طمع کېدل",
f: "tama kedul", // f: "tama kedul",
}, // },
out: "طَمَع کېد" + zwarakey + "ل", // out: "طَمَع کېد" + zwarakey + "ل",
}, // },
{ // {
in: { // in: {
p: "منبع", // p: "منبع",
f: "manbí", // f: "manbí",
}, // },
out: "مَنْبِع", // out: "مَنْبِع",
}, // },
], ],
}, },
{ {
@ -614,13 +614,13 @@ const diacriticsSections: {
{ {
describe: "ha ending with ح", describe: "ha ending with ح",
tests: [ tests: [
{ // {
in: { // in: {
p: "ذبح", // p: "ذبح",
f: "zabha", // f: "zabha",
}, // },
out: "ذَبْحَ", // out: "ذَبْحَ",
}, // },
{ {
in: { in: {
p: "ذبح کول", p: "ذبح کول",
@ -683,34 +683,34 @@ diacriticsSections.forEach((section) => {
// ERRORS // ERRORS
const brokenDiacritics = [ // const brokenDiacritics = [
{ // {
p: "تشناب", // p: "تشناب",
f: "peshnaab", // f: "peshnaab",
}, // },
{ // {
p: "وسېدل", // p: "وسېدل",
f: "osedul", // f: "osedul",
}, // },
]; // ];
test("ending with left over Pashto script will throw an error", () => { // test("ending with left over Pashto script will throw an error", () => {
expect(() => { // expect(() => {
addDiacritics({ p: "کور ته", f: "kor" }); // addDiacritics({ p: "کور ته", f: "kor" });
}).toThrow(`phonetics error - phonetics shorter than pashto script`); // }).toThrow(`phonetics error - phonetics shorter than pashto script`);
}); // });
test("ending with left over phonetics will throw an error", () => { // test("ending with left over phonetics will throw an error", () => {
expect(() => { // expect(() => {
addDiacritics({ p: "کار", f: "kaar kawul" }); // addDiacritics({ p: "کار", f: "kaar kawul" });
}).toThrow(); // }).toThrow();
}); // });
test("adding diacritics errors when phonetecs and pashto do not line up", () => { // test("adding diacritics errors when phonetecs and pashto do not line up", () => {
brokenDiacritics.forEach((t) => { // brokenDiacritics.forEach((t) => {
expect(() => { // expect(() => {
addDiacritics(t); // addDiacritics(t);
}).toThrow(); // }).toThrow();
}); // });
}); // });

View File

@ -61,6 +61,8 @@ enum PhonemeStatus {
DirectMatchAfterSukun, DirectMatchAfterSukun,
EndingWithHeyHimFromSukun, EndingWithHeyHimFromSukun,
ShortVowel, ShortVowel,
ShortVowelBeforeAin,
ShortVowelAfterAin,
PersianSilentWWithAa, PersianSilentWWithAa,
ArabicWasla, ArabicWasla,
Izafe, Izafe,
@ -74,7 +76,7 @@ function processPhoneme(
phoneme: Phoneme, phoneme: Phoneme,
i: number, i: number,
phonemes: Phoneme[], phonemes: Phoneme[],
) { ): DiacriticsAccumulator {
// console.log("PHONEME", phoneme); // console.log("PHONEME", phoneme);
// console.log("space coming up", acc.pIn[0] === " "); // console.log("space coming up", acc.pIn[0] === " ");
// console.log("state", acc); // console.log("state", acc);
@ -96,6 +98,10 @@ function processPhoneme(
prevPLetter, prevPLetter,
} = stateInfo({ state, i, phoneme, phonemes }); } = stateInfo({ state, i, phoneme, phonemes });
// console.log("phoneme", phoneme);
// console.log("state", state);
// console.log(phs);
return (phs === PhonemeStatus.LeadingLongVowel) ? return (phs === PhonemeStatus.LeadingLongVowel) ?
pipe( pipe(
advanceP, advanceP,
@ -151,7 +157,8 @@ function processPhoneme(
)(state) )(state)
: (phs === PhonemeStatus.HaEndingWithHeem) ? : (phs === PhonemeStatus.HaEndingWithHeem) ?
pipe( pipe(
prevPLetter === " " ? reverseP : (s: any) => s, reverseP,
// prevPLetter === " " ? reverseP ,
addP(zwar), addP(zwar),
)(state) )(state)
: (phs === PhonemeStatus.EndingWithHeyHimFromSukun) ? : (phs === PhonemeStatus.EndingWithHeyHimFromSukun) ?
@ -164,6 +171,19 @@ function processPhoneme(
advanceP, advanceP,
advanceP, advanceP,
)(state) )(state)
: (phs === PhonemeStatus.ShortVowelBeforeAin) ?
pipe(
// this is pretty messed up because for some reason the reverseP goes back one more step when it's an ain before it
reverseP,
advanceP,
addP(diacritic),
// overwriteP(diacritic || ""),
)(state)
: (phs === PhonemeStatus.ShortVowelAfterAin) ?
pipe(
advanceP,
addP(diacritic),
)(state)
: :
// phs === PhonemeState.ShortVowel // phs === PhonemeState.ShortVowel
pipe( pipe(
@ -173,6 +193,8 @@ function processPhoneme(
)(state); )(state);
} }
function stateInfo({ state, i, phonemes, phoneme }: { function stateInfo({ state, i, phonemes, phoneme }: {
state: DiacriticsAccumulator, state: DiacriticsAccumulator,
i: number, i: number,
@ -237,7 +259,14 @@ function stateInfo({ state, i, phonemes, phoneme }: {
return needsSukun ? PhonemeStatus.DirectMatchAfterSukun : PhonemeStatus.DirectMatch; return needsSukun ? PhonemeStatus.DirectMatchAfterSukun : PhonemeStatus.DirectMatch;
} }
if (phonemeInfo.diacritic && !phonemeInfo.longVowel) { if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {
return PhonemeStatus.ShortVowel; // weird ayn behaviour because it automatically advances and ignores it at the beginning of the process
// console.log("looking prev", prevPLetter);
// console.log("looking next", currentPLetter);
return prevPLetter === "ع"
? PhonemeStatus.ShortVowelBeforeAin
: currentPLetter === "ع"
? PhonemeStatus.ShortVowelAfterAin
: PhonemeStatus.ShortVowel;
} }
// console.log("bad phoneme is ", phoneme); // console.log("bad phoneme is ", phoneme);
throw new Error("phonetics error - no status found for phoneme: " + phoneme); throw new Error("phonetics error - no status found for phoneme: " + phoneme);

View File

@ -20,6 +20,7 @@ import {
concatInflections, concatInflections,
psStringEquals, psStringEquals,
removeRetroflexR, removeRetroflexR,
splitDoubleWord,
} from "./p-text-helpers"; } from "./p-text-helpers";
import * as T from "../types"; import * as T from "../types";
import { import {
@ -662,7 +663,40 @@ test(`mapVerbBlock should work`, () => {
[[{p: "به کېده", f: "ba kedu"}, {p: "به کېدو", f: "ba kedo"}], [{p: "به کېدل", f: "ba kedul"}]], [[{p: "به کېده", f: "ba kedu"}, {p: "به کېدو", f: "ba kedo"}], [{p: "به کېدل", f: "ba kedul"}]],
[[{p: "به کېده", f: "ba keda"}], [{p: "به کېدې", f: "ba kede"}]], [[{p: "به کېده", f: "ba keda"}], [{p: "به کېدې", f: "ba kede"}]],
]) ])
}) });
test(`splitDoubleWord should work`, () => {
const orig: T.DictionaryEntry = {
ts: 123,
p: "ګډ وډ",
f: "guD wuD",
g: "guDwuD",
e: "mixed up",
c: "adj. doub.",
i: 1,
};
const out: [T.DictionaryEntry, T.DictionaryEntry] = [
{
ts: 123,
p: "ګډ",
f: "guD",
g: "guDwuD",
e: "mixed up",
c: "adj.",
i: 1,
},
{
ts: 123,
p: "وډ",
f: "wuD",
g: "guDwuD",
e: "mixed up",
c: "adj.",
i: 1,
},
]
expect(splitDoubleWord(orig)).toEqual(out);
});
// test(`allThirdPersMascPlur should work`, () => { // test(`allThirdPersMascPlur should work`, () => {
// expect( // expect(

View File

@ -80,6 +80,29 @@ export function concatPsString(...items: Array<T.PsString | T.LengthOptions<T.Ps
}; };
} }
/**
* breaks a dictionary entry with a double wording (ie. ګډ وډ) into two seperate words
*
* @param w
* @returns
*/
export function splitDoubleWord(w: T.DictionaryEntry): [T.DictionaryEntry, T.DictionaryEntry] {
const pSplit = w.p.split(" ");
const fSplit = w.f.split(" ");
const c = w.c?.replace(" doub.", "");
return [{
...w,
p: pSplit[0],
f: fSplit[0],
c,
}, {
...w,
p: pSplit[1],
f: fSplit[1],
c,
}];
}
export function psFunction(ps: T.PsString, func: (s: string) => string): T.PsString { export function psFunction(ps: T.PsString, func: (s: string) => string): T.PsString {
return makePsString( return makePsString(
func(ps.p), func(ps.p),
@ -711,3 +734,34 @@ export function ensureShortWurShwaShift(ps: T.PsString): T.PsString {
} }
return ps; return ps;
} }
export function ensureUnisexInflections(infs: T.Inflections | false, w: T.DictionaryEntry): T.UnisexInflections {
const ps = { p: w.p, f: firstPhonetics(w.f) };
if (infs === false) {
return {
masc: [
[ps],
[ps],
[ps],
],
fem: [
[ps],
[ps],
[ps],
],
};
}
if (!("fem" in infs)) {
return {
...infs,
fem: [[ps], [ps], [ps]],
};
}
if (!("masc" in infs)) {
return {
...infs,
masc: [[ps], [ps], [ps]],
};
}
return infs;
}

View File

@ -189,6 +189,30 @@ const adjectives: Array<{
}, },
out: false, out: false,
}, },
// double adjective
{
in: {
ts: 123,
p: "ګډ وډ",
f: "guD wuD",
g: "guDwuD",
e: "mixed up",
c: "adj. doub.",
i: 1,
},
out: {
masc: [
[{ p: "ګډ وډ", f: "guD wuD" }],
[{ p: "ګډ وډ", f: "guD wuD" }],
[{ p: "ګډو وډو", f: "guDo wuDo" }],
],
fem: [
[{ p: "ګډه وډه", f: "guDa wuDa" }],
[{ p: "ګډې وډې", f: "guDe wuDe" }],
[{ p: "ګډو وډو", f: "guDo wuDo" }],
],
}
}
]; ];
const nouns: Array<{ const nouns: Array<{

View File

@ -7,6 +7,11 @@
*/ */
import { pashtoConsonants } from "./pashto-consonants"; import { pashtoConsonants } from "./pashto-consonants";
import {
concatInflections,
splitDoubleWord,
ensureUnisexInflections,
} from "./p-text-helpers";
import * as T from "../types"; import * as T from "../types";
const endingInSingleARegex = /[^a]'??[aá]'??$/; const endingInSingleARegex = /[^a]'??[aá]'??$/;
@ -19,6 +24,14 @@ export function inflectWord(word: T.DictionaryEntry): T.Inflections | false {
if (word.noInf) { if (word.noInf) {
return false; return false;
} }
if (word.c?.includes("doub.")) {
const words = splitDoubleWord(word);
const inflected = words.map((word) => ensureUnisexInflections(inflectWord(word), word));
return concatInflections(
inflected[0],
inflected[1],
) as T.UnisexInflections;
}
if (word.c && (word.c.includes("adj.") || word.c.includes("unisex"))) { if (word.c && (word.c.includes("adj.") || word.c.includes("unisex"))) {
return handleUnisexWord(word); return handleUnisexWord(word);
} }