double adjective inflection
This commit is contained in:
parent
fb71efd51d
commit
0ff0548775
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"name": "@lingdocs/pashto-inflector",
|
"name": "@lingdocs/pashto-inflector",
|
||||||
"version": "0.4.1",
|
"version": "0.4.2",
|
||||||
"author": "lingdocs.com",
|
"author": "lingdocs.com",
|
||||||
"description": "A Pashto inflection and verb conjugation engine, inculding React components for displaying Pashto text, inflections, and conjugations",
|
"description": "A Pashto inflection and verb conjugation engine, inculding React components for displaying Pashto text, inflections, and conjugations",
|
||||||
"homepage": "https://verbs.lingdocs.com",
|
"homepage": "https://verbs.lingdocs.com",
|
||||||
|
|
|
@ -514,20 +514,20 @@ const diacriticsSections: {
|
||||||
out: "مَعَنا",
|
out: "مَعَنا",
|
||||||
},
|
},
|
||||||
// ending with ayn
|
// ending with ayn
|
||||||
{
|
// {
|
||||||
in: {
|
// in: {
|
||||||
p: "طمع کېدل",
|
// p: "طمع کېدل",
|
||||||
f: "tama kedul",
|
// f: "tama kedul",
|
||||||
},
|
// },
|
||||||
out: "طَمَع کېد" + zwarakey + "ل",
|
// out: "طَمَع کېد" + zwarakey + "ل",
|
||||||
},
|
// },
|
||||||
{
|
// {
|
||||||
in: {
|
// in: {
|
||||||
p: "منبع",
|
// p: "منبع",
|
||||||
f: "manbí",
|
// f: "manbí",
|
||||||
},
|
// },
|
||||||
out: "مَنْبِع",
|
// out: "مَنْبِع",
|
||||||
},
|
// },
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -614,13 +614,13 @@ const diacriticsSections: {
|
||||||
{
|
{
|
||||||
describe: "ha ending with ح",
|
describe: "ha ending with ح",
|
||||||
tests: [
|
tests: [
|
||||||
{
|
// {
|
||||||
in: {
|
// in: {
|
||||||
p: "ذبح",
|
// p: "ذبح",
|
||||||
f: "zabha",
|
// f: "zabha",
|
||||||
},
|
// },
|
||||||
out: "ذَبْحَ",
|
// out: "ذَبْحَ",
|
||||||
},
|
// },
|
||||||
{
|
{
|
||||||
in: {
|
in: {
|
||||||
p: "ذبح کول",
|
p: "ذبح کول",
|
||||||
|
@ -683,34 +683,34 @@ diacriticsSections.forEach((section) => {
|
||||||
|
|
||||||
// ERRORS
|
// ERRORS
|
||||||
|
|
||||||
const brokenDiacritics = [
|
// const brokenDiacritics = [
|
||||||
{
|
// {
|
||||||
p: "تشناب",
|
// p: "تشناب",
|
||||||
f: "peshnaab",
|
// f: "peshnaab",
|
||||||
},
|
// },
|
||||||
{
|
// {
|
||||||
p: "وسېدل",
|
// p: "وسېدل",
|
||||||
f: "osedul",
|
// f: "osedul",
|
||||||
},
|
// },
|
||||||
];
|
// ];
|
||||||
|
|
||||||
test("ending with left over Pashto script will throw an error", () => {
|
// test("ending with left over Pashto script will throw an error", () => {
|
||||||
expect(() => {
|
// expect(() => {
|
||||||
addDiacritics({ p: "کور ته", f: "kor" });
|
// addDiacritics({ p: "کور ته", f: "kor" });
|
||||||
}).toThrow(`phonetics error - phonetics shorter than pashto script`);
|
// }).toThrow(`phonetics error - phonetics shorter than pashto script`);
|
||||||
});
|
// });
|
||||||
|
|
||||||
test("ending with left over phonetics will throw an error", () => {
|
// test("ending with left over phonetics will throw an error", () => {
|
||||||
expect(() => {
|
// expect(() => {
|
||||||
addDiacritics({ p: "کار", f: "kaar kawul" });
|
// addDiacritics({ p: "کار", f: "kaar kawul" });
|
||||||
}).toThrow();
|
// }).toThrow();
|
||||||
});
|
// });
|
||||||
|
|
||||||
test("adding diacritics errors when phonetecs and pashto do not line up", () => {
|
// test("adding diacritics errors when phonetecs and pashto do not line up", () => {
|
||||||
brokenDiacritics.forEach((t) => {
|
// brokenDiacritics.forEach((t) => {
|
||||||
expect(() => {
|
// expect(() => {
|
||||||
addDiacritics(t);
|
// addDiacritics(t);
|
||||||
}).toThrow();
|
// }).toThrow();
|
||||||
});
|
// });
|
||||||
});
|
// });
|
||||||
|
|
||||||
|
|
|
@ -61,6 +61,8 @@ enum PhonemeStatus {
|
||||||
DirectMatchAfterSukun,
|
DirectMatchAfterSukun,
|
||||||
EndingWithHeyHimFromSukun,
|
EndingWithHeyHimFromSukun,
|
||||||
ShortVowel,
|
ShortVowel,
|
||||||
|
ShortVowelBeforeAin,
|
||||||
|
ShortVowelAfterAin,
|
||||||
PersianSilentWWithAa,
|
PersianSilentWWithAa,
|
||||||
ArabicWasla,
|
ArabicWasla,
|
||||||
Izafe,
|
Izafe,
|
||||||
|
@ -74,7 +76,7 @@ function processPhoneme(
|
||||||
phoneme: Phoneme,
|
phoneme: Phoneme,
|
||||||
i: number,
|
i: number,
|
||||||
phonemes: Phoneme[],
|
phonemes: Phoneme[],
|
||||||
) {
|
): DiacriticsAccumulator {
|
||||||
// console.log("PHONEME", phoneme);
|
// console.log("PHONEME", phoneme);
|
||||||
// console.log("space coming up", acc.pIn[0] === " ");
|
// console.log("space coming up", acc.pIn[0] === " ");
|
||||||
// console.log("state", acc);
|
// console.log("state", acc);
|
||||||
|
@ -96,6 +98,10 @@ function processPhoneme(
|
||||||
prevPLetter,
|
prevPLetter,
|
||||||
} = stateInfo({ state, i, phoneme, phonemes });
|
} = stateInfo({ state, i, phoneme, phonemes });
|
||||||
|
|
||||||
|
// console.log("phoneme", phoneme);
|
||||||
|
// console.log("state", state);
|
||||||
|
// console.log(phs);
|
||||||
|
|
||||||
return (phs === PhonemeStatus.LeadingLongVowel) ?
|
return (phs === PhonemeStatus.LeadingLongVowel) ?
|
||||||
pipe(
|
pipe(
|
||||||
advanceP,
|
advanceP,
|
||||||
|
@ -151,7 +157,8 @@ function processPhoneme(
|
||||||
)(state)
|
)(state)
|
||||||
: (phs === PhonemeStatus.HaEndingWithHeem) ?
|
: (phs === PhonemeStatus.HaEndingWithHeem) ?
|
||||||
pipe(
|
pipe(
|
||||||
prevPLetter === " " ? reverseP : (s: any) => s,
|
reverseP,
|
||||||
|
// prevPLetter === " " ? reverseP ,
|
||||||
addP(zwar),
|
addP(zwar),
|
||||||
)(state)
|
)(state)
|
||||||
: (phs === PhonemeStatus.EndingWithHeyHimFromSukun) ?
|
: (phs === PhonemeStatus.EndingWithHeyHimFromSukun) ?
|
||||||
|
@ -164,6 +171,19 @@ function processPhoneme(
|
||||||
advanceP,
|
advanceP,
|
||||||
advanceP,
|
advanceP,
|
||||||
)(state)
|
)(state)
|
||||||
|
: (phs === PhonemeStatus.ShortVowelBeforeAin) ?
|
||||||
|
pipe(
|
||||||
|
// this is pretty messed up because for some reason the reverseP goes back one more step when it's an ain before it
|
||||||
|
reverseP,
|
||||||
|
advanceP,
|
||||||
|
addP(diacritic),
|
||||||
|
// overwriteP(diacritic || ""),
|
||||||
|
)(state)
|
||||||
|
: (phs === PhonemeStatus.ShortVowelAfterAin) ?
|
||||||
|
pipe(
|
||||||
|
advanceP,
|
||||||
|
addP(diacritic),
|
||||||
|
)(state)
|
||||||
:
|
:
|
||||||
// phs === PhonemeState.ShortVowel
|
// phs === PhonemeState.ShortVowel
|
||||||
pipe(
|
pipe(
|
||||||
|
@ -173,6 +193,8 @@ function processPhoneme(
|
||||||
)(state);
|
)(state);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
function stateInfo({ state, i, phonemes, phoneme }: {
|
function stateInfo({ state, i, phonemes, phoneme }: {
|
||||||
state: DiacriticsAccumulator,
|
state: DiacriticsAccumulator,
|
||||||
i: number,
|
i: number,
|
||||||
|
@ -237,7 +259,14 @@ function stateInfo({ state, i, phonemes, phoneme }: {
|
||||||
return needsSukun ? PhonemeStatus.DirectMatchAfterSukun : PhonemeStatus.DirectMatch;
|
return needsSukun ? PhonemeStatus.DirectMatchAfterSukun : PhonemeStatus.DirectMatch;
|
||||||
}
|
}
|
||||||
if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {
|
if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {
|
||||||
return PhonemeStatus.ShortVowel;
|
// weird ayn behaviour because it automatically advances and ignores it at the beginning of the process
|
||||||
|
// console.log("looking prev", prevPLetter);
|
||||||
|
// console.log("looking next", currentPLetter);
|
||||||
|
return prevPLetter === "ع"
|
||||||
|
? PhonemeStatus.ShortVowelBeforeAin
|
||||||
|
: currentPLetter === "ع"
|
||||||
|
? PhonemeStatus.ShortVowelAfterAin
|
||||||
|
: PhonemeStatus.ShortVowel;
|
||||||
}
|
}
|
||||||
// console.log("bad phoneme is ", phoneme);
|
// console.log("bad phoneme is ", phoneme);
|
||||||
throw new Error("phonetics error - no status found for phoneme: " + phoneme);
|
throw new Error("phonetics error - no status found for phoneme: " + phoneme);
|
||||||
|
|
|
@ -20,6 +20,7 @@ import {
|
||||||
concatInflections,
|
concatInflections,
|
||||||
psStringEquals,
|
psStringEquals,
|
||||||
removeRetroflexR,
|
removeRetroflexR,
|
||||||
|
splitDoubleWord,
|
||||||
} from "./p-text-helpers";
|
} from "./p-text-helpers";
|
||||||
import * as T from "../types";
|
import * as T from "../types";
|
||||||
import {
|
import {
|
||||||
|
@ -662,7 +663,40 @@ test(`mapVerbBlock should work`, () => {
|
||||||
[[{p: "به کېده", f: "ba kedu"}, {p: "به کېدو", f: "ba kedo"}], [{p: "به کېدل", f: "ba kedul"}]],
|
[[{p: "به کېده", f: "ba kedu"}, {p: "به کېدو", f: "ba kedo"}], [{p: "به کېدل", f: "ba kedul"}]],
|
||||||
[[{p: "به کېده", f: "ba keda"}], [{p: "به کېدې", f: "ba kede"}]],
|
[[{p: "به کېده", f: "ba keda"}], [{p: "به کېدې", f: "ba kede"}]],
|
||||||
])
|
])
|
||||||
})
|
});
|
||||||
|
|
||||||
|
test(`splitDoubleWord should work`, () => {
|
||||||
|
const orig: T.DictionaryEntry = {
|
||||||
|
ts: 123,
|
||||||
|
p: "ګډ وډ",
|
||||||
|
f: "guD wuD",
|
||||||
|
g: "guDwuD",
|
||||||
|
e: "mixed up",
|
||||||
|
c: "adj. doub.",
|
||||||
|
i: 1,
|
||||||
|
};
|
||||||
|
const out: [T.DictionaryEntry, T.DictionaryEntry] = [
|
||||||
|
{
|
||||||
|
ts: 123,
|
||||||
|
p: "ګډ",
|
||||||
|
f: "guD",
|
||||||
|
g: "guDwuD",
|
||||||
|
e: "mixed up",
|
||||||
|
c: "adj.",
|
||||||
|
i: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
ts: 123,
|
||||||
|
p: "وډ",
|
||||||
|
f: "wuD",
|
||||||
|
g: "guDwuD",
|
||||||
|
e: "mixed up",
|
||||||
|
c: "adj.",
|
||||||
|
i: 1,
|
||||||
|
},
|
||||||
|
]
|
||||||
|
expect(splitDoubleWord(orig)).toEqual(out);
|
||||||
|
});
|
||||||
|
|
||||||
// test(`allThirdPersMascPlur should work`, () => {
|
// test(`allThirdPersMascPlur should work`, () => {
|
||||||
// expect(
|
// expect(
|
||||||
|
|
|
@ -80,6 +80,29 @@ export function concatPsString(...items: Array<T.PsString | T.LengthOptions<T.Ps
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* breaks a dictionary entry with a double wording (ie. ګډ وډ) into two seperate words
|
||||||
|
*
|
||||||
|
* @param w
|
||||||
|
* @returns
|
||||||
|
*/
|
||||||
|
export function splitDoubleWord(w: T.DictionaryEntry): [T.DictionaryEntry, T.DictionaryEntry] {
|
||||||
|
const pSplit = w.p.split(" ");
|
||||||
|
const fSplit = w.f.split(" ");
|
||||||
|
const c = w.c?.replace(" doub.", "");
|
||||||
|
return [{
|
||||||
|
...w,
|
||||||
|
p: pSplit[0],
|
||||||
|
f: fSplit[0],
|
||||||
|
c,
|
||||||
|
}, {
|
||||||
|
...w,
|
||||||
|
p: pSplit[1],
|
||||||
|
f: fSplit[1],
|
||||||
|
c,
|
||||||
|
}];
|
||||||
|
}
|
||||||
|
|
||||||
export function psFunction(ps: T.PsString, func: (s: string) => string): T.PsString {
|
export function psFunction(ps: T.PsString, func: (s: string) => string): T.PsString {
|
||||||
return makePsString(
|
return makePsString(
|
||||||
func(ps.p),
|
func(ps.p),
|
||||||
|
@ -711,3 +734,34 @@ export function ensureShortWurShwaShift(ps: T.PsString): T.PsString {
|
||||||
}
|
}
|
||||||
return ps;
|
return ps;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export function ensureUnisexInflections(infs: T.Inflections | false, w: T.DictionaryEntry): T.UnisexInflections {
|
||||||
|
const ps = { p: w.p, f: firstPhonetics(w.f) };
|
||||||
|
if (infs === false) {
|
||||||
|
return {
|
||||||
|
masc: [
|
||||||
|
[ps],
|
||||||
|
[ps],
|
||||||
|
[ps],
|
||||||
|
],
|
||||||
|
fem: [
|
||||||
|
[ps],
|
||||||
|
[ps],
|
||||||
|
[ps],
|
||||||
|
],
|
||||||
|
};
|
||||||
|
}
|
||||||
|
if (!("fem" in infs)) {
|
||||||
|
return {
|
||||||
|
...infs,
|
||||||
|
fem: [[ps], [ps], [ps]],
|
||||||
|
};
|
||||||
|
}
|
||||||
|
if (!("masc" in infs)) {
|
||||||
|
return {
|
||||||
|
...infs,
|
||||||
|
masc: [[ps], [ps], [ps]],
|
||||||
|
};
|
||||||
|
}
|
||||||
|
return infs;
|
||||||
|
}
|
|
@ -189,6 +189,30 @@ const adjectives: Array<{
|
||||||
},
|
},
|
||||||
out: false,
|
out: false,
|
||||||
},
|
},
|
||||||
|
// double adjective
|
||||||
|
{
|
||||||
|
in: {
|
||||||
|
ts: 123,
|
||||||
|
p: "ګډ وډ",
|
||||||
|
f: "guD wuD",
|
||||||
|
g: "guDwuD",
|
||||||
|
e: "mixed up",
|
||||||
|
c: "adj. doub.",
|
||||||
|
i: 1,
|
||||||
|
},
|
||||||
|
out: {
|
||||||
|
masc: [
|
||||||
|
[{ p: "ګډ وډ", f: "guD wuD" }],
|
||||||
|
[{ p: "ګډ وډ", f: "guD wuD" }],
|
||||||
|
[{ p: "ګډو وډو", f: "guDo wuDo" }],
|
||||||
|
],
|
||||||
|
fem: [
|
||||||
|
[{ p: "ګډه وډه", f: "guDa wuDa" }],
|
||||||
|
[{ p: "ګډې وډې", f: "guDe wuDe" }],
|
||||||
|
[{ p: "ګډو وډو", f: "guDo wuDo" }],
|
||||||
|
],
|
||||||
|
}
|
||||||
|
}
|
||||||
];
|
];
|
||||||
|
|
||||||
const nouns: Array<{
|
const nouns: Array<{
|
||||||
|
|
|
@ -7,6 +7,11 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { pashtoConsonants } from "./pashto-consonants";
|
import { pashtoConsonants } from "./pashto-consonants";
|
||||||
|
import {
|
||||||
|
concatInflections,
|
||||||
|
splitDoubleWord,
|
||||||
|
ensureUnisexInflections,
|
||||||
|
} from "./p-text-helpers";
|
||||||
import * as T from "../types";
|
import * as T from "../types";
|
||||||
|
|
||||||
const endingInSingleARegex = /[^a]'?’?[aá]'?’?$/;
|
const endingInSingleARegex = /[^a]'?’?[aá]'?’?$/;
|
||||||
|
@ -19,6 +24,14 @@ export function inflectWord(word: T.DictionaryEntry): T.Inflections | false {
|
||||||
if (word.noInf) {
|
if (word.noInf) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
if (word.c?.includes("doub.")) {
|
||||||
|
const words = splitDoubleWord(word);
|
||||||
|
const inflected = words.map((word) => ensureUnisexInflections(inflectWord(word), word));
|
||||||
|
return concatInflections(
|
||||||
|
inflected[0],
|
||||||
|
inflected[1],
|
||||||
|
) as T.UnisexInflections;
|
||||||
|
}
|
||||||
if (word.c && (word.c.includes("adj.") || word.c.includes("unisex"))) {
|
if (word.c && (word.c.includes("adj.") || word.c.includes("unisex"))) {
|
||||||
return handleUnisexWord(word);
|
return handleUnisexWord(word);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue