pretty full noun recognition - plural suffixes just started

This commit is contained in:
adueck 2023-08-01 16:11:10 +04:00
parent 6aec2dfeb2
commit b672e19c1a
18 changed files with 3284 additions and 1756 deletions

View File

@ -61,7 +61,6 @@ function NPNounPicker(props: {
opts: T.TextOptions;
phraseIsComplete: boolean;
}) {
console.log({ noun: props.noun });
// const [patternFilter, setPatternFilter] = useState<FilterPattern | undefined>(undefined);
// const [showFilter, setShowFilter] = useState<boolean>(false)
// const nounsFiltered = props.nouns

View File

@ -1,4 +1,5 @@
import * as T from "../../../types";
import { endsInConsonant } from "../p-text-helpers";
import {
isPattern1Entry,
isPattern2Entry,
@ -7,18 +8,26 @@ import {
isPattern5Entry,
isPattern4Entry,
isPattern6FemEntry,
isFemNounEntry,
isAdjectiveEntry,
isUnisexNounEntry,
isPluralNounEntry,
isNounEntry,
isAnimNounEntry,
isMascNounEntry,
} from "../type-predicates";
import { equals } from "rambda";
export function getInflectionQueries(
s: string,
includeNouns: boolean
noun: boolean
): {
search: Partial<T.DictionaryEntry>;
details: {
inflection: (0 | 1 | 2)[];
gender: T.Gender[];
predicate: (e: T.AdjectiveEntry | T.NounEntry) => boolean;
plural?: boolean;
}[];
}[] {
const queries: {
@ -26,6 +35,7 @@ export function getInflectionQueries(
details: {
inflection: (0 | 1 | 2)[];
gender: T.Gender[];
plural?: boolean;
predicate: (e: T.NounEntry | T.AdjectiveEntry) => boolean;
};
}[] = [];
@ -34,15 +44,111 @@ export function getInflectionQueries(
details: {
inflection: [0, 1, 2],
gender: ["masc", "fem"],
predicate: isPattern(0),
predicate: (e) =>
!(isNounEntry(e) && isPluralNounEntry(e)) &&
isPattern(0)(e) &&
isAdjectiveEntry(e),
},
});
if (noun) {
if (s.endsWith("ونه")) {
queries.push({
search: { p: s.slice(0, -3) },
details: {
inflection: [0],
gender: ["masc"],
plural: true,
predicate: (e) =>
isNounEntry(e) &&
!isPluralNounEntry(e) &&
!isPattern2Entry(e) &&
!isPattern3Entry(e) &&
!isPattern4Entry(e),
},
});
queries.push({
search: { p: s.slice(0, -3) + "ه" },
details: {
inflection: [0],
gender: ["masc"],
plural: true,
predicate: (e) =>
isNounEntry(e) &&
!isPluralNounEntry(e) &&
!isPattern2Entry(e) &&
!isPattern3Entry(e) &&
!isPattern4Entry(e),
},
});
}
if (s.endsWith("ونو")) {
queries.push({
search: { p: s.slice(0, -3) },
details: {
inflection: [1],
gender: ["masc"],
plural: true,
predicate: (e) =>
isNounEntry(e) &&
!isPluralNounEntry(e) &&
!isPattern2Entry(e) &&
!isPattern3Entry(e) &&
!isPattern4Entry(e),
},
});
queries.push({
search: { p: s.slice(0, -3) + "ه" },
details: {
inflection: [1],
gender: ["masc"],
plural: true,
predicate: (e) =>
isNounEntry(e) &&
!isPluralNounEntry(e) &&
!isPattern2Entry(e) &&
!isPattern3Entry(e) &&
!isPattern4Entry(e),
},
});
}
if (s.endsWith("و")) {
queries.push({
search: { p: s.slice(0, -1) },
details: {
inflection: [2],
gender: ["fem"],
predicate: (e) =>
isNounEntry(e) && isAnimNounEntry(e) && isFemNounEntry(e),
},
});
}
queries.push({
search: { p: s },
details: {
inflection: [0],
gender: ["fem"],
predicate: (e) =>
isNounEntry(e) && isFemNounEntry(e) && isPattern1Entry(e),
},
});
queries.push({
search: { p: s },
details: {
inflection: [0, 1],
gender: ["fem"],
predicate: (e) =>
isNounEntry(e) && isAnimNounEntry(e) && isFemNounEntry(e),
},
});
}
queries.push({
search: { p: s },
details: {
inflection: [0, 1],
gender: ["masc"],
predicate: isPattern1Entry,
predicate: (e) =>
!(isNounEntry(e) && isPluralNounEntry(e)) &&
(isPattern1Entry(e) || isPattern(0)(e)),
},
});
queries.push({
@ -65,6 +171,17 @@ export function getInflectionQueries(
predicate: (e) => isPattern4Entry(e) || isPattern5Entry(e),
},
});
if (noun) {
queries.push({
search: { p: s },
details: {
inflection: [0],
plural: true,
gender: ["masc", "fem"],
predicate: (e) => isNounEntry(e) && isPluralNounEntry(e),
},
});
}
if (s.endsWith("ه")) {
queries.push({
search: { p: s.slice(0, -1) },
@ -74,16 +191,6 @@ export function getInflectionQueries(
predicate: isPattern1Entry,
},
});
if (includeNouns) {
queries.push({
search: { p: s },
details: {
inflection: [0],
gender: ["fem"],
predicate: isPattern1Entry,
},
});
}
queries.push({
search: { infbp: s.slice(0, -1) },
details: {
@ -101,7 +208,7 @@ export function getInflectionQueries(
predicate: isPattern1Entry,
},
});
if (includeNouns) {
if (noun) {
queries.push({
search: { p: s.slice(0, -1) + "ه" },
details: {
@ -150,7 +257,7 @@ export function getInflectionQueries(
details: {
inflection: [2],
gender: ["masc", "fem"],
predicate: (e) => isPattern1Entry(e) || isPattern5Entry(e),
predicate: (e) => isPattern1Entry(e),
},
});
queries.push({
@ -169,6 +276,48 @@ export function getInflectionQueries(
predicate: (e) => isPattern2Entry(e) || isPattern3Entry(e),
},
});
if (noun) {
queries.push({
search: { p: s.slice(0, -1) + "ه" },
details: {
inflection: [2],
gender: ["fem"],
predicate: (e) => isPattern1Entry(e) || isFemNounEntry(e),
},
});
queries.push({
search: { p: s.slice(0, -1) + "ه" },
details: {
inflection: [2],
gender: ["masc"],
predicate: isMascNounEntry,
},
});
queries.push({
search: { p: s.slice(0, -1) + "ې" },
details: {
inflection: [2],
gender: ["fem"],
predicate: (e) => isNounEntry(e) || isFemNounEntry(e),
},
});
queries.push({
search: { p: s.slice(0, -1) + "ۍ" },
details: {
inflection: [2],
gender: ["fem"],
predicate: (e) => isFemNounEntry(e) && isPattern3Entry(e),
},
});
queries.push({
search: { p: s.slice(0, -1) + "ي" },
details: {
inflection: [2],
gender: ["fem"],
predicate: isPattern6FemEntry,
},
});
}
if (s.endsWith("یو")) {
queries.push({
search: { p: s.slice(0, -2) + "ی" },
@ -178,6 +327,24 @@ export function getInflectionQueries(
predicate: (e) => isPattern2Entry(e) || isPattern3Entry(e),
},
});
if (noun) {
queries.push({
search: { p: s.slice(0, -2) + "ۍ" },
details: {
inflection: [2],
gender: ["fem"],
predicate: (e) => isPattern3Entry(e) && isFemNounEntry(e),
},
});
queries.push({
search: { p: s.slice(0, -2) + "ي" },
details: {
inflection: [2],
gender: ["fem"],
predicate: isPattern6FemEntry,
},
});
}
}
} else if (s.endsWith("ۍ")) {
queries.push({
@ -188,7 +355,7 @@ export function getInflectionQueries(
predicate: isPattern3Entry,
},
});
if (includeNouns) {
if (noun) {
queries.push({
search: { p: s.slice(0, -1) + "ي" },
details: {

View File

@ -1,8 +1,30 @@
import nounsAdjs from "../../../nouns-adjs";
import * as T from "../../../types";
import { isAdjectiveEntry, isNounEntry } from "../type-predicates";
/**
 * Returns every entry of the imported `nounsAdjs` list whose field equals the
 * searched value.
 *
 * Only the FIRST key/value pair of `s` is used as the search criterion; any
 * further pairs are ignored.
 *
 * @param s - search object, e.g. `{ p: "کور" }`
 * @returns all matching entries, or an empty array when `s` has no keys
 */
export function lookup(s: Partial<T.DictionaryEntry>): T.DictionaryEntry[] {
  const firstPair = Object.entries(s)[0];
  // An empty search object has nothing to match on. Previously this case
  // threw a TypeError while destructuring `undefined`.
  if (firstPair === undefined) {
    return [];
  }
  const [key, value] = firstPair;
  // @ts-ignore -- `key` is a plain string, so the indexed access below is not type-checked
  return nounsAdjs.filter((e) => e[key] === value) as T.DictionaryEntry[];
}
/**
 * Fetches a single noun or adjective entry from the imported `nounsAdjs` list
 * by its `p`, `f` or `g` field.
 *
 * When several entries share the same spelling, the first one that is of the
 * requested type is returned. Previously only the very first spelling match
 * was considered, so the call threw even if a later entry had the right type.
 *
 * @param word - value compared against the `p`, `f` and `g` fields
 * @param type - which kind of entry is wanted
 * @returns the first entry matching `word` that satisfies `type`
 * @throws Error `missing … in word query` when no entry has that spelling
 * @throws Error `… is not a noun` / `… is not an adjective` when entries with
 *   that spelling exist but none is of the requested type
 */
export function wordQuery(word: string, type: "adj"): T.AdjectiveEntry;
export function wordQuery(word: string, type: "noun"): T.NounEntry;
export function wordQuery(
  word: string,
  type: "noun" | "adj"
): T.NounEntry | T.AdjectiveEntry {
  const candidates = nounsAdjs.filter(
    (x) => x.p === word || x.f === word || x.g === word
  );
  if (candidates.length === 0) {
    throw new Error(`missing ${word} in word query`);
  }
  const entry = candidates.find((x) =>
    type === "noun" ? isNounEntry(x) : isAdjectiveEntry(x)
  );
  if (!entry) {
    throw new Error(
      type === "noun" ? `${word} is not a noun` : `${word} is not an adjective`
    );
  }
  return entry as T.NounEntry | T.AdjectiveEntry;
}

View File

@ -4,10 +4,10 @@ import { isAdjectiveEntry } from "../type-predicates";
import { getInflectionQueries } from "./inflection-query";
export function parseAdjective(
tokens: Readonly<string[]>,
tokens: Readonly<T.Token[]>,
lookup: (s: Partial<T.DictionaryEntry>) => T.DictionaryEntry[]
): [
string[],
T.Token[],
{
inflection: (0 | 1 | 2)[];
gender: T.Gender[];
@ -20,7 +20,7 @@ export function parseAdjective(
return [];
}
const [first, ...rest] = tokens;
const queries = getInflectionQueries(first, false);
const queries = getInflectionQueries(first.s, false);
queries.forEach(({ search, details }) => {
const wideMatches = lookup(search).filter(isAdjectiveEntry);
details.forEach((deets) => {
@ -33,7 +33,7 @@ export function parseAdjective(
selection,
inflection: deets.inflection,
gender: deets.gender,
given: first,
given: first.s,
},
]);
});

File diff suppressed because it is too large Load Diff

View File

@ -1,291 +0,0 @@
import { makeNounSelection } from "../phrase-building/make-selections";
import * as T from "../../../types";
import { lookup } from "./lookup";
import { parseNoun } from "./parse-noun";
const sarey = {
ts: 1527815251,
i: 8163,
p: "سړی",
f: "saRáy",
g: "saRay",
e: "man",
r: 4,
c: "n. m.",
ec: "man",
ep: "men",
} as T.NounEntry;
const dostee = {
ts: 1527811877,
i: 6627,
p: "دوستي",
f: "dostee",
g: "dostee",
e: "friendship",
r: 3,
c: "n. f.",
} as T.NounEntry;
const wreejze = {
ts: 1586551382412,
i: 14985,
p: "وریژې",
f: "wreejze",
g: "wreejze",
e: "rice",
r: 4,
c: "n. f. pl.",
} as T.NounEntry;
const xudza = {
ts: 1527812797,
i: 9018,
p: "ښځه",
f: "xúdza",
g: "xudza",
e: "woman, wife",
r: 4,
c: "n. f.",
ec: "woman",
ep: "women",
} as T.NounEntry;
const kursuy = {
ts: 1527814203,
i: 10573,
p: "کرسۍ",
f: "kUrsúy",
g: "kUrsuy",
e: "chair, seat, stool",
r: 3,
c: "n. f.",
} as T.NounEntry;
const kor = {
ts: 1527812828,
i: 11022,
p: "کور",
f: "kor",
g: "kor",
e: "house, home",
r: 4,
c: "n. m.",
} as T.NounEntry;
const daktar = {
ts: 1527816747,
i: 6709,
p: "ډاکټر",
f: "DaakTar",
g: "DaakTar",
e: "doctor",
r: 4,
c: "n. m. anim. unisex",
} as T.NounEntry;
// TODO: test unisex ملګری etc
const tests: {
category: string;
cases: {
input: string;
output: {
inflected: boolean;
selection: T.NounSelection;
}[];
}[];
}[] = [
{
category: "pattern 1 nouns",
cases: [
{
input: "کور",
output: [
{
inflected: false,
selection: makeNounSelection(kor, undefined),
},
],
},
{
input: "کورو",
output: [
{
inflected: true,
selection: {
...makeNounSelection(kor, undefined),
number: "plural",
},
},
],
},
{
input: "ډاکټره",
output: [
{
inflected: false,
selection: {
...makeNounSelection(daktar, undefined),
gender: "fem",
},
},
],
},
{
input: "ډاکټرې",
output: [
{
inflected: true,
selection: {
...makeNounSelection(daktar, undefined),
gender: "fem",
},
},
],
},
],
},
];
// {
// input: "سړی",
// output: [
// {
// inflected: false,
// selection: makeNounSelection(sarey, undefined),
// },
// ],
// },
// {
// input: "سړي",
// output: [
// {
// inflected: true,
// selection: makeNounSelection(sarey, undefined),
// },
// ],
// },
// {
// input: "سړو",
// output: [
// {
// inflected: true,
// selection: {
// ...makeNounSelection(sarey, undefined),
// number: "plural",
// },
// },
// ],
// },
// {
// input: "سړیو",
// output: [
// {
// inflected: true,
// selection: {
// ...makeNounSelection(sarey, undefined),
// number: "plural",
// },
// },
// ],
// },
// {
// input: "دوستي",
// output: [
// {
// inflected: false,
// selection: makeNounSelection(dostee, undefined),
// },
// ],
// },
// {
// input: "دوستۍ",
// output: [
// {
// inflected: true,
// selection: makeNounSelection(dostee, undefined),
// },
// ],
// },
// {
// input: "دوستیو",
// output: [
// {
// inflected: true,
// selection: {
// ...makeNounSelection(dostee, undefined),
// number: "plural",
// },
// },
// ],
// },
// {
// input: "وریژې",
// output: [
// {
// inflected: false,
// selection: makeNounSelection(wreejze, undefined),
// },
// ],
// },
// {
// input: "ښځه",
// output: [
// {
// inflected: false,
// selection: makeNounSelection(xudza, undefined),
// },
// ],
// },
// {
// input: "ښځې",
// output: [
// {
// inflected: true,
// selection: makeNounSelection(xudza, undefined),
// },
// ],
// },
// {
// input: "ښځو",
// output: [
// {
// inflected: true,
// selection: {
// ...makeNounSelection(xudza, undefined),
// number: "plural",
// },
// },
// ],
// },
// {
// input: "کرسۍ",
// output: [
// {
// inflected: false,
// selection: makeNounSelection(kursuy, undefined),
// },
// {
// inflected: true,
// selection: makeNounSelection(kursuy, undefined),
// },
// ],
// },
// {
// input: "کرسیو",
// output: [
// {
// inflected: true,
// selection: {
// ...makeNounSelection(kursuy, undefined),
// number: "plural",
// },
// },
// ],
// },
// ];
describe("parsing nouns", () => {
tests.forEach(({ category, cases }) => {
// eslint-disable-next-line jest/valid-title
test(category, () => {
cases.forEach(({ input, output }) => {
expect(parseNoun(input, lookup)).toEqual(output);
});
});
});
});

View File

@ -1,16 +1,22 @@
import * as T from "../../../types";
import { getInflectionPattern } from "../inflection-pattern";
import { makeNounSelection } from "../phrase-building/make-selections";
import {
isFemNounEntry,
isMascNounEntry,
isNounEntry,
isPluralNounEntry,
isUnisexNounEntry,
} from "../type-predicates";
import { getInflectionQueries } from "./inflection-query";
import { parseAdjective } from "./parse-adjective";
// TODO:
// - cleanup the workflow and make sure all nouns are covered and test
// - add possessive parsing
export function parseNoun(
tokens: Readonly<string[]>,
tokens: Readonly<T.Token[]>,
lookup: (s: Partial<T.DictionaryEntry>) => T.DictionaryEntry[],
adjectives: {
inflection: (0 | 1 | 2)[];
@ -19,10 +25,7 @@ export function parseNoun(
selection: T.AdjectiveSelection;
}[]
): {
success: [
string[],
{ inflection: (0 | 1 | 2)[]; selection: T.NounSelection }
][];
success: [T.Token[], { inflected: boolean; selection: T.NounSelection }][];
errors: string[];
} {
if (tokens.length === 0) {
@ -31,15 +34,19 @@ export function parseNoun(
errors: [],
};
}
const [first, ...rest] = tokens;
// TODO: add recognition of او between adjectives
const adjRes = parseAdjective(tokens, lookup);
const withAdj = adjRes.map(([tkns, adj]) =>
parseNoun(tkns, lookup, [...adjectives, adj])
);
const success: ReturnType<typeof parseNoun>["success"] = [];
const errors: string[] = [];
const [first, ...rest] = tokens;
// const possesor =
// first === "د" ? parseNoun(rest, lookup, adjectives).success : undefined;
const searches = getInflectionQueries(first.s, true);
const searches = getInflectionQueries(first, true);
searches.forEach(({ search, details }) => {
const nounEntries = lookup(search).filter(isNounEntry);
details.forEach((deets) => {
@ -47,65 +54,108 @@ export function parseNoun(
fittingEntries.forEach((entry) => {
if (isUnisexNounEntry(entry)) {
deets.gender.forEach((gender) => {
deets.inflection.forEach((inf) => {
const { ok, error } = adjsMatch(
adjectives,
gender,
inf,
deets.plural
);
if (ok) {
convertInflection(inf, entry, gender, deets.plural).forEach(
({ inflected, number }) => {
const selection = makeNounSelection(entry, undefined);
success.push([
rest,
{
inflected,
selection: {
...selection,
gender: selection.genderCanChange
? gender
: selection.gender,
number: selection.numberCanChange
? number
: selection.number,
adjectives: adjectives.map((a) => a.selection),
},
},
]);
}
);
} else {
error.forEach((e) => {
errors.push(e);
});
}
});
});
} else if (isMascNounEntry(entry) && deets.gender.includes("masc")) {
deets.inflection.forEach((inf) => {
const { ok, error } = adjsMatch(
adjectives,
gender,
deets.inflection
"masc",
inf,
deets.plural
);
if (ok) {
success.push([
rest,
{
inflection: deets.inflection,
selection: {
...makeNounSelection(entry, undefined),
gender,
adjectives: adjectives.map((a) => a.selection),
},
},
]);
convertInflection(inf, entry, "masc", deets.plural).forEach(
({ inflected, number }) => {
const selection = makeNounSelection(entry, undefined);
success.push([
rest,
{
inflected,
selection: {
...selection,
number: selection.numberCanChange
? number
: selection.number,
adjectives: adjectives.map((a) => a.selection),
},
},
]);
}
);
} else {
error.forEach((e) => {
errors.push(e);
});
}
});
} else if (isMascNounEntry(entry) && deets.gender.includes("masc")) {
const { ok, error } = adjsMatch(adjectives, "masc", deets.inflection);
if (ok) {
success.push([
rest,
{
inflection: deets.inflection,
selection: {
...makeNounSelection(entry, undefined),
adjectives: adjectives.map((a) => a.selection),
},
},
]);
} else {
error.forEach((e) => {
errors.push(e);
});
}
} else if (isFemNounEntry(entry) && deets.gender.includes("fem")) {
const { ok, error } = adjsMatch(adjectives, "fem", deets.inflection);
if (ok) {
success.push([
rest,
{
inflection: deets.inflection,
selection: {
...makeNounSelection(entry, undefined),
adjectives: adjectives.map((a) => a.selection),
},
},
]);
} else {
error.forEach((e) => {
errors.push(e);
});
}
deets.inflection.forEach((inf) => {
const { ok, error } = adjsMatch(
adjectives,
"fem",
inf,
deets.plural
);
if (ok) {
convertInflection(inf, entry, "fem", deets.plural).forEach(
({ inflected, number }) => {
const selection = makeNounSelection(entry, undefined);
success.push([
rest,
{
inflected,
selection: {
...selection,
number: selection.numberCanChange
? number
: selection.number,
adjectives: adjectives.map((a) => a.selection),
},
},
]);
}
);
} else {
error.forEach((e) => {
errors.push(e);
});
}
});
}
});
});
@ -119,12 +169,14 @@ export function parseNoun(
function adjsMatch(
adjectives: Parameters<typeof parseNoun>[2],
gender: T.Gender,
inflection: (0 | 1 | 2)[]
inf: 0 | 1 | 2,
plural: boolean | undefined
): { ok: boolean; error: string[] } {
const inflection = (plural && inf < 2 ? inf + 1 : inf) as 0 | 1 | 2;
const unmatching = adjectives.filter(
(adj) =>
!adj.gender.includes(gender) ||
!adj.inflection.some((i) => inflection.includes(i))
!adj.inflection.some((i) => i === inflection)
);
if (unmatching.length) {
return {
@ -134,9 +186,7 @@ function adjsMatch(
x.given === x.selection.entry.p
? x.given
: `${x.given} (${x.selection.entry.p})`;
const inflectionIssue = !x.inflection.some((x) =>
inflection.includes(x)
)
const inflectionIssue = !x.inflection.some((x) => x === inflection)
? ` should be ${showInflection(inflection)}`
: ``;
return `Adjective agreement error: ${adjText} should be ${inflectionIssue} ${gender}.`;
@ -150,14 +200,63 @@ function adjsMatch(
}
}
function showInflection(inf: (0 | 1 | 2)[]): string {
const [last, ...rest] = inf.reverse();
const template = rest.length
? `${rest.join(", ")}, or ${last}`
: last.toString();
console.log(template);
return template
.replace("0", "plain")
.replace("1", "first inflection")
.replace("2", "second inflection");
/**
 * Translates an inflection number reported by an inflection query into the
 * concrete readings (inflected or not, singular or plural) it can stand for.
 *
 * When `plural` is truthy, inflections 0 and 1 are shifted up by one before
 * being interpreted.
 *
 * @param inflection - inflection number from the query details
 * @param entry - dictionary entry that was matched
 * @param gender - gender under which the entry is being read
 * @param plural - whether the query matched a plural form
 * @returns zero, one or two possible readings
 */
function convertInflection(
  inflection: 0 | 1 | 2,
  entry: T.NounEntry | T.AdjectiveEntry,
  gender: T.Gender,
  plural: boolean | undefined
): {
  inflected: boolean;
  number: T.NounNumber;
}[] {
  const pattern = getInflectionPattern(entry);
  const effective = plural && inflection < 2 ? inflection + 1 : inflection;

  if (effective === 0) {
    return [{ inflected: false, number: "singular" }];
  }
  if (effective !== 1) {
    return [{ inflected: true, number: "plural" }];
  }

  // From here on we are interpreting a first inflection, which may be read
  // as an inflected singular, a plain plural, both, or neither.
  const readings: { inflected: boolean; number: T.NounNumber }[] = [];
  const countsAsPlural =
    (isNounEntry(entry) && isPluralNounEntry(entry)) || plural;
  const mascPattern4WithHe =
    pattern === 4 && entry.p.endsWith("ه") && gender === "masc";

  if (!countsAsPlural && !mascPattern4WithHe) {
    readings.push({ inflected: true, number: "singular" });
  }
  if (pattern > 1 || (pattern > 0 && gender === "fem") || countsAsPlural) {
    readings.push({ inflected: false, number: "plural" });
  }
  return readings;
}
/**
 * Gives the human-readable name of an inflection number, for use in
 * agreement error messages.
 *
 * @param inf - inflection number
 * @returns "plain" for 0, "first inflection" for 1, otherwise "second inflection"
 */
function showInflection(inf: 0 | 1 | 2): string {
  if (inf === 0) {
    return "plain";
  }
  if (inf === 1) {
    return "first inflection";
  }
  return "second inflection";
}

View File

@ -4,7 +4,7 @@ import { parsePronoun } from "./parse-pronoun";
import { parseNoun } from "./parse-noun";
export function parsePhrase(
s: string[],
s: T.Token[],
lookup: (s: Partial<T.DictionaryEntry>) => T.DictionaryEntry[]
): {
success: any[];

View File

@ -1,15 +1,15 @@
import * as T from "../../../types";
export function parsePronoun(tokens: Readonly<string[]>): [
string[],
export function parsePronoun(tokens: Readonly<T.Token[]>): [
T.Token[],
{
inflected: boolean[];
selection: T.PronounSelection;
}
][] {
const [first, ...rest] = tokens;
const [{ s }, ...rest] = tokens;
const w: ReturnType<typeof parsePronoun> = [];
if (first === "زه") {
if (s === "زه") {
w.push([
rest,
{
@ -32,7 +32,7 @@ export function parsePronoun(tokens: Readonly<string[]>): [
},
},
]);
} else if (first === "ته") {
} else if (s === "ته") {
w.push([
rest,
{
@ -55,7 +55,7 @@ export function parsePronoun(tokens: Readonly<string[]>): [
},
},
]);
} else if (first === "هغه") {
} else if (s === "هغه") {
w.push([
rest,
{
@ -78,7 +78,7 @@ export function parsePronoun(tokens: Readonly<string[]>): [
},
},
]);
} else if (first === "هغې") {
} else if (s === "هغې") {
w.push([
rest,
{
@ -90,7 +90,7 @@ export function parsePronoun(tokens: Readonly<string[]>): [
},
},
]);
} else if (first === "دی") {
} else if (s === "دی") {
w.push([
rest,
{
@ -102,7 +102,7 @@ export function parsePronoun(tokens: Readonly<string[]>): [
},
},
]);
} else if (first === "ده") {
} else if (s === "ده") {
w.push([
rest,
{
@ -114,7 +114,7 @@ export function parsePronoun(tokens: Readonly<string[]>): [
},
},
]);
} else if (first === "دا") {
} else if (s === "دا") {
w.push([
rest,
{
@ -126,7 +126,7 @@ export function parsePronoun(tokens: Readonly<string[]>): [
},
},
]);
} else if (first === "دې") {
} else if (s === "دې") {
w.push([
rest,
{
@ -138,7 +138,7 @@ export function parsePronoun(tokens: Readonly<string[]>): [
},
},
]);
} else if (["مونږ", "موږ"].includes(first)) {
} else if (["مونږ", "موږ"].includes(s)) {
w.push([
rest,
{
@ -161,7 +161,7 @@ export function parsePronoun(tokens: Readonly<string[]>): [
},
},
]);
} else if (["تاسو", "تاسې"].includes(first)) {
} else if (["تاسو", "تاسې"].includes(s)) {
w.push([
rest,
{
@ -184,7 +184,7 @@ export function parsePronoun(tokens: Readonly<string[]>): [
},
},
]);
} else if (["هغوي", "هغوی"].includes(first)) {
} else if (["هغوي", "هغوی"].includes(s)) {
w.push([
rest,
{
@ -207,7 +207,7 @@ export function parsePronoun(tokens: Readonly<string[]>): [
},
},
]);
} else if (["دوي", "دوی"].includes(first)) {
} else if (["دوي", "دوی"].includes(s)) {
w.push([
rest,
{

View File

@ -1,3 +1,10 @@
export function tokenizer(s: string): string[] {
return s.trim().split(" ");
import { Token } from "../../../types";
/**
 * Splits a phrase into word tokens, each tagged with its position.
 *
 * Words are separated by any run of whitespace, so double spaces, tabs or
 * newlines between words do not produce empty tokens. Leading and trailing
 * whitespace is ignored.
 *
 * @param s - raw input text
 * @returns one token per word, with `i` counting up from 0. Blank input
 *   yields a single token with an empty `s`, matching the previous behaviour
 *   that callers may depend on.
 */
export function tokenizer(s: string): Token[] {
  return s
    .trim()
    // `split(" ")` left an empty-string token for every extra space and did
    // not treat tabs or newlines as separators.
    .split(/\s+/)
    .map((word, i) => ({ i, s: word }));
}

File diff suppressed because it is too large Load Diff

View File

@ -256,7 +256,10 @@ export function inflectRegularYayUnisex(
fem: [
[{ p: `${baseP}ې`, f: `${baseF}e` }],
[{ p: `${baseP}ې`, f: `${baseF}e` }],
[{ p: `${baseP}و`, f: `${baseF}o` }],
[
{ p: `${baseP}یو`, f: `${baseF}iyo` },
{ p: `${baseP}و`, f: `${baseF}o` },
],
],
};
}
@ -291,7 +294,7 @@ function inflectEmphasizedYayUnisex(p: string, f: string): T.UnisexInflections {
[{ p, f }],
[{ p: `${baseP}ي`, f: `${baseF}ée` }],
[
{ p: `${baseP}یو`, f: `${baseF}iyo` },
{ p: `${baseP}یو`, f: `${baseF}íyo` },
{ p: `${baseP}و`, f: `${baseF}ó` },
],
],
@ -299,7 +302,7 @@ function inflectEmphasizedYayUnisex(p: string, f: string): T.UnisexInflections {
[{ p: `${baseP}ۍ`, f: `${baseF}úy` }],
[{ p: `${baseP}ۍ`, f: `${baseF}úy` }],
[
{ p: `${baseP}یو`, f: `${baseF}úyo` },
{ p: `${baseP}یو`, f: `${baseF}íyo` },
{ p: `${baseP}و`, f: `${baseF}ó` },
],
],
@ -360,8 +363,8 @@ function inflectRegularEmphasizedYayMasc(p: string, f: string): T.Inflections {
[{ p, f }],
[{ p: `${baseP}ي`, f: `${baseF}ée` }],
[
{ p: `${baseP}یو`, f: `${baseF}iyo` },
{ p: `${baseP}و`, f: `${baseF}o` },
{ p: `${baseP}یو`, f: `${baseF}íyo` },
{ p: `${baseP}و`, f: `${baseF}ó` },
],
],
};
@ -453,8 +456,8 @@ function inflectRegularUyFem(p: string, f: string): T.Inflections {
[{ p, f: `${baseF}úy` }],
[{ p, f: `${baseF}úy` }],
[
{ p: `${baseP}یو`, f: `${baseF}úyo` },
{ p: `${baseP}و`, f: `${baseF}o` },
{ p: `${baseP}یو`, f: `${baseF}íyo` },
{ p: `${baseP}و`, f: `${baseF}ó` },
],
],
};

View File

@ -1896,7 +1896,10 @@ const toTest = [
fem: [
[{ p: "ستړې", f: "stúRe" }],
[{ p: "ستړې", f: "stúRe" }],
[{ p: "ستړو", f: "stúRo" }],
[
{ p: "ستړیو", f: "stúRiyo" },
{ p: "ستړو", f: "stúRo" },
],
],
},
},

View File

@ -1269,3 +1269,8 @@ export type OtherComp = {
type: "Comp";
ps: PsString;
};
/** One word of input text, as produced by `tokenizer`. */
export type Token = {
/** Zero-based position of the word in the tokenized input. */
i: number;
/** The text of the word itself. */
s: string;
};

View File

@ -0,0 +1 @@
module.exports = [{ ts: 1527815333, e: "oven" }];

View File

@ -7,121 +7,122 @@
*/
module.exports = [
{ ts: 1527815408, e: "asleep" }, // ویده - weedú
{ ts: 1527812796, e: "good" }, // ښه - xu
{ ts: 1527821744, e: "cook, chef" }, // آشپز - aashpáz
{ ts: 1527812461, e: "hero, brave" }, // اتل - atul
{ ts: 1527821649, e: "impressive, effective, influencing" }, // اثرناک - asarnáak
{ ts: 1527818704, e: "wide, spacious, extensive" }, // ارت - arát
{ ts: 1578340121962, e: "free, independant" }, // ازاد - azáad
{ ts: 1527819418, e: "independant, autonomous" }, // خپلواک - khpulwaak
{ ts: 1527817146, e: "resident; settled" }, // استوګن - astogan
{ ts: 1527813713, e: "hopeful, pregnant" }, // امیدوار - Umeedwaar
{ ts: 1527819451, e: "Englishman, English (adjective)" }, // انګرېز - angréz
{ ts: 1527820346, e: "on-line" }, // انلاین - anlaayn
{ ts: 1527813667, e: "important" }, // اهم - aham
{ ts: 1598724912198, e: "dry" }, // اوچ - ooch
{ ts: 1527815138, e: "insurgent" }, // اورپک - orpak
{ ts: 1586452587974, e: "free, available" }, // اوزګار - oozgáar
{ ts: 1527816489, e: "faithful, believer" }, // ایماندار - eemaandaar
{ ts: 1527820433, e: "valiant" }, // باتور - baatóor
{ ts: 1527813425, e: "stingy" }, // بخیل - bakheel
{ ts: 1527812511, e: "bad" }, // بد - bud, bad
{ ts: 1527812518, e: "equal, even, set up" }, // برابر - buraabur
{ ts: 1527811861, e: "naked" }, // بربنډ - barbunD
{ ts: 1527811511, e: "full, complete" }, // بشپړ - bushpuR
{ ts: 1527812515, e: "other, next" }, // بل - bul
{ ts: 1527815725, e: "knowledgeable, accustomed" }, // بلد - balad
{ ts: 1577301753727, e: "closed" }, // بند - band
{ ts: 1527812490, e: "useless" }, // بې کار - be kaar
{ ts: 1527812031, e: "separate, different" }, // بېل - bel
{ ts: 1527815144, e: "clean, pure" }, // پاک - paak
{ ts: 1527815201, e: "hidden" }, // پټ - puT
{ ts: 1527815179, e: "wide" }, // پلن - plun
{ ts: 1527819059, e: "thick, fat" }, // پنډ - punD
{ ts: 1611767359178, e: "compassionate" }, // ترسناک - tarsnáak
{ ts: 1527813270, e: "sour" }, // تروش - troosh
{ ts: 1527813817, e: "narrow, cramped" }, // تنګ - tang
{ ts: 1527816354, e: "ready" }, // تیار - tayaar
{ ts: 1527817056, e: "sharp, fast" }, // تېز - tez
{ ts: 1527814076, e: "societal, social" }, // ټولنیز - Toluneez
{ ts: 1527819864, e: "low" }, // ټیټ - TeeT
{ ts: 1527811894, e: "firm, tough, rigid" }, // ټینګ - Teeng
{ ts: 1527812943, e: "constant, stable, proven" }, // ثابت - saabit
{ ts: 1527813085, e: "heavy, difficult" }, // ثقیل - saqeel
{ ts: 1527820479, e: "ignorant" }, // جاهل - jaahíl
{ ts: 1588160800930, e: "surgeon" }, // جراح - jarráah
{ ts: 1527812707, e: "high, tall" }, // جګ - jig, jug
{ ts: 1527816944, e: "clear, evident" }, // جوت - jawat
{ ts: 1527822996, e: "alongside, adjoining" }, // جوخت - jokht
{ ts: 1527812711, e: "well, healthy" }, // جوړ - joR
{ ts: 1527816323, e: "shining, sparkling" }, // ځلاند - dzalaand
{ ts: 1527812291, e: "young, youthful" }, // ځوان - dzwaan
{ ts: 1527820112, e: "hanging" }, // ځوړند - dzwáRund
{ ts: 1527819672, e: "crafty" }, // چالاک - chaaláak
{ ts: 1527811230, e: "quick, fast" }, // چټک - chaTak
{ ts: 1527812524, e: "started, in motion" }, // چلان - chalaan
{ ts: 1527815370, e: "clear, apparent" }, // څرګند - tsărgund
{ ts: 1576366107077, e: "straight, upright" }, // څک - tsak
{ ts: 1527812113, e: "present, on hand, ready" }, // حاضر - haazir, haazur
{ ts: 1527820699, e: "pregnant, carrying" }, // حامل - haamíl
{ ts: 1527819824, e: "greedy" }, // حریص - harées
{ ts: 1527812669, e: "sensitive" }, // حساس - hasaas
{ ts: 1527812057, e: "raw, unripe" }, // خام - khaam
{ ts: 1527811523, e: "traitor, treacherous" }, // خاین - khaayin
{ ts: 1527814219, e: "relative, one's own" }, // خپل - khpul
{ ts: 1527812795, e: "relative" }, // خپلوان - khpulwaan
{ ts: 1527812808, e: "poor, miserable" }, // خوار - khwaar
{ ts: 1527814880, e: "tall" }, // دنګ - dung
{ ts: 1527812537, e: "assured" }, // ډاډمن - DaaDmun
{ ts: 1527812583, e: "full" }, // ډک - Duk
{ ts: 1527822674, e: "gaunt" }, // ډنګر - Dungár, Dangár
{ ts: 1527817256, e: "sunk" }, // ډوب - Doob
{ ts: 1527814277, e: "healthy" }, // روغ - rogh
{ ts: 1609780006604, e: "fruitful" }, // زرخېز - zarkhéz
{ ts: 1527817116, e: "green, flourishing" }, // زرغون - zarghoon
{ ts: 1527814026, e: "golden" }, // زرین - zareen
{ ts: 1527815848, e: "committed" }, // ژمن - jzman
{ ts: 1527813498, e: "light" }, // سپک - spuk
{ ts: 1578329248464, e: "white" }, // سپین - speen
{ ts: 1527811860, e: "great" }, // ستر - stur
{ ts: 1527820178, e: "problematic" }, // ستونزمن - stoonzmán
{ ts: 1527815246, e: "difficult" }, // سخت - sakht
{ ts: 1527817262, e: "barren" }, // شنډ - shanD
{ ts: 1527813426, e: "stingy" }, // شوم - shoom
{ ts: 1527812625, e: "big" }, // غټ - ghuT, ghaT
{ ts: 1527811846, e: "successful" }, // کامیاب - kaamyaab
{ ts: 1527823678, e: "lazy" }, // کاهل - kaahíl
{ ts: 1527814896, e: "proud, arrogant" }, // کبرجن - kaburjun
{ ts: 1527813117, e: "firm, solid" }, // کلک - klak, kluk
{ ts: 1578769492475, e: "few, little" }, // کم - kam
// { ts: 1527814253, e: "mixed up" }, // ګډ وډ // TODO: FIX INFLECTION MACHINE FOR DOUBLES!
{ ts: 1578769409512, e: "weak" }, // کمزور - kamzór
{ ts: 1527812639, e: "dear, difficult" }, // ګران - graan
{ ts: 1527816786, e: "all" }, // ګرد - gurd
{ ts: 1527814811, e: "warm, hot" }, // ګرم - garm, garum
{ ts: 1527817662, e: "guilty" }, // ګرم - gram
{ ts: 1527812308, e: "thick, lots" }, // ګڼ - gaN
{ ts: 1527813848, e: "desiring, eager" }, // لېوال - lewaal
{ ts: 1527816011, e: "broken" }, // مات - maat
{ ts: 1527812881, e: "child" }, // ماشوم - maashoom
{ ts: 1527817007, e: "known" }, // مالوم - maaloom
{ ts: 1527814321, e: "positive" }, // مثبت - mUsbat
{ ts: 1527811264, e: "condemned" }, // محکوم - mahkoom
{ ts: 1527814802, e: "foul" }, // مردار - mUrdáar
{ ts: 1527821812, e: "arrogant" }, // مغرور - maghróor
{ ts: 1527820222, e: "lying down" }, // ملاست - mlaast
{ ts: 1527814344, e: "important" }, // مهم - mUhím
{ ts: 1527816033, e: "uncommon" }, // نادر - naadir
{ ts: 1527815106, e: "sitting, seated" }, // ناست - naast
{ ts: 1527815127, e: "nurse" }, // نرس - nurs
{ ts: 1527821673, e: "moist, damp, wet" }, // نمجن - namjún
{ ts: 1527815130, e: "dry, land, ground" }, // وچ - wuch, wUch
{ ts: 1527817486, e: "ruined, destroyed; destructive, bad, naughty" }, // وران - wraan
{ ts: 1527814373, e: "lost" }, // ورک - wruk
{ ts: 1527822838, e: "decayed, spoiled, rotten" }, // وروست - wrost
{ ts: 1609949334478, e: "roasted" }, // وریت - wreet
{ ts: 1527811544, e: "standing" }, // ولاړ - waláaR, wuláaR
{ ts: 1527815498, e: "aforementioned" }, // یاد - yaad
{ ts: 1527815434, e: "cold" }, // یخ - yakh, yukh
];
{ ts: 1527816747, e: "doctor" }, // ډاکټر
{ ts: 1527815408, e: "asleep" }, // ویده - weedú
{ ts: 1527812796, e: "good" }, // ښه - xu
{ ts: 1527821744, e: "cook, chef" }, // آشپز - aashpáz
{ ts: 1527812461, e: "hero, brave" }, // اتل - atul
{ ts: 1527821649, e: "impressive, effective, influencing" }, // اثرناک - asarnáak
{ ts: 1527818704, e: "wide, spacious, extensive" }, // ارت - arát
{ ts: 1578340121962, e: "free, independant" }, // ازاد - azáad
{ ts: 1527819418, e: "independant, autonomous" }, // خپلواک - khpulwaak
{ ts: 1527817146, e: "resident; settled" }, // استوګن - astogan
{ ts: 1527813713, e: "hopeful, pregnant" }, // امیدوار - Umeedwaar
{ ts: 1527819451, e: "Englishman, English (adjective)" }, // انګرېز - angréz
{ ts: 1527820346, e: "on-line" }, // انلاین - anlaayn
{ ts: 1527813667, e: "important" }, // اهم - aham
{ ts: 1598724912198, e: "dry" }, // اوچ - ooch
{ ts: 1527815138, e: "insurgent" }, // اورپک - orpak
{ ts: 1586452587974, e: "free, available" }, // اوزګار - oozgáar
{ ts: 1527816489, e: "faithful, believer" }, // ایماندار - eemaandaar
{ ts: 1527820433, e: "valiant" }, // باتور - baatóor
{ ts: 1527813425, e: "stingy" }, // بخیل - bakheel
{ ts: 1527812511, e: "bad" }, // بد - bud, bad
{ ts: 1527812518, e: "equal, even, set up" }, // برابر - buraabur
{ ts: 1527811861, e: "naked" }, // بربنډ - barbunD
{ ts: 1527811511, e: "full, complete" }, // بشپړ - bushpuR
{ ts: 1527812515, e: "other, next" }, // بل - bul
{ ts: 1527815725, e: "knowledgeable, accustomed" }, // بلد - balad
{ ts: 1577301753727, e: "closed" }, // بند - band
{ ts: 1527812490, e: "useless" }, // بې کار - be kaar
{ ts: 1527812031, e: "separate, different" }, // بېل - bel
{ ts: 1527815144, e: "clean, pure" }, // پاک - paak
{ ts: 1527815201, e: "hidden" }, // پټ - puT
{ ts: 1527815179, e: "wide" }, // پلن - plun
{ ts: 1527819059, e: "thick, fat" }, // پنډ - punD
{ ts: 1611767359178, e: "compassionate" }, // ترسناک - tarsnáak
{ ts: 1527813270, e: "sour" }, // تروش - troosh
{ ts: 1527813817, e: "narrow, cramped" }, // تنګ - tang
{ ts: 1527816354, e: "ready" }, // تیار - tayaar
{ ts: 1527817056, e: "sharp, fast" }, // تېز - tez
{ ts: 1527814076, e: "societal, social" }, // ټولنیز - Toluneez
{ ts: 1527819864, e: "low" }, // ټیټ - TeeT
{ ts: 1527811894, e: "firm, tough, rigid" }, // ټینګ - Teeng
{ ts: 1527812943, e: "constant, stable, proven" }, // ثابت - saabit
{ ts: 1527813085, e: "heavy, difficult" }, // ثقیل - saqeel
{ ts: 1527820479, e: "ignorant" }, // جاهل - jaahíl
{ ts: 1588160800930, e: "surgeon" }, // جراح - jarráah
{ ts: 1527812707, e: "high, tall" }, // جګ - jig, jug
{ ts: 1527816944, e: "clear, evident" }, // جوت - jawat
{ ts: 1527822996, e: "alongside, adjoining" }, // جوخت - jokht
{ ts: 1527812711, e: "well, healthy" }, // جوړ - joR
{ ts: 1527816323, e: "shining, sparkling" }, // ځلاند - dzalaand
{ ts: 1527812291, e: "young, youthful" }, // ځوان - dzwaan
{ ts: 1527820112, e: "hanging" }, // ځوړند - dzwáRund
{ ts: 1527819672, e: "crafty" }, // چالاک - chaaláak
{ ts: 1527811230, e: "quick, fast" }, // چټک - chaTak
{ ts: 1527812524, e: "started, in motion" }, // چلان - chalaan
{ ts: 1527815370, e: "clear, apparent" }, // څرګند - tsărgund
{ ts: 1576366107077, e: "straight, upright" }, // څک - tsak
{ ts: 1527812113, e: "present, on hand, ready" }, // حاضر - haazir, haazur
{ ts: 1527820699, e: "pregnant, carrying" }, // حامل - haamíl
{ ts: 1527819824, e: "greedy" }, // حریص - harées
{ ts: 1527812669, e: "sensitive" }, // حساس - hasaas
{ ts: 1527812057, e: "raw, unripe" }, // خام - khaam
{ ts: 1527811523, e: "traitor, treacherous" }, // خاین - khaayin
{ ts: 1527814219, e: "relative, one's own" }, // خپل - khpul
{ ts: 1527812795, e: "relative" }, // خپلوان - khpulwaan
{ ts: 1527812808, e: "poor, miserable" }, // خوار - khwaar
{ ts: 1527814880, e: "tall" }, // دنګ - dung
{ ts: 1527812537, e: "assured" }, // ډاډمن - DaaDmun
{ ts: 1527812583, e: "full" }, // ډک - Duk
{ ts: 1527822674, e: "gaunt" }, // ډنګر - Dungár, Dangár
{ ts: 1527817256, e: "sunk" }, // ډوب - Doob
{ ts: 1527814277, e: "healthy" }, // روغ - rogh
{ ts: 1609780006604, e: "fruitful" }, // زرخېز - zarkhéz
{ ts: 1527817116, e: "green, flourishing" }, // زرغون - zarghoon
{ ts: 1527814026, e: "golden" }, // زرین - zareen
{ ts: 1527815848, e: "committed" }, // ژمن - jzman
{ ts: 1527813498, e: "light" }, // سپک - spuk
{ ts: 1578329248464, e: "white" }, // سپین - speen
{ ts: 1527811860, e: "great" }, // ستر - stur
{ ts: 1527820178, e: "problematic" }, // ستونزمن - stoonzmán
{ ts: 1527815246, e: "difficult" }, // سخت - sakht
{ ts: 1527817262, e: "barren" }, // شنډ - shanD
{ ts: 1527813426, e: "stingy" }, // شوم - shoom
{ ts: 1527812625, e: "big" }, // غټ - ghuT, ghaT
{ ts: 1527811846, e: "successful" }, // کامیاب - kaamyaab
{ ts: 1527823678, e: "lazy" }, // کاهل - kaahíl
{ ts: 1527814896, e: "proud, arrogant" }, // کبرجن - kaburjun
{ ts: 1527813117, e: "firm, solid" }, // کلک - klak, kluk
{ ts: 1578769492475, e: "few, little" }, // کم - kam
// { ts: 1527814253, e: "mixed up" }, // ګډ وډ // TODO: FIX INFLECTION MACHINE FOR DOUBLES!
{ ts: 1578769409512, e: "weak" }, // کمزور - kamzór
{ ts: 1527812639, e: "dear, difficult" }, // ګران - graan
{ ts: 1527816786, e: "all" }, // ګرد - gurd
{ ts: 1527814811, e: "warm, hot" }, // ګرم - garm, garum
{ ts: 1527817662, e: "guilty" }, // ګرم - gram
{ ts: 1527812308, e: "thick, lots" }, // ګڼ - gaN
{ ts: 1527813848, e: "desiring, eager" }, // لېوال - lewaal
{ ts: 1527816011, e: "broken" }, // مات - maat
{ ts: 1527812881, e: "child" }, // ماشوم - maashoom
{ ts: 1527817007, e: "known" }, // مالوم - maaloom
{ ts: 1527814321, e: "positive" }, // مثبت - mUsbat
{ ts: 1527811264, e: "condemned" }, // محکوم - mahkoom
{ ts: 1527814802, e: "foul" }, // مردار - mUrdáar
{ ts: 1527821812, e: "arrogant" }, // مغرور - maghróor
{ ts: 1527820222, e: "lying down" }, // ملاست - mlaast
{ ts: 1527814344, e: "important" }, // مهم - mUhím
{ ts: 1527816033, e: "uncommon" }, // نادر - naadir
{ ts: 1527815106, e: "sitting, seated" }, // ناست - naast
{ ts: 1527815127, e: "nurse" }, // نرس - nurs
{ ts: 1527821673, e: "moist, damp, wet" }, // نمجن - namjún
{ ts: 1527815130, e: "dry, land, ground" }, // وچ - wuch, wUch
{ ts: 1527817486, e: "ruined, destroyed; destructive, bad, naughty" }, // وران - wraan
{ ts: 1527814373, e: "lost" }, // ورک - wruk
{ ts: 1527822838, e: "decayed, spoiled, rotten" }, // وروست - wrost
{ ts: 1609949334478, e: "roasted" }, // وریت - wreet
{ ts: 1527811544, e: "standing" }, // ولاړ - waláaR, wuláaR
{ ts: 1527815498, e: "aforementioned" }, // یاد - yaad
{ ts: 1527815434, e: "cold" }, // یخ - yakh, yukh
];

View File

@ -0,0 +1,22 @@
module.exports = [
{
ts: 1527815177,
e: "father",
},
{
ts: 1527815129,
e: "water",
},
{
ts: 1527817330,
e: "wheat",
},
{
ts: 1527815206,
e: "judge",
},
{
ts: 1527812342,
e: "people", // خلک
},
];

View File

@ -0,0 +1,4 @@
// Vocabulary list: each item pairs a numeric `ts` with an English gloss `e`;
// the trailing comment on each line gives the Pashto word.
// NOTE(review): `ts` presumably identifies the matching dictionary entry — confirm.
module.exports = [
{ ts: 1527811441, e: "door" }, // ور
{ ts: 1527813593, e: "mountain" }, // غر
];