some verb bug fixes and starting with AP parsing

This commit is contained in:
adueck 2023-09-04 14:26:24 +04:00
parent ded0030395
commit a9f93dc717
10 changed files with 416 additions and 155 deletions

View File

@ -1,11 +1,16 @@
import nounsAdjs from "../../../nouns-adjs"; import nounsAdjs from "../../../nouns-adjs";
import verbs from "../../../verbs"; import verbs from "../../../verbs";
import * as T from "../../../types"; import * as T from "../../../types";
import { isAdjectiveEntry, isNounEntry } from "../type-predicates"; import {
isAdjectiveEntry,
isAdverbEntry,
isNounEntry,
} from "../type-predicates";
import { removeFVarientsFromVerb } from "../accent-and-ps-utils"; import { removeFVarientsFromVerb } from "../accent-and-ps-utils";
import { splitVarients, undoAaXuPattern } from "../p-text-helpers"; import { splitVarients, undoAaXuPattern } from "../p-text-helpers";
import { arraysHaveCommon } from "../misc-helpers"; import { arraysHaveCommon } from "../misc-helpers";
import { shortVerbEndConsonant } from "./misc"; import { shortVerbEndConsonant } from "./misc";
import { kawulDyn, kawulStat, kedulDyn, kedulStat, tlul } from "./irreg-verbs";
export type LookupFunction = typeof lookup; export type LookupFunction = typeof lookup;
@ -13,11 +18,13 @@ export function lookup(
s: Partial<T.DictionaryEntry>, s: Partial<T.DictionaryEntry>,
type: "nounAdj" type: "nounAdj"
): T.DictionaryEntry[]; ): T.DictionaryEntry[];
export function lookup(s: string, type: "adverb"): T.AdverbEntry[];
export function lookup(s: string, type: "pPart"): T.VerbEntry[];
export function lookup(s: string, type: "verb" | "participle"): T.VerbEntry[]; export function lookup(s: string, type: "verb" | "participle"): T.VerbEntry[];
export function lookup( export function lookup(
s: string | Partial<T.DictionaryEntry>, s: string | Partial<T.DictionaryEntry>,
type: "nounAdj" | "verb" | "participle" type: "nounAdj" | "verb" | "participle" | "pPart" | "adverb"
): T.DictionaryEntry[] | T.VerbEntry[] { ): T.DictionaryEntry[] | T.VerbEntry[] | T.AdverbEntry[] {
if (type === "nounAdj") { if (type === "nounAdj") {
if (typeof s !== "object") { if (typeof s !== "object") {
throw new Error("invalid query for noun / adj lookup"); throw new Error("invalid query for noun / adj lookup");
@ -30,6 +37,12 @@ export function lookup(
if (type === "verb") { if (type === "verb") {
return verbLookup(s); return verbLookup(s);
} }
if (type === "pPart") {
return pPartLookup(s);
}
if (type === "adverb") {
return adverbLookup(s);
}
return participleLookup(s); return participleLookup(s);
} }
@ -60,6 +73,12 @@ function nounAdjLookup(s: Partial<T.DictionaryEntry>): T.DictionaryEntry[] {
return nounsAdjs.filter((e) => e[key] === value) as T.DictionaryEntry[]; return nounsAdjs.filter((e) => e[key] === value) as T.DictionaryEntry[];
} }
/**
 * Collects every adverb entry whose Pashto spelling (`p`) is exactly `s`.
 *
 * @param s - the Pashto text to match against each entry's `p` field
 * @returns the matching entries, in the order they appear in `nounsAdjs`
 */
function adverbLookup(s: string): T.AdverbEntry[] {
  const matches = nounsAdjs.filter((candidate) => {
    // anything that is not an adverb is never a match
    if (!isAdverbEntry(candidate)) {
      return false;
    }
    return candidate.p === s;
  });
  return matches as T.AdverbEntry[];
}
export function shouldCheckTpp(s: string): boolean { export function shouldCheckTpp(s: string): boolean {
return ( return (
["د", "ړ", "ت", "ځ", "و", "ډ", "ڼ", "ن", "ه"].includes(s.slice(-1)) || ["د", "ړ", "ت", "ځ", "و", "ډ", "ڼ", "ن", "ه"].includes(s.slice(-1)) ||
@ -85,6 +104,27 @@ function participleLookup(input: string): T.VerbEntry[] {
return []; return [];
} }
/**
 * Finds the verb(s) that a past-participle base belongs to.
 *
 * `input` is the participle text without its final ending (the caller in this
 * commit strips the last character before calling the "pPart" lookup).
 *
 * Resolution order:
 *  1. three hard-coded irregular bases map straight to their verb entries
 *  2. a base ending in "ست" / "ښت" is treated as a short form and matched
 *     against the dictionary form with "ل" appended
 *  3. a base ending in "ل" is matched against the dictionary form as-is
 *  4. anything else yields no matches
 *
 * @param input - participle base to resolve
 * @returns every verb entry whose `entry.p` matches; empty when none do
 */
function pPartLookup(input: string): T.VerbEntry[] {
  switch (input) {
    case "کړ":
      return [kawulStat, kawulDyn];
    case "شو":
      return [kedulStat, kedulDyn];
    case "تل":
      // TODO: is also ورتلل، راتلل، درتلل like this?
      return [tlul];
  }
  // endsWith keeps this in line with the slice-based suffix checks used
  // elsewhere in the file and avoids relying on ES2022 String.prototype.at
  const isShortForm = input.endsWith("ست") || input.endsWith("ښت");
  if (!isShortForm && !input.endsWith("ل")) {
    return [];
  }
  // short forms drop the final ل of the dictionary form, so put it back
  const dictionaryForm = isShortForm ? input + "ل" : input;
  return verbs.filter((e) => e.entry.p === dictionaryForm);
}
function verbLookup(input: string): T.VerbEntry[] { function verbLookup(input: string): T.VerbEntry[] {
// TODO: // TODO:
// only look up forms if there's an ending // only look up forms if there's an ending

View File

@ -1 +1,23 @@
// TODO: ability to treat a doubled noun as an adverb import * as T from "../../../types";
import { LookupFunction } from "./lookup";
import { returnParseResultS } from "./utils";
/**
 * Tries to read an adverbial phrase from the front of the token list.
 *
 * Only the first token is examined: it is looked up as an adverb, and each
 * entry found produces one parse result that wraps the entry as an "AP"
 * selection and carries the remaining tokens forward.
 *
 * @param tokens - the tokens still to be parsed
 * @param lookup - dictionary lookup function
 * @returns one result per matching adverb entry; empty when there are no
 *   tokens or no adverb matches the first token
 */
export function parseAP(
  tokens: Readonly<T.Token[]>,
  lookup: LookupFunction
): T.ParseResult<T.APSelection>[] {
  if (!tokens.length) {
    return [];
  }
  const head = tokens[0];
  const remaining = tokens.slice(1);
  const results: T.ParseResult<T.APSelection>[] = [];
  for (const entry of lookup(head.s, "adverb")) {
    results.push(
      returnParseResultS(remaining, {
        type: "AP",
        selection: {
          type: "adverb",
          entry,
        },
      })
    );
  }
  return results;
}

View File

@ -1,5 +1,6 @@
import * as T from "../../../types"; import * as T from "../../../types";
import { LookupFunction } from "./lookup"; import { LookupFunction } from "./lookup";
import { parseAP } from "./parse-ap";
import { parseEquative } from "./parse-equative"; import { parseEquative } from "./parse-equative";
import { parseKidsSection } from "./parse-kids-section"; import { parseKidsSection } from "./parse-kids-section";
import { parseNeg } from "./parse-negative"; import { parseNeg } from "./parse-negative";
@ -25,21 +26,16 @@ export function parseBlocks(
(b): b is T.ParsedPH => b.type === "PH" (b): b is T.ParsedPH => b.type === "PH"
); );
const vbExists = blocks.some((b) => "type" in b && b.type === "VB"); const vbExists = blocks.some((b) => "type" in b && b.type === "VB");
const np = prevPh ? [] : parseNP(tokens, lookup);
const ph = vbExists || prevPh ? [] : parsePH(tokens);
const vb = parseVerb(tokens, lookup);
const vbp = parsePastPart(tokens, lookup);
const eq = parseEquative(tokens);
const neg = parseNeg(tokens);
const kidsR = parseKidsSection(tokens, []);
const allResults: T.ParseResult<T.ParsedBlock | T.ParsedKidsSection>[] = [ const allResults: T.ParseResult<T.ParsedBlock | T.ParsedKidsSection>[] = [
...np, ...(prevPh ? [] : parseAP(tokens, lookup)),
...ph, ...(prevPh ? [] : parseNP(tokens, lookup)),
...neg, ...(vbExists || prevPh ? [] : parsePH(tokens)),
...vb, ...parseVerb(tokens, lookup),
...vbp, ...parsePastPart(tokens, lookup),
...eq, ...parseEquative(tokens),
...kidsR, ...parseNeg(tokens),
...parseKidsSection(tokens, []),
]; ];
// TODO: is this necessary? // TODO: is this necessary?
// if (!allResults.length) { // if (!allResults.length) {

View File

@ -0,0 +1,186 @@
import * as T from "../../../types";
import { lookup, wordQuery } from "./lookup";
import { tokenizer } from "./tokenizer";
import { parsePastPart } from "./parse-past-part";
import { kawulDyn, kawulStat, kedulDyn, kedulStat } from "./irreg-verbs";
// Verb entries fetched through wordQuery and used as the expected `verb`
// values in the test table below.
const leedul = wordQuery("لیدل", "verb");
const akheestul = wordQuery("اخیستل", "verb");
const wahul = wordQuery("وهل", "verb");
const awuxtul = wordQuery("اوښتل", "verb");
// NOTE(review): this one is queried with a Latin-script key, unlike the
// Pashto-script keys above — presumably wordQuery also matches on a
// romanized field; confirm against wordQuery.
const tlul = wordQuery("tlul", "verb");
/**
 * Builds the parsed past-participle block expected for `verb` with the given
 * gender and number.
 */
function expectedPPart(
  verb: T.VerbEntry,
  gender: "masc" | "fem",
  number: "singular" | "plural"
): T.ParsedVBP {
  return {
    type: "VB",
    info: {
      type: "ppart",
      genNum: { gender, number },
      verb,
    },
  };
}

// Table of inputs and the exact list of parsed blocks each one must produce,
// grouped by the kind of past participle being exercised.
const tests: {
  label: string;
  cases: {
    input: string;
    output: T.ParsedVBP[];
  }[];
}[] = [
  {
    label: "regular past participles",
    cases: [
      {
        input: "لیدلی",
        output: [expectedPPart(leedul, "masc", "singular")],
      },
      {
        input: "وهلي",
        output: [expectedPPart(wahul, "masc", "plural")],
      },
      {
        // this ending is expected to yield both fem. singular and fem. plural
        input: "وهلې",
        output: [
          expectedPPart(wahul, "fem", "singular"),
          expectedPPart(wahul, "fem", "plural"),
        ],
      },
    ],
  },
  {
    label: "past participles with short forms",
    cases: [
      {
        input: "اخیستی",
        output: [expectedPPart(akheestul, "masc", "singular")],
      },
      {
        input: "اخیستلی",
        output: [expectedPPart(akheestul, "masc", "singular")],
      },
      {
        input: "اوښتی",
        output: [expectedPPart(awuxtul, "masc", "singular")],
      },
    ],
  },
  {
    label: "irregular past participles",
    cases: [
      {
        input: "تلی",
        output: [expectedPPart(tlul, "masc", "singular")],
      },
      {
        // stative entry first, then dynamic
        input: "کړي",
        output: [
          expectedPPart(kawulStat, "masc", "plural"),
          expectedPPart(kawulDyn, "masc", "plural"),
        ],
      },
      {
        // stative entry first, then dynamic
        input: "شوي",
        output: [
          expectedPPart(kedulStat, "masc", "plural"),
          expectedPPart(kedulDyn, "masc", "plural"),
        ],
      },
    ],
  },
];
// Registers one jest test per group in `tests`; inside each test, every input
// is tokenized and parsed, and the bodies of the results are compared with
// the expected output.
describe("parsing past participles", () => {
  for (const { label, cases } of tests) {
    // eslint-disable-next-line jest/valid-title
    test(label, () => {
      for (const { input, output } of cases) {
        const parsed = parsePastPart(tokenizer(input), lookup);
        const bodies = parsed.map((result) => result.body);
        expect(bodies).toEqual(output);
      }
    });
  }
});

View File

@ -16,7 +16,7 @@ export function parsePastPart(
} }
// TODO: ALSO HANDLE SHORT FORMS // TODO: ALSO HANDLE SHORT FORMS
const wOutEnd = s.slice(0, -1); const wOutEnd = s.slice(0, -1);
const matches = lookup(wOutEnd, "participle"); const matches = lookup(wOutEnd, "pPart");
const genNums = endingGenderNum(ending); const genNums = endingGenderNum(ending);
return matches return matches
.flatMap<T.ParsedVBP>((verb) => .flatMap<T.ParsedVBP>((verb) =>

View File

@ -31,7 +31,8 @@ const akheestul = wordQuery("اخیستل", "verb");
const alwatul = wordQuery("الوتل", "verb"); const alwatul = wordQuery("الوتل", "verb");
// const dartlul = wordQuery("درتلل", "verb") // const dartlul = wordQuery("درتلل", "verb")
// todo alwatul waalwatul akhistul azmoyul etc // TODO: azmoyul etc
// TODO: cleaner and more thorough handling of ا separating verbs ee - wee etc
const tests: { const tests: {
label: string; label: string;
@ -394,7 +395,7 @@ const tests: {
{ {
root: { root: {
persons: [T.Person.FirstSingMale, T.Person.FirstSingFemale], persons: [T.Person.FirstSingMale, T.Person.FirstSingFemale],
aspects: ["imperfective", "perfective"], aspects: ["imperfective"],
}, },
verb: akheestul, verb: akheestul,
}, },
@ -419,7 +420,7 @@ const tests: {
{ {
stem: { stem: {
persons: [T.Person.SecondSingMale, T.Person.SecondSingFemale], persons: [T.Person.SecondSingMale, T.Person.SecondSingFemale],
aspects: ["imperfective", "perfective"], aspects: ["imperfective"],
}, },
verb: alwatul, verb: alwatul,
}, },

View File

@ -12,9 +12,6 @@ import {
import { LookupFunction } from "./lookup"; import { LookupFunction } from "./lookup";
import { shortVerbEndConsonant } from "./misc"; import { shortVerbEndConsonant } from "./misc";
// big problem ما سړی یوړ crashes it !!
// BIG problem - issue with و being considered a VB for a lot of little verbs like بلل
// TODO: کول verbs! // TODO: کول verbs!
// check that aawu stuff is working // check that aawu stuff is working
// check oo`azmooy - // check oo`azmooy -
@ -102,6 +99,9 @@ function matchVerbs(
} }
} }
} else if (e.psp) { } else if (e.psp) {
if (hasBreakawayAlef(e) && startsWithAleph(base)) {
return acc;
}
if (e.separationAtP) { if (e.separationAtP) {
const bRest = e.psp.slice(e.separationAtP); const bRest = e.psp.slice(e.separationAtP);
if (bRest === base) { if (bRest === base) {
@ -117,6 +117,8 @@ function matchVerbs(
return [...acc, entry]; return [...acc, entry];
} }
} }
} else if (hasBreakawayAlef(e) && startsWithAleph(base)) {
return acc;
} else if (e.c.includes("intrans.")) { } else if (e.c.includes("intrans.")) {
const miniRoot = e.p !== "کېدل" && e.p.slice(0, -3); const miniRoot = e.p !== "کېدل" && e.p.slice(0, -3);
const miniRootEg = miniRoot + "ېږ"; const miniRootEg = miniRoot + "ېږ";
@ -169,6 +171,8 @@ function matchVerbs(
if (matchShortOrLong(base, bRest)) { if (matchShortOrLong(base, bRest)) {
return [...acc, entry]; return [...acc, entry];
} }
} else if (hasBreakawayAlef(e) && startsWithAleph(base) && !e.prp) {
return acc;
} else { } else {
const p = e.prp || e.p; const p = e.prp || e.p;
if (matchShortOrLong(base, p) || matchShortOrLong("ا" + base, p)) { if (matchShortOrLong(base, p) || matchShortOrLong("ا" + base, p)) {
@ -245,6 +249,9 @@ function matchVerbs(
} }
} }
} else if (!e.prp) { } else if (!e.prp) {
if (hasBreakawayAlef(e) && startsWithAleph(base)) {
return acc;
}
if (oEnd) { if (oEnd) {
if ([e.p, e.p.slice(0, -1)].includes(base)) { if ([e.p, e.p.slice(0, -1)].includes(base)) {
return [...acc, entry]; return [...acc, entry];
@ -395,3 +402,11 @@ function parseIrregularVerb(s: string): T.ParsedVBE[] {
} }
return []; return [];
} }
/**
 * Reports whether a verb entry has no `sepOo` flag set and its Pashto form
 * (`p`) begins with ا or آ.
 *
 * @param e - the verb dictionary entry to inspect
 * @returns `true` only when `sepOo` is falsy and `p` starts with ا / آ
 */
function hasBreakawayAlef(e: T.VerbDictionaryEntry): boolean {
  if (e.sepOo) {
    return false;
  }
  const initial = e.p.charAt(0);
  return initial === "ا" || initial === "آ";
}
/**
 * Reports whether `base` begins with ا or آ.
 *
 * @param base - the text to inspect
 * @returns `true` when the first character is ا or آ; `false` otherwise,
 *   including for an empty string
 */
function startsWithAleph(base: string): boolean {
  return base.startsWith("ا") || base.startsWith("آ");
}

View File

@ -86,6 +86,10 @@ const tests: {
output: [], output: [],
error: true, error: true,
}, },
{
input: "ما وانه اخیست",
output: [],
},
], ],
}, },
{ {
@ -1005,129 +1009,129 @@ const tests: {
})) }))
), ),
}, },
// { {
// input: "ودې وینم", input: "ودې وینم",
// output: getPeople(2, "sing").flatMap((objectPerson) => output: getPeople(2, "sing").flatMap((objectPerson) =>
// getPeople(1, "sing").map<T.VPSelectionComplete>((subjectPerson) => ({ getPeople(1, "sing").map<T.VPSelectionComplete>((subjectPerson) => ({
// blocks: [ blocks: [
// { {
// key: 1, key: 1,
// block: makeSubjectSelectionComplete({ block: makeSubjectSelectionComplete({
// type: "NP", type: "NP",
// selection: makePronounSelection(subjectPerson), selection: makePronounSelection(subjectPerson),
// }), }),
// }, },
// { {
// key: 2, key: 2,
// block: makeObjectSelectionComplete({ block: makeObjectSelectionComplete({
// type: "NP", type: "NP",
// selection: makePronounSelection(objectPerson), selection: makePronounSelection(objectPerson),
// }), }),
// }, },
// ], ],
// verb: { verb: {
// type: "verb", type: "verb",
// verb: leedul, verb: leedul,
// transitivity: "transitive", transitivity: "transitive",
// canChangeTransitivity: false, canChangeTransitivity: false,
// canChangeStatDyn: false, canChangeStatDyn: false,
// negative: false, negative: false,
// tense: "subjunctiveVerb", tense: "subjunctiveVerb",
// canChangeVoice: true, canChangeVoice: true,
// isCompound: false, isCompound: false,
// voice: "active", voice: "active",
// }, },
// externalComplement: undefined, externalComplement: undefined,
// form: { form: {
// removeKing: true, removeKing: true,
// shrinkServant: true, shrinkServant: true,
// }, },
// })) }))
// ), ),
// }, },
// { {
// input: "وینم به دې", input: "وینم به دې",
// output: getPeople(2, "sing").flatMap((objectPerson) => output: getPeople(2, "sing").flatMap((objectPerson) =>
// getPeople(1, "sing").map<T.VPSelectionComplete>((subjectPerson) => ({ getPeople(1, "sing").map<T.VPSelectionComplete>((subjectPerson) => ({
// blocks: [ blocks: [
// { {
// key: 1, key: 1,
// block: makeSubjectSelectionComplete({ block: makeSubjectSelectionComplete({
// type: "NP", type: "NP",
// selection: makePronounSelection(subjectPerson), selection: makePronounSelection(subjectPerson),
// }), }),
// }, },
// { {
// key: 2, key: 2,
// block: makeObjectSelectionComplete({ block: makeObjectSelectionComplete({
// type: "NP", type: "NP",
// selection: makePronounSelection(objectPerson), selection: makePronounSelection(objectPerson),
// }), }),
// }, },
// ], ],
// verb: { verb: {
// type: "verb", type: "verb",
// verb: leedul, verb: leedul,
// transitivity: "transitive", transitivity: "transitive",
// canChangeTransitivity: false, canChangeTransitivity: false,
// canChangeStatDyn: false, canChangeStatDyn: false,
// negative: false, negative: false,
// tense: "imperfectiveFuture", tense: "imperfectiveFuture",
// canChangeVoice: true, canChangeVoice: true,
// isCompound: false, isCompound: false,
// voice: "active", voice: "active",
// }, },
// externalComplement: undefined, externalComplement: undefined,
// form: { form: {
// removeKing: true, removeKing: true,
// shrinkServant: true, shrinkServant: true,
// }, },
// })) }))
// ), ),
// }, },
// { {
// input: "یو به مې ړلې", input: "یو به مې ړلې",
// output: [...getPeople(2, "sing"), T.Person.ThirdPlurFemale].flatMap( output: [...getPeople(2, "sing"), T.Person.ThirdPlurFemale].flatMap(
// (objectPerson) => (objectPerson) =>
// getPeople(1, "sing").map<T.VPSelectionComplete>( getPeople(1, "sing").map<T.VPSelectionComplete>(
// (subjectPerson) => ({ (subjectPerson) => ({
// blocks: [ blocks: [
// { {
// key: 1, key: 1,
// block: makeSubjectSelectionComplete({ block: makeSubjectSelectionComplete({
// type: "NP", type: "NP",
// selection: makePronounSelection(subjectPerson), selection: makePronounSelection(subjectPerson),
// }), }),
// }, },
// { {
// key: 2, key: 2,
// block: makeObjectSelectionComplete({ block: makeObjectSelectionComplete({
// type: "NP", type: "NP",
// selection: makePronounSelection(objectPerson), selection: makePronounSelection(objectPerson),
// }), }),
// }, },
// ], ],
// verb: { verb: {
// type: "verb", type: "verb",
// verb: wurul, verb: wurul,
// transitivity: "transitive", transitivity: "transitive",
// canChangeTransitivity: false, canChangeTransitivity: false,
// canChangeStatDyn: false, canChangeStatDyn: false,
// negative: false, negative: false,
// tense: "habitualPerfectivePast", tense: "habitualPerfectivePast",
// canChangeVoice: true, canChangeVoice: true,
// isCompound: false, isCompound: false,
// voice: "active", voice: "active",
// }, },
// externalComplement: undefined, externalComplement: undefined,
// form: { form: {
// removeKing: true, removeKing: true,
// shrinkServant: true, shrinkServant: true,
// }, },
// }) })
// ) )
// ), ),
// }, },
], ],
}, },
{ {

View File

@ -17,13 +17,6 @@ import { personToGenNum } from "../misc-helpers";
import { equals } from "rambda"; import { equals } from "rambda";
// to hide equatives type-doubling issue // to hide equatives type-doubling issue
// this should also conjugate to
// وامې نه خیسته
// وامې نه خیستلو
// waa-me nú kheestulo
// وامې نه اخیست
// waa-me nú akheest
// TODO: word query for kawul/kedul/stat/dyn // TODO: word query for kawul/kedul/stat/dyn
// TODO: learn how to yank / use plugin for JSON neovim // TODO: learn how to yank / use plugin for JSON neovim
@ -37,6 +30,9 @@ import { equals } from "rambda";
// so we don't get something like ښځو زه خوړلې یم with a hanging // so we don't get something like ښځو زه خوړلې یم with a hanging
// یم not used // یم not used
// TODO: way to get an error message for past participle and equative
// not matching up
export function parseVP( export function parseVP(
tokens: Readonly<T.Token[]>, tokens: Readonly<T.Token[]>,
lookup: LookupFunction lookup: LookupFunction

View File

@ -1199,6 +1199,7 @@ export type ParsedBlock =
| ParsedPH | ParsedPH
| ParsedVBE | ParsedVBE
| ParsedVBP | ParsedVBP
| APSelection
| NegativeBlock; | NegativeBlock;
export type ParsedKidsSection = { export type ParsedKidsSection = {