From 73eb04d7e065cfcedaa239797ded5e11aafadd97 Mon Sep 17 00:00:00 2001 From: adueck Date: Mon, 14 Oct 2024 20:22:32 -0400 Subject: [PATCH] parser working - a bit slow/rough - with dictionary lookup --- .gitignore | 1 + get-mini-dict-and-split-verbs.ts | 41 ++ get-mini-dict.ts | 22 - package.json | 4 +- src/App.tsx | 6 +- src/demo-components/ParserDemo.tsx | 163 ++++---- src/lib/src/dictionary/dictionary.ts | 48 ++- src/lib/src/parsing/mini-test-dictionary.ts | 34 +- src/lib/src/parsing/parse-adverb.ts | 6 +- src/lib/src/parsing/parse-ap.ts | 7 +- src/lib/src/parsing/parse-blocks.ts | 15 +- src/lib/src/parsing/parse-noun-new.test.ts | 2 +- src/lib/src/parsing/parse-noun-new.ts | 106 ++--- src/lib/src/parsing/parse-noun.ts | 191 --------- src/lib/src/parsing/parse-np.ts | 9 +- src/lib/src/parsing/parse-npap.ts | 15 +- src/lib/src/parsing/parse-participle.test.ts | 42 +- src/lib/src/parsing/parse-participle.ts | 14 +- src/lib/src/parsing/parse-phrase.ts | 22 +- src/lib/src/parsing/parse-possesor.test.ts | 16 +- src/lib/src/parsing/parse-possesor.ts | 13 +- src/lib/src/parsing/parse-sandwich.ts | 5 +- ...arse-vbe.test.ts => parse-vbe-new.test.ts} | 365 ++++++++++------- src/lib/src/parsing/parse-vbe-new.ts | 385 ++++++++++++++++++ src/lib/src/parsing/parse-vbe.ts | 354 ---------------- src/lib/src/parsing/parse-vbp.ts | 126 +++--- src/lib/src/parsing/parse-verb-helpers.ts | 10 +- src/lib/src/parsing/parse-vp.ts | 7 +- src/lib/src/parsing/utils.ts | 32 ++ src/lib/src/phrase-building/np-tools.ts | 2 +- .../src/phrase-building/remove-redundant.ts | 58 +++ src/types.ts | 7 +- tsconfig.node.json | 2 +- vocab/mini-dict-tss.ts | 25 ++ 34 files changed, 1157 insertions(+), 998 deletions(-) create mode 100644 get-mini-dict-and-split-verbs.ts delete mode 100644 get-mini-dict.ts delete mode 100644 src/lib/src/parsing/parse-noun.ts rename src/lib/src/parsing/{parse-vbe.test.ts => parse-vbe-new.test.ts} (82%) create mode 100644 src/lib/src/parsing/parse-vbe-new.ts delete mode 100644 src/lib/src/parsing/parse-vbe.ts create mode 100644 src/lib/src/phrase-building/remove-redundant.ts diff --git a/.gitignore b/.gitignore index 2c74fa3..f307474 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ lerna-debug.log* src/verbs.ts src/nouns-adjs.ts vocab/mini-dict-entries.ts +src/lib/src/parsing/split-verbs.ts # testing /coverage diff --git a/get-mini-dict-and-split-verbs.ts b/get-mini-dict-and-split-verbs.ts new file mode 100644 index 0000000..a700f65 --- /dev/null +++ b/get-mini-dict-and-split-verbs.ts @@ -0,0 +1,41 @@ +import * as T from "./src/types"; +import * as tp from "./src/lib/src/type-predicates"; +import fs from "fs"; + +import { entries as collection } from "./vocab/mini-dict-tss"; + +const res = await fetch( + "https://storage.lingdocs.com/dictionary/dictionary.json" +); +const dictionary = (await res.json()) as T.Dictionary; + +const entries: T.DictionaryEntry[] = dictionary.entries.filter((x) => + collection.includes(x.ts) +); + +const splitEntries: T.VerbDictionaryEntry[] = + dictionary.entries.filter( + (x): x is T.VerbDictionaryEntry => + tp.isVerbDictionaryEntry(x) && + !!x.separationAtP && + !["کول", "کېدل"].includes(x.p) + ); + +const miniDictContents = `import { DictionaryEntry } from "../src/types"; +// DO NOT MODIFY - GENERATED FROM mini-dict-tss.ts +export const entries: DictionaryEntry[] = [ +${entries.map((e) => `\t${JSON.stringify(e)},`).join("\n")} +]; +`; + +const splitVerbContents = `import { VerbEntry, VerbDictionaryEntry } from "../../../types"; +// DO NOT MODIFY - GENERATED +export const entries: VerbEntry[] = [ +${splitEntries + .map((e) => `\t{ entry: ${JSON.stringify(e)} as VerbDictionaryEntry },`) + .join("\n")} +]; +`; + +fs.writeFileSync("./vocab/mini-dict-entries.ts", miniDictContents); +fs.writeFileSync("./src/lib/src/parsing/split-verbs.ts", splitVerbContents); diff --git a/get-mini-dict.ts b/get-mini-dict.ts deleted file mode 100644 index 3a3c94c..0000000 --- a/get-mini-dict.ts +++ /dev/null @@ -1,22 +0,0 @@ -import * as T from "./src/types"; -import fs from "fs"; - -import { entries as collection } from "./vocab/mini-dict-tss"; - -const res = await fetch( - "https://storage.lingdocs.com/dictionary/dictionary.json" -); -const dictionary = (await res.json()) as T.Dictionary; - -const entries: T.DictionaryEntry[] = dictionary.entries.filter((x) => - collection.includes(x.ts) -); - -const contents = `import { DictionaryEntry } from "../src/types"; -// DO NOT MODIFY - GENERATED FROM mini-dict-tss.ts -export const entries: DictionaryEntry[] = [ -${entries.map((e) => `\t${JSON.stringify(e)},`).join("\n")} -]; -`; - -fs.writeFileSync("./vocab/mini-dict-entries.ts", contents); diff --git a/package.json b/package.json index 5d14d60..d8e6e31 100644 --- a/package.json +++ b/package.json @@ -14,7 +14,7 @@ "build-website": "tsc -b && vite build", "build-components": "rm -rf src/components/dist && tsc --project src/components/tsconfig.json && cd src/components && node post-build.cjs", "build-lib": "rm -rf src/lib/dist && tsc --project src/lib/tsconfig.json && tsup src/lib/library.ts --format cjs && mv dist/library.cjs src/lib/dist/lib", - "get-words": "node get-words.cjs && tsx get-mini-dict.ts", + "get-words": "node get-words.cjs && tsx get-mini-dict-and-split-verbs.ts", "check-all-inflections": "tsx check-all-inflections.ts" }, "dependencies": { @@ -46,4 +46,4 @@ "typescript-eslint": "^8.0.0", "vite": "^5.4.0" } -} +} \ No newline at end of file diff --git a/src/App.tsx b/src/App.tsx index 9020946..515ddca 100644 --- a/src/App.tsx +++ b/src/App.tsx @@ -19,7 +19,7 @@ import { entryFeeder } from "./demo-components/entryFeeder"; import Hider from "./components/src/Hider"; import InflectionDemo from "./demo-components/InflectionDemo"; import SpellingDemo from "./demo-components/SpellingDemo"; -// import ParserDemo from "./demo-components/ParserDemo"; +import ParserDemo from "./demo-components/ParserDemo"; // import InflectionTable from "./components/src/InflectionsTable"; function App() { @@ -163,7 +163,7 @@ function App() { > - {/* - */} + (""); const [result, setResult] = useState< - ReturnType[number]["body"][] + ReturnType["success"] >([]); // ReturnType["success"] const [errors, setErrors] = useState([]); @@ -70,16 +75,10 @@ function ParserDemo({ setErrors([]); return; } - const res = parseNoun(tokenizer(value), dictionary, undefined); - const success: ReturnType[number]["body"][] = res - .filter((x) => !x.tokens.length) - .map((x) => x.body); - const errors = [ - ...new Set(res.flatMap(({ errors }) => errors.map((e) => e.message))), - ]; + const res = parsePhrase(tokenizer(value), dictionary); setText(value); - setErrors(errors); - setResult(success); + setErrors(res.errors); + setResult(removeRedundantVPSs(res.success)); } return (
@@ -141,34 +140,8 @@ function ParserDemo({
Did you mean:
)} - {result.map((r) => { - try { - const renderedNP: T.Rendered = { - type: "NP", - selection: renderNounSelection(r.selection, r.inflected, "none"), - }; - return ( - <> - {r.inflected ? "INFLECTED" : "PLAIN"} - - {renderedNP} - - - ); - } catch (e) { - console.error(e); - return
ERROR RENDERING
; - } - })} - - {/* {result.map((res) => - "inflected" in res ? ( - - ) : "verb" in res ? ( + {result.map((res) => ( + <> {uncompleteVPSelection(res)} - ) : ( - // (() => { - // try { - // const rendered = renderVP(res); - // const compiled = compileVP(rendered, res.form); - // return ( - //
- // - // {compiled.e && ( - //
- // {compiled.e.map((e, i) => ( - //
{e}
- // ))} - //
- // )} - //
- // ); - // } catch (e) { - // console.error(e); - // console.log({ res }); - // return
ERROR
; - // } - // })() - -
{JSON.stringify(res, null, "  ")}
-
- ) - )} */} -
- AST - -
{JSON.stringify(result, null, "  ")}
-
-
+
+ AST + +
+ + ))}
); } export default ParserDemo; + +// {/* {result.map((res) => +// "inflected" in res ? ( +// +// ) : "verb" in res ? ( +// +// {uncompleteVPSelection(res)} +// +// ) : ( +// (() => { +// try { +// const rendered = renderVP(res); +// const compiled = compileVP(rendered, res.form); +// return ( +//
+// +// {compiled.e && ( +//
+// {compiled.e.map((e, i) => ( +//
{e}
+// ))} +//
+// )} +//
+// ); +// } catch (e) { +// console.error(e); +// console.log({ res }); +// return
ERROR
; +// } +// })() +// +//
{JSON.stringify(res, null, "  ")}
+//
+// ) +// )} */} + +// try { +// const renderedNP: T.Rendered = { +// type: "NP", +// selection: renderNounSelection(r.selection, r.inflected, "none"), +// }; +// return ( +// <> +// {r.inflected ? "INFLECTED" : "PLAIN"} +// +// {renderedNP} +// +// +// ); +// } catch (e) { +// console.error(e); +// return
ERROR RENDERING
; +// } diff --git a/src/lib/src/dictionary/dictionary.ts b/src/lib/src/dictionary/dictionary.ts index 540fa1c..fcefa90 100644 --- a/src/lib/src/dictionary/dictionary.ts +++ b/src/lib/src/dictionary/dictionary.ts @@ -19,7 +19,19 @@ function queryP(p: string): T.DictionaryEntry[] { } return dictDb.collection.find({ p }); } -const memoizedQueryP = queryP; +const memoizedQueryP = memoize(queryP); + +function queryTs(ts: number): T.DictionaryEntry { + if (!dictDb.collection) { + throw new Error("dictionary not initialized yet"); + } + const res = dictDb.findOneByTs(ts); + if (!res) { + throw new Error("complement link broken"); + } + return res; +} +const memoizedQueryTs = memoize(queryTs); function adjLookup(p: string): T.AdjectiveEntry[] { const res = memoizedQueryP(p); @@ -33,26 +45,51 @@ function nounLookup(p: string): T.NounEntry[] { function otherLookup( key: keyof T.DictionaryEntry, - p: string + p: string, + regex?: boolean ): T.DictionaryEntry[] { if (!dictDb.collection) { return []; } - return dictDb.collection.find({ [key]: p }); + return dictDb.collection.find({ [key]: regex ? variationRegex(p) : p }); } function specialPluralLookup(p: string): T.NounEntry[] { if (!dictDb.collection) { return []; } - const regex = new RegExp(`(^|\\s|,)${p}($|,)`); + const regex = variationRegex(p); return dictDb.collection .find({ - $or: [{ ppp: { $regex: regex } }, { app: { $regex: regex } }], + $or: [{ ppp: regex }, { app: regex }], }) .filter(tp.isNounEntry); } +function verbEntryLookup(p: string): T.VerbEntry[] { + if (!dictDb.collection) { + return []; + } + return memoizedQueryP(p) + .filter(tp.isVerbDictionaryEntry) + .map((entry) => + entry.l + ? { + entry, + complement: memoizedQueryTs(entry.l), + } + : { entry } + ); +} + +/** + * creates a RegEx mongo query to search for a variation in a certain field + * ie. to search for کاته in کوت, کاته + */ +function variationRegex(p: string): { $regex: RegExp } { + return { $regex: new RegExp(`(^|\\s|,)${p}($|,)`) }; +} + export const dictionary: T.DictionaryAPI = { initialize: async () => await dictDb.initialize(), update: async () => await dictDb.updateDictionary(() => null), @@ -61,4 +98,5 @@ export const dictionary: T.DictionaryAPI = { nounLookup: memoize(nounLookup), otherLookup: memoize(otherLookup), specialPluralLookup: memoize(specialPluralLookup), + verbEntryLookup: memoize(verbEntryLookup), }; diff --git a/src/lib/src/parsing/mini-test-dictionary.ts b/src/lib/src/parsing/mini-test-dictionary.ts index 636805e..cd7d37c 100644 --- a/src/lib/src/parsing/mini-test-dictionary.ts +++ b/src/lib/src/parsing/mini-test-dictionary.ts @@ -1,7 +1,15 @@ import * as T from "../../../types"; -import { isAdjectiveEntry, isNounEntry } from "../type-predicates"; +import { + isAdjectiveEntry, + isNounEntry, + isVerbDictionaryEntry, +} from "../type-predicates"; import { entries } from "../../../../vocab/mini-dict-entries"; +function variationRegex(p: string): { $regex: RegExp } { + return { $regex: new RegExp(`(^|\\s|,)${p}($|,)`) }; +} + const queryP = (p: string) => entries.filter((e) => e.p === p); function adjLookup(p: string): T.AdjectiveEntry[] { return queryP(p).filter(isAdjectiveEntry) as T.AdjectiveEntry[]; @@ -13,18 +21,37 @@ function nounLookup(p: string): T.NounEntry[] { function otherLookup( key: keyof T.DictionaryEntry, - p: string + p: string, + regex?: boolean ): T.DictionaryEntry[] { + if (regex) { + const { $regex: regex } = variationRegex(p); + return entries.filter((e) => (e[key] as string)?.match(regex)); + } return entries.filter((e) => e[key] === p); } function specialPluralLookup(p: string): T.NounEntry[] { - const regex = new RegExp(`(^|\\s|,)${p}($|,)`); + const { $regex: regex } = variationRegex(p); return entries.filter( (e) => (e.ppp?.match(regex) || e.app?.match(regex)) && isNounEntry(e) ) as T.NounEntry[]; } +function verbEntryLookup(p: string): T.VerbEntry[] { + return entries + .filter((e) => e.p === p) + .filter(isVerbDictionaryEntry) + .map((entry) => + entry.l + ? { + entry, + complement: entries.find((e) => e.ts === entry.l), + } + : { entry } + ); +} + export const testDictionary: T.DictionaryAPI = { // @ts-expect-error we won't mock the initialization initialize: async () => 0, @@ -35,4 +62,5 @@ export const testDictionary: T.DictionaryAPI = { nounLookup, otherLookup, specialPluralLookup, + verbEntryLookup, }; diff --git a/src/lib/src/parsing/parse-adverb.ts b/src/lib/src/parsing/parse-adverb.ts index 5ee1b6a..902f0f8 100644 --- a/src/lib/src/parsing/parse-adverb.ts +++ b/src/lib/src/parsing/parse-adverb.ts @@ -1,16 +1,16 @@ import * as T from "../../../types"; -import { LookupFunction } from "./lookup"; +import { isAdverbEntry } from "../type-predicates"; import { returnParseResultS } from "./utils"; export function parseAdverb( tokens: Readonly, - lookup: LookupFunction + dictionary: T.DictionaryAPI ): T.ParseResult[] { if (tokens.length === 0) { return []; } const [first, ...rest] = tokens; - const adverbs = lookup(first.s, "adverb"); + const adverbs = dictionary.queryP(first.s).filter(isAdverbEntry); return adverbs.map((entry) => returnParseResultS(rest, { type: "AP", diff --git a/src/lib/src/parsing/parse-ap.ts b/src/lib/src/parsing/parse-ap.ts index fc0729e..dcb1003 100644 --- a/src/lib/src/parsing/parse-ap.ts +++ b/src/lib/src/parsing/parse-ap.ts @@ -1,26 +1,25 @@ import * as T from "../../../types"; import { fmapParseResult } from "../fp-ps"; -import { LookupFunction } from "./lookup"; import { parseAdverb } from "./parse-adverb"; import { parseSandwich } from "./parse-sandwich"; export function parseAP( s: Readonly, - lookup: LookupFunction, + dicitonary: T.DictionaryAPI, possesor: T.PossesorSelection | undefined ): T.ParseResult[] { if (s.length === 0) { return []; } return [ - ...(!possesor ? parseAdverb(s, lookup) : []), + ...(!possesor ? parseAdverb(s, dicitonary) : []), ...fmapParseResult( (selection) => ({ type: "AP", selection, } as const), - parseSandwich(s, lookup, possesor) + parseSandwich(s, dicitonary, possesor) ), ]; } diff --git a/src/lib/src/parsing/parse-blocks.ts b/src/lib/src/parsing/parse-blocks.ts index 09ae872..8218a9a 100644 --- a/src/lib/src/parsing/parse-blocks.ts +++ b/src/lib/src/parsing/parse-blocks.ts @@ -1,12 +1,11 @@ import * as T from "../../../types"; -import { LookupFunction } from "./lookup"; import { parseEquative } from "./parse-equative"; import { parseKidsSection } from "./parse-kids-section"; import { parseNeg } from "./parse-negative"; import { parseNPAP } from "./parse-npap"; import { parseVBP } from "./parse-vbp"; import { parsePH } from "./parse-ph"; -import { parseVBE } from "./parse-vbe"; +import { parseVBE } from "./parse-vbe-new"; import { bindParseResult, returnParseResult, @@ -18,7 +17,7 @@ import { isKedulStatEntry } from "./parse-verb-helpers"; export function parseBlocks( tokens: Readonly, - lookup: LookupFunction, + dicitonary: T.DictionaryAPI, blocks: T.ParsedBlock[], kids: T.ParsedKid[] ): T.ParseResult<{ @@ -35,13 +34,13 @@ export function parseBlocks( // TOOD: rather parse VBP / VBE const allBlocks: T.ParseResult[] = [ - ...(!inVerbSection ? parseNPAP(tokens, lookup) : []), + ...(!inVerbSection ? parseNPAP(tokens, dicitonary) : []), // ensure at most one of each PH, VBE, VBP ...(prevPh ? [] : parsePH(tokens)), ...(blocks.some(isParsedVBE) ? [] - : [...parseVBE(tokens, lookup), ...parseEquative(tokens)]), - ...(blocks.some(isParsedVBP) ? [] : parseVBP(tokens, lookup)), + : [...parseVBE(tokens, dicitonary), ...parseEquative(tokens)]), + ...(blocks.some(isParsedVBP) ? [] : parseVBP(tokens, dicitonary)), ...(blocks.some((b) => b.type === "negative") ? [] : parseNeg(tokens)), ...parseKidsSection(tokens, []), ]; @@ -50,7 +49,7 @@ export function parseBlocks( const errors: T.ParseError[] = []; if (r.type === "kids") { return { - next: parseBlocks(tokens, lookup, blocks, [...kids, ...r.kids]), + next: parseBlocks(tokens, dicitonary, blocks, [...kids, ...r.kids]), errors: blocks.length !== 1 ? [{ message: "kids' section out of place" }] @@ -71,7 +70,7 @@ export function parseBlocks( return []; } return { - next: parseBlocks(tokens, lookup, [...blocks, r], kids), + next: parseBlocks(tokens, dicitonary, [...blocks, r], kids), errors, }; }); diff --git a/src/lib/src/parsing/parse-noun-new.test.ts b/src/lib/src/parsing/parse-noun-new.test.ts index 61256d1..a6b5bba 100644 --- a/src/lib/src/parsing/parse-noun-new.test.ts +++ b/src/lib/src/parsing/parse-noun-new.test.ts @@ -1736,7 +1736,7 @@ describe("parsing nouns", () => { test(category, () => { cases.forEach(({ input, output }) => { const tokens = tokenizer(input); - const res = parseNoun(tokens, testDictionary, undefined, []).flatMap( + const res = parseNoun(tokens, testDictionary, undefined).flatMap( // only take the ones that used all the tokens ({ body, tokens }) => (tokens.length === 0 ? [body] : []) ); diff --git a/src/lib/src/parsing/parse-noun-new.ts b/src/lib/src/parsing/parse-noun-new.ts index c0431ba..894accc 100644 --- a/src/lib/src/parsing/parse-noun-new.ts +++ b/src/lib/src/parsing/parse-noun-new.ts @@ -3,7 +3,12 @@ import { makeNounSelection } from "../phrase-building/make-selections"; import { parseAdjective } from "./parse-adjective-new"; import { parseDeterminer } from "./parse-determiner"; import { parseNounWord } from "./parse-noun-word"; -import { bindParseResult, parserCombMany, toParseError } from "./utils"; +import { + bindParseResult, + parserCombMany, + parserCombSucc3, + toParseError, +} from "./utils"; type NounResult = { inflected: boolean; selection: T.NounSelection }; @@ -15,57 +20,54 @@ export function parseNoun( if (tokens.length === 0) { return []; } - const detRes = parserCombMany(parseDeterminer)(tokens, dictionary); - // TODO: add recognition of او between adjectives - return bindParseResult(detRes, (t, determiners) => { - const adjRes = parserCombMany(parseAdjective)(t, dictionary); - return bindParseResult(adjRes, (tk, adjectives) => { - const nounWord = parseNounWord(tk, dictionary); - return bindParseResult(nounWord, (tkns, nr) => { - const { error: adjErrors } = adjDetsMatch( - adjectives, - nr.gender, - nr.inflected ? 1 : 0, - nr.plural - ); - const { error: detErrors } = adjDetsMatch( - determiners, - nr.gender, - nr.inflected ? 1 : 0, - nr.plural - ); - const dupErrors = checkForDeterminerDuplicates(determiners); - const s = makeNounSelection(nr.entry, undefined); - const body: NounResult = { - inflected: nr.inflected, - selection: { - ...s, - gender: nr.gender, - number: nr.plural ? "plural" : "singular", - adjectives: adjectives.map((a) => a.selection), - determiners: determiners.length - ? { - type: "determiners", - withNoun: true, - determiners: determiners.map((d) => d.selection), - } - : undefined, - possesor, - }, - }; - return [ - { - body, - tokens: tkns, - errors: [ - ...detErrors.map(toParseError), - ...dupErrors.map(toParseError), - ...adjErrors.map(toParseError), - ], - }, - ]; - }); - }); + const res = parserCombSucc3([ + parserCombMany(parseDeterminer), + parserCombMany(parseAdjective), + parseNounWord, + ])(tokens, dictionary); + return bindParseResult(res, (tkns, [determiners, adjectives, nounWord]) => { + const { error: adjErrors } = adjDetsMatch( + adjectives, + nounWord.gender, + nounWord.inflected ? 1 : 0, + nounWord.plural + ); + const { error: detErrors } = adjDetsMatch( + determiners, + nounWord.gender, + nounWord.inflected ? 1 : 0, + nounWord.plural + ); + const dupErrors = checkForDeterminerDuplicates(determiners); + const s = makeNounSelection(nounWord.entry, undefined); + const body: NounResult = { + inflected: nounWord.inflected, + selection: { + ...s, + gender: nounWord.gender, + number: nounWord.plural ? "plural" : "singular", + adjectives: adjectives.map((a) => a.selection), + determiners: determiners.length + ? { + type: "determiners", + withNoun: true, + determiners: determiners.map((d) => d.selection), + } + : undefined, + possesor, + }, + }; + return [ + { + body, + tokens: tkns, + errors: [ + ...detErrors.map(toParseError), + ...dupErrors.map(toParseError), + ...adjErrors.map(toParseError), + ], + }, + ]; }); } diff --git a/src/lib/src/parsing/parse-noun.ts b/src/lib/src/parsing/parse-noun.ts deleted file mode 100644 index 4252f81..0000000 --- a/src/lib/src/parsing/parse-noun.ts +++ /dev/null @@ -1,191 +0,0 @@ -import * as T from "../../../types"; -import { getInflectionPattern } from "../inflection-pattern"; -import { makeNounSelection } from "../phrase-building/make-selections"; -import { - isMascNounEntry, - isNounEntry, - isPluralNounEntry, - isUnisexNounEntry, -} from "../type-predicates"; -import { getInflectionQueries } from "./inflection-query"; -import { LookupFunction } from "./lookup"; -import { parseAdjective } from "./parse-adjective"; -import { bindParseResult } from "./utils"; - -type NounResult = { inflected: boolean; selection: T.NounSelection }; - -export function parseNoun( - tokens: Readonly, - lookup: LookupFunction, - possesor: T.PossesorSelection | undefined, - adjectives: { - inflection: (0 | 1 | 2)[]; - gender: T.Gender[]; - given: string; - selection: T.AdjectiveSelection; - }[] -): T.ParseResult[] { - if (tokens.length === 0) { - return []; - } - // TODO: add recognition of او between adjectives - const adjRes = parseAdjective(tokens, lookup); - const withAdj = bindParseResult(adjRes, (tkns, adj) => - parseNoun(tkns, lookup, possesor, [...adjectives, adj]) - ); - const [first, ...rest] = tokens; - const searches = getInflectionQueries(first.s, true); - - const w: ReturnType = []; - searches.forEach(({ search, details }) => { - const nounEntries = lookup(search, "nounAdj").filter(isNounEntry); - details.forEach((deets) => { - const fittingEntries = nounEntries.filter(deets.predicate); - fittingEntries.forEach((entry) => { - const genders: T.Gender[] = isUnisexNounEntry(entry) - ? ["masc", "fem"] - : isMascNounEntry(entry) - ? ["masc"] - : ["fem"]; - deets.gender.forEach((gender) => { - if (genders.includes(gender)) { - deets.inflection.forEach((inf) => { - const { error: adjErrors } = adjsMatch( - adjectives, - gender, - inf, - deets.plural - ); - convertInflection(inf, entry, gender, deets.plural).forEach( - ({ inflected, number }) => { - const selection = makeNounSelection(entry, undefined); - const errors = [ - ...adjErrors.map((message) => ({ - message, - })), - ]; - w.push({ - tokens: rest, - body: { - inflected, - selection: { - ...selection, - gender: selection.genderCanChange - ? gender - : selection.gender, - number: selection.numberCanChange - ? number - : selection.number, - adjectives: adjectives.map((a) => a.selection), - // TODO: could be nicer to validate that the possesor is inflected before - // and just pass in the selection - possesor, - }, - }, - errors, - }); - } - ); - }); - } - }); - }); - }); - }); - return [...withAdj, ...w]; -} - -function adjsMatch( - adjectives: Parameters[3], - gender: T.Gender, - inf: 0 | 1 | 2, - plural: boolean | undefined -): { ok: boolean; error: string[] } { - const inflection = (plural && inf < 2 ? inf + 1 : inf) as 0 | 1 | 2; - const unmatching = adjectives.filter( - (adj) => - !adj.gender.includes(gender) || - !adj.inflection.some((i) => i === inflection) - ); - if (unmatching.length) { - return { - ok: false, - error: unmatching.map((x) => { - const adjText = - x.given === x.selection.entry.p - ? x.given - : `${x.given} (${x.selection.entry.p})`; - const inflectionIssue = !x.inflection.some((x) => x === inflection) - ? ` should be ${showInflection(inflection)}` - : ``; - return `Adjective agreement error: ${adjText} should be ${inflectionIssue} ${gender}.`; - }), - }; - } else { - return { - ok: true, - error: [], - }; - } -} - -function convertInflection( - inflection: 0 | 1 | 2, - entry: T.NounEntry | T.AdjectiveEntry, - gender: T.Gender, - plural: boolean | undefined -): { - inflected: boolean; - number: T.NounNumber; -}[] { - const pattern = getInflectionPattern(entry); - const inf = (plural && inflection < 2 ? inflection + 1 : inflection) as - | 0 - | 1 - | 2; - if (inf === 0) { - return [ - { - inflected: false, - number: "singular", - }, - ]; - } else if (inf === 1) { - return [ - ...(!((isNounEntry(entry) && isPluralNounEntry(entry)) || plural) && - !(pattern === 4 && entry.p.endsWith("ه") && gender === "masc") - ? [ - { - inflected: true, - number: "singular" as T.NounNumber, - }, - ] - : []), - ...(pattern > 1 || - (pattern > 0 && gender === "fem") || - (isNounEntry(entry) && isPluralNounEntry(entry)) || - plural - ? [ - { - inflected: false, - number: "plural" as T.NounNumber, - }, - ] - : []), - ]; - } - return [ - { - inflected: true, - number: "plural", - }, - ]; -} - -function showInflection(inf: 0 | 1 | 2): string { - return inf === 0 - ? "plain" - : inf === 1 - ? "first inflection" - : "second inflection"; -} diff --git a/src/lib/src/parsing/parse-np.ts b/src/lib/src/parsing/parse-np.ts index daf2cab..98b5dde 100644 --- a/src/lib/src/parsing/parse-np.ts +++ b/src/lib/src/parsing/parse-np.ts @@ -1,13 +1,12 @@ import * as T from "../../../types"; import { parsePronoun } from "./parse-pronoun"; -import { parseNoun } from "./parse-noun"; +import { parseNoun } from "./parse-noun-new"; import { fmapParseResult } from "../fp-ps"; import { parseParticiple } from "./parse-participle"; -import { LookupFunction } from "./lookup"; export function parseNP( s: Readonly, - lookup: LookupFunction, + dicitonary: T.DictionaryAPI, possesor: T.PossesorSelection | undefined ): T.ParseResult[] { if (s.length === 0) { @@ -41,7 +40,7 @@ export function parseNP( return fmapParseResult(makeNPSl, [ ...(!possesor ? parsePronoun(s) : []), - ...parseNoun(s, lookup, possesor, []), - ...parseParticiple(s, lookup, possesor), + ...parseNoun(s, dicitonary, possesor), + ...parseParticiple(s, dicitonary, possesor), ]); } diff --git a/src/lib/src/parsing/parse-npap.ts b/src/lib/src/parsing/parse-npap.ts index be2c038..eb11790 100644 --- a/src/lib/src/parsing/parse-npap.ts +++ b/src/lib/src/parsing/parse-npap.ts @@ -1,5 +1,4 @@ import * as T from "../../../types"; -import { LookupFunction } from "./lookup"; import { parseAP } from "./parse-ap"; import { parseNP } from "./parse-np"; import { parsePossesor } from "./parse-possesor"; @@ -7,19 +6,25 @@ import { bindParseResult } from "./utils"; export function parseNPAP( s: Readonly, - lookup: LookupFunction + dictionary: T.DictionaryAPI ): T.ParseResult[] { if (s.length === 0) { return []; } - const possesor = parsePossesor(s, lookup, undefined); + const possesor = parsePossesor(s, dictionary, undefined); if (!possesor.length) { - return [...parseNP(s, lookup, undefined), ...parseAP(s, lookup, undefined)]; + return [ + ...parseNP(s, dictionary, undefined), + ...parseAP(s, dictionary, undefined), + ]; } return bindParseResult( possesor, (tokens, p) => { - return [...parseNP(tokens, lookup, p), ...parseAP(tokens, lookup, p)]; + return [ + ...parseNP(tokens, dictionary, p), + ...parseAP(tokens, dictionary, p), + ]; } ); } diff --git a/src/lib/src/parsing/parse-participle.test.ts b/src/lib/src/parsing/parse-participle.test.ts index 338e1ee..53ef457 100644 --- a/src/lib/src/parsing/parse-participle.test.ts +++ b/src/lib/src/parsing/parse-participle.test.ts @@ -4,14 +4,16 @@ import { makePossesorSelection, } from "../phrase-building/make-selections"; import * as T from "../../../types"; -import { lookup, wordQuery } from "./lookup"; +import { testDictionary } from "./mini-test-dictionary"; import { tokenizer } from "./tokenizer"; import { parseNPAP } from "./parse-npap"; -const leedul = wordQuery("لیدل", "verb"); -const akheestul = wordQuery("اخیستل", "verb"); -const wahul = wordQuery("وهل", "verb"); -const saray = wordQuery("سړی", "noun"); +const leedul = testDictionary.verbEntryLookup("لیدل")[0]; +const akheestul = testDictionary.verbEntryLookup("اخیستل")[0]; +const wahul = testDictionary.verbEntryLookup("وهل")[0]; +const saray = testDictionary.nounLookup("سړی")[0]; + +// TODO: uncomment and get parsing of short participles working const tests: { label: string; @@ -113,20 +115,20 @@ const tests: { }, ], }, - { - input: "د سړي لیدو", - output: [ - { - inflected: true, - selection: { - ...makeParticipleSelection(leedul), - possesor: makePossesorSelection( - makeNounSelection(saray, undefined) - ), - }, - }, - ], - }, + // { + // input: "د سړي لیدو", + // output: [ + // { + // inflected: true, + // selection: { + // ...makeParticipleSelection(leedul), + // possesor: makePossesorSelection( + // makeNounSelection(saray, undefined) + // ), + // }, + // }, + // ], + // }, ], }, ]; @@ -136,7 +138,7 @@ describe("parsing participles", () => { test(label, () => { cases.forEach(({ input, output }) => { const tokens = tokenizer(input); - const res = parseNPAP(tokens, lookup).map(({ body }) => body); + const res = parseNPAP(tokens, testDictionary).map(({ body }) => body); expect(res).toEqual( output.map( (x): T.ParsedNP => ({ diff --git a/src/lib/src/parsing/parse-participle.ts b/src/lib/src/parsing/parse-participle.ts index b9b5241..8a265a8 100644 --- a/src/lib/src/parsing/parse-participle.ts +++ b/src/lib/src/parsing/parse-participle.ts @@ -1,5 +1,5 @@ import * as T from "../../../types"; -import { LookupFunction } from "./lookup"; +import { shortVerbEndConsonant } from "./misc"; type ParticipleResult = { inflected: boolean; @@ -7,9 +7,10 @@ type ParticipleResult = { }; // TODO: should have adverbs with participle +// TODO: NOTE this does not work with compound verbs yet export function parseParticiple( tokens: Readonly, - lookup: LookupFunction, + dicitonary: T.DictionaryAPI, possesor: T.PossesorSelection | undefined ): T.ParseResult[] { if (tokens.length === 0) { @@ -20,8 +21,13 @@ export function parseParticiple( return []; } const inflected = first.s.endsWith("و"); - const matches = lookup(first.s, "participle"); - return matches.map>((verb) => ({ + + return [ + ...dicitonary.verbEntryLookup(inflected ? first.s.slice(0, -1) : first.s), + ...(inflected && shortVerbEndConsonant.includes(first.s.at(-2) || "") + ? dicitonary.verbEntryLookup(first.s.slice(0, -1) + "ل") + : []), + ].map>((verb) => ({ tokens: rest, body: { inflected, diff --git a/src/lib/src/parsing/parse-phrase.ts b/src/lib/src/parsing/parse-phrase.ts index cbacc15..2cf7ec4 100644 --- a/src/lib/src/parsing/parse-phrase.ts +++ b/src/lib/src/parsing/parse-phrase.ts @@ -1,24 +1,24 @@ import * as T from "../../../types"; -import { lookup } from "./lookup"; import { parseVP } from "./parse-vp"; // شو should not be sheyaano !! -export function parsePhrase(s: T.Token[]): { - success: ( - | { - inflected: boolean; - selection: T.NPSelection; - } - | Omit - | T.VPSelectionComplete - )[]; +export function parsePhrase( + s: T.Token[], + dicitonary: T.DictionaryAPI +): { + success: // | { + // inflected: boolean; + // selection: T.NPSelection; + // } + // | Omit + T.VPSelectionComplete[]; errors: string[]; } { const res = [ // ...parseNP(s, lookup).filter(({ tokens }) => !tokens.length), // ...parseVerb(s, verbLookup), - ...parseVP(s, lookup), + ...parseVP(s, dicitonary), ]; const success = res.filter((x) => !x.tokens.length).map((x) => x.body); diff --git a/src/lib/src/parsing/parse-possesor.test.ts b/src/lib/src/parsing/parse-possesor.test.ts index b7237ab..bfba9a0 100644 --- a/src/lib/src/parsing/parse-possesor.test.ts +++ b/src/lib/src/parsing/parse-possesor.test.ts @@ -4,16 +4,16 @@ import { makeNounSelection, makePronounSelection, } from "../phrase-building/make-selections"; -import { lookup, wordQuery } from "./lookup"; import { parsePossesor } from "./parse-possesor"; import { tokenizer } from "./tokenizer"; import { isCompleteResult } from "./utils"; +import { testDictionary as dictionary } from "./mini-test-dictionary"; -const sturey = wordQuery("ستړی", "adj"); -const sarey = wordQuery("سړی", "noun"); -const maashoom = wordQuery("ماشوم", "noun"); -const malguray = wordQuery("ملګری", "noun"); -const plaar = wordQuery("پلار", "noun"); +const sturey = dictionary.adjLookup("ستړی")[0]; +const sarey = dictionary.nounLookup("سړی")[0]; +const maashoom = dictionary.nounLookup("ماشوم")[0]; +const malguray = dictionary.nounLookup("ملګری")[0]; +const plaar = dictionary.nounLookup("پلار")[0]; const tests: { input: string; @@ -109,12 +109,12 @@ const tests: { test("parse possesor", () => { tests.forEach(({ input, output }) => { const tokens = tokenizer(input); - const parsed = parsePossesor(tokens, lookup, undefined); + const parsed = parsePossesor(tokens, dictionary, undefined); if (output === "error") { expect(parsed.some((x) => x.errors.length)).toBe(true); } else { expect( - parsePossesor(tokens, lookup, undefined) + parsePossesor(tokens, dictionary, undefined) .filter(isCompleteResult) .map((x) => x.body.np.selection) ).toEqual(output); diff --git a/src/lib/src/parsing/parse-possesor.ts b/src/lib/src/parsing/parse-possesor.ts index 09d65cf..a766bb4 100644 --- a/src/lib/src/parsing/parse-possesor.ts +++ b/src/lib/src/parsing/parse-possesor.ts @@ -1,5 +1,4 @@ import * as T from "../../../types"; -import { LookupFunction } from "./lookup"; import { parseNP } from "./parse-np"; import { bindParseResult } from "./utils"; // TODO: maybe contractions should just be male to cut down on the @@ -19,7 +18,7 @@ const contractions: [string[], T.Person[]][] = [ export function parsePossesor( tokens: Readonly, - lookup: LookupFunction, + dictionary: T.DictionaryAPI, prevPossesor: T.PossesorSelection | undefined ): T.ParseResult[] { if (tokens.length === 0) { @@ -43,14 +42,14 @@ export function parsePossesor( ? [{ message: "a pronoun cannot have a possesor" }] : []; return contractions - .flatMap((p) => parsePossesor(rest, lookup, p)) + .flatMap((p) => parsePossesor(rest, dictionary, p)) .map((x) => ({ ...x, errors: [...errors, ...x.errors], })); } if (first.s === "د") { - const np = parseNP(rest, lookup, undefined); + const np = parseNP(rest, dictionary, undefined); return bindParseResult(np, (tokens, body) => { const possesor: T.PossesorSelection = { shrunken: false, @@ -63,7 +62,11 @@ export function parsePossesor( [{ message: `possesor should be inflected` }] : [], // add and check error - can't add possesor to pronoun - next: parsePossesor(tokens, lookup, addPoss(prevPossesor, possesor)), + next: parsePossesor( + tokens, + dictionary, + addPoss(prevPossesor, possesor) + ), }; }); } diff --git a/src/lib/src/parsing/parse-sandwich.ts b/src/lib/src/parsing/parse-sandwich.ts index cb506d3..b815c42 100644 --- a/src/lib/src/parsing/parse-sandwich.ts +++ b/src/lib/src/parsing/parse-sandwich.ts @@ -1,5 +1,4 @@ import * as T from "../../../types"; -import { LookupFunction } from "./lookup"; import { sandwiches } from "../sandwiches"; import { parseNP } from "./parse-np"; import { bindParseResult } from "./utils"; @@ -14,7 +13,7 @@ import { bindParseResult } from "./utils"; export function parseSandwich( s: Readonly, - lookup: LookupFunction, + dictionary: T.DictionaryAPI, possesor: T.PossesorSelection | undefined ): T.ParseResult>[] { if (s.length === 0) { @@ -27,7 +26,7 @@ export function parseSandwich( (x) => x.before && x.before.p === first.s ); // TODO: this could be be really repetitive... - const nps = parseNP(startMatches.length ? rest : s, lookup, possesor); + const nps = parseNP(startMatches.length ? rest : s, dictionary, possesor); return bindParseResult(nps, (tokens, np) => { if (!tokens.length) { return []; diff --git a/src/lib/src/parsing/parse-vbe.test.ts b/src/lib/src/parsing/parse-vbe-new.test.ts similarity index 82% rename from src/lib/src/parsing/parse-vbe.test.ts rename to src/lib/src/parsing/parse-vbe-new.test.ts index d20bf38..771118c 100644 --- a/src/lib/src/parsing/parse-vbe.test.ts +++ b/src/lib/src/parsing/parse-vbe-new.test.ts @@ -7,28 +7,35 @@ import { wartlul, raatlul, } from "./irreg-verbs"; -import { lookup, wordQuery } from "./lookup"; -import { parseVBE } from "./parse-vbe"; +import { parseVBE } from "./parse-vbe-new"; import { tokenizer } from "./tokenizer"; import { getPeople, removeKeys } from "./utils"; +import { testDictionary } from "./mini-test-dictionary"; -const wahul = wordQuery("وهل", "verb"); -const leekul = wordQuery("لیکل", "verb"); -const manul = wordQuery("منل", "verb"); -// const gaalul = wordQuery("ګالل", "verb"); -const rasedul = wordQuery("رسېدل", "verb"); -const leedul = wordQuery("لیدل", "verb"); -const khorul = wordQuery("خوړل", "verb"); -const kenaastul = wordQuery("کېناستل", "verb"); -const prexodul = wordQuery("پرېښودل", "verb"); -const xodul = wordQuery("ښودل", "verb"); -const kexodul = wordQuery("کېښودل", "verb"); -const katul = wordQuery("کتل", "verb"); -const watul = wordQuery("وتل", "verb"); -const wurul = wordQuery("وړل", "verb"); -const akheestul = wordQuery("اخیستل", "verb"); -const alwatul = wordQuery("الوتل", "verb"); -// const dartlul = wordQuery("درتلل", "verb") +const wahul = testDictionary.verbEntryLookup("وهل")[0]; +const leekul = testDictionary.verbEntryLookup("لیکل")[0]; +const manul = testDictionary.verbEntryLookup("منل")[0]; +const gaalul = testDictionary.verbEntryLookup("ګالل")[0]; +const rasedul = testDictionary.verbEntryLookup("رسېدل")[0]; +const leedul = testDictionary.verbEntryLookup("لیدل")[0]; +const awuxtul = testDictionary.verbEntryLookup("اوښتل")[0]; +const khorul = testDictionary.verbEntryLookup("خوړل")[0]; +const kenaastul = testDictionary.verbEntryLookup("کېناستل")[0]; +const kxenaastul = testDictionary.verbEntryLookup("کښېناستل")[0]; +const prexodul = testDictionary.verbEntryLookup("پرېښودل")[0]; +const prexowul = testDictionary.verbEntryLookup("پرېښوول")[0]; +const prexawul = testDictionary.verbEntryLookup("پرېښول")[0]; +const xodul = testDictionary.verbEntryLookup("ښودل")[0]; +const kexodul = testDictionary.verbEntryLookup("کېښودل")[0]; +const kxexodul = testDictionary.verbEntryLookup("کښېښودل")[0]; +const katul = testDictionary.verbEntryLookup("کتل")[0]; +const watul = testDictionary.verbEntryLookup("وتل")[0]; +const wurul = testDictionary.verbEntryLookup("وړل")[0]; +const akheestul = testDictionary.verbEntryLookup("اخیستل")[0]; +const alwatul = testDictionary.verbEntryLookup("الوتل")[0]; +const dartlul = testDictionary.verbEntryLookup("درتلل")[0]; + +// TODO: Prefix searching on split verbs for perfective head parsing // TODO: azmoyul etc // TODO: cleaner and more thorough handling of ا seperating verbs ee - wee etc @@ -311,19 +318,6 @@ const tests: { }, ], }, - { - input: "وینم", - output: [ - { - stem: { - persons: [T.Person.FirstSingMale, T.Person.FirstSingFemale], - aspects: ["imperfective", "perfective"], - }, - verb: leedul, - }, - ], - }, - // TODO!! THESE COULD ALSO BE MALE { input: "لیده", output: [ @@ -364,42 +358,6 @@ const tests: { }, ], }, - { - input: "خوړ", - output: [ - { - root: { - persons: [T.Person.ThirdSingMale], - aspects: ["imperfective", "perfective"], - }, - verb: khorul, - }, - ], - }, - { - input: "کوت", - output: [ - { - root: { - persons: [T.Person.ThirdSingMale], - aspects: ["imperfective", "perfective"], - }, - verb: katul, - }, - ], - }, - { - input: "کاته", - output: [ - { - root: { - persons: [T.Person.ThirdSingMale], - aspects: ["imperfective", "perfective"], - }, - verb: katul, - }, - ], - }, { input: "خلم", output: [ @@ -436,6 +394,11 @@ const tests: { }, ], }, + ], + }, + { + label: "verbs with seperating perfective heads", + cases: [ { input: "الوځې", output: [ @@ -460,6 +423,18 @@ const tests: { }, ], }, + { + input: "لوتلم", + output: [ + { + root: { + persons: [T.Person.FirstSingMale, T.Person.FirstSingFemale], + aspects: ["perfective"], + }, + verb: alwatul, + }, + ], + }, ], }, { @@ -492,6 +467,13 @@ const tests: { }, verb: kenaastul, }, + { + stem: { + persons: [T.Person.FirstSingMale, T.Person.FirstSingFemale], + aspects: ["perfective"], + }, + verb: kxenaastul, + }, ], }, { @@ -507,46 +489,64 @@ const tests: { ], }, { - input: "ناست", + input: "کېناسته", output: [ { root: { - persons: [T.Person.ThirdSingMale], - aspects: ["perfective"], + persons: [T.Person.ThirdSingMale, T.Person.ThirdSingFemale], + aspects: ["imperfective"], }, verb: kenaastul, }, ], }, { - input: "پرېږدو", - output: [ - { - stem: { - persons: [T.Person.FirstPlurMale, T.Person.FirstPlurFemale], - aspects: ["imperfective"], - }, - verb: prexodul, + input: "ناست", + output: [kenaastul, kxenaastul].map((verb) => ({ + root: { + persons: [T.Person.ThirdSingMale], + aspects: ["perfective"], }, - ], + verb, + })), + }, + { + input: "ناسته", + output: [kenaastul, kxenaastul].map((verb) => ({ + root: { + persons: [T.Person.ThirdSingMale, T.Person.ThirdSingFemale], + aspects: ["perfective"], + }, + verb, + })), + }, + { + input: "پرېږدو", + output: [prexodul, prexowul, prexawul].map((verb) => ({ + stem: { + persons: [T.Person.FirstPlurMale, T.Person.FirstPlurFemale], + aspects: ["imperfective"], + }, + verb, + })), }, { input: "ږدو", output: [ - { + ...[prexodul, prexawul, prexowul, kexodul, kxexodul].map((verb) => ({ stem: { persons: [T.Person.FirstPlurMale, T.Person.FirstPlurFemale], - aspects: ["perfective"], + aspects: ["perfective"] satisfies T.Aspect[], }, - verb: prexodul, - }, - { + verb, + })), + ...[kexodul, kxexodul].map((verb) => ({ stem: { persons: [T.Person.FirstPlurMale, T.Person.FirstPlurFemale], - aspects: ["imperfective", "perfective"], + aspects: ["imperfective"] satisfies T.Aspect[], }, - verb: kexodul, - }, + verb, + })), ], }, { @@ -571,20 +571,13 @@ const tests: { }, verb: xodul, }, - { + ...[prexodul, kexodul, kxexodul].map((verb) => ({ root: { persons: [T.Person.ThirdSingFemale], - aspects: ["perfective"], + aspects: ["perfective"] satisfies T.Aspect[], }, - verb: prexodul, - }, - { - root: { - persons: [T.Person.ThirdSingFemale], - aspects: ["perfective"], - }, - verb: kexodul, - }, + verb, + })), ], }, { @@ -661,43 +654,9 @@ const tests: { }, ], }, - { - input: "ړلم", - output: [ - { - root: { - persons: getPeople(1, "sing"), - aspects: ["perfective"], - }, - verb: wurul, - }, - { - root: { - persons: getPeople(1, "sing"), - aspects: ["perfective"], - }, - verb: tlul, - }, - ], - }, { input: "ړ", - output: [ - { - root: { - persons: [T.Person.ThirdSingMale], - aspects: ["perfective"], - }, - verb: wurul, - }, - { - root: { - persons: [T.Person.ThirdSingMale], - aspects: ["perfective"], - }, - verb: tlul, - }, - ], + output: [], }, // should not match with the prefix for perfective { @@ -713,6 +672,78 @@ const tests: { { label: "verbs with different 3rd pers sing past endings", cases: [ + { + input: "خوړ", + output: [ + { + root: { + persons: [T.Person.ThirdSingMale], + aspects: ["imperfective", "perfective"], + }, + verb: khorul, + }, + ], + }, + { + input: "خوړه", + output: [ + { + root: { + persons: [T.Person.ThirdSingMale, T.Person.ThirdSingFemale], + aspects: ["imperfective", "perfective"], + }, + verb: khorul, + }, + ], + }, + { + input: "کوت", + output: [ + { + root: { + persons: [T.Person.ThirdSingMale], + aspects: ["imperfective", "perfective"], + }, + verb: katul, + }, + ], + }, + { + input: "کاته", + output: [ + { + root: { + persons: [T.Person.ThirdSingMale], + aspects: ["imperfective", "perfective"], + }, + verb: katul, + }, + ], + }, + { + input: "واته", + output: [ + { + root: { + persons: [T.Person.ThirdSingMale], + aspects: ["imperfective", "perfective"], + }, + verb: watul, + }, + ], + }, + { + input: "ووت", + output: [ + { + root: { + persons: [T.Person.ThirdSingMale], + aspects: ["imperfective", "perfective"], + }, + verb: watul, + }, + ], + }, { input: "رسېد", output: [ @@ -725,6 +756,18 @@ const tests: { }, ], }, + { + input: "رسېده", + output: [ + { + root: { + persons: [T.Person.ThirdSingMale, T.Person.ThirdSingFemale], + aspects: ["imperfective", "perfective"], + }, + verb: rasedul, + }, + ], + }, { input: "کېناسته", output: [ @@ -766,27 +809,69 @@ const tests: { ], }, { - input: "واته", + input: "اوښت", output: [ { root: { persons: [T.Person.ThirdSingMale], - aspects: ["imperfective", "perfective"], + aspects: ["imperfective"], }, - verb: watul, + verb: awuxtul, }, ], }, { - input: "ووت", + input: "ښت", + output: [], + }, + { + input: "اوښته", + output: [ + { + root: { + persons: [T.Person.ThirdSingMale, T.Person.ThirdSingFemale], + aspects: ["imperfective"], + }, + verb: awuxtul, + }, + ], + }, + { + input: "ښود", output: [ { root: { persons: [T.Person.ThirdSingMale], aspects: ["imperfective", "perfective"], }, - verb: watul, + verb: xodul, }, + ...[prexodul, kexodul, kxexodul].map((verb) => ({ + root: { + persons: [T.Person.ThirdSingMale], + aspects: ["perfective"] satisfies T.Aspect[], + }, + verb, + })), + ], + }, + { + input: "ښوده", + output: [ + { + root: { + persons: [T.Person.ThirdSingMale, T.Person.ThirdSingFemale], + aspects: ["imperfective", "perfective"], + }, + verb: xodul, + }, + ...[prexodul, kexodul, kxexodul].map((verb) => ({ + root: { + persons: [T.Person.ThirdSingMale, T.Person.ThirdSingFemale], + aspects: ["perfective"] satisfies T.Aspect[], + }, + verb, + })), ], }, ], @@ -971,7 +1056,7 @@ tests.forEach(({ label, cases }) => { test(label, () => { cases.forEach(({ input, output }) => { const tokens = tokenizer(input); - const vbs = parseVBE(tokens, lookup).map((r) => r.body); + const vbs = parseVBE(tokens, testDictionary).map((r) => r.body); const madeVbsS = output.reduce((acc, o) => { return [ ...acc, diff --git a/src/lib/src/parsing/parse-vbe-new.ts b/src/lib/src/parsing/parse-vbe-new.ts new file mode 100644 index 0000000..8bae392 --- /dev/null +++ b/src/lib/src/parsing/parse-vbe-new.ts @@ -0,0 +1,385 @@ +import * as T from "../../../types"; +import { dartlul, raatlul, tlul, wartlul } from "./irreg-verbs"; +import { parseKedul } from "./parse-kedul"; +import { getVerbEnding } from "./parse-verb-helpers"; +import { returnParseResults } from "./utils"; +import { entries as splitVerbEntries } from "./split-verbs"; +import * as tp from "../type-predicates"; +import memoize from "micro-memoize"; +import { pashtoConsonants } from "../pashto-consonants"; + +// TODO: کول verbs! +// check that aawu stuff is working +// check oo`azmooy - +// TODO: proper use of sepOo (hasBreakawayAleph) when checking for perfective roots/stems +// check څاته +// laaRa shum etc +// TODO: proper use of perfective with sh +// TODO: use of raa, dar, war with sh +// TODO: هغه لاړ +// TODO: don't have کول کېدل in split-verbs + +type BaseInfo = Extract; +type StemInfo = Omit & { + base: "stem"; +}; +type RootInfo = Omit & { + base: "root"; +}; + +export function parseVBE( + tokens: Readonly, + dictionary: T.DictionaryAPI +): T.ParseResult[] { + if (tokens.length === 0) { + return []; + } + const [first, ...rest] = tokens; + const irregResults = parseIrregularVerb(first.s); + if (irregResults.length) { + return returnParseResults(rest, irregResults); + } + const kedulStat = parseKedul(tokens); + const ending = first.s.at(-1) || ""; + const base = ending === "ل" ? first.s : first.s.slice(0, -1); + const { stem, root } = getVerbEnding(ending); + // todo imperative for seperating + const imperative = getImperativeVerbEnding(ending); + const stemRes = returnParseResults(rest, [ + ...[ + ...findImperfectiveStem(base, dictionary), + ...findPerfectiveStem(base, dictionary), + ].flatMap((info) => [ + ...stem.map((person) => ({ + type: "VB", + person, + info, + })), + ...imperative.map((person) => ({ + type: "VB", + person, + info: { + ...info, + imperative: true, + }, + })), + ]), + ]); + const rootRes = returnParseResults(rest, [ + ...[ + ...findImperfectiveRoot(base, dictionary), + ...findPerfectiveRoot(base, dictionary), + ].flatMap((info) => { + const shortThird = thirdPersSingMascShortFromRoot(base, ending, info); + return [ + ...shortThird, + ...root.map((person) => ({ + type: "VB", + person, + info, + })), + ]; + }), + ...specialThirdPersMascSingForm(base, ending, dictionary), + ]); + return [...kedulStat, ...stemRes, ...rootRes]; +} + +function specialThirdPersMascSingForm( + base: string, + ending: string, + dicitonary: T.DictionaryAPI +): T.ParsedVBE[] { + if (ending !== "ه" && !pashtoConsonants.includes(ending)) { + return []; + } + // const imperfectiveWSep = [base + ending, ...(ending === "ه" ? [base] : [])] + // .flatMap((v) => + // splitVerbEntries.filter((entry) => entry.entry.p.slice(0, -1) === v) + // ) + // .map((verb) => ({ + // type: "VB", + // person: T.Person.ThirdSingMale, + // info: { + // type: "verb", + // aspect: "imperfective", + // base: "root", + // verb, + // }, + // })); + + // const perfectiveWSep = [base + ending, ...(ending === "ه" ? [base] : [])] + // .flatMap((v) => { + // const b = splitVerbEntries.filter(({ entry }) => { + // if (entry.tppp) { + // return splitVarients(entry.tppp).some( + // (varient) => varient.slice(entry.separationAtP) === v + // ); + // } else { + // return entry.p.slice(entry.separationAtP, -1) === v; + // } + // }); + // return b; + // }) + // .map((verb) => ({ + // type: "VB", + // person: T.Person.ThirdSingMale, + // info: { + // type: "verb", + // aspect: "perfective", + // base: "root", + // verb, + // }, + // })); + + const hardEnding: T.ParsedVBE[] = + (ending === "د" && ["ې", "و"].some((x) => base.endsWith(x))) || + (ending === "ت" && + ["س", "ښ"].some((x) => base.endsWith(x)) && + base.length > 1) + ? [ + ...findPerfectiveRoot(base + ending + "ل", dicitonary), + ...findImperfectiveRoot(base + ending + "ل", dicitonary), + ].map((info) => ({ + type: "VB", + person: T.Person.ThirdSingMale, + info, + })) + : []; + + const regular: T.ParsedVBE[] = [ + base + ending, + ...(ending === "ه" ? [base] : []), + ] + .flatMap(withAlefAdded) + .flatMap((v) => dicitonary.otherLookup("tppp", v, true)) + .filter( + (e): e is T.VerbDictionaryEntry => + tp.isVerbDictionaryEntry(e) && !e.l && !!e.tppp + ) + .flatMap((entry) => + // NOT IF STARTS WITH ALEPH! + (entry.separationAtP + ? (["imperfective"] as const) + : startsWithAleph(entry.p) && !startsWithAleph(base) + ? (["perfective"] as const) + : (["imperfective", "perfective"] as const) + ).map((aspect) => ({ + type: "VB" as const, + person: T.Person.ThirdSingMale, + info: { + type: "verb", + aspect, + base: "root", + verb: { entry }, + } as const, + })) + ); + + return [...regular, ...hardEnding]; + + // ...imperfectiveWSep, ...perfectiveWSep]; +} + +function thirdPersSingMascShortFromRoot( + base: string, + ending: string, + info: RootInfo +): T.ParsedVBE[] { + if (info.verb.entry.tppp) { + return []; + } + if (ending === "ه" && !base.endsWith("ل")) { + return [ + { + type: "VB", + person: T.Person.ThirdSingMale, + info, + }, + ]; + } + return []; +} + +function findImperfectiveStem( + s: string, + dicitonary: T.DictionaryAPI +): StemInfo[] { + if (["کېږ", "کېد", "ش", "شو", "شول"].includes(s)) { + return []; + } + const regulars = regStemSearch(s, dicitonary); + const irregulars = dicitonary + .otherLookup("psp", s) + .filter( + (e): e is T.VerbDictionaryEntry => tp.isVerbDictionaryEntry(e) && !e.l + ) + .map((entry) => ({ + entry, + })); + return [...regulars, ...irregulars].map((verb) => ({ + type: "verb", + aspect: "imperfective", + base: "stem", + verb, + })); +} + +function withAlefAdded(s: string): string[] { + return [s, ...(startsWithAleph(s) ? [] : ["ا" + s, "آ" + s])]; +} + +const stemSplitLookup = memoize((s: string) => + splitVerbEntries.filter( + (e) => + (e.entry.ssp || e.entry.psp || e.entry.p).slice( + e.entry.separationAtP || 0 + ) === s + ) +); + +function findPerfectiveStem( + s: string, + dicitonary: T.DictionaryAPI +): StemInfo[] { + if (["کېږ", "کېد", "ش", "شو", "شول"].includes(s)) { + return []; + } + if (startsWithAleph(s)) { + return []; + } + const baseQ = withAlefAdded(s); + const regulars = baseQ + .flatMap((q) => regStemSearch(q, dicitonary)) + .filter((e) => !e.entry.separationAtP); + const irregularsBasedOnImperf = baseQ + .flatMap((q) => dicitonary.otherLookup("psp", q)) + .filter( + (e): e is T.VerbDictionaryEntry => + tp.isVerbDictionaryEntry(e) && !e.l && !e.ssp && !e.separationAtP + ) + .map((entry) => ({ + entry, + })); + return [...regulars, ...irregularsBasedOnImperf, ...stemSplitLookup(s)].map( + (verb) => ({ + type: "verb", + aspect: "perfective", + base: "stem", + verb, + }) + ); +} + +function regStemSearch(s: string, dicitonary: T.DictionaryAPI): T.VerbEntry[] { + const regTrans = dicitonary + .verbEntryLookup(s + "ل") + .filter( + (e) => + !e.entry.c.includes("comp") && + !e.entry.ssp && + !e.entry.psp && + !e.entry.c.includes("intrans") + ); + const regIntrans = dicitonary + .verbEntryLookup((s.endsWith("ېږ") ? s.slice(0, -2) : s) + "ېدل") + .filter( + (e) => + !e.entry.c.includes("comp") && + !e.entry.ssp && + !e.entry.psp && + e.entry.c.includes("intrans") + ); + return [...regTrans, ...regIntrans]; +} + +function findImperfectiveRoot( + s: string, + dicitonary: T.DictionaryAPI +): RootInfo[] { + if (["کېږ", "کېد", "ش", "شو", "شول"].includes(s)) { + return []; + } + const reg = [s, s + "ل"] + .flatMap(dicitonary.verbEntryLookup) + .filter((e) => !e.entry.c.includes("comp")); + return reg.map((verb) => ({ + type: "verb", + aspect: "imperfective", + base: "root", + verb, + })); +} + +const rootSplitLookup = memoize((s: string) => + splitVerbEntries.filter((e) => + [s, s + "ل"].some( + (x) => (e.entry.prp || e.entry.p).slice(e.entry.separationAtP || 0) === x + ) + ) +); + +function findPerfectiveRoot( + s: string, + dicitonary: T.DictionaryAPI +): RootInfo[] { + if (startsWithAleph(s) || ["کېږ", "کېد", "ش", "شو", "شول"].includes(s)) { + return []; + } + const reg = [s, s + "ل"] + .flatMap(withAlefAdded) + .flatMap(dicitonary.verbEntryLookup) + .filter( + (e) => + !e.entry.c.includes("comp") && !e.entry.prp && !e.entry.separationAtP + ); + return [...reg, ...rootSplitLookup(s)].map((verb) => ({ + type: "verb", + aspect: "perfective", + base: "root", + verb, + })); +} + +function getImperativeVerbEnding(e: string): T.Person[] { + if (e === "ه") { + return [T.Person.SecondSingMale, T.Person.SecondSingFemale]; + } + if (e === "ئ") { + return [T.Person.SecondPlurMale, T.Person.SecondPlurFemale]; + } + return []; +} + +// TODO: could handle all sh- verbs for efficiencies sake +function parseIrregularVerb(s: string): T.ParsedVBE[] { + if (["ته", "راته", "ورته", "درته"].includes(s)) { + return [ + { + type: "VB", + info: { + aspect: "imperfective", + base: "root", + type: "verb", + verb: s.startsWith("را") + ? raatlul + : s.startsWith("ور") + ? wartlul + : s.startsWith("در") + ? dartlul + : tlul, + }, + person: T.Person.ThirdSingMale, + }, + ]; + } + return []; +} + +// function hasBreakawayAlef(e: T.VerbDictionaryEntry): boolean { +// return !e.sepOo && startsWithAleph(e.p); +// } + +function startsWithAleph(base: string): boolean { + return ["ا", "آ"].includes(base[0]); +} diff --git a/src/lib/src/parsing/parse-vbe.ts b/src/lib/src/parsing/parse-vbe.ts deleted file mode 100644 index 0c48b52..0000000 --- a/src/lib/src/parsing/parse-vbe.ts +++ /dev/null @@ -1,354 +0,0 @@ -import * as T from "../../../types"; -import { removeFVarientsFromVerb } from "../accent-and-ps-utils"; -import { isInVarients, lastVowelNotA } from "../p-text-helpers"; -import { dartlul, raatlul, tlul, wartlul } from "./irreg-verbs"; -import { LookupFunction } from "./lookup"; -import { shortVerbEndConsonant } from "./misc"; -import { parseKedul } from "./parse-kedul"; -import { getVerbEnding } from "./parse-verb-helpers"; - -// TODO: کول verbs! -// check that aawu stuff is working -// check oo`azmooy - -// check څاته -// laaRa shum etc -// TODO: proper use of perfective with sh -// TODO: use of raa, dar, war with sh -// TODO: هغه لاړ - -export function parseVBE( - tokens: Readonly, - lookup: LookupFunction -): T.ParseResult[] { - if (tokens.length === 0) { - return []; - } - const [first, ...rest] = tokens; - const irregResults = parseIrregularVerb(first.s); - if (irregResults.length) { - return irregResults.map((body) => ({ - tokens: rest, - body, - errors: [], - })); - } - const kedulStat = parseKedul(tokens); - const ending = first.s.at(-1) || ""; - const people = getVerbEnding(ending); - const imperativePeople = getImperativeVerbEnding(ending); - // First do rough verb lookup, grab wide pool of possible verbs (low searching complexity for fast lookup) - // TODO: can optimize this to not have to look for possible stems/roots if none - const verbs = lookup(first.s, "verb"); - // if (first.s === "سم") { - // console.log({ verbs: JSON.stringify(verbs) }); - // } - // Then find out which ones match exactly and how - return [ - ...kedulStat, - ...matchVerbs(first.s, verbs, people, imperativePeople).map((body) => ({ - tokens: rest, - body, - errors: [], - })), - ]; -} - -function matchVerbs( - s: string, - entries: T.VerbEntry[], - people: { - root: T.Person[]; - stem: T.Person[]; - }, - imperativePeople: T.Person[] -): T.ParsedVBE[] { - const w: T.ParsedVBE[] = []; - const lEnding = s.endsWith("ل"); - const base = s.endsWith("ل") ? s : s.slice(0, -1); - if (["کېږ", "کېد", "ش", "شو", "شول"].includes(base)) { - return []; - } - const matchShortOrLong = (b: string, x: string) => { - return b === x || (!lEnding && b === x.slice(0, -1)); - }; - if (people.stem.length || imperativePeople.length) { - const stemMatches = { - imperfective: entries.filter(({ entry: e }) => { - if (e.c.includes("comp")) { - return false; - } - if (e.psp) { - return e.psp === base; - } - if (e.c.includes("intrans.")) { - const miniRoot = e.p !== "کېدل" && e.p.slice(0, -3); - return miniRoot + "ېږ" === base || miniRoot === base; - } else { - return e.p.slice(0, -1) === base; - } - }), - perfective: entries.reduce((acc, entry) => { - const e = entry.entry; - const baseWAa = "ا" + base; - if (e.c.includes("comp")) { - return acc; - } - if (e.ssp) { - if (e.separationAtP) { - const bRest = e.ssp.slice(e.separationAtP); - if (bRest === base) { - return [...acc, entry]; - } - } else { - if (e.ssp === base) { - return [...acc, entry]; - } - } - } else if (e.psp) { - if (hasBreakawayAlef(e) && startsWithAleph(base)) { - return acc; - } - if (e.separationAtP) { - const bRest = e.psp.slice(e.separationAtP); - if (bRest === base) { - return [...acc, entry]; - } - } else { - if (!e.sepOo) { - if (baseWAa === e.psp) { - return [...acc, entry]; - } - } - if (base === e.psp) { - return [...acc, entry]; - } - } - } else if (hasBreakawayAlef(e) && startsWithAleph(base)) { - return acc; - } else if (e.c.includes("intrans.")) { - const miniRoot = e.p !== "کېدل" && e.p.slice(0, -3); - const miniRootEg = miniRoot + "ېږ"; - if ([miniRoot, miniRootEg].includes(base)) { - return [...acc, entry]; - } - } else { - const eb = e.p.slice(0, -1); - if (eb === base) { - return [...acc, entry]; - } else if (!e.sepOo) { - if (baseWAa === base.slice(1)) { - return [...acc, entry]; - } - } - } - return acc; - }, []), - }; - Object.entries(stemMatches).forEach(([aspect, entries]) => { - entries.forEach((verb) => { - people.stem.forEach((person) => { - w.push({ - type: "VB", - person, - info: { - type: "verb", - aspect: aspect as T.Aspect, - base: "stem", - verb: removeFVarientsFromVerb(verb), - }, - }); - }); - imperativePeople.forEach((person) => { - w.push({ - type: "VB", - person, - info: { - type: "verb", - aspect: aspect as T.Aspect, - base: "stem", - verb: removeFVarientsFromVerb(verb), - imperative: true, - }, - }); - }); - }); - }); - } - if (people.root.length) { - const rootMatches = { - imperfective: entries.filter( - ({ entry: e }) => !e.c.includes("comp") && matchShortOrLong(base, e.p) - ), - perfective: entries.reduce((acc, entry) => { - const e = entry.entry; - if (e.c.includes("comp")) { - return acc; - } - if (e.separationAtP) { - const b = e.prp || e.p; - const bRest = b.slice(e.separationAtP); - if (matchShortOrLong(base, bRest)) { - return [...acc, entry]; - } - } else if (hasBreakawayAlef(e) && startsWithAleph(base) && !e.prp) { - return acc; - } else { - const p = e.prp || e.p; - if (matchShortOrLong(base, p) || matchShortOrLong("ا" + base, p)) { - return [...acc, entry]; - } - } - return acc; - }, []), - }; - - Object.entries(rootMatches).forEach(([aspect, entries]) => { - entries.forEach((verb) => { - people.root.forEach((person) => { - w.push({ - type: "VB", - person, - info: { - type: "verb", - aspect: aspect as T.Aspect, - base: "root", - verb: removeFVarientsFromVerb(verb), - }, - }); - }); - }); - }); - } - const hamzaEnd = s.at(-1) === "ه"; - const oEnd = s.at(-1) === "و"; - const abruptEnd = shortVerbEndConsonant.includes(s.slice(-1)); - const tppMatches = { - imperfective: entries.reduce((acc, entry) => { - const e = entry.entry; - if (e.c.includes("comp")) { - return acc; - } - if (!e.prp && isInVarients(e.tppp, s)) { - return [...acc, entry]; - } - if (oEnd && matchShortOrLong(base, e.p)) { - return [...acc, entry]; - } - if ( - lastVowelNotA(e.g.slice(0, -2)) && - (hamzaEnd ? base : abruptEnd ? s : "XX") === e.p.slice(0, -1) - ) { - return [...acc, entry]; - } - // TODO: if check for modified aaXu thing! - return acc; - }, []), - perfective: entries.reduce((acc, entry) => { - const e = entry.entry; - if (e.c.includes("comp")) { - return acc; - } - if (e.separationAtP) { - const b = e.prp || e.p; - const bRest = b.slice(e.separationAtP); - if (bRest === "شول") { - return acc; - } - if (abruptEnd) { - if (s === bRest.slice(0, -1)) { - return [...acc, entry]; - } - } else if (hamzaEnd) { - if (base === bRest.slice(0, -1)) { - return [...acc, entry]; - } - } else if (oEnd) { - if ([bRest, bRest.slice(0, -1)].includes(base)) { - return [...acc, entry]; - } - } - } else if (!e.prp) { - if (hasBreakawayAlef(e) && startsWithAleph(base)) { - return acc; - } - if (oEnd) { - if ([e.p, e.p.slice(0, -1)].includes(base)) { - return [...acc, entry]; - } - } else if ((hamzaEnd || abruptEnd) && lastVowelNotA(e.g.slice(0, -2))) { - const b = hamzaEnd ? base : s; - const p = e.p.slice(0, -1); - if (b === p) { - return [...acc, entry]; - } - } - } - if (!e.separationAtP) { - if (isInVarients(e.tppp, s)) { - return [...acc, entry]; - } else if (isInVarients(e.tppp, "ا" + s)) { - return [...acc, entry]; - } - } - return acc; - }, []), - }; - Object.entries(tppMatches).forEach(([aspect, entries]) => { - entries.forEach((verb) => { - w.push({ - type: "VB", - person: T.Person.ThirdSingMale, - info: { - type: "verb", - aspect: aspect as T.Aspect, - base: "root", - verb: removeFVarientsFromVerb(verb), - }, - }); - }); - }); - return w; -} - -function getImperativeVerbEnding(e: string): T.Person[] { - if (e === "ه") { - return [T.Person.SecondSingMale, T.Person.SecondSingFemale]; - } - if (e === "ئ") { - return [T.Person.SecondPlurMale, T.Person.SecondPlurFemale]; - } - return []; -} - -// TODO: could handle all sh- verbs for efficiencies sake -function parseIrregularVerb(s: string): T.ParsedVBE[] { - if (["ته", "راته", "ورته", "درته"].includes(s)) { - return [ - { - type: "VB", - info: { - aspect: "imperfective", - base: "root", - type: "verb", - verb: s.startsWith("را") - ? raatlul - : s.startsWith("ور") - ? wartlul - : s.startsWith("در") - ? dartlul - : tlul, - }, - person: T.Person.ThirdSingMale, - }, - ]; - } - return []; -} - -function hasBreakawayAlef(e: T.VerbDictionaryEntry): boolean { - return !e.sepOo && ["ا", "آ"].includes(e.p[0]); -} - -function startsWithAleph(base: string): boolean { - return ["ا", "آ"].includes(base[0]); -} diff --git a/src/lib/src/parsing/parse-vbp.ts b/src/lib/src/parsing/parse-vbp.ts index 0b75fab..cec1eaa 100644 --- a/src/lib/src/parsing/parse-vbp.ts +++ b/src/lib/src/parsing/parse-vbp.ts @@ -1,46 +1,46 @@ import * as T from "../../../types"; -import { LookupFunction } from "./lookup"; -import { returnParseResult } from "./utils"; +// import { returnParseResult } from "./utils"; export function parseVBP( tokens: Readonly, - lookup: LookupFunction + dictionary: T.DictionaryAPI ): T.ParseResult[] { if (tokens.length === 0) { return []; } - return [ - ...parsePastPart(tokens, lookup), - // ...parseAbility(tokens), - ]; + return []; + // return [ + // ...parsePastPart(tokens, lookup), + // // ...parseAbility(tokens), + // ]; } -function parsePastPart( - tokens: Readonly, - lookup: LookupFunction -): T.ParseResult[] { - const [{ s }, ...rest] = tokens; - const ending: "ی" | "ي" | "ې" = s.at(-1) as "ی" | "ي" | "ې"; - if (!ending || !["ی", "ي", "ې"].includes(ending)) { - return []; - } - // TODO: ALSO HANDLE SHORT FORMS - const wOutEnd = s.slice(0, -1); - const matches = lookup(wOutEnd, "pPart"); - const genNums = endingGenderNum(ending); - return matches - .flatMap((verb) => - genNums.map((genNum) => ({ - type: "VB", - info: { - type: "ppart", - verb, - genNum, - }, - })) - ) - .flatMap((m) => returnParseResult(rest, m)); -} +// function parsePastPart( +// tokens: Readonly, +// dicitonary: T.DictionaryAPI, +// ): T.ParseResult[] { +// const [{ s }, ...rest] = tokens; +// const ending: "ی" | "ي" | "ې" = s.at(-1) as "ی" | "ي" | "ې"; +// if (!ending || !["ی", "ي", "ې"].includes(ending)) { +// return []; +// } +// // TODO: ALSO HANDLE SHORT FORMS +// const wOutEnd = s.slice(0, -1); +// const matches = lookup(wOutEnd, "pPart"); +// const genNums = endingGenderNum(ending); +// return matches +// .flatMap((verb) => +// genNums.map((genNum) => ({ +// type: "VB", +// info: { +// type: "ppart", +// verb, +// genNum, +// }, +// })) +// ) +// .flatMap((m) => returnParseResult(rest, m)); +// } // function parseAbility( // tokens: Readonly, @@ -70,33 +70,33 @@ function parsePastPart( // .flatMap((m) => returnParseResult(rest, m)); // } -function endingGenderNum(ending: "ی" | "ي" | "ې"): T.GenderNumber[] { - if (ending === "ی") { - return [ - { - gender: "masc", - number: "singular", - }, - ]; - } - if (ending === "ي") { - return [ - { - gender: "masc", - number: "plural", - }, - ]; - } - // if (ending === "ې") { - return [ - { - gender: "fem", - number: "singular", - }, - { - gender: "fem", - number: "plural", - }, - ]; - // } -} +// function endingGenderNum(ending: "ی" | "ي" | "ې"): T.GenderNumber[] { +// if (ending === "ی") { +// return [ +// { +// gender: "masc", +// number: "singular", +// }, +// ]; +// } +// if (ending === "ي") { +// return [ +// { +// gender: "masc", +// number: "plural", +// }, +// ]; +// } +// // if (ending === "ې") { +// return [ +// { +// gender: "fem", +// number: "singular", +// }, +// { +// gender: "fem", +// number: "plural", +// }, +// ]; +// // } +// } diff --git a/src/lib/src/parsing/parse-verb-helpers.ts b/src/lib/src/parsing/parse-verb-helpers.ts index dfb9160..7a4b2a2 100644 --- a/src/lib/src/parsing/parse-verb-helpers.ts +++ b/src/lib/src/parsing/parse-verb-helpers.ts @@ -4,6 +4,10 @@ export function isKedulStatEntry(v: T.VerbDictionaryEntry): boolean { return v.p === "کېدل" && v.e === "to become _____"; } +/** + * gets the possible people for stem and root endings + * but DOES NOT INCLUDE short third pers masc sing + */ export function getVerbEnding(e: string): { stem: T.Person[]; root: T.Person[]; @@ -34,7 +38,11 @@ export function getVerbEnding(e: string): { }; } else if (e === "و") { return { - root: [T.Person.FirstPlurMale, T.Person.FirstPlurFemale], + root: [ + T.Person.FirstPlurMale, + T.Person.FirstPlurFemale, + T.Person.ThirdSingMale, + ], stem: [T.Person.FirstPlurMale, T.Person.FirstPlurFemale], }; } else if (e === "ئ") { diff --git a/src/lib/src/parsing/parse-vp.ts b/src/lib/src/parsing/parse-vp.ts index f03f982..9ad9c56 100644 --- a/src/lib/src/parsing/parse-vp.ts +++ b/src/lib/src/parsing/parse-vp.ts @@ -24,7 +24,6 @@ import { import { parseBlocks } from "./parse-blocks"; import { makePronounSelection } from "../phrase-building/make-selections"; import { isFirstOrSecondPersPronoun } from "../phrase-building/render-vp"; -import { LookupFunction } from "./lookup"; import { isSecondPerson, personToGenNum } from "../misc-helpers"; import { equals, zip } from "rambda"; import { isImperativeTense } from "../type-predicates"; @@ -41,12 +40,12 @@ import { isImperativeTense } from "../type-predicates"; export function parseVP( tokens: Readonly, - lookup: LookupFunction + dictionary: T.DictionaryAPI ): T.ParseResult[] { if (tokens.length === 0) { return []; } - const blocks = parseBlocks(tokens, lookup, [], []); + const blocks = parseBlocks(tokens, dictionary, [], []); return bindParseResult( createPossesivePossibilities(blocks), (tokens, { blocks, kids }) => { @@ -892,7 +891,7 @@ function getMiniPronouns(kids: T.ParsedKid[]): T.ParsedMiniPronoun[] { function getPeopleFromMiniPronouns(kids: T.ParsedKid[]): T.Person[] { const p: T.Person[] = []; - for (let k of kids) { + for (const k of kids) { if (k === "me") { p.push(T.Person.FirstSingMale); p.push(T.Person.FirstSingFemale); diff --git a/src/lib/src/parsing/utils.ts b/src/lib/src/parsing/utils.ts index a4f470a..1600f94 100644 --- a/src/lib/src/parsing/utils.ts +++ b/src/lib/src/parsing/utils.ts @@ -163,6 +163,38 @@ export function parserCombMany(parser: Parser): Parser { return r; } +export function parserCombSucc2( + parsers: [Parser, Parser] +): Parser<[A, B]> { + return function ( + tokens: Readonly, + dictionary: T.DictionaryAPI + ): T.ParseResult<[A, B]>[] { + return bindParseResult(parsers[0](tokens, dictionary), (t, a) => + bindParseResult(parsers[1](t, dictionary), (tk, b) => + returnParseResult(tk, [a, b]) + ) + ); + }; +} + +export function parserCombSucc3( + parsers: [Parser, Parser, Parser] +): Parser<[A, B, C]> { + return function ( + tokens: Readonly, + dictionary: T.DictionaryAPI + ): T.ParseResult<[A, B, C]>[] { + return bindParseResult(parsers[0](tokens, dictionary), (t, a) => + bindParseResult(parsers[1](t, dictionary), (tk, b) => + bindParseResult(parsers[2](tk, dictionary), (tkn, c) => + returnParseResult(tkn, [a, b, c]) + ) + ) + ); + }; +} + export function isCompleteResult( r: T.ParseResult ): boolean { diff --git a/src/lib/src/phrase-building/np-tools.ts b/src/lib/src/phrase-building/np-tools.ts index bf58373..803e412 100644 --- a/src/lib/src/phrase-building/np-tools.ts +++ b/src/lib/src/phrase-building/np-tools.ts @@ -214,7 +214,7 @@ function addArticlesAndAdjs( ? np.determiners.determiners // @ts-ignore - weird, ts is not recognizing this as rendered .map((x) => (moreThanOneDet ? `(${x.e})` : x.e)) - .join(" ") + .join(" ") + " " : ""; const detsWithoutNoun = np.determiners && !np.determiners.withNoun; return `${np.determiners ? "" : articles}${determiners}${ diff --git a/src/lib/src/phrase-building/remove-redundant.ts b/src/lib/src/phrase-building/remove-redundant.ts new file mode 100644 index 0000000..c240e83 --- /dev/null +++ b/src/lib/src/phrase-building/remove-redundant.ts @@ -0,0 +1,58 @@ +import * as T from "../../../types"; +import { compileVP } from "./compile"; +import { renderVP } from "./render-vp"; + +export function removeRedundantVPSs( + vs: T.VPSelectionComplete[] +): T.VPSelectionComplete[] { + const versions = vs.map((x) => compileVP(renderVP(x), x.form)); + const toRemove = new Set(); + versions.forEach((a, i) => { + const duplicates = findAllIndices( + versions.slice(i + 1), + (b) => !toRemove.has(i) && isDuplicate(a, b) + ); + duplicates.forEach((d) => toRemove.add(d + i + 1)); + }); + return vs.reduce((acc, v, i) => { + if (toRemove.has(i)) { + return acc; + } + return [...acc, v]; + }, []); +} + +function isDuplicate( + a: { + ps: T.SingleOrLengthOpts; + e?: string[]; + }, + b: { ps: T.SingleOrLengthOpts; e?: string[] } +): boolean { + if (!a.e || !b.e) { + return false; + } + if (a.e.length !== b.e.length) { + return false; + } + return a.e.every( + (x, i) => + removeGenderGloss(x) === removeGenderGloss(b.e ? b.e[i] : "") && + JSON.stringify(a.ps) === JSON.stringify(b.ps) + ); +} + +function removeGenderGloss(s: string): string { + // TODO: combine into one RegEx + return s.replaceAll(/\((m|f)\.\)/g, "").replaceAll(/\((m|f)\. pl\.\)/g, ""); +} + +function findAllIndices(arr: N[], f: (x: N) => boolean): number[] { + const indices: number[] = []; + arr.forEach((x, i) => { + if (f(x)) { + indices.push(i); + } + }); + return indices; +} diff --git a/src/types.ts b/src/types.ts index fc16856..0d39da3 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1259,8 +1259,13 @@ export type DictionaryAPI = { queryP: (p: string) => DictionaryEntry[]; adjLookup: (p: string) => AdjectiveEntry[]; nounLookup: (p: string) => NounEntry[]; - otherLookup: (key: keyof DictionaryEntry, p: string) => DictionaryEntry[]; + otherLookup: ( + key: keyof DictionaryEntry, + p: string, + regex?: boolean + ) => DictionaryEntry[]; specialPluralLookup: (p: string) => NounEntry[]; + verbEntryLookup: (p: string) => VerbEntry[]; }; export type Parser = ( diff --git a/tsconfig.node.json b/tsconfig.node.json index c6f14df..b859a3c 100644 --- a/tsconfig.node.json +++ b/tsconfig.node.json @@ -18,5 +18,5 @@ "noUnusedParameters": true, "noFallthroughCasesInSwitch": true }, - "include": ["vite.config.ts", "get-mini-dict.ts"] + "include": ["vite.config.ts", "get-mini-dict-and-split-verbs.ts"] } diff --git a/vocab/mini-dict-tss.ts b/vocab/mini-dict-tss.ts index b9a75c1..c7530b3 100644 --- a/vocab/mini-dict-tss.ts +++ b/vocab/mini-dict-tss.ts @@ -34,6 +34,7 @@ export const entries: T.DictionaryEntry["ts"][] = [ 1527812908, // مېلمه 1575924767041, // شپون 1527815333, // نتور + 1527812881, // ماشوم // fem nouns 1527811877, // دوستي @@ -50,4 +51,28 @@ export const entries: T.DictionaryEntry["ts"][] = [ 1589023873660, // فتح - fatha 1527814342, // نفع - nafa 1527815329, // تجربه + + // verbs + 1527815399, // وهل + 1527817298, // اخیستل + 1527812275, // لیدل + 1527812856, // لیکل + 1527815085, // منل + 1527817661, // ګالل + 1527813573, // رسېدل + 1527812790, // خوړل + 1527812759, // کېناستل + 1527812758, // کښېناستل + 1527815190, // پرېښودل + 1527811293, // ښودل + 1527812284, // کېښودل + 1527812751, // کتل + 1527823376, // وتل + 1527816865, // وړل + 1527813473, // الوتل + 1585228551150, // درتلل + 1527817577, // کښېښودل + 1527814012, // اوښتل + 1577390597820, // پرېښوول + 1527815191, // پرېښول ];