with hunspell

This commit is contained in:
lingdocs 2022-02-07 14:32:31 +04:00
parent e7525cf8c8
commit c0329417e3
2 changed files with 50 additions and 80 deletions

View File

@ -11,9 +11,9 @@ import {
simplifyPhonetics, simplifyPhonetics,
standardizeEntry, standardizeEntry,
} from "@lingdocs/pashto-inflector"; } from "@lingdocs/pashto-inflector";
// import { import {
// getWordList, getWordList,
// } from "./word-list-maker"; } from "./word-list-maker";
import { import {
PublishDictionaryResponse, PublishDictionaryResponse,
} from "../../website/src/types/functions-types"; } from "../../website/src/types/functions-types";
@ -28,8 +28,8 @@ const bucketName = "lingdocs";
const baseUrl = `https://storage.googleapis.com/${bucketName}/`; const baseUrl = `https://storage.googleapis.com/${bucketName}/`;
const dictionaryFilename = "dictionary"; const dictionaryFilename = "dictionary";
const dictionaryInfoFilename = "dictionary-info"; const dictionaryInfoFilename = "dictionary-info";
// const hunspellAffFileFilename = "ps_AFF.aff"; const hunspellAffFileFilename = "ps_AFF.aff";
// const hunspellDicFileFilename = "ps_AFF.dic"; const hunspellDicFileFilename = "ps_AFF.dic";
const url = `${baseUrl}${dictionaryFilename}`; const url = `${baseUrl}${dictionaryFilename}`;
const infoUrl = `${baseUrl}${dictionaryInfoFilename}`; const infoUrl = `${baseUrl}${dictionaryInfoFilename}`;
@ -69,22 +69,21 @@ export default async function publish(): Promise<PublishDictionaryResponse> {
} }
await uploadDictionaryToStorage(dictionary); await uploadDictionaryToStorage(dictionary);
// TODO: make this async and run after publish response // TODO: make this async and run after publish response
// doHunspell(entries).catch(console.error); await doHunspell(entries);
return { return {
ok: true, ok: true,
info: dictionary.info info: dictionary.info
}; };
} }
// async function doHunspell(entries: T.DictionaryEntry[]) { async function doHunspell(entries: T.DictionaryEntry[]) {
// const wordlistResponse = getWordList(entries); const wordlistResponse = getWordList(entries);
// if (!wordlistResponse.ok) { if (!wordlistResponse.ok) {
// throw new Error(JSON.stringify(wordlistResponse.errors)); throw new Error(JSON.stringify(wordlistResponse.errors));
// } }
// const hunspell = makeHunspell(wordlistResponse.wordlist); const hunspell = makeHunspell(wordlistResponse.wordlist);
// await uploadHunspellToStorage(hunspell); await uploadHunspellToStorage(hunspell);
// } }
async function getRawEntries(): Promise<T.DictionaryEntry[]> { async function getRawEntries(): Promise<T.DictionaryEntry[]> {
const doc = new GoogleSpreadsheet( const doc = new GoogleSpreadsheet(
@ -197,15 +196,15 @@ async function upload(content: Buffer | string, filename: string) {
}); });
} }
// async function uploadHunspellToStorage(wordlist: { async function uploadHunspellToStorage(wordlist: {
// affContent: string, affContent: string,
// dicContent: string, dicContent: string,
// }) { }) {
// await Promise.all([ await Promise.all([
// upload(wordlist.affContent, hunspellAffFileFilename), upload(wordlist.affContent, hunspellAffFileFilename),
// upload(wordlist.dicContent, hunspellDicFileFilename), upload(wordlist.dicContent, hunspellDicFileFilename),
// ]); ]);
// } }
async function uploadDictionaryToStorage(dictionary: T.Dictionary) { async function uploadDictionaryToStorage(dictionary: T.Dictionary) {
const dictionaryBuffer = writeDictionary(dictionary); const dictionaryBuffer = writeDictionary(dictionary);
@ -218,9 +217,9 @@ async function uploadDictionaryToStorage(dictionary: T.Dictionary) {
]); ]);
} }
// function makeHunspell(wordlist: string[]) { function makeHunspell(wordlist: string[]) {
// return { return {
// dicContent: wordlist.reduce((acc, word) => acc + word + "\n", wordlist.length + "\n"), dicContent: wordlist.reduce((acc, word) => acc + word + "\n", wordlist.length + "\n"),
// affContent: "SET UTF-8\nCOMPLEXPREFIXES\nIGNORE ۱۲۳۴۵۶۷۸۹۰-=ًٌٍَُِّْ؛:؟.،,،؟\n", affContent: "SET UTF-8\nCOMPLEXPREFIXES\nIGNORE ۱۲۳۴۵۶۷۸۹۰-=ًٌٍَُِّْ؛:؟.،,،؟\n",
// }; };
// } }

View File

@ -2,8 +2,6 @@ import {
inflectWord, inflectWord,
conjugateVerb, conjugateVerb,
Types as T, Types as T,
pashtoConsonants,
isNounAdjOrVerb,
} from "@lingdocs/pashto-inflector"; } from "@lingdocs/pashto-inflector";
@ -41,49 +39,22 @@ export function getWordList(entries: T.DictionaryEntry[]): {
const allInflections: Set<string> = new Set(); const allInflections: Set<string> = new Set();
const errors: T.DictionaryEntryError[] = []; const errors: T.DictionaryEntryError[] = [];
function getNounAdjInflections(entry: T.DictionaryEntry) { function getNounAdjInflections(entry: T.DictionaryEntry) {
if (entry.app) allInflections.add(entry.app); const infs = inflectWord(entry);
if (entry.ppp) allInflections.add(entry.ppp); if (infs) {
search("p", infs).forEach(w => allInflections.add(w));
const inflections = inflectWord(entry); }
const wordsFromInf = inflections
? search("p", inflections)
: [];
wordsFromInf.forEach(w => allInflections.add(w));
} }
function getVerbConjugations(word: T.DictionaryEntry, linked?: T.DictionaryEntry) { function getVerbConjugations(word: T.DictionaryEntry, linked?: T.DictionaryEntry) {
const pWords = search("p", conjugateVerb(word, linked)); search("p", conjugateVerb(word, linked)).forEach(w => allInflections.add(w));
pWords.forEach(w => allInflections.add(w));
} }
// got the entries, make a wordList of all the possible inflections // got the entries, make a wordList of all the possible inflections
entries.forEach((entry) => { entries.forEach((entry) => {
try { try {
if (entry.c && isNounAdjOrVerb(entry) === "nounAdj") { if (entry.c?.startsWith("v. ")) {
// it's a noun/adjective - get all inflections and plurals etc. const linked = entry.l ? entries.find((e) => e.ts === entry.l) : undefined;
getNounAdjInflections(entry); getVerbConjugations(entry, linked);
// hack to add some plurals and mayonnaise
if (entry.c.includes("n. m.") && pashtoConsonants.includes(entry.p.slice(-1))) {
allInflections.add(entry.p + "ونه")
allInflections.add(entry.p + "ونو")
allInflections.add(entry.p + "ه");
}
if (entry.c.includes("n. f.") && entry.p.slice(-1) === "ا") {
allInflections.add(entry.p + "ګانې")
allInflections.add(entry.p + "ګانو");
}
} else if (entry.c && isNounAdjOrVerb(entry) === "verb") {
// it's a verb - get all the conjugations for it
if (entry.l && entry.c.includes("comp.")) {
// it's a compound verb, conjugate it with the linked complement
const linkedEntry = entries.find((e) => e.ts === entry.l);
getVerbConjugations(entry, linkedEntry);
} else {
// it's a non-compound verb, conjugate it
getVerbConjugations(entry);
}
} else {
// it's something else, just put the word(s) in
entry.p.split(" ").forEach(w => allInflections.add(w));
} }
getNounAdjInflections(entry);
} catch (error) { } catch (error) {
errors.push({ errors.push({
ts: entry.ts, ts: entry.ts,
@ -91,7 +62,7 @@ export function getWordList(entries: T.DictionaryEntry[]): {
f: entry.f, f: entry.f,
e: entry.e, e: entry.e,
erroneousFields: [], erroneousFields: [],
errors: ["error inflecting/conjugating entry", error.toString()], errors: ["error inflecting/conjugating entry"],
}); });
} }
}); });
@ -103,16 +74,16 @@ export function getWordList(entries: T.DictionaryEntry[]): {
} }
// add ی version of words with ې (to accommodate some bad spelling) // add ی version of words with ې (to accommodate some bad spelling)
allInflections.forEach((word: string) => { // allInflections.forEach((word: string) => {
// for words with ې in the middle, also have a version with ی in the middle instead // // for words with ې in the middle, also have a version with ی in the middle instead
if (eInMiddleRegex.test(word)) { // // if (eInMiddleRegex.test(word)) {
allInflections.add(word.replace(eInMiddleRegex, "ی")); // // allInflections.add(word.replace(eInMiddleRegex, "ی"));
} // // }
// for words ending in ې, also have a version ending in ي // // for words ending in ې, also have a version ending in ي
if (word.slice(-1) === "ې") { // // if (word.slice(-1) === "ې") {
allInflections.add(word.slice(0, -1) + "ي"); // // allInflections.add(word.slice(0, -1) + "ي");
} // // }
}); // });
const wordlist = Array.from(allInflections).filter((s) => !(s.includes(".") || s.includes("?"))); const wordlist = Array.from(allInflections).filter((s) => !(s.includes(".") || s.includes("?")));
wordlist.sort((a, b) => a.localeCompare(b, "ps")); wordlist.sort((a, b) => a.localeCompare(b, "ps"));
return { return {
@ -121,4 +92,4 @@ export function getWordList(entries: T.DictionaryEntry[]): {
}; };
} }
const eInMiddleRegex = new RegExp("ې(?=[\u0621-\u065f\u0670-\u06d3\u06d5])", "g"); // const eInMiddleRegex = new RegExp("ې(?=[\u0621-\u065f\u0670-\u06d3\u06d5])", "g");