with hunspell

This commit is contained in:
lingdocs 2022-02-07 14:32:31 +04:00
parent e7525cf8c8
commit c0329417e3
2 changed files with 50 additions and 80 deletions

View File

@ -11,9 +11,9 @@ import {
simplifyPhonetics,
standardizeEntry,
} from "@lingdocs/pashto-inflector";
// import {
// getWordList,
// } from "./word-list-maker";
import {
getWordList,
} from "./word-list-maker";
import {
PublishDictionaryResponse,
} from "../../website/src/types/functions-types";
@ -28,8 +28,8 @@ const bucketName = "lingdocs";
const baseUrl = `https://storage.googleapis.com/${bucketName}/`;
const dictionaryFilename = "dictionary";
const dictionaryInfoFilename = "dictionary-info";
// const hunspellAffFileFilename = "ps_AFF.aff";
// const hunspellDicFileFilename = "ps_AFF.dic";
const hunspellAffFileFilename = "ps_AFF.aff";
const hunspellDicFileFilename = "ps_AFF.dic";
const url = `${baseUrl}${dictionaryFilename}`;
const infoUrl = `${baseUrl}${dictionaryInfoFilename}`;
@ -69,22 +69,21 @@ export default async function publish(): Promise<PublishDictionaryResponse> {
}
await uploadDictionaryToStorage(dictionary);
// TODO: make this async and run after publish response
// doHunspell(entries).catch(console.error);
await doHunspell(entries);
return {
ok: true,
info: dictionary.info
};
}
// async function doHunspell(entries: T.DictionaryEntry[]) {
// const wordlistResponse = getWordList(entries);
// if (!wordlistResponse.ok) {
// throw new Error(JSON.stringify(wordlistResponse.errors));
// }
// const hunspell = makeHunspell(wordlistResponse.wordlist);
// await uploadHunspellToStorage(hunspell);
// }
/**
 * Generates the hunspell files for the given dictionary entries and uploads them.
 *
 * The word list is produced by `getWordList`; when that reports failure, the
 * reported errors are JSON-serialized into the message of the thrown Error and
 * nothing is uploaded.
 *
 * @param entries - the dictionary entries to build the word list from
 * @throws Error when the word list could not be generated
 */
async function doHunspell(entries: T.DictionaryEntry[]) {
  const result = getWordList(entries);
  if (result.ok) {
    // word list is good: build the .dic/.aff contents and push them to storage
    await uploadHunspellToStorage(makeHunspell(result.wordlist));
    return;
  }
  throw new Error(JSON.stringify(result.errors));
}
async function getRawEntries(): Promise<T.DictionaryEntry[]> {
const doc = new GoogleSpreadsheet(
@ -197,15 +196,15 @@ async function upload(content: Buffer | string, filename: string) {
});
}
// async function uploadHunspellToStorage(wordlist: {
// affContent: string,
// dicContent: string,
// }) {
// await Promise.all([
// upload(wordlist.affContent, hunspellAffFileFilename),
// upload(wordlist.dicContent, hunspellDicFileFilename),
// ]);
// }
/**
 * Uploads the two hunspell files (affix file and dictionary file) to storage.
 *
 * Both uploads are started before either is awaited, so they run concurrently;
 * the returned promise settles once both have finished (or one has failed).
 *
 * @param wordlist - the generated hunspell file contents
 */
async function uploadHunspellToStorage(wordlist: {
  affContent: string,
  dicContent: string,
}) {
  const { affContent, dicContent } = wordlist;
  const affUpload = upload(affContent, hunspellAffFileFilename);
  const dicUpload = upload(dicContent, hunspellDicFileFilename);
  await Promise.all([affUpload, dicUpload]);
}
async function uploadDictionaryToStorage(dictionary: T.Dictionary) {
const dictionaryBuffer = writeDictionary(dictionary);
@ -218,9 +217,9 @@ async function uploadDictionaryToStorage(dictionary: T.Dictionary) {
]);
}
// function makeHunspell(wordlist: string[]) {
// return {
// dicContent: wordlist.reduce((acc, word) => acc + word + "\n", wordlist.length + "\n"),
// affContent: "SET UTF-8\nCOMPLEXPREFIXES\nIGNORE ۱۲۳۴۵۶۷۸۹۰-=ًٌٍَُِّْ؛:؟.،,،؟\n",
// };
// }
/**
 * Builds the text content of the hunspell dictionary (.dic) and affix (.aff) files.
 *
 * The dictionary content starts with a line holding the number of words,
 * followed by one word per line; every line (including the last) ends in "\n".
 * The affix content is a fixed header.
 *
 * @param wordlist - the words to put in the dictionary file
 * @returns an object with `dicContent` and `affContent` strings
 */
function makeHunspell(wordlist: string[]) {
  // first line of the .dic content is the word count
  const countLine = `${wordlist.length}\n`;
  const wordLines = wordlist.map((word) => word + "\n").join("");
  return {
    dicContent: countLine + wordLines,
    affContent: "SET UTF-8\nCOMPLEXPREFIXES\nIGNORE ۱۲۳۴۵۶۷۸۹۰-=ًٌٍَُِّْ؛:؟.،,،؟\n",
  };
}

View File

@ -2,8 +2,6 @@ import {
inflectWord,
conjugateVerb,
Types as T,
pashtoConsonants,
isNounAdjOrVerb,
} from "@lingdocs/pashto-inflector";
@ -41,49 +39,22 @@ export function getWordList(entries: T.DictionaryEntry[]): {
const allInflections: Set<string> = new Set();
const errors: T.DictionaryEntryError[] = [];
function getNounAdjInflections(entry: T.DictionaryEntry) {
if (entry.app) allInflections.add(entry.app);
if (entry.ppp) allInflections.add(entry.ppp);
const inflections = inflectWord(entry);
const wordsFromInf = inflections
? search("p", inflections)
: [];
wordsFromInf.forEach(w => allInflections.add(w));
const infs = inflectWord(entry);
if (infs) {
search("p", infs).forEach(w => allInflections.add(w));
}
}
function getVerbConjugations(word: T.DictionaryEntry, linked?: T.DictionaryEntry) {
const pWords = search("p", conjugateVerb(word, linked));
pWords.forEach(w => allInflections.add(w));
search("p", conjugateVerb(word, linked)).forEach(w => allInflections.add(w));
}
// got the entries, make a wordList of all the possible inflections
entries.forEach((entry) => {
try {
if (entry.c && isNounAdjOrVerb(entry) === "nounAdj") {
// it's a noun/adjective - get all inflections and plurals etc.
getNounAdjInflections(entry);
// hack to add some plurals and mayonnaise
if (entry.c.includes("n. m.") && pashtoConsonants.includes(entry.p.slice(-1))) {
allInflections.add(entry.p + "ونه")
allInflections.add(entry.p + "ونو")
allInflections.add(entry.p + "ه");
}
if (entry.c.includes("n. f.") && entry.p.slice(-1) === "ا") {
allInflections.add(entry.p + "ګانې")
allInflections.add(entry.p + "ګانو");
}
} else if (entry.c && isNounAdjOrVerb(entry) === "verb") {
// it's a verb - get all the conjugations for it
if (entry.l && entry.c.includes("comp.")) {
// it's a compound verb, conjugate it with the linked complement
const linkedEntry = entries.find((e) => e.ts === entry.l);
getVerbConjugations(entry, linkedEntry);
} else {
// it's a non-compound verb, conjugate it
getVerbConjugations(entry);
}
} else {
// it's something else, just put the word(s) in
entry.p.split(" ").forEach(w => allInflections.add(w));
if (entry.c?.startsWith("v. ")) {
const linked = entry.l ? entries.find((e) => e.ts === entry.l) : undefined;
getVerbConjugations(entry, linked);
}
getNounAdjInflections(entry);
} catch (error) {
errors.push({
ts: entry.ts,
@ -91,7 +62,7 @@ export function getWordList(entries: T.DictionaryEntry[]): {
f: entry.f,
e: entry.e,
erroneousFields: [],
errors: ["error inflecting/conjugating entry", error.toString()],
errors: ["error inflecting/conjugating entry"],
});
}
});
@ -103,16 +74,16 @@ export function getWordList(entries: T.DictionaryEntry[]): {
}
// add ی version of words with ې (to accommodate some bad spelling)
allInflections.forEach((word: string) => {
// for words with ې in the middle, also have a version with ی in the middle instead
if (eInMiddleRegex.test(word)) {
allInflections.add(word.replace(eInMiddleRegex, "ی"));
}
// for words ending in ې, also have a version ending in ي
if (word.slice(-1) === "ې") {
allInflections.add(word.slice(0, -1) + "ي");
}
});
// allInflections.forEach((word: string) => {
// // for words with ې in the middle, also have a version with ی in the middle instead
// // if (eInMiddleRegex.test(word)) {
// // allInflections.add(word.replace(eInMiddleRegex, "ی"));
// // }
// // for words ending in ې, also have a version ending in ي
// // if (word.slice(-1) === "ې") {
// // allInflections.add(word.slice(0, -1) + "ي");
// // }
// });
const wordlist = Array.from(allInflections).filter((s) => !(s.includes(".") || s.includes("?")));
wordlist.sort((a, b) => a.localeCompare(b, "ps"));
return {
@ -121,4 +92,4 @@ export function getWordList(entries: T.DictionaryEntry[]): {
};
}
const eInMiddleRegex = new RegExp("ې(?=[\u0621-\u065f\u0670-\u06d3\u06d5])", "g");
// const eInMiddleRegex = new RegExp("ې(?=[\u0621-\u065f\u0670-\u06d3\u06d5])", "g");