with hunspell
This commit is contained in:
parent
e7525cf8c8
commit
c0329417e3
|
@ -11,9 +11,9 @@ import {
|
|||
simplifyPhonetics,
|
||||
standardizeEntry,
|
||||
} from "@lingdocs/pashto-inflector";
|
||||
// import {
|
||||
// getWordList,
|
||||
// } from "./word-list-maker";
|
||||
import {
|
||||
getWordList,
|
||||
} from "./word-list-maker";
|
||||
import {
|
||||
PublishDictionaryResponse,
|
||||
} from "../../website/src/types/functions-types";
|
||||
|
@ -28,8 +28,8 @@ const bucketName = "lingdocs";
|
|||
// Public base URL of the Google Cloud Storage bucket that holds the published files.
const baseUrl = `https://storage.googleapis.com/${bucketName}/`;

// Object names of the dictionary and its accompanying info file.
const dictionaryFilename = "dictionary";
const dictionaryInfoFilename = "dictionary-info";

// Object names of the Hunspell affix (.aff) and word-list (.dic) files.
const hunspellAffFileFilename = "ps_AFF.aff";
const hunspellDicFileFilename = "ps_AFF.dic";

// Full URLs of the dictionary and dictionary-info objects.
const url = `${baseUrl}${dictionaryFilename}`;
const infoUrl = `${baseUrl}${dictionaryInfoFilename}`;
|
||||
|
||||
|
@ -69,22 +69,21 @@ export default async function publish(): Promise<PublishDictionaryResponse> {
|
|||
}
|
||||
await uploadDictionaryToStorage(dictionary);
|
||||
// TODO: make this async and run after publish response
|
||||
// doHunspell(entries).catch(console.error);
|
||||
await doHunspell(entries);
|
||||
return {
|
||||
ok: true,
|
||||
info: dictionary.info
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
// async function doHunspell(entries: T.DictionaryEntry[]) {
|
||||
// const wordlistResponse = getWordList(entries);
|
||||
// if (!wordlistResponse.ok) {
|
||||
// throw new Error(JSON.stringify(wordlistResponse.errors));
|
||||
// }
|
||||
// const hunspell = makeHunspell(wordlistResponse.wordlist);
|
||||
// await uploadHunspellToStorage(hunspell);
|
||||
// }
|
||||
/**
 * Builds the Hunspell files for the given dictionary entries and uploads them.
 *
 * The word list is produced by `getWordList`; its words are turned into
 * `.dic`/`.aff` content by `makeHunspell` and handed to
 * `uploadHunspellToStorage`.
 *
 * @param entries - dictionary entries to derive the word list from
 * @throws Error whose message is the JSON-serialized `errors` of the word-list
 *   response when that response is not `ok`; upload rejections propagate too.
 */
async function doHunspell(entries: T.DictionaryEntry[]): Promise<void> {
  const wordlistResponse = getWordList(entries);
  if (!wordlistResponse.ok) {
    // Surface every reported entry error in a single exception.
    throw new Error(JSON.stringify(wordlistResponse.errors));
  }
  const hunspell = makeHunspell(wordlistResponse.wordlist);
  await uploadHunspellToStorage(hunspell);
}
|
||||
|
||||
async function getRawEntries(): Promise<T.DictionaryEntry[]> {
|
||||
const doc = new GoogleSpreadsheet(
|
||||
|
@ -197,15 +196,15 @@ async function upload(content: Buffer | string, filename: string) {
|
|||
});
|
||||
}
|
||||
|
||||
// async function uploadHunspellToStorage(wordlist: {
|
||||
// affContent: string,
|
||||
// dicContent: string,
|
||||
// }) {
|
||||
// await Promise.all([
|
||||
// upload(wordlist.affContent, hunspellAffFileFilename),
|
||||
// upload(wordlist.dicContent, hunspellDicFileFilename),
|
||||
// ]);
|
||||
// }
|
||||
/**
 * Uploads the Hunspell affix and dictionary contents under their fixed
 * filenames (`hunspellAffFileFilename` / `hunspellDicFileFilename`).
 *
 * Both uploads are started together and awaited with `Promise.all`, so the
 * returned promise rejects as soon as either upload rejects.
 *
 * @param wordlist - the generated Hunspell file contents
 */
async function uploadHunspellToStorage(wordlist: {
  affContent: string,
  dicContent: string,
}): Promise<void> {
  await Promise.all([
    upload(wordlist.affContent, hunspellAffFileFilename),
    upload(wordlist.dicContent, hunspellDicFileFilename),
  ]);
}
|
||||
|
||||
async function uploadDictionaryToStorage(dictionary: T.Dictionary) {
|
||||
const dictionaryBuffer = writeDictionary(dictionary);
|
||||
|
@ -218,9 +217,9 @@ async function uploadDictionaryToStorage(dictionary: T.Dictionary) {
|
|||
]);
|
||||
}
|
||||
|
||||
// function makeHunspell(wordlist: string[]) {
|
||||
// return {
|
||||
// dicContent: wordlist.reduce((acc, word) => acc + word + "\n", wordlist.length + "\n"),
|
||||
// affContent: "SET UTF-8\nCOMPLEXPREFIXES\nIGNORE ۱۲۳۴۵۶۷۸۹۰-=ًٌٍَُِّْ؛:؟.،,،؟\n",
|
||||
// };
|
||||
// }
|
||||
/**
 * Produces the contents of the Hunspell `.dic` and `.aff` files for a word list.
 *
 * `dicContent` is the number of words on the first line followed by one word
 * per line, every line (including the last) terminated by "\n". An empty list
 * yields "0\n".
 *
 * `affContent` is a fixed header: UTF-8 encoding, COMPLEXPREFIXES, and an
 * IGNORE directive listing digits, diacritics and punctuation characters.
 *
 * @param wordlist - the words to list in the `.dic` file, in output order
 * @returns the text of both files
 */
function makeHunspell(wordlist: string[]): { dicContent: string, affContent: string } {
  // Count line first, then the words; join once instead of growing a string per word.
  const dicLines = [String(wordlist.length), ...wordlist];
  return {
    dicContent: dicLines.join("\n") + "\n",
    affContent: "SET UTF-8\nCOMPLEXPREFIXES\nIGNORE ۱۲۳۴۵۶۷۸۹۰-=ًٌٍَُِّْ؛:؟.،,،؟\n",
  };
}
|
||||
|
|
|
@ -2,8 +2,6 @@ import {
|
|||
inflectWord,
|
||||
conjugateVerb,
|
||||
Types as T,
|
||||
pashtoConsonants,
|
||||
isNounAdjOrVerb,
|
||||
} from "@lingdocs/pashto-inflector";
|
||||
|
||||
|
||||
|
@ -41,49 +39,22 @@ export function getWordList(entries: T.DictionaryEntry[]): {
|
|||
const allInflections: Set<string> = new Set();
|
||||
const errors: T.DictionaryEntryError[] = [];
|
||||
/**
 * Adds the forms of a noun/adjective entry to the enclosing `allInflections` set:
 * the entry's `app` and `ppp` fields when present, plus every string that
 * `search("p", …)` finds in the result of `inflectWord(entry)`.
 * (Presumably "p" selects the Pashto-script values - confirm against `search`.)
 *
 * @param entry - the dictionary entry to inflect
 */
function getNounAdjInflections(entry: T.DictionaryEntry) {
  if (entry.app) allInflections.add(entry.app);
  if (entry.ppp) allInflections.add(entry.ppp);

  // Inflect once: a second inflectWord/search pass would only re-add the same
  // words to the set (assuming inflectWord is deterministic).
  const infs = inflectWord(entry);
  if (infs) {
    search("p", infs).forEach(w => allInflections.add(w));
  }
}
|
||||
/**
 * Adds to the enclosing `allInflections` set every string that
 * `search("p", …)` finds in the conjugation of a verb entry.
 *
 * @param word - the verb entry to conjugate
 * @param linked - optional linked entry passed through to `conjugateVerb`
 *   (callers look it up by the verb's `l` field)
 */
function getVerbConjugations(word: T.DictionaryEntry, linked?: T.DictionaryEntry) {
  // Conjugate once: repeating the call would only re-add identical words
  // (assuming conjugateVerb is deterministic).
  search("p", conjugateVerb(word, linked)).forEach(w => allInflections.add(w));
}
|
||||
// got the entries, make a wordList of all the possible inflections
|
||||
entries.forEach((entry) => {
|
||||
try {
|
||||
if (entry.c && isNounAdjOrVerb(entry) === "nounAdj") {
|
||||
// it's a noun/adjective - get all inflections and plurals etc.
|
||||
if (entry.c?.startsWith("v. ")) {
|
||||
const linked = entry.l ? entries.find((e) => e.ts === entry.l) : undefined;
|
||||
getVerbConjugations(entry, linked);
|
||||
}
|
||||
getNounAdjInflections(entry);
|
||||
// hack to add some plurals and mayonnaise
|
||||
if (entry.c.includes("n. m.") && pashtoConsonants.includes(entry.p.slice(-1))) {
|
||||
allInflections.add(entry.p + "ونه")
|
||||
allInflections.add(entry.p + "ونو")
|
||||
allInflections.add(entry.p + "ه");
|
||||
}
|
||||
if (entry.c.includes("n. f.") && entry.p.slice(-1) === "ا") {
|
||||
allInflections.add(entry.p + "ګانې")
|
||||
allInflections.add(entry.p + "ګانو");
|
||||
}
|
||||
} else if (entry.c && isNounAdjOrVerb(entry) === "verb") {
|
||||
// it's a verb - get all the conjugations for it
|
||||
if (entry.l && entry.c.includes("comp.")) {
|
||||
// it's a compound verb, conjugate it with the linked complement
|
||||
const linkedEntry = entries.find((e) => e.ts === entry.l);
|
||||
getVerbConjugations(entry, linkedEntry);
|
||||
} else {
|
||||
// it's a non-compound verb, conjugate it
|
||||
getVerbConjugations(entry);
|
||||
}
|
||||
} else {
|
||||
// it's something else, just put the word(s) in
|
||||
entry.p.split(" ").forEach(w => allInflections.add(w));
|
||||
}
|
||||
} catch (error) {
|
||||
errors.push({
|
||||
ts: entry.ts,
|
||||
|
@ -91,7 +62,7 @@ export function getWordList(entries: T.DictionaryEntry[]): {
|
|||
f: entry.f,
|
||||
e: entry.e,
|
||||
erroneousFields: [],
|
||||
errors: ["error inflecting/conjugating entry", error.toString()],
|
||||
errors: ["error inflecting/conjugating entry"],
|
||||
});
|
||||
}
|
||||
});
|
||||
|
@ -103,16 +74,16 @@ export function getWordList(entries: T.DictionaryEntry[]): {
|
|||
}
|
||||
|
||||
// add ی version of words with ې (to accommodate some bad spelling)
|
||||
allInflections.forEach((word: string) => {
|
||||
// for words with ې in the middle, also have a version with ی in the middle instead
|
||||
if (eInMiddleRegex.test(word)) {
|
||||
allInflections.add(word.replace(eInMiddleRegex, "ی"));
|
||||
}
|
||||
// for words ending in ې, also have a version ending in ي
|
||||
if (word.slice(-1) === "ې") {
|
||||
allInflections.add(word.slice(0, -1) + "ي");
|
||||
}
|
||||
});
|
||||
// allInflections.forEach((word: string) => {
|
||||
// // for words with ې in the middle, also have a version with ی in the middle instead
|
||||
// // if (eInMiddleRegex.test(word)) {
|
||||
// // allInflections.add(word.replace(eInMiddleRegex, "ی"));
|
||||
// // }
|
||||
// // for words ending in ې, also have a version ending in ي
|
||||
// // if (word.slice(-1) === "ې") {
|
||||
// // allInflections.add(word.slice(0, -1) + "ي");
|
||||
// // }
|
||||
// });
|
||||
const wordlist = Array.from(allInflections).filter((s) => !(s.includes(".") || s.includes("?")));
|
||||
wordlist.sort((a, b) => a.localeCompare(b, "ps"));
|
||||
return {
|
||||
|
@ -121,4 +92,4 @@ export function getWordList(entries: T.DictionaryEntry[]): {
|
|||
};
|
||||
}
|
||||
|
||||
// Global pattern for every ې that is immediately followed by a character in
// U+0621-U+065F, U+0670-U+06D3 or U+06D5. The following character is only
// looked ahead at, not consumed, so a trailing ې is never matched.
const eInMiddleRegex = /ې(?=[\u0621-\u065f\u0670-\u06d3\u06d5])/g;
|
||||
// const eInMiddleRegex = new RegExp("ې(?=[\u0621-\u065f\u0670-\u06d3\u06d5])", "g");
|
||||
|
|
Loading…
Reference in New Issue