with hunspell
This commit is contained in:
parent
e7525cf8c8
commit
c0329417e3
|
@ -11,9 +11,9 @@ import {
|
||||||
simplifyPhonetics,
|
simplifyPhonetics,
|
||||||
standardizeEntry,
|
standardizeEntry,
|
||||||
} from "@lingdocs/pashto-inflector";
|
} from "@lingdocs/pashto-inflector";
|
||||||
// import {
|
import {
|
||||||
// getWordList,
|
getWordList,
|
||||||
// } from "./word-list-maker";
|
} from "./word-list-maker";
|
||||||
import {
|
import {
|
||||||
PublishDictionaryResponse,
|
PublishDictionaryResponse,
|
||||||
} from "../../website/src/types/functions-types";
|
} from "../../website/src/types/functions-types";
|
||||||
|
@ -28,8 +28,8 @@ const bucketName = "lingdocs";
|
||||||
const baseUrl = `https://storage.googleapis.com/${bucketName}/`;
|
const baseUrl = `https://storage.googleapis.com/${bucketName}/`;
|
||||||
const dictionaryFilename = "dictionary";
|
const dictionaryFilename = "dictionary";
|
||||||
const dictionaryInfoFilename = "dictionary-info";
|
const dictionaryInfoFilename = "dictionary-info";
|
||||||
// const hunspellAffFileFilename = "ps_AFF.aff";
|
const hunspellAffFileFilename = "ps_AFF.aff";
|
||||||
// const hunspellDicFileFilename = "ps_AFF.dic";
|
const hunspellDicFileFilename = "ps_AFF.dic";
|
||||||
const url = `${baseUrl}${dictionaryFilename}`;
|
const url = `${baseUrl}${dictionaryFilename}`;
|
||||||
const infoUrl = `${baseUrl}${dictionaryInfoFilename}`;
|
const infoUrl = `${baseUrl}${dictionaryInfoFilename}`;
|
||||||
|
|
||||||
|
@ -69,22 +69,21 @@ export default async function publish(): Promise<PublishDictionaryResponse> {
|
||||||
}
|
}
|
||||||
await uploadDictionaryToStorage(dictionary);
|
await uploadDictionaryToStorage(dictionary);
|
||||||
// TODO: make this async and run after publish response
|
// TODO: make this async and run after publish response
|
||||||
// doHunspell(entries).catch(console.error);
|
await doHunspell(entries);
|
||||||
return {
|
return {
|
||||||
ok: true,
|
ok: true,
|
||||||
info: dictionary.info
|
info: dictionary.info
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// async function doHunspell(entries: T.DictionaryEntry[]) {
|
async function doHunspell(entries: T.DictionaryEntry[]) {
|
||||||
// const wordlistResponse = getWordList(entries);
|
const wordlistResponse = getWordList(entries);
|
||||||
// if (!wordlistResponse.ok) {
|
if (!wordlistResponse.ok) {
|
||||||
// throw new Error(JSON.stringify(wordlistResponse.errors));
|
throw new Error(JSON.stringify(wordlistResponse.errors));
|
||||||
// }
|
}
|
||||||
// const hunspell = makeHunspell(wordlistResponse.wordlist);
|
const hunspell = makeHunspell(wordlistResponse.wordlist);
|
||||||
// await uploadHunspellToStorage(hunspell);
|
await uploadHunspellToStorage(hunspell);
|
||||||
// }
|
}
|
||||||
|
|
||||||
async function getRawEntries(): Promise<T.DictionaryEntry[]> {
|
async function getRawEntries(): Promise<T.DictionaryEntry[]> {
|
||||||
const doc = new GoogleSpreadsheet(
|
const doc = new GoogleSpreadsheet(
|
||||||
|
@ -197,15 +196,15 @@ async function upload(content: Buffer | string, filename: string) {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// async function uploadHunspellToStorage(wordlist: {
|
async function uploadHunspellToStorage(wordlist: {
|
||||||
// affContent: string,
|
affContent: string,
|
||||||
// dicContent: string,
|
dicContent: string,
|
||||||
// }) {
|
}) {
|
||||||
// await Promise.all([
|
await Promise.all([
|
||||||
// upload(wordlist.affContent, hunspellAffFileFilename),
|
upload(wordlist.affContent, hunspellAffFileFilename),
|
||||||
// upload(wordlist.dicContent, hunspellDicFileFilename),
|
upload(wordlist.dicContent, hunspellDicFileFilename),
|
||||||
// ]);
|
]);
|
||||||
// }
|
}
|
||||||
|
|
||||||
async function uploadDictionaryToStorage(dictionary: T.Dictionary) {
|
async function uploadDictionaryToStorage(dictionary: T.Dictionary) {
|
||||||
const dictionaryBuffer = writeDictionary(dictionary);
|
const dictionaryBuffer = writeDictionary(dictionary);
|
||||||
|
@ -218,9 +217,9 @@ async function uploadDictionaryToStorage(dictionary: T.Dictionary) {
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// function makeHunspell(wordlist: string[]) {
|
function makeHunspell(wordlist: string[]) {
|
||||||
// return {
|
return {
|
||||||
// dicContent: wordlist.reduce((acc, word) => acc + word + "\n", wordlist.length + "\n"),
|
dicContent: wordlist.reduce((acc, word) => acc + word + "\n", wordlist.length + "\n"),
|
||||||
// affContent: "SET UTF-8\nCOMPLEXPREFIXES\nIGNORE ۱۲۳۴۵۶۷۸۹۰-=ًٌٍَُِّْ؛:؟.،,،؟\n",
|
affContent: "SET UTF-8\nCOMPLEXPREFIXES\nIGNORE ۱۲۳۴۵۶۷۸۹۰-=ًٌٍَُِّْ؛:؟.،,،؟\n",
|
||||||
// };
|
};
|
||||||
// }
|
}
|
||||||
|
|
|
@ -2,8 +2,6 @@ import {
|
||||||
inflectWord,
|
inflectWord,
|
||||||
conjugateVerb,
|
conjugateVerb,
|
||||||
Types as T,
|
Types as T,
|
||||||
pashtoConsonants,
|
|
||||||
isNounAdjOrVerb,
|
|
||||||
} from "@lingdocs/pashto-inflector";
|
} from "@lingdocs/pashto-inflector";
|
||||||
|
|
||||||
|
|
||||||
|
@ -41,49 +39,22 @@ export function getWordList(entries: T.DictionaryEntry[]): {
|
||||||
const allInflections: Set<string> = new Set();
|
const allInflections: Set<string> = new Set();
|
||||||
const errors: T.DictionaryEntryError[] = [];
|
const errors: T.DictionaryEntryError[] = [];
|
||||||
function getNounAdjInflections(entry: T.DictionaryEntry) {
|
function getNounAdjInflections(entry: T.DictionaryEntry) {
|
||||||
if (entry.app) allInflections.add(entry.app);
|
const infs = inflectWord(entry);
|
||||||
if (entry.ppp) allInflections.add(entry.ppp);
|
if (infs) {
|
||||||
|
search("p", infs).forEach(w => allInflections.add(w));
|
||||||
const inflections = inflectWord(entry);
|
}
|
||||||
const wordsFromInf = inflections
|
|
||||||
? search("p", inflections)
|
|
||||||
: [];
|
|
||||||
wordsFromInf.forEach(w => allInflections.add(w));
|
|
||||||
}
|
}
|
||||||
function getVerbConjugations(word: T.DictionaryEntry, linked?: T.DictionaryEntry) {
|
function getVerbConjugations(word: T.DictionaryEntry, linked?: T.DictionaryEntry) {
|
||||||
const pWords = search("p", conjugateVerb(word, linked));
|
search("p", conjugateVerb(word, linked)).forEach(w => allInflections.add(w));
|
||||||
pWords.forEach(w => allInflections.add(w));
|
|
||||||
}
|
}
|
||||||
// got the entries, make a wordList of all the possible inflections
|
// got the entries, make a wordList of all the possible inflections
|
||||||
entries.forEach((entry) => {
|
entries.forEach((entry) => {
|
||||||
try {
|
try {
|
||||||
if (entry.c && isNounAdjOrVerb(entry) === "nounAdj") {
|
if (entry.c?.startsWith("v. ")) {
|
||||||
// it's a noun/adjective - get all inflections and plurals etc.
|
const linked = entry.l ? entries.find((e) => e.ts === entry.l) : undefined;
|
||||||
|
getVerbConjugations(entry, linked);
|
||||||
|
}
|
||||||
getNounAdjInflections(entry);
|
getNounAdjInflections(entry);
|
||||||
// hack to add some plurals and mayonnaise
|
|
||||||
if (entry.c.includes("n. m.") && pashtoConsonants.includes(entry.p.slice(-1))) {
|
|
||||||
allInflections.add(entry.p + "ونه")
|
|
||||||
allInflections.add(entry.p + "ونو")
|
|
||||||
allInflections.add(entry.p + "ه");
|
|
||||||
}
|
|
||||||
if (entry.c.includes("n. f.") && entry.p.slice(-1) === "ا") {
|
|
||||||
allInflections.add(entry.p + "ګانې")
|
|
||||||
allInflections.add(entry.p + "ګانو");
|
|
||||||
}
|
|
||||||
} else if (entry.c && isNounAdjOrVerb(entry) === "verb") {
|
|
||||||
// it's a verb - get all the conjugations for it
|
|
||||||
if (entry.l && entry.c.includes("comp.")) {
|
|
||||||
// it's a compound verb, conjugate it with the linked complement
|
|
||||||
const linkedEntry = entries.find((e) => e.ts === entry.l);
|
|
||||||
getVerbConjugations(entry, linkedEntry);
|
|
||||||
} else {
|
|
||||||
// it's a non-compound verb, conjugate it
|
|
||||||
getVerbConjugations(entry);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// it's something else, just put the word(s) in
|
|
||||||
entry.p.split(" ").forEach(w => allInflections.add(w));
|
|
||||||
}
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
errors.push({
|
errors.push({
|
||||||
ts: entry.ts,
|
ts: entry.ts,
|
||||||
|
@ -91,7 +62,7 @@ export function getWordList(entries: T.DictionaryEntry[]): {
|
||||||
f: entry.f,
|
f: entry.f,
|
||||||
e: entry.e,
|
e: entry.e,
|
||||||
erroneousFields: [],
|
erroneousFields: [],
|
||||||
errors: ["error inflecting/conjugating entry", error.toString()],
|
errors: ["error inflecting/conjugating entry"],
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -103,16 +74,16 @@ export function getWordList(entries: T.DictionaryEntry[]): {
|
||||||
}
|
}
|
||||||
|
|
||||||
// add ی version of words with ې (to accommodate some bad spelling)
|
// add ی version of words with ې (to accommodate some bad spelling)
|
||||||
allInflections.forEach((word: string) => {
|
// allInflections.forEach((word: string) => {
|
||||||
// for words with ې in the middle, also have a version with ی in the middle instead
|
// // for words with ې in the middle, also have a version with ی in the middle instead
|
||||||
if (eInMiddleRegex.test(word)) {
|
// // if (eInMiddleRegex.test(word)) {
|
||||||
allInflections.add(word.replace(eInMiddleRegex, "ی"));
|
// // allInflections.add(word.replace(eInMiddleRegex, "ی"));
|
||||||
}
|
// // }
|
||||||
// for words ending in ې, also have a version ending in ي
|
// // for words ending in ې, also have a version ending in ي
|
||||||
if (word.slice(-1) === "ې") {
|
// // if (word.slice(-1) === "ې") {
|
||||||
allInflections.add(word.slice(0, -1) + "ي");
|
// // allInflections.add(word.slice(0, -1) + "ي");
|
||||||
}
|
// // }
|
||||||
});
|
// });
|
||||||
const wordlist = Array.from(allInflections).filter((s) => !(s.includes(".") || s.includes("?")));
|
const wordlist = Array.from(allInflections).filter((s) => !(s.includes(".") || s.includes("?")));
|
||||||
wordlist.sort((a, b) => a.localeCompare(b, "ps"));
|
wordlist.sort((a, b) => a.localeCompare(b, "ps"));
|
||||||
return {
|
return {
|
||||||
|
@ -121,4 +92,4 @@ export function getWordList(entries: T.DictionaryEntry[]): {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const eInMiddleRegex = new RegExp("ې(?=[\u0621-\u065f\u0670-\u06d3\u06d5])", "g");
|
// const eInMiddleRegex = new RegExp("ې(?=[\u0621-\u065f\u0670-\u06d3\u06d5])", "g");
|
||||||
|
|
Loading…
Reference in New Issue