pashto-dictionary/functions/src/publish.ts

241 lines
8.4 KiB
TypeScript

import { GoogleSpreadsheet } from "google-spreadsheet";
import * as functions from "firebase-functions";
import {
Types as T,
dictionaryEntryBooleanFields,
dictionaryEntryNumberFields,
dictionaryEntryTextFields,
validateEntry,
writeDictionary,
writeDictionaryInfo,
simplifyPhonetics,
standardizeEntry,
} from "@lingdocs/inflect";
import {
getWordList,
} from "./word-list-maker";
import {
PublishDictionaryResponse,
} from "../../website/src/types/functions-types";
import { Storage } from "@google-cloud/storage";
// Google Cloud Storage client bound to the "lingdocs" project
const storage = new Storage({ projectId: "lingdocs" });
/** Title stored in the published dictionary's info record */
const title = "LingDocs Pashto Dictionary";
/** License notice stored in the published dictionary's info record */
const license = "Copyright © 2021 lingdocs.com All Rights Reserved - Licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License - https://creativecommons.org/licenses/by-nc-sa/4.0/";
/** Name of the storage bucket that receives the published files */
const bucketName = "lingdocs";
/** URL prefix under which the bucket's objects are addressed */
const baseUrl = "https://storage.googleapis.com/" + bucketName + "/";
// object names of the published files
const dictionaryFilename = "dictionary";
const dictionaryInfoFilename = "dictionary-info";
const hunspellAffFileFilename = "ps_AFF.aff";
const hunspellDicFileFilename = "ps_AFF.dic";
// full URLs recorded in the dictionary info
const url = baseUrl + dictionaryFilename;
const infoUrl = baseUrl + dictionaryInfoFilename;
// TODO: Create a separate function for publishing the Hunspell files that can run after the publish function,
// to keep the publish function's run time down
/**
 * Builds the dictionary from the spreadsheet entries and publishes it.
 *
 * All entries are read and validated first. If validation reports anything,
 * nothing is uploaded and the errors are returned. Otherwise the dictionary
 * upload and the Hunspell generation/upload are both started WITHOUT being
 * awaited (a failure in either is only passed to console.error) and the
 * freshly built dictionary info is returned right away.
 *
 * @returns `{ ok: false, errors }` when validation fails, otherwise `{ ok: true, info }`
 */
export default async function publish(): Promise<PublishDictionaryResponse> {
    const entries = await getRawEntries();
    const errors = checkForErrors(entries);
    if (errors.length > 0) {
        return { ok: false, errors };
    }
    // (disabled) removal of entries that share a ts with an earlier entry:
    // const duplicates = findDuplicates(entries);
    // duplicates.forEach((duplicate) => {
    // const index = entries.findIndex(e => e.ts === duplicate.ts);
    // if (index > -1) entries.splice(index, 1);
    // })
    // the release stamp is the publish time in milliseconds since the epoch
    const release = Date.now();
    const dictionary: T.Dictionary = {
        info: {
            title,
            license,
            url,
            infoUrl,
            release,
            numberOfEntries: entries.length,
        },
        entries,
    };
    // started but not awaited, so the response does not wait for the uploads
    uploadDictionaryToStorage(dictionary).catch(console.error);
    // TODO: make this async and run after publish response
    doHunspell(entries).catch(console.error);
    return { ok: true, info: dictionary.info };
}
/**
 * Derives the word list from the entries, turns it into Hunspell file
 * contents and uploads them.
 *
 * @param entries the dictionary entries the word list is generated from
 * @throws Error whose message is the JSON-serialized `errors` of the word list
 * response when that response is not ok
 */
async function doHunspell(entries: T.DictionaryEntry[]) {
    const response = getWordList(entries);
    if (response.ok) {
        await uploadHunspellToStorage(makeHunspell(response.wordlist));
        return;
    }
    throw new Error(JSON.stringify(response.errors));
}
/**
 * Loads every row of the first sheet of the configured spreadsheet and
 * converts the rows into dictionary entries.
 *
 * Rows that makeEntries identifies as accidental duplicates (sometimes
 * created by the Google Sheets API when entries are added programmatically)
 * are deleted from the sheet along the way.
 *
 * @returns the entries produced by makeEntries
 */
async function getRawEntries(): Promise<T.DictionaryEntry[]> {
    const config = functions.config();
    const doc = new GoogleSpreadsheet(config.sheet.id);
    await doc.useServiceAccountAuth({
        client_email: config.serviceacct.email,
        private_key: config.serviceacct.key,
    });
    await doc.loadInfo();
    const rows = await doc.sheetsByIndex[0].getRows();
    // handed to makeEntries so it can remove a row by its position in `rows`
    const deleteRow = async (r: number): Promise<void> => {
        await rows[r].delete();
    };
    return makeEntries(rows, deleteRow);
}
/**
 * Converts spreadsheet rows into standardized dictionary entries, sorted by
 * their Pashto text and indexed in that order.
 *
 * When a row has the same ts as the row right after it, it is treated as an
 * accidental duplicate: one row is deleted through `deleteRow` and only a
 * single entry is produced for the pair.
 *
 * @param rows spreadsheet rows; each is read through its `ts`, `p`, `f`, `e` and
 * optional dictionary field properties
 * @param deleteRow callback that deletes the row at the given index
 * @returns the entries sorted by `p` (locale "ps") with `i` set to the sorted position
 * @throws Error when two neighbouring rows share a ts but differ in `p`
 */
async function makeEntries(rows: any[], deleteRow: (r: number) => Promise<void>): Promise<T.DictionaryEntry[]> {
    const entries: T.DictionaryEntry[] = [];
    // Index handed to deleteRow: advanced once per row and pulled back by one
    // after every deletion, so it trails (i + 1) by the number of rows deleted
    // so far. NOTE(review): presumably this compensates for the sheet's rows
    // shifting up after each deletion - confirm against the deleteRow implementation.
    let sheetIndex = 0;
    for (let i = 0; i < rows.length; i++) {
        sheetIndex++;
        const row = rows[i];
        const nextRow = rows[i + 1];
        // Only compare against a row that really exists: on the last row a
        // missing ts would otherwise "match" the missing next row and the
        // nextRow.p access below would throw a TypeError.
        if (nextRow && row.ts === nextRow.ts) {
            if (row.p !== nextRow.p) throw new Error(`ts ${row.ts} is a duplicate of a different entry`);
            // this looks like a duplicate entry made by the sheets api
            // delete it and keep going
            await deleteRow(sheetIndex);
            sheetIndex--;
            continue;
        }
        const e: T.DictionaryEntry = {
            i: 1, // placeholder, the real index is assigned after sorting
            ts: parseInt(row.ts),
            p: row.p,
            f: row.f,
            g: simplifyPhonetics(row.f),
            e: row.e,
        };
        // copy over the optional fields, but only those the row has a value for
        dictionaryEntryNumberFields.forEach((field: T.DictionaryEntryNumberField) => {
            if (row[field]) e[field] = parseInt(row[field]);
        });
        dictionaryEntryTextFields.forEach((field: T.DictionaryEntryTextField) => {
            if (row[field]) e[field] = row[field].trim();
        });
        dictionaryEntryBooleanFields.forEach((field: T.DictionaryEntryBooleanField) => {
            if (row[field]) e[field] = true;
        });
        entries.push(standardizeEntry(e));
    }
    // put the entries in alphabetical order
    entries.sort((a, b) => a.p.localeCompare(b.p, "ps"));
    const entriesLength = entries.length;
    // add index
    for (let i = 0; i < entriesLength; i++) {
        entries[i].i = i;
    }
    return entries;
}
/**
 * Validates every entry and collects the problems found, in entry order.
 *
 * An entry contributes at most one error: either the error object returned by
 * validateEntry, or, when validateEntry asks for a complement check, an
 * error on the "l" field if the linked entry is missing or its `c` contains
 * none of "n.", "adj." or "adv.".
 *
 * @param entries all dictionary entries; also the pool in which links are looked up
 * @returns the collected errors (empty when everything is valid)
 */
function checkForErrors(entries: T.DictionaryEntry[]): T.DictionaryEntryError[] {
    // Index the entries by ts once so each link lookup is O(1) instead of a
    // scan over the whole dictionary. The first entry wins for a repeated ts,
    // which is what a front-to-back search would return.
    const entryByTs = new Map<number, T.DictionaryEntry>();
    for (const candidate of entries) {
        if (!entryByTs.has(candidate.ts)) entryByTs.set(candidate.ts, candidate);
    }
    // builds the error reported for a bad complement link
    const linkError = (entry: T.DictionaryEntry, message: string): T.DictionaryEntryError => ({
        errors: [message],
        ts: entry.ts,
        p: entry.p,
        f: entry.f,
        e: entry.e,
        erroneousFields: ["l"],
    });
    const errors: T.DictionaryEntryError[] = [];
    for (const entry of entries) {
        const response = validateEntry(entry);
        if ("errors" in response && response.errors.length) {
            errors.push(response);
            continue;
        }
        if (!("checkComplement" in response)) continue;
        const link = entry.l;
        // a missing or NaN link never matches any entry (Map keys would match NaN)
        const complement = (link === undefined || Number.isNaN(link))
            ? undefined
            : entryByTs.get(link);
        if (!complement) {
            errors.push(linkError(entry, "complement link not found in dictonary"));
            continue;
        }
        const isValidComplement = ["n.", "adj.", "adv."].some((tag) => complement.c?.includes(tag));
        if (!isValidComplement) {
            errors.push(linkError(entry, "complement link to invalid complement"));
        }
    }
    return errors;
}
// function findDuplicates(entries: T.DictionaryEntry[]): T.DictionaryEntry[] {
// const tsSoFar = new Set();
// const duplicates: T.DictionaryEntry[] = [];
// // tslint:disable-next-line: prefer-for-of
// for (let i = 0; i < entries.length; i++) {
// const ts = entries[i].ts;
// if (tsSoFar.has(ts)) {
// duplicates.push(entries[i]);
// }
// tsSoFar.add(ts);
// }
// return duplicates;
// }
/**
 * Saves content as a publicly readable, non-cached object in the bucket.
 *
 * String content is stored gzipped, as JSON when the filename ends in
 * ".json" and as UTF-8 plain text otherwise. Any other content is stored
 * without gzip as a generic binary stream.
 *
 * @param content the text or binary data to store
 * @param filename the object name inside the bucket
 */
async function upload(content: Buffer | string, filename: string) {
    const isText = typeof content === "string";
    let contentType = "application/octet-stream";
    if (isText) {
        contentType = filename.endsWith(".json")
            ? "application/json"
            : "text/plain; charset=UTF-8";
    }
    const target = storage.bucket(bucketName).file(filename);
    await target.save(content, {
        gzip: isText,
        predefinedAcl: "publicRead",
        metadata: {
            contentType,
            cacheControl: "no-cache",
        },
    });
}
/**
 * Uploads the two Hunspell files side by side and resolves once both are stored.
 *
 * @param wordlist the contents of the affix (.aff) and dictionary (.dic) files
 */
async function uploadHunspellToStorage(wordlist: {
    affContent: string,
    dicContent: string,
}) {
    const { affContent, dicContent } = wordlist;
    const affUpload = upload(affContent, hunspellAffFileFilename);
    const dicUpload = upload(dicContent, hunspellDicFileFilename);
    await Promise.all([affUpload, dicUpload]);
}
/**
 * Uploads the dictionary and its info record, each in two forms: as JSON
 * text (under the base name plus ".json") and as the buffer produced by
 * writeDictionary / writeDictionaryInfo (under the bare base name).
 * All four uploads run concurrently.
 *
 * @param dictionary the complete dictionary to publish
 */
async function uploadDictionaryToStorage(dictionary: T.Dictionary) {
    const { info } = dictionary;
    const encodedDictionary = writeDictionary(dictionary);
    const encodedInfo = writeDictionaryInfo(info);
    const uploads: Promise<void>[] = [];
    uploads.push(upload(JSON.stringify(dictionary), `${dictionaryFilename}.json`));
    uploads.push(upload(JSON.stringify(info), `${dictionaryInfoFilename}.json`));
    uploads.push(upload(encodedDictionary as Buffer, dictionaryFilename));
    uploads.push(upload(encodedInfo as Buffer, dictionaryInfoFilename));
    await Promise.all(uploads);
}
/**
 * Produces the contents of the two Hunspell files for a word list.
 *
 * @param wordlist the words to include, in output order
 * @returns `dicContent`: the word count on the first line followed by one word
 * per line, every line newline-terminated; `affContent`: a fixed affix header
 */
function makeHunspell(wordlist: string[]) {
    const countLine = `${wordlist.length}\n`;
    const wordLines = wordlist.map((word) => `${word}\n`).join("");
    return {
        dicContent: countLine + wordLines,
        affContent: "SET UTF-8\nCOMPLEXPREFIXES\nIGNORE ۱۲۳۴۵۶۷۸۹۰-=ًٌٍَُِّْ؛:؟.،,،؟\n",
    };
}