pashto-dictionary/functions/src/publish.ts

import { GoogleSpreadsheet } from "google-spreadsheet";
import * as functions from "firebase-functions";
import {
    Types as T,
    dictionaryEntryBooleanFields,
    dictionaryEntryNumberFields,
    dictionaryEntryTextFields,
    validateEntry,
    writeDictionary,
    writeDictionaryInfo,
    simplifyPhonetics,
    standardizeEntry,
} from "@lingdocs/inflect";
import {
    getWordList,
} from "./word-list-maker";
import {
    PublishDictionaryResponse,
} from "../../website/src/types/functions-types";
import { Storage } from "@google-cloud/storage";
// Google Cloud Storage client used for every upload made from this file
const storage = new Storage({
    projectId: "lingdocs",
});

// Title and license text published as part of the dictionary info
const title = "LingDocs Pashto Dictionary"
const license = "Copyright © 2021 lingdocs.com All Rights Reserved - Licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License - https://creativecommons.org/licenses/by-nc-sa/4.0/";
// Bucket the files are uploaded to, and the storage URL prefix built from its name
const bucketName = "lingdocs";
const baseUrl = `https://storage.googleapis.com/${bucketName}/`;
// Names of the uploaded files (the JSON copies of the dictionary and its
// info are uploaded under the same names with ".json" appended)
const dictionaryFilename = "dictionary";
const dictionaryInfoFilename = "dictionary-info";
const hunspellAffFileFilename = "ps_AFF.aff";
const hunspellDicFileFilename = "ps_AFF.dic";
// URLs of the dictionary file and the dictionary info file, published in the dictionary info
const url = `${baseUrl}${dictionaryFilename}`;
const infoUrl = `${baseUrl}${dictionaryInfoFilename}`;

// TODO: Create a separate function for publishing the Hunspell files that can run after the publish function,
// to keep the running time of the publish function down

/**
 * Builds the dictionary from the spreadsheet and starts publishing it.
 *
 * The entries are read and validated first. If any entry is invalid, nothing
 * is uploaded and the validation errors are returned instead.
 *
 * NOTE(review): the dictionary and Hunspell uploads are started but not
 * awaited, and their failures are only logged with console.error. A response
 * of `ok: true` therefore means the uploads were started, not that they
 * finished successfully.
 *
 * @returns `{ ok: false, errors }` when validation fails, otherwise
 * `{ ok: true, info }` with the info of the dictionary being published
 */
export default async function publish(): Promise<PublishDictionaryResponse> {
    const entries = await getRawEntries();
    const validationErrors = checkForErrors(entries);
    if (validationErrors.length > 0) {
        return { ok: false, errors: validationErrors };
    }
    // const duplicates = findDuplicates(entries);
    // duplicates.forEach((duplicate) => {
    //     const index = entries.findIndex(e => e.ts === duplicate.ts);
    //     if (index > -1) entries.splice(index, 1);
    // })
    const info: T.Dictionary["info"] = {
        title,
        license,
        url,
        infoUrl,
        // the release is identified by the time of publishing (ms since epoch)
        release: Date.now(),
        numberOfEntries: entries.length,
    };
    const dictionary: T.Dictionary = { info, entries };
    // not awaited: a failed upload is logged, it does not fail the publish
    uploadDictionaryToStorage(dictionary).catch(console.error);
    // TODO: make this async and run after publish response
    doHunspell(entries).catch(console.error);
    return { ok: true, info };
}

/**
 * Makes the Hunspell files out of the dictionary entries and uploads them.
 *
 * @param entries - the dictionary entries the word list is generated from
 * @throws Error carrying the JSON-serialized word list errors when the
 * word list could not be generated
 */
async function doHunspell(entries: T.DictionaryEntry[]) {
    const result = getWordList(entries);
    if (!result.ok) {
        const details = JSON.stringify(result.errors);
        throw new Error(details);
    }
    await uploadHunspellToStorage(makeHunspell(result.wordlist));
}

/**
 * Loads all the rows of the first sheet of the dictionary spreadsheet and
 * turns them into dictionary entries.
 *
 * Duplicate rows, which are sometimes annoyingly created by the GoogleSheets
 * API when entries are added programmatically, get deleted from the sheet
 * along the way.
 *
 * The spreadsheet id and the service account credentials are read from the
 * functions config (`sheet.id`, `serviceacct.email`, `serviceacct.key`).
 *
 * @returns the entries produced by makeEntries from the rows of the sheet
 */
async function getRawEntries(): Promise<T.DictionaryEntry[]> {
    const config = functions.config();
    const spreadsheet = new GoogleSpreadsheet(config.sheet.id);
    await spreadsheet.useServiceAccountAuth({
        client_email: config.serviceacct.email,
        private_key: config.serviceacct.key,
    });
    await spreadsheet.loadInfo();
    const firstSheet = spreadsheet.sheetsByIndex[0];
    const rows = await firstSheet.getRows();
    // makeEntries decides which rows are duplicates; deleting is done through the row objects
    const removeRow = async (r: number): Promise<void> => {
        await rows[r].delete();
    };
    return makeEntries(rows, removeRow);
}

/**
 * Converts raw spreadsheet rows into standardized dictionary entries, sorted
 * by their Pashto text (`p`) and indexed (`i`) in that order.
 *
 * A row that has the same `ts` as the row right after it is treated as a
 * duplicate created by the sheets api: `deleteRow` is called and the row is
 * left out of the result, so only one of the two rows becomes an entry.
 *
 * For every other row the required fields (`ts`, `p`, `f`, `e`) are copied,
 * `g` is made from `f` with simplifyPhonetics, and each optional field is
 * added only when its cell is not empty: number fields are parsed as
 * integers, text fields are trimmed and boolean fields are set to `true`.
 *
 * @param rows - the raw rows, read by column name
 * @param deleteRow - called when a duplicate is found, with the 1-based
 * position of the current row minus the number of rows deleted so far
 * @returns the sorted and indexed entries
 * @throws Error if two adjacent rows share a `ts` but have a different `p`
 */
async function makeEntries(rows: any[], deleteRow: (r: number) => Promise<void>): Promise<T.DictionaryEntry[]> {
    const entries: T.DictionaryEntry[] = [];
    let sheetIndex = 0;
    for (let i = 0; i < rows.length; i++) {
        sheetIndex++;
        const row = rows[i];
        const nextRow = rows[i + 1];
        // Only compare with a row that really exists. Without this guard a last
        // row with an empty ts "matches" the missing next row
        // (undefined === undefined) and reading nextRow.p throws a TypeError
        if (nextRow && row.ts === nextRow.ts) {
            if (row.p !== nextRow.p) throw new Error(`ts ${row.ts} is a duplicate of a different entry`);
            // this looks like a duplicate entry made by the sheets api
            // delete it and keep going
            await deleteRow(sheetIndex);
            sheetIndex--;
            continue;
        }
        const e: T.DictionaryEntry = {
            // placeholder, the real index is assigned after sorting
            i: 1,
            ts: parseInt(row.ts, 10),
            p: row.p,
            f: row.f,
            g: simplifyPhonetics(row.f),
            e: row.e,
        };
        dictionaryEntryNumberFields.forEach((field: T.DictionaryEntryNumberField) => {
            if (row[field]) e[field] = parseInt(row[field], 10);
        });
        dictionaryEntryTextFields.forEach((field: T.DictionaryEntryTextField) => {
            if (row[field]) e[field] = row[field].trim();
        });
        dictionaryEntryBooleanFields.forEach((field: T.DictionaryEntryBooleanField) => {
            if (row[field]) e[field] = true;
        });
        entries.push(standardizeEntry(e));
    }
    // sort alphabetically by the Pashto text
    entries.sort((a, b) => a.p.localeCompare(b.p, "ps"));
    const entriesLength = entries.length;
    // add index (position in the alphabetical order)
    for (let i = 0; i < entriesLength; i++) {
        entries[i].i = i;
    }
    return entries;
}

/**
 * Validates every entry and collects the errors that were found, in the
 * order of the entries.
 *
 * An entry whose validation response carries errors is reported with that
 * response. An entry whose response asks for a complement check must link
 * (through `l`) to the `ts` of an entry in the dictionary whose `c` includes
 * "n.", "adj." or "adv."; otherwise an error on field `l` is reported.
 *
 * @param entries - all the entries of the dictionary
 * @returns the errors found; empty when every entry is valid
 */
function checkForErrors(entries: T.DictionaryEntry[]): T.DictionaryEntryError[] {
    // ts -> first entry with that ts. Built once, on the first complement
    // lookup, instead of searching the whole dictionary for every link
    let entriesByTs: Map<number, T.DictionaryEntry> | undefined;
    function findByTs(ts: number | undefined): T.DictionaryEntry | undefined {
        // NaN never equals any ts (but a Map lookup would match a NaN key)
        if (ts === undefined || (typeof ts === "number" && Number.isNaN(ts))) return undefined;
        let byTs = entriesByTs;
        if (!byTs) {
            byTs = new Map<number, T.DictionaryEntry>();
            for (const candidate of entries) {
                // keep the first entry when a ts occurs more than once
                if (!byTs.has(candidate.ts)) byTs.set(candidate.ts, candidate);
            }
            entriesByTs = byTs;
        }
        return byTs.get(ts);
    }
    // makes the error reported for a bad complement link of an entry
    function complementError(entry: T.DictionaryEntry, message: string): T.DictionaryEntryError {
        return {
            errors: [message],
            ts: entry.ts,
            p: entry.p,
            f: entry.f,
            e: entry.e,
            erroneousFields: ["l"],
        };
    }
    const errors: T.DictionaryEntryError[] = [];
    for (const entry of entries) {
        const response = validateEntry(entry);
        if ("errors" in response && response.errors.length) {
            errors.push(response);
            continue;
        }
        if ("checkComplement" in response) {
            const complement = findByTs(entry.l);
            if (!complement) {
                errors.push(complementError(entry, "complement link not found in dictonary"));
                continue;
            }
            if (!complement.c?.includes("n.") && !complement.c?.includes("adj.") && !complement.c?.includes("adv.")) {
                errors.push(complementError(entry, "complement link to invalid complement"));
            }
        }
    }
    return errors;
}

// function findDuplicates(entries: T.DictionaryEntry[]): T.DictionaryEntry[] {
//     const tsSoFar = new Set();
//     const duplicates: T.DictionaryEntry[] = [];
//     // tslint:disable-next-line: prefer-for-of
//     for (let i = 0; i < entries.length; i++) {
//         const ts = entries[i].ts;
//         if (tsSoFar.has(ts)) {
//             duplicates.push(entries[i]);
//         }
//         tsSoFar.add(ts);
//     }
//     return duplicates;
// }

/**
 * Saves content as a file in the bucket, publicly readable and with
 * "no-cache" cache control.
 *
 * String content is saved with the gzip option on, as JSON when the filename
 * ends in ".json" and as UTF-8 plain text otherwise. Buffer content is saved
 * without gzip as "application/octet-stream".
 *
 * @param content - what to put in the file
 * @param filename - name of the file in the bucket
 */
async function upload(content: Buffer | string, filename: string) {
    const isText = typeof content === "string";
    let contentType = "application/octet-stream";
    if (isText) {
        contentType = filename.endsWith(".json")
            ? "application/json"
            : "text/plain; charset=UTF-8";
    }
    const target = storage.bucket(bucketName).file(filename);
    await target.save(content, {
        gzip: isText,
        predefinedAcl: "publicRead",
        metadata: {
            contentType,
            cacheControl: "no-cache",
        },
    });
}

/**
 * Uploads the two Hunspell files (affix file and dictionary file) at the
 * same time and resolves once both uploads are done.
 *
 * @param wordlist - the content of the .aff and of the .dic file
 */
async function uploadHunspellToStorage(wordlist: {
    affContent: string,
    dicContent: string,
}) {
    const { affContent, dicContent } = wordlist;
    const affUpload = upload(affContent, hunspellAffFileFilename);
    const dicUpload = upload(dicContent, hunspellDicFileFilename);
    await Promise.all([affUpload, dicUpload]);
}

/**
 * Uploads the dictionary and its info, each of them both as JSON (filename
 * with ".json" appended) and in the format made by writeDictionary /
 * writeDictionaryInfo. All four uploads run at the same time.
 *
 * @param dictionary - the dictionary to upload
 */
async function uploadDictionaryToStorage(dictionary: T.Dictionary) {
    const binaryDictionary = writeDictionary(dictionary) as Buffer;
    const binaryInfo = writeDictionaryInfo(dictionary.info) as Buffer;
    const files: [Buffer | string, string][] = [
        [JSON.stringify(dictionary), `${dictionaryFilename}.json`],
        [JSON.stringify(dictionary.info), `${dictionaryInfoFilename}.json`],
        [binaryDictionary, dictionaryFilename],
        [binaryInfo, dictionaryInfoFilename],
    ];
    await Promise.all(files.map(([content, filename]) => upload(content, filename)));
}

/**
 * Makes the content of the two Hunspell files for a word list.
 *
 * @param wordlist - the words to put in the dictionary file
 * @returns `dicContent`: the number of words on the first line followed by
 * one word per line (every line ends with a newline), and `affContent`: the
 * fixed affix file content
 */
function makeHunspell(wordlist: string[]) {
    const countLine = wordlist.length + "\n";
    const wordLines = wordlist.map((word) => word + "\n").join("");
    const affContent = "SET UTF-8\nCOMPLEXPREFIXES\nIGNORE ۱۲۳۴۵۶۷۸۹۰-=ًٌٍَُِّْ؛:؟.،,،؟\n";
    return {
        dicContent: countLine + wordLines,
        affContent,
    };
}