add script to phonetics conversion logic

2023-01-29 22:56:36 +05:00 · 2023-01-29 22:56:36 +05:00 · 0c1f6f56d0
parent a1a6194717
commit 0c1f6f56d0
5 changed files with 453 additions and 6 deletions
--- a/account/src/lib/dictionary.ts
+++ b/account/src/lib/dictionary.ts
@ -33,6 +33,7 @@ async function fetchDictionary(): Promise<T.Dictionary> {
 }
 /**
  * Downloads the "all words" JSON that is published next to the dictionary file.
  *
  * The URL is derived from the `LINGDOCS_DICTIONARY_URL` environment variable by dropping
  * its last four characters and appending `all-words.json`.
  *
  * @returns the parsed JSON body of the response
  * @throws if `LINGDOCS_DICTIONARY_URL` is not set, or if the server answers with a non-2xx status
  */
 async function fetchAllWords(): Promise<T.AllWordsWithInflections> {
    const dictionaryUrl = process.env.LINGDOCS_DICTIONARY_URL;
    if (!dictionaryUrl) {
        // Without this guard the request would go to the literal string "undefinedall-words.json"
        throw new Error("LINGDOCS_DICTIONARY_URL is not set");
    }
    // TODO: this is really ugly
    // NOTE(review): presumably the last four characters are the dictionary's own file name — confirm
    const allWordsUrl = dictionaryUrl.slice(0, -4) + "all-words.json";
    const res = await fetch(allWordsUrl);
    if (!res.ok) {
        // Don't try to parse an error page as the word list
        throw new Error(`failed to fetch all-words.json (status ${res.status})`);
    }
    return await res.json();
 }
--- a/account/src/lib/handle-unmatched.ts
+++ b/account/src/lib/handle-unmatched.ts
@ -0,0 +1,271 @@
 // Matches every digit in the range ۰-۹ (global, so all occurrences in a string are replaced).
 const arabicNumsRegex = /[۰-۹]/g;
 // Matches the letters (and selected multi-character sequences) that handleUnmatched transliterates.
 // Multi-character alternatives are listed before the single letters they start with,
 // so the longer sequence wins when both could match at the same position.
 const pRegex = /اً|أ|ا|آ|ٱ|ٲ|ٳ|ئی|ئي|ئے|یٰ|ی|ي|ې|ۍ|ئ|ے|س|ص|ث|څ|ج|چ|هٔ|ه|ۀ|غز|زغ|کش|شک|ښک|ښک|پښ|ښپ|ہ|ع|و|ؤ|ښ|غ|خ|ح|ش|ز|ض|ذ|ځ|ظ|ژ|ر|ړ|ڑ|ت|ټ|ٹ|ط|د|ډ|ڈ|مب|م|نب|ن|ڼ|ک|ګ|گ|ل|ق|ږ|ب|پ|ف/g;
 // [\u0621-\u065f\u0670-\u06d3\u06d5]/g;
 /**
  * Letter-to-sound table used to guess phonetics for words that are not in the dictionary.
  *
  * Each entry lists the script sequences (`chars`) it applies to and either a single `sound`
  * or position-dependent sounds (`beg` / `mid` / `end`). A sound containing "/" lists
  * alternatives. Lookup is by exact string equality against an element of `chars`, so every
  * letter must be its own array element.
  */
 const pTable: ({
    chars: string[],
    beg: string,
    mid: string,
    end: string,
 } | {
    chars: string[],
    sound: string,
 })[] = [
    {
        // two separate letters (previously a single two-character string, which could never match)
        chars: ["ء", "ع"],
        sound: "",
    },
    {
        chars: ["آ"],
        sound: "a",
    },
    {
        chars: ["أ"],
        sound: "U",
    },
    {
        chars: ["ؤ"],
        sound: "o/w",
    },
    {
        chars: ["إ"],
        sound: "i",
    },
    {
        chars: ["ئ"],
        beg: "y",
        mid: "y",
        end: "eyy",
    },
    {
        chars: ["ا"],
        beg: "aa/a/i/u/U",
        mid: "aa",
        end: "aa",
    },
    {
        chars: ["ب"],
        sound: "b",
    },
    {
        // was missing, so this letter was silently removed from the output
        chars: ["پ"],
        sound: "p",
    },
    {
        chars: ["ة"],
        sound: "a/u",
    },
    {
        chars: ["ت", "ط"],
        sound: "t",
    },
    {
        chars: ["ټ"],
        sound: "T",
    },
    {
        chars: ["ث", "س", "ص"],
        sound: "s",
    },
    {
        chars: ["ج"],
        sound: "j",
    },
    {
        chars: ["ح"],
        sound: "h",
    },
    {
        chars: ["اه"],
        sound: "aah",
    },
    {
        chars: ["ه"],
        beg: "h",
        mid: "h",
        end: "a/i/u/h",
    },
    {
        chars: ["خ"],
        sound: "kh",
    },
    {
        chars: ["د"],
        sound: "d",
    },
    {
        chars: ["ذ", "ز", "ض", "ظ"],
        sound: "z",
    },
    {
        chars: ["ډ"],
        sound: "D",
    },
    {
        chars: ["ر"],
        sound: "r",
    },
    {
        chars: ["ړ"],
        sound: "R",
    },
    {
        chars: ["ش"],
        sound: "sh",
    },
    {
        chars: ["غ"],
        sound: "gh",
    },
    {
        chars: ["ف"],
        sound: "f",
    },
    {
        chars: ["ق"],
        sound: "q",
    },
    {
        chars: ["ك", "ک"],
        sound: "k",
    },
    {
        chars: ["ل"],
        sound: "l",
    },
    {
        chars: ["م"],
        sound: "m",
    },
    {
        chars: ["ن"],
        sound: "n",
    },
    {
        chars: ["ڼ"],
        sound: "N",
    },
    {
        chars: ["و"],
        beg: "w",
        mid: "w/o/oo",
        end: "w/o/oo",
    },
    {
        chars: ["ای"],
        sound: "aay",
    },
    {
        chars: ["وی"],
        sound: "ooy",
    },
    {
        chars: ["ی", "ے"],
        beg: "y",
        mid: "ey/ee/y",
        end: "ey",
    },
    {
        chars: ["ي"],
        beg: "y",
        mid: "ey/ee/y",
        end: "ee",
    },
    {
        chars: ["اً"],
        sound: "an",
    },
    {
        chars: ["ځ"],
        sound: "dz",
    },
    {
        chars: ["څ"],
        sound: "ts",
    },
    {
        chars: ["چ"],
        sound: "ch",
    },
    {
        chars: ["ږ"],
        sound: "G",
    },
    {
        chars: ["ژ"],
        sound: "jz",
    },
    {
        chars: ["ښ"],
        sound: "x",
    },
    {
        chars: ["ۍ"],
        sound: "uy",
    },
    {
        chars: ["ې"],
        sound: "e",
    },
    {
        chars: ["ګ", "گ"],
        sound: "g",
    },
    {
        chars: ["یٰ"],
        sound: "aa",
    },
 ];
 //     "ء": "",
 //     "آ": "",
 //     "أ": "",
 //     "ؤ": "",
 //     "إ": "",
 //     "ئ": "",
 //     "ا": "",
 //     "": "",
 //     "": "",
 //     "": "",
 //     "": "",
 //     "": "",
 //     "": "",
 //     "": "",
 //     "": "",
 //     "": "",
 // }
 // Maps each digit ۰-۹ to the corresponding ASCII digit; read by handlePunctuationAndNums.
 const numsTable = {
    "۰": "0",
    "۱": "1",
    "۲": "2",
    "۳": "3",
    "۴": "4",
    "۵": "5",
    "۶": "6",
    "۷": "7",
    "۸": "8",
    "۹": "9",
 };
 /**
  * Converts Arabic-script punctuation and digits to their ASCII counterparts.
  *
  * `؟` becomes `?`, `،` becomes `,`, both guillemets (`«` and `»`) become `"`, and every digit
  * matched by `arabicNumsRegex` is replaced through `numsTable`. All other characters are kept.
  *
  * @param s the text to convert
  * @returns the converted text
  */
 export function handlePunctuationAndNums(s: string): string {
    return s.replace(/؟/g, "?")
        .replace(/،/g, ",")
        .replace(/«/g, '"')
        .replace(/»/g, '"')
        // the regex only matches characters that are keys of numsTable, so the key cast is safe
        // (replaces a blanket @ts-ignore that hid every error on the line)
        .replace(arabicNumsRegex, (mtch) => numsTable[mtch as keyof typeof numsTable]);
 }
 /**
  * Produces a rough phonetic guess for a word, letter by letter.
  *
  * Every substring matched by `pRegex` is replaced with the sound listed for it in `pTable`;
  * entries with position-specific sounds are resolved by where the match sits in the word.
  * A sound that lists alternatives (contains "/") is wrapped in parentheses. A match that has
  * no entry in `pTable` is replaced with an empty string.
  *
  * NOTE(review): several `pRegex` alternatives (e.g. the two-letter sequences) have no `pTable`
  * entry and are therefore removed from the output — confirm whether that is intended.
  *
  * @param s the word to transliterate
  * @returns the guessed phonetics wrapped in `?*` … `*?`
  */
 export function handleUnmatched(s: string): string {
    const g = s.replace(pRegex, (mtch: string, i: number) => {
        // A match may span more than one character, so it is word-final when it *ends* at the
        // end of the string, not when it *starts* on the last index.
        const pos: "beg" | "mid" | "end" = i === 0
            ? "beg"
            : i + mtch.length === s.length
            ? "end"
            : "mid";
        const m = pTable.find(x => x.chars.includes(mtch));
        if (!m) return "";
        const sound = "sound" in m ? m.sound : m[pos];
        return sound.includes("/") ? `(${sound})` : sound;
    });
    return `?*${g}*?`;
 }
--- a/account/src/lib/scriptToPhonetics.ts
+++ b/account/src/lib/scriptToPhonetics.ts
@ -0,0 +1,173 @@
 import {
    Types as T,
    standardizePashto,
    removeAccents,
 } from "@lingdocs/inflect";
 import { findInAllWords } from "./dictionary";
 import {
    handlePunctuationAndNums,
    handleUnmatched,
 } from "./handle-unmatched";
 // TODO: handle و ارزي
 // spacing error with کور کې چېرته اوسېږئ
 /**
  * Reports whether the given string contains at least one character from the
  * code-point ranges U+0621–U+065F, U+0670–U+06D3 or U+06D5.
  */
 function isP(c: string): boolean {
    const pChar = /[\u0621-\u065f\u0670-\u06d3\u06d5]/;
    return pChar.test(c);
 }
 // TODO: fix the error that occurs when the text contains a run of English characters
 /**
 * Converts Pashto script to phonetics by looking up each word in the dictionary and
 * using the phonetic form(s) recorded for it.
 *
 * The text is standardized and split into alternating runs of Pashto and non-Pashto characters.
 * Pashto runs are looked up with `findInAllWords`; when several phonetic forms are found they
 * are joined with "/". Runs with no dictionary match are collected in `missing` and rendered
 * with `handleUnmatched`. Non-Pashto runs go through `handlePunctuationAndNums`.
 *
 * @param p the Pashto text to convert
 * @param accents whether to keep accents in the phonetics (they are removed when `false`)
 * @returns the converted phonetics and the distinct words that had no dictionary match
 * @throws if `findInAllWords` returns a falsy value ("not initialized")
 */
 export function scriptToPhonetics(p: string, accents: boolean): {
    phonetics: string,
    missing: string[],
 } {
    const words = splitWords(standardizePashto(p));
    if (!words.length) return {
        phonetics: "",
        missing: [],
    }
    // TODO: keep going with the hyphens etc
    // also و ارزي
    const converted: string[] = [];
    const missing = new Set<string>();
    // index of the token currently being converted; advanced by the helpers below
    let i = 0;
    function handleAccents(f: string): string {
        return accents ? f : removeAccents(f);
    }
    /**
     * Checks whether the tokens following the current one line up with the hyphenated
     * parts of a dictionary entry. `words` in the result counts the current token too.
     */
    function checkHyphenMatch(psw: T.PsWord): {
        match: boolean,
        words: number,
        f: string,
    } {
        if (!psw.hyphen) {
            throw new Error("checking a match without a hyphen content");
        }
        let match = false;
        let f = psw.f;
        let k = 1;
        for (let j = 0; j < psw.hyphen.length; j++) {
            const h = psw.hyphen[j];
            const w = words[i+k];
            if (h.type === "unwritten" && w === " ") {
                match = true;
                f += `-${h.f}`;
                k += 1;
            } else if (h.type === "written" && w === h.p) {
                match = true;
                f += `-${h.f}`;
                k += 1;
            } else if (h.type === "written" && w === " " && words[i+1+k] === h.p) {
                match = true;
                f += `-${h.f}`;
                k += 2;
            } else {
                match = false;
                break;
            }
        }
        return {
            match,
            f,
            words: k,
        }
    }
    /**
     * Picks the phonetic form(s) for the current token and advances `i` past every
     * token that was consumed.
     */
    function handleMatches(matches: T.PsWord[]): string[] {
        const hyphens = matches.filter(x => x.hyphen);
        const plain = matches.filter(x => !x.hyphen);
        for (const h of hyphens) {
            const res = checkHyphenMatch(h);
            if (res.match) {
                // res.words already includes the current token, so `i` must not be bumped
                // again here (doing so dropped the token after the hyphenated word)
                i += res.words;
                return [handleAccents(res.f)];
            }
        }
        // no hyphenated entry lined up with the following tokens: consume just this token
        i++;
        if (!plain.length) {
            // only hyphenated entries exist for this word and none of them fit
            return ["ERR"];
        }
        return Array.from(new Set(plain.map((x) => handleAccents(x.f))));
    }
    while (i < words.length) {
        const word = words[i];
        if (isP(word)) {
            const matches = findInAllWords(possibleFuzzify(word));
            if (!matches) {
                throw new Error("not initialized");
            }
            if (matches.length > 0) {
                const possibilities = handleMatches(matches);
                converted.push(possibilities.join("/"));
            } else {
                missing.add(word);
                converted.push(handleUnmatched(word));
                i++;
            }
        } else {
            converted.push(handlePunctuationAndNums(word));
            i++;
        }
    }
    return {
        phonetics: converted.join(""),
        missing: Array.from(missing),
    };
 }
 /**
  * Splits text into consecutive runs of Pashto and non-Pashto characters, in order.
  *
  * Each character is classified with `isP`; a new token starts whenever the classification
  * changes. Empty tokens are never emitted, so empty input yields an empty array and text
  * that starts with a non-Pashto character no longer gets a leading `""` token.
  *
  * @param p the text to split
  * @returns the runs, which concatenate back to the original text
  */
 function splitWords(p: string): string[] {
    const words: string[] = [];
    let current = "";
    let onP = true;
    for (const char of p) {
        if (isP(char) === onP) {
            current += char;
            continue;
        }
        // classification flipped: close the current run (if it has any content) and start a new one
        if (current) {
            words.push(current);
        }
        current = char;
        onP = !onP;
    }
    if (current) {
        words.push(current);
    }
    return words;
 }
 /**
  * Builds a lookup key that tolerates ی / ې being interchanged inside a word.
  *
  * Words shorter than three characters, or with neither letter between their first and last
  * character, are returned unchanged. Otherwise an anchored regular expression is returned in
  * which every interior ی or ې matches either letter; the first and last characters stay exact.
  *
  * @param s the word to look up
  * @returns the word itself, or a `RegExp` matching its ی/ې variants
  */
 function possibleFuzzify(s: string): string | RegExp {
    if (s.length < 3) {
        return s;
    }
    const middle = s.slice(1, -1);
    if (!middle.includes("ې") && !middle.includes("ی")) {
        return s;
    }
    // escape regex metacharacters so arbitrary input cannot break or alter the pattern
    const escapeRegExp = (x: string): string => x.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // inside a character class "|" is a literal, so the classes must list only the two letters
    const fuzzyMiddle = escapeRegExp(middle).replace(/[یې]/g, "[ېی]");
    return new RegExp(`^${escapeRegExp(s.slice(0, 1))}${fuzzyMiddle}${escapeRegExp(s.slice(-1))}$`);
 }
--- a/account/src/routers/dictionary-router.ts
+++ b/account/src/routers/dictionary-router.ts
@ -6,6 +6,7 @@ import {
    getEntries,
    updateDictionary,    
 } from "../lib/dictionary";
 import { scriptToPhonetics } from "../lib/scriptToPhonetics";
 const dictionaryRouter = express.Router();
@ -14,15 +15,16 @@ dictionaryRouter.post("/update", async (req, res, next) => {
    res.send({ ok: true, result });
 });
-dictionaryRouter.post("/all-words", async (req, res, next) => {
+dictionaryRouter.post("/script-to-phonetics", async (req, res, next) => {
    if (!allWordsCollection) {
        return res.send({ ok: false, message: "allWords not ready" });
    }
-    const word = req.body.word as string;
+    const text = req.body.text as unknown;
-    if (!word) {
+    const accents = req.body.accents as unknown;
    if (!text || typeof text !== "string" || typeof accents !== "boolean") {
        return res.status(400).send({ ok: false, error: "invalid query" });
    }
-    const results = await findInAllWords(word);
+    const results = await scriptToPhonetics(text, accents);
    res.send(results);
 })
--- a/account/test.http
+++ b/account/test.http
@ -1,6 +1,6 @@
-POST https://account.lingdocs.com/dictionary/entries HTTP/1.1
+POST https://account.lingdocs.com/dictionary/all-words HTTP/1.1
 content-type: application/json
 {
-    "ids": ["لیدل", 1527815306]
+    "word": "کور"
 }