add script to phonetics conversion logic

2023-01-29 22:56:36 +05:00 · 2023-01-29 22:56:36 +05:00 · 0c1f6f56d0
parent a1a6194717
commit 0c1f6f56d0
5 changed files with 453 additions and 6 deletions
--- a/account/src/lib/dictionary.ts
+++ b/account/src/lib/dictionary.ts
@ -33,6 +33,7 @@ async function fetchDictionary(): Promise<T.Dictionary> {
 }

 async function fetchAllWords(): Promise<T.AllWordsWithInflections> {
+    // TODO: this is really ugly - the URL is built by chopping the last 4 characters off
+    // LINGDOCS_DICTIONARY_URL and appending "all-words.json"; if the variable is unset the
+    // optional chain yields undefined and the request goes to "undefinedall-words.json"
    const res = await fetch(process.env.LINGDOCS_DICTIONARY_URL?.slice(0, -4) + "all-words.json");
    return await res.json();
 }
--- a/account/src/lib/handle-unmatched.ts
+++ b/account/src/lib/handle-unmatched.ts
@ -0,0 +1,271 @@
+// Matches each of the digits ۰-۹ (the keys of numsTable).
+const arabicNumsRegex = /[۰-۹]/g;
+// Alternation of the letters and letter sequences that handleUnmatched looks up in pTable.
+// Multi-character alternatives are listed before the single letters they start with
+// (e.g. "ئی" before "ئ", "مب" before "م"), so the longer sequence is matched first.
+// NOTE(review): "ښک" appears twice, and some pTable keys (e.g. "ة", "اه", "ای", "وی") are
+// absent here, so handleUnmatched can never select those table entries - confirm intent.
+const pRegex = /اً|أ|ا|آ|ٱ|ٲ|ٳ|ئی|ئي|ئے|یٰ|ی|ي|ې|ۍ|ئ|ے|س|ص|ث|څ|ج|چ|هٔ|ه|ۀ|غز|زغ|کش|شک|ښک|ښک|پښ|ښپ|ہ|ع|و|ؤ|ښ|غ|خ|ح|ش|ز|ض|ذ|ځ|ظ|ژ|ر|ړ|ڑ|ت|ټ|ٹ|ط|د|ډ|ڈ|مب|م|نب|ن|ڼ|ک|ګ|گ|ل|ق|ږ|ب|پ|ف/g;
+// unused earlier draft: [\u0621-\u065f\u0670-\u06d3\u06d5]/g;
+/**
+ * Letter-to-sound table used by handleUnmatched to guess phonetics for words that were
+ * not found in the dictionary.
+ *
+ * Each entry lists the script strings it applies to (`chars`) and either a single
+ * `sound`, or separate `beg` / `mid` / `end` sounds selected by where the match sits in
+ * the word. A sound containing "/" is a list of alternatives; handleUnmatched wraps
+ * those in parentheses.
+ */
+const pTable: ({
+    chars: string[],
+    beg: string,
+    mid: string,
+    end: string,
+} | {
+    chars: string[],
+    sound: string,
+})[] = [
+    // previously one two-character key "ءع", which could never equal a single-letter match
+    { chars: ["ء", "ع"], sound: "" },
+    { chars: ["آ"], sound: "a" },
+    { chars: ["أ"], sound: "U" },
+    { chars: ["ؤ"], sound: "o/w" },
+    { chars: ["إ"], sound: "i" },
+    { chars: ["ئ"], beg: "y", mid: "y", end: "eyy" },
+    { chars: ["ا"], beg: "aa/a/i/u/U", mid: "aa", end: "aa" },
+    { chars: ["ب"], sound: "b" },
+    { chars: ["ة"], sound: "a/u" },
+    { chars: ["ت", "ط"], sound: "t" },
+    { chars: ["ټ"], sound: "T" },
+    { chars: ["ث", "س", "ص"], sound: "s" },
+    { chars: ["ج"], sound: "j" },
+    { chars: ["ح"], sound: "h" },
+    { chars: ["اه"], sound: "aah" },
+    { chars: ["ه"], beg: "h", mid: "h", end: "a/i/u/h" },
+    { chars: ["خ"], sound: "kh" },
+    { chars: ["د"], sound: "d" },
+    { chars: ["ذ", "ز", "ض", "ظ"], sound: "z" },
+    { chars: ["ډ"], sound: "D" },
+    { chars: ["ر"], sound: "r" },
+    { chars: ["ړ"], sound: "R" },
+    { chars: ["ش"], sound: "sh" },
+    { chars: ["غ"], sound: "gh" },
+    { chars: ["ف"], sound: "f" },
+    { chars: ["ق"], sound: "q" },
+    { chars: ["ك", "ک"], sound: "k" },
+    { chars: ["ل"], sound: "l" },
+    { chars: ["م"], sound: "m" },
+    { chars: ["ن"], sound: "n" },
+    { chars: ["ڼ"], sound: "N" },
+    { chars: ["و"], beg: "w", mid: "w/o/oo", end: "w/o/oo" },
+    { chars: ["ای"], sound: "aay" },
+    { chars: ["وی"], sound: "ooy" },
+    { chars: ["ی", "ے"], beg: "y", mid: "ey/ee/y", end: "ey" },
+    { chars: ["ي"], beg: "y", mid: "ey/ee/y", end: "ee" },
+    { chars: ["اً"], sound: "an" },
+    { chars: ["ځ"], sound: "dz" },
+    { chars: ["څ"], sound: "ts" },
+    { chars: ["چ"], sound: "ch" },
+    { chars: ["ږ"], sound: "G" },
+    { chars: ["ژ"], sound: "jz" },
+    { chars: ["ښ"], sound: "x" },
+    { chars: ["ۍ"], sound: "uy" },
+    { chars: ["ې"], sound: "e" },
+    { chars: ["ګ", "گ"], sound: "g" },
+    { chars: ["یٰ"], sound: "aa" },
+];
+
+
+//     "ء": "",
+//     "آ": "",
+//     "أ": "",
+//     "ؤ": "",
+//     "إ": "",
+//     "ئ": "",
+//     "ا": "",
+//     "": "",
+//     "": "",
+//     "": "",
+//     "": "",
+//     "": "",
+//     "": "",
+//     "": "",
+//     "": "",
+//     "": "",
+// }
+/** Maps each digit matched by arabicNumsRegex to its ASCII equivalent. */
+const numsTable: Record<string, string> = {
+    "۰": "0",
+    "۱": "1",
+    "۲": "2",
+    "۳": "3",
+    "۴": "4",
+    "۵": "5",
+    "۶": "6",
+    "۷": "7",
+    "۸": "8",
+    "۹": "9",
+};
+
+
+/**
+ * Replaces Pashto punctuation and digits with Latin equivalents: "؟" becomes "?",
+ * "،" becomes ",", both "«" and "»" become a double quote, and the digits ۰-۹ become 0-9.
+ * All other characters are left untouched.
+ *
+ * @param s - the text to convert
+ * @returns the text with punctuation and digits replaced
+ */
+export function handlePunctuationAndNums(s: string): string {
+    return s.replace(/؟/g, "?")
+        .replace(/،/g, ",")
+        .replace(/«/g, '"')
+        .replace(/»/g, '"')
+        // numsTable is typed as a string map, so no @ts-ignore is needed for the lookup;
+        // a digit without an entry is kept as-is rather than turned into "undefined"
+        .replace(arabicNumsRegex, (digit) => numsTable[digit] ?? digit);
+}
+
+/**
+ * Builds a best-guess transliteration for a word that has no dictionary match.
+ *
+ * Every pRegex match in the word is replaced by its pTable sound; sounds that offer
+ * alternatives (containing "/") are wrapped in parentheses. Characters pRegex does not
+ * match are left as they are. The result is wrapped in "?*" ... "*?" to flag it as a guess.
+ *
+ * @param s - a single word in Pashto script
+ * @returns the flagged phonetic guess
+ */
+export function handleUnmatched(s: string): string {
+    type Pos = "beg" | "mid" | "end";
+    // Position within the word of a span starting at `start` that is `len` characters long.
+    // Using start + len (not just start) keeps multi-character matches at the end of the
+    // word classified as "end".
+    const posOf = (start: number, len: number): Pos => {
+        if (start === 0) return "beg";
+        return start + len === s.length ? "end" : "mid";
+    };
+    // Sound for one pTable key, or undefined when the table has no entry for it.
+    const lookup = (chars: string, pos: Pos): string | undefined => {
+        const m = pTable.find((x) => x.chars.includes(chars));
+        if (!m) return undefined;
+        const sound = "sound" in m ? m.sound : m[pos];
+        return sound.includes("/") ? `(${sound})` : sound;
+    };
+    const g = s.replace(pRegex, (mtch: string, i: number) => {
+        const whole = lookup(mtch, posOf(i, mtch.length));
+        if (whole !== undefined) return whole;
+        // pRegex matches some sequences that have no pTable entry of their own; rather
+        // than dropping those letters, transliterate the sequence letter by letter
+        // (letters with no entry still contribute nothing)
+        return mtch
+            .split("")
+            .map((c, j) => lookup(c, posOf(i + j, 1)) || "")
+            .join("");
+    });
+    return `?*${g}*?`;
+}
--- a/account/src/lib/scriptToPhonetics.ts
+++ b/account/src/lib/scriptToPhonetics.ts
@ -0,0 +1,173 @@
+import {
+    Types as T,
+    standardizePashto,
+    removeAccents,
+} from "@lingdocs/inflect";
+import { findInAllWords } from "./dictionary";
+import {
+    handlePunctuationAndNums,
+    handleUnmatched,
+} from "./handle-unmatched";
+
+
+// TODO: handle و ارزي
+// spacing error with کور کې چېرته اوسېږئ
+
+/**
+ * Tells whether the string contains at least one character in the ranges
+ * U+0621-U+065F, U+0670-U+06D3 or U+06D5.
+ */
+function isP(c: string): boolean {
+    const pashtoChar = /[\u0621-\u065f\u0670-\u06d3\u06d5]/;
+    return pashtoChar.test(c);
+}
+
+// TODO: ERRORING WHEN YOU JUST PUT A BUNCH OF ENGLISH CHARS IN THE TEXT
+
+/**
+ * Converts Pashto text to phonetics by looking up each word in the dictionary and
+ * using the phonetic form(s) found there.
+ *
+ * The standardized text is split into tokens by splitWords and processed in order:
+ *  - Pashto tokens with dictionary matches become the matching phonetics, with several
+ *    possibilities joined by "/"
+ *  - Pashto tokens without a match are recorded in `missing` and replaced by the guess
+ *    from handleUnmatched
+ *  - all other tokens go through handlePunctuationAndNums
+ *
+ * @param p - the Pashto text to convert
+ * @param accents - when false, removeAccents is applied to the dictionary phonetics
+ * @returns the converted text and the distinct Pashto words that had no dictionary match
+ * @throws Error("not initialized") when findInAllWords returns a falsy value
+ */
+export function scriptToPhonetics(p: string, accents: boolean): {
+    phonetics: string,
+    missing: string[],
+} {
+    const words = splitWords(standardizePashto(p));
+    if (!words.length) return {
+        phonetics: "",
+        missing: [],
+    }
+    // TODO: keep going with the hyphens etc
+    // also و ارزي
+    const converted: string[] = [];
+    const missing = new Set<string>();
+    // index of the token currently being converted; advanced by the helpers below
+    let i = 0;
+    function handleAccents(f: string): string {
+        return accents ? f : removeAccents(f);
+    }
+    /**
+     * Checks whether the tokens following words[i] spell out every hyphen part of psw.
+     * `words` in the result is the number of tokens consumed, counting words[i] itself.
+     */
+    function checkHyphenMatch(psw: T.PsWord): {
+        match: boolean,
+        words: number,
+        f: string,
+    } {
+        if (!psw.hyphen) {
+            throw new Error("checking a match without a hyphen content");
+        }
+        let match = false;
+        let f = psw.f;
+        let k = 1;
+        for (let j = 0; j < psw.hyphen.length; j++) {
+            const h = psw.hyphen[j];
+            const w = words[i+k];
+            if (h.type === "unwritten" && w === " ") {
+                // an unwritten part is represented by a single space token
+                match = true;
+                f += `-${h.f}`;
+                k += 1;
+            } else if (h.type === "written" && w === h.p) {
+                // written part directly follows
+                match = true;
+                f += `-${h.f}`;
+                k += 1;
+            } else if (h.type === "written" && w === " " && words[i+1+k] === h.p) {
+                // written part follows after a single space token
+                match = true;
+                f += `-${h.f}`;
+                k += 2;
+            } else {
+                match = false;
+                break;
+            }
+        }
+        return {
+            match,
+            f,
+            words: k,
+        }
+    }
+    /**
+     * Picks the phonetics for the token at words[i] and advances i past everything used.
+     * A hyphenated entry whose parts are all present in the text wins outright; otherwise
+     * the plain (non-hyphenated) entries are offered as alternatives.
+     */
+    function handleMatches(matches: T.PsWord[]): string[] {
+        const hyphens = matches.filter(x => x.hyphen);
+        const plain = matches.filter(x => !x.hyphen);
+        for (const h of hyphens) {
+            const res = checkHyphenMatch(h);
+            if (res.match) {
+                // res.words already counts the current token, so no extra increment
+                i += res.words;
+                return [handleAccents(res.f)];
+            }
+        }
+        i++;
+        if (hyphens.length && !plain.length) {
+            // only hyphenated entries exist and none of them fits the following text
+            return ["ERR"];
+        }
+        // the Set drops duplicate phonetics while keeping first-seen order
+        return Array.from(new Set(plain.map((x) => handleAccents(x.f))));
+    }
+    while (i < words.length) {
+        const word = words[i];
+        if (isP(word)) {
+            const matches = findInAllWords(possibleFuzzify(word));
+            if (!matches) {
+                throw new Error("not initialized");
+            }
+            if (matches.length > 0) {
+                const possibilities = handleMatches(matches);
+                converted.push(possibilities.join("/"));
+            } else {
+                missing.add(word);
+                converted.push(handleUnmatched(word));
+                i++;
+            }
+        } else {
+            converted.push(handlePunctuationAndNums(word));
+            i++;
+        }
+    }
+    return {
+        phonetics: converted.join(""),
+        missing: Array.from(missing),
+    };
+}
+
+/**
+ * Splits text into consecutive runs, alternating between runs of characters for which
+ * isP is true and runs of all other characters (spaces, punctuation, digits, Latin text).
+ * Joining the returned tokens gives back the original text; no empty tokens are returned,
+ * so empty input yields an empty array.
+ *
+ * @param p - the text to split
+ * @returns the runs in their original order
+ */
+function splitWords(p: string): string[] {
+    const words: string[] = [];
+    let current = "";
+    let onP = true;
+    for (const char of p.split("")) {
+        if (isP(char) === onP) {
+            // same kind of character as the current run
+            current += char;
+            continue;
+        }
+        // the kind of character flipped: close the current run; it is only empty when
+        // the text starts with a non-Pashto character, in which case nothing is emitted
+        if (current) {
+            words.push(current);
+        }
+        current = char;
+        onP = !onP;
+    }
+    if (current) {
+        words.push(current);
+    }
+    return words;
+}
+
+/**
+ * Builds the dictionary search term for a word. Words shorter than 3 characters, or with
+ * neither "ی" nor "ې" between their first and last character, are returned unchanged.
+ * Otherwise an anchored RegExp is returned in which every inner "ی" / "ې" matches either
+ * of the two letters; the first and last characters are kept literal.
+ *
+ * NOTE(review): the word is interpolated into the pattern unescaped - this assumes it is a
+ * run of Pashto letters only, as produced by the visible call site; confirm for new callers.
+ *
+ * @param s - the word to look up
+ * @returns the word itself, or a RegExp that tolerates ی / ې variation inside the word
+ */
+function possibleFuzzify(s: string): string | RegExp {
+    if (s.length < 3) {
+        return s;
+    }
+    const middle = s.slice(1, -1);
+    if (!/[یې]/.test(middle)) {
+        return s;
+    }
+    // inside a character class "|" is a literal, not an alternation, so it is left out
+    const fuzzyMiddle = middle.replace(/[یې]/g, "[ېی]");
+    return new RegExp(`^${s[0]}${fuzzyMiddle}${s.slice(-1)}$`);
+}
--- a/account/src/routers/dictionary-router.ts
+++ b/account/src/routers/dictionary-router.ts
@ -6,6 +6,7 @@ import {
    getEntries,
    updateDictionary,    
 } from "../lib/dictionary";
+import { scriptToPhonetics } from "../lib/scriptToPhonetics";

 const dictionaryRouter = express.Router();

@ -14,15 +15,16 @@ dictionaryRouter.post("/update", async (req, res, next) => {
    res.send({ ok: true, result });
 });

-dictionaryRouter.post("/all-words", async (req, res, next) => {
+// POST /script-to-phonetics
+// Body: { text: string, accents: boolean }. Responds with the { phonetics, missing }
+// object returned by scriptToPhonetics, or with 400 when the body is malformed.
+dictionaryRouter.post("/script-to-phonetics", async (req, res, next) => {
    if (!allWordsCollection) {
        return res.send({ ok: false, message: "allWords not ready" });
    }
-    const word = req.body.word as string;
-    if (!word) {
+    const text = req.body.text as unknown;
+    const accents = req.body.accents as unknown;
+    if (!text || typeof text !== "string" || typeof accents !== "boolean") {
        return res.status(400).send({ ok: false, error: "invalid query" });
    }
-    const results = await findInAllWords(word);
+    let results: ReturnType<typeof scriptToPhonetics>;
+    try {
+        // scriptToPhonetics is synchronous and can throw; forward the error to the
+        // error-handling middleware instead of leaving a rejected promise unhandled
+        results = scriptToPhonetics(text, accents);
+    } catch (e) {
+        return next(e);
+    }
    res.send(results);
 })

--- a/account/test.http
+++ b/account/test.http
@ -1,6 +1,6 @@
-POST https://account.lingdocs.com/dictionary/entries HTTP/1.1
+POST https://account.lingdocs.com/dictionary/all-words HTTP/1.1
 content-type: application/json

 {
-    "ids": ["لیدل", 1527815306]
+    "word": "کور"
 }