diff --git a/account/src/lib/dictionary.ts b/account/src/lib/dictionary.ts
index ade8b8c..88fb521 100644
--- a/account/src/lib/dictionary.ts
+++ b/account/src/lib/dictionary.ts
@@ -33,6 +33,7 @@ async function fetchDictionary(): Promise {
 }
 
 async function fetchAllWords(): Promise {
+    // TODO: build this URL properly; chopping 4 chars off LINGDOCS_DICTIONARY_URL is fragile and gives "undefinedall-words.json" when the env var is unset
     const res = await fetch(process.env.LINGDOCS_DICTIONARY_URL?.slice(0, -4) + "all-words.json");
     return await res.json();
 }
diff --git a/account/src/lib/handle-unmatched.ts b/account/src/lib/handle-unmatched.ts
new file mode 100644
index 0000000..46b7866
--- /dev/null
+++ b/account/src/lib/handle-unmatched.ts
@@ -0,0 +1,298 @@
+/** Eastern Arabic-Indic digits (۰-۹), mapped to ASCII digits through numsTable. */
+const arabicNumsRegex = /[۰-۹]/g;
+// Tokenizer used by handleUnmatched. Alternatives are tried left to right, so
+// multi-letter sequences are listed before the single letters they begin with.
+// NOTE(review): `ښک` is listed twice - the second one may have been meant as `کښ`; confirm.
+const pRegex = /اً|أ|ا|آ|ٱ|ٲ|ٳ|ئی|ئي|ئے|یٰ|ی|ي|ې|ۍ|ئ|ے|س|ص|ث|څ|ج|چ|هٔ|ه|ۀ|غز|زغ|کش|شک|ښک|ښک|پښ|ښپ|ہ|ع|و|ؤ|ښ|غ|خ|ح|ش|ز|ض|ذ|ځ|ظ|ژ|ر|ړ|ڑ|ت|ټ|ٹ|ط|د|ډ|ڈ|مب|م|نب|ن|ڼ|ک|ګ|گ|ل|ق|ږ|ب|پ|ف/g;
+// [\u0621-\u065f\u0670-\u06d3\u06d5]/g;
+/**
+ * Letter-to-sound table used for the rough transliteration done by handleUnmatched.
+ * An entry either has one `sound` for every position, or separate sounds for the
+ * beginning, middle, and end of a word. Alternative sounds are separated by "/".
+ */
+const pTable: ({
+    chars: string[],
+    beg: string,
+    mid: string,
+    end: string,
+} | {
+    chars: string[],
+    sound: string,
+})[] = [
+    {
+        chars: ["ء", "ع"],
+        sound: "",
+    },
+    {
+        chars: ["آ"],
+        sound: "a",
+    },
+    {
+        chars: ["أ"],
+        sound: "U",
+    },
+    {
+        chars: ["ؤ"],
+        sound: "o/w",
+    },
+    {
+        chars: ["إ"],
+        sound: "i",
+    },
+    {
+        chars: ["ئ"],
+        beg: "y",
+        mid: "y",
+        end: "eyy",
+    },
+    {
+        chars: ["ا"],
+        beg: "aa/a/i/u/U",
+        mid: "aa",
+        end: "aa",
+    },
+    {
+        chars: ["ب"],
+        sound: "b",
+    },
+    {
+        chars: ["ة"],
+        sound: "a/u",
+    },
+    {
+        chars: ["ت", "ط"],
+        sound: "t",
+    },
+    {
+        chars: ["ټ"],
+        sound: "T",
+    },
+    {
+        chars: ["ث", "س", "ص"],
+        sound: "s",
+    },
+    {
+        chars: ["ج"],
+        sound: "j",
+    },
+    {
+        chars: ["ح"],
+        sound: "h",
+    },
+    {
+        chars: ["اه"],
+        sound: "aah",
+    },
+    {
+        chars: ["ه"],
+        beg: "h",
+        mid: "h",
+        end: "a/i/u/h",
+    },
+    {
+        chars: ["خ"],
+        sound: "kh",
+    },
+    {
+        chars: ["د"],
+        sound: "d",
+    },
+    {
+        chars: ["ذ", "ز", "ض", "ظ"],
+        sound: "z",
+    },
+    {
+        chars: ["ډ"],
+        sound: "D",
+    },
+    {
+        chars: ["ر"],
+        sound: "r",
+    },
+    {
+        chars: ["ړ"],
+        sound: "R",
+    },
+    {
+        chars: ["ش"],
+        sound: "sh",
+    },
+    {
+        chars: ["غ"],
+        sound: "gh",
+    },
+    {
+        chars: ["ف"],
+        sound: "f",
+    },
+    {
+        chars: ["ق"],
+        sound: "q",
+    },
+    {
+        chars: ["ك", "ک"],
+        sound: "k",
+    },
+    {
+        chars: ["ل"],
+        sound: "l",
+    },
+    {
+        chars: ["م"],
+        sound: "m",
+    },
+    {
+        chars: ["ن"],
+        sound: "n",
+    },
+    {
+        chars: ["ڼ"],
+        sound: "N",
+    },
+    {
+        chars: ["و"],
+        beg: "w",
+        mid: "w/o/oo",
+        end: "w/o/oo",
+    },
+    {
+        chars: ["ای"],
+        sound: "aay",
+    },
+    {
+        chars: ["وی"],
+        sound: "ooy",
+    },
+    {
+        chars: ["ی", "ے"],
+        beg: "y",
+        mid: "ey/ee/y",
+        end: "ey",
+    },
+    {
+        chars: ["ي"],
+        beg: "y",
+        mid: "ey/ee/y",
+        end: "ee",
+    },
+    {
+        chars: ["اً"],
+        sound: "an",
+    },
+    {
+        chars: ["ځ"],
+        sound: "dz",
+    },
+    {
+        chars: ["څ"],
+        sound: "ts",
+    },
+    {
+        chars: ["چ"],
+        sound: "ch",
+    },
+    {
+        chars: ["ږ"],
+        sound: "G",
+    },
+    {
+        chars: ["ژ"],
+        sound: "jz",
+    },
+    {
+        chars: ["ښ"],
+        sound: "x",
+    },
+    {
+        chars: ["ۍ"],
+        sound: "uy",
+    },
+    {
+        chars: ["ې"],
+        sound: "e",
+    },
+    {
+        chars: ["ګ", "گ"],
+        sound: "g",
+    },
+    {
+        chars: ["یٰ"],
+        sound: "aa",
+    },
+];
+
+
+// "ء": "",
+// "آ": "",
+// "أ": "",
+// "ؤ": "",
+// "إ": "",
+// "ئ": "",
+// "ا": "",
+// "": "",
+// "": "",
+// "": "",
+// "": "",
+// "": "",
+// "": "",
+// "": "",
+// "": "",
+// "": "",
+// }
+/** ASCII digit for each Eastern Arabic-Indic digit matched by arabicNumsRegex. */
+const numsTable: Record<string, string> = {
+    "۰": "0",
+    "۱": "1",
+    "۲": "2",
+    "۳": "3",
+    "۴": "4",
+    "۵": "5",
+    "۶": "6",
+    "۷": "7",
+    "۸": "8",
+    "۹": "9",
+};
+
+
+/**
+ * Replaces the Arabic question mark, the Arabic comma, and guillemets with ASCII
+ * punctuation, and Eastern Arabic-Indic digits with ASCII digits.
+ * All other characters are returned unchanged.
+ */
+export function handlePunctuationAndNums(s: string): string {
+    return s.replace(/؟/g, "?")
+        .replace(/،/g, ",")
+        .replace(/«/g, '"')
+        .replace(/»/g, '"')
+        .replace(arabicNumsRegex, (mtch) => numsTable[mtch] ?? mtch);
+}
+
+/**
+ * Produces a rough, letter-by-letter transliteration of a word using pTable.
+ * Sounds with several alternatives are wrapped in parentheses, and the whole
+ * result is wrapped in `?*...*?`.
+ */
+export function handleUnmatched(s: string): string {
+    // position is judged by where the token ends, so a multi-letter token
+    // that finishes the word counts as "end"
+    function positionOf(start: number, length: number): "beg" | "mid" | "end" {
+        if (start === 0) return "beg";
+        return start + length === s.length ? "end" : "mid";
+    }
+    function soundFor(chars: string, pos: "beg" | "mid" | "end"): string | undefined {
+        const m = pTable.find(x => x.chars.includes(chars));
+        if (!m) return undefined;
+        const sound = "sound" in m ? m.sound : m[pos];
+        return sound.includes("/") ? `(${sound})` : sound;
+    }
+    const g = s.replace(pRegex, (mtch: string, i: number) => {
+        const whole = soundFor(mtch, positionOf(i, mtch.length));
+        if (whole !== undefined) return whole;
+        // pRegex has multi-letter tokens without a pTable entry (e.g. "مب");
+        // transliterate those letter by letter instead of dropping them
+        return mtch.split("")
+            .map((c, j) => soundFor(c, positionOf(i + j, 1)) ?? "")
+            .join("");
+    });
+    return `?*${g}*?`;
+}
\ No newline at end of file
diff --git a/account/src/lib/scriptToPhonetics.ts b/account/src/lib/scriptToPhonetics.ts
new file mode 100644
index 0000000..81a0a3a
--- /dev/null
+++ b/account/src/lib/scriptToPhonetics.ts
@@ -0,0 +1,195 @@
+import {
+    Types as T,
+    standardizePashto,
+    removeAccents,
+} from "@lingdocs/inflect";
+import { findInAllWords } from "./dictionary";
+import {
+    handlePunctuationAndNums,
+    handleUnmatched,
+} from "./handle-unmatched";
+
+
+// TODO: handle و ارزي
+// spacing error with کور کې چېرته اوسېږئ
+
+/** True when the string contains a character in U+0621-065F, U+0670-06D3 or U+06D5. */
+function isP(c: string): boolean {
+    return !!c.match(/[\u0621-\u065f\u0670-\u06d3\u06d5]/);
+}
+
+// TODO: ERRORING WHEN YOU JUST PUT A BUNCH OF ENGLISH CHARS IN THE TEXT
+
+/**
+ * Converts Pashto text to phonetics by looking up each word in the dictionary and
+ * using the phonetic form of the matching entries.
+ *
+ * Words with several matching entries are rendered as alternatives joined by "/".
+ * Words with no dictionary match get a rough transliteration from handleUnmatched
+ * and are reported in `missing`.
+ *
+ * @param p - the Pashto text to convert
+ * @param accents - when false, the dictionary phonetics are passed through removeAccents
+ * @returns the phonetic text and the words that had no dictionary match
+ * @throws Error("not initialized") when findInAllWords returns a falsy value
+ */
+export function scriptToPhonetics(p: string, accents: boolean): {
+    phonetics: string,
+    missing: string[],
+} {
+    const words = splitWords(standardizePashto(p));
+    if (!words.length) return {
+        phonetics: "",
+        missing: [],
+    }
+    // TODO: keep going with the hyphens etc
+    // also و ارزي
+    const converted: string[] = [];
+    const missing = new Set<string>();
+    // index of the token currently being converted; advanced by the code below
+    let i = 0;
+    function handleAccents(f: string): string {
+        return accents ? f : removeAccents(f);
+    }
+    /**
+     * Checks whether the tokens following words[i] complete the hyphenated entry psw.
+     * `words` in the result is the number of tokens the entry spans, counting words[i].
+     */
+    function checkHyphenMatch(psw: T.PsWord): {
+        match: boolean,
+        words: number,
+        f: string,
+    } {
+        if (!psw.hyphen) {
+            throw new Error("checking a match without a hyphen content");
+        }
+        let match = false;
+        let f = psw.f;
+        let k = 1;
+        for (let j = 0; j < psw.hyphen.length; j++) {
+            const h = psw.hyphen[j];
+            const w = words[i+k];
+            if (h.type === "unwritten" && w === " ") {
+                match = true;
+                f += `-${h.f}`;
+                k += 1;
+            } else if (h.type === "written" && w === h.p) {
+                match = true;
+                f += `-${h.f}`;
+                k += 1;
+            } else if (h.type === "written" && w === " " && words[i+1+k] === h.p) {
+                match = true;
+                f += `-${h.f}`;
+                k += 2;
+            } else {
+                match = false;
+                break;
+            }
+        }
+        return {
+            match,
+            f,
+            words: k,
+        }
+    }
+    /**
+     * Picks the phonetics for the dictionary matches of words[i] and advances `i`
+     * past every token that was consumed.
+     */
+    function handleMatches(matches: T.PsWord[]): string[] {
+        const hyphens = matches.filter(x => x.hyphen);
+        const plain = matches.filter(x => !x.hyphen);
+        // a hyphenated entry completed by the following tokens wins; checkHyphenMatch
+        // already counts the current token, so `i` must not be advanced again
+        for (const h of hyphens) {
+            const res = checkHyphenMatch(h);
+            if (res.match) {
+                i += res.words;
+                return [handleAccents(res.f)];
+            }
+        }
+        i++;
+        if (!plain.length) {
+            // only hyphenated entries matched and none fit the following tokens
+            return ["ERR"];
+        }
+        // the Set removes duplicate phonetics shared by several entries
+        return Array.from(new Set<string>(plain.map((x) => handleAccents(x.f))));
+    }
+    while (i < words.length) {
+        const word = words[i];
+        const p = isP(word);
+        if (p) {
+            const matches = findInAllWords(possibleFuzzify(word));
+            if (!matches) {
+                throw new Error("not initialized");
+            }
+            if (matches.length > 0) {
+                const possibilities = handleMatches(matches);
+                converted.push(possibilities.join("/"));
+            } else {
+                missing.add(word);
+                converted.push(handleUnmatched(word));
+                i++;
+            }
+        } else {
+            converted.push(handlePunctuationAndNums(word));
+            i++;
+        }
+    }
+    return {
+        phonetics: converted.join(""),
+        missing: Array.from(missing),
+    };
+}
+
+/**
+ * Splits text into alternating runs of Pashto-script characters and everything else,
+ * so that words and the punctuation/whitespace between them become separate tokens.
+ * The first token is "" when the text does not start with a Pashto-script character.
+ */
+function splitWords(p: string): string[] {
+    const words: string[] = [];
+    let current = "";
+    let onP: boolean = true;
+    const chars = p.split("");
+    for (let char of chars) {
+        const p = isP(char);
+        if (p) {
+            if (onP) {
+                current += char;
+            } else {
+                words.push(current);
+                current = char;
+                onP = true;
+            }
+        } else {
+            if (onP) {
+                words.push(current);
+                current = char;
+                onP = false;
+            } else {
+                current += char;
+            }
+        }
+    }
+    words.push(current);
+    return words;
+}
+
+/**
+ * For words of three or more letters with ې or ی in the middle, returns a RegExp
+ * that accepts either letter in those positions; otherwise returns the word itself.
+ */
+function possibleFuzzify(s: string): string | RegExp {
+    if (s.length < 3) {
+        return s;
+    }
+    const middle = s.slice(1, -1);
+    if (middle.includes("ې") || middle.includes("ی")) {
+        // no "|" inside the character classes: there it would be a literal pipe
+        return new RegExp(`^${s[0]}${middle.replace(/[یې]/g, "[ېی]")}${s.slice(-1)}$`);
+    } else {
+        return s;
+    }
+}
\ No newline at end of file
diff --git a/account/src/routers/dictionary-router.ts b/account/src/routers/dictionary-router.ts
index aa75cf2..5834a29 100644
--- a/account/src/routers/dictionary-router.ts
+++ b/account/src/routers/dictionary-router.ts
@@ -6,6 +6,7 @@ import {
     getEntries,
     updateDictionary,
 } from "../lib/dictionary";
+import { scriptToPhonetics } from "../lib/scriptToPhonetics";
 
 const dictionaryRouter = express.Router();
 
@@ -14,15 +15,16 @@ dictionaryRouter.post("/update", async (req, res, next) => {
     res.send({ ok: true, result });
 });
 
-dictionaryRouter.post("/all-words", async (req, res, next) => {
+dictionaryRouter.post("/script-to-phonetics", async (req, res, next) => {
     if (!allWordsCollection) {
         return res.send({ ok: false, message: "allWords not ready" });
     }
-    const word = req.body.word as string;
-    if (!word) {
+    const text = req.body.text as unknown;
+    const accents = req.body.accents as unknown;
+    if (!text || typeof text !== "string" || typeof accents !== "boolean") {
         return res.status(400).send({ ok: false, error: "invalid query" });
     }
-    const results = await findInAllWords(word);
+    const results = await scriptToPhonetics(text, accents);
     res.send(results);
 })
 
diff --git a/account/test.http b/account/test.http
index e8d9da3..b8b3af2 100644
--- a/account/test.http
+++ b/account/test.http
@@ -1,6 +1,7 @@
-POST https://account.lingdocs.com/dictionary/entries HTTP/1.1
+POST https://account.lingdocs.com/dictionary/script-to-phonetics HTTP/1.1
 content-type: application/json
 
 {
-    "ids": ["لیدل", 1527815306]
+    "text": "کور",
+    "accents": true
 }
\ No newline at end of file