add script to phonetics conversion logic

This commit is contained in:
adueck 2023-01-29 22:56:36 +05:00
parent a1a6194717
commit 0c1f6f56d0
5 changed files with 453 additions and 6 deletions

View File

@ -33,6 +33,7 @@ async function fetchDictionary(): Promise<T.Dictionary> {
}
/**
 * Downloads the "all words" JSON file that sits next to the dictionary file.
 *
 * The URL is built from the LINGDOCS_DICTIONARY_URL environment variable by
 * dropping its last four characters and appending "all-words.json".
 *
 * @returns the parsed JSON body of the response
 * @throws Error when LINGDOCS_DICTIONARY_URL is not set or the response status is not OK
 */
async function fetchAllWords(): Promise<T.AllWordsWithInflections> {
  const dictionaryUrl = process.env.LINGDOCS_DICTIONARY_URL;
  if (!dictionaryUrl) {
    // Without this guard the request would go to the literal "undefinedall-words.json".
    throw new Error("LINGDOCS_DICTIONARY_URL is not set");
  }
  // TODO: this is really ugly
  // NOTE(review): slice(0, -4) presumably strips a 4-character file name from the URL — confirm
  const res = await fetch(dictionaryUrl.slice(0, -4) + "all-words.json");
  if (!res.ok) {
    throw new Error(`failed to fetch all-words.json (HTTP ${res.status})`);
  }
  return await res.json();
}

View File

@ -0,0 +1,271 @@
// Matches every Extended Arabic-Indic digit (۰-۹), i.e. the keys of numsTable.
const arabicNumsRegex = /[۰-۹]/g;
// Matches the letters (and a few letter sequences) that handleUnmatched converts.
// Multi-character alternatives are listed before the single letters they start
// with, so the longer sequence is tried first at any given position.
const pRegex = /اً|أ|ا|آ|ٱ|ٲ|ٳ|ئی|ئي|ئے|یٰ|ی|ي|ې|ۍ|ئ|ے|س|ص|ث|څ|ج|چ|هٔ|ه|ۀ|غز|زغ|کش|شک|ښک|ښک|پښ|ښپ|ہ|ع|و|ؤ|ښ|غ|خ|ح|ش|ز|ض|ذ|ځ|ظ|ژ|ر|ړ|ڑ|ت|ټ|ٹ|ط|د|ډ|ڈ|مب|م|نب|ن|ڼ|ک|ګ|گ|ل|ق|ږ|ب|پ|ف/g;
// Unused alternative kept for reference: a plain character-range pattern.
// [\u0621-\u065f\u0670-\u06d3\u06d5]/g;
/**
 * Letter-to-phonetics lookup table used when a word is not in the dictionary.
 *
 * Each entry lists the script strings it applies to (`chars`) and either
 *  - a single `sound` used everywhere, or
 *  - separate `beg` / `mid` / `end` sounds depending on where the letter sits in the word.
 *
 * A sound containing "/" lists alternative readings.
 */
const pTable: ({
  chars: string[],
  beg: string,
  mid: string,
  end: string,
} | {
  chars: string[],
  sound: string,
})[] = [
  // hamza and ayn are listed as two separate letters so each can be looked up on its own
  { chars: ["ء", "ع"], sound: "" },
  { chars: ["آ"], sound: "a" },
  { chars: ["أ"], sound: "U" },
  { chars: ["ؤ"], sound: "o/w" },
  { chars: ["إ"], sound: "i" },
  { chars: ["ئ"], beg: "y", mid: "y", end: "eyy" },
  { chars: ["ا"], beg: "aa/a/i/u/U", mid: "aa", end: "aa" },
  { chars: ["ب"], sound: "b" },
  // پ is matched by pRegex, so it needs an entry here or it is dropped from the output
  { chars: ["پ"], sound: "p" },
  { chars: ["ة"], sound: "a/u" },
  { chars: ["ت", "ط"], sound: "t" },
  // ٹ / ڈ / ڑ are matched by pRegex alongside ټ / ډ / ړ
  { chars: ["ټ", "ٹ"], sound: "T" },
  { chars: ["ث", "س", "ص"], sound: "s" },
  { chars: ["ج"], sound: "j" },
  { chars: ["ح"], sound: "h" },
  { chars: ["اه"], sound: "aah" },
  { chars: ["ه"], beg: "h", mid: "h", end: "a/i/u/h" },
  { chars: ["خ"], sound: "kh" },
  { chars: ["د"], sound: "d" },
  { chars: ["ذ", "ز", "ض", "ظ"], sound: "z" },
  { chars: ["ډ", "ڈ"], sound: "D" },
  { chars: ["ر"], sound: "r" },
  { chars: ["ړ", "ڑ"], sound: "R" },
  { chars: ["ش"], sound: "sh" },
  { chars: ["غ"], sound: "gh" },
  { chars: ["ف"], sound: "f" },
  { chars: ["ق"], sound: "q" },
  { chars: ["ك", "ک"], sound: "k" },
  { chars: ["ل"], sound: "l" },
  { chars: ["م"], sound: "m" },
  { chars: ["ن"], sound: "n" },
  { chars: ["ڼ"], sound: "N" },
  { chars: ["و"], beg: "w", mid: "w/o/oo", end: "w/o/oo" },
  { chars: ["ای"], sound: "aay" },
  { chars: ["وی"], sound: "ooy" },
  { chars: ["ی", "ے"], beg: "y", mid: "ey/ee/y", end: "ey" },
  { chars: ["ي"], beg: "y", mid: "ey/ee/y", end: "ee" },
  { chars: ["اً"], sound: "an" },
  { chars: ["ځ"], sound: "dz" },
  { chars: ["څ"], sound: "ts" },
  { chars: ["چ"], sound: "ch" },
  { chars: ["ږ"], sound: "G" },
  { chars: ["ژ"], sound: "jz" },
  { chars: ["ښ"], sound: "x" },
  { chars: ["ۍ"], sound: "uy" },
  { chars: ["ې"], sound: "e" },
  { chars: ["ګ", "گ"], sound: "g" },
  { chars: ["یٰ"], sound: "aa" },
];
// "ء": "",
// "آ": "",
// "أ": "",
// "ؤ": "",
// "إ": "",
// "ئ": "",
// "ا": "",
// "": "",
// "": "",
// "": "",
// "": "",
// "": "",
// "": "",
// "": "",
// "": "",
// "": "",
// }
// Maps each Extended Arabic-Indic digit to its ASCII digit.
const numsTable: Readonly<Record<string, string>> = {
  "۰": "0",
  "۱": "1",
  "۲": "2",
  "۳": "3",
  "۴": "4",
  "۵": "5",
  "۶": "6",
  "۷": "7",
  "۸": "8",
  "۹": "9",
};

/**
 * Replaces Arabic-script punctuation and digits with their Latin equivalents.
 *
 * "؟" becomes "?", "،" becomes ",", both "«" and "»" become a double quote,
 * and every digit matched by arabicNumsRegex is replaced via numsTable.
 *
 * @param s - text to convert
 * @returns the text with punctuation and digits replaced; all other characters are untouched
 */
export function handlePunctuationAndNums(s: string): string {
  return s
    .replace(/؟/g, "?")
    .replace(/،/g, ",")
    .replace(/«/g, '"')
    .replace(/»/g, '"')
    // every match of arabicNumsRegex is a key of numsTable; the fallback only
    // exists so that a future mismatch can never emit the text "undefined"
    .replace(arabicNumsRegex, (digit) => numsTable[digit] ?? digit);
}
/**
 * Produces a best-guess phonetic rendering for a word that was not found in the dictionary.
 *
 * Every piece of the word matched by pRegex is looked up in pTable. Sounds that
 * list alternatives (contain "/") are wrapped in parentheses. The result is
 * wrapped in "?*" … "*?" to mark it as a guess.
 *
 * @param s - a single word in Pashto script
 * @returns the guessed phonetics, surrounded by "?*" and "*?"
 */
export function handleUnmatched(s: string): string {
  // Looks up the sound for `chars` starting at index `start` of the word;
  // undefined when pTable has no entry for it.
  const soundAt = (chars: string, start: number): string | undefined => {
    const entry = pTable.find((x) => x.chars.includes(chars));
    if (!entry) return undefined;
    if ("sound" in entry) return entry.sound;
    // measure the end from the END of the match so multi-character matches
    // that finish the word are recognised as word-final
    const pos: "beg" | "mid" | "end" = start === 0
      ? "beg"
      : start + chars.length === s.length
        ? "end"
        : "mid";
    return entry[pos];
  };
  const wrapAlternatives = (sound: string): string =>
    sound.includes("/") ? `(${sound})` : sound;

  const g = s.replace(pRegex, (mtch: string, i: number) => {
    const whole = soundAt(mtch, i);
    if (whole !== undefined) return wrapAlternatives(whole);
    // pRegex matches some sequences that have no pTable entry of their own;
    // convert those letter by letter instead of dropping them from the output
    return mtch
      .split("")
      .map((c, offset) => wrapAlternatives(soundAt(c, i + offset) ?? ""))
      .join("");
  });
  return `?*${g}*?`;
}

View File

@ -0,0 +1,173 @@
import {
Types as T,
standardizePashto,
removeAccents,
} from "@lingdocs/inflect";
import { findInAllWords } from "./dictionary";
import {
handlePunctuationAndNums,
handleUnmatched,
} from "./handle-unmatched";
// TODO: handle و ارزي
// spacing error with کور کې چېرته اوسېږئ
/**
 * Tells whether the given string contains at least one character from the
 * ranges U+0621–U+065F, U+0670–U+06D3 or the single code point U+06D5.
 */
function isP(c: string): boolean {
  return /[\u0621-\u065f\u0670-\u06d3\u06d5]/.test(c);
}
// TODO: fix the error that occurs when the input text contains only English (non-Pashto) characters
/**
 * Converts Pashto text to phonetics by looking up each word in the dictionary
 * (via findInAllWords) and substituting its phonetic form.
 *
 * The text is standardized and split into alternating Pashto / non-Pashto
 * tokens. Pashto tokens are looked up; when several readings exist they are
 * joined with "/". Words with no dictionary match are converted with
 * handleUnmatched and reported in `missing`. Non-Pashto tokens go through
 * handlePunctuationAndNums.
 *
 * @param p - the Pashto text to convert
 * @param accents - when false, accents are stripped from the dictionary phonetics
 * @returns the converted text and the list of words that were not found
 * @throws Error("not initialized") when findInAllWords returns no result set
 */
export function scriptToPhonetics(p: string, accents: boolean): {
  phonetics: string,
  missing: string[],
} {
  const words = splitWords(standardizePashto(p));
  if (!words.length) return {
    phonetics: "",
    missing: [],
  }
  // TODO: keep going with the hyphens etc
  // also و ارزي
  const converted: string[] = [];
  const missing = new Set<string>();
  // index of the token currently being converted; advanced by the helpers below
  let i = 0;

  function handleAccents(f: string): string {
    return accents ? f : removeAccents(f);
  }

  // Checks whether the tokens following words[i] spell out every hyphenated
  // part of `psw`. `words` in the result is how many tokens (including
  // words[i] itself) the full match spans.
  function checkHyphenMatch(psw: T.PsWord): {
    match: boolean,
    words: number,
    f: string,
  } {
    if (!psw.hyphen) {
      throw new Error("checking a match without a hyphen content");
    }
    let match = false;
    let f = psw.f;
    let k = 1;
    for (let j = 0; j < psw.hyphen.length; j++) {
      const h = psw.hyphen[j];
      const w = words[i+k];
      if (h.type === "unwritten" && w === " ") {
        match = true;
        f += `-${h.f}`;
        k += 1;
      } else if (h.type === "written" && w === h.p) {
        match = true;
        f += `-${h.f}`;
        k += 1;
      } else if (h.type === "written" && w === " " && words[i+1+k] === h.p) {
        // the written part follows after a single space token
        match = true;
        f += `-${h.f}`;
        k += 2;
      } else {
        match = false;
        break;
      }
    }
    return {
      match,
      f,
      words: k,
    }
  }

  // Picks the phonetics for the dictionary matches of words[i] and advances `i`
  // past every token that was consumed — exactly once per call.
  function handleMatches(matches: T.PsWord[]): string[] {
    const hyphens = matches.filter(x => x.hyphen);
    const plain = matches.filter(x => !x.hyphen);
    // a hyphenated entry whose following parts are present in the text wins
    for (const h of hyphens) {
      const res = checkHyphenMatch(h);
      if (res.match) {
        i += res.words;
        return [handleAccents(res.f)];
      }
    }
    i++;
    if (!plain.length) {
      // only hyphenated entries exist, and none of them fits the following text
      return ["ERR"];
    }
    return Array.from(new Set(plain.map((x) => handleAccents(x.f))));
  }

  while (i < words.length) {
    const word = words[i];
    if (isP(word)) {
      const matches = findInAllWords(possibleFuzzify(word));
      if (!matches) {
        throw new Error("not initialized");
      }
      if (matches.length > 0) {
        const possibilities = handleMatches(matches);
        converted.push(possibilities.join("/"));
      } else {
        missing.add(word);
        converted.push(handleUnmatched(word));
        i++;
      }
    } else {
      converted.push(handlePunctuationAndNums(word));
      i++;
    }
  }
  return {
    phonetics: converted.join(""),
    missing: Array.from(missing),
  };
}
/**
 * Splits text into alternating runs of Pashto characters (as decided by isP)
 * and non-Pashto characters, preserving their order. Concatenating the
 * returned tokens gives back the input.
 *
 * Empty tokens are never emitted, so an empty input yields an empty array and
 * text starting with a non-Pashto character has no empty leading token.
 *
 * @param p - the text to split
 * @returns the list of non-empty runs
 */
function splitWords(p: string): string[] {
  const words: string[] = [];
  let current = "";
  let onP = true;
  for (const char of p) {
    const charIsP = isP(char);
    if (charIsP === onP) {
      current += char;
      continue;
    }
    // the run type changed: flush the finished run (if any) and start a new one
    if (current) {
      words.push(current);
    }
    current = char;
    onP = charIsP;
  }
  if (current) {
    words.push(current);
  }
  return words;
}
/**
 * Builds the search key for a dictionary lookup, allowing ی and ې to be used
 * interchangeably in the middle of a word.
 *
 * Words shorter than three characters, or without ی / ې between their first
 * and last character, are returned unchanged. Otherwise an anchored RegExp is
 * returned in which every middle ی or ې matches either letter.
 *
 * @param s - the word to look up
 * @returns the word itself, or a RegExp matching its ی / ې variants
 */
function possibleFuzzify(s: string): string | RegExp {
  if (s.length < 3) {
    return s;
  }
  const middle = s.slice(1, -1);
  if (!middle.includes("ې") && !middle.includes("ی")) {
    return s;
  }
  // inside a character class "|" is a literal, so the class lists only the two letters
  const fuzzyMiddle = middle.replace(/[یې]/g, "[ېی]");
  return new RegExp(`^${s[0]}${fuzzyMiddle}${s.slice(-1)}$`);
}

View File

@ -6,6 +6,7 @@ import {
getEntries,
updateDictionary,
} from "../lib/dictionary";
import { scriptToPhonetics } from "../lib/scriptToPhonetics";
const dictionaryRouter = express.Router();
@ -14,15 +15,16 @@ dictionaryRouter.post("/update", async (req, res, next) => {
res.send({ ok: true, result });
});
dictionaryRouter.post("/all-words", async (req, res, next) => {
dictionaryRouter.post("/script-to-phonetics", async (req, res, next) => {
if (!allWordsCollection) {
return res.send({ ok: false, message: "allWords not ready" });
}
const word = req.body.word as string;
if (!word) {
const text = req.body.text as unknown;
const accents = req.body.accents as unknown;
if (!text || typeof text !== "string" || typeof accents !== "boolean") {
return res.status(400).send({ ok: false, error: "invalid query" });
}
const results = await findInAllWords(word);
const results = await scriptToPhonetics(text, accents);
res.send(results);
})

View File

@ -1,6 +1,6 @@
POST https://account.lingdocs.com/dictionary/entries HTTP/1.1
POST https://account.lingdocs.com/dictionary/all-words HTTP/1.1
content-type: application/json
{
"ids": ["لیدل", 1527815306]
"word": "کور"
}