add script to phonetics conversion logic
This commit is contained in:
parent
a1a6194717
commit
0c1f6f56d0
|
@ -33,6 +33,7 @@ async function fetchDictionary(): Promise<T.Dictionary> {
|
|||
}
|
||||
|
||||
/**
 * Downloads the "all words" list that is published next to the dictionary.
 *
 * The URL is derived from the `LINGDOCS_DICTIONARY_URL` environment variable
 * by dropping its last four characters and appending `all-words.json`.
 *
 * @returns the parsed JSON body of the response
 * @throws Error when `LINGDOCS_DICTIONARY_URL` is not set, or when the server
 *   answers with a non-2xx status
 */
async function fetchAllWords(): Promise<T.AllWordsWithInflections> {
  const dictionaryUrl = process.env.LINGDOCS_DICTIONARY_URL;
  if (!dictionaryUrl) {
    // Without this guard the optional chain evaluates to `undefined`, which
    // string concatenation turns into the bogus URL "undefinedall-words.json".
    throw new Error("LINGDOCS_DICTIONARY_URL is not set");
  }
  // TODO: this is really ugly
  // NOTE(review): presumably the URL ends in a 4-character name that gets
  // swapped for "all-words.json" - confirm against the deployment config.
  const res = await fetch(dictionaryUrl.slice(0, -4) + "all-words.json");
  if (!res.ok) {
    // Fail with the HTTP status instead of a confusing JSON parse error.
    throw new Error(`failed to fetch all-words.json: ${res.status} ${res.statusText}`);
  }
  return await res.json();
}
|
||||
|
|
|
@ -0,0 +1,271 @@
|
|||
/** Matches every digit in the range ۰-۹ (all occurrences). */
const arabicNumsRegex = /[۰-۹]/g;

/**
 * Matches, one unit at a time, the letters and letter combinations that get
 * transliterated as a unit. The alternatives are tried in the order listed,
 * so a combination must come before the single letters it starts with
 * (e.g. "مب" before "م").
 */
const pRegex = new RegExp(
  [
    // alef forms
    "اً", "أ", "ا", "آ", "ٱ", "ٲ", "ٳ",
    // ye / hamza forms
    "ئی", "ئي", "ئے", "یٰ", "ی", "ي", "ې", "ۍ", "ئ", "ے",
    "س", "ص", "ث", "څ", "ج", "چ",
    "هٔ", "ه", "ۀ",
    // consonant clusters
    // NOTE(review): "ښک" is listed twice; the neighbouring pairs are mirror
    // images of each other, so one of these was possibly meant to be the
    // reverse cluster - confirm.
    "غز", "زغ", "کش", "شک", "ښک", "ښک", "پښ", "ښپ",
    "ہ", "ع", "و", "ؤ", "ښ", "غ", "خ", "ح", "ش",
    "ز", "ض", "ذ", "ځ", "ظ", "ژ",
    "ر", "ړ", "ڑ",
    "ت", "ټ", "ٹ", "ط",
    "د", "ډ", "ڈ",
    "مب", "م", "نب", "ن", "ڼ",
    "ک", "ګ", "گ", "ل", "ق", "ږ", "ب", "پ", "ف",
  ].join("|"),
  "g",
);
// Alternative considered: match the whole script range instead,
// [\u0621-\u065f\u0670-\u06d3\u06d5]/g;
|
||||
/**
 * Best-guess transliteration table for words that are not in the dictionary.
 *
 * Each entry lists the script units (`chars`) it applies to. An entry either
 * has one position-independent `sound`, or separate sounds for a unit at the
 * beginning (`beg`), in the middle (`mid`) or at the end (`end`) of a word.
 * A "/" inside a sound separates alternative readings.
 *
 * `chars` is compared against a whole regex match, so every element must be
 * exactly one unit as produced by `pRegex`.
 * NOTE(review): some units here ("اه", "ای", "وی") are not alternatives in
 * `pRegex`, so those entries are currently never looked up - confirm whether
 * the regex should be extended.
 */
const pTable: ({
  chars: string[],
  beg: string,
  mid: string,
  end: string,
} | {
  chars: string[],
  sound: string,
})[] = [
  // Silent letters. These must be two separate elements: the single string
  // "ءع" could never equal a one-letter match.
  { chars: ["ء", "ع"], sound: "" },
  { chars: ["آ"], sound: "a" },
  { chars: ["أ"], sound: "U" },
  { chars: ["ؤ"], sound: "o/w" },
  { chars: ["إ"], sound: "i" },
  { chars: ["ئ"], beg: "y", mid: "y", end: "eyy" },
  { chars: ["ا"], beg: "aa/a/i/u/U", mid: "aa", end: "aa" },
  { chars: ["ب"], sound: "b" },
  // پ is matched by the letter regex, so it needs an entry here; without one
  // the letter is silently dropped from the output.
  { chars: ["پ"], sound: "p" },
  { chars: ["ة"], sound: "a/u" },
  { chars: ["ت", "ط"], sound: "t" },
  { chars: ["ټ"], sound: "T" },
  { chars: ["ث", "س", "ص"], sound: "s" },
  { chars: ["ج"], sound: "j" },
  { chars: ["ح"], sound: "h" },
  { chars: ["اه"], sound: "aah" },
  { chars: ["ه"], beg: "h", mid: "h", end: "a/i/u/h" },
  { chars: ["خ"], sound: "kh" },
  { chars: ["د"], sound: "d" },
  { chars: ["ذ", "ز", "ض", "ظ"], sound: "z" },
  { chars: ["ډ"], sound: "D" },
  { chars: ["ر"], sound: "r" },
  { chars: ["ړ"], sound: "R" },
  { chars: ["ش"], sound: "sh" },
  { chars: ["غ"], sound: "gh" },
  { chars: ["ف"], sound: "f" },
  { chars: ["ق"], sound: "q" },
  { chars: ["ك", "ک"], sound: "k" },
  { chars: ["ل"], sound: "l" },
  { chars: ["م"], sound: "m" },
  { chars: ["ن"], sound: "n" },
  { chars: ["ڼ"], sound: "N" },
  { chars: ["و"], beg: "w", mid: "w/o/oo", end: "w/o/oo" },
  { chars: ["ای"], sound: "aay" },
  { chars: ["وی"], sound: "ooy" },
  { chars: ["ی", "ے"], beg: "y", mid: "ey/ee/y", end: "ey" },
  { chars: ["ي"], beg: "y", mid: "ey/ee/y", end: "ee" },
  { chars: ["اً"], sound: "an" },
  { chars: ["ځ"], sound: "dz" },
  { chars: ["څ"], sound: "ts" },
  { chars: ["چ"], sound: "ch" },
  { chars: ["ږ"], sound: "G" },
  { chars: ["ژ"], sound: "jz" },
  { chars: ["ښ"], sound: "x" },
  { chars: ["ۍ"], sound: "uy" },
  { chars: ["ې"], sound: "e" },
  { chars: ["ګ", "گ"], sound: "g" },
  { chars: ["یٰ"], sound: "aa" },
];
|
||||
|
||||
|
||||
// "ء": "",
|
||||
// "آ": "",
|
||||
// "أ": "",
|
||||
// "ؤ": "",
|
||||
// "إ": "",
|
||||
// "ئ": "",
|
||||
// "ا": "",
|
||||
// "": "",
|
||||
// "": "",
|
||||
// "": "",
|
||||
// "": "",
|
||||
// "": "",
|
||||
// "": "",
|
||||
// "": "",
|
||||
// "": "",
|
||||
// "": "",
|
||||
// }
|
||||
/** Lookup from each script digit ۰-۹ to the corresponding ASCII digit. */
const numsTable = {
  "۰": "0", "۱": "1", "۲": "2", "۳": "3", "۴": "4",
  "۵": "5", "۶": "6", "۷": "7", "۸": "8", "۹": "9",
};
|
||||
|
||||
|
||||
/**
 * Converts script punctuation and digits in a non-word token to their Latin
 * equivalents: "؟" becomes "?", "،" becomes ",", the guillemets "«" and "»"
 * both become a straight double quote, and every digit matched by
 * `arabicNumsRegex` is replaced through `numsTable`.
 *
 * @param s the text to convert
 * @returns the converted text; all other characters are left untouched
 */
export function handlePunctuationAndNums(s: string): string {
  return s
    .replace(/؟/g, "?")
    .replace(/،/g, ",")
    .replace(/[«»]/g, '"')
    .replace(
      arabicNumsRegex,
      // The regex only matches keys of numsTable, which the compiler cannot
      // see; the fallback keeps the digit as-is should the two ever diverge.
      (digit) => numsTable[digit as keyof typeof numsTable] ?? digit,
    );
}
|
||||
|
||||
/**
 * Produces a rough phonetic guess for a word that was not found in the
 * dictionary, by transliterating it unit by unit with `pTable`.
 *
 * Sounds that offer alternatives (they contain "/") are wrapped in
 * parentheses. Units matched by `pRegex` that have no `pTable` entry produce
 * nothing, and characters not matched by `pRegex` are left as they are.
 *
 * @param s the unmatched word
 * @returns the guess wrapped in `?*` ... `*?` markers
 */
export function handleUnmatched(s: string): string {
  const guess = s.replace(pRegex, (mtch: string, offset: number) => {
    const entry = pTable.find((x) => x.chars.includes(mtch));
    if (!entry) return "";
    let sound: string;
    if ("sound" in entry) {
      sound = entry.sound;
    } else if (offset === 0) {
      sound = entry.beg;
    } else if (offset + mtch.length === s.length) {
      // Compare the END of the match with the end of the word, so that a
      // multi-character unit in final position also counts as "end".
      sound = entry.end;
    } else {
      sound = entry.mid;
    }
    return sound.includes("/") ? `(${sound})` : sound;
  });
  return `?*${guess}*?`;
}
|
|
@ -0,0 +1,173 @@
|
|||
import {
|
||||
Types as T,
|
||||
standardizePashto,
|
||||
removeAccents,
|
||||
} from "@lingdocs/inflect";
|
||||
import { findInAllWords } from "./dictionary";
|
||||
import {
|
||||
handlePunctuationAndNums,
|
||||
handleUnmatched,
|
||||
} from "./handle-unmatched";
|
||||
|
||||
|
||||
// TODO: handle و ارزي
|
||||
// spacing error with کور کې چېرته اوسېږئ
|
||||
|
||||
/**
 * Tells whether a string contains at least one character from the script
 * ranges U+0621-U+065F, U+0670-U+06D3 or U+06D5.
 *
 * @param c the character (or longer string) to inspect
 * @returns true if any character of `c` lies in those ranges
 */
function isP(c: string): boolean {
  return /[\u0621-\u065f\u0670-\u06d3\u06d5]/.test(c);
}
|
||||
|
||||
// TODO: ERRORING WHEN YOU JUST PUT A BUNCH OF ENGLISH CHARS IN THE TEXT
|
||||
|
||||
/**
 * Converts Pashto text to phonetics by looking up each word in the dictionary
 * and using the phonetic form(s) found there.
 *
 * The text is standardized and split into alternating Pashto and non-Pashto
 * tokens. Pashto tokens are looked up with `findInAllWords`; when several
 * readings are found they are joined with "/". Words with no match are
 * reported in `missing` and replaced by the guess from `handleUnmatched`.
 * Non-Pashto tokens (spaces, punctuation, digits, ...) go through
 * `handlePunctuationAndNums`.
 *
 * @param p the Pashto text to convert
 * @param accents when false, accents are stripped from the dictionary
 *   phonetics with `removeAccents`
 * @returns the converted text and the distinct words that had no dictionary match
 * @throws Error when `findInAllWords` returns nothing (word list not initialized)
 */
export function scriptToPhonetics(p: string, accents: boolean): {
  phonetics: string,
  missing: string[],
} {
  const words = splitWords(standardizePashto(p));
  if (!words.length) {
    return {
      phonetics: "",
      missing: [],
    };
  }
  // TODO: keep going with the hyphens etc
  // also و ارزي
  const converted: string[] = [];
  const missing = new Set<string>();
  // index of the token currently being converted; advanced by the helpers below
  let i = 0;

  /** Applies the caller's accent preference to a phonetic string. */
  function handleAccents(f: string): string {
    return accents ? f : removeAccents(f);
  }

  /**
   * Checks whether the tokens following `words[i]` spell out the hyphenated
   * parts of the dictionary word `psw`.
   *
   * @returns `match`: whether every hyphen part was found; `f`: the phonetics
   *   with the parts joined by "-"; `words`: how many tokens the whole
   *   expression covers, counting `words[i]` itself
   */
  function checkHyphenMatch(psw: T.PsWord): {
    match: boolean,
    words: number,
    f: string,
  } {
    if (!psw.hyphen) {
      throw new Error("checking a match without a hyphen content");
    }
    let match = false;
    let f = psw.f;
    let k = 1;
    for (const h of psw.hyphen) {
      const w = words[i + k];
      if (h.type === "unwritten" && w === " ") {
        // the part exists only in the phonetics; in script it is just a space
        match = true;
        f += `-${h.f}`;
        k += 1;
      } else if (h.type === "written" && w === h.p) {
        // the part is written directly after the previous one
        match = true;
        f += `-${h.f}`;
        k += 1;
      } else if (h.type === "written" && w === " " && words[i + 1 + k] === h.p) {
        // the part is written after a separating space
        match = true;
        f += `-${h.f}`;
        k += 2;
      } else {
        match = false;
        break;
      }
    }
    return {
      match,
      f,
      words: k,
    };
  }

  /**
   * Turns the dictionary matches for `words[i]` into phonetic readings and
   * moves `i` past every token that was consumed.
   *
   * A hyphenated entry whose parts all appear in the text wins over the plain
   * readings. The cursor is advanced exactly once per call, either past the
   * whole hyphenated expression or past the single word.
   */
  function handleMatches(matches: T.PsWord[]): string[] {
    const hyphens = matches.filter((x) => x.hyphen);
    const plain = matches.filter((x) => !x.hyphen);
    for (const h of hyphens) {
      const res = checkHyphenMatch(h);
      if (res.match) {
        i += res.words;
        return [handleAccents(res.f)];
      }
    }
    i++;
    if (!plain.length) {
      // only hyphenated entries exist and none of them fits the text
      return ["ERR"];
    }
    // the Set removes duplicate readings while keeping their order
    return Array.from(new Set(plain.map((x) => handleAccents(x.f))));
  }

  while (i < words.length) {
    const word = words[i];
    if (!isP(word)) {
      converted.push(handlePunctuationAndNums(word));
      i++;
      continue;
    }
    const matches = findInAllWords(possibleFuzzify(word));
    if (!matches) {
      throw new Error("not initialized");
    }
    if (matches.length > 0) {
      // handleMatches advances `i` itself
      converted.push(handleMatches(matches).join("/"));
    } else {
      missing.add(word);
      converted.push(handleUnmatched(word));
      i++;
    }
  }
  return {
    phonetics: converted.join(""),
    missing: Array.from(missing),
  };
}
|
||||
|
||||
/**
 * Splits text into consecutive runs of Pashto characters and runs of
 * everything else (spaces, punctuation, Latin text, ...), in order.
 * Joining the returned tokens gives back the input.
 *
 * @param p the text to split
 * @returns the tokens; never contains an empty string, and is empty for
 *   empty input
 */
function splitWords(p: string): string[] {
  const words: string[] = [];
  let current = "";
  let onP = true;
  for (const char of p.split("")) {
    const charIsP = isP(char);
    // A change of character class ends the current run. The emptiness check
    // keeps the very first character from flushing an empty token.
    if (current !== "" && charIsP !== onP) {
      words.push(current);
      current = "";
    }
    current += char;
    onP = charIsP;
  }
  if (current !== "") {
    words.push(current);
  }
  return words;
}
|
||||
|
||||
/**
 * Builds the dictionary search term for a word, treating "ی" and "ې" as
 * interchangeable when they occur inside the word (not as its first or last
 * character).
 *
 * @param s the word as written in the text
 * @returns `s` itself when it is shorter than three characters or has neither
 *   letter in its middle; otherwise an anchored RegExp in which every middle
 *   "ی" / "ې" matches either letter
 */
function possibleFuzzify(s: string): string | RegExp {
  if (s.length < 3) {
    return s;
  }
  const middle = s.slice(1, -1);
  if (!/[یې]/.test(middle)) {
    return s;
  }
  // Escape regex metacharacters so the word is always matched literally.
  const esc = (x: string): string => x.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // A character class needs no "|" between its members; writing one would
  // make the class match a literal "|" as well.
  const fuzzyMiddle = esc(middle).replace(/[یې]/g, "[یې]");
  return new RegExp(`^${esc(s.charAt(0))}${fuzzyMiddle}${esc(s.slice(-1))}$`);
}
|
|
@ -6,6 +6,7 @@ import {
|
|||
getEntries,
|
||||
updateDictionary,
|
||||
} from "../lib/dictionary";
|
||||
import { scriptToPhonetics } from "../lib/scriptToPhonetics";
|
||||
|
||||
const dictionaryRouter = express.Router();
|
||||
|
||||
|
@ -14,15 +15,16 @@ dictionaryRouter.post("/update", async (req, res, next) => {
|
|||
res.send({ ok: true, result });
|
||||
});
|
||||
|
||||
/**
 * POST /script-to-phonetics
 *
 * Expects a body with `text` (non-empty string) and `accents` (boolean) and
 * responds with the result of `scriptToPhonetics(text, accents)`.
 * Responds with `{ ok: false, message }` while the word list is not loaded,
 * and with status 400 when the body is invalid.
 */
dictionaryRouter.post("/script-to-phonetics", async (req, res, next) => {
  if (!allWordsCollection) {
    return res.send({ ok: false, message: "allWords not ready" });
  }
  const text = req.body.text as unknown;
  const accents = req.body.accents as unknown;
  if (!text || typeof text !== "string" || typeof accents !== "boolean") {
    return res.status(400).send({ ok: false, error: "invalid query" });
  }
  try {
    // scriptToPhonetics is synchronous, so there is nothing to await
    res.send(scriptToPhonetics(text, accents));
  } catch (e) {
    // hand conversion errors to the Express error handler instead of
    // leaving a rejected promise and a request that never gets a response
    next(e);
  }
});
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
POST https://account.lingdocs.com/dictionary/entries HTTP/1.1
|
||||
POST https://account.lingdocs.com/dictionary/all-words HTTP/1.1
|
||||
content-type: application/json
|
||||
|
||||
{
|
||||
"ids": ["لیدل", 1527815306]
|
||||
"word": "کور"
|
||||
}
|
Loading…
Reference in New Issue