pashto-inflector/src/lib/diacritics.test.ts

390 lines
7.2 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Copyright (c) 2021 lingdocs.com
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*
*/
import {
addDiacritics,
splitFIntoPhonemes,
} from "./diacritics";
import * as T from "../types";
// Arabic-script diacritic marks, some of which are concatenated into the
// expected outputs of the test data in this file. They are spelled as \u
// escapes because the bare combining characters attach to the surrounding
// quote marks and are easy to confuse with one another in an editor.
const zwar = "\u064E"; // ARABIC FATHA
const zwarakey = "\u0659"; // ARABIC ZWARAKAY
const zer = "\u0650"; // ARABIC KASRA
const pesh = "\u064F"; // ARABIC DAMMA
const sukun = "\u0652"; // ARABIC SUKUN
const hamzaAbove = "\u0654"; // ARABIC HAMZA ABOVE
const tashdeed = "\u0651"; // ARABIC SHADDA
const wasla = "\u0671"; // ARABIC LETTER ALEF WASLA
const daggerAlif = "\u0670"; // ARABIC LETTER SUPERSCRIPT ALEF
const fathahan = "\u064B"; // ARABIC FATHATAN
/**
 * Cases for `splitFIntoPhonemes`: `in` is a phonetic string and `out` is the
 * list of phonemes the split is expected to produce.
 */
const phonemeSplits: Array<{ in: string; out: string[] }> = [
  // single letters and multi-letter phonemes ("aa", "gh", "ey", "ts", ...)
  { in: "kor", out: ["k", "o", "r"] },
  { in: "raaghey", out: ["r", "aa", "gh", "ey"] },
  { in: "hatsa", out: ["h", "a", "ts", "a"] },
  { in: "ba", out: ["b", "a"] },
  // the accented "áa" is expected to come out as a plain "aa"
  { in: "peydáa", out: ["p", "ey", "d", "aa"] },
  // spaces do not show up in the expected output
  { in: "be kaar", out: ["b", "e", "k", "aa", "r"] },
  { in: "raadzeyy", out: ["r", "aa", "dz", "eyy"] },
  // neither do "?" and "." characters
  { in: "badanuy ??", out: ["b", "a", "d", "a", "n", "uy"] },
  { in: "tur ... pore", out: ["t", "u", "r", "p", "o", "r", "e"] },
  // "-Ul-" is expected to be kept together as one unit
  { in: "daar-Ul-iqaama", out: ["d", "aa", "r", "-Ul-", "i", "q", "aa", "m", "a"] },
];
/**
 * Table of cases for `addDiacritics`.
 *
 * Each case pairs an input (`in.p`: Pashto script without diacritics,
 * `in.f`: its phonetic spelling) with `out`, the Pashto script the function
 * is expected to return with diacritics added. The "adding diacritics should
 * work" test in this file also expects `f` to come back unchanged.
 *
 * Note that the expected strings contain combining marks that may be hard to
 * see in an editor; be careful not to drop or reorder them when editing.
 */
const diacriticsTest: Array<{
in: T.PsString,
out: string,
}> = [
// "o" written with و: no marks expected
{
in: {
p: "کور",
f: "kor",
},
out: "کور",
},
// "oo" written with و: pesh expected on the preceding consonant
{
in: {
p: "کور",
f: "koor",
},
out: "کُور",
},
// several words in one input
{
in: {
p: "کور کور",
f: "kor koor",
},
out: "کور کُور",
},
// short vowels between consonants: "i" -> zer
{
in: {
p: "تب",
f: "tib",
},
out: "تِب",
},
// "a" -> zwar
{
in: {
p: "تب",
f: "tab",
},
out: "تَب",
},
// capital "U" -> pesh
{
in: {
p: "تب",
f: "tUb",
},
out: "تُب",
},
// lower-case "u" -> zwarakey
{
in: {
p: "تب",
f: "tub",
},
out: "تٙب",
},
// no vowel between the consonants -> sukun
{
in: {
p: "تب",
f: "tb",
},
out: "تْب",
},
{
in: {
p: "تلب",
f: "tilab",
},
out: "تِلَب",
},
// "aa" written with ا: no mark expected on the consonant before it
{
in: {
p: "تشناب",
f: "tashnaab",
},
out: "تَشْناب",
},
// working with وs
// و as the consonant "w": it carries the mark of the vowel that follows it
{
in: {
p: "کول",
f: "kwal",
},
out: "کْوَل",
},
{
in: {
p: "تول",
f: "tool",
},
out: "تُول",
},
{
in: {
p: "مقبول",
f: "maqbool",
},
out: "مَقْبُول",
},
{
in: {
p: "کول",
f: "kawul",
},
out: "کَو" + zwarakey + "ل",
},
{
in: {
p: "کول",
f: "kiwul",
},
out: "کِو" + zwarakey + "ل",
},
{
in: {
p: "کول",
f: "kUwul",
},
out: "کُو" + zwarakey + "ل",
},
{
in: {
p: "کول",
f: "kuwul",
},
out: "ک" + zwarakey + "و" + zwarakey + "ل",
},
{
in: {
p: "کول",
f: "kawal",
},
out: "کَوَل",
},
{
in: {
p: "کول",
f: "kUwal",
},
out: "کُوَل",
},
// consonant clusters: sukun expected on each consonant not followed by a vowel
{
in: {
p: "پشتګرد",
f: "pishtgird",
},
out: "پِشْتْګِرْد",
},
// working with یs: "ee" -> zer on the preceding consonant
{
in: {
p: "سپین",
f: "speen",
},
out: "سْپِین",
},
// "ey" -> no mark on the preceding consonant
{
in: {
p: "سپین",
f: "speyn",
},
out: "سْپین",
},
// "e" written with ې: no marks expected
{
in: {
p: "پېش",
f: "pesh",
},
out: "پېش",
},
{
in: {
p: "لیک",
f: "leek",
},
out: "لِیک",
},
// starting alefs
{
in: {
p: "اسلام",
f: "islaam",
},
out: "اِسْلام",
},
// starting long vowels with ا
{
in: {
p: "ایسار",
f: "eesaar",
},
out: "اِیسار",
},
// double consonant / tashdeed
// doubled "tt" written with a single ت: tashdeed followed by the vowel mark
{
in: {
p: "بتن",
f: "battan",
},
out: "ب" + zwar + "ت" + tashdeed + zwar + "ن",
},
// same phonetics, but written with two different letters (ت and ط):
// sukun on the first letter and no tashdeed
{
in: {
p: "بتطن",
f: "battan",
},
out: "ب" + zwar + "ت" + sukun + "ط" + zwar + "ن",
},
// vowel endings working
{
in: {
p: "بته",
f: "bata",
},
out: "بَتَه",
},
{
in: {
p: "بته",
f: "bati",
},
out: "بَتِه",
},
{
in: {
p: "پرمختیا",
f: "parmakhtyaa",
},
out: "پَرْمَخْتْیا",
},
// Disabled case: doubled consonant before a word-final ه.
// NOTE(review): presumably not supported yet - confirm before re-enabling.
// {
// in: {
// p: "پته",
// f: "patta",
// },
// out: "پَتّه",
// },
// get ayn stuff working
// Disabled case for ع; NOTE(review): presumably pending the ayn work noted above - confirm.
// {
// in: {
// p: "اعتصاب شکن",
// f: "itisaabshikan",
// },
// out: "اِعتِصاب شِکَن",
// },
// avoid false double consonant
// the "ll" here spans two words in the script, so no tashdeed is expected
{
in: {
p: "ازل لیک",
f: "azalléek",
},
out: "اَزَل لِیک",
},
];
// Register one test per phoneme-split case, titled after the phonetic input,
// so that each input is reported separately.
for (const { in: phonetics, out: expectedPhonemes } of phonemeSplits) {
  test(`${phonetics} should split properly`, () => {
    expect(splitFIntoPhonemes(phonetics)).toEqual(expectedPhonemes);
  });
}
// Run every `diacriticsTest` case as its own test, grouped under the original
// title. Previously all cases ran inside a single test, so the first failing
// expectation aborted the remaining cases and the report did not name the
// failing input. This matches how the phoneme-split cases are registered.
describe("adding diacritics should work", () => {
  diacriticsTest.forEach((t) => {
    // The title carries both script and phonetics so each case is identifiable.
    test(`${t.in.p} (${t.in.f})`, () => {
      // `p` must gain the expected diacritics; `f` must be passed through as-is.
      expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f });
    });
  });
});
// ERROR CASES: inputs that are expected to make the functions throw
// Script/phonetics pairs used by the test that expects `addDiacritics` to
// throw when the two do not line up.
const brokenDiacritics: Array<{ p: string; f: string }> = [
  { p: "تشناب", f: "peshnaab" },
  { p: "وسېدل", f: "osedul" },
];
// Phonetic strings containing a character that is not valid phonetics;
// `problem` is the character expected to be named in the error message.
const badPhonetics: Array<{ in: string; problem: string }> = [
  { in: "acar", problem: "c" },
  { in: "a7am", problem: "7" },
];
// Each bad input must throw an error whose message names the offending character.
test("bad phonetic characters should throw an error", () => {
  for (const { in: phonetics, problem } of badPhonetics) {
    const attemptSplit = () => {
      splitFIntoPhonemes(phonetics);
    };
    expect(attemptSplit).toThrow(`illegal phonetic character: ${problem}`);
  }
});
// The script has a second word that the phonetics do not cover.
test("ending with left over Pashto script will throw an error", () => {
  const scriptLongerThanPhonetics = { p: "کور ته", f: "kor" };
  const attempt = () => {
    addDiacritics(scriptLongerThanPhonetics);
  };
  expect(attempt).toThrow(
    "phonetics error - phonetics shorter than pashto script",
  );
});
// The phonetics have a second word that the script does not cover.
test("ending with left over phonetics will throw an error", () => {
  const phoneticsLongerThanScript = { p: "کار", f: "kaar kawul" };
  const attempt = () => {
    addDiacritics(phoneticsLongerThanScript);
  };
  expect(attempt).toThrow();
});
// Every mismatched script/phonetics pair must make `addDiacritics` throw.
test("adding diacritics errors when phonetecs and pashto do not line up", () => {
  for (const mismatched of brokenDiacritics) {
    const attempt = () => {
      addDiacritics(mismatched);
    };
    expect(attempt).toThrow();
  }
});