pashto-inflector/src/lib/diacritics.test.ts

803 lines
20 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Copyright (c) 2021 lingdocs.com
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*
*/
import {
addDiacritics,
} from "./diacritics";
import {
zwar,
zwarakey,
zer,
pesh,
sukun,
hamzaAbove,
tashdeed,
wasla,
daggerAlif,
fathahan,
} from "./diacritics-helpers";
import * as T from "../types";
const diacriticsSections: {
describe: string,
tests: {
in: T.PsString,
out: string | null,
}[],
}[] = [
{
describe: "regular, native Pashto script/sounds",
tests: [
{
in: {
p: "کور",
f: "kor",
},
out: "کور",
},
{
in: {
p: "کور",
f: "koor",
},
out: "کُور",
},
{
in: {
p: "کور کور",
f: "kor koor",
},
out: "کور کُور",
},
{
in: {
p: "تب",
f: "tib",
},
out: "تِب",
},
{
in: {
p: "تب",
f: "tab",
},
out: "تَب",
},
{
in: {
p: "تب",
f: "tUb",
},
out: "تُب",
},
{
in: {
p: "تب",
f: "tub",
},
out: "تٙب",
},
{
in: {
p: "تب",
f: "tb",
},
out: "تْب",
},
{
in: {
p: "تلب",
f: "tilab",
},
out: "تِلَب",
},
{
in: {
p: "تشناب",
f: "tashnaab",
},
out: "تَشْناب",
},
{
in: {
p: "پسته",
f: "pasta",
},
out: "پَسْتَه",
},
// working with وs
{
in: {
p: "کول",
f: "kwal",
},
out: "کْوَل",
},
{
in: {
p: "تول",
f: "tool",
},
out: "تُول",
},
{
in: {
p: "مقبول",
f: "maqbool",
},
out: "مَقْبُول",
},
{
in: {
p: "کول",
f: "kawul",
},
out: "کَو" + zwarakey + "ل",
},
{
in: {
p: "کول",
f: "kiwul",
},
out: "کِو" + zwarakey + "ل",
},
{
in: {
p: "کول",
f: "kUwul",
},
out: "کُو" + zwarakey + "ل",
},
{
in: {
p: "کول",
f: "kuwul",
},
out: "ک" + zwarakey + "و" + zwarakey + "ل",
},
{
in: {
p: "کول",
f: "kawal",
},
out: "کَوَل",
},
{
in: {
p: "کول",
f: "kUwal",
},
out: "کُوَل",
},
{
in: {
p: "پشتګرد",
f: "pishtgird",
},
out: "پِشْتْګِرْد",
},
{
in: {
p: "سپین",
f: "speen",
},
out: "سْپِین",
},
{
in: {
p: "سپین",
f: "speyn",
},
out: "سْپین",
},
{
in: {
p: "پېش",
f: "pesh",
},
out: "پېش",
},
{
in: {
p: "لیک",
f: "leek",
},
out: "لِیک",
},
{
in: {
p: "رغېدل",
f: "raghedul",
},
out: "رَغېد" + zwarakey + "ل",
},
{
in: {
p: "کارول",
f: "kaarawul",
},
out: "کارَو" + zwarakey + "ل",
},
{
in: {
p: "پېښېدل",
f: "pexedul",
},
out: "پېښېد" + zwarakey + "ل",
},
{
in: {
p: "مین",
f: "mayín",
},
out: "مَیِن",
},
{
in: {
p: "سړی",
f: "saRey",
},
out: "سَړی",
},
{
in: {
p: "سړي",
f: "saRee",
},
out: "سَړي",
},
{
in: {
p: "زه",
f: "zu",
},
out: "زهٔ",
},
{
in: {
p: "زه",
f: "za",
},
out: "زَه",
},
{
in: {
p: "پېشنهاد",
f: "peshniháad",
},
out: "پېشْنِهاد",
},
{
in: {
p: "ایستل",
f: "eestul",
},
out: "اِیسْت" + zwarakey + "ل",
},
{
in: {
p: "ایستل",
f: "eystul",
},
out: "ایسْت" + zwarakey + "ل",
},
{
in: {
p: "اېسېدل",
f: "esedul",
},
out: "اېسېد" + zwarakey + "ل",
},
{
in: {
p: "اوسېدل",
f: "osedul",
},
out: "اوسېد" + zwarakey + "ل",
},
{
in: {
p: "اواز",
f: "awaaz",
},
out: "اَواز",
},
{
in: {
p: "اسلام",
f: "islaam",
},
out: "اِسْلام",
},
{
in: {
p: "واردول",
f: "waaridawul",
},
out: "وارِدَو" + zwarakey + "ل",
},
{
in: {
p: "غاړه",
f: "ghaaRa",
},
out: "غاړَه",
},
{
in: {
p: "اوتر",
f: "awtár",
},
out: "اَوْتَر",
},
{
in: {
p: "اختیار",
f: "ikhtiyáar",
},
out: "اِخْتِیار",
},
{
in: {
p: "فریاد",
f: "faryáad",
},
out: "فَرْیاد",
},
{
in: {
p: "کارغه",
f: "kaarghu",
},
out: "کارْغهٔ",
},
{
in: {
p: "بې کار",
f: "be kaar",
},
out: "بې کار",
},
{
in: {
p: "بې کار",
f: "bekaar",
},
out: "بې کار",
},
{
in: {
p: "ارغون",
f: "arghóon",
},
out: "اَرْغُون",
},
{
in: {
p: "ارمټه",
f: "armaTa",
},
out: "اَرْمَټَه",
},
{
in: {
p: "اروا پوه",
f: "arwaa poh",
},
out: "اَرْوا پوهْ",
},
// starting alefs
{
in: {
p: "اسلام",
f: "islaam",
},
out: "اِسْلام",
},
// starting long vowels with ا
{
in: {
p: "ایسار",
f: "eesaar",
},
out: "اِیسار",
},
// double consonant / tashdeed
{
in: {
p: "بتن",
f: "battan",
},
out: "ب" + zwar + "ت" + tashdeed + zwar + "ن",
},
{
in: {
p: "بتطن",
f: "battan",
},
out: "ب" + zwar + "ت" + sukun + "ط" + zwar + "ن",
},
// vowel endings working
{
in: {
p: "بته",
f: "bata",
},
out: "بَتَه",
},
{
in: {
p: "بته",
f: "bati",
},
out: "بَتِه",
},
{
in: {
p: "پرمختیا",
f: "parmakhtyaa",
},
out: "پَرْمَخْتْیا",
},
{
in: {
p: "پته",
f: "patta",
},
out: "پَتَّه",
},
{
in: {
p: "پته تور",
f: "patta toor",
},
out: "پَتَّه تُور",
},
// avoid false double consonant
{
in: {
p: "ازل لیک",
f: "azalléek",
},
out: "اَزَل لِیک",
},
],
},
{
describe: "nm - mb thing",
tests: [
{
in: {
p: "انبار",
f: "ambáar",
},
out: "اَنْبار",
},
],
},
{
describe: "ayn stuff",
tests: [
{
in: {
p: "بعد",
f: "ba'd",
},
out: "بَعْد",
},
{
in: {
p: "بعد",
f: "b'ad",
},
out: "بْعَد",
},
{
in: {
p: "بعد",
f: "ba'ad",
},
out: "بَعَد",
},
{
in: {
p: "بعد",
f: "baad",
},
out: "بَعَد",
},
{
in: {
p: "بعد",
f: "bad",
},
out: "بَعد",
},
{
in: {
p: "معلوم",
f: "maaloom",
},
out: "مَعَلُوم",
},
{
in: {
p: "منبع",
f: "manbi'",
},
out: "مَنْبِع",
},
{
in: {
p: "منبع",
f: "manb'i",
},
out: "مَنْبْعِ"
},
{
in: {
p: "منبع",
f: "manbee",
},
out: "مَنْبِعِ",
},
{
in: {
p: "منبع",
f: "manbi",
},
out: "مَنْبِع"
},
{
in: {
p: "معنا",
f: "ma'náa",
},
out: "مَعْنا",
},
{
in: {
p: "معنا",
f: "maanáa",
},
out: "مَعَنا",
},
// طمع - tama // استعمال - istimaal
// TODO: Starting like عام اعتصاب etc.
// {
// in: {
// p: "اعتصاب شکن",
// f: "itisaab shakan",
// },
// out: "اِعتِصاب شَکَن",
// },
// // starting with ع
// {
// in: {
// p: "عزت",
// f: "izzat",
// },
// out: "عِزَّت",
// },
// {
// in: {
// p: "عزت",
// f: "i'zzat",
// },
// out: "عِزَّت",
// },
// // middle ع
// {
// in: {
// p: "معنا",
// f: "ma'anaa",
// },
// out: "مَعَنا",
// },
// ending with ayn
// {
// in: {
// p: "طمع کېدل",
// f: "tama kedul",
// },
// out: "طَمَع کېد" + zwarakey + "ل",
// },
// {
// in: {
// p: "منبع",
// f: "manbí",
// },
// out: "مَنْبِع",
// },
],
},
{
describe: "ئ in the middle",
tests: [
{
in: {
p: "برائت",
f: "baraa'at",
},
out: "بَرائَت",
},
{
in: {
p: "فائده",
f: "faaida",
},
out: "فائِدَه",
},
],
},
{
describe: "واخ being khaa in the middle of a word",
tests: [
{
in: {
p: "استخوان",
f: "UstUkháan",
},
out: "اُسْتُخ(و)ان",
},
],
},
{
describe: "Arabic wasla",
tests: [
{
in: {
p: "بالکل",
f: "bilkUl",
},
out: "بِٱلْکُل",
},
],
},
{
describe: "izafe",
tests: [
{
in: {
p: "ایصال ثواب",
f: "eesaal-i-sawaab",
},
out: "اِیصالِ ثَواب",
},
],
},
{
describe: "special behaviour with د",
tests: [
{
in: {
p: "د",
f: "du",
},
out: "د" + zwarakey,
},
{
in: {
p: "د لاس",
f: "du laas",
},
out: "د" + zwarakey + " لاس",
},
{
in: {
p: "د ... په شان",
f: "du ... pu shaan",
},
out: "د" + zwarakey + " ... پهٔ شان",
},
],
},
{
describe: "ha ending with ح",
tests: [
// {
// in: {
// p: "ذبح",
// f: "zabha",
// },
// out: "ذَبْحَ",
// },
{
in: {
p: "ذبح کول",
f: "zabha kawul",
},
out: "ذَبْحَ کَو" + zwarakey + "ل",
},
],
},
{
describe: "require dagger alif on words ending with یٰ",
tests: [
{
in: {
p: "یحیی",
f: "yahyaa",
},
out: null,
},
{
in: {
p: "یحییٰ",
f: "yahyaa",
},
out: "یَحْییٰ",
},
{
in: {
p: "یحییٰ چېرته",
f: "yahyaa cherta",
},
out: "یَحْییٰ چېرْتَه",
},
{
in: {
p: "معنیٰ",
f: "ma'anaa",
},
out: "مَعَنیٰ",
},
],
}
];
diacriticsSections.forEach((section) => {
describe(section.describe, () => {
section.tests.forEach((t) => {
if (t.out) {
test(`diacritics should work for ${t.in.p} - ${t.in.f}`, () => {
expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f });
});
} else {
expect(() => {
expect(addDiacritics(t.in)).toThrowError();
});
}
});
});
});
// ERRORS
const brokenDiacritics = [
{
p: "تشناب",
f: "peshnaab",
},
{
p: "وسېدل",
f: "osedul",
},
];
test("ending with left over Pashto script will throw an error", () => {
expect(() => {
addDiacritics({ p: "کور ته", f: "kor" });
}).toThrow(`phonetics error - phonetics shorter than pashto script`);
});
test("ending with left over phonetics will throw an error", () => {
expect(() => {
addDiacritics({ p: "کار", f: "kaar kawul" });
}).toThrow();
});
test("adding diacritics errors when phonetecs and pashto do not line up", () => {
brokenDiacritics.forEach((t) => {
expect(() => {
addDiacritics(t);
}).toThrow();
});
});