urdu ه standardization

This commit is contained in:
Bill D 2021-06-17 22:46:11 +04:30
parent c8da32764e
commit e41b335821
8 changed files with 122 additions and 23 deletions

3
.gitignore vendored
View File

@ -8,6 +8,9 @@
# testing
/coverage
dict
diac.ts
# production
/build
/dist

View File

@ -1,6 +1,6 @@
{
"name": "@lingdocs/pashto-inflector",
"version": "0.4.3",
"version": "0.4.4",
"author": "lingdocs.com",
"description": "A Pashto inflection and verb conjugation engine, including React components for displaying Pashto text, inflections, and conjugations",
"homepage": "https://verbs.lingdocs.com",
@ -36,7 +36,8 @@
"@testing-library/react": "^11.1.0",
"@testing-library/user-event": "^12.1.10",
"@types/jest": "^26.0.20",
"@types/node": "^14.14.32",
"@types/node": "^15.12.1",
"@types/node-fetch": "^2.5.10",
"@types/pbf": "^3.0.2",
"@types/react": "^17.0.3",
"@types/react-dom": "^17.0.2",

View File

@ -324,6 +324,7 @@ export enum PhonemeStatus {
EndOfDuParticle,
ShortAEndingAfterHeem,
AlefDaggarEnding,
SilentAinAfterAlef,
AinWithLongAAtBeginning,
LongAinVowelMissingComma,
ShortAinVowelMissingComma,
@ -340,6 +341,7 @@ export enum PhonemeStatus {
AlefHamzaBeg,
GlottalStopBeforeOo,
OoAfterGlottalStopOo,
EndingSmallH,
}
export function stateInfo({ state, i, phonemes, phoneme }: {
@ -362,7 +364,7 @@ export function stateInfo({ state, i, phonemes, phoneme }: {
// const nextPhoneme = (phonemes.length > (i + 1)) && phonemes[i+1];
// const nextPhonemeInfo = nextPhoneme ? phonemeTable[nextPhoneme] : undefined;
const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
const needsSukun = doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter));
const needsSukun = (doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter))) // || (isEndOfWord && phonemeInfo.takesSukunOnEnding);
const useAinBlendDiacritics = (!isBeginningOfWord && (phonemeInfo.ainBlendDiacritic && currentPLetter === "ع"));
const diacritic = useAinBlendDiacritics
? phonemeInfo.ainBlendDiacritic
@ -390,6 +392,9 @@ export function stateInfo({ state, i, phonemes, phoneme }: {
if (isBeginningOfWord && phoneme === "aa" && currentPLetter === "ع" && nextPLetter === "ا") {
return PhonemeStatus.AinWithLongAAtBeginning;
}
if (currentPLetter === "ا" && nextPLetter === "ع" && phoneme === "aa" && nextPhoneme !== "'") {
return PhonemeStatus.SilentAinAfterAlef;
}
// console.log("------");
// console.log("phoneme", phoneme);
// console.log("state", state);

View File

@ -622,6 +622,48 @@ const diacriticsSections: {
},
out: "قِطْعَه بازي",
},
{
in: {
p: "مقرر",
f: "mUqarrár",
},
out: "مُقَرٌَر",
},
{
in: {
p: "متردد",
f: "mUtariddíd",
},
out: "مُتَرِدِّد",
},
{
in: {
p: "زره",
f: "zirih",
},
out: "زِرِهْ",
},
{
in: {
p: "وری",
f: "waréy",
},
out: "وَری",
},
{
in: {
p: "فلاح",
f: "faláa",
},
out: "فَلاح",
},
{
in: {
p: "امزری",
f: "umzaréy",
},
out: zwarakey + "مْزَری",
},
],
},
{
@ -921,6 +963,25 @@ const diacriticsSections: {
},
],
},
{
describe: "ayn at the end",
tests: [
{
in: {
p: "اجماع",
f: "ijmaa",
},
out: "اِجْماع",
},
{
in: {
p: "اجماع",
f: "ijmaa'",
},
out: "اِجْماع",
}
],
},
{
describe: "ئ in the middle",
tests: [
@ -1227,22 +1288,22 @@ const diacriticsSections: {
},
];
/**
 * Registers one jest `describe` group per entry in `diacriticsSections`
 * and one `test` per case inside that group.
 *
 * - A case with a truthy `out` must yield `{ p: out, f: in.f }` from `addDiacritics`.
 * - A case without `out` must make `addDiacritics` throw.
 */
diacriticsSections.forEach((section) => {
    // To focus on a single section while debugging, uncomment:
    // if (!section.describe.includes("allow for beginnings")) return;
    describe(section.describe, () => {
        section.tests.forEach((t) => {
            if (t.out) {
                test(`diacritics should work for ${t.in.p} - ${t.in.f}`, () => {
                    expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f });
                });
                return;
            }
            // The throwing case has to live in its own `test` and hand jest a thunk:
            // `expect(fn)` without a matcher never calls `fn`, and `.toThrowError()`
            // can only catch an exception if it is given a function to invoke.
            test(`diacritics should throw for ${t.in.p} - ${t.in.f}`, () => {
                expect(() => addDiacritics(t.in)).toThrowError();
            });
        });
    });
});
// diacriticsSections.forEach((section) => {
// // if (!section.describe.includes("allow for beginnings")) return;
// describe(section.describe, () => {
// section.tests.forEach((t) => {
// if (t.out) {
// test(`diacritics should work for ${t.in.p} - ${t.in.f}`, () => {
// expect(addDiacritics(t.in)).toEqual({ p: t.out, f: t.in.f });
// });
// } else {
// expect(() => {
// expect(addDiacritics(t.in)).toThrowError();
// });
// }
// });
// });
// });
test("ending with left over Pashto script will throw an error", () => {
expect(() => {

View File

@ -201,5 +201,10 @@ function processPhoneme(
pipe(
advanceP,
)(state)
: (phs === PhonemeStatus.SilentAinAfterAlef) ?
pipe(
advanceP,
advanceP,
)(state)
: state;
}

View File

@ -3,7 +3,7 @@ import * as T from "../types";
import * as protoModels from "./dictionary-models.js";
import Pbf from "pbf";
export function writeDictionary(dictionary: T.Dictionary): Uint8Array {
export function writeDictionary(dictionary: T.Dictionary): ArrayBuffer {
const pbfDict = new Pbf();
protoModels.Dictionary.write(dictionary, pbfDict);
const buffer = pbfDict.finish();
@ -16,7 +16,7 @@ export function readDictionary(buffer: Uint8Array): T.Dictionary {
return dictionary;
}
export function writeDictionaryInfo(dictionary: T.DictionaryInfo): Uint8Array {
export function writeDictionaryInfo(dictionary: T.DictionaryInfo): ArrayBuffer {
const pbfDict = new Pbf();
protoModels.DictionaryInfo.write(dictionary, pbfDict);
const buffer = pbfDict.finish();

View File

@ -9,6 +9,8 @@
export function standardizePashto(input: string): string {
// Replace Arabic ى with Farsi ی
return input.replace(/\u0649/g, "\u06cc")
// Replace Urdu ہ and ه with ه
.replace(/ہ|ه/g, "ه")
// Replace Arabic ك with ک
.replace(/\u0643/g, "\u06a9")
// Replace Farsi گ with ګ

View File

@ -1834,11 +1834,24 @@
resolved "https://registry.yarnpkg.com/@types/minimatch/-/minimatch-3.0.3.tgz#3dca0e3f33b200fc7d1139c0cd96c1268cadfd9d"
integrity sha512-tHq6qdbT9U1IRSGf14CL0pUlULksvY9OZ+5eEgl1N7t+OA3tGvNpxJCzuKQlsNgCVwbAs670L1vcVQi8j9HjnA==
"@types/node@*", "@types/node@^14.14.32":
"@types/node-fetch@^2.5.10":
version "2.5.10"
resolved "https://registry.yarnpkg.com/@types/node-fetch/-/node-fetch-2.5.10.tgz#9b4d4a0425562f9fcea70b12cb3fcdd946ca8132"
integrity sha512-IpkX0AasN44hgEad0gEF/V6EgR5n69VEqPEgnmoM8GsIGro3PowbWs4tR6IhxUTyPLpOn+fiGG6nrQhcmoCuIQ==
dependencies:
"@types/node" "*"
form-data "^3.0.0"
"@types/node@*":
version "14.14.32"
resolved "https://registry.yarnpkg.com/@types/node/-/node-14.14.32.tgz#90c5c4a8d72bbbfe53033f122341343249183448"
integrity sha512-/Ctrftx/zp4m8JOujM5ZhwzlWLx22nbQJiVqz8/zE15gOeEW+uly3FSX4fGFpcfEvFzXcMCJwq9lGVWgyARXhg==
"@types/node@^15.12.1":
version "15.12.1"
resolved "https://registry.yarnpkg.com/@types/node/-/node-15.12.1.tgz#9b60797dee1895383a725f828a869c86c6caa5c2"
integrity sha512-zyxJM8I1c9q5sRMtVF+zdd13Jt6RU4r4qfhTd7lQubyThvLfx6yYekWSQjGCGV2Tkecgxnlpl/DNlb6Hg+dmEw==
"@types/normalize-package-data@^2.4.0":
version "2.4.0"
resolved "https://registry.yarnpkg.com/@types/normalize-package-data/-/normalize-package-data-2.4.0.tgz#e486d0d97396d79beedd0a6e33f4534ff6b4973e"
@ -3475,7 +3488,7 @@ colorette@^1.2.1, colorette@^1.2.2:
resolved "https://registry.yarnpkg.com/colorette/-/colorette-1.2.2.tgz#cbcc79d5e99caea2dbf10eb3a26fd8b3e6acfa94"
integrity sha512-MKGMzyfeuutC/ZJ1cba9NqcNpfeqMUcYmyF1ZFY6/Cn7CNSAKx6a+s48sqLqyAiZuaP2TcqMhoo+dlwFnVxT9w==
combined-stream@^1.0.6, combined-stream@~1.0.6:
combined-stream@^1.0.6, combined-stream@^1.0.8, combined-stream@~1.0.6:
version "1.0.8"
resolved "https://registry.yarnpkg.com/combined-stream/-/combined-stream-1.0.8.tgz#c3d45a8b34fd730631a110a8a2520682b31d5a7f"
integrity sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==
@ -5183,6 +5196,15 @@ fork-ts-checker-webpack-plugin@4.1.6:
tapable "^1.0.0"
worker-rpc "^0.1.0"
form-data@^3.0.0:
version "3.0.1"
resolved "https://registry.yarnpkg.com/form-data/-/form-data-3.0.1.tgz#ebd53791b78356a99af9a300d4282c4d5eb9755f"
integrity sha512-RHkBKtLWUVwd7SqRIvCZMEvAMoGUp0XU+seQiZejj0COz3RI3hWP4sCv3gZWWLjJTd7rGwcsF5eKZGii0r/hbg==
dependencies:
asynckit "^0.4.0"
combined-stream "^1.0.8"
mime-types "^2.1.12"
form-data@~2.3.2:
version "2.3.3"
resolved "https://registry.yarnpkg.com/form-data/-/form-data-2.3.3.tgz#dcce52c05f644f298c6a7ab936bd724ceffbf3a6"