From c0cd34c3d662f407134e602ee359d7b957382d53 Mon Sep 17 00:00:00 2001 From: adueck Date: Thu, 27 Jul 2023 12:28:50 +0400 Subject: [PATCH] phonetics conversion done --- src/components/src/blocks/Block.tsx | 1038 +++++--- src/lib/src/diacritics-helpers.test.ts | 210 +- src/lib/src/diacritics-helpers.ts | 1190 +++++---- src/lib/src/diacritics.test.ts | 2560 +++++++++---------- src/lib/src/diacritics.ts | 299 +-- src/lib/src/phonetics-to-diacritics.test.ts | 2195 ++++++++-------- src/lib/src/phonetics-to-diacritics.ts | 473 +++- src/lib/src/sandwiches.ts | 268 +- src/lib/src/translate-phonetics-replacer.ts | 17 +- src/lib/src/translate-phonetics.test.ts | 35 +- src/lib/src/validate-entry.test.ts | 623 +++-- 11 files changed, 4890 insertions(+), 4018 deletions(-) diff --git a/src/components/src/blocks/Block.tsx b/src/components/src/blocks/Block.tsx index ab1f7f9..f58443a 100644 --- a/src/components/src/blocks/Block.tsx +++ b/src/components/src/blocks/Block.tsx @@ -1,134 +1,226 @@ import * as T from "../../../types"; import classNames from "classnames"; +import { getEnglishFromRendered } from "../../../lib/src/phrase-building/np-tools"; import { - getEnglishFromRendered, -} from "../../../lib/src/phrase-building/np-tools"; -import { getEnglishPersonInfo, getEnglishParticipleInflection, getEnglishGenNumInfo } from "../../../lib/src/misc-helpers"; + getEnglishPersonInfo, + getEnglishParticipleInflection, + getEnglishGenNumInfo, +} from "../../../lib/src/misc-helpers"; import { useState } from "react"; import { getLength } from "../../../lib/src/p-text-helpers"; import { roleIcon } from "../vp-explorer/VPExplorerExplanationModal"; import { negativeParticle } from "../../../lib/src/grammar-units"; -function Block({ opts, block, king, script }: { - opts: T.TextOptions, - block: T.Block, - king?: "subject" | "object" | undefined, - script: "p" | "f"; +function Block({ + opts, + block, + king, + script, +}: { + opts: T.TextOptions; + block: T.Block; + king?: "subject" | "object" | undefined; + script: "p" | "f"; }) { - if ("equative" in block.block) { - return ; - } - if (block.block.type === "AP") { - const english = getEnglishFromRendered(block.block); - return {block.block} - } - if (block.block.type === "subjectSelection") { - const role = king === "subject" ? "king" : king === "object" ? "servant" : undefined; - return - } - if (block.block.type === "objectSelection") { - const role = king === "object" ? "king" : king === "subject" ? "servant" : undefined; - return ; - } - if (block.block.type === "predicateSelection") { - const english = getEnglishFromRendered(block.block.selection); - return
-
Predicate
- {block.block.selection.type === "complement" - ? - : {block.block.selection}} + if ("equative" in block.block) { + return ( + + ); + } + if (block.block.type === "AP") { + const english = getEnglishFromRendered(block.block); + return ( + + {block.block} + + ); + } + if (block.block.type === "subjectSelection") { + const role = + king === "subject" ? "king" : king === "object" ? "servant" : undefined; + return ( + + ); + } + if (block.block.type === "objectSelection") { + const role = + king === "object" ? "king" : king === "subject" ? "servant" : undefined; + return ( + + ); + } + if (block.block.type === "predicateSelection") { + const english = getEnglishFromRendered(block.block.selection); + return ( +
+
+ Predicate
- } - if (block.block.type === "negative") { - return - } - if (block.block.type === "PH") { - return ; - } - if (block.block.type === "VB") { - return ; - } - if (block.block.type === "complement") { - return - } - if (block.block.type === "NComp") { - return - } - return + {block.block.selection.type === "complement" ? ( + + ) : ( + + {block.block.selection} + + )} +
+ ); + } + if (block.block.type === "negative") { + return ( + + ); + } + if (block.block.type === "PH") { + return ; + } + if (block.block.type === "VB") { + return ; + } + if (block.block.type === "complement") { + return ( + + ); + } + if (block.block.type === "NComp") { + return ; + } + return ; } export default Block; -function Border({ children, extraClassName, padding }: { children: JSX.Element | JSX.Element[] | string, extraClassName?: string, padding?: string }) { - return
- <>{children} + <>{children}
+ ); } -function VBBlock({ opts, block, script }: { - opts: T.TextOptions, - script: "p" | "f", - block: T.VBBasic | T.VBGenNum | (T.VBBasic & { +function VBBlock({ + opts, + block, + script, +}: { + opts: T.TextOptions; + script: "p" | "f"; + block: + | T.VBBasic + | T.VBGenNum + | (T.VBBasic & { person: T.Person; - }), + }); }) { - const [length, setLength] = useState("long"); - const [version, setVersion] = useState(0); - const ps = getLength(block.ps, length); - function changeVersion() { - setVersion(o => (o + 1) % ps.length); - } - function changeLength() { - setLength(o => ( - o === "long" - ? "short" - : o === "short" && "mini" in block.ps - ? "mini" - : "long" - )); - } - const infInfo = "gender" in block - ? getEnglishGenNumInfo(block.gender, block.number) - : "person" in block - ? getEnglishPersonInfo(block.person, "short") - : ""; - return
-
- {"long" in block.ps &&
{length}
} - {ps.length > 1 &&
v. {version + 1}
} -
- - <> - {ps[version][script]} - - -
VBlock
- {infInfo} + const [length, setLength] = useState("long"); + const [version, setVersion] = useState(0); + const ps = getLength(block.ps, length); + function changeVersion() { + setVersion((o) => (o + 1) % ps.length); + } + function changeLength() { + setLength((o) => + o === "long" + ? "short" + : o === "short" && "mini" in block.ps + ? "mini" + : "long" + ); + } + const infInfo = + "gender" in block + ? getEnglishGenNumInfo(block.gender, block.number) + : "person" in block + ? getEnglishPersonInfo(block.person, "short") + : ""; + return ( +
+
+ {"long" in block.ps && ( +
+ {length} +
+ )} + {ps.length > 1 && ( +
+ v. {version + 1} +
+ )} +
+ + <>{ps[version][script]} + +
VBlock
+ {infInfo}
+ ); } -function WeldedBlock({ opts, welded, script }: { - opts: T.TextOptions, - script: "p" | "f", - welded: T.Welded, +function WeldedBlock({ + opts, + welded, + script, +}: { + opts: T.TextOptions; + script: "p" | "f"; + welded: T.Welded; }) { - return
- - {welded.left.type === "NComp" - ? - : welded.left.type === "VB" - ? - : } - - + return ( +
+ + {welded.left.type === "NComp" ? ( + + ) : welded.left.type === "VB" ? ( + + ) : ( + + )} + +
+ ); } // function VerbSBlock({ opts, v, script }: { @@ -192,19 +284,22 @@ function WeldedBlock({ opts, welded, script }: { //
// } -function PerfHeadBlock({ opts, ps, script }: { - opts: T.TextOptions, - ps: T.PsString, - script: "p" | "f", - +function PerfHeadBlock({ + opts, + ps, + script, +}: { + opts: T.TextOptions; + ps: T.PsString; + script: "p" | "f"; }) { - return
- - {ps[script]} - -
perf. head
- {'\u00A0'} -
; + return ( +
+ {ps[script]} +
perf. head
+ {"\u00A0"} +
+ ); } // function ModalAuxBlock({ opts, aux, script }: { @@ -222,313 +317,498 @@ function PerfHeadBlock({ opts, ps, script }: { //
; // } -function NegBlock({ opts, imperative, script }: { - opts: T.TextOptions, - imperative: boolean, - script: "p" | "f", +function NegBlock({ + opts, + imperative, + script, +}: { + opts: T.TextOptions; + imperative: boolean; + script: "p" | "f"; }) { - return
- - {negativeParticle[imperative ? "imperative" : "nonImperative"][script]} - -
Neg.
- {imperative ? "don't" : "not"} -
; + return ( +
+ + {negativeParticle[imperative ? "imperative" : "nonImperative"][script]} + +
Neg.
+ {imperative ? "don't" : "not"} +
+ ); } -function EquativeBlock({ opts, eq, script }: { - opts: T.TextOptions, - eq: T.EquativeRendered, - script: "p" | "f", +function EquativeBlock({ + opts, + eq, + script, +}: { + opts: T.TextOptions; + eq: T.EquativeRendered; + script: "p" | "f"; }) { - const [length, setLength] = useState("long"); - function changeLength() { - setLength(o => ( - o === "long" - ? "short" - : o === "short" && "mini" in eq.ps - ? "mini" - : "long" - )); - } - return
- {"long" in eq.ps &&
{length}
} - - {getLength(eq.ps, length)[0][script]} - -
Equative
- {getEnglishPersonInfo(eq.person, "short")} -
; + const [length, setLength] = useState("long"); + function changeLength() { + setLength((o) => + o === "long" + ? "short" + : o === "short" && "mini" in eq.ps + ? "mini" + : "long" + ); + } + return ( +
+ {"long" in eq.ps && ( +
+ {length} +
+ )} + {getLength(eq.ps, length)[0][script]} +
Equative
+ {getEnglishPersonInfo(eq.person, "short")} +
+ ); } -function SubjectBlock({ opts, np, role, script }: { - opts: T.TextOptions, - np: T.Rendered, - role: "king" | "servant" | undefined, - script: "p" | "f", +function SubjectBlock({ + opts, + np, + role, + script, +}: { + opts: T.TextOptions; + np: T.Rendered; + role: "king" | "servant" | undefined; + script: "p" | "f"; }) { - const english = getEnglishFromRendered(np); - return
-
Subject{role ? roleIcon[role] : ""}
- {np} -
; + const english = getEnglishFromRendered(np); + return ( +
+
+ Subject + {role ? roleIcon[role] : ""} +
+ + {np} + +
+ ); } -function ObjectBlock({ opts, obj, role, script }: { - opts: T.TextOptions, - obj: T.Rendered["selection"], - role: "king" | "servant" | undefined, - script: "p" | "f", +function ObjectBlock({ + opts, + obj, + role, + script, +}: { + opts: T.TextOptions; + obj: T.Rendered["selection"]; + role: "king" | "servant" | undefined; + script: "p" | "f"; }) { - if (typeof obj !== "object") { - return null; - } - const english = getEnglishFromRendered(obj); - return
-
Object{role ? roleIcon[role] : ""}
- {obj} -
; + if (typeof obj !== "object") { + return null; + } + const english = getEnglishFromRendered(obj); + return ( +
+
+ Object + {role ? roleIcon[role] : ""} +
+ + {obj} + +
+ ); } -function NCompBlock({ opts, comp, script }: { - script: "p" | "f", - opts: T.TextOptions, - comp: T.Comp, +function NCompBlock({ + opts, + comp, + script, +}: { + script: "p" | "f"; + opts: T.TextOptions; + comp: T.Comp; }) { - return
- - {comp.ps[script]} - - {comp.type === "AdjComp" - ?
adj. {getEnglishGenNumInfo(comp.gender, comp.number)}
- :
TODO
} - - todo - {/* {adj.e} */} - -
; + return ( +
+ {comp.ps[script]} + {comp.type === "AdjComp" && ( +
+
+ adj.{" "} + + {getEnglishGenNumInfo(comp.gender, comp.number)} + +
+ {comp.ps.e} +
+ )} +
+ ); } -function ComplementBlock({ opts, comp, script, inside }: { - script: "p" | "f", - opts: T.TextOptions, - comp: T.Rendered | T.Rendered["selection"], - inside?: boolean, +function ComplementBlock({ + opts, + comp, + script, + inside, +}: { + script: "p" | "f"; + opts: T.TextOptions; + comp: + | T.Rendered + | T.Rendered["selection"]; + inside?: boolean; }) { - function AdjectiveBlock({ opts, adj }: { - opts: T.TextOptions, - adj: T.Rendered, - }) { - return
- - {adj.ps[0][script]} - -
Adj. ({getEnglishParticipleInflection(adj.person, "short")})
- {adj.e} -
; - } + function AdjectiveBlock({ + opts, + adj, + }: { + opts: T.TextOptions; + adj: T.Rendered; + }) { + return ( +
+ {adj.ps[0][script]} +
+ Adj.{" "} + + ({getEnglishParticipleInflection(adj.person, "short")}) + +
+ {adj.e} +
+ ); + } - function LocAdvBlock({ opts, adv }: { - opts: T.TextOptions, - adv: T.Rendered, - }) { - return
- - {adv.ps[0][script]} - -
Loc. Adv.
- {adv.e} -
; - } - return
-
Complement
- {comp.type === "adjective" - ? - : comp.type === "loc. adv." - ? - : comp.type === "noun" - ? - : comp.type === "unselected" - ?
- - ____ - - {!inside && <> -
 
- {comp.e} - } -
- :
- -
Sandwich
- {comp.e} -
} -
; + function LocAdvBlock({ + opts, + adv, + }: { + opts: T.TextOptions; + adv: T.Rendered; + }) { + return ( +
+ {adv.ps[0][script]} +
Loc. Adv.
+ {adv.e} +
+ ); + } + return ( +
+
Complement
+ {comp.type === "adjective" ? ( + + ) : comp.type === "loc. adv." ? ( + + ) : comp.type === "noun" ? ( + + ) : comp.type === "unselected" ? ( +
+ ____ + {!inside && ( + <> +
 
+ {comp.e} + + )} +
+ ) : ( +
+ +
Sandwich
+ {comp.e} +
+ )} +
+ ); } -export function APBlock({ opts, children, english, script }: { - opts: T.TextOptions, - children: T.Rendered, - english?: string, - script: "p" | "f", +export function APBlock({ + opts, + children, + english, + script, +}: { + opts: T.TextOptions; + children: T.Rendered; + english?: string; + script: "p" | "f"; }) { - const ap = children; - if (ap.selection.type === "adverb") { - return
- - {ap.selection.ps[0][script]} - -
AP
- {english} -
; - } - return
- + const ap = children; + if (ap.selection.type === "adverb") { + return ( +
+ {ap.selection.ps[0][script]}
AP
{english} -
; -} - -function Sandwich({ opts, sandwich, script }: { - opts: T.TextOptions, - sandwich: T.Rendered>, - script: "p" | "f", -}) { - return
-
Sandwich 🥪
- -
- {sandwich.inside.selection.type !== "pronoun" ? sandwich.inside.selection.possesor : undefined} -
{sandwich.before ? sandwich.before.f : ""}
-
- {sandwich.inside} -
-
{sandwich.after ? sandwich.after.f : ""}
-
-
-
; -} - -function CompNounBlock({ opts, noun, script }: { - opts: T.TextOptions, - noun: T.Rendered, - script: "p" | "f", -}) { - return
- - {noun.ps[0][script]} - -
- Comp. Noun -
- {noun.e} +
+ ); + } + return ( +
+ +
AP
+ {english}
+ ); } -export function NPBlock({ opts, children, inside, english, script }: { - opts: T.TextOptions, - children: T.Rendered, - inside?: boolean, - english?: string, - script: "p" | "f", +function Sandwich({ + opts, + sandwich, + script, +}: { + opts: T.TextOptions; + sandwich: T.Rendered>; + script: "p" | "f"; }) { - const np = children; - const hasPossesor = !!(np.selection.type !== "pronoun" && np.selection.possesor && !np.selection.possesor.shrunken); - const elements = [ - ...!inside ? [{np.selection.type !== "pronoun" ? np.selection.possesor : undefined}] : [], - {np.selection.adjectives}, -
{np.selection.ps[0][script]}
, - ]; - const el = script === "p" ? elements.reverse() : elements; - return
- +
Sandwich 🥪
+ +
- {el} - -
- NP - {!inside ? <> - {` `} - ({getEnglishPersonInfo(np.selection.person, "short")}) - : <>} + + {sandwich.inside.selection.type !== "pronoun" + ? sandwich.inside.selection.possesor + : undefined} + +
+ {sandwich.before ? sandwich.before.f : ""} +
+
+ + {sandwich.inside} + +
+
+ {sandwich.after ? sandwich.after.f : ""} +
- {!inside && {english}} +
+ ); } -function Possesors({ opts, children, script }: { - opts: T.TextOptions, - children: { shrunken: boolean, np: T.Rendered } | undefined, - script: "p" | "f", +function CompNounBlock({ + opts, + noun, + script, +}: { + opts: T.TextOptions; + noun: T.Rendered; + script: "p" | "f"; }) { - if (!children) { - return null; - } - if (children.shrunken) { - return null; - } - const contraction = checkForContraction(children.np, script); - return
+ + {noun.ps[0][script]} + +
Comp. Noun
+ {noun.e} +
+ ); +} + +export function NPBlock({ + opts, + children, + inside, + english, + script, +}: { + opts: T.TextOptions; + children: T.Rendered; + inside?: boolean; + english?: string; + script: "p" | "f"; +}) { + const np = children; + const hasPossesor = !!( + np.selection.type !== "pronoun" && + np.selection.possesor && + !np.selection.possesor.shrunken + ); + const elements = [ + ...(!inside + ? [ + + {np.selection.type !== "pronoun" + ? np.selection.possesor + : undefined} + , + ] + : []), + + {np.selection.adjectives} + , +
+ {" "} + {np.selection.ps[0][script]} +
, + ]; + const el = script === "p" ? elements.reverse() : elements; + return ( +
+ + {el} + +
+ NP + {!inside ? ( + <> + {` `} + + ({getEnglishPersonInfo(np.selection.person, "short")}) + + + ) : ( + <> + )} +
+ {!inside && {english}} +
+ ); +} + +function Possesors({ + opts, + children, + script, +}: { + opts: T.TextOptions; + children: { shrunken: boolean; np: T.Rendered } | undefined; + script: "p" | "f"; +}) { + if (!children) { + return null; + } + if (children.shrunken) { + return null; + } + const contraction = checkForContraction(children.np, script); + return ( +
- {children.np.selection.type !== "pronoun" && {children.np.selection.possesor}} -
- {contraction &&
({contraction})
} -
-
{script === "p" ? "د" : "du"}
-
- {children.np} -
-
- + }} + > + {children.np.selection.type !== "pronoun" && ( + + {children.np.selection.possesor} + + )} +
+ {contraction &&
({contraction})
} +
+
{script === "p" ? "د" : "du"}
+
+ + {children.np} + +
+
+ ); } -function Adjectives({ opts, children, script }: { - opts: T.TextOptions, - children: T.Rendered[] | undefined, - script: "p" | "f", +function Adjectives({ + opts, + children, + script, +}: { + opts: T.TextOptions; + children: T.Rendered[] | undefined; + script: "p" | "f"; }) { - if (!children) { - return null; - } - const c = script === "p" - ? children.reverse() - : children; - return - {c.map(a => a.ps[0][script]).join(" ")}{` `} + if (!children) { + return null; + } + const c = script === "p" ? children.reverse() : children; + return ( + + {c.map((a) => a.ps[0][script]).join(" ")} + {` `} + ); } function SubText({ children: e }: { children: string | undefined }) { - return
{e ? e : ""}
; + }} + > + {e ? e : ""} +
+ ); } -function checkForContraction(np: T.Rendered, script: "p" | "f"): string | undefined { - if (np.selection.type !== "pronoun") return undefined; - if (np.selection.person === T.Person.FirstSingMale || np.selection.person === T.Person.FirstSingFemale) { - return script === "f" ? "zmaa" : "زما"; - } - if (np.selection.person === T.Person.SecondSingMale || np.selection.person === T.Person.SecondSingFemale) { - return script === "f" ? "staa" : "ستا"; - } - if (np.selection.person === T.Person.FirstPlurMale || np.selection.person === T.Person.FirstPlurFemale) { - return script === "f" ? "zmoonG" : "زمونږ"; - } - if (np.selection.person === T.Person.SecondPlurMale || np.selection.person === T.Person.SecondPlurFemale) { - return script === "f" ? "staaso" : "ستاسو"; - } - return undefined; +function checkForContraction( + np: T.Rendered, + script: "p" | "f" +): string | undefined { + if (np.selection.type !== "pronoun") return undefined; + if ( + np.selection.person === T.Person.FirstSingMale || + np.selection.person === T.Person.FirstSingFemale + ) { + return script === "f" ? "zmaa" : "زما"; + } + if ( + np.selection.person === T.Person.SecondSingMale || + np.selection.person === T.Person.SecondSingFemale + ) { + return script === "f" ? "staa" : "ستا"; + } + if ( + np.selection.person === T.Person.FirstPlurMale || + np.selection.person === T.Person.FirstPlurFemale + ) { + return script === "f" ? "zmoonG" : "زمونږ"; + } + if ( + np.selection.person === T.Person.SecondPlurMale || + np.selection.person === T.Person.SecondPlurFemale + ) { + return script === "f" ? "staaso" : "ستاسو"; + } + return undefined; } - diff --git a/src/lib/src/diacritics-helpers.test.ts b/src/lib/src/diacritics-helpers.test.ts index a827ee0..9e4f247 100644 --- a/src/lib/src/diacritics-helpers.test.ts +++ b/src/lib/src/diacritics-helpers.test.ts @@ -1,131 +1,133 @@ import { - splitFIntoPhonemes, - last, - addP, - lastNonWhitespace, - advanceP, - reverseP, - overwriteP, - advanceForHamza, - advanceForHamzaMid, + splitFIntoPhonemes, + last, + addP, + lastNonWhitespace, + reverseP, } from "./diacritics-helpers"; const phonemeSplits: Array<{ - in: string, - out: string[], + in: string; + out: string[]; }> = [ - { - in: "kor", - out: ["k", "o", "r"], - }, - { - in: "raaghey", - out: ["r", "aa", "gh", "ey"], - }, - { - in: "ist'imaal", - out: ["i", "s", "t", "'", "i", "m", "aa", "l"], - }, - { - in: "hatsa", - out: ["h", "a", "ts", "a"], - }, - { - in: "ba", - out: ["b", "a"], - }, - { - in: "peydáa", - out: ["p", "ey", "d", "aa"], - }, - { - in: "be kaar", - out: ["b", "e", "k", "aa", "r"], - }, - { - in: "raadzeyy", - out: ["r", "aa", "dz", "eyy"], - }, - { - in: "badanuy ??", - out: ["b", "a", "d", "a", "n", "uy"], - }, - { - in: "tur ... pore", - out: ["t", "u", "r", "p", "o", "r", "e"], - }, - { - in: "daar-Ul-iqaama", - out: ["d", "aa", "r", "-Ul-", "i", "q", "aa", "m", "a"], - }, + { + in: "kor", + out: ["k", "o", "r"], + }, + { + in: "raaghay", + out: ["r", "aa", "gh", "ay"], + }, + { + in: "ist'imaal", + out: ["i", "s", "t", "'", "i", "m", "aa", "l"], + }, + { + in: "hatsa", + out: ["h", "a", "ts", "a"], + }, + { + in: "ba", + out: ["b", "a"], + }, + { + in: "paydáa", + out: ["p", "ay", "d", "aa"], + }, + { + in: "be kaar", + out: ["b", "e", "k", "aa", "r"], + }, + { + in: "raadzey", + out: ["r", "aa", "dz", "ey"], + }, + { + in: "badanuy ??", + out: ["b", "a", "d", "a", "n", "uy"], + }, + { + in: "tur ... pore", + out: ["t", "u", "r", "p", "o", "r", "e"], + }, + { + in: "daar-Ul-iqaama", + out: ["d", "aa", "r", "-Ul-", "i", "q", "aa", "m", "a"], + }, ]; phonemeSplits.forEach((s) => { - test(`${s.in} should split properly`, () => { - const result = splitFIntoPhonemes(s.in); - expect(result).toEqual(s.out); - }); + test(`${s.in} should split properly`, () => { + const result = splitFIntoPhonemes(s.in); + expect(result).toEqual(s.out); + }); }); const badPhonetics: Array<{ - in: string, - problem: string, + in: string; + problem: string; }> = [ - { - in: "acar", - problem: "c", - }, - { - in: "a7am", - problem: "7", - }, + { + in: "acar", + problem: "c", + }, + { + in: "a7am", + problem: "7", + }, ]; test("bad phonetic characters should throw an error", () => { - badPhonetics.forEach((s) => { - expect(() => { - splitFIntoPhonemes(s.in); - }).toThrow(`illegal phonetic character: ${s.problem}`); - }); + badPhonetics.forEach((s) => { + expect(() => { + splitFIntoPhonemes(s.in); + }).toThrow(`illegal phonetic character: ${s.problem}`); + }); }); test("last should work", () => { - expect(last("this")).toBe("s"); + expect(last("this")).toBe("s"); }); test("addP should work", () => { - expect(addP("ت")({ pIn: "", pOut: "کر" })).toEqual({ - pIn: "", - pOut: "کرت", - }); + expect(addP("ت")({ pIn: "", pOut: "کر" })).toEqual({ + pIn: "", + pOut: "کرت", + }); }); test("lastNonWhiteSpace should work", () => { - expect(lastNonWhitespace("تورن")).toBe("ن"); - expect(lastNonWhitespace("وست .. ")).toBe("ت"); - expect(lastNonWhitespace("د ... ")).toBe("د"); + expect(lastNonWhitespace("تورن")).toBe("ن"); + expect(lastNonWhitespace("وست .. ")).toBe("ت"); + expect(lastNonWhitespace("د ... ")).toBe("د"); }); test("reverseP should work", () => { - expect(reverseP({ - pIn: "کور", - pOut: "تور ", - })).toEqual({ - pIn: " کور", - pOut: "تور", - }); - expect(reverseP({ - pIn: "کور", - pOut: "تور ... ", - })).toEqual({ - pIn: " ... کور", - pOut: "تور", - }); - expect(reverseP({ - pIn: "کور", - pOut: "تور . ", - })).toEqual({ - pIn: " . کور", - pOut: "تور", - }); -}) \ No newline at end of file + expect( + reverseP({ + pIn: "کور", + pOut: "تور ", + }) + ).toEqual({ + pIn: " کور", + pOut: "تور", + }); + expect( + reverseP({ + pIn: "کور", + pOut: "تور ... ", + }) + ).toEqual({ + pIn: " ... کور", + pOut: "تور", + }); + expect( + reverseP({ + pIn: "کور", + pOut: "تور . ", + }) + ).toEqual({ + pIn: " . کور", + pOut: "تور", + }); +}); diff --git a/src/lib/src/diacritics-helpers.ts b/src/lib/src/diacritics-helpers.ts index ee6c487..347eb40 100644 --- a/src/lib/src/diacritics-helpers.ts +++ b/src/lib/src/diacritics-helpers.ts @@ -8,31 +8,62 @@ import { removeAccents } from "./accent-helpers"; -export type DiacriticsAccumulator = { pIn: string, pOut: string }; +export type DiacriticsAccumulator = { pIn: string; pOut: string }; -type Consonant = "b" | "p" | "t" | "T" | "s" | "j" | "ch" | "kh" | "ts" | "dz" | "d" | "D" | "r" | "R" | "z" | "jz" | "G" | "sh" | "x" | "gh" | "f" | "q" | "k" | "g" | "l" | "m" | "n" | "N" | "h" | "w" | "y"; -type Ain = "'" -type JoiningVowel = "-i-" | "-U-" | "-Ul-"; -type LongVowel = "aa" | "ee" | "e" | "oo" | "o" | "ey" | "uy" | "eyy"; +type Consonant = + | "b" + | "p" + | "t" + | "T" + | "s" + | "j" + | "ch" + | "kh" + | "ts" + | "dz" + | "d" + | "D" + | "r" + | "R" + | "z" + | "jz" + | "G" + | "sh" + | "x" + | "gh" + | "f" + | "q" + | "k" + | "g" + | "l" + | "m" + | "n" + | "N" + | "h" + | "w" + | "y"; +type Ain = "'"; +type JoiningVowel = "-i-" | "-U-" | "-Ul-"; +type LongVowel = "aa" | "ee" | "e" | "oo" | "o" | "ay" | "uy" | "ey"; type ShortVowel = "a" | "i" | "u" | "U"; export type Phoneme = Consonant | Ain | LongVowel | ShortVowel | JoiningVowel; type PhonemeInfo = { - matches?: string[], - beginningMatches?: string[], - endingMatches?: string[], - consonant?: true, - diacritic?: string, - endingOnly?: true, - takesSukunOnEnding?: true, - longVowel?: true, - canStartWithAynBefore?: true, - useEndingDiacritic?: true, - ainBlendDiacritic?: string, -} + matches?: string[]; + beginningMatches?: string[]; + endingMatches?: string[]; + consonant?: true; + diacritic?: string; + endingOnly?: true; + takesSukunOnEnding?: true; + longVowel?: true; + canStartWithAynBefore?: true; + useEndingDiacritic?: true; + ainBlendDiacritic?: string; +}; export const zwar = "َ"; -export const zwarakey = "ٙ"; +export const zwarakay = "ٙ"; export const zer = "ِ"; export const pesh = "ُ"; export const sukun = "ْ"; @@ -43,513 +74,677 @@ export const daggerAlif = "ٰ"; export const fathahan = "ً"; export const phonemeTable: Record = { - // Consonants - "b": { - matches: ["ب"], - consonant: true, - }, - "p": { - matches: ["پ"], - consonant: true, - }, - "t": { - matches: ["ت", "ط"], - consonant: true, - }, - "T": { - matches: ["ټ"], - consonant: true, - }, - "s": { - matches: ["س", "ص", "ث"], - consonant: true, - }, - "j": { - matches: ["ج"], - consonant: true, - }, - "ch": { - matches: ["چ"], - consonant: true, - }, - "kh": { - matches: ["خ"], - consonant: true, - }, - "ts": { - matches: ["څ"], - consonant: true, - }, - "dz": { - matches: ["ځ"], - consonant: true, - }, - "d": { - matches: ["د"], - consonant: true, - }, - "D": { - matches: ["ډ"], - consonant: true, - }, - "r": { - matches: ["ر"], - consonant: true, - }, - "R": { - matches: ["ړ"], - consonant: true, - }, - "z": { - matches: ["ز", "ذ", "ظ", "ض"], - consonant: true, - }, - "jz": { - matches: ["ژ"], - consonant: true, - }, - "G": { - matches: ["ږ"], - consonant: true, - }, - "sh": { - matches: ["ش"], - consonant: true, - }, - "x": { - matches: ["ښ"], - consonant: true, - }, - "gh": { - matches: ["غ"], - consonant: true, - }, - "f": { - matches: ["ف"], - consonant: true, - }, - "q": { - matches: ["ق"], - consonant: true, - }, - "k": { - matches: ["ک"], - consonant: true, - }, - "g": { - matches: ["ګ"], - consonant: true, - }, - "l": { - matches: ["ل"], - consonant: true, - }, - "m": { - matches: ["م"], - consonant: true, - }, - "n": { - matches: ["ن"], - consonant: true, - }, - "N": { - matches: ["ڼ"], - consonant: true, - }, - "h": { - matches: ["ه", "ح"], - consonant: true, - takesSukunOnEnding: true, - }, - "w": { - matches: ["و"], - consonant: true, - }, - "y": { - matches: ["ی"], - consonant: true, - }, - // Ain - "'": { - matches: ["ع", "ئ"], - consonant: true, - }, - // Joining Vowels - "-i-": { - }, - "-U-": { - matches: [" و ", "و"], - }, - "-Ul-": { - matches: ["ال"], - }, - // Long Vowels - "aa": { - matches: ["ا", "أ"], - beginningMatches: ["آ", "ا"], - endingMatches: ["ا", "یٰ"], - longVowel: true, - ainBlendDiacritic: zwar, - }, - "ee": { - matches: ["ی"], - longVowel: true, - endingMatches: ["ي"], - diacritic: zer, - canStartWithAynBefore: true, - ainBlendDiacritic: zer, - }, - "e": { - matches: ["ې"], - longVowel: true, - }, - "o": { - matches: ["و"], - longVowel: true, - }, - "oo": { - matches: ["و"], - longVowel: true, - diacritic: pesh, - useEndingDiacritic: true, - ainBlendDiacritic: pesh, - }, - "ey": { - matches: ["ی"], - longVowel: true, - endingMatches: ["ی"], - }, - "uy": { - matches: ["ۍ"], - longVowel: true, - endingOnly: true, - }, - "eyy": { - matches: ["ئ"], - longVowel: true, - endingOnly: true, - }, - // Short Vowels - "a": { - diacritic: zwar, - endingMatches: ["ه"], - beginningMatches: ["ا", "ع"], - // canComeAfterHeyEnding: true, - }, - "u": { - diacritic: zwarakey, - endingMatches: ["ه"], - }, - "i": { - diacritic: zer, - endingMatches: ["ه"], - beginningMatches: ["ا", "ع"], - // takesDiacriticBeforeGurdaHeyEnding: true, - // canBeWasla: true, - }, - "U": { - diacritic: pesh, - endingMatches: ["ه"], - // takesDiacriticBeforeGurdaHeyEnding: true, - beginningMatches: ["ا", "ع"], - }, -} + // Consonants + b: { + matches: ["ب"], + consonant: true, + }, + p: { + matches: ["پ"], + consonant: true, + }, + t: { + matches: ["ت", "ط"], + consonant: true, + }, + T: { + matches: ["ټ"], + consonant: true, + }, + s: { + matches: ["س", "ص", "ث"], + consonant: true, + }, + j: { + matches: ["ج"], + consonant: true, + }, + ch: { + matches: ["چ"], + consonant: true, + }, + kh: { + matches: ["خ"], + consonant: true, + }, + ts: { + matches: ["څ"], + consonant: true, + }, + dz: { + matches: ["ځ"], + consonant: true, + }, + d: { + matches: ["د"], + consonant: true, + }, + D: { + matches: ["ډ"], + consonant: true, + }, + r: { + matches: ["ر"], + consonant: true, + }, + R: { + matches: ["ړ"], + consonant: true, + }, + z: { + matches: ["ز", "ذ", "ظ", "ض"], + consonant: true, + }, + jz: { + matches: ["ژ"], + consonant: true, + }, + G: { + matches: ["ږ"], + consonant: true, + }, + sh: { + matches: ["ش"], + consonant: true, + }, + x: { + matches: ["ښ"], + consonant: true, + }, + gh: { + matches: ["غ"], + consonant: true, + }, + f: { + matches: ["ف"], + consonant: true, + }, + q: { + matches: ["ق"], + consonant: true, + }, + k: { + matches: ["ک"], + consonant: true, + }, + g: { + matches: ["ګ"], + consonant: true, + }, + l: { + matches: ["ل"], + consonant: true, + }, + m: { + matches: ["م"], + consonant: true, + }, + n: { + matches: ["ن"], + consonant: true, + }, + N: { + matches: ["ڼ"], + consonant: true, + }, + h: { + matches: ["ه", "ح"], + consonant: true, + takesSukunOnEnding: true, + }, + w: { + matches: ["و"], + consonant: true, + }, + y: { + matches: ["ی"], + consonant: true, + }, + // Ain + "'": { + matches: ["ع", "ئ"], + consonant: true, + }, + // Joining Vowels + "-i-": {}, + "-U-": { + matches: [" و ", "و"], + }, + "-Ul-": { + matches: ["ال"], + }, + // Long Vowels + aa: { + matches: ["ا", "أ"], + beginningMatches: ["آ", "ا"], + endingMatches: ["ا", "یٰ"], + longVowel: true, + ainBlendDiacritic: zwar, + }, + ee: { + matches: ["ی"], + longVowel: true, + endingMatches: ["ي"], + diacritic: zer, + canStartWithAynBefore: true, + ainBlendDiacritic: zer, + }, + e: { + matches: ["ې"], + longVowel: true, + }, + o: { + matches: ["و"], + longVowel: true, + }, + oo: { + matches: ["و"], + longVowel: true, + diacritic: pesh, + useEndingDiacritic: true, + ainBlendDiacritic: pesh, + }, + ay: { + matches: ["ی"], + longVowel: true, + endingMatches: ["ی"], + }, + uy: { + matches: ["ۍ"], + longVowel: true, + endingOnly: true, + }, + ey: { + matches: ["ئ"], + longVowel: true, + endingOnly: true, + }, + // Short Vowels + a: { + diacritic: zwar, + endingMatches: ["ه"], + beginningMatches: ["ا", "ع"], + // canComeAfterHayEnding: true, + }, + u: { + diacritic: zwarakay, + endingMatches: ["ه"], + }, + i: { + diacritic: zer, + endingMatches: ["ه"], + beginningMatches: ["ا", "ع"], + // takesDiacriticBeforeGurdaHayEnding: true, + // canBeWasla: true, + }, + U: { + diacritic: pesh, + endingMatches: ["ه"], + // takesDiacriticBeforeGurdaHayEnding: true, + beginningMatches: ["ا", "ع"], + }, +}; /** * splits a phonetics string into an array of Phonemes - * + * * will error if there is an illeagal phonetics character - * + * * @param fIn a phonetics string * @returns an array of phonemes */ - export function splitFIntoPhonemes(fIn: string): Phoneme[] { - const singleLetterPhonemes: Phoneme[] = ["a", "i", "u", "o", "e", "U", "b", "p", "t", "T", "s", "j", "d", "D", "r", "R", "z", "G", "x", "f", "q", "k", "g", "l", "m", "n", "N", "h", "w", "y", "'"]; - - const quadrigraphs: Phoneme[] = ["-Ul-"]; - const trigraphs: Phoneme[] = ["eyy", "-i-", "-U-"]; - const digraphs: Phoneme[] = ["aa", "ee", "ey", "oo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"]; - const endingDigraphs: Phoneme[] = ["uy"]; - const willIgnore = ["?", " ", "`", ".", "…", ",", "-"]; - - const result: Phoneme[] = []; - const f = removeAccents(fIn).replace(/ă/g, "a"); - let index = 0; - while (index < f.length) { - const isLastTwoLetters = (index === f.length - 2 || f[index + 2] === " "); - const threeLetterChunk = f.slice(index, index + 3) as Phoneme; - const fourLetterChunk = f.slice(index, index + 4) as Phoneme; - if (quadrigraphs.includes(fourLetterChunk)) { - result.push(fourLetterChunk); - index += 4; - continue; - } - if (trigraphs.includes(threeLetterChunk)) { - result.push(threeLetterChunk); - index += 3; - continue; - } - const twoLetterChunk = f.slice(index, index + 2) as Phoneme; - if ( - digraphs.includes(twoLetterChunk) || - (isLastTwoLetters && endingDigraphs.includes(twoLetterChunk)) - ) { - result.push(twoLetterChunk); - index += 2; - continue; - } - const singleLetter = f.slice(index, index + 1) as Phoneme; - if (!willIgnore.includes(singleLetter)) { - if (!singleLetterPhonemes.includes(singleLetter)) { - throw new Error(`illegal phonetic character: ${singleLetter}`); - } - result.push(singleLetter); - } - index++; +export function splitFIntoPhonemes(fIn: string): Phoneme[] { + const singleLetterPhonemes: Phoneme[] = [ + "a", + "i", + "u", + "o", + "e", + "U", + "b", + "p", + "t", + "T", + "s", + "j", + "d", + "D", + "r", + "R", + "z", + "G", + "x", + "f", + "q", + "k", + "g", + "l", + "m", + "n", + "N", + "h", + "w", + "y", + "'", + ]; + + const quadrigraphs: Phoneme[] = ["-Ul-"]; + const trigraphs: Phoneme[] = ["ey", "-i-", "-U-"]; + const digraphs: Phoneme[] = [ + "aa", + "ee", + "ay", + "oo", + "kh", + "gh", + "ts", + "dz", + "jz", + "ch", + "sh", + ]; + const endingDigraphs: Phoneme[] = ["uy"]; + const willIgnore = ["?", " ", "`", ".", "…", ",", "-"]; + + const result: Phoneme[] = []; + const f = removeAccents(fIn).replace(/ă/g, "a"); + let index = 0; + while (index < f.length) { + const isLastTwoLetters = index === f.length - 2 || f[index + 2] === " "; + const threeLetterChunk = f.slice(index, index + 3) as Phoneme; + const fourLetterChunk = f.slice(index, index + 4) as Phoneme; + if (quadrigraphs.includes(fourLetterChunk)) { + result.push(fourLetterChunk); + index += 4; + continue; } - return result; + if (trigraphs.includes(threeLetterChunk)) { + result.push(threeLetterChunk); + index += 3; + continue; + } + const twoLetterChunk = f.slice(index, index + 2) as Phoneme; + if ( + digraphs.includes(twoLetterChunk) || + (isLastTwoLetters && endingDigraphs.includes(twoLetterChunk)) + ) { + result.push(twoLetterChunk); + index += 2; + continue; + } + const singleLetter = f.slice(index, index + 1) as Phoneme; + if (!willIgnore.includes(singleLetter)) { + if (!singleLetterPhonemes.includes(singleLetter)) { + throw new Error(`illegal phonetic character: ${singleLetter}`); + } + result.push(singleLetter); + } + index++; + } + return result; } export enum PhonemeStatus { - LeadingLongVowel, - LeadingConsonantOrShortVowel, - DoubleConsonantTashdeed, - EndingWithHeyHim, - DirectMatch, - DirectMatchAfterSukun, - EndingWithHeyHimFromSukun, - ShortVowel, - PersianSilentWWithAa, - ArabicWasla, - Izafe, - EndOfDuParticle, - ShortAEndingAfterHeem, - AlefDaggarEnding, - SilentAinAfterAlef, - AinWithLongAAtBeginning, - LongAinVowelMissingComma, - ShortAinVowelMissingComma, - ShortAinVowelMissingCommaAfterAlefStart, - AinBeginningAfterShortVowel, - AlefWithHamza, - AlefWithHamzaWithGlottalStop, - WoEndingO, - ShortAForAlefBeforeFathatan, - NOnFathatan, - HamzaOnWow, - ArabicDefiniteArticleUl, - OoPrefix, - AlefHamzaBeg, - GlottalStopBeforeOo, - OoAfterGlottalStopOo, - EndingSmallH, + LeadingLongVowel, + LeadingConsonantOrShortVowel, + DoubleConsonantTashdeed, + EndingWithHayHim, + DirectMatch, + DirectMatchAfterSukun, + EndingWithHayHimFromSukun, + ShortVowel, + PersianSilentWWithAa, + ArabicWasla, + Izafe, + EndOfDuParticle, + ShortAEndingAfterHeem, + AlefDaggarEnding, + SilentAinAfterAlef, + AinWithLongAAtBeginning, + LongAinVowelMissingComma, + ShortAinVowelMissingComma, + ShortAinVowelMissingCommaAfterAlefStart, + AinBeginningAfterShortVowel, + AlefWithHamza, + AlefWithHamzaWithGlottalStop, + WoEndingO, + ShortAForAlefBeforeFathatan, + NOnFathatan, + HamzaOnWow, + ArabicDefiniteArticleUl, + OoPrefix, + AlefHamzaBeg, + GlottalStopBeforeOo, + OoAfterGlottalStopOo, + EndingSmallH, } -export function stateInfo({ state, i, phonemes, phoneme }: { - state: DiacriticsAccumulator, - i: number, - phonemes: Phoneme[], - phoneme: Phoneme, +export function stateInfo({ + state, + i, + phonemes, + phoneme, +}: { + state: DiacriticsAccumulator; + i: number; + phonemes: Phoneme[]; + phoneme: Phoneme; }) { - const isOutOfWord = (char: string) => !char || char === " "; - const prevPLetter = last(state.pOut); - const currentPLetter = state.pIn[0]; - const nextPLetter = state.pIn[1]; - const nextPhoneme = phonemes[i+1]; - const previousPhoneme = i > 0 && phonemes[i-1]; - const lastThreePLetters = last(state.pOut, 3) + last(state.pOut, 2) + prevPLetter; - const isBeginningOfWord = (state.pOut === "" || prevPLetter === " ") || (previousPhoneme === "-Ul-" && prevPLetter === "ل") || (["دَر", "وَر"].includes(lastThreePLetters) || (last(state.pOut, 2) + prevPLetter) === "را"); - const isEndOfWord = isOutOfWord(nextPLetter); - const phonemeInfo = phonemeTable[phoneme]; - const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]]; - // const nextPhoneme = (phonemes.length > (i + 1)) && phonemes[i+1]; - // const nextPhonemeInfo = nextPhoneme ? phonemeTable[nextPhoneme] : undefined; - const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant); - const needsSukun = (doubleConsonant && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter))) // || (isEndOfWord && phonemeInfo.takesSukunOnEnding); - const useAinBlendDiacritics = (!isBeginningOfWord && (phonemeInfo.ainBlendDiacritic && currentPLetter === "ع")); - const diacritic = useAinBlendDiacritics + const isOutOfWord = (char: string) => !char || char === " "; + const prevPLetter = last(state.pOut); + const currentPLetter = state.pIn[0]; + const nextPLetter = state.pIn[1]; + const nextPhoneme = phonemes[i + 1]; + const previousPhoneme = i > 0 && phonemes[i - 1]; + const lastThreePLetters = + last(state.pOut, 3) + last(state.pOut, 2) + prevPLetter; + const isBeginningOfWord = + state.pOut === "" || + prevPLetter === " " || + (previousPhoneme === "-Ul-" && prevPLetter === "ل") || + ["دَر", "وَر"].includes(lastThreePLetters) || + last(state.pOut, 2) + prevPLetter === "را"; + const isEndOfWord = isOutOfWord(nextPLetter); + const phonemeInfo = phonemeTable[phoneme]; + const previousPhonemeInfo = + !isBeginningOfWord && i > 0 && phonemeTable[phonemes[i - 1]]; + // const nextPhoneme = (phonemes.length > (i + 1)) && phonemes[i+1]; + // const nextPhonemeInfo = nextPhoneme ? phonemeTable[nextPhoneme] : undefined; + const doubleConsonant = + previousPhonemeInfo && + phonemeInfo.consonant && + previousPhonemeInfo.consonant; + const needsSukun = + doubleConsonant && + (previousPhoneme !== phoneme || + phonemeInfo.matches?.includes(currentPLetter)); // || (isEndOfWord && phonemeInfo.takesSukunOnEnding); + const useAinBlendDiacritics = + !isBeginningOfWord && + phonemeInfo.ainBlendDiacritic && + currentPLetter === "ع"; + const diacritic = useAinBlendDiacritics ? phonemeInfo.ainBlendDiacritic - : isEndOfWord - ? ((!phonemeInfo.longVowel || phonemeInfo.useEndingDiacritic) ? phonemeInfo.diacritic : undefined) : phonemeInfo.diacritic; - - const lastWordEndedW = (char: string) => ((prevPLetter === char && !currentPLetter) || (prevPLetter === " " && last(state.pOut, 2) === char)); + : isEndOfWord + ? !phonemeInfo.longVowel || phonemeInfo.useEndingDiacritic + ? phonemeInfo.diacritic + : undefined + : phonemeInfo.diacritic; - function getPhonemeState(): PhonemeStatus { - if (isBeginningOfWord && phoneme === "aa" && phonemeInfo.beginningMatches?.includes(currentPLetter)) { - return PhonemeStatus.DirectMatch; - } - if (isBeginningOfWord && phoneme === "oo" && currentPLetter === "و") { - return PhonemeStatus.OoPrefix; - } - if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) { - if (phoneme !== "aa" && currentPLetter !== "ا" && !phonemeInfo.matches?.includes(nextPLetter)) { - throw Error("phonetics error - needs alef prefix"); - } - return PhonemeStatus.LeadingLongVowel; - } - if (isBeginningOfWord && (phonemeInfo.beginningMatches?.includes(currentPLetter) || phonemeInfo.matches?.includes(currentPLetter))) { - return PhonemeStatus.LeadingConsonantOrShortVowel; - } - if (isBeginningOfWord && phoneme === "aa" && currentPLetter === "ع" && nextPLetter === "ا") { - return PhonemeStatus.AinWithLongAAtBeginning; - } - if (currentPLetter === "ا" && nextPLetter === "ع" && phoneme === "aa" && nextPhoneme !== "'") { - return PhonemeStatus.SilentAinAfterAlef; - } - // console.log("------"); - // console.log("phoneme", phoneme); - // console.log("state", state); - // console.log("prevPLetter is space", prevPLetter === " "); - // console.log("------"); - if (isBeginningOfWord && phoneme === "u" && prevPLetter === " " && lastNonWhitespace(state.pOut) === "د") { - return PhonemeStatus.EndOfDuParticle - } - if (isBeginningOfWord && phoneme === "-Ul-" && currentPLetter === "ا" && nextPLetter === "ل") { - return PhonemeStatus.ArabicDefiniteArticleUl; - } - if (phoneme === "a" && nextPhoneme === "'" && phonemes[i+2] === "a" && currentPLetter === "أ") { - return PhonemeStatus.AlefHamzaBeg; - } - if (phoneme === "a" && previousPhoneme === "U" && currentPLetter === "و") { - return PhonemeStatus.HamzaOnWow; - } - if (phoneme === "a" && currentPLetter === "ا" && nextPLetter === fathahan) { - return PhonemeStatus.ShortAForAlefBeforeFathatan; - } - if (phoneme === "'" && currentPLetter === "و" && nextPLetter === "و") { - return PhonemeStatus.GlottalStopBeforeOo; - } - if (phoneme === "oo" && previousPhoneme === "'" && currentPLetter === "و" && prevPLetter === hamzaAbove) { - return PhonemeStatus.OoAfterGlottalStopOo; - } - if (phoneme === "'" && last(state.pOut, 2) === "ع" && isOutOfWord(last(state.pOut, 3))) { - return PhonemeStatus.AinBeginningAfterShortVowel; - } - if (!isBeginningOfWord && phoneme === "aa" && currentPLetter === "و" && nextPLetter === "ا") { - return PhonemeStatus.PersianSilentWWithAa; - } - if (!isBeginningOfWord && phoneme === "i" && currentPLetter === "ا" && nextPLetter === "ل") { - return PhonemeStatus.ArabicWasla; - } - if (phoneme === "-i-" && isBeginningOfWord) { - return PhonemeStatus.Izafe; - } - if (phoneme === "a" && currentPLetter === "أ") { - return PhonemeStatus.AlefWithHamza; - } - if (phoneme === "'" && nextPhoneme === "a" && currentPLetter === "أ") { - return PhonemeStatus.AlefWithHamzaWithGlottalStop; - } - if (currentPLetter === "ع" && phoneme !== "'" && nextPhoneme !== "'") { - if (phonemeInfo.diacritic && !phonemeInfo.longVowel) { - return PhonemeStatus.ShortAinVowelMissingComma; - } - if ((last(state.pOut, 2) === "ا") && isOutOfWord(last(state.pOut, 3))) { - return PhonemeStatus.ShortAinVowelMissingCommaAfterAlefStart; - } - } - if (useAinBlendDiacritics) { - return PhonemeStatus.LongAinVowelMissingComma; - } - if (((!isBeginningOfWord && doubleConsonant) || prevPLetter === " ") && (previousPhoneme === phoneme) && !phonemeInfo.matches?.includes(currentPLetter)) { - return PhonemeStatus.DoubleConsonantTashdeed; - } - if (phoneme === "aa" && currentPLetter === "ی" && nextPLetter === daggerAlif) { - return PhonemeStatus.AlefDaggarEnding; - } - if (phoneme === "a" && lastWordEndedW("ح")) { - return PhonemeStatus.ShortAEndingAfterHeem; - } - if (isEndOfWord && ((phoneme === "u" && currentPLetter === "ه") || (phoneme === "h" && ["ه", "ح"].includes(currentPLetter)))) { - return needsSukun ? PhonemeStatus.EndingWithHeyHimFromSukun : PhonemeStatus.EndingWithHeyHim; - } - if ((phonemeInfo.matches?.includes(currentPLetter) || (isEndOfWord && phonemeInfo.endingMatches?.includes(currentPLetter)) || (phoneme === "m" && currentPLetter === "ن" && nextPLetter === "ب"))) { - return needsSukun ? PhonemeStatus.DirectMatchAfterSukun : PhonemeStatus.DirectMatch; - } - if (phonemeInfo.diacritic && !phonemeInfo.longVowel) { - return PhonemeStatus.ShortVowel; - } - if (phoneme === "o" && previousPhoneme === "w" && lastWordEndedW("و")) { - return PhonemeStatus.WoEndingO; - } - if (isEndOfWord && phoneme === "n" && currentPLetter === fathahan && prevPLetter === "ا") { - return PhonemeStatus.NOnFathatan; - } - // console.log("errored", "current", phoneme, "next", nextPhoneme); - // console.log("bad phoneme is ", phoneme); - throw new Error("phonetics error - no status found for phoneme: " + phoneme); + const lastWordEndedW = (char: string) => + (prevPLetter === char && !currentPLetter) || + (prevPLetter === " " && last(state.pOut, 2) === char); + + function getPhonemeState(): PhonemeStatus { + if ( + isBeginningOfWord && + phoneme === "aa" && + phonemeInfo.beginningMatches?.includes(currentPLetter) + ) { + return PhonemeStatus.DirectMatch; } + if (isBeginningOfWord && phoneme === "oo" && currentPLetter === "و") { + return PhonemeStatus.OoPrefix; + } + if (isBeginningOfWord && phonemeInfo.longVowel && !phonemeInfo.endingOnly) { + if ( + phoneme !== "aa" && + currentPLetter !== "ا" && + !phonemeInfo.matches?.includes(nextPLetter) + ) { + throw Error("phonetics error - needs alef prefix"); + } + return PhonemeStatus.LeadingLongVowel; + } + if ( + isBeginningOfWord && + (phonemeInfo.beginningMatches?.includes(currentPLetter) || + phonemeInfo.matches?.includes(currentPLetter)) + ) { + return PhonemeStatus.LeadingConsonantOrShortVowel; + } + if ( + isBeginningOfWord && + phoneme === "aa" && + currentPLetter === "ع" && + nextPLetter === "ا" + ) { + return PhonemeStatus.AinWithLongAAtBeginning; + } + if ( + currentPLetter === "ا" && + nextPLetter === "ع" && + phoneme === "aa" && + nextPhoneme !== "'" + ) { + return PhonemeStatus.SilentAinAfterAlef; + } + // console.log("------"); + // console.log("phoneme", phoneme); + // console.log("state", state); + // console.log("prevPLetter is space", prevPLetter === " "); + // console.log("------"); + if ( + isBeginningOfWord && + phoneme === "u" && + prevPLetter === " " && + lastNonWhitespace(state.pOut) === "د" + ) { + return PhonemeStatus.EndOfDuParticle; + } + if ( + isBeginningOfWord && + phoneme === "-Ul-" && + currentPLetter === "ا" && + nextPLetter === "ل" + ) { + return PhonemeStatus.ArabicDefiniteArticleUl; + } + if ( + phoneme === "a" && + nextPhoneme === "'" && + phonemes[i + 2] === "a" && + currentPLetter === "أ" + ) { + return PhonemeStatus.AlefHamzaBeg; + } + if (phoneme === "a" && previousPhoneme === "U" && currentPLetter === "و") { + return PhonemeStatus.HamzaOnWow; + } + if (phoneme === "a" && currentPLetter === "ا" && nextPLetter === fathahan) { + return PhonemeStatus.ShortAForAlefBeforeFathatan; + } + if (phoneme === "'" && currentPLetter === "و" && nextPLetter === "و") { + return PhonemeStatus.GlottalStopBeforeOo; + } + if ( + phoneme === "oo" && + previousPhoneme === "'" && + currentPLetter === "و" && + prevPLetter === hamzaAbove + ) { + return PhonemeStatus.OoAfterGlottalStopOo; + } + if ( + phoneme === "'" && + last(state.pOut, 2) === "ع" && + isOutOfWord(last(state.pOut, 3)) + ) { + return PhonemeStatus.AinBeginningAfterShortVowel; + } + if ( + !isBeginningOfWord && + phoneme === "aa" && + currentPLetter === "و" && + nextPLetter === "ا" + ) { + return PhonemeStatus.PersianSilentWWithAa; + } + if ( + !isBeginningOfWord && + phoneme === "i" && + currentPLetter === "ا" && + nextPLetter === "ل" + ) { + return PhonemeStatus.ArabicWasla; + } + if (phoneme === "-i-" && isBeginningOfWord) { + return PhonemeStatus.Izafe; + } + if (phoneme === "a" && currentPLetter === "أ") { + return PhonemeStatus.AlefWithHamza; + } + if (phoneme === "'" && nextPhoneme === "a" && currentPLetter === "أ") { + return PhonemeStatus.AlefWithHamzaWithGlottalStop; + } + if (currentPLetter === "ع" && phoneme !== "'" && nextPhoneme !== "'") { + if (phonemeInfo.diacritic && !phonemeInfo.longVowel) { + return PhonemeStatus.ShortAinVowelMissingComma; + } + if (last(state.pOut, 2) === "ا" && isOutOfWord(last(state.pOut, 3))) { + return PhonemeStatus.ShortAinVowelMissingCommaAfterAlefStart; + } + } + if (useAinBlendDiacritics) { + return PhonemeStatus.LongAinVowelMissingComma; + } + if ( + ((!isBeginningOfWord && doubleConsonant) || prevPLetter === " ") && + previousPhoneme === phoneme && + !phonemeInfo.matches?.includes(currentPLetter) + ) { + return PhonemeStatus.DoubleConsonantTashdeed; + } + if ( + phoneme === "aa" && + currentPLetter === "ی" && + nextPLetter === daggerAlif + ) { + return PhonemeStatus.AlefDaggarEnding; + } + if (phoneme === "a" && lastWordEndedW("ح")) { + return PhonemeStatus.ShortAEndingAfterHeem; + } + if ( + isEndOfWord && + ((phoneme === "u" && currentPLetter === "ه") || + (phoneme === "h" && ["ه", "ح"].includes(currentPLetter))) + ) { + return needsSukun + ? PhonemeStatus.EndingWithHayHimFromSukun + : PhonemeStatus.EndingWithHayHim; + } + if ( + phonemeInfo.matches?.includes(currentPLetter) || + (isEndOfWord && phonemeInfo.endingMatches?.includes(currentPLetter)) || + (phoneme === "m" && currentPLetter === "ن" && nextPLetter === "ب") + ) { + return needsSukun + ? PhonemeStatus.DirectMatchAfterSukun + : PhonemeStatus.DirectMatch; + } + if (phonemeInfo.diacritic && !phonemeInfo.longVowel) { + return PhonemeStatus.ShortVowel; + } + if (phoneme === "o" && previousPhoneme === "w" && lastWordEndedW("و")) { + return PhonemeStatus.WoEndingO; + } + if ( + isEndOfWord && + phoneme === "n" && + currentPLetter === fathahan && + prevPLetter === "ا" + ) { + return PhonemeStatus.NOnFathatan; + } + // console.log("errored", "current", phoneme, "next", nextPhoneme); + // console.log("bad phoneme is ", phoneme); + throw new Error( + "phonetics error - no status found for phoneme: " + phoneme + ); + } - const phs = getPhonemeState(); - - return { - phs, phonemeInfo, diacritic, prevPLetter, - }; -}; + const phs = getPhonemeState(); + return { + phs, + phonemeInfo, + diacritic, + prevPLetter, + }; +} /** * returns the nth last character of a string - * - * @param s + * + * @param s */ export function last(s: string, n = 1) { - return s[s.length - n]; + return s[s.length - n]; } -export function advanceP(state: DiacriticsAccumulator, n: number = 1): DiacriticsAccumulator { - return { - pIn: state.pIn.slice(n), - pOut: state.pOut + state.pIn.slice(0, n), - }; +export function advanceP( + state: DiacriticsAccumulator, + n: number = 1 +): DiacriticsAccumulator { + return { + pIn: state.pIn.slice(n), + pOut: state.pOut + state.pIn.slice(0, n), + }; } /** * moves back to the last character that wasn't a " " or "." - * - * @param state - * @returns + * + * @param state + * @returns */ export function reverseP(state: DiacriticsAccumulator): DiacriticsAccumulator { - const reversed = [...state.pOut].reverse(); - const howFar = reversed.findIndex((c) => ![" ", "."].includes(c)); - return { - pIn: state.pOut.slice(-howFar) + state.pIn, - pOut: state.pOut.slice(0, -howFar), - }; + const reversed = [...state.pOut].reverse(); + const howFar = reversed.findIndex((c) => ![" ", "."].includes(c)); + return { + pIn: state.pOut.slice(-howFar) + state.pIn, + pOut: state.pOut.slice(0, -howFar), + }; } -export const addP = (toAdd: string | undefined) => (state: DiacriticsAccumulator): DiacriticsAccumulator => { +export const addP = + (toAdd: string | undefined) => + (state: DiacriticsAccumulator): DiacriticsAccumulator => { return { - ...state, - pOut: toAdd ? (state.pOut + toAdd) : state.pOut, + ...state, + pOut: toAdd ? state.pOut + toAdd : state.pOut, }; -}; + }; -export const overwriteP = (toWrite: string) => (state: DiacriticsAccumulator): DiacriticsAccumulator => { +export const overwriteP = + (toWrite: string) => + (state: DiacriticsAccumulator): DiacriticsAccumulator => { return { - pIn: state.pIn.slice(1), - pOut: state.pOut + toWrite, + pIn: state.pIn.slice(1), + pOut: state.pOut + toWrite, }; -}; + }; /** * returns the last letter before any whitespace (" " / ".") - * - * @param s - * @returns + * + * @param s + * @returns */ export function lastNonWhitespace(s: string): string { - const reversed = [...s].reverse(); - const lastIndex = reversed.findIndex((c) => ![" ", "."].includes(c)); - const penultimateChar = reversed[lastIndex]; - return penultimateChar; + const reversed = [...s].reverse(); + const lastIndex = reversed.findIndex((c) => ![" ", "."].includes(c)); + const penultimateChar = reversed[lastIndex]; + return penultimateChar; } -export function getCurrentNext(state: DiacriticsAccumulator): { current: string, next: string} { - return { - current: state.pIn[0], - next: state.pIn[1], - }; +export function getCurrentNext(state: DiacriticsAccumulator): { + current: string; + next: string; +} { + return { + current: state.pIn[0], + next: state.pIn[1], + }; } // export function advanceForAin(state: DiacriticsAccumulator): DiacriticsAccumulator { @@ -557,22 +752,25 @@ export function getCurrentNext(state: DiacriticsAccumulator): { current: string, // return (current === "ع") ? advanceP(state) : state; // } -export function advanceForHamzaMid(state: DiacriticsAccumulator): DiacriticsAccumulator { - const { current, next } = getCurrentNext(state); - if (current === "ئ" && next && next !== "ئ") { - return advanceP(state); - } - return state; +export function advanceForHamzaMid( + state: DiacriticsAccumulator +): DiacriticsAccumulator { + const { current, next } = getCurrentNext(state); + if (current === "ئ" && next && next !== "ئ") { + return advanceP(state); + } + return state; } -export function advanceForHamza(state: DiacriticsAccumulator): DiacriticsAccumulator { - const { current, next } = getCurrentNext(state); - if (current === "ه" && (!next || next === " ")) { - return advanceP(state); - } - // if (current === "ع") { - // return advanceP(state); - // } - return state; +export function advanceForHamza( + state: DiacriticsAccumulator +): DiacriticsAccumulator { + const { current, next } = getCurrentNext(state); + if (current === "ه" && (!next || next === " ")) { + return advanceP(state); + } + // if (current === "ع") { + // return advanceP(state); + // } + return state; } - diff --git a/src/lib/src/diacritics.test.ts b/src/lib/src/diacritics.test.ts index 47fa109..36dd8b4 100644 --- a/src/lib/src/diacritics.test.ts +++ b/src/lib/src/diacritics.test.ts @@ -6,1286 +6,1280 @@ * */ -import { - addDiacritics, -} from "./diacritics"; -import { - zwar, - zwarakey, - sukun, - tashdeed, -} from "./diacritics-helpers"; +import { addDiacritics } from "./diacritics"; +import { zwar, zwarakay, sukun, tashdeed } from "./diacritics-helpers"; import * as T from "../../types"; const diacriticsSections: { - describe: string, - tests: { - in: T.PsString, - out: string | null, - }[], + describe: string; + tests: { + in: T.PsString; + out: string | null; + }[]; }[] = [ - { - describe: "regular, native Pashto script/sounds", - tests: [ - { - in: { - p: "کور", - f: "kor", - }, - out: "کور", - }, - { - in: { - p: "کور", - f: "koor", - }, - out: "کُور", - }, - { - in: { - p: "کور کور", - f: "kor koor", - }, - out: "کور کُور", - }, - { - in: { - p: "تب", - f: "tib", - }, - out: "تِب", - }, - { - in: { - p: "تب", - f: "tab", - }, - out: "تَب", - }, - { - in: { - p: "تب", - f: "tUb", - }, - out: "تُب", - }, - { - in: { - p: "تب", - f: "tub", - }, - out: "تٙب", - }, - { - in: { - p: "تب", - f: "tb", - }, - out: "تْب", - }, - { - in: { - p: "تلب", - f: "tilab", - }, - out: "تِلَب", - }, - { - in: { - p: "تشناب", - f: "tashnaab", - }, - out: "تَشْناب", - }, - { - in: { - p: "پسته", - f: "pasta", - }, - out: "پَسْتَه", - }, - // working with ئ as vowel at end - { - in: { - p: "شئ", - f: "sheyy", - }, - out: "شئ", - }, - { - in: { - p: "کار کوئ چې لاړ شئ", - f: "kaar kawéyy che laaR sheyy", - }, - out: "کار کَوئ چې لاړ شئ", - }, - // working with وs - { - in: { - p: "کول", - f: "kwal", - }, - out: "کْوَل", - }, - { - in: { - p: "تول", - f: "tool", - }, - out: "تُول", - }, - { - in: { - p: "مقبول", - f: "maqbool", - }, - out: "مَقْبُول", - }, - { - in: { - p: "کول", - f: "kawul", - }, - out: "کَو" + zwarakey + "ل", - }, - { - in: { - p: "کول", - f: "kiwul", - }, - out: "کِو" + zwarakey + "ل", - }, - { - in: { - p: "کول", - f: "kUwul", - }, - out: "کُو" + zwarakey + "ل", - }, - { - in: { - p: "کول", - f: "kuwul", - }, - out: "ک" + zwarakey + "و" + zwarakey + "ل", - }, - { - in: { - p: "کول", - f: "kawal", - }, - out: "کَوَل", - }, - { - in: { - p: "کول", - f: "kUwal", - }, - out: "کُوَل", - }, - { - in: { - p: "پشتګرد", - f: "pishtgird", - }, - out: "پِشْتْګِرْد", - }, - { - in: { - p: "سپین", - f: "speen", - }, - out: "سْپِین", - }, - { - in: { - p: "سپین", - f: "speyn", - }, - out: "سْپین", - }, - { - in: { - p: "پېش", - f: "pesh", - }, - out: "پېش", - }, - { - in: { - p: "لیک", - f: "leek", - }, - out: "لِیک", - }, - { - in: { - p: "ماضی", - f: "maazee", - }, - out: null, - }, - { - in: { - p: "وسېدل", - f: "osedul", - }, - out: null, - }, - { - in: { - p: "يست", - f: "eest", - }, - out: null, - }, - { - in: { - p: "ست", - f: "ist", - }, - out: null, - }, - { - in: { - p: "haca", - f: "هځه", - }, - out: null, - }, - { - in: { - p: "تشناب", - f: "peshnaab", - }, - out: null, - }, - { - in: { - p: "وسېدل", - f: "osedul", - }, - out: null, - }, - { - in: { - p: "رغېدل", - f: "raghedul", - }, - out: "رَغېد" + zwarakey + "ل", - }, - { - in: { - p: "کارول", - f: "kaarawul", - }, - out: "کارَو" + zwarakey + "ل", - }, - { - in: { - p: "پېښېدل", - f: "pexedul", - }, - out: "پېښېد" + zwarakey + "ل", - }, - { - in: { - p: "مین", - f: "mayín", - }, - out: "مَیِن", - }, - { - in: { - p: "سړی", - f: "saRey", - }, - out: "سَړی", - }, - { - in: { - p: "سړي", - f: "saRee", - }, - out: "سَړي", - }, - { - in: { - p: "زه", - f: "zu", - }, - out: "زهٔ", - }, - { - in: { - p: "زه", - f: "za", - }, - out: "زَه", - }, - { - in: { - p: "پېشنهاد", - f: "peshniháad", - }, - out: "پېشْنِهاد", - }, - { - in: { - p: "ایستل", - f: "eestul", - }, - out: "اِیسْت" + zwarakey + "ل", - }, - { - in: { - p: "ایستل", - f: "eystul", - }, - out: "ایسْت" + zwarakey + "ل", - }, - { - in: { - p: "اېسېدل", - f: "esedul", - }, - out: "اېسېد" + zwarakey + "ل", - }, - { - in: { - p: "اوسېدل", - f: "osedul", - }, - out: "اوسېد" + zwarakey + "ل", - }, - { - in: { - p: "اواز", - f: "awaaz", - }, - out: "اَواز", - }, - { - in: { - p: "اسلام", - f: "islaam", - }, - out: "اِسْلام", - }, - { - in: { - p: "واردول", - f: "waaridawul", - }, - out: "وارِدَو" + zwarakey + "ل", - }, - { - in: { - p: "غاړه", - f: "ghaaRa", - }, - out: "غاړَه", - }, - { - in: { - p: "اوتر", - f: "awtár", - }, - out: "اَوْتَر", - }, - { - in: { - p: "اختیار", - f: "ikhtiyáar", - }, - out: "اِخْتِیار", - }, - { - in: { - p: "فریاد", - f: "faryáad", - }, - out: "فَرْیاد", - }, - { - in: { - p: "کارغه", - f: "kaarghu", - }, - out: "کارْغهٔ", - }, - { - in: { - p: "بې کار", - f: "be kaar", - }, - out: "بې کار", - }, - { - in: { - p: "بې کار", - f: "bekaar", - }, - out: "بې کار", - }, - { - in: { - p: "ارغون", - f: "arghóon", - }, - out: "اَرْغُون", - }, - { - in: { - p: "ارمټه", - f: "armaTa", - }, - out: "اَرْمَټَه", - }, - { - in: { - p: "اروا پوه", - f: "arwaa poh", - }, - out: "اَرْوا پوهْ", - }, - // starting alefs - { - in: { - p: "اسلام", - f: "islaam", - }, - out: "اِسْلام", - }, - // starting long vowels with ا - { - in: { - p: "ایسار", - f: "eesaar", - }, - out: "اِیسار", - }, - // double consonant / tashdeed - { - in: { - p: "بتن", - f: "battan", - }, - out: "ب" + zwar + "ت" + tashdeed + zwar + "ن", - }, - { - in: { - p: "بتطن", - f: "battan", - }, - out: "ب" + zwar + "ت" + sukun + "ط" + zwar + "ن", - }, - // vowel endings working - { - in: { - p: "بته", - f: "bata", - }, - out: "بَتَه", - }, - { - in: { - p: "بته", - f: "bati", - }, - out: "بَتِه", - }, - { - in: { - p: "پرمختیا", - f: "parmakhtyaa", - }, - out: "پَرْمَخْتْیا", - }, - { - in: { - p: "پته", - f: "patta", - }, - out: "پَتَّه", - }, - { - in: { - p: "پته تور", - f: "patta toor", - }, - out: "پَتَّه تُور", - }, - { - in: { - p: "لکۍ وال", - f: "lakuy waal", - }, - out: "لَکۍ وال", - }, - // avoid false double consonant - { - in: { - p: "ازل لیک", - f: "azalléek", - }, - out: "اَزَل لِیک", - }, - { - in: { - p: "سه", - f: "si", - }, - out: "سِه", - }, - { - in: { - p: "سه شنبه", - f: "sishamba", - }, - out: "سِه شَنْبَه", - }, - { - in: { - p: "توجه", - f: "tawajÚ", - }, - out: "تَوَجُه", - }, - { - in: { - p: "توجه کول", - f: "tawajU kawul", - }, - out: "تَوَجُه کَو" + zwarakey + "ل", - }, - { - in: { - p: "با استعداد", - f: "baa isti'dáad", - }, - out: "با اِسْتِعْداد", - }, - { - in: { - p: "آدم", - f: "aadam", - }, - out: "آدَم", - }, - { - in: { - p: "آسان", - f: "aasáan", - }, - out: "آسان", - }, - { - in: { - p: "آسان", - f: "asáan", - }, - out: null, - }, - { - in: { - p: "یدام", - f: "aadam", - }, - out: null, - }, - { - in: { - p: "سختسری", - f: "sakht sărey", - }, - out: "سَخْتْسَری", - }, - { - in: { - p: " سپین کړه", - f: " speen kRu", - }, - out: "سْپِین کْړهٔ", - }, - { - in: { - p: "اوب", - f: "ob", - }, - out: "اوب", - }, - { - in: { - p: "قطعه بازي", - f: "qit'a baazee", - }, - out: "قِطْعَه بازي", - }, - { - in: { - p: "مقرر", - f: "mUqarrár", - }, - out: "مُقَرٌَر", - }, - { - in: { - p: "متردد", - f: "mUtariddíd", - }, - out: "مُتَرِدِّد", - }, - { - in: { - p: "زره", - f: "zirih", - }, - out: "زِرِهْ", - }, - { - in: { - p: "وری", - f: "waréy", - }, - out: "وَری", - }, - { - in: { - p: "فلاح", - f: "faláa", - }, - out: "فَلاح", - }, - { - in: { - p: "امزری", - f: "umzaréy", - }, - out: zwarakey + "مْزَری", - }, - ], - }, - { - describe: "ې followed by ی - y needs to be written as e`y to be distinguished from ey - ی", - tests: [ - { - in: { - p: "پتېیل", - f: "pateyúl", - }, - out: null, - }, - { - in: { - p: "پتېیل", - f: "pate`yúl", - }, - out: "پَتېی" + zwarakey + "ل", - }, - { - in: { - p: "درېیم", - f: "dre`yum", - }, - out: "دْرېی" + zwarakey + "م", - }, - ], - }, - { - describe: "handle circumpositions", - tests: [ - { - in: { - p: "تر ... پورې", - f: "tur ... pore", - }, - out: "ت" + zwarakey + "ر ... پورې", - }, - ], - }, - { - describe: "nm - mb thing", - tests: [ - { - in: { - p: "انبار", - f: "ambáar", - }, - out: "اَنْبار", - }, - ], - }, - { - describe: "excetption for و - wo", - tests: [ - { - in: { - p: "و", - f: "wo", - }, - out: "و", - }, - { - in: { - p: "سړی و", - f: "saRey wo", - }, - out: "سَړی و", - }, - ], - }, - { - describe: "alef with hamza above", - tests: [ - { - in: { - p: "جرأت", - f: "jUrát", - }, - out: "جُرأت", - }, - { - in: { - p: "جرأت", - f: "jUr'át", - }, - out: "جُرأت", - }, - ], - }, - { - describe: "ayn stuff", - tests: [ - { - in: { - p: "بعد", - f: "ba'd", - }, - out: "بَعْد", - }, - { - in: { - p: "بعد", - f: "b'ad", - }, - out: "بْعَد", - }, - { - in: { - p: "بعد", - f: "ba'ad", - }, - out: "بَعَد", - }, - { - in: { - p: "بعد", - f: "baad", - }, - out: "بَعَد", - }, - { - in: { - p: "بعد", - f: "bad", - }, - out: "بَعد", - }, - { - in: { - p: "معلوم", - f: "maaloom", - }, - out: "مَعَلُوم", - }, - { - in: { - p: "منبع", - f: "manbi'", - }, - out: "مَنْبِع", - }, - { - in: { - p: "منبع", - f: "manb'i", - }, - out: "مَنْبْعِ" - }, - { - in: { - p: "منبع", - f: "manbee", - }, - out: "مَنْبِعِ", - }, - { - in: { - p: "منبع", - f: "manbi", - }, - out: "مَنْبِع" - }, - { - in: { - p: "معنا", - f: "ma'náa", - }, - out: "مَعْنا", - }, - { - in: { - p: "معنا", - f: "maanáa", - }, - out: "مَعَنا", - }, - { - in: { - p: "طمع استعمال", - f: "tama istimaal", - }, - out: "طَمَع اِسْتِعمال", - }, - { - in: { - p: "مربع", - f: "mUraba'", - }, - out: "مُرَبَع", - }, - { - in: { - p: "مربع جذر", - f: "mUraba' jazúr", - }, - out: "مُرَبَع جَذ" + zwarakey + "ر", - }, - { - in: { - p: "عام", - f: "'aam", - }, - out: "عام", - }, - { - in: { - p: "قتل عام", - f: "qatl-i-aam", - }, - out: "قَتْلِ عام", - }, - { - in: { - p: "توقع", - f: "tawaqqÚ", - }, - out: "تَوَقُّع", - }, - { - in: { - p: "راجع کېدل", - f: "raaji kedul", - }, - out: "راجِع کېد" + zwarakey + "ل", - }, - { - in: { - p: "ربیع", - f: "rabee'", - }, - out: "رَبِیع", - }, - ], - }, - { - describe: "ayn at the beginning", - tests: [ - // as a short vowel at the beginning - { - in: { - p: "عزت", - f: "izzat", - }, - out: "عِزَّت", - }, - { - in: { - p: "عزت", - f: "i'zzat", - }, - out: "عِْزَّت", - }, - { - in: { - p: "عذر", - f: "Uzar", - }, - out: "عُذَر", - }, - { - in: { - p: "عذر", - f: "U'zar", - }, - out: "عُْذَر", - }, - // as a short i with an alef - { - in: { - p: "اعتصاب شکن", - f: "itisaab shakan", - }, - out: "اِعتِصاب شَکَن", - }, - { - in: { - p: "اعتصاب شکن", - f: "i'tisaab shakan", - }, - out: "اِعْتِصاب شَکَن", - }, - // as a long aa at beginning - { - in: { - p: "عادل", - f: "aadíl", - }, - out: "عادِل", - }, - { - in: { - p: "عید", - f: "eed", - }, - out: "عِید", - }, - ], - }, - { - describe: "ayn at the end", - tests: [ - { - in: { - p: "اجماع", - f: "ijmaa", - }, - out: "اِجْماع", - }, - { - in: { - p: "اجماع", - f: "ijmaa'", - }, - out: "اِجْماع", - } - ], - }, - { - describe: "ئ in the middle", - tests: [ - { - in: { - p: "برائت", - f: "baraa'at", - }, - out: "بَرائَت", - }, - { - in: { - p: "فائده", - f: "faaida", - }, - out: "فائِدَه", - }, - ], - }, - { - describe: "واخ being khaa in the middle of a word", - tests: [ - { - in: { - p: "استخوان", - f: "UstUkháan", - }, - out: "اُسْتُخ(و)ان", - }, - ], - }, - { - describe: "Arabic wasla", - tests: [ - { - in: { - p: "بالکل", - f: "bilkUl", - }, - out: "بِٱلْکُل", - }, - ], - }, - { - describe: "izafe", - tests: [ - { - in: { - p: "ایصال ثواب", - f: "eesaal-i-sawaab", - }, - out: "اِیصالِ ثَواب", - }, - ], - }, - { - describe: "joiner و", - tests: [ - { - in: { - p: "کار و بار", - f: "kaar-U-baar", - }, - out: "کار و بار", - }, - { - in: { - p: "کاروبار", - f: "kaar-U-baar", - }, - out: "کاروبار", - }, - ], - }, - { - describe: "special behaviour with د", - tests: [ - { - in: { - p: "د", - f: "du", - }, - out: "د" + zwarakey, - }, - { - in: { - p: "د لاس", - f: "du laas", - }, - out: "د" + zwarakey + " لاس", - }, - { - in: { - p: "د ... په شان", - f: "du ... pu shaan", - }, - out: "د" + zwarakey + " ... پهٔ شان", - }, - ], - }, - { - describe: "ha ending with ح", - tests: [ - { - in: { - p: "ذبح", - f: "zabha", - }, - out: "ذَبْحَ", - }, - { - in: { - p: "ذبح کول", - f: "zabha kawul", - }, - out: "ذَبْحَ کَو" + zwarakey + "ل", - }, - ], - }, - { - describe: "require dagger alif on words ending with یٰ", - tests: [ - { - in: { - p: "یحیی", - f: "yahyaa", - }, - out: null, - }, - { - in: { - p: "یحییٰ", - f: "yahyaa", - }, - out: "یَحْییٰ", - }, - { - in: { - p: "یحییٰ چېرته", - f: "yahyaa cherta", - }, - out: "یَحْییٰ چېرْتَه", - }, - { - in: { - p: "معنیٰ", - f: "ma'anaa", - }, - out: "مَعَنیٰ", - }, - ], - }, - { - describe: "require fathatan on words ending in اً ", - tests: [ - { - in: { - p: "دقیقا", - f: "daqeeqan", - }, - out: null, - }, - { - in: { - p: "دقیقاً", - f: "daqeeqan", - }, - out: "دَقِیقاً", - }, - ], - }, - { - describe: "Ua ؤ", - tests: [ - { - in: { - p: "مودب", - f: "mUaddab", - }, - out: "مُؤَدَّب", - }, - ], - }, - { - describe: "With Arabic definate article -Ul- ال", - tests: [ - { - in: { - p: "حق الاجاره", - f: "haq-Ul-ijaara", - }, - out: "حَق اُلاِجارَه", - }, - { - in: { - p: "دار العلوم", - f: "daar-Ul-Ulóom", - }, - out: "دار اُلعُلُوم", - }, - ], - }, - { - describe: "double consonants on end of words", - tests: [ - { - in: { - p: "حق", - f: "haqq", - }, - out: "حَقّ", - }, - { - in: { - p: "حق پر", - f: "haqq par", - }, - out: "حَقّ پَر", - }, - ], - }, - { - describe: "أ in the middle of the word", - tests: [ - { - in: { - p: "متأسف", - f: "mUtaassif", - }, - out: "مُتأسِّف", - }, - { - in: { - p: "متأسف", - f: "mUta'assif", - }, - out: "مُتأسِّف", - }, - ], - }, - { - describe: "ؤو in middle of the word", - tests: [ - { - in: { - p: "مسوول", - f: "mas'ool", - }, - out: "مَسؤول", // TODO: Is this best?? - }, - ], - }, - { - describe: "allow for beginnings prefixed with ور در را", - tests: [ - { - in: { - p: "وراوږد", - f: "wăr-ooGad", - }, - out: "وَراُوږَد", - }, - { - in: { - p: "دراوږد", - f: "dăr-ooGad", - }, - out: "دَراُوږَد", - }, - { - in: { - p: "رااوږد", - f: "raa-ooGad", - }, - out: "رااُوږَد", - }, - ], - }, - { - describe: "allow oo at start with و prefix", - tests: [ - { - in: { - p: "وباسي", - f: "oobaasee", - }, - out: "وُباسي", - }, - { - in: { - p: "وځم", - f: "oodzum", - }, - out: "وُځ" + zwarakey + "م", - }, - { - in: { - p: "وځم", - f: "wUdzum", - }, - out: "وُځ" + zwarakey + "م", - }, - ], - }, + { + describe: "regular, native Pashto script/sounds", + tests: [ + { + in: { + p: "کور", + f: "kor", + }, + out: "کور", + }, + { + in: { + p: "کور", + f: "koor", + }, + out: "کُور", + }, + { + in: { + p: "کور کور", + f: "kor koor", + }, + out: "کور کُور", + }, + { + in: { + p: "تب", + f: "tib", + }, + out: "تِب", + }, + { + in: { + p: "تب", + f: "tab", + }, + out: "تَب", + }, + { + in: { + p: "تب", + f: "tUb", + }, + out: "تُب", + }, + { + in: { + p: "تب", + f: "tub", + }, + out: "تٙب", + }, + { + in: { + p: "تب", + f: "tb", + }, + out: "تْب", + }, + { + in: { + p: "تلب", + f: "tilab", + }, + out: "تِلَب", + }, + { + in: { + p: "تشناب", + f: "tashnaab", + }, + out: "تَشْناب", + }, + { + in: { + p: "پسته", + f: "pasta", + }, + out: "پَسْتَه", + }, + // working with ئ as vowel at end + { + in: { + p: "شئ", + f: "shey", + }, + out: "شئ", + }, + { + in: { + p: "کار کوئ چې لاړ شئ", + f: "kaar kawéy che laaR shey", + }, + out: "کار کَوئ چې لاړ شئ", + }, + // working with وs + { + in: { + p: "کول", + f: "kwal", + }, + out: "کْوَل", + }, + { + in: { + p: "تول", + f: "tool", + }, + out: "تُول", + }, + { + in: { + p: "مقبول", + f: "maqbool", + }, + out: "مَقْبُول", + }, + { + in: { + p: "کول", + f: "kawul", + }, + out: "کَو" + zwarakay + "ل", + }, + { + in: { + p: "کول", + f: "kiwul", + }, + out: "کِو" + zwarakay + "ل", + }, + { + in: { + p: "کول", + f: "kUwul", + }, + out: "کُو" + zwarakay + "ل", + }, + { + in: { + p: "کول", + f: "kuwul", + }, + out: "ک" + zwarakay + "و" + zwarakay + "ل", + }, + { + in: { + p: "کول", + f: "kawal", + }, + out: "کَوَل", + }, + { + in: { + p: "کول", + f: "kUwal", + }, + out: "کُوَل", + }, + { + in: { + p: "پشتګرد", + f: "pishtgird", + }, + out: "پِشْتْګِرْد", + }, + { + in: { + p: "سپین", + f: "speen", + }, + out: "سْپِین", + }, + { + in: { + p: "سپین", + f: "spayn", + }, + out: "سْپین", + }, + { + in: { + p: "پېش", + f: "pesh", + }, + out: "پېش", + }, + { + in: { + p: "لیک", + f: "leek", + }, + out: "لِیک", + }, + { + in: { + p: "ماضی", + f: "maazee", + }, + out: null, + }, + { + in: { + p: "وسېدل", + f: "osedul", + }, + out: null, + }, + { + in: { + p: "يست", + f: "eest", + }, + out: null, + }, + { + in: { + p: "ست", + f: "ist", + }, + out: null, + }, + { + in: { + p: "haca", + f: "هځه", + }, + out: null, + }, + { + in: { + p: "تشناب", + f: "peshnaab", + }, + out: null, + }, + { + in: { + p: "وسېدل", + f: "osedul", + }, + out: null, + }, + { + in: { + p: "رغېدل", + f: "raghedul", + }, + out: "رَغېد" + zwarakay + "ل", + }, + { + in: { + p: "کارول", + f: "kaarawul", + }, + out: "کارَو" + zwarakay + "ل", + }, + { + in: { + p: "پېښېدل", + f: "pexedul", + }, + out: "پېښېد" + zwarakay + "ل", + }, + { + in: { + p: "مین", + f: "mayín", + }, + out: "مَیِن", + }, + { + in: { + p: "سړی", + f: "saRay", + }, + out: "سَړی", + }, + { + in: { + p: "سړي", + f: "saRee", + }, + out: "سَړي", + }, + { + in: { + p: "زه", + f: "zu", + }, + out: "زهٔ", + }, + { + in: { + p: "زه", + f: "za", + }, + out: "زَه", + }, + { + in: { + p: "پېشنهاد", + f: "peshniháad", + }, + out: "پېشْنِهاد", + }, + { + in: { + p: "ایستل", + f: "eestul", + }, + out: "اِیسْت" + zwarakay + "ل", + }, + { + in: { + p: "ایستل", + f: "aystul", + }, + out: "ایسْت" + zwarakay + "ل", + }, + { + in: { + p: "اېسېدل", + f: "esedul", + }, + out: "اېسېد" + zwarakay + "ل", + }, + { + in: { + p: "اوسېدل", + f: "osedul", + }, + out: "اوسېد" + zwarakay + "ل", + }, + { + in: { + p: "اواز", + f: "awaaz", + }, + out: "اَواز", + }, + { + in: { + p: "اسلام", + f: "islaam", + }, + out: "اِسْلام", + }, + { + in: { + p: "واردول", + f: "waaridawul", + }, + out: "وارِدَو" + zwarakay + "ل", + }, + { + in: { + p: "غاړه", + f: "ghaaRa", + }, + out: "غاړَه", + }, + { + in: { + p: "اوتر", + f: "awtár", + }, + out: "اَوْتَر", + }, + { + in: { + p: "اختیار", + f: "ikhtiyáar", + }, + out: "اِخْتِیار", + }, + { + in: { + p: "فریاد", + f: "faryáad", + }, + out: "فَرْیاد", + }, + { + in: { + p: "کارغه", + f: "kaarghu", + }, + out: "کارْغهٔ", + }, + { + in: { + p: "بې کار", + f: "be kaar", + }, + out: "بې کار", + }, + { + in: { + p: "بې کار", + f: "bekaar", + }, + out: "بې کار", + }, + { + in: { + p: "ارغون", + f: "arghóon", + }, + out: "اَرْغُون", + }, + { + in: { + p: "ارمټه", + f: "armaTa", + }, + out: "اَرْمَټَه", + }, + { + in: { + p: "اروا پوه", + f: "arwaa poh", + }, + out: "اَرْوا پوهْ", + }, + // starting alefs + { + in: { + p: "اسلام", + f: "islaam", + }, + out: "اِسْلام", + }, + // starting long vowels with ا + { + in: { + p: "ایسار", + f: "eesaar", + }, + out: "اِیسار", + }, + // double consonant / tashdeed + { + in: { + p: "بتن", + f: "battan", + }, + out: "ب" + zwar + "ت" + tashdeed + zwar + "ن", + }, + { + in: { + p: "بتطن", + f: "battan", + }, + out: "ب" + zwar + "ت" + sukun + "ط" + zwar + "ن", + }, + // vowel endings working + { + in: { + p: "بته", + f: "bata", + }, + out: "بَتَه", + }, + { + in: { + p: "بته", + f: "bati", + }, + out: "بَتِه", + }, + { + in: { + p: "پرمختیا", + f: "parmakhtyaa", + }, + out: "پَرْمَخْتْیا", + }, + { + in: { + p: "پته", + f: "patta", + }, + out: "پَتَّه", + }, + { + in: { + p: "پته تور", + f: "patta toor", + }, + out: "پَتَّه تُور", + }, + { + in: { + p: "لکۍ وال", + f: "lakuy waal", + }, + out: "لَکۍ وال", + }, + // avoid false double consonant + { + in: { + p: "ازل لیک", + f: "azalléek", + }, + out: "اَزَل لِیک", + }, + { + in: { + p: "سه", + f: "si", + }, + out: "سِه", + }, + { + in: { + p: "سه شنبه", + f: "sishamba", + }, + out: "سِه شَنْبَه", + }, + { + in: { + p: "توجه", + f: "tawajÚ", + }, + out: "تَوَجُه", + }, + { + in: { + p: "توجه کول", + f: "tawajU kawul", + }, + out: "تَوَجُه کَو" + zwarakay + "ل", + }, + { + in: { + p: "با استعداد", + f: "baa isti'dáad", + }, + out: "با اِسْتِعْداد", + }, + { + in: { + p: "آدم", + f: "aadam", + }, + out: "آدَم", + }, + { + in: { + p: "آسان", + f: "aasáan", + }, + out: "آسان", + }, + { + in: { + p: "آسان", + f: "asáan", + }, + out: null, + }, + { + in: { + p: "یدام", + f: "aadam", + }, + out: null, + }, + { + in: { + p: "سختسری", + f: "sakht săray", + }, + out: "سَخْتْسَری", + }, + { + in: { + p: " سپین کړه", + f: " speen kRu", + }, + out: "سْپِین کْړهٔ", + }, + { + in: { + p: "اوب", + f: "ob", + }, + out: "اوب", + }, + { + in: { + p: "قطعه بازي", + f: "qit'a baazee", + }, + out: "قِطْعَه بازي", + }, + { + in: { + p: "مقرر", + f: "mUqarrár", + }, + out: "مُقَرٌَر", + }, + { + in: { + p: "متردد", + f: "mUtariddíd", + }, + out: "مُتَرِدِّد", + }, + { + in: { + p: "زره", + f: "zirih", + }, + out: "زِرِهْ", + }, + { + in: { + p: "وری", + f: "waráy", + }, + out: "وَری", + }, + { + in: { + p: "فلاح", + f: "faláa", + }, + out: "فَلاح", + }, + { + in: { + p: "امزری", + f: "umzaráy", + }, + out: zwarakay + "مْزَری", + }, + ], + }, + { + describe: + "ې followed by ی - y needs to be written as e`y to be distinguished from ay - ی", + tests: [ + { + in: { + p: "پتېیل", + f: "patayúl", + }, + out: null, + }, + { + in: { + p: "پتېیل", + f: "pate`yúl", + }, + out: "پَتېی" + zwarakay + "ل", + }, + { + in: { + p: "درېیم", + f: "dre`yum", + }, + out: "دْرېی" + zwarakay + "م", + }, + ], + }, + { + describe: "handle circumpositions", + tests: [ + { + in: { + p: "تر ... پورې", + f: "tur ... pore", + }, + out: "ت" + zwarakay + "ر ... پورې", + }, + ], + }, + { + describe: "nm - mb thing", + tests: [ + { + in: { + p: "انبار", + f: "ambáar", + }, + out: "اَنْبار", + }, + ], + }, + { + describe: "excetption for و - wo", + tests: [ + { + in: { + p: "و", + f: "wo", + }, + out: "و", + }, + { + in: { + p: "سړی و", + f: "saRay wo", + }, + out: "سَړی و", + }, + ], + }, + { + describe: "alef with hamza above", + tests: [ + { + in: { + p: "جرأت", + f: "jUrát", + }, + out: "جُرأت", + }, + { + in: { + p: "جرأت", + f: "jUr'át", + }, + out: "جُرأت", + }, + ], + }, + { + describe: "ayn stuff", + tests: [ + { + in: { + p: "بعد", + f: "ba'd", + }, + out: "بَعْد", + }, + { + in: { + p: "بعد", + f: "b'ad", + }, + out: "بْعَد", + }, + { + in: { + p: "بعد", + f: "ba'ad", + }, + out: "بَعَد", + }, + { + in: { + p: "بعد", + f: "baad", + }, + out: "بَعَد", + }, + { + in: { + p: "بعد", + f: "bad", + }, + out: "بَعد", + }, + { + in: { + p: "معلوم", + f: "maaloom", + }, + out: "مَعَلُوم", + }, + { + in: { + p: "منبع", + f: "manbi'", + }, + out: "مَنْبِع", + }, + { + in: { + p: "منبع", + f: "manb'i", + }, + out: "مَنْبْعِ", + }, + { + in: { + p: "منبع", + f: "manbee", + }, + out: "مَنْبِعِ", + }, + { + in: { + p: "منبع", + f: "manbi", + }, + out: "مَنْبِع", + }, + { + in: { + p: "معنا", + f: "ma'náa", + }, + out: "مَعْنا", + }, + { + in: { + p: "معنا", + f: "maanáa", + }, + out: "مَعَنا", + }, + { + in: { + p: "طمع استعمال", + f: "tama istimaal", + }, + out: "طَمَع اِسْتِعمال", + }, + { + in: { + p: "مربع", + f: "mUraba'", + }, + out: "مُرَبَع", + }, + { + in: { + p: "مربع جذر", + f: "mUraba' jazúr", + }, + out: "مُرَبَع جَذ" + zwarakay + "ر", + }, + { + in: { + p: "عام", + f: "'aam", + }, + out: "عام", + }, + { + in: { + p: "قتل عام", + f: "qatl-i-aam", + }, + out: "قَتْلِ عام", + }, + { + in: { + p: "توقع", + f: "tawaqqÚ", + }, + out: "تَوَقُّع", + }, + { + in: { + p: "راجع کېدل", + f: "raaji kedul", + }, + out: "راجِع کېد" + zwarakay + "ل", + }, + { + in: { + p: "ربیع", + f: "rabee'", + }, + out: "رَبِیع", + }, + ], + }, + { + describe: "ayn at the beginning", + tests: [ + // as a short vowel at the beginning + { + in: { + p: "عزت", + f: "izzat", + }, + out: "عِزَّت", + }, + { + in: { + p: "عزت", + f: "i'zzat", + }, + out: "عِْزَّت", + }, + { + in: { + p: "عذر", + f: "Uzar", + }, + out: "عُذَر", + }, + { + in: { + p: "عذر", + f: "U'zar", + }, + out: "عُْذَر", + }, + // as a short i with an alef + { + in: { + p: "اعتصاب شکن", + f: "itisaab shakan", + }, + out: "اِعتِصاب شَکَن", + }, + { + in: { + p: "اعتصاب شکن", + f: "i'tisaab shakan", + }, + out: "اِعْتِصاب شَکَن", + }, + // as a long aa at beginning + { + in: { + p: "عادل", + f: "aadíl", + }, + out: "عادِل", + }, + { + in: { + p: "عید", + f: "eed", + }, + out: "عِید", + }, + ], + }, + { + describe: "ayn at the end", + tests: [ + { + in: { + p: "اجماع", + f: "ijmaa", + }, + out: "اِجْماع", + }, + { + in: { + p: "اجماع", + f: "ijmaa'", + }, + out: "اِجْماع", + }, + ], + }, + { + describe: "ئ in the middle", + tests: [ + { + in: { + p: "برائت", + f: "baraa'at", + }, + out: "بَرائَت", + }, + { + in: { + p: "فائده", + f: "faaida", + }, + out: "فائِدَه", + }, + ], + }, + { + describe: "واخ being khaa in the middle of a word", + tests: [ + { + in: { + p: "استخوان", + f: "UstUkháan", + }, + out: "اُسْتُخ(و)ان", + }, + ], + }, + { + describe: "Arabic wasla", + tests: [ + { + in: { + p: "بالکل", + f: "bilkUl", + }, + out: "بِٱلْکُل", + }, + ], + }, + { + describe: "izafe", + tests: [ + { + in: { + p: "ایصال ثواب", + f: "eesaal-i-sawaab", + }, + out: "اِیصالِ ثَواب", + }, + ], + }, + { + describe: "joiner و", + tests: [ + { + in: { + p: "کار و بار", + f: "kaar-U-baar", + }, + out: "کار و بار", + }, + { + in: { + p: "کاروبار", + f: "kaar-U-baar", + }, + out: "کاروبار", + }, + ], + }, + { + describe: "special behaviour with د", + tests: [ + { + in: { + p: "د", + f: "du", + }, + out: "د" + zwarakay, + }, + { + in: { + p: "د لاس", + f: "du laas", + }, + out: "د" + zwarakay + " لاس", + }, + { + in: { + p: "د ... په شان", + f: "du ... pu shaan", + }, + out: "د" + zwarakay + " ... پهٔ شان", + }, + ], + }, + { + describe: "ha ending with ح", + tests: [ + { + in: { + p: "ذبح", + f: "zabha", + }, + out: "ذَبْحَ", + }, + { + in: { + p: "ذبح کول", + f: "zabha kawul", + }, + out: "ذَبْحَ کَو" + zwarakay + "ل", + }, + ], + }, + { + describe: "require dagger alif on words ending with یٰ", + tests: [ + { + in: { + p: "یحیی", + f: "yahyaa", + }, + out: null, + }, + { + in: { + p: "یحییٰ", + f: "yahyaa", + }, + out: "یَحْییٰ", + }, + { + in: { + p: "یحییٰ چېرته", + f: "yahyaa cherta", + }, + out: "یَحْییٰ چېرْتَه", + }, + { + in: { + p: "معنیٰ", + f: "ma'anaa", + }, + out: "مَعَنیٰ", + }, + ], + }, + { + describe: "require fathatan on words ending in اً ", + tests: [ + { + in: { + p: "دقیقا", + f: "daqeeqan", + }, + out: null, + }, + { + in: { + p: "دقیقاً", + f: "daqeeqan", + }, + out: "دَقِیقاً", + }, + ], + }, + { + describe: "Ua ؤ", + tests: [ + { + in: { + p: "مودب", + f: "mUaddab", + }, + out: "مُؤَدَّب", + }, + ], + }, + { + describe: "With Arabic definate article -Ul- ال", + tests: [ + { + in: { + p: "حق الاجاره", + f: "haq-Ul-ijaara", + }, + out: "حَق اُلاِجارَه", + }, + { + in: { + p: "دار العلوم", + f: "daar-Ul-Ulóom", + }, + out: "دار اُلعُلُوم", + }, + ], + }, + { + describe: "double consonants on end of words", + tests: [ + { + in: { + p: "حق", + f: "haqq", + }, + out: "حَقّ", + }, + { + in: { + p: "حق پر", + f: "haqq par", + }, + out: "حَقّ پَر", + }, + ], + }, + { + describe: "أ in the middle of the word", + tests: [ + { + in: { + p: "متأسف", + f: "mUtaassif", + }, + out: "مُتأسِّف", + }, + { + in: { + p: "متأسف", + f: "mUta'assif", + }, + out: "مُتأسِّف", + }, + ], + }, + { + describe: "ؤو in middle of the word", + tests: [ + { + in: { + p: "مسوول", + f: "mas'ool", + }, + out: "مَسؤول", // TODO: Is this best?? + }, + ], + }, + { + describe: "allow for beginnings prefixed with ور در را", + tests: [ + { + in: { + p: "وراوږد", + f: "wăr-ooGad", + }, + out: "وَراُوږَد", + }, + { + in: { + p: "دراوږد", + f: "dăr-ooGad", + }, + out: "دَراُوږَد", + }, + { + in: { + p: "رااوږد", + f: "raa-ooGad", + }, + out: "رااُوږَد", + }, + ], + }, + { + describe: "allow oo at start with و prefix", + tests: [ + { + in: { + p: "وباسي", + f: "oobaasee", + }, + out: "وُباسي", + }, + { + in: { + p: "وځم", + f: "oodzum", + }, + out: "وُځ" + zwarakay + "م", + }, + { + in: { + p: "وځم", + f: "wUdzum", + }, + out: "وُځ" + zwarakay + "م", + }, + ], + }, ]; // diacriticsSections.forEach((section) => { @@ -1306,15 +1300,13 @@ const diacriticsSections: { // }); test("ending with left over Pashto script will throw an error", () => { - expect(() => { - addDiacritics({ p: "کور ته", f: "kor" }); - }).toThrow(`phonetics error - phonetics shorter than pashto script`); + expect(() => { + addDiacritics({ p: "کور ته", f: "kor" }); + }).toThrow(`phonetics error - phonetics shorter than pashto script`); }); test("ending with left over phonetics will throw an error", () => { - expect(() => { - addDiacritics({ p: "کار", f: "kaar kawul" }); - }).toThrow(); + expect(() => { + addDiacritics({ p: "کار", f: "kaar kawul" }); + }).toThrow(); }); - - diff --git a/src/lib/src/diacritics.ts b/src/lib/src/diacritics.ts index f806890..d73a266 100644 --- a/src/lib/src/diacritics.ts +++ b/src/lib/src/diacritics.ts @@ -8,25 +8,25 @@ import * as T from "../../types"; import { - splitFIntoPhonemes, - Phoneme, - zwar, - zwarakey, - zer, - pesh, - sukun, - hamzaAbove, - tashdeed, - wasla, - addP, - advanceP, - reverseP, - overwriteP, - advanceForHamza, - advanceForHamzaMid, - DiacriticsAccumulator, - stateInfo, - PhonemeStatus, + splitFIntoPhonemes, + Phoneme, + zwar, + zwarakay, + zer, + pesh, + sukun, + hamzaAbove, + tashdeed, + wasla, + addP, + advanceP, + reverseP, + overwriteP, + advanceForHamza, + advanceForHamzaMid, + DiacriticsAccumulator, + stateInfo, + PhonemeStatus, } from "./diacritics-helpers"; import { removeFVarients } from "./accent-and-ps-utils"; @@ -35,176 +35,107 @@ import { pipe } from "rambda"; /** * Adds diacritics to a given PsString. * Errors if the phonetics and script don't line up. + * + * IN PROGRESS - This will hopefully get done and replace the messy, unmaintainable phonetics-to-diacritics.ts currently in use */ - export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsString { - const phonemes: Phoneme[] = splitFIntoPhonemes(!ignoreCommas ? removeFVarients(f) : f); - const { pIn, pOut } = phonemes.reduce(processPhoneme, { pOut: "", pIn: p.trim() }); - if (pIn !== "") { - throw new Error("phonetics error - phonetics shorter than pashto script"); - } - return { - p: pOut, - f, - }; +export function addDiacritics( + { p, f }: T.PsString, + ignoreCommas?: true +): T.PsString { + const phonemes: Phoneme[] = splitFIntoPhonemes( + !ignoreCommas ? removeFVarients(f) : f + ); + const { pIn, pOut } = phonemes.reduce(processPhoneme, { + pOut: "", + pIn: p.trim(), + }); + if (pIn !== "") { + throw new Error("phonetics error - phonetics shorter than pashto script"); + } + return { + p: pOut, + f, + }; } function processPhoneme( - acc: DiacriticsAccumulator, - phoneme: Phoneme, - i: number, - phonemes: Phoneme[], + acc: DiacriticsAccumulator, + phoneme: Phoneme, + i: number, + phonemes: Phoneme[] ): DiacriticsAccumulator { - const state = acc.pIn.slice(0, 5) === " ... " - ? advanceP(acc, 5) - : acc.pIn[0] === " " - ? advanceP(acc) - : acc; + const state = + acc.pIn.slice(0, 5) === " ... " + ? advanceP(acc, 5) + : acc.pIn[0] === " " + ? advanceP(acc) + : acc; - const { - phonemeInfo, - diacritic, - phs, - prevPLetter, - } = stateInfo({ state, i, phoneme, phonemes }); + const { phonemeInfo, diacritic, phs, prevPLetter } = stateInfo({ + state, + i, + phoneme, + phonemes, + }); - return (phs === PhonemeStatus.LeadingLongVowel) ? - pipe( - advanceP, - addP(phonemeInfo.diacritic), - advanceP, - )(state) - : (phs === PhonemeStatus.LeadingConsonantOrShortVowel) ? - pipe( - advanceP, - addP(diacritic), - )(state) - : (phs === PhonemeStatus.DoubleConsonantTashdeed) ? - pipe( - prevPLetter === " " ? reverseP : addP(""), - addP(tashdeed) - )(state) - : (phs === PhonemeStatus.EndingWithHeyHim) ? - pipe( - advanceP, - addP(phoneme === "u" ? hamzaAbove : sukun), - )(state) - : (phs === PhonemeStatus.DirectMatch) ? - pipe( - addP(diacritic), - advanceP, - )(state) - : (phs === PhonemeStatus.DirectMatchAfterSukun) ? - pipe( - addP(sukun), - advanceP, - )(state) - : (phs === PhonemeStatus.PersianSilentWWithAa) ? - pipe( - addP("("), - advanceP, - addP(")"), - advanceP, - )(state) - : (phs === PhonemeStatus.ArabicWasla) ? - pipe( - addP(zer), - overwriteP(wasla), - )(state) - : (phs === PhonemeStatus.Izafe) ? - pipe( - reverseP, - addP(zer), - )(state) - : (phs === PhonemeStatus.EndOfDuParticle) ? - pipe( - reverseP, - addP(zwarakey), - )(state) - : (phs === PhonemeStatus.ShortAEndingAfterHeem) ? - pipe( - prevPLetter === " " ? reverseP : addP(""), - addP(zwar), - )(state) - : (phs === PhonemeStatus.EndingWithHeyHimFromSukun) ? - pipe( - addP(sukun), - advanceP, - )(state) - : (phs === PhonemeStatus.AlefDaggarEnding) ? - pipe( - advanceP, - advanceP, - )(state) - : (phs === PhonemeStatus.LongAinVowelMissingComma) ? - pipe( - addP(diacritic), - advanceP, - addP(diacritic) - )(state) - : (phs === PhonemeStatus.ShortAinVowelMissingComma) ? - pipe( - addP(diacritic), - advanceP, - )(state) - : (phs === PhonemeStatus.ShortAinVowelMissingCommaAfterAlefStart) ? - pipe( - advanceP, - advanceP, - )(state) - : (phs === PhonemeStatus.AinWithLongAAtBeginning) ? - pipe( - advanceP, - advanceP, - )(state) - : (phs === PhonemeStatus.AlefWithHamza) ? - pipe( - advanceP, - )(state) - : (phs === PhonemeStatus.ShortVowel) ? - pipe( - advanceForHamzaMid, - addP(phonemeInfo.diacritic), - // TODO THIS? - advanceForHamza, - )(state) - : (phs === PhonemeStatus.ShortAForAlefBeforeFathatan) ? - pipe( - advanceP, - )(state) - : (phs === PhonemeStatus.NOnFathatan) ? - pipe( - advanceP, - )(state) - : (phs === PhonemeStatus.HamzaOnWow) ? - pipe( - advanceP, - addP(hamzaAbove), - addP(diacritic), - )(state) - : (phs === PhonemeStatus.ArabicDefiniteArticleUl) ? - pipe( - advanceP, - addP(pesh), - advanceP, - )(state) - : (phs === PhonemeStatus.OoPrefix) ? - pipe( - advanceP, - addP(pesh), - )(state) - : (phs === PhonemeStatus.GlottalStopBeforeOo) ? - pipe( - advanceP, - addP(hamzaAbove), - )(state) - : (phs === PhonemeStatus.OoAfterGlottalStopOo) ? - pipe( - advanceP, - )(state) - : (phs === PhonemeStatus.SilentAinAfterAlef) ? - pipe( - advanceP, - advanceP, - )(state) - : state; + return phs === PhonemeStatus.LeadingLongVowel + ? pipe(advanceP, addP(phonemeInfo.diacritic), advanceP)(state) + : phs === PhonemeStatus.LeadingConsonantOrShortVowel + ? pipe(advanceP, addP(diacritic))(state) + : phs === PhonemeStatus.DoubleConsonantTashdeed + ? pipe(prevPLetter === " " ? reverseP : addP(""), addP(tashdeed))(state) + : phs === PhonemeStatus.EndingWithHayHim + ? pipe(advanceP, addP(phoneme === "u" ? hamzaAbove : sukun))(state) + : phs === PhonemeStatus.DirectMatch + ? pipe(addP(diacritic), advanceP)(state) + : phs === PhonemeStatus.DirectMatchAfterSukun + ? pipe(addP(sukun), advanceP)(state) + : phs === PhonemeStatus.PersianSilentWWithAa + ? pipe(addP("("), advanceP, addP(")"), advanceP)(state) + : phs === PhonemeStatus.ArabicWasla + ? pipe(addP(zer), overwriteP(wasla))(state) + : phs === PhonemeStatus.Izafe + ? pipe(reverseP, addP(zer))(state) + : phs === PhonemeStatus.EndOfDuParticle + ? pipe(reverseP, addP(zwarakay))(state) + : phs === PhonemeStatus.ShortAEndingAfterHeem + ? pipe(prevPLetter === " " ? reverseP : addP(""), addP(zwar))(state) + : phs === PhonemeStatus.EndingWithHayHimFromSukun + ? pipe(addP(sukun), advanceP)(state) + : phs === PhonemeStatus.AlefDaggarEnding + ? pipe(advanceP, advanceP)(state) + : phs === PhonemeStatus.LongAinVowelMissingComma + ? pipe(addP(diacritic), advanceP, addP(diacritic))(state) + : phs === PhonemeStatus.ShortAinVowelMissingComma + ? pipe(addP(diacritic), advanceP)(state) + : phs === PhonemeStatus.ShortAinVowelMissingCommaAfterAlefStart + ? pipe(advanceP, advanceP)(state) + : phs === PhonemeStatus.AinWithLongAAtBeginning + ? pipe(advanceP, advanceP)(state) + : phs === PhonemeStatus.AlefWithHamza + ? pipe(advanceP)(state) + : phs === PhonemeStatus.ShortVowel + ? pipe( + advanceForHamzaMid, + addP(phonemeInfo.diacritic), + // TODO THIS? + advanceForHamza + )(state) + : phs === PhonemeStatus.ShortAForAlefBeforeFathatan + ? pipe(advanceP)(state) + : phs === PhonemeStatus.NOnFathatan + ? pipe(advanceP)(state) + : phs === PhonemeStatus.HamzaOnWow + ? pipe(advanceP, addP(hamzaAbove), addP(diacritic))(state) + : phs === PhonemeStatus.ArabicDefiniteArticleUl + ? pipe(advanceP, addP(pesh), advanceP)(state) + : phs === PhonemeStatus.OoPrefix + ? pipe(advanceP, addP(pesh))(state) + : phs === PhonemeStatus.GlottalStopBeforeOo + ? pipe(advanceP, addP(hamzaAbove))(state) + : phs === PhonemeStatus.OoAfterGlottalStopOo + ? pipe(advanceP)(state) + : phs === PhonemeStatus.SilentAinAfterAlef + ? pipe(advanceP, advanceP)(state) + : state; } diff --git a/src/lib/src/phonetics-to-diacritics.test.ts b/src/lib/src/phonetics-to-diacritics.test.ts index c123a14..3d58668 100644 --- a/src/lib/src/phonetics-to-diacritics.test.ts +++ b/src/lib/src/phonetics-to-diacritics.test.ts @@ -7,1109 +7,1110 @@ */ import { - phoneticsToDiacritics, - splitFIntoPhonemes, + phoneticsToDiacritics, + splitFIntoPhonemes, } from "./phonetics-to-diacritics"; -const zwarakey = "ٙ"; +const zwarakay = "ٙ"; const phonemeSplits: Array<{ - in: string, - out: string[], + in: string; + out: string[]; }> = [ - { - in: "kor", - out: ["k", "o", "r"], - }, - { - in: "raaghey", - out: ["r", "aa", "gh", "ey"], - }, - { - in: "hatsa", - out: ["h", "a", "ts", "a"], - }, - { - in: "ba", - out: ["b", "a"], - }, - { - in: "peydáa", - out: ["p", "ey", "d", "áa"], - }, - { - in: "be kaar", - out: ["b", "e", "k", "aa", "r"], - }, - { - in: "raadzeyy", - out: ["r", "aa", "dz", "eyy"], - }, - { - in: "badanuy ??", - out: ["b", "a", "d", "a", "n", "uy"], - }, - { - in: "tur ... pore", - out: ["t", "u", "r", "p", "o", "r", "e"], - }, - { - in: "daar-Ul-iqaama", - out: ["d", "aa", "r", "-Ul-", "i", "q", "aa", "m", "a"], - }, + { + in: "kor", + out: ["k", "o", "r"], + }, + { + in: "raaghay", + out: ["r", "aa", "gh", "ay"], + }, + { + in: "hatsa", + out: ["h", "a", "ts", "a"], + }, + { + in: "ba", + out: ["b", "a"], + }, + { + in: "paydáa", + out: ["p", "ay", "d", "áa"], + }, + { + in: "be kaar", + out: ["b", "e", "k", "aa", "r"], + }, + { + in: "raadzey", + out: ["r", "aa", "dz", "ey"], + }, + { + in: "badanuy ??", + out: ["b", "a", "d", "a", "n", "uy"], + }, + { + in: "tur ... pore", + out: ["t", "u", "r", "p", "o", "r", "e"], + }, + { + in: "daar-Ul-iqaama", + out: ["d", "aa", "r", "-Ul-", "i", "q", "aa", "m", "a"], + }, ]; phonemeSplits.forEach((s) => { - test(`${s.in} should split properly`, () => { - const result = splitFIntoPhonemes(s.in); - expect(result).toEqual(s.out); - }); + test(`${s.in} should split properly`, () => { + const result = splitFIntoPhonemes(s.in); + expect(result).toEqual(s.out); + }); }); const toTest: Array<{ - in: { p: string, f: string }, - out: string | undefined, + in: { p: string; f: string }; + out: string | undefined; }> = [ - { - in: { - p: "کور", - f: "kor", - }, - out: "کور", - }, - { - in: { - p: "کور", - f: "koor", - }, - out: "کُور", - }, - { - in: { - p: "تب", - f: "tib", - }, - out: "تِب", - }, - { - in: { - p: "تب", - f: "tab", - }, - out: "تَب", - }, - { - in: { - p: "تب", - f: "tUb", - }, - out: "تُب", - }, - { - in: { - p: "تب", - f: "tub", - }, - out: "تٙب", - }, - { - in: { - p: "تب", - f: "tb", - }, - out: "تْب", - }, - { - in: { - p: "تلب", - f: "tilab", - }, - out: "تِلَب", - }, - { - in: { - p: "تشناب", - f: "tashnaab", - }, - out: "تَشْناب", - }, - // broken phonetics will return undefined - { - in: { - p: "تشناب", - f: "peshnaab", - }, - out: undefined, - }, - // working with وs - { - in: { - p: "کول", - f: "kwal", - }, - out: "کْوَل", - }, - { - in: { - p: "تول", - f: "tool", - }, - out: "تُول", - }, - { - in: { - p: "مقبول", - f: "maqbool", - }, - out: "مَقْبُول", - }, - { - in: { - p: "کول", - f: "kawul", - }, - out: "کَو" + zwarakey + "ل", - }, - { - in: { - p: "کول", - f: "kiwul", - }, - out: "کِو" + zwarakey + "ل", - }, - { - in: { - p: "کول", - f: "kUwul", - }, - out: "کُو" + zwarakey + "ل", - }, - { - in: { - p: "کول", - f: "kuwul", - }, - out: "ک" + zwarakey + "و" + zwarakey + "ل", - }, - { - in: { - p: "کول", - f: "kawal", - }, - out: "کَوَل", - }, - { - in: { - p: "کول", - f: "kUwal", - }, - out: "کُوَل", - }, - { - in: { - p: "پشتګرد", - f: "pishtgird", - }, - out: "پِشْتْګِرْد", - }, - { - in: { - p: "سپین", - f: "speen", - }, - out: "سْپِین", - }, - { - in: { - p: "سپین", - f: "speyn", - }, - out: "سْپین", - }, - { - in: { - p: "پېش", - f: "pesh", - }, - out: "پېش", - }, - { - in: { - p: "پېش", - f: "peysh", - }, - out: undefined, - }, - { - in: { - p: "رغېدل", - f: "raghedul", - }, - out: "رَغېد" + zwarakey + "ل", - }, - { - in: { - p: "کارول", - f: "kaarawul", - }, - out: "کارَو" + zwarakey + "ل", - }, - { - in: { - p: "پېښېدل", - f: "pexedul", - }, - out: "پېښېد" + zwarakey + "ل", - }, - { - in: { - p: "مین", - f: "mayín", - }, - out: "مَیِن", - }, - { - in: { - p: "سړی", - f: "saRey", - }, - out: "سَړی", - }, - { - in: { - p: "سړي", - f: "saRee", - }, - out: "سَړي", - }, - { - in: { - p: "زه", - f: "zu", - }, - out: "زهٔ", - }, - { - in: { - p: "زه", - f: "za", - }, - out: "زه", - }, - { - in: { - p: "پېشنهاد", - f: "peshniháad", - }, - out: "پېشْنِهاد", - }, - { - in: { - p: "ایستل", - f: "eestul", - }, - out: "اِیسْت" + zwarakey + "ل", - }, - { - in: { - p: "ایستل", - f: "eystul", - }, - out: "ایسْت" + zwarakey + "ل", - }, - { - in: { - p: "اېسېدل", - f: "esedul", - }, - out: "اېسېد" + zwarakey + "ل", - }, - { - in: { - p: "اوسېدل", - f: "osedul", - }, - out: "اوسېد" + zwarakey + "ل", - }, - { - in: { - p: "اواز", - f: "awaaz", - }, - out: "اَواز", - }, - { - in: { - p: "اسلام", - f: "islaam", - }, - out: "اِسْلام", - }, - { - in: { - p: "واردول", - f: "waaridawul", - }, - out: "وارِدَو" + zwarakey + "ل", - }, - { - in: { - p: "غاړه", - f: "ghaaRa", - }, - out: "غاړه", - }, - { - in: { - p: "اوتر", - f: "awtár", - }, - out: "اَوْتَر", - }, - { - in: { - p: "اختیار", - f: "ikhtiyáar", - }, - out: "اِخْتِیار", - }, - { - in: { - p: "فریاد", - f: "faryáad", - }, - out: "فَرْیاد", - }, - { - in: { - p: "کارغه", - f: "kaarghu", - }, - out: "کارْغهٔ", - }, - { - in: { - p: "بې کار", - f: "be kaar", - }, - out: "بې کار", - }, - { - in: { - p: "بې کار", - f: "bekaar", - }, - out: "بې کار", - }, - { - in: { - p: "انبار", - f: "ambáar", - }, - out: "اَنْبار", - }, - { - in: { - p: "ارغون", - f: "arghóon", - }, - out: "اَرْغُون", - }, - { - in: { - p: "ارمټه", - f: "armaTa", - }, - out: "اَرْمَټه", - }, - { - in: { - p: "اروا پوه", - f: "arwaa poh", - }, - out: "اَرْوا پوهْ", - }, - { - in: { - p: "اسحاق", - f: "ishaaq", - }, - out: undefined, - }, - { - in: { - p: "اسحاق", - f: "is`haaq", - }, - out: "اِسْحاق", - }, - { - in: { - p: "سعات", - f: "saat", - }, - out: "سعات", - }, - { - in: { - p: "سعات", - f: "sa'aat", - }, - out: "سَعات", - }, - { - in: { - p: "استعمال", - f: "ist'imaal", - }, - out: "اِسْتعِمال", - }, - { - in: { - p: "استعمال", - f: "istimaal", - }, - out: "اِسْتعِمال", - }, - { - in: { - p: "اروایي", - f: "arwaayee", - }, - out: "اَرْوایي", - }, - { - in: { - p: "اریځ", - f: "Uryadz", - }, - out: "اُرْیَځ", - }, - { - in: { - p: "ازغن تار", - f: "azghun taar", - }, - out: "اَزْغ" + zwarakey + "ن" + " تار", - }, - { - in: { - p: "اره څکول", - f: "ara tskawul", - }, - out: "اَره څْکَو" + zwarakey + "ل", - }, - { - in: { - p: "اږیل", - f: "aGuyúl", - }, - out: "اَږ" + zwarakey + "ی" + zwarakey + "ل", - }, - { - in: { - p: "استازندوی", - f: "astaazandoy", - }, - out: "اَسْتازَنْدوی", - }, - // واخ being khaa in the middle of a word - { - in: { - p: "استخوان", - f: "UstUkháan", - }, - out: "اُسْتُخ(و)ان", - }, - { - in: { - p: "اسطلاع", - f: "istilaa", - }, - out: "اِسْطِلاع", - }, - { - in: { - p: "اسهال", - f: "is`háal", - }, - out: "اِسْهال", - }, - { - in: { - p: "اسهامي", - f: "as`haamee", - }, - out: "اَسْهامي", - }, - // avoid false double consonant - { - in: { - p: "ازل لیک", - f: "azalléek", - }, - out: "اَزَل لِیک", - }, - // bad ending test - { - in: { - p: "ماضی", - f: "maazee", - }, - out: undefined, - }, - // bad beginning test - { - in: { - p: "وسېدل", - f: "osedul", - }, - out: undefined, - }, - { - in: { - p: "يست", - f: "eest", - }, - out: undefined, - }, - { - in: { - p: "ست", - f: "ist", - }, - out: undefined, - }, - { - in: { - p: "haca", - f: "هځه", - }, - out: undefined, - }, - // tashdeed - { - in: { - p: "پته", - f: "patta", - }, - out: "پَتّه", - }, - { - in: { - p: "اعتصاب شکن", - f: "itisaabshikan", - }, - out: "اِعتِصاب شِکَن", - }, - // Arabic wasla - { - in: { - p: "بالکل", - f: "bilkUl", - }, - out: "بِٱلْکُل", - }, - // izafe - { - in: { - p: "ایصال ثواب", - f: "eesaal-i-sawaab", - }, - out: "اِیصالِ ثَواب", - }, - { - in: { - p: "با استعداد", - f: "baa isti'dáad", - }, - out: "با اِسْتِعداد", - }, - // starting with ع - { - in: { - p: "عزت", - f: "izzat", - }, - out: "عِزَّت", - }, - { - in: { - p: "عزت", - f: "i'zzat", - }, - out: "عِزَّت", - }, - // ئ in the middle - { - in: { - p: "برائت", - f: "baraa'at", - }, - out: "بَرائَت", - }, - { - in: { - p: "فائده", - f: "faaida", - }, - out: "فائِده", - }, - // starting with long aa - { - in: { - p: "آدم", - f: "aadam", - }, - out: "آدَم", - }, - { - in: { - p: "یدام", - f: "aadam", - }, - out: undefined, - }, { - in: { - p: "منع", - f: "mán'a", - }, - out: "مَنعَ", - }, - { - in: { - p: "منع", - f: "mana", - }, - out: "مَنعَ", - }, - { - in: { - p: "منابع", - f: "mUnaabí", - }, - out: "مُنابعِ", - }, - { - // TODO: Is this correct?? - in: { - p: "اسان", - f: "aasaan", - }, - out: "اسان", - }, - // ې followed by ی - y needs to be written as e`y to be distinguished from ey - ی - { - in: { - p: "پتېیل", - f: "pateyúl", - }, - out: undefined, - }, - { - in: { - p: "پتېیل", - f: "pate`yúl", - }, - out: "پَتېی" + zwarakey + "ل", - }, - { - in: { - p: "درېیم", - f: "dre`yum", - }, - out: "دْرېی" + zwarakey + "م", - }, - { - in: { - p: "تابع دار", - f: "taabidaar", - }, - out: "تابعِ دار", - }, - // handle circumpositions - { - in: { - p: "تر ... پورې", - f: "tur ... pore", - }, - out: "ت" + zwarakey + "ر ... پورې", - }, - // joiner و - { - in: { - p: "کار و بار", - f: "kaar-U-baar", - }, - out: "کار و بار", - }, - { - in: { - p: "کاروبار", - f: "kaar-U-baar", - }, - out: "کاروبار", - }, - { - in: { - p: "توقع", - f: "tawaqqÚ", - }, - out: "تَوَقّعُ", - }, - // special behaviour with د - { - in: { - p: "د", - f: "du", - }, - out: "د" + zwarakey, - }, - { - in: { - p: "د لاس", - f: "du laas", - }, - out: "د" + zwarakey + " لاس", - }, - { - in: { - p: "د ... په شان", - f: "du ... pu shaan", - }, - out: "د" + zwarakey + " ... پهٔ شان", - }, - { - in: { - p: "ذبح", - f: "zabha", - }, - out: "ذَبْحَ", - }, - { - in: { - p: "ذبح", - f: "zabha", - }, - out: "ذَبْحَ", - }, - { - in: { - p: "ذبح کول", - f: "zabha kawul", - }, - out: "ذَبْحَ کَو" + zwarakey + "ل", - }, - // require dagger alif on words ending with یٰ - { - in: { - p: "یحیی", - f: "yahyaa", - }, - out: undefined, - }, - { - in: { - p: "یحییٰ", - f: "yahyaa", - }, - out: "یَحْییٰ", - }, - { - in: { - p: "معنیٰ", - f: "ma'anaa", - }, - out: "مَعَنیٰ", - }, - // require fathatan on words ending in اً - { - in: { - p: "دقیقا", - f: "daqeeqan", - }, - out: undefined, - }, - { - in: { - p: "دقیقاً", - f: "daqeeqan", - }, - out: "دَقِیقاً", - }, - // words starting in عا - { - in: { - p: "عام", - f: "aam", - }, - out: "عام", - }, - { - in: { - p: "عام", - f: "'aam", - }, - out: "عام", - }, - { - in: { - p: "قتل عام", - f: "qatl-i-aam", - }, - out: "قَتْلِ عام", - }, - { - in: { - p: "طمع لرل", - f: "tama larul", - }, - out: "طَمعَ لَر" + zwarakey + "ل", - }, - // Ua ؤ - { - in: { - p: "مودب", - f: "mUaddab", - }, - out: "مؤدَّب", - }, - { - in: { - p: "لکۍ وال", - f: "lakuy waal", - }, - out: "لَکۍ وال", - }, - // shouldn't skip the ئ at the end - { - in: { - p: "شئ", - f: "sheyy", - }, - out: "شئ", - }, - // excetption for و - wo - { - in: { - p: "و", - f: "wo", - }, - out: "و", - }, - { - in: { - p: "سړی و", - f: "saRey wo", - }, - out: "سَړی و", - }, - { - in: { - p: "عید", - f: "eed", - }, - out: "عِید", - }, - // i ending can also be i - { - in: { - p: "سه", - f: "si", - }, - out: "سِه", - }, - { - in: { - p: "سه شنبه", - f: "sishamba", - }, - out: "سِه شَنْبه", - }, - { - in: { - p: "توجه", - f: "tawajÚ", - }, - out: "تَوَجُه", - }, - { - in: { - p: "توجه کول", - f: "tawajU kawul", - }, - out: "تَوَجُه کَو" + zwarakey + "ل", - }, - // With Arabic definate article -Ul- ال - { - in: { - p: "حق الاجاره", - f: "haq-Ul-ijaara", - }, - out: "حَق اُلاِجاره", - }, - { - in: { - p: "دار العلوم", - f: "daar-Ul-Ulóom", - }, - out: "دار اُلعُلُوم", - }, - // double consonants on end of words - { - in: { - p: "حق", - f: "haqq", - }, - out: "حَقّ", - }, - { - in: { - p: "حق پر", - f: "haqq par", - }, - out: "حَقّ پَر", - }, - { - in: { - p: "راجع کېدل", - f: "raaji kedul", - }, - out: "راجعِ کېد" + zwarakey + "ل", - }, - { - in: { - p: "ربیع", - f: "rabee'", - }, - out: "رَبِیع", - }, - { - in: { - p: "سختسری", - f: "sakht sărey", - }, - out: "سَخْتْسَری", - }, - { - in: { - p: "معنیٰ", - f: "ma'naa", - }, - out: "مَعنیٰ", - }, - // issue with یٰ ending and then continuing to the next word - { - in: { - p: "معنیٰ دار", - f: "ma'naa daar", - }, - out: "مَعنیٰ دار", - }, - { - in: { - p: "اله", - f: "ilah", - }, - out: "اِلَهْ", - }, - // issue with words ending in عه going to the next word - { - in: { - p: "قطعه بازي", - f: "qit'a baazee", - }, - out: "قِطعه بازي", - }, - // أ in the middle of the word - { - in: { - p: "متأسف", - f: "mUta'assif", - }, - out: "مُتأسِّف", - }, - // words ending in ع a' on to the next word - { - in: { - p: "مربع", - f: "mUraba'", - }, - out: "مُرَبَع", - }, - { - in: { - p: "مربع جذر", - f: "mUraba' jazúr", - }, - out: "مُرَبَع جَذ" + zwarakey + "ر", - }, - { - in: { - p: "مسوول", - f: "mas'ool", - }, - out: "مَسوُول", // TODO: Is this best?? - }, - // allow for beginnings prefixed with ور در را - { - in: { - p: "وراوږد", - f: "wăr-ooGad", - }, - out: "وَراُوږَد", - }, - { - in: { - p: "دراوږد", - f: "dăr-ooGad", - }, - out: "دَراُوږَد", - }, - { - in: { - p: "رااوږد", - f: "raa-ooGad", - }, - out: "رااُوږَد", - }, - // allow for spaces at beginning of phonetics etc. - { - in: { - p: " سپین کړه", - f: " speen kRu", - }, - out: "سْپِین کْړهٔ", - }, - { - in: { - p: "اوب", - f: "ob", - }, - out: "اوب", - }, - // allow oo at start with و prefix - { - in: { - p: "وباسي", - f: "oobaasee", - }, - out: "وباسي", - }, - { - in: { - p: "وځم", - f: "oodzum", - }, - out: "وځ" + zwarakey + "م", - }, - { - in: { - p: "وځم", - f: "wUdzum", - }, - out: "وُځ" + zwarakey + "م", - }, + { + in: { + p: "کور", + f: "kor", + }, + out: "کور", + }, + { + in: { + p: "کور", + f: "koor", + }, + out: "کُور", + }, + { + in: { + p: "تب", + f: "tib", + }, + out: "تِب", + }, + { + in: { + p: "تب", + f: "tab", + }, + out: "تَب", + }, + { + in: { + p: "تب", + f: "tUb", + }, + out: "تُب", + }, + { + in: { + p: "تب", + f: "tub", + }, + out: "تٙب", + }, + { + in: { + p: "تب", + f: "tb", + }, + out: "تْب", + }, + { + in: { + p: "تلب", + f: "tilab", + }, + out: "تِلَب", + }, + { + in: { + p: "تشناب", + f: "tashnaab", + }, + out: "تَشْناب", + }, + // broken phonetics will return undefined + { + in: { + p: "تشناب", + f: "peshnaab", + }, + out: undefined, + }, + // working with وs + { + in: { + p: "کول", + f: "kwal", + }, + out: "کْوَل", + }, + { + in: { + p: "تول", + f: "tool", + }, + out: "تُول", + }, + { + in: { + p: "مقبول", + f: "maqbool", + }, + out: "مَقْبُول", + }, + { + in: { + p: "کول", + f: "kawul", + }, + out: "کَو" + zwarakay + "ل", + }, + { + in: { + p: "کول", + f: "kiwul", + }, + out: "کِو" + zwarakay + "ل", + }, + { + in: { + p: "کول", + f: "kUwul", + }, + out: "کُو" + zwarakay + "ل", + }, + { + in: { + p: "کول", + f: "kuwul", + }, + out: "ک" + zwarakay + "و" + zwarakay + "ل", + }, + { + in: { + p: "کول", + f: "kawal", + }, + out: "کَوَل", + }, + { + in: { + p: "کول", + f: "kUwal", + }, + out: "کُوَل", + }, + { + in: { + p: "پشتګرد", + f: "pishtgird", + }, + out: "پِشْتْګِرْد", + }, + { + in: { + p: "سپین", + f: "speen", + }, + out: "سْپِین", + }, + { + in: { + p: "سپین", + f: "spayn", + }, + out: "سْپین", + }, + { + in: { + p: "پېش", + f: "pesh", + }, + out: "پېش", + }, + { + in: { + p: "پېش", + f: "paysh", + }, + out: undefined, + }, + { + in: { + p: "رغېدل", + f: "raghedul", + }, + out: "رَغېد" + zwarakay + "ل", + }, + { + in: { + p: "کارول", + f: "kaarawul", + }, + out: "کارَو" + zwarakay + "ل", + }, + { + in: { + p: "پېښېدل", + f: "pexedul", + }, + out: "پېښېد" + zwarakay + "ل", + }, + { + in: { + p: "مین", + f: "ma`yín", + }, + out: "مَیِن", + }, + { + in: { + p: "سړی", + f: "saRay", + }, + out: "سَړی", + }, + { + in: { + p: "سړي", + f: "saRee", + }, + out: "سَړي", + }, + { + in: { + p: "زه", + f: "zu", + }, + out: "زهٔ", + }, + { + in: { + p: "زه", + f: "za", + }, + out: "زه", + }, + { + in: { + p: "پېشنهاد", + f: "peshniháad", + }, + out: "پېشْنِهاد", + }, + { + in: { + p: "ایستل", + f: "eestul", + }, + out: "اِیسْت" + zwarakay + "ل", + }, + { + in: { + p: "ایستل", + f: "aystul", + }, + out: "ایسْت" + zwarakay + "ل", + }, + { + in: { + p: "اېسېدل", + f: "esedul", + }, + out: "اېسېد" + zwarakay + "ل", + }, + { + in: { + p: "اوسېدل", + f: "osedul", + }, + out: "اوسېد" + zwarakay + "ل", + }, + { + in: { + p: "اواز", + f: "awaaz", + }, + out: "اَواز", + }, + { + in: { + p: "اسلام", + f: "islaam", + }, + out: "اِسْلام", + }, + { + in: { + p: "واردول", + f: "waaridawul", + }, + out: "وارِدَو" + zwarakay + "ل", + }, + { + in: { + p: "غاړه", + f: "ghaaRa", + }, + out: "غاړه", + }, + { + in: { + p: "اوتر", + f: "awtár", + }, + out: "اَوْتَر", + }, + { + in: { + p: "اختیار", + f: "ikhtiyáar", + }, + out: "اِخْتِیار", + }, + { + in: { + p: "فریاد", + f: "faryáad", + }, + out: "فَرْیاد", + }, + { + in: { + p: "کارغه", + f: "kaarghu", + }, + out: "کارْغهٔ", + }, + { + in: { + p: "بې کار", + f: "be kaar", + }, + out: "بې کار", + }, + { + in: { + p: "بې کار", + f: "bekaar", + }, + out: "بې کار", + }, + { + in: { + p: "انبار", + f: "ambáar", + }, + out: "اَنْبار", + }, + { + in: { + p: "ارغون", + f: "arghóon", + }, + out: "اَرْغُون", + }, + { + in: { + p: "ارمټه", + f: "armaTa", + }, + out: "اَرْمَټه", + }, + { + in: { + p: "اروا پوه", + f: "arwaa poh", + }, + out: "اَرْوا پوهْ", + }, + { + in: { + p: "اسحاق", + f: "ishaaq", + }, + out: undefined, + }, + { + in: { + p: "اسحاق", + f: "is`haaq", + }, + out: "اِسْحاق", + }, + { + in: { + p: "سعات", + f: "saat", + }, + out: "سعات", + }, + { + in: { + p: "سعات", + f: "sa'aat", + }, + out: "سَعات", + }, + { + in: { + p: "استعمال", + f: "ist'imaal", + }, + out: "اِسْتعِمال", + }, + { + in: { + p: "استعمال", + f: "istimaal", + }, + out: "اِسْتعِمال", + }, + { + in: { + p: "اروایي", + f: "arwaayee", + }, + out: "اَرْوایي", + }, + { + in: { + p: "اریځ", + f: "Uryadz", + }, + out: "اُرْیَځ", + }, + { + in: { + p: "ازغن تار", + f: "azghun taar", + }, + out: "اَزْغ" + zwarakay + "ن" + " تار", + }, + { + in: { + p: "اره څکول", + f: "ara tskawul", + }, + out: "اَره څْکَو" + zwarakay + "ل", + }, + { + in: { + p: "اږیل", + f: "aGuyúl", + }, + out: "اَږ" + zwarakay + "ی" + zwarakay + "ل", + }, + { + in: { + p: "استازندوی", + f: "astaazandoy", + }, + out: "اَسْتازَنْدوی", + }, + // واخ being khaa in the middle of a word + { + in: { + p: "استخوان", + f: "UstUkháan", + }, + out: "اُسْتُخ(و)ان", + }, + { + in: { + p: "اسطلاع", + f: "istilaa", + }, + out: "اِسْطِلاع", + }, + { + in: { + p: "اسهال", + f: "is`háal", + }, + out: "اِسْهال", + }, + { + in: { + p: "اسهامي", + f: "as`haamee", + }, + out: "اَسْهامي", + }, + // avoid false double consonant + { + in: { + p: "ازل لیک", + f: "azalléek", + }, + out: "اَزَل لِیک", + }, + // bad ending test + { + in: { + p: "ماضی", + f: "maazee", + }, + out: undefined, + }, + // bad beginning test + { + in: { + p: "وسېدل", + f: "osedul", + }, + out: undefined, + }, + { + in: { + p: "يست", + f: "eest", + }, + out: undefined, + }, + { + in: { + p: "ست", + f: "ist", + }, + out: undefined, + }, + { + in: { + p: "haca", + f: "هځه", + }, + out: undefined, + }, + // tashdeed + { + in: { + p: "پته", + f: "patta", + }, + out: "پَتّه", + }, + { + in: { + p: "اعتصاب شکن", + f: "itisaabshikan", + }, + out: "اِعتِصاب شِکَن", + }, + // Arabic wasla + { + in: { + p: "بالکل", + f: "bilkUl", + }, + out: "بِٱلْکُل", + }, + // izafe + { + in: { + p: "ایصال ثواب", + f: "eesaal-i-sawaab", + }, + out: "اِیصالِ ثَواب", + }, + { + in: { + p: "با استعداد", + f: "baa isti'dáad", + }, + out: "با اِسْتِعداد", + }, + // starting with ع + { + in: { + p: "عزت", + f: "izzat", + }, + out: "عِزَّت", + }, + { + in: { + p: "عزت", + f: "i'zzat", + }, + out: "عِزَّت", + }, + // ئ in the middle + { + in: { + p: "برائت", + f: "baraa'at", + }, + out: "بَرائَت", + }, + { + in: { + p: "فائده", + f: "faaida", + }, + out: "فائِده", + }, + // starting with long aa + { + in: { + p: "آدم", + f: "aadam", + }, + out: "آدَم", + }, + { + in: { + p: "یدام", + f: "aadam", + }, + out: undefined, + }, + { + in: { + p: "منع", + f: "mán'a", + }, + out: "مَنعَ", + }, + { + in: { + p: "منع", + f: "mana", + }, + out: "مَنعَ", + }, + { + in: { + p: "منابع", + f: "mUnaabí", + }, + out: "مُنابعِ", + }, + { + // TODO: Is this correct?? + in: { + p: "اسان", + f: "aasaan", + }, + out: "اسان", + }, + // ې followed by ی - y needs to be written as e`y to be distinguished from ay - ی + { + in: { + p: "پتېیل", + f: "patayúl", + }, + out: undefined, + }, + { + in: { + p: "پتېیل", + f: "pate`yúl", + }, + out: "پَتېی" + zwarakay + "ل", + }, + { + in: { + p: "درېیم", + f: "dre`yum", + }, + out: "دْرېی" + zwarakay + "م", + }, + { + in: { + p: "تابع دار", + f: "taabidaar", + }, + out: "تابعِ دار", + }, + // handle circumpositions + { + in: { + p: "تر ... پورې", + f: "tur ... pore", + }, + out: "ت" + zwarakay + "ر ... پورې", + }, + // joiner و + { + in: { + p: "کار و بار", + f: "kaar-U-baar", + }, + out: "کار و بار", + }, + { + in: { + p: "کاروبار", + f: "kaar-U-baar", + }, + out: "کاروبار", + }, + { + in: { + p: "توقع", + f: "tawaqqÚ", + }, + out: "تَوَقّعُ", + }, + // special behaviour with د + { + in: { + p: "د", + f: "du", + }, + out: "د" + zwarakay, + }, + { + in: { + p: "د لاس", + f: "du laas", + }, + out: "د" + zwarakay + " لاس", + }, + { + in: { + p: "د ... په شان", + f: "du ... pu shaan", + }, + out: "د" + zwarakay + " ... پهٔ شان", + }, + { + in: { + p: "ذبح", + f: "zabha", + }, + out: "ذَبْحَ", + }, + { + in: { + p: "ذبح", + f: "zabha", + }, + out: "ذَبْحَ", + }, + { + in: { + p: "ذبح کول", + f: "zabha kawul", + }, + out: "ذَبْحَ کَو" + zwarakay + "ل", + }, + // require dagger alif on words ending with یٰ + { + in: { + p: "یحیی", + f: "yahyaa", + }, + out: undefined, + }, + { + in: { + p: "یحییٰ", + f: "yahyaa", + }, + out: "یَحْییٰ", + }, + { + in: { + p: "معنیٰ", + f: "ma'anaa", + }, + out: "مَعَنیٰ", + }, + // require fathatan on words ending in اً + { + in: { + p: "دقیقا", + f: "daqeeqan", + }, + out: undefined, + }, + { + in: { + p: "دقیقاً", + f: "daqeeqan", + }, + out: "دَقِیقاً", + }, + // words starting in عا + { + in: { + p: "عام", + f: "aam", + }, + out: "عام", + }, + { + in: { + p: "عام", + f: "'aam", + }, + out: "عام", + }, + { + in: { + p: "قتل عام", + f: "qatl-i-aam", + }, + out: "قَتْلِ عام", + }, + { + in: { + p: "طمع لرل", + f: "tama larul", + }, + out: "طَمعَ لَر" + zwarakay + "ل", + }, + // Ua ؤ + { + in: { + p: "مودب", + f: "mUaddab", + }, + out: "مؤدَّب", + }, + { + in: { + p: "لکۍ وال", + f: "lakuy waal", + }, + out: "لَکۍ وال", + }, + // shouldn't skip the ئ at the end + { + in: { + p: "شئ", + f: "shey", + }, + out: "شئ", + }, + // excetption for و - wo + { + in: { + p: "و", + f: "wo", + }, + out: "و", + }, + { + in: { + p: "سړی و", + f: "saRay wo", + }, + out: "سَړی و", + }, + { + in: { + p: "عید", + f: "eed", + }, + out: "عِید", + }, + // i ending can also be i + { + in: { + p: "سه", + f: "si", + }, + out: "سِه", + }, + { + in: { + p: "سه شنبه", + f: "sishamba", + }, + out: "سِه شَنْبه", + }, + { + in: { + p: "توجه", + f: "tawajÚ", + }, + out: "تَوَجُه", + }, + { + in: { + p: "توجه کول", + f: "tawajU kawul", + }, + out: "تَوَجُه کَو" + zwarakay + "ل", + }, + // With Arabic definate article -Ul- ال + { + in: { + p: "حق الاجاره", + f: "haq-Ul-ijaara", + }, + out: "حَق اُلاِجاره", + }, + { + in: { + p: "دار العلوم", + f: "daar-Ul-Ulóom", + }, + out: "دار اُلعُلُوم", + }, + // double consonants on end of words + { + in: { + p: "حق", + f: "haqq", + }, + out: "حَقّ", + }, + { + in: { + p: "حق پر", + f: "haqq par", + }, + out: "حَقّ پَر", + }, + { + in: { + p: "راجع کېدل", + f: "raaji kedul", + }, + out: "راجعِ کېد" + zwarakay + "ل", + }, + { + in: { + p: "ربیع", + f: "rabee'", + }, + out: "رَبِیع", + }, + { + in: { + p: "سختسری", + f: "sakht săray", + }, + out: "سَخْتْسَری", + }, + { + in: { + p: "معنیٰ", + f: "ma'naa", + }, + out: "مَعنیٰ", + }, + // issue with یٰ ending and then continuing to the next word + { + in: { + p: "معنیٰ دار", + f: "ma'naa daar", + }, + out: "مَعنیٰ دار", + }, + { + in: { + p: "اله", + f: "ilah", + }, + out: "اِلَهْ", + }, + // issue with words ending in عه going to the next word + { + in: { + p: "قطعه بازي", + f: "qit'a baazee", + }, + out: "قِطعه بازي", + }, + // أ in the middle of the word + { + in: { + p: "متأسف", + f: "mUta'assif", + }, + out: "مُتأسِّف", + }, + // words ending in ع a' on to the next word + { + in: { + p: "مربع", + f: "mUraba'", + }, + out: "مُرَبَع", + }, + { + in: { + p: "مربع جذر", + f: "mUraba' jazúr", + }, + out: "مُرَبَع جَذ" + zwarakay + "ر", + }, + { + in: { + p: "مسوول", + f: "mas'ool", + }, + out: "مَسوُول", // TODO: Is this best?? + }, + // allow for beginnings prefixed with ور در را + { + in: { + p: "وراوږد", + f: "wăr-ooGad", + }, + out: "وَراُوږَد", + }, + { + in: { + p: "دراوږد", + f: "dăr-ooGad", + }, + out: "دَراُوږَد", + }, + { + in: { + p: "رااوږد", + f: "raa-ooGad", + }, + out: "رااُوږَد", + }, + // allow for spaces at beginning of phonetics etc. + { + in: { + p: " سپین کړه", + f: " speen kRu", + }, + out: "سْپِین کْړهٔ", + }, + { + in: { + p: "اوب", + f: "ob", + }, + out: "اوب", + }, + // allow oo at start with و prefix + { + in: { + p: "وباسي", + f: "oobaasee", + }, + out: "وباسي", + }, + { + in: { + p: "وځم", + f: "oodzum", + }, + out: "وځ" + zwarakay + "م", + }, + { + in: { + p: "وځم", + f: "wUdzum", + }, + out: "وُځ" + zwarakay + "م", + }, ]; // TODO: قطع کول - qat'a kawul - failing @@ -1117,13 +1118,13 @@ const toTest: Array<{ // TODO: الله words toTest.forEach((t) => { - test(`${t.in.p} given phonetics ${t.in.f} should translate to ${t.out}`, () => { - const output = phoneticsToDiacritics(t.in.p, t.in.f); - expect(output).toBe(t.out); - }); + test(`${t.in.p} given phonetics ${t.in.f} should translate to ${t.out}`, () => { + const output = phoneticsToDiacritics(t.in.p, t.in.f); + expect(output).toBe(t.out); + }); }); test("should forbid oo prefixes when the option is passed", () => { - const output = phoneticsToDiacritics("وځم", "oodzum", true); - expect(output).toBe(undefined); + const output = phoneticsToDiacritics("وځم", "oodzum", true); + expect(output).toBe(undefined); }); diff --git a/src/lib/src/phonetics-to-diacritics.ts b/src/lib/src/phonetics-to-diacritics.ts index 6ab1b2b..9c78917 100644 --- a/src/lib/src/phonetics-to-diacritics.ts +++ b/src/lib/src/phonetics-to-diacritics.ts @@ -7,7 +7,7 @@ */ const zwar = "َ"; -const zwarakey = "ٙ"; +const zwarakay = "ٙ"; const zer = "ِ"; const pesh = "ُ"; const sukun = "ْ"; @@ -19,8 +19,25 @@ const fathahan = "ً"; // TODO: THESE OTHER TRIGRAPHS?? const quadrigraphs = ["-Ul-"]; -const trigraphs = ["eyy", "éyy", "-i-", "-U-"]; // , "aay", "áay", "ooy", "óoy"]; -const digraphs = ["ắ", "aa", "áa", "ee", "ée", "ey", "éy", "oo", "óo", "kh", "gh", "ts", "dz", "jz", "ch", "sh"]; +const trigraphs = ["ey", "éy", "-i-", "-U-"]; // , "aay", "áay", "ooy", "óoy"]; +const digraphs = [ + "ắ", + "aa", + "áa", + "ee", + "ée", + "ay", + "áy", + "oo", + "óo", + "kh", + "gh", + "ts", + "dz", + "jz", + "ch", + "sh", +]; const endingDigraphs = ["uy", "úy"]; const willIgnore = ["?", " ", "`", ".", "…"]; @@ -28,7 +45,7 @@ export function splitFIntoPhonemes(f: string): string[] { const result: string[] = []; let index = 0; while (index < f.length) { - const isLastTwoLetters = (index === f.length - 2 || f[index + 2] === " "); + const isLastTwoLetters = index === f.length - 2 || f[index + 2] === " "; const threeLetterChunk = f.slice(index, index + 3); const fourLetterChunk = f.slice(index, index + 4); if (quadrigraphs.includes(fourLetterChunk)) { @@ -89,43 +106,145 @@ const phonemeTable = [ { phoneme: "m", possibilities: ["م"], consonant: true }, { phoneme: "n", possibilities: ["ن"], consonant: true }, { phoneme: "N", possibilities: ["ڼ"], consonant: true }, - { phoneme: "h", possibilities: ["ه", "ح"], consonant: true, takesSukunOnEnding: true }, + { + phoneme: "h", + possibilities: ["ه", "ح"], + consonant: true, + takesSukunOnEnding: true, + }, { phoneme: "w", possibilities: ["و"], consonant: true }, { phoneme: "y", possibilities: ["ی"], consonant: true }, { phoneme: "'", possibilities: ["ع", "ئ"], consonant: true }, { phoneme: "-i-", isIzafe: true }, - { phoneme: "-U-", possibilities: [" و ", "و"]}, - { phoneme: "-Ul-", possibilities: ["ال"]}, + { phoneme: "-U-", possibilities: [" و ", "و"] }, + { phoneme: "-Ul-", possibilities: ["ال"] }, // vowels - { phoneme: "aa", possibilities: ["ا"], beginning: ["آ", "ا"], endingPossibilities: ["ا", "یٰ"], isLongA: true, canStartWithAynBefore: true }, - { phoneme: "áa", possibilities: ["ا"], beginning: ["آ", "ا"], endingPossibilities: ["ا", "یٰ"], isLongA: true, canStartWithAynBefore: true }, - { phoneme: "ee", possibilities: ["ی"], addAlefOnBeginning: true, endingPossibilities: ["ي"], diacritic: zer, canStartWithAynBefore: true }, - { phoneme: "ée", possibilities: ["ی"], addAlefOnBeginning: true, endingPossibilities: ["ي"], diacritic: zer, canStartWithAynBefore: true }, + { + phoneme: "aa", + possibilities: ["ا"], + beginning: ["آ", "ا"], + endingPossibilities: ["ا", "یٰ"], + isLongA: true, + canStartWithAynBefore: true, + }, + { + phoneme: "áa", + possibilities: ["ا"], + beginning: ["آ", "ا"], + endingPossibilities: ["ا", "یٰ"], + isLongA: true, + canStartWithAynBefore: true, + }, + { + phoneme: "ee", + possibilities: ["ی"], + addAlefOnBeginning: true, + endingPossibilities: ["ي"], + diacritic: zer, + canStartWithAynBefore: true, + }, + { + phoneme: "ée", + possibilities: ["ی"], + addAlefOnBeginning: true, + endingPossibilities: ["ي"], + diacritic: zer, + canStartWithAynBefore: true, + }, { phoneme: "e", possibilities: ["ې"], addAlefOnBeginning: true }, { phoneme: "é", possibilities: ["ې"], addAlefOnBeginning: true }, { phoneme: "o", possibilities: ["و"], addAlefOnBeginning: true }, { phoneme: "ó", possibilities: ["و"], addAlefOnBeginning: true }, - { phoneme: "oo", possibilities: ["و"], addAlefOnBeginning: true, alsoCanBePrefix: true, diacritic: pesh }, - { phoneme: "óo", possibilities: ["و"], addAlefOnBeginning: true, diacritic: pesh }, - { phoneme: "ey", possibilities: ["ی"], addAlefOnBeginning: true, endingPossibilities: ["ی"]}, - { phoneme: "éy", possibilities: ["ی"], addAlefOnBeginning: true, endingPossibilities: ["ی"]}, + { + phoneme: "oo", + possibilities: ["و"], + addAlefOnBeginning: true, + alsoCanBePrefix: true, + diacritic: pesh, + }, + { + phoneme: "óo", + possibilities: ["و"], + addAlefOnBeginning: true, + diacritic: pesh, + }, + { + phoneme: "ay", + possibilities: ["ی"], + addAlefOnBeginning: true, + endingPossibilities: ["ی"], + }, + { + phoneme: "áy", + possibilities: ["ی"], + addAlefOnBeginning: true, + endingPossibilities: ["ی"], + }, { phoneme: "uy", possibilities: ["ۍ"], endingOnly: true }, { phoneme: "úy", possibilities: ["ۍ"], endingOnly: true }, // THIS CAN ONLY COME AT THE END DEAL WITH THIS - { phoneme: "eyy", possibilities: ["ئ"], endingOnly: true }, - { phoneme: "éyy", possibilities: ["ئ"], endingOnly: true }, + { phoneme: "ey", possibilities: ["ئ"], endingOnly: true }, + { phoneme: "éy", possibilities: ["ئ"], endingOnly: true }, - { phoneme: "a", diacritic: zwar, endingPossibilities: ["ه"], canComeAfterHeyEnding: true, canBeFirstPartOfFathahanEnding: true }, - { phoneme: "á", diacritic: zwar, endingPossibilities: ["ه"], canComeAfterHeyEnding: true, canBeFirstPartOfFathahanEnding: true }, + { + phoneme: "a", + diacritic: zwar, + endingPossibilities: ["ه"], + canComeAfterHayEnding: true, + canBeFirstPartOfFathahanEnding: true, + }, + { + phoneme: "á", + diacritic: zwar, + endingPossibilities: ["ه"], + canComeAfterHayEnding: true, + canBeFirstPartOfFathahanEnding: true, + }, { phoneme: "ă", diacritic: zwar }, { phoneme: "ắ", diacritic: zwar }, - { phoneme: "u", diacritic: zwarakey, endingPossibilities: ["ه"], hamzaOnEnd: true }, - { phoneme: "ú", diacritic: zwarakey, endingPossibilities: ["ه"], hamzaOnEnd: true }, - { phoneme: "i", diacritic: zer, endingPossibilities: ["ه"], takesDiacriticBeforeGurdaHeyEnding: true, canBeWasla: true, beginning: ["ا", "ع"] }, - { phoneme: "í", diacritic: zer, endingPossibilities: ["ه"], takesDiacriticBeforeGurdaHeyEnding: true, canBeWasla: true, beginning: ["ا", "ع"] }, - { phoneme: "U", diacritic: pesh, endingPossibilities: ["ه"], takesDiacriticBeforeGurdaHeyEnding: true, beginning: ["ا", "ع"] }, - { phoneme: "Ú", diacritic: pesh, endingPossibilities: ["ه"], takesDiacriticBeforeGurdaHeyEnding: true, beginning: ["ا", "ع"] }, + { + phoneme: "u", + diacritic: zwarakay, + endingPossibilities: ["ه"], + hamzaOnEnd: true, + }, + { + phoneme: "ú", + diacritic: zwarakay, + endingPossibilities: ["ه"], + hamzaOnEnd: true, + }, + { + phoneme: "i", + diacritic: zer, + endingPossibilities: ["ه"], + takesDiacriticBeforeGurdaHayEnding: true, + canBeWasla: true, + beginning: ["ا", "ع"], + }, + { + phoneme: "í", + diacritic: zer, + endingPossibilities: ["ه"], + takesDiacriticBeforeGurdaHayEnding: true, + canBeWasla: true, + beginning: ["ا", "ع"], + }, + { + phoneme: "U", + diacritic: pesh, + endingPossibilities: ["ه"], + takesDiacriticBeforeGurdaHayEnding: true, + beginning: ["ا", "ع"], + }, + { + phoneme: "Ú", + diacritic: pesh, + endingPossibilities: ["ه"], + takesDiacriticBeforeGurdaHayEnding: true, + beginning: ["ا", "ع"], + }, ]; function isSpace(s: string): boolean { @@ -142,7 +261,11 @@ interface IDiacriticsErrorMessage { i: number; } -function possibilityMatches(p: string, pIndex: number, possibilities: string[] | undefined): boolean { +function possibilityMatches( + p: string, + pIndex: number, + possibilities: string[] | undefined +): boolean { /* istanbul ignore next */ if (!possibilities) { return false; @@ -155,10 +278,15 @@ function possibilityMatches(p: string, pIndex: number, possibilities: string[] | return false; } -function isPrefixedByDirectionalPronoun(i: number, phonemes: string[]): boolean { +function isPrefixedByDirectionalPronoun( + i: number, + phonemes: string[] +): boolean { const potentialPronounFourCharSlice = phonemes.slice(i - 4, i).join(""); const potentialPronounThreeCharSlice = phonemes.slice(i - 3, i).join(""); - if (["wăr-", "war-", "dăr-", "dar-"].includes(potentialPronounFourCharSlice)) { + if ( + ["wăr-", "war-", "dăr-", "dar-"].includes(potentialPronounFourCharSlice) + ) { return true; } if (potentialPronounThreeCharSlice === "raa-") { @@ -167,7 +295,11 @@ function isPrefixedByDirectionalPronoun(i: number, phonemes: string[]): boolean return false; } -export function phoneticsToDiacritics(ps: string, ph: string, forbidOoPrefixes: boolean = false): string | undefined { +export function phoneticsToDiacritics( + ps: string, + ph: string, + forbidOoPrefixes: boolean = false +): string | undefined { const phonemes = splitFIntoPhonemes(ph.trim().split(",")[0]); const p = ps.trim(); let result = ""; @@ -179,58 +311,72 @@ export function phoneticsToDiacritics(ps: string, ph: string, forbidOoPrefixes: if (phoneme === "-") { return; } - const phonemeInfo = phonemeTable.find((element) => element.phoneme === phoneme); + const phonemeInfo = phonemeTable.find( + (element) => element.phoneme === phoneme + ); if (!phonemeInfo) { errored.push({ error: "phoneme info not found", phoneme, i }); return; } - const isDoubleConsonant = ( + const isDoubleConsonant = phonemeInfo.consonant && phoneme === phonemes[i - 1] && // TODO: is this thourough enough to allow double consonants on the ending of the previous word? !(isSpace(p[pIndex - 1]) && phonemeInfo.possibilities.includes(p[pIndex])) // avoid false double consonant ie ازل لیک azalleek - ) ? true : false; - const isBeginning = !isDoubleConsonant && ((i === 0) || isSpace(p[pIndex - 1]) || (phonemes[i - 1] === "-Ul-") || isPrefixedByDirectionalPronoun(i, phonemes)); - const upcomingAEndingAfterHey = (p[pIndex] === "ح" && isSpace(p[pIndex + 1]) && ["a", "á"].includes(phonemes[i + 1])); - - // TODO: break this into a seperate function -- why can it sometimes be set to undefined? - const isEnding = (i === phonemes.length - 1) || (( - (phonemeInfo.possibilities && isSpace(p[pIndex + 1])) || - (!phonemeInfo.possibilities && isSpace(p[pIndex])) || - ( - (!phonemeInfo.possibilities && isSpace(p[pIndex + 1])) && - (possibilityMatches(p, pIndex, phonemeInfo.endingPossibilities) || (p[pIndex] === "ع" && phonemes[i + 1] !== "'")) - ) - ) && !upcomingAEndingAfterHey - && // makes sure the next letter isn't a double consonant like haqq <- - !( - phonemeInfo.consonant && phoneme === phonemes[i + 1] // && - // !(isSpace(p[pIndex + 1]) && phonemeInfo.possibilities.includes(p[pIndex])) - ) - ) || // can be the trailing double consanant on the end of a word - ( - phonemeInfo.consonant && phoneme === phonemes[i - 1] && - !(isEndSpace(p[pIndex - 1]) && phonemeInfo.possibilities.includes(p[pIndex])) - ) || // can be یٰ ending - ( - isEndSpace(p[pIndex + 2]) && (p.slice(pIndex, pIndex + 2) === "یٰ") - ); + ? true + : false; + const isBeginning = + !isDoubleConsonant && + (i === 0 || + isSpace(p[pIndex - 1]) || + phonemes[i - 1] === "-Ul-" || + isPrefixedByDirectionalPronoun(i, phonemes)); + const upcomingAEndingAfterHay = + p[pIndex] === "ح" && + isSpace(p[pIndex + 1]) && + ["a", "á"].includes(phonemes[i + 1]); - const isUofDu = phoneme === "u" && ( - p.slice(pIndex - 2, pIndex) === "د " || // د as previous word - (p[pIndex] === undefined && p[pIndex - 1] === "د") || // د as the whole thing - p.slice(pIndex - 6, pIndex) === "د ... " // ... د is as the previous word - ); + // TODO: break this into a seperate function -- why can it sometimes be set to undefined? + const isEnding = + i === phonemes.length - 1 || + (((phonemeInfo.possibilities && isSpace(p[pIndex + 1])) || + (!phonemeInfo.possibilities && isSpace(p[pIndex])) || + (!phonemeInfo.possibilities && + isSpace(p[pIndex + 1]) && + (possibilityMatches(p, pIndex, phonemeInfo.endingPossibilities) || + (p[pIndex] === "ع" && phonemes[i + 1] !== "'")))) && + !upcomingAEndingAfterHay && // makes sure the next letter isn't a double consonant like haqq <- + !( + (phonemeInfo.consonant && phoneme === phonemes[i + 1]) // && + // !(isSpace(p[pIndex + 1]) && phonemeInfo.possibilities.includes(p[pIndex])) + )) || // can be the trailing double consanant on the end of a word + (phonemeInfo.consonant && + phoneme === phonemes[i - 1] && + !( + isEndSpace(p[pIndex - 1]) && + phonemeInfo.possibilities.includes(p[pIndex]) + )) || // can be یٰ ending + (isEndSpace(p[pIndex + 2]) && p.slice(pIndex, pIndex + 2) === "یٰ"); + + const isUofDu = + phoneme === "u" && + (p.slice(pIndex - 2, pIndex) === "د " || // د as previous word + (p[pIndex] === undefined && p[pIndex - 1] === "د") || // د as the whole thing + p.slice(pIndex - 6, pIndex) === "د ... "); // ... د is as the previous word // TODO: Should p[pIndex - 1] also be in there ??? It messed up قطعه for instance - const isEndingAynVowel = isEnding && phonemeInfo.diacritic && [p[pIndex], p[pIndex - 1]].includes("ع") && p[pIndex] !== "ه"; + const isEndingAynVowel = + isEnding && + phonemeInfo.diacritic && + [p[pIndex], p[pIndex - 1]].includes("ع") && + p[pIndex] !== "ه"; const isMiddle = !isBeginning && !isEnding; - const isSilentWaw = ( + const isSilentWaw = p[pIndex] === "و" && p[pIndex - 1] === "خ" && p[pIndex + 1] === "ا" && - ["áa", "aa"].includes(phoneme) - ); - const isAnAEndingAfterHey = isEnding && p[pIndex - 1] === "ح" && phonemeInfo.canComeAfterHeyEnding; + ["áa", "aa"].includes(phoneme); + const isAnAEndingAfterHay = + isEnding && p[pIndex - 1] === "ح" && phonemeInfo.canComeAfterHayEnding; if (isDoubleConsonant) { pIndex--; if (isSpace(p[pIndex])) { @@ -247,14 +393,22 @@ export function phoneticsToDiacritics(ps: string, ph: string, forbidOoPrefixes: pIndex++; } // special check for Arabic wasla - if (p.slice(0, 3) === "بال" && phonemes[i - 1] === "b" && phonemeInfo.canBeWasla && phonemes[i + 1] === "l") { + if ( + p.slice(0, 3) === "بال" && + phonemes[i - 1] === "b" && + phonemeInfo.canBeWasla && + phonemes[i + 1] === "l" + ) { result += phonemeInfo.diacritic + wasla; pIndex++; previousPhonemeWasAConsonant = false; return; } // special check for fathahan ending - if (phonemeInfo.canBeFirstPartOfFathahanEnding && p.slice(pIndex, pIndex + 2) === "اً") { + if ( + phonemeInfo.canBeFirstPartOfFathahanEnding && + p.slice(pIndex, pIndex + 2) === "اً" + ) { result += "ا"; pIndex++; return; @@ -265,7 +419,12 @@ export function phoneticsToDiacritics(ps: string, ph: string, forbidOoPrefixes: return; } // special check for words starting with عا or عی - if (isBeginning && phonemeInfo.canStartWithAynBefore && p[pIndex] === "ع" && phonemeInfo.possibilities.includes(p[pIndex + 1])) { + if ( + isBeginning && + phonemeInfo.canStartWithAynBefore && + p[pIndex] === "ع" && + phonemeInfo.possibilities.includes(p[pIndex + 1]) + ) { result += "ع"; result += phonemeInfo.diacritic ? phonemeInfo.diacritic : ""; result += p[pIndex + 1]; @@ -273,23 +432,45 @@ export function phoneticsToDiacritics(ps: string, ph: string, forbidOoPrefixes: return; } // special check for ؤ Ua - if (phoneme === "U" && phonemes[i + 1] === "a" && phonemes[i + 2] !== "a" && p[pIndex] === "و") { + if ( + phoneme === "U" && + phonemes[i + 1] === "a" && + phonemes[i + 2] !== "a" && + p[pIndex] === "و" + ) { result += "ؤ"; pIndex++; return; } - if (phoneme === "a" && phonemes[i - 1] === "U" && phonemes[i + 1] !== "a" && result.slice(-2) === "ؤ") { + if ( + phoneme === "a" && + phonemes[i - 1] === "U" && + phonemes[i + 1] !== "a" && + result.slice(-2) === "ؤ" + ) { previousPhonemeWasAConsonant = false; return; } // special check for و wo - if (isBeginning && phoneme === "w" && phonemes[i + 1] === "o" && p[pIndex] === "و" && isEndSpace(p[pIndex + 1])) { + if ( + isBeginning && + phoneme === "w" && + phonemes[i + 1] === "o" && + p[pIndex] === "و" && + isEndSpace(p[pIndex + 1]) + ) { result += "و"; pIndex++; return; } // TODO: isEndSpace here is redundant?? - if (isEnding && phoneme === "o" && phonemes[i - 1] === "w" && p[pIndex - 1] === "و" && isEndSpace(p[pIndex])) { + if ( + isEnding && + phoneme === "o" && + phonemes[i - 1] === "w" && + p[pIndex - 1] === "و" && + isEndSpace(p[pIndex]) + ) { pIndex++; return; } @@ -300,38 +481,67 @@ export function phoneticsToDiacritics(ps: string, ph: string, forbidOoPrefixes: return; } // special check for for أ in the middle of the word - if (!isBeginning && p[pIndex] === "أ" && phoneme === "a" && phonemes[i + 1] === "'" && phonemes[i + 2] === "a") { + if ( + !isBeginning && + p[pIndex] === "أ" && + phoneme === "a" && + phonemes[i + 1] === "'" && + phonemes[i + 2] === "a" + ) { result += "أ"; pIndex++; return; } - if (p[pIndex - 1] === "أ" && phonemes[i - 1] === "a" && phoneme === "'" && phonemes[i + 1] === "a") { + if ( + p[pIndex - 1] === "أ" && + phonemes[i - 1] === "a" && + phoneme === "'" && + phonemes[i + 1] === "a" + ) { return; } - if (p[pIndex - 1] === "أ" && phonemes[i - 2] === "a" && phonemes[i - 1] === "'" && phoneme === "a") { + if ( + p[pIndex - 1] === "أ" && + phonemes[i - 2] === "a" && + phonemes[i - 1] === "'" && + phoneme === "a" + ) { previousPhonemeWasAConsonant = false; return; } // special check for وو 'oo - if (!isBeginning && p[pIndex] === "و" && p[pIndex + 1] === "و" && phoneme === "'" && phonemes[i + 1] === "oo") { + if ( + !isBeginning && + p[pIndex] === "و" && + p[pIndex + 1] === "و" && + phoneme === "'" && + phonemes[i + 1] === "oo" + ) { result += "وُو"; pIndex += 2; return; } - if (p[pIndex - 2] === "و" && p[pIndex - 1] === "و" && phonemes[i - 1] === "'" && phoneme === "oo") { + if ( + p[pIndex - 2] === "و" && + p[pIndex - 1] === "و" && + phonemes[i - 1] === "'" && + phoneme === "oo" + ) { previousPhonemeWasAConsonant = false; return; } - const prevLetterWasBeginningAyn = ( + const prevLetterWasBeginningAyn = p[pIndex - 1] === "ع" && // isEndSpace(p[pIndex]) && // This breaks it - phoneme === "'" - ); + phoneme === "'"; // check if the phoneme lines up in the Pashto word if (isBeginning && !isUofDu && phonemeInfo.addAlefOnBeginning) { // TODO: Maybe a little bad because it doesn't loop through possibilities - if ((!phonemeInfo.alsoCanBePrefix || forbidOoPrefixes) && p.slice(pIndex, pIndex + 2) !== "ا" + phonemeInfo.possibilities[0]) { + if ( + (!phonemeInfo.alsoCanBePrefix || forbidOoPrefixes) && + p.slice(pIndex, pIndex + 2) !== "ا" + phonemeInfo.possibilities[0] + ) { errored.push({ error: "didn't start with an aleph", phoneme, i }); return; } @@ -348,18 +558,18 @@ export function phoneticsToDiacritics(ps: string, ph: string, forbidOoPrefixes: pIndex++; return; } else if ( - (isEnding && phonemeInfo.endingPossibilities) && + isEnding && + phonemeInfo.endingPossibilities && !isUofDu && - ( - !possibilityMatches(p, pIndex, phonemeInfo.endingPossibilities) && - !isEndingAynVowel && // allowing short vowels on the end of words ending with ع - !isAnAEndingAfterHey - ) + !possibilityMatches(p, pIndex, phonemeInfo.endingPossibilities) && + !isEndingAynVowel && // allowing short vowels on the end of words ending with ع + !isAnAEndingAfterHay ) { errored.push({ error: "bad ending", phoneme, i }); return; } else if ( - (isEnding && !phonemeInfo.endingPossibilities) && + isEnding && + !phonemeInfo.endingPossibilities && phonemeInfo.possibilities && !phonemeInfo.possibilities.includes(p[pIndex]) ) { @@ -367,14 +577,17 @@ export function phoneticsToDiacritics(ps: string, ph: string, forbidOoPrefixes: errored.push({ error: "bad ending 2", phoneme, i }); return; } else if ( - (phonemeInfo.possibilities && !isEnding) && - ( - !(phonemeInfo.possibilities.includes(p[pIndex])) && - !(p[pIndex] === "ن" && (p[pIndex + 1] === "ب" && phoneme === "m")) && // && // exception case with نب === mb - !prevLetterWasBeginningAyn // exception case with words starting with ع like i'zzat - ) + phonemeInfo.possibilities && + !isEnding && + !phonemeInfo.possibilities.includes(p[pIndex]) && + !(p[pIndex] === "ن" && p[pIndex + 1] === "ب" && phoneme === "m") && // && // exception case with نب === mb + !prevLetterWasBeginningAyn // exception case with words starting with ع like i'zzat ) { - errored.push({ error: "improper coressponding letter in middle of word", phoneme, i }); + errored.push({ + error: "improper coressponding letter in middle of word", + phoneme, + i, + }); return; } // console.log(phoneme, pIndex, p[pIndex], isEnding); @@ -382,7 +595,12 @@ export function phoneticsToDiacritics(ps: string, ph: string, forbidOoPrefixes: // OK, it lines up with the Pashto word, we're good // Now continue building the result string // deal with starting with short vowels and alef - if (!isUofDu && isBeginning && !phonemeInfo.possibilities && !phonemeInfo.isIzafe) { + if ( + !isUofDu && + isBeginning && + !phonemeInfo.possibilities && + !phonemeInfo.isIzafe + ) { // TODO: WHY IS THIS HERE if (!["ا", "ع"].includes(p[pIndex])) { errored.push({ error: "bad beginning 2", phoneme, i }); @@ -392,22 +610,30 @@ export function phoneticsToDiacritics(ps: string, ph: string, forbidOoPrefixes: pIndex++; } // if the phoneme carries a diacritic insert it (before the letter if it's coming) - const isOoPrefix = (phonemeInfo.alsoCanBePrefix && isBeginning && (p[pIndex - 1] !== "ا")); + const isOoPrefix = + phonemeInfo.alsoCanBePrefix && isBeginning && p[pIndex - 1] !== "ا"; if (phonemeInfo.diacritic && !isEnding && !isOoPrefix) { - // using this hack to remove the space and put it after the zwarakey we're going to add after د - if (isUofDu && result.slice(-5) === " ... ") { - result = result.slice(0, -5) + zwarakey + " ... "; + // using this hack to remove the space and put it after the zwarakay we're going to add after د + if (isUofDu && result.slice(-5) === " ... ") { + result = result.slice(0, -5) + zwarakay + " ... "; } else if (isUofDu && result.slice(-1) === " ") { - result = result.slice(0, -1) + zwarakey + " "; + result = result.slice(0, -1) + zwarakay + " "; } else { result += phonemeInfo.diacritic; } } // TODO: The middle stuff might be unneccessary/unhelpful - const isACommaWithoutAyn = (phoneme === "'" && (p[pIndex] !== "ع" && !(isMiddle && p[pIndex] === "ئ"))); + const isACommaWithoutAyn = + phoneme === "'" && p[pIndex] !== "ع" && !(isMiddle && p[pIndex] === "ئ"); // if the previous phoneme was a consonant insert a sukun // console.log("Will I go into the adding thing?"); - if (!isBeginning && previousPhonemeWasAConsonant && phonemeInfo.consonant && phonemes[i - 1] !== "'" && p[pIndex] !== "ع") { + if ( + !isBeginning && + previousPhonemeWasAConsonant && + phonemeInfo.consonant && + phonemes[i - 1] !== "'" && + p[pIndex] !== "ع" + ) { result += isDoubleConsonant ? tashdeed : sukun; } if (isEnding && isDoubleConsonant) { @@ -417,30 +643,38 @@ export function phoneticsToDiacritics(ps: string, ph: string, forbidOoPrefixes: } } // if there's a pashto letter for the phoneme, insert it - if (!isEndingAynVowel && !isACommaWithoutAyn && (phonemeInfo.possibilities || isEnding)) { + if ( + !isEndingAynVowel && + !isACommaWithoutAyn && + (phonemeInfo.possibilities || isEnding) + ) { // need the isSpace check to prevent weird behaviour with izafe if (!isUofDu) { - if (isAnAEndingAfterHey) { + if (isAnAEndingAfterHay) { result += zwar; if (p[pIndex] === " ") { result += " "; } } else { - result += (isDoubleConsonant || isSpace(p[pIndex])) ? "" : p[pIndex]; + result += isDoubleConsonant || isSpace(p[pIndex]) ? "" : p[pIndex]; } } pIndex++; } if (isEnding) { if (isUofDu) { - result += zwarakey; + result += zwarakay; } else if (phonemeInfo.hamzaOnEnd) { result += hamzaAbove; } else if (phonemeInfo.takesSukunOnEnding) { result += sukun; } else if (p[pIndex] === daggerAlif) { result += daggerAlif; - } else if (isEndSpace(p[pIndex]) && p[pIndex - 1] === "ه" && phonemeInfo.takesDiacriticBeforeGurdaHeyEnding) { + } else if ( + isEndSpace(p[pIndex]) && + p[pIndex - 1] === "ه" && + phonemeInfo.takesDiacriticBeforeGurdaHayEnding + ) { result = result.slice(0, -1) + phonemeInfo.diacritic + "ه"; } } @@ -456,13 +690,20 @@ export function phoneticsToDiacritics(ps: string, ph: string, forbidOoPrefixes: } return; } - previousPhonemeWasAConsonant = (!isEnding && phonemeInfo.consonant) ? true : false; + previousPhonemeWasAConsonant = + !isEnding && phonemeInfo.consonant ? true : false; // ignore the ع or ئ if there's not a ' in the phonetics - const nextPhonemeInfo = phonemeTable.find((element) => phonemes[i + 1] === element.phoneme); + const nextPhonemeInfo = phonemeTable.find( + (element) => phonemes[i + 1] === element.phoneme + ); if ( ["ع", "ئ"].includes(p[pIndex]) && ![phonemes[i + 1], phonemes[i + 2]].includes("'") && - !(nextPhonemeInfo && nextPhonemeInfo.diacritic && isEndSpace(p[pIndex + 1])) && // don't skip the ع on the end if there's another short letter coming after it + !( + nextPhonemeInfo && + nextPhonemeInfo.diacritic && + isEndSpace(p[pIndex + 1]) + ) && // don't skip the ع on the end if there's another short letter coming after it !(p[pIndex] === "ئ" && isEndSpace(p[pIndex + 1])) && // don't skip ئ on the end !phonemeInfo.isIzafe ) { @@ -476,7 +717,11 @@ export function phoneticsToDiacritics(ps: string, ph: string, forbidOoPrefixes: return; } // if we've arrived at a space in the Pashto, move along before the next iteration - if (isSpace(p[pIndex]) && phonemes[i + 1] !== "-i-" && !upcomingAEndingAfterHey) { + if ( + isSpace(p[pIndex]) && + phonemes[i + 1] !== "-i-" && + !upcomingAEndingAfterHay + ) { result += " "; pIndex++; } diff --git a/src/lib/src/sandwiches.ts b/src/lib/src/sandwiches.ts index bd9b4c2..9f72bb2 100644 --- a/src/lib/src/sandwiches.ts +++ b/src/lib/src/sandwiches.ts @@ -1,139 +1,139 @@ import * as T from "../../types"; export const sandwiches: T.Sandwich[] = [ - { - type: "sandwich", - before: { p: "له", f: "la" }, - after: { p: "نه", f: "na" }, - e: "from", - }, - { - type: "sandwich", - before: { p: "له", f: "la" }, - after: { p: "څخه", f: "tsuxa" }, - e: "from", - }, - // TODO: Implement mayonaise - // { - // type: "sandwich", - // before: { p: "له", f: "la" }, - // after: "mayonaise", - // e: "from", - // }, - { - type: "sandwich", - before: { p: "له", f: "la" }, - after: { p: "سره", f: "sara" }, - e: "with", - }, - { - type: "sandwich", - before: undefined, - after: { p: "ته", f: "ta" }, - e: "to", - }, - { - type: "sandwich", - before: { p: "د", f: "du" }, - after: { p: "لپاره", f: "lapaara" }, - e: "for", - }, - { - type: "sandwich", - before: { p: "د", f: "du" }, - after: { p: "دمخې", f: "dumúkhe" }, - e: "before/in front of", - }, - { - type: "sandwich", - before: { p: "د", f: "du" }, - after: { p: "په څانګ", f: "pu tsaang" }, - e: "beside", - }, - { - type: "sandwich", - before: { p: "پر", f: "pur" }, - after: { p: "باندې", f: "baande" }, - e: "on", - }, - { - type: "sandwich", - before: { p: "په", f: "pu" }, - after: { p: "کې", f: "ke" }, - e: "in", - }, - { - type: "sandwich", - before: { p: "د", f: "du" }, - after: { p: "دننه", f: "dununa" }, - e: "inside", - }, - { - type: "sandwich", - before: { p: "د", f: "du" }, - after: { p: "دباندې", f: "dubaande" }, - e: "outside", - }, - { - type: "sandwich", - before: { p: "د", f: "du" }, - after: { p: "مخې ته", f: "mukhe ta" }, - e: "in front of", - }, - { - type: "sandwich", - before: { p: "د", f: "du" }, - after: { p: "شا ته", f: "shaa ta" }, - e: "behind", - }, - { - type: "sandwich", - before: { p: "د", f: "du" }, - after: { p: "لاندې", f: "laande" }, - e: "under", - }, - { - type: "sandwich", - before: { p: "د", f: "du" }, - after: { p: "په شان", f: "pu shaan" }, - e: "like", - }, - { - type: "sandwich", - before: { p: "د", f: "du" }, - after: { p: "غوندې", f: "ghwunde" }, - e: "like", - }, - { - type: "sandwich", - before: { p: "د", f: "du" }, - after: { p: "په حیث", f: "pu heys" }, - e: "as", - }, - { - type: "sandwich", - before: { p: "د", f: "du" }, - after: { p: "په لور", f: "pu lor" }, - e: "towards", - }, - { - type: "sandwich", - before: { p: "د", f: "du" }, - after: { p: "په اړه", f: "pu aRa" }, - e: "about", - }, - { - type: "sandwich", - before: { p: "د", f: "du" }, - after: { p: "په باره کې", f: "pu baara ke" }, - e: "about", - }, - { - type: "sandwich", - before: { p: "د", f: "du" }, - after: { p: "په اړوند", f: "pu aRwand" }, - e: "concerning", - }, + { + type: "sandwich", + before: { p: "له", f: "la" }, + after: { p: "نه", f: "na" }, + e: "from", + }, + { + type: "sandwich", + before: { p: "له", f: "la" }, + after: { p: "څخه", f: "tsuxa" }, + e: "from", + }, + // TODO: Implement mayonaise + // { + // type: "sandwich", + // before: { p: "له", f: "la" }, + // after: "mayonaise", + // e: "from", + // }, + { + type: "sandwich", + before: { p: "له", f: "la" }, + after: { p: "سره", f: "sara" }, + e: "with", + }, + { + type: "sandwich", + before: undefined, + after: { p: "ته", f: "ta" }, + e: "to", + }, + { + type: "sandwich", + before: { p: "د", f: "du" }, + after: { p: "لپاره", f: "lapaara" }, + e: "for", + }, + { + type: "sandwich", + before: { p: "د", f: "du" }, + after: { p: "دمخې", f: "dumúkhe" }, + e: "before/in front of", + }, + { + type: "sandwich", + before: { p: "د", f: "du" }, + after: { p: "په څانګ", f: "pu tsaang" }, + e: "beside", + }, + { + type: "sandwich", + before: { p: "پر", f: "pur" }, + after: { p: "باندې", f: "baande" }, + e: "on", + }, + { + type: "sandwich", + before: { p: "په", f: "pu" }, + after: { p: "کې", f: "ke" }, + e: "in", + }, + { + type: "sandwich", + before: { p: "د", f: "du" }, + after: { p: "دننه", f: "dununa" }, + e: "inside", + }, + { + type: "sandwich", + before: { p: "د", f: "du" }, + after: { p: "دباندې", f: "dubaande" }, + e: "outside", + }, + { + type: "sandwich", + before: { p: "د", f: "du" }, + after: { p: "مخې ته", f: "mukhe ta" }, + e: "in front of", + }, + { + type: "sandwich", + before: { p: "د", f: "du" }, + after: { p: "شا ته", f: "shaa ta" }, + e: "behind", + }, + { + type: "sandwich", + before: { p: "د", f: "du" }, + after: { p: "لاندې", f: "laande" }, + e: "under", + }, + { + type: "sandwich", + before: { p: "د", f: "du" }, + after: { p: "په شان", f: "pu shaan" }, + e: "like", + }, + { + type: "sandwich", + before: { p: "د", f: "du" }, + after: { p: "غوندې", f: "ghwunde" }, + e: "like", + }, + { + type: "sandwich", + before: { p: "د", f: "du" }, + after: { p: "په حیث", f: "pu hays" }, + e: "as", + }, + { + type: "sandwich", + before: { p: "د", f: "du" }, + after: { p: "په لور", f: "pu lor" }, + e: "towards", + }, + { + type: "sandwich", + before: { p: "د", f: "du" }, + after: { p: "په اړه", f: "pu aRa" }, + e: "about", + }, + { + type: "sandwich", + before: { p: "د", f: "du" }, + after: { p: "په باره کې", f: "pu baara ke" }, + e: "about", + }, + { + type: "sandwich", + before: { p: "د", f: "du" }, + after: { p: "په اړوند", f: "pu aRwand" }, + e: "concerning", + }, ]; -export default sandwiches; \ No newline at end of file +export default sandwiches; diff --git a/src/lib/src/translate-phonetics-replacer.ts b/src/lib/src/translate-phonetics-replacer.ts index 3b475d6..1a6b3ee 100644 --- a/src/lib/src/translate-phonetics-replacer.ts +++ b/src/lib/src/translate-phonetics-replacer.ts @@ -105,14 +105,14 @@ export const replacerInfo: IReplacerInfoItem[] = [ ipa: "ɪ́", }, { - char: "ey", + char: "ay", alalc: "ay", - ipa: "ai", + ipa: "ay", }, { - char: "éy", + char: "áy", alalc: "áy", - ipa: "ái", + ipa: "áj", }, { char: "ee", @@ -140,9 +140,9 @@ export const replacerInfo: IReplacerInfoItem[] = [ ipa: "u:j", }, { - char: "eyy", - alalc: "ạy", - ipa: "ɛ̝j", + char: "ey", + alalc: "ey", + ipa: "ej", }, { char: "e", @@ -351,4 +351,5 @@ export const replacerInfo: IReplacerInfoItem[] = [ ]; // tslint:disable-next-line -export const replacerRegex = /aay|áay|aa|áa|a|á|U|Ú|u|ú|ooy|o{1,2}|óo|ó|ey|éy|e{1,2}|ée|é|uy|úy|i|í|w|y|q|g|ts|sh|s|dz|z|t|T|d|D|r|R|n|N|f|b|p|x|kh|q|k|gh|g|G|j|ch|l|l|m|h/g; +export const replacerRegex = + /aay|áay|aa|áa|a|á|U|Ú|u|ú|ooy|o{1,2}|óo|ó|ay|áy|e{1,2}|ée|é|ey|éy|uy|úy|i|í|w|y|q|g|ts|sh|s|dz|z|t|T|d|D|r|R|n|N|f|b|p|x|kh|q|k|gh|g|G|j|ch|l|l|m|h/g; diff --git a/src/lib/src/translate-phonetics.test.ts b/src/lib/src/translate-phonetics.test.ts index 524c96e..b500a11 100644 --- a/src/lib/src/translate-phonetics.test.ts +++ b/src/lib/src/translate-phonetics.test.ts @@ -6,9 +6,7 @@ * */ -import { - translatePhonetics, -} from "./translate-phonetics"; +import { translatePhonetics } from "./translate-phonetics"; const dialects = ["southern", "standard", "peshawer"]; const systems = ["ipa", "alalc"]; @@ -54,11 +52,11 @@ const translations = [ }, }, { - original: "saRey", + original: "saRay", ipa: { - southern: "saɻai", - standard: "saɻai", - peshawer: "saɻai", + southern: "saɻaj", + standard: "saɻaj", + peshawer: "saɻaj", }, alalc: { southern: "saṛay", @@ -72,20 +70,17 @@ translations.forEach((t) => { systems.forEach((system) => { // check each dialect with given system dialects.forEach((dialect) => { - test( - // @ts-ignore - `${t.original} should be translated to ${t.ipa[dialect]} using ${system} with ${dialect} dialect`, - () => { - const translated = translatePhonetics(t.original, { - // @ts-ignore - system, - // @ts-ignore - dialect, - }); + test(// @ts-ignore + `${t.original} should be translated to ${t.ipa[dialect]} using ${system} with ${dialect} dialect`, () => { + const translated = translatePhonetics(t.original, { // @ts-ignore - expect(translated).toBe(t[system][dialect]); - }, - ); + system, + // @ts-ignore + dialect, + }); + // @ts-ignore + expect(translated).toBe(t[system][dialect]); + }); }); }); }); diff --git a/src/lib/src/validate-entry.test.ts b/src/lib/src/validate-entry.test.ts index 38a0b08..2ef7c55 100644 --- a/src/lib/src/validate-entry.test.ts +++ b/src/lib/src/validate-entry.test.ts @@ -8,234 +8,461 @@ import { standardizeEntry, validateEntry } from "./validate-entry"; import * as T from "../../types"; -import { standardizePhonetics } from "./standardize-pashto"; const toTest: { - input: any, - output: T.DictionaryEntryError | { ok: true } | { checkComplement: true }, + input: any; + output: T.DictionaryEntryError | { ok: true } | { checkComplement: true }; }[] = [ - { - input: { ts: undefined }, - output: { - errors: ["missing ts", "missing i", "missing p", "missing f", "missing e"], - p: "", - f: "", - e: "", - erroneousFields: ["ts", "i", "p", "f", "e"], - ts: 0, - }, + { + input: { ts: undefined }, + output: { + errors: [ + "missing ts", + "missing i", + "missing p", + "missing f", + "missing e", + ], + p: "", + f: "", + e: "", + erroneousFields: ["ts", "i", "p", "f", "e"], + ts: 0, }, - { - input: { ts: 123, p: "کور", e: "house" }, - output: { - errors: ["missing i", "missing f"], - p: "کور", - f: "", - ts: 123, - e: "house", - erroneousFields: ["i", "f"], - }, + }, + { + input: { ts: 123, p: "کور", e: "house" }, + output: { + errors: ["missing i", "missing f"], + p: "کور", + f: "", + ts: 123, + e: "house", + erroneousFields: ["i", "f"], }, - { - input: {"i":293,"ts":1527821299,"p":"اخطار","f":"ixtáar","e":"warning, reprimand, admonishment","c":"n. m."}, - output: { - errors: ["script and phonetics do not match for p and f"], - p: "اخطار", - f: "ixtáar", - e: "warning, reprimand, admonishment", - ts: 1527821299, - erroneousFields: ["p", "f"], - }, + }, + { + input: { + i: 293, + ts: 1527821299, + p: "اخطار", + f: "ixtáar", + e: "warning, reprimand, admonishment", + c: "n. m.", }, - { - input: {"i":2433,"ts":1527815197,"p":"پښتون","f":"puxtoon","e":"Pashtun","c":"n. m. unisex / adj. irreg.","infap":"پښتانه","infaf":"puxtaanu","infbf":"puxtan"}, - output: { - errors: ["missing infbp"], - p: "پښتون", - f: "puxtoon", - e: "Pashtun", - ts: 1527815197, - erroneousFields: ["infbp"], - }, + output: { + errors: ["script and phonetics do not match for p and f"], + p: "اخطار", + f: "ixtáar", + e: "warning, reprimand, admonishment", + ts: 1527821299, + erroneousFields: ["p", "f"], }, - { - input: {"i":2433,"ts":1527815197,"p":"پښتون","f":"puxtoon","e":"Pashtun","c":"n. m. unisex / adj. irreg.","infap":"پښتانه","infaf":"puxtaanu","infbp":"پښتن"}, - output: { - errors: ["missing infbf"], - p: "پښتون", - f: "puxtoon", - e: "Pashtun", - ts: 1527815197, - erroneousFields: ["infbf"], - }, + }, + { + input: { + i: 2433, + ts: 1527815197, + p: "پښتون", + f: "puxtoon", + e: "Pashtun", + c: "n. m. unisex / adj. irreg.", + infap: "پښتانه", + infaf: "puxtaanu", + infbf: "puxtan", }, - { - input: {"i":2433,"ts":1527815197,"p":"پښتون","f":"puxtoon","e":"Pashtun","c":"n. m. unisex / adj. irreg.","infap":"پښتانه","infaf":"puktaanu","infbp":"پښتن"}, - output: { - errors: ["script and phonetics do not match for infap and infaf", "missing infbf"], - p: "پښتون", - f: "puxtoon", - e: "Pashtun", - ts: 1527815197, - erroneousFields: ["infap", "infaf", "infbf"], - }, + output: { + errors: ["missing infbp"], + p: "پښتون", + f: "puxtoon", + e: "Pashtun", + ts: 1527815197, + erroneousFields: ["infbp"], }, - { - input: {"i":5000,"ts":1527819674,"p":"څملاستل","f":"tsumlaastúl","e":"to lie down","l":1596485996977,"separationAtP":2,"c":"v. intrans. seperable","psp":"څمل","psf":"tsaml","noOo":true}, - output: { - errors: ["missing separationAtF"], - p: "څملاستل", - f: "tsumlaastúl", - e: "to lie down", - ts: 1527819674, - erroneousFields: ["separationAtF"], - }, + }, + { + input: { + i: 2433, + ts: 1527815197, + p: "پښتون", + f: "puxtoon", + e: "Pashtun", + c: "n. m. unisex / adj. irreg.", + infap: "پښتانه", + infaf: "puxtaanu", + infbp: "پښتن", }, - { - input: {"i":5000,"ts":1527819674,"p":"څملاستل","f":"sumlaastúl","e":"to lie down","l":1596485996977,"separationAtP":2,"c":"v. intrans. seperable","psp":"څمل","psf":"tsaml","noOo":true}, - output: { - errors: ["script and phonetics do not match for p and f", "missing separationAtF"], - p: "څملاستل", - f: "sumlaastúl", - e: "to lie down", - ts: 1527819674, - erroneousFields: ["p", "f", "separationAtF"], - }, + output: { + errors: ["missing infbf"], + p: "پښتون", + f: "puxtoon", + e: "Pashtun", + ts: 1527815197, + erroneousFields: ["infbf"], }, - { - input: {"i":5000,"ts":1527819674,"p":"څملاستل","f":"tsumlaastúl","e":"to lie down","l":1596485996977,"separationAtF":4,"c":"v. intrans. seperable","psp":"څمل","psf":"tsaml","noOo":true}, - output: { - errors: ["missing separationAtP"], - p: "څملاستل", - f: "tsumlaastúl", - e: "to lie down", - ts: 1527819674, - erroneousFields: ["separationAtP"], - }, + }, + { + input: { + i: 2433, + ts: 1527815197, + p: "پښتون", + f: "puxtoon", + e: "Pashtun", + c: "n. m. unisex / adj. irreg.", + infap: "پښتانه", + infaf: "puktaanu", + infbp: "پښتن", }, - { - input: {"i":2222,"ts":1571859113828,"p":"پخول","f":"pakhawul","e":"to cook, prepare, to cause to ripen, mature","c":"v. stat. comp. trans."}, - output: { - errors: ["missing complement for compound verb"], - p: "پخول", - f: "pakhawul", - e: "to cook, prepare, to cause to ripen, mature", - ts: 1571859113828, - erroneousFields: ["l"], - }, + output: { + errors: [ + "script and phonetics do not match for infap and infaf", + "missing infbf", + ], + p: "پښتون", + f: "puxtoon", + e: "Pashtun", + ts: 1527815197, + erroneousFields: ["infap", "infaf", "infbf"], }, - { - input: {"i":2222,"ts":1571859113828,"p":"پخول","f":"pakhawul","e":"to cook, prepare, to cause to ripen, mature","l":1574867531681,"c":"v. stat. comp. trans."}, - output: { - checkComplement: true, - }, + }, + { + input: { + i: 5000, + ts: 1527819674, + p: "څملاستل", + f: "tsumlaastúl", + e: "to lie down", + l: 1596485996977, + separationAtP: 2, + c: "v. intrans. seperable", + psp: "څمل", + psf: "tsaml", + noOo: true, }, - { - input: {"i":2231,"ts":1527812013,"p":"پراخ","f":"praakh, paráakh","e":"wide, broad, spacious, vast","c":"adj."}, - output: { ok: true }, + output: { + errors: ["missing separationAtF"], + p: "څملاستل", + f: "tsumlaastúl", + e: "to lie down", + ts: 1527819674, + erroneousFields: ["separationAtF"], }, - { - input: {"i":0,"ts":1527812013,"p":"پراخ","f":"praakh, paráakh","e":"wide, broad, spacious, vast","c":"adj."}, - output: { ok: true }, + }, + { + input: { + i: 5000, + ts: 1527819674, + p: "څملاستل", + f: "sumlaastúl", + e: "to lie down", + l: 1596485996977, + separationAtP: 2, + c: "v. intrans. seperable", + psp: "څمل", + psf: "tsaml", + noOo: true, }, - { - input: {"i":12,"ts":1575058859661,"p":"آبدار","f":"aawdáar","e":"watery, damp, humid, juicy","c":"adj."}, - output: { - errors: ["script and phonetics do not match for p and f"], - p: "آبدار", - f: "aawdáar", - e: "watery, damp, humid, juicy", - ts: 1575058859661, - erroneousFields: ["p", "f"], - }, + output: { + errors: [ + "script and phonetics do not match for p and f", + "missing separationAtF", + ], + p: "څملاستل", + f: "sumlaastúl", + e: "to lie down", + ts: 1527819674, + erroneousFields: ["p", "f", "separationAtF"], }, - { - input: {"ts":1591033069786,"i":7717,"p":"ستړی کول","f":"stuRey kawul","g":"stuReykedul","e":"to get tired, fatigued","c":"v. stat. comp. intrans.","l":1527815306,"ec":"get","ep":"tired"}, - output: { - errors: ["wrong ending for intrans. stat. comp"], - p: "ستړی کول", - f: "stuRey kawul", - e: "to get tired, fatigued", - ts: 1591033069786, - erroneousFields: ["p", "f"], - }, + }, + { + input: { + i: 5000, + ts: 1527819674, + p: "څملاستل", + f: "tsumlaastúl", + e: "to lie down", + l: 1596485996977, + separationAtF: 4, + c: "v. intrans. seperable", + psp: "څمل", + psf: "tsaml", + noOo: true, }, - { - input: {"ts":1591033078746,"i":7716,"p":"ستړی کېدل","f":"stuRey kedul","g":"stuReykawul","e":"to make tired, wear out","c":"v. stat. comp. trans.","l":1527815306,"ec":"make","ep":"tired"}, - output: { - errors: ["wrong ending for trans. stat. comp"], - p: "ستړی کېدل", - f: "stuRey kedul", - e: "to make tired, wear out", - ts: 1591033078746, - erroneousFields: ["p", "f"], - }, + output: { + errors: ["missing separationAtP"], + p: "څملاستل", + f: "tsumlaastúl", + e: "to lie down", + ts: 1527819674, + erroneousFields: ["separationAtP"], }, - { - input: {"i":12,"ts":1575058859661,"p":"آبدار","f":"aawdáar","e":"watery, damp, humid, juicy","c":"adj.","diacExcept":true}, - output: { ok: true }, + }, + { + input: { + i: 2222, + ts: 1571859113828, + p: "پخول", + f: "pakhawul", + e: "to cook, prepare, to cause to ripen, mature", + c: "v. stat. comp. trans.", }, - { - input: {"i":12,"ts":1575058859661,"p":"آبدار","f":"aawdáar","e":"watery, damp, humid, juicy","c":"adj.","diacExcept":true}, - output: { ok: true }, + output: { + errors: ["missing complement for compound verb"], + p: "پخول", + f: "pakhawul", + e: "to cook, prepare, to cause to ripen, mature", + ts: 1571859113828, + erroneousFields: ["l"], }, - { - input: {"ts":1527812488,"i":1934,"p":"بې چاره","f":"bechaara","g":"bechaara","e":"poor thing, pitiful","r":3,"c":"adj."}, - output: { - errors: ["spacing discrepency between p and f"], - p: "بې چاره", - f: "bechaara", - e: "poor thing, pitiful", - ts: 1527812488, - erroneousFields: ["p", "f"], - }, + }, + { + input: { + i: 2222, + ts: 1571859113828, + p: "پخول", + f: "pakhawul", + e: "to cook, prepare, to cause to ripen, mature", + l: 1574867531681, + c: "v. stat. comp. trans.", }, - { - input: {"ts":1527812488,"i":1934,"p":"بېچاره","f":"be chaara","g":"bechaara","e":"poor thing, pitiful","r":3,"c":"adj."}, - output: { - errors: ["spacing discrepency between p and f"], - p: "بېچاره", - f: "be chaara", - e: "poor thing, pitiful", - ts: 1527812488, - erroneousFields: ["p", "f"], - }, + output: { + checkComplement: true, }, - { - input: {"ts":1527812488,"i":1934,"p":"بې چاره","f":"be chaara","g":"bechaara","e":"poor thing, pitiful","r":3,"c":"adj."}, - output: { ok: true } + }, + { + input: { + i: 2231, + ts: 1527812013, + p: "پراخ", + f: "praakh, paráakh", + e: "wide, broad, spacious, vast", + c: "adj.", }, - { - input: {"ts":1527814265,"i":12969,"p":"مکتب","f":"maktab","g":"maktab","e":"school","r":4,"c":"n. m.","app":"مکاتب","apf":"ma kaatib"}, - output: { - errors: ["spacing discrepency between app and apf"], - p: "مکتب", - f: "maktab", - e: "school", - ts: 1527814265, - erroneousFields: ["app", "apf"], - }, + output: { ok: true }, + }, + { + input: { + i: 0, + ts: 1527812013, + p: "پراخ", + f: "praakh, paráakh", + e: "wide, broad, spacious, vast", + c: "adj.", }, - { - input: {"ts":1527815870,"i":183,"p":"اثر","f":"asar","g":"asar","e":"influence, impression, tracks, affect","r":4,"c":"n. m.","app":"اثرات, آثار","apf":"asráat"}, - output: { - errors: ["difference in variation length between app and apf", "script and phonetics do not match for app and apf"], - p: "اثر", - f: "asar", - e: "influence, impression, tracks, affect", - ts: 1527815870, - erroneousFields: ["app", "apf"], - }, + output: { ok: true }, + }, + { + input: { + i: 12, + ts: 1575058859661, + p: "آبدار", + f: "aawdáar", + e: "watery, damp, humid, juicy", + c: "adj.", }, + output: { + errors: ["script and phonetics do not match for p and f"], + p: "آبدار", + f: "aawdáar", + e: "watery, damp, humid, juicy", + ts: 1575058859661, + erroneousFields: ["p", "f"], + }, + }, + { + input: { + ts: 1591033069786, + i: 7717, + p: "ستړی کول", + f: "stuRay kawul", + g: "stuRaykedul", + e: "to get tired, fatigued", + c: "v. stat. comp. intrans.", + l: 1527815306, + ec: "get", + ep: "tired", + }, + output: { + errors: ["wrong ending for intrans. stat. comp"], + p: "ستړی کول", + f: "stuRay kawul", + e: "to get tired, fatigued", + ts: 1591033069786, + erroneousFields: ["p", "f"], + }, + }, + { + input: { + ts: 1591033078746, + i: 7716, + p: "ستړی کېدل", + f: "stuRay kedul", + g: "stuRaykawul", + e: "to make tired, wear out", + c: "v. stat. comp. trans.", + l: 1527815306, + ec: "make", + ep: "tired", + }, + output: { + errors: ["wrong ending for trans. stat. comp"], + p: "ستړی کېدل", + f: "stuRay kedul", + e: "to make tired, wear out", + ts: 1591033078746, + erroneousFields: ["p", "f"], + }, + }, + { + input: { + i: 12, + ts: 1575058859661, + p: "آبدار", + f: "aawdáar", + e: "watery, damp, humid, juicy", + c: "adj.", + diacExcept: true, + }, + output: { ok: true }, + }, + { + input: { + i: 12, + ts: 1575058859661, + p: "آبدار", + f: "aawdáar", + e: "watery, damp, humid, juicy", + c: "adj.", + diacExcept: true, + }, + output: { ok: true }, + }, + { + input: { + ts: 1527812488, + i: 1934, + p: "بې چاره", + f: "bechaara", + g: "bechaara", + e: "poor thing, pitiful", + r: 3, + c: "adj.", + }, + output: { + errors: ["spacing discrepency between p and f"], + p: "بې چاره", + f: "bechaara", + e: "poor thing, pitiful", + ts: 1527812488, + erroneousFields: ["p", "f"], + }, + }, + { + input: { + ts: 1527812488, + i: 1934, + p: "بېچاره", + f: "be chaara", + g: "bechaara", + e: "poor thing, pitiful", + r: 3, + c: "adj.", + }, + output: { + errors: ["spacing discrepency between p and f"], + p: "بېچاره", + f: "be chaara", + e: "poor thing, pitiful", + ts: 1527812488, + erroneousFields: ["p", "f"], + }, + }, + { + input: { + ts: 1527812488, + i: 1934, + p: "بې چاره", + f: "be chaara", + g: "bechaara", + e: "poor thing, pitiful", + r: 3, + c: "adj.", + }, + output: { ok: true }, + }, + { + input: { + ts: 1527814265, + i: 12969, + p: "مکتب", + f: "maktab", + g: "maktab", + e: "school", + r: 4, + c: "n. m.", + app: "مکاتب", + apf: "ma kaatib", + }, + output: { + errors: ["spacing discrepency between app and apf"], + p: "مکتب", + f: "maktab", + e: "school", + ts: 1527814265, + erroneousFields: ["app", "apf"], + }, + }, + { + input: { + ts: 1527815870, + i: 183, + p: "اثر", + f: "asar", + g: "asar", + e: "influence, impression, tracks, affect", + r: 4, + c: "n. m.", + app: "اثرات, آثار", + apf: "asráat", + }, + output: { + errors: [ + "difference in variation length between app and apf", + "script and phonetics do not match for app and apf", + ], + p: "اثر", + f: "asar", + e: "influence, impression, tracks, affect", + ts: 1527815870, + erroneousFields: ["app", "apf"], + }, + }, ]; test("validateEntry should work", () => { - toTest.forEach((t) => { - expect(validateEntry(t.input as T.DictionaryEntry)).toEqual(t.output); - }); + toTest.forEach((t) => { + expect(validateEntry(t.input as T.DictionaryEntry)).toEqual(t.output); + }); }); test("standardizeEntry", () => { - expect(standardizeEntry({"i":195,"ts":1527822036,"p":"اجتماعي","f":"ijtimaa‘ee, ijtimaayee","g":"ijtimaaee,ijtimaayee","e":"public, social, societal","c":"adj."})) - .toEqual({"i":195,"ts":1527822036,"p":"اجتماعي","f":"ijtimaa'ee, ijtimaayee","g":"ijtimaaee,ijtimaayee","e":"public, social, societal","c":"adj."}); + expect( + standardizeEntry({ + i: 195, + ts: 1527822036, + p: "اجتماعي", + f: "ijtimaa‘ee, ijtimaayee", + g: "ijtimaaee,ijtimaayee", + e: "public, social, societal", + c: "adj.", + }) + ).toEqual({ + i: 195, + ts: 1527822036, + p: "اجتماعي", + f: "ijtimaa'ee, ijtimaayee", + g: "ijtimaaee,ijtimaayee", + e: "public, social, societal", + c: "adj.", + }); });