2021-05-06 20:28:03 +00:00
|
|
|
|
/**
|
|
|
|
|
* Copyright (c) 2021 lingdocs.com
|
|
|
|
|
*
|
|
|
|
|
* This source code is licensed under the MIT license found in the
|
|
|
|
|
* LICENSE file in the root directory of this source tree.
|
|
|
|
|
*
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
import * as T from "../types";
|
2021-05-16 14:13:42 +00:00
|
|
|
|
import {
|
|
|
|
|
splitFIntoPhonemes,
|
|
|
|
|
Phoneme,
|
|
|
|
|
phonemeTable,
|
|
|
|
|
zwar,
|
|
|
|
|
zwarakey,
|
|
|
|
|
zer,
|
|
|
|
|
pesh,
|
|
|
|
|
sukun,
|
|
|
|
|
hamzaAbove,
|
|
|
|
|
tashdeed,
|
|
|
|
|
wasla,
|
|
|
|
|
daggerAlif,
|
|
|
|
|
fathahan,
|
2021-05-16 15:00:05 +00:00
|
|
|
|
lastNonWhitespace,
|
2021-05-16 14:13:42 +00:00
|
|
|
|
addP,
|
|
|
|
|
last,
|
|
|
|
|
advanceP,
|
|
|
|
|
reverseP,
|
|
|
|
|
overwriteP,
|
2021-05-27 06:36:30 +00:00
|
|
|
|
advanceForHamza,
|
2021-05-16 14:13:42 +00:00
|
|
|
|
advanceForHamzaMid,
|
|
|
|
|
DiacriticsAccumulator,
|
|
|
|
|
} from "./diacritics-helpers";
|
|
|
|
|
|
2021-05-06 20:28:03 +00:00
|
|
|
|
import { firstPhonetics } from "./p-text-helpers";
|
2021-05-07 11:48:33 +00:00
|
|
|
|
import { pipe } from "rambda";
|
2021-05-06 20:28:03 +00:00
|
|
|
|
|
2021-05-13 09:54:46 +00:00
|
|
|
|
/**
 * Produces a copy of the given PsString whose Pashto script (`p`) carries
 * diacritics derived from its phonetics (`f`).
 *
 * The phonetics are split into phonemes and folded, one phoneme at a time,
 * through `processPhoneme`, starting from an accumulator holding the whole
 * script as unprocessed input (`pIn`) and an empty output (`pOut`).
 *
 * @param ignoreCommas - when set, `f` is split into phonemes as-is; otherwise
 * `f` is first passed through `firstPhonetics`
 * @returns a PsString with the processed script as `p` and the original `f`
 * @throws Error if script is left over after all phonemes have been consumed
 * (errors raised while processing individual phonemes also propagate)
 */
export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsString {
    const phoneticsToSplit = ignoreCommas ? f : firstPhonetics(f);
    const phonemes: Phoneme[] = splitFIntoPhonemes(phoneticsToSplit);

    let accumulator: DiacriticsAccumulator = { pOut: "", pIn: p };
    phonemes.forEach((phoneme, index, allPhonemes) => {
        accumulator = processPhoneme(accumulator, phoneme, index, allPhonemes);
    });

    // every script letter must have been consumed by some phoneme
    if (accumulator.pIn !== "") {
        throw new Error("phonetics error - phonetics shorter than pashto script");
    }
    return { p: accumulator.pOut, f };
}
|
|
|
|
|
|
2021-05-09 17:08:35 +00:00
|
|
|
|
/**
 * The situations a phoneme can be in relative to the script being processed.
 * `stateInfo` picks one for each phoneme and `processPhoneme` applies the
 * matching sequence of accumulator steps.
 *
 * The numeric values are the positions the members have always had.
 */
enum PhonemeStatus {
    /** a long vowel (not ending-only) at the beginning of a word */
    LeadingLongVowel = 0,
    /** a word-initial phoneme whose beginningMatches/matches contain the current letter */
    LeadingConsonantOrShortVowel = 1,
    /** a repeated consonant phoneme, not at word start, whose letter is not written again */
    DoubleConsonantTashdeed = 2,
    /** word-final "u" written with ه, or "h" written with ه / ح */
    EndingWithHeyHim = 3,
    /** the current letter matches the phoneme (or "m" written as ن before ب) */
    DirectMatch = 4,
    /** as DirectMatch, but `needsSukun` was truthy in stateInfo */
    DirectMatchAfterSukun = 5,
    /** as EndingWithHeyHim, but `needsSukun` was truthy in stateInfo */
    EndingWithHeyHimFromSukun = 6,
    /** a non-long-vowel phoneme that has a diacritic and no letter match */
    ShortVowel = 7,
    /** "aa" written as و followed by ا, not at word start */
    PersianSilentWWithAa = 8,
    /** "i" on an ا that is followed by ل, not at word start */
    ArabicWasla = 9,
    /** the "-i-" phoneme arriving at the beginning of a word */
    Izafe = 10,
    /** "u" arriving after a space when the last non-whitespace output letter is د */
    EndOfDuParticle = 11,
    /** "a" arriving after a ح (see the condition in stateInfo) */
    HaEndingWithHeem = 12,
    /** "aa" written as ی followed by a dagger alif */
    AlefDaggarEnding = 13,
    /** a phoneme with an ain-blend diacritic landing on ع, not at word start */
    LongAinVowelMissingComma = 14,
    /** a short vowel landing on ع where neither this nor the next phoneme is "'" */
    ShortAinVowelMissingComma = 15,
}
|
|
|
|
|
|
2021-05-07 07:54:09 +00:00
|
|
|
|
/**
 * Reducer step used by `addDiacritics`: takes the accumulator built so far
 * and one phoneme, and returns the accumulator after that phoneme has been
 * applied to the script.
 *
 * @param acc - accumulator with the unprocessed script (`pIn`) and the
 * output produced so far (`pOut`)
 * @param phoneme - the phoneme being applied
 * @param i - index of `phoneme` within `phonemes`
 * @param phonemes - the full list of phonemes being processed
 * @throws Error (from `stateInfo`) when no status fits the phoneme
 */
function processPhoneme(
    acc: DiacriticsAccumulator,
    phoneme: Phoneme,
    i: number,
    phonemes: Phoneme[],
): DiacriticsAccumulator {
    // Prep state: step over a leading " ... " (5 chars) or a single leading space
    // TODO: CLEANER function jump to next char
    let state: DiacriticsAccumulator = acc;
    if (acc.pIn.slice(0, 5) === " ... ") {
        state = advanceP(acc, 5);
    } else if (acc.pIn[0] === " ") {
        state = advanceP(acc);
    }
    // WARNING: Do not use acc after this point! Only `state` is up to date.

    const { phs, phonemeInfo, diacritic } = stateInfo({ state, i, phoneme, phonemes });

    switch (phs) {
        case PhonemeStatus.LeadingLongVowel:
            return pipe(
                advanceP,
                addP(phonemeInfo.diacritic),
                advanceP,
            )(state);

        case PhonemeStatus.LeadingConsonantOrShortVowel:
            return pipe(
                advanceP,
                addP(diacritic),
            )(state);

        case PhonemeStatus.DoubleConsonantTashdeed:
            return pipe(
                addP(tashdeed)
            )(state);

        case PhonemeStatus.EndingWithHeyHim:
            return pipe(
                advanceP,
                addP(phoneme === "u" ? hamzaAbove : sukun),
            )(state);

        // both statuses add the chosen diacritic and then advance
        case PhonemeStatus.DirectMatch:
        case PhonemeStatus.ShortAinVowelMissingComma:
            return pipe(
                addP(diacritic),
                advanceP,
            )(state);

        // both statuses add a sukun and then advance
        case PhonemeStatus.DirectMatchAfterSukun:
        case PhonemeStatus.EndingWithHeyHimFromSukun:
            return pipe(
                addP(sukun),
                advanceP,
            )(state);

        case PhonemeStatus.PersianSilentWWithAa:
            // the first of the two letters ends up wrapped in "(" ... ")"
            return pipe(
                addP("("),
                advanceP,
                addP(")"),
                advanceP,
            )(state);

        case PhonemeStatus.ArabicWasla:
            return pipe(
                addP(zer),
                overwriteP(wasla),
            )(state);

        case PhonemeStatus.Izafe:
            return pipe(
                reverseP,
                addP(zer),
            )(state);

        case PhonemeStatus.EndOfDuParticle:
            return pipe(
                reverseP,
                addP(zwarakey),
            )(state);

        case PhonemeStatus.HaEndingWithHeem:
            return pipe(
                reverseP,
                // prevPLetter === " " ? reverseP ,
                addP(zwar),
            )(state);

        case PhonemeStatus.AlefDaggarEnding:
            // both letters are passed over without adding anything
            return pipe(
                advanceP,
                advanceP,
            )(state);

        case PhonemeStatus.LongAinVowelMissingComma:
            return pipe(
                addP(diacritic),
                advanceP,
                addP(diacritic)
            )(state);

        default:
            // phs === PhonemeStatus.ShortVowel
            return pipe(
                advanceForHamzaMid,
                addP(phonemeInfo.diacritic),
                // TODO THIS?
                advanceForHamza,
            )(state);
    }
}
|
2021-05-08 18:31:59 +00:00
|
|
|
|
|
2021-05-25 09:47:02 +00:00
|
|
|
|
|
|
|
|
|
|
2021-05-08 18:31:59 +00:00
|
|
|
|
/**
 * Looks at the accumulator and the phoneme being applied and decides which
 * `PhonemeStatus` describes the situation.
 *
 * @returns
 *  - `phs`: the chosen status
 *  - `phonemeInfo`: the `phonemeTable` entry for `phoneme`
 *  - `diacritic`: the diacritic to use for this phoneme in this position
 *    (may be undefined, e.g. for a word-final long vowel without
 *    `useEndingDiacritic`)
 *  - `prevPLetter`: the last character of the output so far
 * @throws Error "phonetics error - needs alef prefix" for a word-initial long
 * vowel (other than "aa") that is not written with a leading ا and whose
 * `matches` do not contain the following letter
 * @throws Error "phonetics error - no status found for phoneme: ..." when
 * none of the checks apply
 */
function stateInfo({ state, i, phonemes, phoneme }: {
    state: DiacriticsAccumulator,
    i: number,
    phonemes: Phoneme[],
    phoneme: Phoneme,
}) {
    // --- where we are in the script ---
    const prevPLetter = last(state.pOut);
    const letterNow = state.pIn[0];
    const letterNext = state.pIn[1];
    const atWordStart = state.pOut === "" || prevPLetter === " ";
    const atWordEnd = !letterNext || letterNext === " ";

    // --- where we are in the phonetics ---
    const phonemeInfo = phonemeTable[phoneme];
    const nextPhoneme = phonemes[i + 1];
    const hasPrevious = i > 0;
    const previousPhoneme = hasPrevious && phonemes[i - 1];
    // the previous phoneme only counts when it belongs to the same word
    const previousPhonemeInfo = (!atWordStart && hasPrevious) && phonemeTable[phonemes[i - 1]];

    // evaluated on demand so property access happens in the same places as the checks need it
    const currentLetterMatches = () => phonemeInfo.matches?.includes(letterNow);

    // --- consonant clusters ---
    const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
    const sameAsPrevious = previousPhoneme === phoneme;
    // same consonant twice with only one letter written
    const needsTashdeed = !atWordStart && doubleConsonant && sameAsPrevious && !currentLetterMatches();
    // two consonants in a row that are each written out
    const needsSukun = doubleConsonant && (!sameAsPrevious || currentLetterMatches());

    // --- diacritic selection ---
    const useAinBlendDiacritics = (!atWordStart && (phonemeInfo.ainBlendDiacritic && letterNow === "ع"));

    function pickDiacritic() {
        if (useAinBlendDiacritics) {
            return phonemeInfo.ainBlendDiacritic;
        }
        if (!atWordEnd) {
            return phonemeInfo.diacritic;
        }
        // at the end of a word long vowels get no diacritic unless flagged
        return (!phonemeInfo.longVowel || phonemeInfo.useEndingDiacritic)
            ? phonemeInfo.diacritic
            : undefined;
    }
    const diacritic = pickDiacritic();

    // The order of the checks below matters: the first one that applies wins.
    function classify(): PhonemeStatus {
        if (atWordStart) {
            if (phonemeInfo.longVowel && !phonemeInfo.endingOnly) {
                if (phoneme !== "aa" && letterNow !== "ا" && !phonemeInfo.matches?.includes(letterNext)) {
                    throw new Error("phonetics error - needs alef prefix");
                }
                return PhonemeStatus.LeadingLongVowel;
            }
            if (phonemeInfo.beginningMatches?.includes(letterNow) || currentLetterMatches()) {
                return PhonemeStatus.LeadingConsonantOrShortVowel;
            }
            if (phoneme === "u" && prevPLetter === " " && lastNonWhitespace(state.pOut) === "د") {
                return PhonemeStatus.EndOfDuParticle;
            }
            if (phoneme === "-i-") {
                return PhonemeStatus.Izafe;
            }
        } else {
            if (phoneme === "aa" && letterNow === "و" && letterNext === "ا") {
                return PhonemeStatus.PersianSilentWWithAa;
            }
            if (phoneme === "i" && letterNow === "ا" && letterNext === "ل") {
                return PhonemeStatus.ArabicWasla;
            }
        }

        if (letterNow === "ع" && phoneme !== "'" && nextPhoneme !== "'" && phonemeInfo.diacritic && !phonemeInfo.longVowel) {
            return PhonemeStatus.ShortAinVowelMissingComma;
        }
        if (useAinBlendDiacritics) {
            return PhonemeStatus.LongAinVowelMissingComma;
        }
        if (needsTashdeed) {
            return PhonemeStatus.DoubleConsonantTashdeed;
        }
        if (phoneme === "aa" && letterNow === "ی" && letterNext === daggerAlif) {
            return PhonemeStatus.AlefDaggarEnding;
        }
        // NOTE(review): the second alternative only tests that a character exists
        // two places back in the output; it looks like it may have been meant to
        // compare that character with "ح" like the first alternative - confirm.
        if (((atWordEnd && prevPLetter === "ح") || (prevPLetter === " " && state.pOut[state.pOut.length - 2])) && phoneme === "a") {
            return PhonemeStatus.HaEndingWithHeem;
        }
        if (atWordEnd && ((phoneme === "u" && letterNow === "ه") || (phoneme === "h" && ["ه", "ح"].includes(letterNow)))) {
            return needsSukun
                ? PhonemeStatus.EndingWithHeyHimFromSukun
                : PhonemeStatus.EndingWithHeyHim;
        }
        if (
            currentLetterMatches()
            || (atWordEnd && phonemeInfo.endingMatches?.includes(letterNow))
            || (phoneme === "m" && letterNow === "ن" && letterNext === "ب")
        ) {
            return needsSukun
                ? PhonemeStatus.DirectMatchAfterSukun
                : PhonemeStatus.DirectMatch;
        }
        if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {
            return PhonemeStatus.ShortVowel;
        }
        throw new Error("phonetics error - no status found for phoneme: " + phoneme);
    }

    const phs = classify();

    return {
        phs,
        phonemeInfo,
        diacritic,
        prevPLetter,
    };
}
|