pashto-inflector/src/lib/diacritics.ts

216 lines
7.2 KiB
TypeScript
Raw Normal View History

/**
* Copyright (c) 2021 lingdocs.com
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*
*/
import * as T from "../types";
import {
splitFIntoPhonemes,
Phoneme,
phonemeTable,
zwar,
zwarakey,
zer,
pesh,
sukun,
hamzaAbove,
tashdeed,
wasla,
daggerAlif,
fathahan,
prev2Chars,
addP,
last,
advanceP,
reverseP,
overwriteP,
advanceForAin,
advanceForAinOrHamza,
advanceForHamzaMid,
DiacriticsAccumulator,
} from "./diacritics-helpers";
import { firstPhonetics } from "./p-text-helpers";
import { pipe } from "rambda";
2021-05-13 09:54:46 +00:00
/**
* Adds diacritics to a given PsString.
* Errors if the phonetics and script don't line up.
*/
/**
 * Produces a copy of the given PsString whose Pashto script (`p`) has been
 * annotated with diacritics derived from its phonetics (`f`).
 *
 * The phonetics are split into phonemes and folded over the script one
 * phoneme at a time with `processPhoneme`.
 *
 * @param ignoreCommas - when omitted, `f` is first passed through
 *   `firstPhonetics` (presumably to keep only the first of several
 *   comma-separated variants - confirm in p-text-helpers); when `true`,
 *   `f` is used as-is.
 * @returns a PsString with the diacritized script and the original `f`
 * @throws Error if script characters are left over once every phoneme has
 *   been processed, or if a phoneme cannot be lined up with the script
 */
export function addDiacritics({ p, f }: T.PsString, ignoreCommas?: true): T.PsString {
    const phonetics = ignoreCommas ? f : firstPhonetics(f);
    const phonemes: Phoneme[] = splitFIntoPhonemes(phonetics);
    const finalState = phonemes.reduce(processPhoneme, { pOut: "", pIn: p });
    // every script character must have been consumed by some phoneme
    if (finalState.pIn !== "") {
        throw new Error("phonetics error - phonetics shorter than pashto script");
    }
    return { p: finalState.pOut, f };
}
2021-05-09 17:08:35 +00:00
/**
 * The situation a phoneme is found in relative to the Pashto script that is
 * still waiting to be consumed. `stateInfo` picks one of these for every
 * phoneme and `processPhoneme` uses it to decide how to advance through the
 * script and which diacritic to write.
 *
 * Member order (and therefore the numeric values) is unchanged.
 */
enum PhonemeStatus {
    /** A long vowel (not ending-only) at the beginning of a word */
    LeadingLongVowel,
    /** At the beginning of a word, the current letter is one of the phoneme's (beginning) matches */
    LeadingConsonantOrShortVowel,
    /** The same consonant phoneme repeated within a word, with no matching letter in the script */
    DoubleConsonantTashdeed,
    /** End of word: "u" on the letter ه, or "h" on ه / ح */
    EndingWithHeyHim,
    /** The current letter is one of the phoneme's matching letters */
    DirectMatch,
    /** A phoneme that has a diacritic, is not a long vowel, and has no letter of its own */
    ShortVowel,
    /** "aa" inside a word written as و followed by ا */
    PersianSilentWWithAa,
    /** "i" inside a word written as ا followed by ل */
    ArabicWasla,
    /** The "-i-" phoneme occurring at a word boundary */
    Izafe,
    /** "u" following a space where the preceding output ends with د + zwarakey */
    EndOfDuParticle,
}
2021-05-07 07:54:09 +00:00
/**
 * Reducer step used by `addDiacritics`: takes the accumulator built so far
 * and one phoneme, and returns the accumulator after that phoneme has been
 * lined up with the Pashto script.
 *
 * The phoneme is classified by `stateInfo`, and each `PhonemeStatus` maps to
 * a fixed pipeline of accumulator helpers from `./diacritics-helpers`
 * (presumably `advanceP` carries the next script letter over to the output,
 * `addP` appends a mark to the output, `reverseP` steps back one character
 * and `overwriteP` replaces the current letter - confirm in the helpers).
 *
 * @param acc - accumulator from the previous step
 * @param phoneme - the phoneme being processed
 * @param i - index of `phoneme` in `phonemes`
 * @param phonemes - all phonemes of the text being processed
 * @returns the accumulator after processing `phoneme`
 * @throws Error (from `stateInfo`) when the phoneme cannot be matched with the script
 */
function processPhoneme(
    acc: DiacriticsAccumulator,
    phoneme: Phoneme,
    i: number,
    phonemes: Phoneme[],
) {
    // Prep state: if the next script character is a space, step past it first
    const state = acc.pIn[0] === " "
        ? advanceP(acc)
        : acc;
    // WARNING: Do not use acc after this point!

    const {
        phonemeInfo,
        sukunOrDiacritic,
        phs,
    } = stateInfo({ state, i, phoneme, phonemes });

    switch (phs) {
        case PhonemeStatus.LeadingLongVowel:
            return pipe(
                advanceP,
                addP(phonemeInfo.diacritic),
                advanceP,
            )(state);
        case PhonemeStatus.LeadingConsonantOrShortVowel:
            return pipe(
                advanceP,
                addP(sukunOrDiacritic),
                advanceForAin,
            )(state);
        case PhonemeStatus.DoubleConsonantTashdeed:
            // the doubled consonant has no letter of its own, only a mark
            return pipe(
                addP(tashdeed)
            )(state);
        case PhonemeStatus.EndingWithHeyHim:
            return pipe(
                advanceP,
                addP(phoneme === "u" ? hamzaAbove : sukun),
            )(state);
        case PhonemeStatus.DirectMatch:
            return pipe(
                addP(sukunOrDiacritic),
                advanceP,
            )(state);
        case PhonemeStatus.PersianSilentWWithAa:
            // wraps the first of the two letters in parentheses
            return pipe(
                addP("("),
                advanceP,
                addP(")"),
                advanceP,
            )(state);
        case PhonemeStatus.ArabicWasla:
            return pipe(
                addP(zer),
                overwriteP(wasla),
            )(state);
        case PhonemeStatus.Izafe:
            return pipe(
                reverseP,
                addP(zer),
            )(state);
        case PhonemeStatus.EndOfDuParticle:
            return pipe(
                reverseP,
                addP(zwarakey),
            )(state);
        default:
            // phs === PhonemeStatus.ShortVowel
            return pipe(
                advanceForHamzaMid,
                addP(phonemeInfo.diacritic),
                advanceForAinOrHamza,
            )(state);
    }
}
2021-05-08 18:31:59 +00:00
/**
 * Examines the accumulator and the current phoneme and works out everything
 * `processPhoneme` needs to handle it.
 *
 * @param state - the accumulator (after any leading space has been skipped)
 * @param i - index of `phoneme` in `phonemes`
 * @param phonemes - all phonemes of the text being processed
 * @param phoneme - the phoneme being processed
 * @returns
 *   - `phs`: the `PhonemeStatus` chosen for this phoneme
 *   - `phonemeInfo`: the `phonemeTable` entry for this phoneme
 *   - `sukunOrDiacritic`: `sukun` when a sukun is needed between two
 *     consonants, otherwise the phoneme's diacritic (which is left out at the
 *     end of a word for long vowels unless `useEndingDiacritic` is set)
 * @throws Error when a leading long vowel lacks its alef prefix, or when no
 *   status fits the phoneme
 */
function stateInfo({ state, i, phonemes, phoneme }: {
    state: DiacriticsAccumulator,
    i: number,
    phonemes: Phoneme[],
    phoneme: Phoneme,
}) {
    const prevPLetter = last(state.pOut);
    const currentPLetter = state.pIn[0];
    const nextPLetter = state.pIn[1];
    const isBeginningOfWord = state.pOut === "" || prevPLetter === " ";
    const isEndOfWord = !nextPLetter || nextPLetter === " ";
    const phonemeInfo = phonemeTable[phoneme];
    const previousPhoneme = i > 0 && phonemes[i-1];
    // the previous phoneme is only relevant when it belongs to the same word
    const previousPhonemeInfo = (!isBeginningOfWord && i > 0) && phonemeTable[phonemes[i-1]];
    const doubleConsonant = previousPhonemeInfo && (phonemeInfo.consonant && previousPhonemeInfo.consonant);
    // same consonant twice with no letter written for the second one
    const needsTashdeed = !isBeginningOfWord
        && doubleConsonant
        && (previousPhoneme === phoneme)
        && !phonemeInfo.matches?.includes(currentPLetter);
    // two consonants in a row that each have their own letter
    const needsSukun = doubleConsonant
        && ((previousPhoneme !== phoneme) || phonemeInfo.matches?.includes(currentPLetter));
    const diacritic = isEndOfWord
        ? ((!phonemeInfo.longVowel || phonemeInfo.useEndingDiacritic) ? phonemeInfo.diacritic : undefined)
        : phonemeInfo.diacritic;
    const sukunOrDiacritic = (needsSukun ? sukun : diacritic);

    // The checks below are order-dependent: the first one that fits wins.
    function getPhonemeState(): PhonemeStatus {
        if (isBeginningOfWord && (phonemeInfo.longVowel && !phonemeInfo.endingOnly)) {
            if (phoneme !== "aa" && currentPLetter !== "ا" && !phonemeInfo.matches?.includes(nextPLetter)) {
                throw new Error("phonetics error - needs alef prefix");
            }
            return PhonemeStatus.LeadingLongVowel;
        }
        if (isBeginningOfWord && (phonemeInfo.beginningMatches?.includes(currentPLetter) || phonemeInfo.matches?.includes(currentPLetter))) {
            return PhonemeStatus.LeadingConsonantOrShortVowel;
        }
        if (isBeginningOfWord && phoneme === "u" && prevPLetter === " " && prev2Chars(state.pOut) === ("د" + zwarakey)) {
            return PhonemeStatus.EndOfDuParticle;
        }
        if (!isBeginningOfWord && phoneme === "aa" && currentPLetter === "و" && nextPLetter === "ا") {
            return PhonemeStatus.PersianSilentWWithAa;
        }
        if (!isBeginningOfWord && phoneme === "i" && currentPLetter === "ا" && nextPLetter === "ل") {
            return PhonemeStatus.ArabicWasla;
        }
        if (phoneme === "-i-" && isBeginningOfWord) {
            return PhonemeStatus.Izafe;
        }
        if (needsTashdeed) {
            return PhonemeStatus.DoubleConsonantTashdeed;
        }
        if (isEndOfWord && ((phoneme === "u" && currentPLetter === "ه") || (phoneme === "h" && ["ه", "ح"].includes(currentPLetter)))) {
            return PhonemeStatus.EndingWithHeyHim;
        }
        if ((phonemeInfo.matches?.includes(currentPLetter) || (isEndOfWord && phonemeInfo.endingMatches?.includes(currentPLetter)) || (phoneme === "m" && currentPLetter === "ن" && nextPLetter === "ب"))) {
            return PhonemeStatus.DirectMatch;
        }
        if (phonemeInfo.diacritic && !phonemeInfo.longVowel) {
            return PhonemeStatus.ShortVowel;
        }
        throw new Error("phonetics error - no status found for phoneme: " + phoneme);
    }

    const phs = getPhonemeState();

    return {
        phs, phonemeInfo, sukunOrDiacritic,
    };
}