pashto-dictionary/website/src/lib/dictionary.ts

600 lines
18 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Copyright (c) 2021 lingdocs.com
*
* This source code is licensed under the GPL3 license found in the
* LICENSE file in the root directory of this source tree.
*
*/
import { DictionaryDb } from "./dictionary-core";
import sanitizePashto from "./sanitize-pashto";
import fillerWords from "./filler-words";
import {
Types as T,
simplifyPhonetics,
typePredicates as tp,
revertSpelling,
} from "@lingdocs/ps-react";
// TODO: or use and modify leven ??
// @ts-ignore
import { levenshtein } from "edit-distance";
import { isPashtoScript } from "./is-pashto";
import { fuzzifyPashto } from "./fuzzify-pashto/fuzzify-pashto";
import { makeAWeeBitFuzzy } from "./wee-bit-fuzzy";
import { getTextOptions } from "./get-text-options";
import { DictionaryAPI, State } from "../types/dictionary-types";
// Remote locations of the dictionary payload and of its accompanying info document.
const dictionaryBaseUrl = `https://storage.lingdocs.com/dictionary`;
const dictionaryUrl = `${dictionaryBaseUrl}/dictionary`;
const dictionaryInfoUrl = `${dictionaryBaseUrl}/dictionary-info`;
// Storage key and collection name handed to DictionaryDb below.
const dictionaryInfoLocalStorageKey = "dictionaryInfo5";
const dictionaryCollectionName = "dictionary3";
// const dictionaryDatabaseName = "dictdb.db";
// Page size used by the lookups in this module: result limits are computed from `page * pageSize`.
export const pageSize = 60;
// Runs at module load: try to open an IndexedDB database (named "inPrivate") and,
// if the request errors, log the event and warn the user through an alert.
const db = indexedDB.open("inPrivate");
db.onerror = (e) => {
console.error(e);
alert(
"Your browser does not have IndexedDB enabled. This might be because you are using private mode. Please use regular mode or enable IndexedDB to use this dictionary"
);
};
// The single DictionaryDb instance that every lookup in this module queries.
const dictDb = new DictionaryDb({
url: dictionaryUrl,
infoUrl: dictionaryInfoUrl,
collectionName: dictionaryCollectionName,
infoLocalStorageKey: dictionaryInfoLocalStorageKey,
});
/**
 * Removes characters that have special meaning inside a regular expression so
 * that what is left can be embedded in a RegExp source string.
 *
 * Characters removed: the ASCII range `#` through `.` (# $ % & ' ( ) * + , - .),
 * the ASCII range `[` through `^` ([ \ ] ^), and `?`, `|`, `{`, `}`.
 *
 * @param searchString raw text typed by the user
 * @returns the text with every character listed above dropped
 */
function makeSearchStringSafe(searchString: string): string {
  const unsafeCharacters = /[#-.]|[[-^]|[?|{}]/g;
  // Splitting on the unsafe characters and re-joining drops each of them.
  return searchString.split(unsafeCharacters).join("");
}
/**
 * Turns an English search string into a slightly forgiving RegExp source.
 *
 * Steps:
 *  1. trim and strip regex-special characters (same character set as
 *     makeSearchStringSafe);
 *  2. drop a leading infinitive marker "to " (case-insensitive) so that
 *     "to go" searches for "go";
 *  3. make the "u" optional in "our"/"or" so that British and American
 *     spellings (colour / color) match each other.
 *
 * Only a "to " at the very start is removed. The previous implementation
 * removed the first "to " found anywhere in the string, which corrupted
 * searches such as "photo album" (-> "phoalbum") or "go to school".
 *
 * @param input raw English search text
 * @returns a string intended to be placed inside a RegExp
 */
function fuzzifyEnglish(input: string): string {
  const safeInput = input.trim().replace(/[#-.]|[[-^]|[?|{}]/g, "");
  // TODO: Could do: cover british/american things like offense / offence
  return (
    safeInput
      // anchored: only strip the infinitive marker at the start of the query
      .replace(/^to /i, "")
      .replace(/our/g, "ou?r")
      // "ou?r" contains no "or", so the previous replacement is not touched again
      .replace(/or/g, "ou?r")
  );
}
/**
 * Splits an array into consecutive slices of at most `chunkSize` items.
 * The last slice holds whatever is left over; an empty input gives `[]`.
 *
 * @param arr the items to split (not modified)
 * @param chunkSize maximum number of items per slice
 * @returns the slices, in their original order
 */
function chunkOutArray<Item>(arr: Item[], chunkSize: number): Item[][] {
  const chunks: Item[][] = [];
  let start = 0;
  while (start < arr.length) {
    const end = start + chunkSize;
    chunks.push(arr.slice(start, end));
    start = end;
  }
  return chunks;
}
/**
 * Builds the RegExp used to match a search string against inflection fields.
 *
 * If the input ends in one of a fixed set of ending characters, that last
 * character is dropped first.
 *
 *  - index "f": the result matches the stem at a word boundary (`\b<stem>`).
 *  - index "p": the result matches the stem at the start of the string,
 *    optionally followed by one character from the trailing character class.
 *
 * NOTE(review): inside the trailing character class the `|` characters are
 * literal pipes, not alternation. The class is kept as written.
 *
 * @param input the search string; it is interpolated into the RegExp as-is
 * @param index which kind of field the expression is for
 */
function getExpForInflections(input: string, index: "p" | "f"): RegExp {
  const isPhonetic = index === "f";
  const droppableEndings = isPhonetic
    ? ["e", "é", "a", "á", "ó", "o"]
    : ["ه", "ې", "و"];
  const stem = droppableEndings.includes(input.slice(-1))
    ? input.slice(0, -1)
    : input;
  if (isPhonetic) {
    return new RegExp(`\\b${stem}`);
  }
  return new RegExp(`^${stem}[و|ې|ه]?`);
}
/**
 * Returns a cut-off timestamp, in milliseconds since the epoch, for the
 * given period counted back from now.
 *
 *  - "week": exactly seven days (7 * 24h) before the current instant.
 *  - "month": the same day of the previous month at local 00:00:00.000. If
 *    that day does not exist and the date rolls back into the current month,
 *    the last day of the previous month is used instead
 *    (approach from https://stackoverflow.com/a/24049314/8620945).
 */
function tsBack(period: "month" | "week"): number {
  if (period !== "month") {
    const msPerWeek = 7 * 24 * 60 * 60 * 1000;
    return new Date(new Date().getTime() - msPerWeek).getTime();
  }
  const cutoff = new Date();
  const monthNow = cutoff.getMonth();
  cutoff.setMonth(monthNow - 1);
  // Still in the same month means the day overflowed; day 0 is the last day
  // of the previous month.
  if (cutoff.getMonth() === monthNow) {
    cutoff.setDate(0);
  }
  cutoff.setHours(0, 0, 0, 0);
  return cutoff.getTime();
}
/**
 * Alphabetical ("dictionary order") lookup.
 *
 * Finds every entry whose `p` or `g` field starts with the sanitized search
 * string, takes the smallest `i` value among them, and returns the entries
 * from that `i` onwards sorted by `i`, capped at `page * pageSize`.
 *
 * @returns the entries, or an empty array when nothing starts with the search
 */
function alphabeticalLookup({
  searchString,
  page,
}: {
  searchString: string;
  page: number;
}): T.DictionaryEntry[] {
  const startsWithSearch = new RegExp(
    "^" + sanitizePashto(makeSearchStringSafe(searchString))
  );
  const matches: T.DictionaryEntry[] = dictDb.collection.find({
    $or: [
      { p: { $regex: startsWithSearch } },
      { g: { $regex: startsWithSearch } },
    ],
  });
  if (matches.length === 0) {
    return [];
  }
  // The first matching word as it occurs in the Pashto index
  const firstIndexNumber = Math.min(...matches.map((mpd: any) => mpd.i));
  // `$gt` on (first - 1) so that the first match itself is included
  return dictDb.collection
    .chain()
    .find({ i: { $gt: firstIndexNumber - 1 } })
    .simplesort("i")
    .limit(page * pageSize)
    .data();
}
/**
 * Entry point for non-alphabetical searches.
 *
 * - If the search string converts to a non-zero number it is treated as an
 *   entry timestamp and looked up directly. When a `tpFilter` is supplied the
 *   entry is only returned if it passes the filter, so the `S[]` return type
 *   holds on this path as well.
 * - Otherwise "Pashto" goes to the Pashto fuzzy lookup and every other
 *   language value goes to the English lookup.
 *
 * @param searchString text (or timestamp) to search for
 * @param language which side of the dictionary to search
 * @param page 1-based page number; results are capped at `page * pageSize`
 * @param tpFilter optional type predicate restricting the kind of entry returned
 */
function fuzzyLookup<S extends T.DictionaryEntry>({
  searchString,
  language,
  page,
  tpFilter,
}: {
  searchString: string;
  language: "Pashto" | "English" | "Both";
  page: number;
  tpFilter?: (e: T.DictionaryEntry) => e is S;
}): S[] {
  // TODO: Implement working with both
  const ts = Number(searchString);
  if (ts) {
    const entry = dictDb.findOneByTs(ts);
    if (!entry) {
      return [];
    }
    if (tpFilter) {
      return tpFilter(entry) ? [entry] : [];
    }
    // No filter was given, so the caller asked for plain dictionary entries.
    return [entry as S];
  }
  return language === "Pashto"
    ? pashtoFuzzyLookup({ searchString, page, tpFilter })
    : englishLookup({ searchString, page, tpFilter });
}
/**
 * Searches the English definitions (`e` field) in four tiers, each tier
 * excluding what the earlier tiers already returned:
 *
 *  1. the whole definition matches the search,
 *  2. the definition starts with the search as a full word,
 *  3. the search appears anywhere as a full word,
 *  4. the search appears anywhere at all.
 *
 * All tiers are case-insensitive. Tier 1 is capped at 10 results (or half of
 * `pageSize` when `pageSize` is below 10); every later tier may use what is
 * left of `pageSize * page`. Within a tier the results are ordered by the
 * entry's `r` value, highest first, with a missing or zero `r` counted as 3.
 *
 * `tpFilter`, when given, is applied to the combined list at the end (and
 * additionally as a pre-filter on tier 4).
 */
function englishLookup<S extends T.DictionaryEntry>({
  searchString,
  page,
  tpFilter,
}: {
  searchString: string;
  page: number;
  tpFilter?: (e: T.DictionaryEntry) => e is S;
}): S[] {
  const byRankDescending = (a: T.DictionaryEntry, b: T.DictionaryEntry) =>
    (b.r || 3) - (a.r || 3);
  const lokiIds = (entries: any[]): number[] =>
    entries.map((mpd: any) => mpd.$loki);
  const pattern = fuzzifyEnglish(searchString);
  const resultBudget = pageSize * page;

  // Tier 1: exact results
  const exactResults = dictDb.collection
    .chain()
    .find({ e: { $regex: new RegExp(`^${pattern}$`, "i") } })
    .limit(pageSize < 10 ? Math.floor(pageSize / 2) : 10)
    .simplesort("i")
    .data();
  exactResults.sort(byRankDescending);
  let alreadyGiven: number[] = lokiIds(exactResults);

  // Tier 2: full word match at the beginning of the string
  const startingResults = dictDb.collection
    .chain()
    .find({
      e: { $regex: new RegExp(`^${pattern}\\b`, "i") },
      $loki: { $nin: alreadyGiven },
    })
    .limit(resultBudget - alreadyGiven.length)
    .simplesort("i")
    .data();
  startingResults.sort(byRankDescending);
  alreadyGiven = alreadyGiven.concat(lokiIds(startingResults));

  // Tier 3: full word match anywhere
  const fullWordResults = dictDb.collection
    .chain()
    .find({
      e: { $regex: new RegExp(`\\b${pattern}\\b`, "i") },
      $loki: { $nin: alreadyGiven },
    })
    .limit(resultBudget - alreadyGiven.length)
    .simplesort("i")
    .data();
  fullWordResults.sort(byRankDescending);
  alreadyGiven = alreadyGiven.concat(lokiIds(fullWordResults));

  // Tier 4: partial match anywhere
  const partialMatchResults = dictDb.collection
    .chain()
    .where(tpFilter ? tpFilter : () => true)
    .find({
      e: { $regex: new RegExp(`${pattern}`, "i") },
      $loki: { $nin: alreadyGiven },
    })
    .limit(resultBudget - alreadyGiven.length)
    .simplesort("i")
    .data();
  partialMatchResults.sort(byRankDescending);

  const results = [
    ...exactResults,
    ...startingResults,
    ...fullWordResults,
    ...partialMatchResults,
  ];
  if (tpFilter) {
    return results.filter(tpFilter);
  }
  return results;
}
/**
 * Finds the entries whose Pashto field equals the search string exactly.
 *
 * Text in Pashto script is compared against the `p` field as-is; anything
 * else is first run through `simplifyPhonetics` and compared against `g`.
 */
function pashtoExactLookup(searchString: string): T.DictionaryEntry[] {
  if (isPashtoScript(searchString)) {
    return dictDb.collection.find({ p: searchString });
  }
  return dictDb.collection.find({ g: simplifyPhonetics(searchString) });
}
/**
 * Fuzzy search on the Pashto side of the dictionary, in three stages.
 *
 *  1. "exact": entries whose main field starts with the search, or whose
 *     plural / present-stem / inflection fields match a lightly fuzzified or
 *     inflection-aware expression. Capped at a small fixed limit.
 *  2. "slightly fuzzy": the main field matches the lightly fuzzified search
 *     anywhere (case-insensitive).
 *  3. "fuzzy": the main field, Arabic-plural field or present-stem field
 *     matches the expression built by `fuzzifyPashto`.
 *
 * Stages 2 and 3 exclude the `$loki` ids already collected, and each may use
 * what is left of `pageSize * page`. `tpFilter` (if given) is applied to the
 * combined list afterwards, so fewer than `pageSize * page` entries may come
 * back. The combined list is finally cut into `pageSize` chunks and each
 * chunk is sorted by relevancy on its own.
 */
function pashtoFuzzyLookup<S extends T.DictionaryEntry>({
searchString,
page,
tpFilter,
}: {
searchString: string;
page: number;
tpFilter?: (e: T.DictionaryEntry) => e is S;
}): S[] {
// $loki ids of everything collected so far; later stages exclude these
let resultsGiven: number[] = [];
// Check if it's in Pashto or Latin script
const searchStringToUse = sanitizePashto(makeSearchStringSafe(searchString));
// "p" is queried for Pashto-script input, "g" (with simplified phonetics) for everything else
const index = isPashtoScript(searchStringToUse) ? "p" : "g";
const search =
index === "g" ? simplifyPhonetics(searchStringToUse) : searchStringToUse;
// suffix used to build the names of the plural / stem / inflection fields below
const infIndex = index === "p" ? "p" : "f";
// Get exact matches
const exactExpression = new RegExp("^" + search);
const weeBitFuzzy = new RegExp("^" + makeAWeeBitFuzzy(search, infIndex));
// prepare exact expression for special matching
// TODO: This is all a bit messy and could be done without regex
const expressionForInflections = getExpForInflections(search, infIndex);
const arabicPluralIndex = `ap${infIndex}`;
const pashtoPluralIndex = `pp${infIndex}`;
const presentStemIndex = `ps${infIndex}`;
const firstInfIndex = `infa${infIndex}`;
const secondInfIndex = `infb${infIndex}`;
// an entry counts as an "exact" result if ANY of these field conditions holds
const pashtoExactResultFields = [
{
[index]: { $regex: exactExpression },
},
{
[arabicPluralIndex]: { $regex: weeBitFuzzy },
},
{
[pashtoPluralIndex]: { $regex: weeBitFuzzy },
},
{
[presentStemIndex]: { $regex: weeBitFuzzy },
},
{
[firstInfIndex]: { $regex: expressionForInflections },
},
{
[secondInfIndex]: { $regex: expressionForInflections },
},
];
const exactQuery = { $or: [...pashtoExactResultFields] };
// just special in case really small limits are used
// multiple times scrolling / chunking / sorting might get a bit messed up if using a limit of less than 10
const exactResultsLimit = pageSize < 10 ? Math.floor(pageSize / 2) : 10;
const exactResults = dictDb.collection
.chain()
.find(exactQuery)
.limit(exactResultsLimit)
.simplesort("i")
.data();
resultsGiven = exactResults.map((mpd: any) => mpd.$loki);
// Get slightly fuzzy matches
// (unanchored and case-insensitive, unlike the anchored `weeBitFuzzy` above)
const slightlyFuzzy = new RegExp(makeAWeeBitFuzzy(search, infIndex), "i");
const slightlyFuzzyQuery = {
[index]: { $regex: slightlyFuzzy },
$loki: { $nin: resultsGiven },
};
const slightlyFuzzyResultsLimit = pageSize * page - resultsGiven.length;
const slightlyFuzzyResults = dictDb.collection
.chain()
.find(slightlyFuzzyQuery)
.limit(slightlyFuzzyResultsLimit)
.data();
resultsGiven.push(...slightlyFuzzyResults.map((mpd: any) => mpd.$loki));
// Get fuzzy matches
const pashtoRegExLogic = fuzzifyPashto(search, {
script: index === "p" ? "Pashto" : "Latin",
simplifiedLatin: index === "g",
allowSpacesInWords: true,
matchStart: "word",
});
const fuzzyPashtoExperssion = new RegExp(pashtoRegExLogic);
const pashtoFuzzyQuery = [
{
[index]: { $regex: fuzzyPashtoExperssion },
},
{
// TODO: Issue, this fuzzy doesn't line up well because it's not the simplified phonetics - still has 's etc
[arabicPluralIndex]: { $regex: fuzzyPashtoExperssion },
},
{
[presentStemIndex]: { $regex: fuzzyPashtoExperssion },
},
];
// fuzzy results should be allowed to take up the rest of the limit (not used up by the earlier stages)
const fuzzyResultsLimit = pageSize * page - resultsGiven.length;
// don't get these fuzzy results if searching in only English
const fuzzyQuery = {
$or: pashtoFuzzyQuery,
$loki: { $nin: resultsGiven },
};
const fuzzyResults = dictDb.collection
.chain()
.find(fuzzyQuery)
.limit(fuzzyResultsLimit)
.data();
// the type filter runs after the limits were applied, on the combined list
const results = tpFilter
? [...exactResults, ...slightlyFuzzyResults, ...fuzzyResults].filter(
tpFilter
)
: [...exactResults, ...slightlyFuzzyResults, ...fuzzyResults];
// sort out each chunk (based on limit used multiple times by infinite scroll)
// so that when infinite scrolling, it doesn't re-sort the previous chunks given
// const closeResultsLength = exactResults.length + slightlyFuzzyResults.length;
const chunksToSort = chunkOutArray(results, pageSize);
return chunksToSort.flatMap((c) => sortByRelevancy(c, search, index));
}
/**
 * Returns a copy of `arr` ordered by how close each entry's `index` field is
 * to the search string, closest first. Closeness is a weighted Levenshtein
 * distance:
 *
 *  - inserting or removing a character costs 1,
 *  - replacing a character costs 0 when both characters are identical,
 *    0.5 when they belong to the same group of similar letters, 1 otherwise.
 *
 * For the "g" index a field may hold several comma-separated variants; the
 * distance of the closest variant is used.
 *
 * Identical characters always cost 0. The similar-letter check used to run
 * first, so a matching "t" cost 0.5 and an exact match could tie with a near
 * match. Each entry's distance is also computed once up front rather than
 * inside the sort comparator.
 *
 * @param arr entries to order (not modified)
 * @param searchI the search string, in the same script as `index`
 * @param index which field of the entries to compare against
 */
function sortByRelevancy<E extends Record<"p" | "g", string>>(
  arr: Readonly<E[]>,
  searchI: string,
  index: "p" | "g"
): E[] {
  // TODO: experiment with larger page sizes and not exact query,
  // especially with phonetic searches like ghuT
  //
  // TODO: if result came from special query, mark it as special and
  // then don't mess with the relevancy
  const similars = {
    p: ["دډتټ", "زذضظځ", "صسث", "رړڼ", "ڼن", "یيېۍ", "قک", "ګږ", "ښخحه", "پف"],
    g: ["tdTD", "rRN", "nN", "ei", "xkg", "pf", "au"],
  };
  const similarGroups = similars[index];
  function insert() {
    return 1;
  }
  // check if it's removing dz etc
  function remove() {
    return 1;
  }
  function update(a: string, b: string) {
    if (a === b) {
      return 0;
    }
    return similarGroups.some((x) => x.includes(a) && x.includes(b)) ? 0.5 : 1;
  }
  function distanceTo(candidate: string): number {
    return levenshtein(candidate, searchI, insert, remove, update).distance;
  }
  function levenOverVars(g: string): number {
    if (!g.includes(",")) {
      return distanceTo(g);
    }
    return Math.min(...g.split(",").map((x) => distanceTo(x)));
  }
  const measure = index === "g" ? levenOverVars : distanceTo;
  // Array.prototype.sort is stable, so entries with equal distance keep
  // their incoming order.
  return arr
    .map((entry) => ({ entry, dist: measure(entry[index]) }))
    .sort((a, b) => a.dist - b.dist)
    .map((ranked) => ranked.entry);
}
/**
 * Finds entries whose English definition shares a (non-filler) word with the
 * given entry's definition.
 *
 * The definition is split into words on spaces and basic punctuation, filler
 * words are dropped, and for each remaining word up to 5 other entries
 * containing it as a whole word (case-insensitive) are collected. The entry
 * itself is excluded by timestamp and duplicates are removed, keeping the
 * first occurrence.
 *
 * Empty tokens are discarded: a definition that starts or ends with
 * punctuation (e.g. "house (building)") produces an empty string when split,
 * and an empty word would give the expression `\b\b`, which matches almost
 * every entry. Words are also regex-escaped before being placed in the
 * expression.
 */
function relatedWordsLookup(word: T.DictionaryEntry): T.DictionaryEntry[] {
  const wordArray = word.e
    .trim()
    .replace(/\?/g, "")
    .replace(/( |,|\.|!|;|\(|\))/g, " ")
    .split(/ +/)
    .filter((w: string) => w !== "" && !fillerWords.includes(w));
  const seenIds = new Set<unknown>();
  const results: T.DictionaryEntry[] = [];
  wordArray.forEach((w: string) => {
    try {
      const escaped = w.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
      const r = new RegExp(`\\b${escaped}\\b`, "i");
      const relatedToWord: T.DictionaryEntry[] = dictDb.collection
        .chain()
        .find({
          // don't include the original word
          ts: { $ne: word.ts },
          e: { $regex: r },
        })
        .limit(5)
        .data();
      for (const related of relatedToWord) {
        // $loki is the id the collection attaches to each stored document
        const id = (related as any).$loki;
        if (!seenIds.has(id)) {
          seenIds.add(id);
          results.push(related);
        }
      }
      // Best effort: a failing lookup for one word must not break the rest
    } catch (error) {
      /* istanbul ignore next */
      console.error(error);
    }
  });
  return results;
}
/**
 * Returns every entry in the dictionary collection (an unfiltered `find`).
 */
export function allEntries() {
  const { collection } = dictDb;
  return collection.find();
}
/**
 * Builds a lookup portal restricted to one kind of entry.
 *
 * - `search` runs the Pashto fuzzy lookup (first page only) with `tpFilter`.
 * - `getByTs` fetches an entry by timestamp and returns it only if it passes
 *   `tpFilter`; otherwise `undefined`.
 *
 * @param tpFilter type predicate selecting the entries this portal may return
 */
function makeLookupPortal<X extends T.DictionaryEntry>(
  tpFilter: (x: T.DictionaryEntry) => x is X
): T.EntryLookupPortal<X> {
  function search(s: string) {
    return fuzzyLookup({
      searchString: s,
      language: "Pashto",
      page: 1,
      tpFilter,
    });
  }
  function getByTs(ts: number) {
    const found = dictDb.findOneByTs(ts);
    if (found && tpFilter(found)) {
      return found;
    }
    return undefined;
  }
  return { search, getByTs };
}
/**
 * Builds the lookup portal for verbs. Besides the verb's own entry, a result
 * carries a `complement` entry, looked up through the entry's `l` field when
 * the entry's `c` field marks it as having one.
 *
 * NOTE(review): `search` tests `c` for the substring "comp." and resolves the
 * complement via `dictionary.findOneByTs`, whereas `getByTs` tests for "comp"
 * (no dot) and uses `dictDb.findOneByTs`. Both are kept as they were; confirm
 * whether the difference is intended.
 */
function makeVerbLookupPortal(): T.EntryLookupPortal<T.VerbEntry> {
  return {
    search: (s: string) =>
      fuzzyLookup({
        searchString: s,
        language: "Pashto",
        page: 1,
        tpFilter: tp.isVerbDictionaryEntry,
      }).map((entry): T.VerbEntry => {
        if (entry.c?.includes("comp.") && entry.l) {
          return { entry, complement: dictionary.findOneByTs(entry.l) };
        }
        return { entry, complement: undefined };
      }),
    getByTs: (ts: number): T.VerbEntry | undefined => {
      const entry = dictDb.findOneByTs(ts);
      if (!entry) {
        return undefined;
      }
      if (!tp.isVerbDictionaryEntry(entry)) {
        console.error("not valid verb entry");
        return undefined;
      }
      if (entry.c?.includes("comp") && entry.l) {
        const complement = dictDb.findOneByTs(entry.l);
        if (!complement) {
          // logged only; the verb is still returned, with whatever the lookup gave back
          console.error("complement not found for", entry);
        }
        return { entry, complement };
      }
      return { entry, complement: undefined };
    },
  };
}
/**
 * Lookup portals grouped by kind of entry. Every portal except `verbs` is a
 * generic portal filtered by the matching type predicate from `tp`; `verbs`
 * uses the dedicated verb portal.
 */
export const entryFeeder: T.EntryFeeder = {
nouns: makeLookupPortal(tp.isNounEntry),
verbs: makeVerbLookupPortal(),
adjectives: makeLookupPortal(tp.isAdjectiveEntry),
locativeAdverbs: makeLookupPortal(tp.isLocativeAdverbEntry),
adverbs: makeLookupPortal(tp.isAdverbEntry),
};
/**
 * The dictionary API exposed to the rest of the app: initialisation and
 * updating of the underlying database, plus the various lookups.
 */
export const dictionary: DictionaryAPI = {
  // NOTE: the dictionary core methods are wrapped in arrow functions rather than
  // passed directly (ie. `initialize: dictDb.initialize`), because passing them
  // directly messes up the `this` usage inside the dictionary core class.
  initialize: async () => await dictDb.initialize(),
  update: async (notifyUpdateComing: () => void) =>
    await dictDb.updateDictionary(notifyUpdateComing),
  /**
   * Runs the search described by the app state. An empty search value gives
   * no results. An "alphabetical" search in Pashto uses the alphabetical
   * lookup; everything else uses the fuzzy lookup.
   */
  search(state: State): T.DictionaryEntry[] {
    const searchString = revertSpelling(
      state.searchValue,
      getTextOptions(state).spelling
    );
    if (state.searchValue === "") {
      return [];
    }
    const { searchType, language } = state.options;
    if (searchType === "alphabetical" && language === "Pashto") {
      return alphabeticalLookup({ searchString, page: state.page });
    }
    return fuzzyLookup({ searchString, language, page: state.page });
  },
  exactPashtoSearch: pashtoExactLookup,
  /** Entries whose `ts` is later than the start of the period, newest first. */
  getNewWords(period: "week" | "month"): T.DictionaryEntry[] {
    const since = tsBack(period);
    const oldestFirst = dictDb.collection
      .chain()
      .find({ ts: { $gt: since } })
      .simplesort("ts")
      .data();
    return oldestFirst.reverse();
  },
  findOneByTs: (ts: number) => dictDb.findOneByTs(ts),
  findRelatedEntries(entry: T.DictionaryEntry): T.DictionaryEntry[] {
    return relatedWordsLookup(entry);
  },
};