From d8119a14753010620654b7491dabc92927e64eda Mon Sep 17 00:00:00 2001 From: adueck Date: Mon, 14 Aug 2023 13:41:54 +0400 Subject: [PATCH] improved use of levenshtein distance --- website/src/lib/dictionary.ts | 38 +++++++++++++++----------------- website/src/lib/wee-bit-fuzzy.ts | 7 +++--- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/website/src/lib/dictionary.ts b/website/src/lib/dictionary.ts index c80a970..b04c2ea 100644 --- a/website/src/lib/dictionary.ts +++ b/website/src/lib/dictionary.ts @@ -31,7 +31,7 @@ const dictionaryInfoUrl = `https://storage.googleapis.com/lingdocs/dictionary-in const dictionaryInfoLocalStorageKey = "dictionaryInfo5"; const dictionaryCollectionName = "dictionary3"; // const dictionaryDatabaseName = "dictdb.db"; -export const pageSize = 35; +export const pageSize = 60; const db = indexedDB.open("inPrivate"); db.onerror = (e) => { @@ -364,18 +364,9 @@ function pashtoFuzzyLookup({ : [...exactResults, ...slightlyFuzzyResults, ...fuzzyResults]; // sort out each chunk (based on limit used multiple times by infinite scroll) // so that when infinite scrolling, it doesn't re-sort the previous chunks given - const closeResultsLength = exactResults.length + slightlyFuzzyResults.length; + // const closeResultsLength = exactResults.length + slightlyFuzzyResults.length; const chunksToSort = chunkOutArray(results, pageSize); - return chunksToSort.reduce( - (acc, cur, i) => - i === 0 - ? [ - ...sortByRelevancy(cur.slice(0, closeResultsLength), search, index), - ...sortByRelevancy(cur.slice(closeResultsLength), search, index), - ] - : [...acc, ...sortByRelevancy(cur, search, index)], - [] - ); + return chunksToSort.flatMap((c) => sortByRelevancy(c, search, index)); } function sortByRelevancy>( @@ -390,8 +381,8 @@ function sortByRelevancy>( // then don't mess with the relevancy // now instead of an extra pass for exact, we can just use this! const similars = { - p: ["دډتټ", "زذضظځ", "صسث", "رړڼ", "ڼن", "یيېۍ", "قک", "ګږ", "ښخحه"], - g: ["tdTD", "rRN", "nN", "ei", "xkg"], + p: ["دډتټ", "زذضظځ", "صسث", "رړڼ", "ڼن", "یيېۍ", "قک", "ګږ", "ښخحه", "پف"], + g: ["tdTD", "rRN", "nN", "ei", "xkg", "pf", "au"], }; function insert() { return 1; @@ -401,18 +392,25 @@ function sortByRelevancy>( return 1; } function update(a: string, b: string) { - return a !== b - ? 1 - : similars[index].find((x) => x.includes(a) && x.includes(b)) + return similars[index].find((x) => x.includes(a) && x.includes(b)) ? 0.5 + : a !== b + ? 1 : 0; } + function levenOverVars(g: string, s: string): number { + return Math.min( + ...g + .split(",") + .map((x) => levenshtein(x, s, insert, remove, update).distance) + ); + } const toSort = [...arr]; toSort.sort((a, b) => { - const aDist = levenshtein(a[index], searchI, insert, remove, update); - const bDist = levenshtein(b[index], searchI, insert, remove, update); - return aDist.distance - bDist.distance; + const aDist = levenOverVars(a[index], searchI); + const bDist = levenOverVars(b[index], searchI); + return aDist - bDist; }); return toSort; } diff --git a/website/src/lib/wee-bit-fuzzy.ts b/website/src/lib/wee-bit-fuzzy.ts index 3db81fd..9819dbb 100644 --- a/website/src/lib/wee-bit-fuzzy.ts +++ b/website/src/lib/wee-bit-fuzzy.ts @@ -98,11 +98,10 @@ const fReplacer = { kh: hKhF, ts: sSoundsF, s: sSoundsF, - // only used if ignoring accents - a: "[a|á]", + a: "[a|á|u|ú]", á: "[a|á|u|ú]", u: "[u|ú|a|á]", - ú: "[u|ú]", + ú: "[u|ú|a|á]", o: "[o|ó]", ó: "[o|ó]", i: "[i|í]", @@ -118,7 +117,7 @@ const fReplacer = { const pRepRegex = new RegExp(Object.keys(pReplacer).join("|"), "g"); -const fRepRegex = /ey|ay|uy|ee|e|z|dz|x|kh|h|ts|s/g; +const fRepRegex = /ey|ay|uy|ee|a|u|e|z|dz|x|kh|h|ts|s/g; const fRepRegexWAccents = /ey|éy|ay|áy|uy|úy|ee|ée|e|é|z|dz|x|ts|s|kh|h|a|á|i|í|o|ó|u|ú|U|Ú/g;