BIG IMPROVEMENT TO FUZZY SEARCH - USING 3 PASSES

This commit is contained in:
lingdocs 2022-07-29 14:19:22 -05:00
parent 523bbb5b62
commit 254e33f48a
10 changed files with 175 additions and 69 deletions

View File

@ -7,7 +7,7 @@
"name": "functions",
"dependencies": {
"@google-cloud/storage": "^5.8.1",
"@lingdocs/pashto-inflector": "3.6.0",
"@lingdocs/pashto-inflector": "3.6.2",
"@types/cors": "^2.8.10",
"@types/google-spreadsheet": "^3.0.2",
"cors": "^2.8.5",
@ -30,7 +30,7 @@
"node": "16"
},
"peerDependencies": {
"@lingdocs/pashto-inflector": "3.6.0"
"@lingdocs/pashto-inflector": "3.6.2"
}
},
"node_modules/@babel/code-frame": {
@ -505,9 +505,9 @@
}
},
"node_modules/@lingdocs/pashto-inflector": {
"version": "3.6.0",
"resolved": "https://npm.lingdocs.com/@lingdocs%2fpashto-inflector/-/pashto-inflector-3.6.0.tgz",
"integrity": "sha512-OCkGiTTY8s2QgiTmP5MJ110GScul7ILtzoaz6b5zWq3Qerdhs10jAA0t/9OckfLE4yBkQYWkIG+5Qj4ZJGkODg==",
"version": "3.6.2",
"resolved": "https://npm.lingdocs.com/@lingdocs%2fpashto-inflector/-/pashto-inflector-3.6.2.tgz",
"integrity": "sha512-hHvgJPrNAp/ZBvZRsm++X2vFnZyuVrpE54YWPubwk+1Xn+28otoJ34r/OsN4N7eXrXBcxawYsWhf/ot9D987GQ==",
"license": "MIT",
"dependencies": {
"@formkit/auto-animate": "^1.0.0-beta.1",
@ -3682,9 +3682,9 @@
}
},
"@lingdocs/pashto-inflector": {
"version": "3.6.0",
"resolved": "https://npm.lingdocs.com/@lingdocs%2fpashto-inflector/-/pashto-inflector-3.6.0.tgz",
"integrity": "sha512-OCkGiTTY8s2QgiTmP5MJ110GScul7ILtzoaz6b5zWq3Qerdhs10jAA0t/9OckfLE4yBkQYWkIG+5Qj4ZJGkODg==",
"version": "3.6.2",
"resolved": "https://npm.lingdocs.com/@lingdocs%2fpashto-inflector/-/pashto-inflector-3.6.2.tgz",
"integrity": "sha512-hHvgJPrNAp/ZBvZRsm++X2vFnZyuVrpE54YWPubwk+1Xn+28otoJ34r/OsN4N7eXrXBcxawYsWhf/ot9D987GQ==",
"requires": {
"@formkit/auto-animate": "^1.0.0-beta.1",
"classnames": "^2.2.6",

View File

@ -14,7 +14,7 @@
"main": "lib/functions/src/index.js",
"dependencies": {
"@google-cloud/storage": "^5.8.1",
"@lingdocs/pashto-inflector": "3.6.0",
"@lingdocs/pashto-inflector": "3.6.2",
"@types/cors": "^2.8.10",
"@types/google-spreadsheet": "^3.0.2",
"cors": "^2.8.5",
@ -35,6 +35,6 @@
},
"private": true,
"peerDependencies": {
"@lingdocs/pashto-inflector": "3.6.0"
"@lingdocs/pashto-inflector": "3.6.2"
}
}

16
package-lock.json generated
View File

@ -9,7 +9,7 @@
"version": "0.3.1",
"license": "MIT",
"dependencies": {
"@lingdocs/pashto-inflector": "3.6.0",
"@lingdocs/pashto-inflector": "3.6.2",
"lokijs": "^1.5.12",
"nano": "^9.0.5",
"passport-github2": "^0.1.12",
@ -29,7 +29,7 @@
"typescript": "^4.4.3"
},
"peerDependencies": {
"@lingdocs/pashto-inflector": "3.6.0"
"@lingdocs/pashto-inflector": "3.6.2"
}
},
"node_modules/@babel/code-frame": {
@ -312,9 +312,9 @@
"dev": true
},
"node_modules/@lingdocs/pashto-inflector": {
"version": "3.6.0",
"resolved": "https://npm.lingdocs.com/@lingdocs%2fpashto-inflector/-/pashto-inflector-3.6.0.tgz",
"integrity": "sha512-OCkGiTTY8s2QgiTmP5MJ110GScul7ILtzoaz6b5zWq3Qerdhs10jAA0t/9OckfLE4yBkQYWkIG+5Qj4ZJGkODg==",
"version": "3.6.2",
"resolved": "https://npm.lingdocs.com/@lingdocs%2fpashto-inflector/-/pashto-inflector-3.6.2.tgz",
"integrity": "sha512-hHvgJPrNAp/ZBvZRsm++X2vFnZyuVrpE54YWPubwk+1Xn+28otoJ34r/OsN4N7eXrXBcxawYsWhf/ot9D987GQ==",
"license": "MIT",
"dependencies": {
"@formkit/auto-animate": "^1.0.0-beta.1",
@ -4213,9 +4213,9 @@
"dev": true
},
"@lingdocs/pashto-inflector": {
"version": "3.6.0",
"resolved": "https://npm.lingdocs.com/@lingdocs%2fpashto-inflector/-/pashto-inflector-3.6.0.tgz",
"integrity": "sha512-OCkGiTTY8s2QgiTmP5MJ110GScul7ILtzoaz6b5zWq3Qerdhs10jAA0t/9OckfLE4yBkQYWkIG+5Qj4ZJGkODg==",
"version": "3.6.2",
"resolved": "https://npm.lingdocs.com/@lingdocs%2fpashto-inflector/-/pashto-inflector-3.6.2.tgz",
"integrity": "sha512-hHvgJPrNAp/ZBvZRsm++X2vFnZyuVrpE54YWPubwk+1Xn+28otoJ34r/OsN4N7eXrXBcxawYsWhf/ot9D987GQ==",
"requires": {
"@formkit/auto-animate": "^1.0.0-beta.1",
"classnames": "^2.2.6",

View File

@ -17,12 +17,12 @@
"url": "git@github.com-lingdocs:lingdocs/lingdocs-main.git"
},
"peerDependencies": {
"@lingdocs/pashto-inflector": "3.6.0"
"@lingdocs/pashto-inflector": "3.6.2"
},
"author": "lingdocs.com",
"license": "MIT",
"dependencies": {
"@lingdocs/pashto-inflector": "3.6.0",
"@lingdocs/pashto-inflector": "3.6.2",
"lokijs": "^1.5.12",
"nano": "^9.0.5",
"passport-github2": "^0.1.12",

View File

@ -7,7 +7,7 @@
"private": true,
"dependencies": {
"@fortawesome/fontawesome-free": "^5.15.2",
"@lingdocs/pashto-inflector": "3.6.0",
"@lingdocs/pashto-inflector": "3.6.2",
"@testing-library/jest-dom": "^5.11.4",
"@testing-library/react": "^11.1.0",
"@testing-library/user-event": "^12.1.10",
@ -110,6 +110,6 @@
"user-event": "^4.0.0"
},
"peerDependencies": {
"@lingdocs/pashto-inflector": "3.6.0"
"@lingdocs/pashto-inflector": "3.6.2"
}
}

View File

@ -271,7 +271,18 @@ function pashtoFuzzyLookup<S extends T.DictionaryEntry>({ searchString, page, tp
.simplesort("i")
.data();
resultsGiven = exactResults.map((mpd: any) => mpd.$loki);
// Get slightly fuzzy matches
const slightlyFuzzy = new RegExp(makeAWeeBitFuzzy(search, infIndex), "i");
const slightlyFuzzyQuery = {
[index]: { $regex: slightlyFuzzy },
$loki: { $nin: resultsGiven },
};
const slightlyFuzzyResultsLimit = (pageSize * page) - resultsGiven.length;
const slightlyFuzzyResults = dictDb.collection.chain()
.find(slightlyFuzzyQuery)
.limit(slightlyFuzzyResultsLimit)
.data();
resultsGiven.push(...slightlyFuzzyResults.map((mpd: any) => mpd.$loki));
// Get fuzzy matches
const pashtoRegExLogic = fuzzifyPashto(search, {
script: index === "p" ? "Pashto" : "Latin",
@ -301,17 +312,18 @@ function pashtoFuzzyLookup<S extends T.DictionaryEntry>({ searchString, page, tp
.limit(fuzzyResultsLimit)
.data();
const results = tpFilter
? [...exactResults, ...fuzzyResults].filter(tpFilter)
: [...exactResults, ...fuzzyResults];
const chunksToSort = chunkOutArray(results, pageSize);
? [...exactResults, ...slightlyFuzzyResults, ...fuzzyResults].filter(tpFilter)
: [...exactResults, ...slightlyFuzzyResults, ...fuzzyResults];
// sort out each chunk (based on limit used multiple times by infinite scroll)
// so that when infinite scrolling, it doesn't resort the previous chunks given
// TODO: If on the first page, only sort the fuzzyResults
// so that when infinite scrolling, it doesn't re-sort the previous chunks given
const closeResultsLength = exactResults.length + slightlyFuzzyResults.length;
const chunksToSort = chunkOutArray(results, pageSize);
return chunksToSort
.reduce((acc, cur, i) => ((i === 0)
? [
...sortByRelevancy(cur.slice(0, exactResults.length), search, index),
...sortByRelevancy(cur.slice(exactResults.length), search, index),
// don't sort the close results in the first chunk
...cur.slice(0, closeResultsLength),
...sortByRelevancy(cur.slice(closeResultsLength), search, index),
]
: [
...acc,
@ -386,14 +398,12 @@ function makeVerbLookupPortal(): T.EntryLookupPortal<T.VerbEntry> {
page: 1,
tpFilter: tp.isVerbDictionaryEntry,
});
const r = vEntries.map((entry): T.VerbEntry => ({
return vEntries.map((entry): T.VerbEntry => ({
entry,
complement: (entry.c?.includes("comp.") && entry.l)
? dictionary.findOneByTs(entry.l)
: undefined,
}));
console.log(r);
return r;
},
getByTs: (ts: number): T.VerbEntry | undefined => {
const entry = dictDb.findOneByTs(ts);

View File

@ -0,0 +1,28 @@
import { makeAWeeBitFuzzy } from "./wee-bit-fuzzy";
// Pashto-script cases: [search text, spelling the generated pattern must match].
const pMatches = [
    ["پیټی", "پېټی"],
    ["دوستی", "دوستي"],
    ["پته", "پټه"],
];

// Latin-phonetic cases, same [search text, spelling to match] layout.
const fMatches = [
    ["sareyy", "saRey"],
    ["peyTey", "peTey"],
];

/**
 * Registers one test per pair: the pattern built from the first element
 * (case-insensitive) has to match the second element.
 */
function registerMatchTests(pairs: string[][], index: "f" | "p"): void {
    for (const [typed, expected] of pairs) {
        test(`${typed} should match ${expected}`, () => {
            const pattern = new RegExp(makeAWeeBitFuzzy(typed, index), "i");
            expect(expected.match(pattern)).toBeTruthy();
        });
    }
}

registerMatchTests(pMatches, "p");
registerMatchTests(fMatches, "f");

View File

@ -6,36 +6,104 @@
*
*/
const matcher = {
q: "[q|k]",
k: "[q|k]",
// TODO: this might not be the best way to handle
// double aa's passing as a's - because it can totally ignore the a's
a: "[a|á|ă]?a?",
á: "[a|á|ă]?a?",
ă: "[a|á|ă]?a?",
u: "[u|ú]",
ú: "[u|ú]",
e: "[e|é]",
é: "[e|é]",
i: "[i|í]",
í: "[i|í]",
o: "[o|ó]",
ó: "[o|ó]",
g: "[g|G]",
G: "[g|G]",
r: "[r|R]",
R: "[r|R]",
// const matcher = {
// q: "[q|k]",
// k: "[q|k]",
// // TODO: this might not be the best way to handle
// // double aa's passing as a's - because it can totally ignore the a's
// a: "[a|á|ă]?a?",
// á: "[a|á|ă]?a?",
// ă: "[a|á|ă]?a?",
// u: "[u|ú]",
// ú: "[u|ú]",
// e: "[e|é]",
// é: "[e|é]",
// i: "[i|í]",
// í: "[i|í]",
// o: "[o|ó]",
// ó: "[o|ó]",
// g: "[g|G]",
// G: "[g|G]",
// r: "[r|R]",
// R: "[r|R]",
// };
// Character classes grouping Pashto letters that are treated as
// interchangeable in a "wee bit fuzzy" search.
//
// Inside [...] every character is a literal member, so the letters are listed
// back to back; a "|" separator there would itself become a matchable character.
const fiveYeys = "[ئۍيېی]";
const sSounds = "[سصثڅ]";
const zSounds = "[زژضظذځ]";
const tSounds = "[تطټ]";
const dSounds = "[دډ]";
const rSounds = "[رړڼ]";
const nSounds = "[نڼ]";

// Maps each Pashto letter that has look-alike / sound-alike variants to the
// character class that should replace it in the generated pattern.
const pReplacer = {
    "ی": fiveYeys,
    "ي": fiveYeys,
    "ۍ": fiveYeys,
    "ئ": fiveYeys,
    "ې": fiveYeys,

    "س": sSounds,
    "ص": sSounds,
    "ث": sSounds,
    "څ": sSounds,

    "ز": zSounds,
    "ظ": zSounds,
    "ذ": zSounds,
    "ض": zSounds,
    "ژ": zSounds,
    "ځ": zSounds,

    "ت": tSounds,
    "ط": tSounds,
    "ټ": tSounds,

    "د": dSounds,
    "ډ": dSounds,

    "ر": rSounds,
    "ړ": rSounds,

    "ن": nSounds,
    "ڼ": nSounds,
};
const fRepRegex = /r|R|q|k|a|á|ă|e|é|i|í|o|ó|g|G|u|ú/g;
// Latin-phonetic spellings that are accepted in place of one another.
// Written as a non-capturing group so the alternation stays self-contained
// when it is spliced into a longer pattern.
const fiveYeysF = "(?:eyy|ey|ee|e|uy)";
// Phonetic spellings "z" and "dz" are accepted in place of one another.
const zSoundsF = "(?:z|dz)";
// Maps a matched phonetic sequence to the group that replaces it in the
// generated pattern.
const fReplacer = {
"eyy": fiveYeysF,
"ey": fiveYeysF,
"uy": fiveYeysF,
"ee": fiveYeysF,
"e": fiveYeysF,
"z": zSoundsF,
"dz": zSoundsF,
};
// Global regex matching any one key of pReplacer (alternation built from its keys).
const pRepRegex = new RegExp(Object.keys(pReplacer).join("|"), "g");
// Global regex matching the keys of fReplacer. Alternatives are tried left to
// right at each position, so the longer sequences (eyy, ey, ee) are listed
// before the bare "e".
// NOTE(review): this list is maintained by hand; keep it in step with the keys of fReplacer.
const fRepRegex = /eyy|ey|uy|ee|e|z|dz/g;
/**
 * Builds the source of a start-anchored regular expression that matches `s`
 * in Pashto script while tolerating commonly confused letters.
 *
 * Each letter that is a key of `pReplacer` is swapped for the pattern stored
 * under that key; every other character has to match literally.
 *
 * @param s - the search text in Pashto script
 * @returns a pattern string (not a RegExp) that begins with "^"
 */
function makePAWeeBitFuzzy(s: string): string {
    // Escape regex metacharacters first, so punctuation in the search text is
    // matched literally instead of producing an invalid or unintended pattern.
    const literal = s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const replacements: Record<string, string> = pReplacer;
    // pRepRegex only matches keys of pReplacer; the fallback keeps the
    // callback total for the type checker.
    return "^" + literal.replace(pRepRegex, (mtch) => replacements[mtch] ?? mtch);
}
/**
 * Builds the source of a start-anchored regular expression that matches `s`
 * in Latin phonetics while tolerating alternative spellings.
 *
 * Each sequence matched by `fRepRegex` is swapped for the pattern stored under
 * that sequence in `fReplacer`; every other character has to match literally.
 *
 * @param s - the search text in Latin phonetics
 * @returns a pattern string (not a RegExp) that begins with "^"
 */
function makeFAWeeBitFuzzy(s: string): string {
    // Escape regex metacharacters first, so punctuation in the search text is
    // matched literally instead of producing an invalid or unintended pattern.
    // Escaping only inserts backslashes, so it cannot create new letters for
    // fRepRegex to pick up.
    const literal = s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const replacements: Record<string, string> = fReplacer;
    // fRepRegex only matches keys of fReplacer; the fallback keeps the
    // callback total for the type checker.
    return "^" + literal.replace(fRepRegex, (mtch) => replacements[mtch] ?? mtch);
}
/**
 * Produces the source of a start-anchored, slightly fuzzy regular expression
 * for a dictionary search term.
 *
 * The work is delegated to the script-specific builder; the previous
 * `matcher`-based branch is dropped because it returned first and left this
 * dispatch unreachable.
 *
 * @param s - the search text
 * @param i - which field the text is written for: "p" for Pashto script,
 *            "f" for Latin phonetics
 * @returns a pattern string suitable for `new RegExp(pattern, "i")`
 */
export function makeAWeeBitFuzzy(s: string, i: "f" | "p"): string {
    if (i === "p") {
        return makePAWeeBitFuzzy(s);
    }
    return makeFAWeeBitFuzzy(s);
}

View File

@ -1590,10 +1590,10 @@
"@types/yargs" "^16.0.0"
chalk "^4.0.0"
"@lingdocs/pashto-inflector@3.6.0":
version "3.6.0"
resolved "https://npm.lingdocs.com/@lingdocs%2fpashto-inflector/-/pashto-inflector-3.6.0.tgz#89143246341bbca70c340f9cbcd72650f8348231"
integrity sha512-OCkGiTTY8s2QgiTmP5MJ110GScul7ILtzoaz6b5zWq3Qerdhs10jAA0t/9OckfLE4yBkQYWkIG+5Qj4ZJGkODg==
"@lingdocs/pashto-inflector@3.6.2":
version "3.6.2"
resolved "https://npm.lingdocs.com/@lingdocs%2fpashto-inflector/-/pashto-inflector-3.6.2.tgz#122eaeaac59253ea0ee708d772e860502aa1d6b7"
integrity sha512-hHvgJPrNAp/ZBvZRsm++X2vFnZyuVrpE54YWPubwk+1Xn+28otoJ34r/OsN4N7eXrXBcxawYsWhf/ot9D987GQ==
dependencies:
"@formkit/auto-animate" "^1.0.0-beta.1"
classnames "^2.2.6"

View File

@ -178,10 +178,10 @@
"resolved" "https://registry.npmjs.org/@hapi/hoek/-/hoek-9.2.1.tgz"
"version" "9.2.1"
"@lingdocs/pashto-inflector@3.6.0":
"integrity" "sha512-OCkGiTTY8s2QgiTmP5MJ110GScul7ILtzoaz6b5zWq3Qerdhs10jAA0t/9OckfLE4yBkQYWkIG+5Qj4ZJGkODg=="
"resolved" "https://npm.lingdocs.com/@lingdocs%2fpashto-inflector/-/pashto-inflector-3.6.0.tgz"
"version" "3.6.0"
"@lingdocs/pashto-inflector@3.6.2":
"integrity" "sha512-hHvgJPrNAp/ZBvZRsm++X2vFnZyuVrpE54YWPubwk+1Xn+28otoJ34r/OsN4N7eXrXBcxawYsWhf/ot9D987GQ=="
"resolved" "https://npm.lingdocs.com/@lingdocs%2fpashto-inflector/-/pashto-inflector-3.6.2.tgz"
"version" "3.6.2"
dependencies:
"@formkit/auto-animate" "^1.0.0-beta.1"
"classnames" "^2.2.6"