From 3825bc4181d731b419cd362d63ef70b48525d718 Mon Sep 17 00:00:00 2001 From: Tom Moor Date: Mon, 22 Apr 2024 07:40:13 -0400 Subject: [PATCH] Improve the quality of snippets for search results (#6828) * Improve the quality of snippets for search results * Prefer full match * tweak highlight regex --- package.json | 1 - server/models/helpers/SearchHelper.ts | 213 +++++++++++++++---- server/typings/tommoor__remove-markdown.d.ts | 8 - yarn.lock | 5 - 4 files changed, 176 insertions(+), 51 deletions(-) delete mode 100644 server/typings/tommoor__remove-markdown.d.ts diff --git a/package.json b/package.json index e3d6e9a28..95cb2404c 100644 --- a/package.json +++ b/package.json @@ -78,7 +78,6 @@ "@sentry/node": "^7.99.0", "@sentry/react": "^7.99.0", "@tippyjs/react": "^4.2.6", - "@tommoor/remove-markdown": "^0.3.2", "@types/form-data": "^2.5.0", "@types/mailparser": "^3.4.4", "@types/sanitize-filename": "^1.6.3", diff --git a/server/models/helpers/SearchHelper.ts b/server/models/helpers/SearchHelper.ts index ca5f3a32e..429faa7a8 100644 --- a/server/models/helpers/SearchHelper.ts +++ b/server/models/helpers/SearchHelper.ts @@ -1,4 +1,3 @@ -import removeMarkdown from "@tommoor/remove-markdown"; import invariant from "invariant"; import escapeRegExp from "lodash/escapeRegExp"; import find from "lodash/find"; @@ -13,6 +12,7 @@ import Share from "@server/models/Share"; import Team from "@server/models/Team"; import User from "@server/models/User"; import { sequelize } from "@server/storage/database"; +import DocumentHelper from "./DocumentHelper"; type SearchResponse = { results: { @@ -52,7 +52,6 @@ type RankedDocument = Document & { id: string; dataValues: Partial<Document> & { searchRanking: number; - searchContext: string; }; }; @@ -108,12 +107,6 @@ export default class SearchHelper { ), "searchRanking", ], - [ - Sequelize.literal( - `ts_headline('english', "text", to_tsquery('english', :query), :headlineOptions)` - ), - "searchContext", - ], ], replacements: queryReplacements, 
where, @@ -245,12 +238,6 @@ export default class SearchHelper { ), "searchRanking", ], - [ - Sequelize.literal( - `ts_headline('english', "text", to_tsquery('english', :query), :headlineOptions)` - ), - "searchContext", - ], ], subQuery: false, include, @@ -275,7 +262,7 @@ export default class SearchHelper { // Final query to get associated document data const documents = await Document.scope([ - "withoutState", + "withState", "withDrafts", { method: ["withViews", user.id], @@ -296,6 +283,39 @@ export default class SearchHelper { return this.buildResponse(query, results, documents, count); } + private static buildResultContext(document: Document, query: string) { + const quotedQueries = Array.from(query.matchAll(/"([^"]*)"/g)); + const text = DocumentHelper.toPlainText(document); + + // Regex to highlight quoted queries as ts_headline will not do this by default due to stemming. + const fullMatchRegex = new RegExp(escapeRegExp(query), "i"); + const highlightRegex = new RegExp( + [ + fullMatchRegex.source, + ...(quotedQueries.length + ? quotedQueries.map((match) => escapeRegExp(match[1])) + : this.removeStopWords(query) + .trim() + .split(" ") + .map((match) => `\\b${escapeRegExp(match)}\\b`)), + ].join("|"), + "gi" + ); + + // chop text around the first match, prefer the first full match if possible. + const fullMatchIndex = text.search(fullMatchRegex); + const offsetStartIndex = + (fullMatchIndex >= 0 ? fullMatchIndex : text.search(highlightRegex)) - 65; + const startIndex = Math.max( + 0, + offsetStartIndex <= 0 ? 
0 : text.indexOf(" ", offsetStartIndex) + ); + const context = text.replace(highlightRegex, "<b>$&</b>"); + const endIndex = context.lastIndexOf(" ", startIndex + 250); + + return context.slice(startIndex, endIndex); + } + private static async buildWhere( model: User | Team, query: string | undefined, @@ -457,33 +477,16 @@ export default class SearchHelper { documents: Document[], count: number ): SearchResponse { - const quotedQueries = Array.from(query.matchAll(/"([^"]*)"/g)).slice(0, 3); - - // Regex to highlight quoted queries as ts_headline will not do this by default due to stemming. - const quotedRegex = new RegExp( - quotedQueries.map((match) => escapeRegExp(match[1])).join("|"), - "gi" - ); - return { results: map(results, (result) => { - let context = removeMarkdown(result.dataValues.searchContext, { - stripHTML: false, - }); - - // If there are any quoted queries, highlighting these takes precedence over the default - if (quotedQueries.length) { - context = context - .replace(/<\/?b>/g, "") - .replace(quotedRegex, "<b>$&</b>"); - } + const document = find(documents, { + id: result.id, + }) as Document; return { ranking: result.dataValues.searchRanking, - context, - document: find(documents, { - id: result.id, - }) as Document, + context: this.buildResultContext(document, query), + document, }; }), totalCount: count, @@ -537,4 +540,140 @@ export default class SearchHelper { .replace(/:/g, "\\:") ); } + + private static removeStopWords(query: string): string { + const stopwords = [ + "i", + "me", + "my", + "myself", + "we", + "our", + "ours", + "ourselves", + "you", + "your", + "yours", + "yourself", + "yourselves", + "he", + "him", + "his", + "himself", + "she", + "her", + "hers", + "herself", + "it", + "its", + "itself", + "they", + "them", + "their", + "theirs", + "themselves", + "what", + "which", + "who", + "whom", + "this", + "that", + "these", + "those", + "am", + "is", + "are", + "was", + "were", + "be", + "been", + "being", + "have", + "has", + "had", + 
"having", + "do", + "does", + "did", + "doing", + "a", + "an", + "the", + "and", + "but", + "if", + "or", + "because", + "as", + "until", + "while", + "of", + "at", + "by", + "for", + "with", + "about", + "against", + "between", + "into", + "through", + "during", + "before", + "after", + "above", + "below", + "to", + "from", + "up", + "down", + "in", + "out", + "on", + "off", + "over", + "under", + "again", + "further", + "then", + "once", + "here", + "there", + "when", + "where", + "why", + "how", + "all", + "any", + "both", + "each", + "few", + "more", + "most", + "other", + "some", + "such", + "no", + "nor", + "not", + "only", + "own", + "same", + "so", + "than", + "too", + "very", + "s", + "t", + "can", + "will", + "just", + "don", + "should", + "now", + ]; + return query + .split(" ") + .filter((word) => !stopwords.includes(word)) + .join(" "); + } } diff --git a/server/typings/tommoor__remove-markdown.d.ts b/server/typings/tommoor__remove-markdown.d.ts deleted file mode 100644 index 03256319d..000000000 --- a/server/typings/tommoor__remove-markdown.d.ts +++ /dev/null @@ -1,8 +0,0 @@ -declare module "@tommoor/remove-markdown" { - export default function removeMarkdown( - text: string, - options?: { - stripHTML: boolean; - } - ): string; -} diff --git a/yarn.lock b/yarn.lock index a9991b5e3..2b4bbdf90 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2958,11 +2958,6 @@ dependencies: tippy.js "^6.3.1" -"@tommoor/remove-markdown@^0.3.2": - version "0.3.2" - resolved "https://registry.yarnpkg.com/@tommoor/remove-markdown/-/remove-markdown-0.3.2.tgz#5288ddd0e26b6b173e76ebb31c94653b0dcff45d" - integrity "sha1-Uojd0OJraxc+duuzHJRlOw3P9F0= sha512-awcc9hfLZqyyZHOGzAHbnjgZJpQGS1W1oZZ5GXOTTnbKVdKQ4OWYbrRWPUvXI2YAKJazrcS8rxPh67PX3rpGkQ==" - "@tootallnate/once@2": version "2.0.0" resolved "https://registry.yarnpkg.com/@tootallnate/once/-/once-2.0.0.tgz#f544a148d3ab35801c1f633a7441fd87c2e484bf"