Improve the quality of snippets for search results (#6828)

* Improve the quality of snippets for search results

* Prefer full match

* tweak highlight regex
This commit is contained in:
Tom Moor
2024-04-22 07:40:13 -04:00
committed by GitHub
parent 9855adcd3b
commit 3825bc4181
4 changed files with 176 additions and 51 deletions

View File

@@ -78,7 +78,6 @@
"@sentry/node": "^7.99.0", "@sentry/node": "^7.99.0",
"@sentry/react": "^7.99.0", "@sentry/react": "^7.99.0",
"@tippyjs/react": "^4.2.6", "@tippyjs/react": "^4.2.6",
"@tommoor/remove-markdown": "^0.3.2",
"@types/form-data": "^2.5.0", "@types/form-data": "^2.5.0",
"@types/mailparser": "^3.4.4", "@types/mailparser": "^3.4.4",
"@types/sanitize-filename": "^1.6.3", "@types/sanitize-filename": "^1.6.3",

View File

@@ -1,4 +1,3 @@
import removeMarkdown from "@tommoor/remove-markdown";
import invariant from "invariant"; import invariant from "invariant";
import escapeRegExp from "lodash/escapeRegExp"; import escapeRegExp from "lodash/escapeRegExp";
import find from "lodash/find"; import find from "lodash/find";
@@ -13,6 +12,7 @@ import Share from "@server/models/Share";
import Team from "@server/models/Team"; import Team from "@server/models/Team";
import User from "@server/models/User"; import User from "@server/models/User";
import { sequelize } from "@server/storage/database"; import { sequelize } from "@server/storage/database";
import DocumentHelper from "./DocumentHelper";
type SearchResponse = { type SearchResponse = {
results: { results: {
@@ -52,7 +52,6 @@ type RankedDocument = Document & {
id: string; id: string;
dataValues: Partial<Document> & { dataValues: Partial<Document> & {
searchRanking: number; searchRanking: number;
searchContext: string;
}; };
}; };
@@ -108,12 +107,6 @@ export default class SearchHelper {
), ),
"searchRanking", "searchRanking",
], ],
[
Sequelize.literal(
`ts_headline('english', "text", to_tsquery('english', :query), :headlineOptions)`
),
"searchContext",
],
], ],
replacements: queryReplacements, replacements: queryReplacements,
where, where,
@@ -245,12 +238,6 @@ export default class SearchHelper {
), ),
"searchRanking", "searchRanking",
], ],
[
Sequelize.literal(
`ts_headline('english', "text", to_tsquery('english', :query), :headlineOptions)`
),
"searchContext",
],
], ],
subQuery: false, subQuery: false,
include, include,
@@ -275,7 +262,7 @@ export default class SearchHelper {
// Final query to get associated document data // Final query to get associated document data
const documents = await Document.scope([ const documents = await Document.scope([
"withoutState", "withState",
"withDrafts", "withDrafts",
{ {
method: ["withViews", user.id], method: ["withViews", user.id],
@@ -296,6 +283,39 @@ export default class SearchHelper {
return this.buildResponse(query, results, documents, count); return this.buildResponse(query, results, documents, count);
} }
/**
 * Builds the snippet of document text shown alongside a search result, with
 * occurrences of the search terms wrapped in `<b>` tags.
 *
 * The snippet is cut from the document's plain text around the first match,
 * preferring the first occurrence of the whole query over a match of an
 * individual term. It starts up to 65 characters ahead of that match and
 * covers at most 250 characters of text (highlight tags not counted),
 * breaking on spaces where possible.
 *
 * @param document The document to extract the snippet from.
 * @param query The search query as passed to the search.
 * @returns The snippet with matches wrapped in `<b>` tags. When nothing in
 * the text matches, the unhighlighted start of the text is returned.
 */
private static buildResultContext(document: Document, query: string): string {
  const leadInLength = 65;
  const maxSnippetLength = 250;

  const text = DocumentHelper.toPlainText(document);
  const quotedQueries = Array.from(query.matchAll(/"([^"]*)"/g));

  // The whole query, matched literally. Skipped for blank queries because an
  // empty pattern matches at every position.
  const fullMatchSource = query.trim() ? escapeRegExp(query) : "";

  // Quoted phrases are highlighted verbatim and take precedence; otherwise each
  // remaining word of the query is highlighted as a whole word. Empty tokens
  // (repeated spaces, `""`, queries made only of stop words) are dropped, as
  // they would otherwise produce patterns that match the empty string.
  const termSources = quotedQueries.length
    ? quotedQueries
        .map((match) => match[1] ?? "")
        .filter((phrase) => phrase.trim().length > 0)
        .map((phrase) => escapeRegExp(phrase))
    : this.removeStopWords(query)
        .split(/\s+/)
        .filter((word) => word.length > 0)
        .map((word) => `\\b${escapeRegExp(word)}\\b`);

  const sources = [fullMatchSource, ...termSources].filter(
    (source) => source.length > 0
  );
  const fullMatchRegex = fullMatchSource
    ? new RegExp(fullMatchSource, "i")
    : undefined;
  const highlightRegex = sources.length
    ? new RegExp(sources.join("|"), "gi")
    : undefined;

  // Locate the match to center the snippet on, prefer the first full match.
  const fullMatchIndex = fullMatchRegex ? text.search(fullMatchRegex) : -1;
  const matchIndex =
    fullMatchIndex >= 0
      ? fullMatchIndex
      : highlightRegex
      ? text.search(highlightRegex)
      : -1;

  // Start on the first space after the lead-in offset so the snippet does not
  // begin mid-word. If there is no space before the match, start at the offset
  // itself so that the match always stays inside the snippet.
  const offsetStartIndex = matchIndex - leadInLength;
  let startIndex = 0;
  if (offsetStartIndex > 0) {
    const nextSpace = text.indexOf(" ", offsetStartIndex);
    startIndex =
      nextSpace >= 0 && nextSpace <= matchIndex ? nextSpace : offsetStartIndex;
  }

  // Only truncate when the text actually exceeds the window, and fall back to
  // a hard cut when the window contains no space to break on.
  const maxEndIndex = startIndex + maxSnippetLength;
  let endIndex = text.length;
  if (text.length > maxEndIndex) {
    const lastSpace = text.lastIndexOf(" ", maxEndIndex);
    endIndex = lastSpace > startIndex ? lastSpace : maxEndIndex;
  }

  // Highlight after slicing: the indexes above refer to the plain text, and
  // inserting tags first would shift them and could cut through a tag.
  const snippet = text.slice(startIndex, endIndex);
  return highlightRegex
    ? snippet.replace(highlightRegex, "<b>$&</b>")
    : snippet;
}
private static async buildWhere( private static async buildWhere(
model: User | Team, model: User | Team,
query: string | undefined, query: string | undefined,
@@ -457,33 +477,16 @@ export default class SearchHelper {
documents: Document[], documents: Document[],
count: number count: number
): SearchResponse { ): SearchResponse {
const quotedQueries = Array.from(query.matchAll(/"([^"]*)"/g)).slice(0, 3);
// Regex to highlight quoted queries as ts_headline will not do this by default due to stemming.
const quotedRegex = new RegExp(
quotedQueries.map((match) => escapeRegExp(match[1])).join("|"),
"gi"
);
return { return {
results: map(results, (result) => { results: map(results, (result) => {
let context = removeMarkdown(result.dataValues.searchContext, { const document = find(documents, {
stripHTML: false, id: result.id,
}); }) as Document;
// If there are any quoted queries, highlighting these takes precedence over the default
if (quotedQueries.length) {
context = context
.replace(/<\/?b>/g, "")
.replace(quotedRegex, "<b>$&</b>");
}
return { return {
ranking: result.dataValues.searchRanking, ranking: result.dataValues.searchRanking,
context, context: this.buildResultContext(document, query),
document: find(documents, { document,
id: result.id,
}) as Document,
}; };
}), }),
totalCount: count, totalCount: count,
@@ -537,4 +540,140 @@ export default class SearchHelper {
.replace(/:/g, "\\:") .replace(/:/g, "\\:")
); );
} }
/**
 * Removes common English stop words from a search query so that only the
 * meaningful terms remain.
 *
 * Words are compared case-insensitively, so "The" is removed just like "the".
 * The casing of the remaining words is preserved. Runs of whitespace are
 * collapsed, so the result never contains empty words.
 *
 * @param query The search query to filter.
 * @returns The remaining words joined by single spaces, or an empty string
 * when the query consists only of stop words or whitespace.
 */
private static removeStopWords(query: string): string {
  const stopwords = [
    "i",
    "me",
    "my",
    "myself",
    "we",
    "our",
    "ours",
    "ourselves",
    "you",
    "your",
    "yours",
    "yourself",
    "yourselves",
    "he",
    "him",
    "his",
    "himself",
    "she",
    "her",
    "hers",
    "herself",
    "it",
    "its",
    "itself",
    "they",
    "them",
    "their",
    "theirs",
    "themselves",
    "what",
    "which",
    "who",
    "whom",
    "this",
    "that",
    "these",
    "those",
    "am",
    "is",
    "are",
    "was",
    "were",
    "be",
    "been",
    "being",
    "have",
    "has",
    "had",
    "having",
    "do",
    "does",
    "did",
    "doing",
    "a",
    "an",
    "the",
    "and",
    "but",
    "if",
    "or",
    "because",
    "as",
    "until",
    "while",
    "of",
    "at",
    "by",
    "for",
    "with",
    "about",
    "against",
    "between",
    "into",
    "through",
    "during",
    "before",
    "after",
    "above",
    "below",
    "to",
    "from",
    "up",
    "down",
    "in",
    "out",
    "on",
    "off",
    "over",
    "under",
    "again",
    "further",
    "then",
    "once",
    "here",
    "there",
    "when",
    "where",
    "why",
    "how",
    "all",
    "any",
    "both",
    "each",
    "few",
    "more",
    "most",
    "other",
    "some",
    "such",
    "no",
    "nor",
    "not",
    "only",
    "own",
    "same",
    "so",
    "than",
    "too",
    "very",
    "s",
    "t",
    "can",
    "will",
    "just",
    "don",
    "should",
    "now",
  ];

  return query
    .split(/\s+/)
    .filter(
      (word) => word.length > 0 && !stopwords.includes(word.toLowerCase())
    )
    .join(" ");
}
} }

View File

@@ -1,8 +0,0 @@
/**
 * Ambient type declaration for the "@tommoor/remove-markdown" package,
 * describing its default export as a function from a string to a string.
 */
declare module "@tommoor/remove-markdown" {
/**
 * @param text The input string to process.
 * @param options Optional settings for the call.
 * @returns The processed string.
 */
export default function removeMarkdown(
text: string,
options?: {
// NOTE(review): presumably toggles whether HTML tags are stripped along
// with the markdown syntax — confirm against the package.
stripHTML: boolean;
}
): string;
}

View File

@@ -2958,11 +2958,6 @@
dependencies: dependencies:
tippy.js "^6.3.1" tippy.js "^6.3.1"
"@tommoor/remove-markdown@^0.3.2":
version "0.3.2"
resolved "https://registry.yarnpkg.com/@tommoor/remove-markdown/-/remove-markdown-0.3.2.tgz#5288ddd0e26b6b173e76ebb31c94653b0dcff45d"
integrity "sha1-Uojd0OJraxc+duuzHJRlOw3P9F0= sha512-awcc9hfLZqyyZHOGzAHbnjgZJpQGS1W1oZZ5GXOTTnbKVdKQ4OWYbrRWPUvXI2YAKJazrcS8rxPh67PX3rpGkQ=="
"@tootallnate/once@2": "@tootallnate/once@2":
version "2.0.0" version "2.0.0"
resolved "https://registry.yarnpkg.com/@tootallnate/once/-/once-2.0.0.tgz#f544a148d3ab35801c1f633a7441fd87c2e484bf" resolved "https://registry.yarnpkg.com/@tootallnate/once/-/once-2.0.0.tgz#f544a148d3ab35801c1f633a7441fd87c2e484bf"