Improve the quality of snippets for search results (#6828)
* Improve the quality of snippets for search results * Prefer full match * tweak highlight regex
This commit is contained in:
@@ -78,7 +78,6 @@
|
|||||||
"@sentry/node": "^7.99.0",
|
"@sentry/node": "^7.99.0",
|
||||||
"@sentry/react": "^7.99.0",
|
"@sentry/react": "^7.99.0",
|
||||||
"@tippyjs/react": "^4.2.6",
|
"@tippyjs/react": "^4.2.6",
|
||||||
"@tommoor/remove-markdown": "^0.3.2",
|
|
||||||
"@types/form-data": "^2.5.0",
|
"@types/form-data": "^2.5.0",
|
||||||
"@types/mailparser": "^3.4.4",
|
"@types/mailparser": "^3.4.4",
|
||||||
"@types/sanitize-filename": "^1.6.3",
|
"@types/sanitize-filename": "^1.6.3",
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
import removeMarkdown from "@tommoor/remove-markdown";
|
|
||||||
import invariant from "invariant";
|
import invariant from "invariant";
|
||||||
import escapeRegExp from "lodash/escapeRegExp";
|
import escapeRegExp from "lodash/escapeRegExp";
|
||||||
import find from "lodash/find";
|
import find from "lodash/find";
|
||||||
@@ -13,6 +12,7 @@ import Share from "@server/models/Share";
|
|||||||
import Team from "@server/models/Team";
|
import Team from "@server/models/Team";
|
||||||
import User from "@server/models/User";
|
import User from "@server/models/User";
|
||||||
import { sequelize } from "@server/storage/database";
|
import { sequelize } from "@server/storage/database";
|
||||||
|
import DocumentHelper from "./DocumentHelper";
|
||||||
|
|
||||||
type SearchResponse = {
|
type SearchResponse = {
|
||||||
results: {
|
results: {
|
||||||
@@ -52,7 +52,6 @@ type RankedDocument = Document & {
|
|||||||
id: string;
|
id: string;
|
||||||
dataValues: Partial<Document> & {
|
dataValues: Partial<Document> & {
|
||||||
searchRanking: number;
|
searchRanking: number;
|
||||||
searchContext: string;
|
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -108,12 +107,6 @@ export default class SearchHelper {
|
|||||||
),
|
),
|
||||||
"searchRanking",
|
"searchRanking",
|
||||||
],
|
],
|
||||||
[
|
|
||||||
Sequelize.literal(
|
|
||||||
`ts_headline('english', "text", to_tsquery('english', :query), :headlineOptions)`
|
|
||||||
),
|
|
||||||
"searchContext",
|
|
||||||
],
|
|
||||||
],
|
],
|
||||||
replacements: queryReplacements,
|
replacements: queryReplacements,
|
||||||
where,
|
where,
|
||||||
@@ -245,12 +238,6 @@ export default class SearchHelper {
|
|||||||
),
|
),
|
||||||
"searchRanking",
|
"searchRanking",
|
||||||
],
|
],
|
||||||
[
|
|
||||||
Sequelize.literal(
|
|
||||||
`ts_headline('english', "text", to_tsquery('english', :query), :headlineOptions)`
|
|
||||||
),
|
|
||||||
"searchContext",
|
|
||||||
],
|
|
||||||
],
|
],
|
||||||
subQuery: false,
|
subQuery: false,
|
||||||
include,
|
include,
|
||||||
@@ -275,7 +262,7 @@ export default class SearchHelper {
|
|||||||
|
|
||||||
// Final query to get associated document data
|
// Final query to get associated document data
|
||||||
const documents = await Document.scope([
|
const documents = await Document.scope([
|
||||||
"withoutState",
|
"withState",
|
||||||
"withDrafts",
|
"withDrafts",
|
||||||
{
|
{
|
||||||
method: ["withViews", user.id],
|
method: ["withViews", user.id],
|
||||||
@@ -296,6 +283,39 @@ export default class SearchHelper {
|
|||||||
return this.buildResponse(query, results, documents, count);
|
return this.buildResponse(query, results, documents, count);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Builds a short snippet of the document's text around the first match of
 * the search query, wrapping every matched term in `<b>` tags.
 *
 * The text is taken from `DocumentHelper.toPlainText(document)`. A match of
 * the complete query is preferred as the anchor for the snippet; otherwise
 * the first match of any quoted phrase (or, when there are no quoted
 * phrases, any non stop-word) is used. When nothing matches, the snippet
 * starts at the beginning of the text.
 *
 * NOTE(review): the text is not HTML-escaped before the `<b>` tags are
 * added — confirm that the consumer sanitizes the result before rendering.
 *
 * @param document The document to build a snippet for.
 * @param query The raw search query as entered by the user.
 * @returns The snippet with matches wrapped in `<b>…</b>`.
 */
private static buildResultContext(document: Document, query: string): string {
  // Approximate number of characters kept in front of the first match.
  const leadingChars = 65;
  // Maximum snippet length, measured on the text before highlighting.
  const maxLength = 250;

  const text = DocumentHelper.toPlainText(document);

  // Quoted phrases are highlighted verbatim. Blank phrases are dropped as an
  // empty alternative in the regex would match at every position.
  const quotedPhrases = Array.from(query.matchAll(/"([^"]*)"/g))
    .map((match) => match[1])
    .filter((phrase) => phrase.trim().length > 0);

  const fullMatchSource = escapeRegExp(query.trim());
  const termSources = quotedPhrases.length
    ? quotedPhrases.map((phrase) => escapeRegExp(phrase))
    : this.removeStopWords(query)
        .split(/\s+/)
        // Repeated whitespace would otherwise produce `\b\b`, which matches
        // at every word boundary.
        .filter((word) => word.length > 0)
        .map((word) => `\\b${escapeRegExp(word)}\\b`);

  const sources = [fullMatchSource, ...termSources].filter(
    (source) => source.length > 0
  );
  const highlightRegex = sources.length
    ? new RegExp(sources.join("|"), "gi")
    : undefined;

  // Anchor the snippet on the first full match if possible, otherwise on the
  // first match of any individual term.
  const fullMatchIndex = fullMatchSource
    ? text.search(new RegExp(fullMatchSource, "i"))
    : -1;
  const matchIndex =
    fullMatchIndex >= 0
      ? fullMatchIndex
      : highlightRegex
      ? text.search(highlightRegex)
      : -1;

  // Start a little before the match, on a word boundary when one exists
  // between the offset and the match itself.
  let startIndex = 0;
  if (matchIndex > leadingChars) {
    const offsetStartIndex = matchIndex - leadingChars;
    const nextSpace = text.indexOf(" ", offsetStartIndex);
    startIndex =
      nextSpace >= 0 && nextSpace <= matchIndex ? nextSpace : offsetStartIndex;
  }

  // Only truncate when the remaining text is longer than the window. Prefer
  // cutting at a space after the match, fall back to a hard cut for text
  // without usable spaces.
  const hardEndIndex = startIndex + maxLength;
  let endIndex = text.length;
  if (hardEndIndex < text.length) {
    const lastSpace = text.lastIndexOf(" ", hardEndIndex);
    endIndex =
      lastSpace > Math.max(startIndex, matchIndex) ? lastSpace : hardEndIndex;
  }

  // Highlight after slicing so that the inserted tags cannot shift the
  // window that was computed on the plain text.
  const snippet = text.slice(startIndex, endIndex);
  return highlightRegex
    ? snippet.replace(highlightRegex, "<b>$&</b>")
    : snippet;
}
|
||||||
|
|
||||||
private static async buildWhere(
|
private static async buildWhere(
|
||||||
model: User | Team,
|
model: User | Team,
|
||||||
query: string | undefined,
|
query: string | undefined,
|
||||||
@@ -457,33 +477,16 @@ export default class SearchHelper {
|
|||||||
documents: Document[],
|
documents: Document[],
|
||||||
count: number
|
count: number
|
||||||
): SearchResponse {
|
): SearchResponse {
|
||||||
const quotedQueries = Array.from(query.matchAll(/"([^"]*)"/g)).slice(0, 3);
|
|
||||||
|
|
||||||
// Regex to highlight quoted queries as ts_headline will not do this by default due to stemming.
|
|
||||||
const quotedRegex = new RegExp(
|
|
||||||
quotedQueries.map((match) => escapeRegExp(match[1])).join("|"),
|
|
||||||
"gi"
|
|
||||||
);
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
results: map(results, (result) => {
|
results: map(results, (result) => {
|
||||||
let context = removeMarkdown(result.dataValues.searchContext, {
|
const document = find(documents, {
|
||||||
stripHTML: false,
|
id: result.id,
|
||||||
});
|
}) as Document;
|
||||||
|
|
||||||
// If there are any quoted queries, highlighting these takes precedence over the default
|
|
||||||
if (quotedQueries.length) {
|
|
||||||
context = context
|
|
||||||
.replace(/<\/?b>/g, "")
|
|
||||||
.replace(quotedRegex, "<b>$&</b>");
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
ranking: result.dataValues.searchRanking,
|
ranking: result.dataValues.searchRanking,
|
||||||
context,
|
context: this.buildResultContext(document, query),
|
||||||
document: find(documents, {
|
document,
|
||||||
id: result.id,
|
|
||||||
}) as Document,
|
|
||||||
};
|
};
|
||||||
}),
|
}),
|
||||||
totalCount: count,
|
totalCount: count,
|
||||||
@@ -537,4 +540,140 @@ export default class SearchHelper {
|
|||||||
.replace(/:/g, "\\:")
|
.replace(/:/g, "\\:")
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Removes common English stop words from a search query.
 *
 * Words are separated on any run of whitespace and compared against the
 * stop word list case-insensitively. The remaining words keep their
 * original casing and are joined with a single space.
 *
 * @param query The raw search query.
 * @returns The query without stop words, or an empty string if none remain.
 */
private static removeStopWords(query: string): string {
  const stopwords = new Set([
    "i",
    "me",
    "my",
    "myself",
    "we",
    "our",
    "ours",
    "ourselves",
    "you",
    "your",
    "yours",
    "yourself",
    "yourselves",
    "he",
    "him",
    "his",
    "himself",
    "she",
    "her",
    "hers",
    "herself",
    "it",
    "its",
    "itself",
    "they",
    "them",
    "their",
    "theirs",
    "themselves",
    "what",
    "which",
    "who",
    "whom",
    "this",
    "that",
    "these",
    "those",
    "am",
    "is",
    "are",
    "was",
    "were",
    "be",
    "been",
    "being",
    "have",
    "has",
    "had",
    "having",
    "do",
    "does",
    "did",
    "doing",
    "a",
    "an",
    "the",
    "and",
    "but",
    "if",
    "or",
    "because",
    "as",
    "until",
    "while",
    "of",
    "at",
    "by",
    "for",
    "with",
    "about",
    "against",
    "between",
    "into",
    "through",
    "during",
    "before",
    "after",
    "above",
    "below",
    "to",
    "from",
    "up",
    "down",
    "in",
    "out",
    "on",
    "off",
    "over",
    "under",
    "again",
    "further",
    "then",
    "once",
    "here",
    "there",
    "when",
    "where",
    "why",
    "how",
    "all",
    "any",
    "both",
    "each",
    "few",
    "more",
    "most",
    "other",
    "some",
    "such",
    "no",
    "nor",
    "not",
    "only",
    "own",
    "same",
    "so",
    "than",
    "too",
    "very",
    "s",
    "t",
    "can",
    "will",
    "just",
    "don",
    "should",
    "now",
  ]);

  return (
    query
      .split(/\s+/)
      // Drop the empty tokens produced by leading or trailing whitespace and
      // compare in lower case so that "The" is treated the same as "the".
      .filter((word) => word.length > 0 && !stopwords.has(word.toLowerCase()))
      .join(" ")
  );
}
|
||||||
}
|
}
|
||||||
|
|||||||
8
server/typings/tommoor__remove-markdown.d.ts
vendored
8
server/typings/tommoor__remove-markdown.d.ts
vendored
@@ -1,8 +0,0 @@
|
|||||||
declare module "@tommoor/remove-markdown" {
|
|
||||||
export default function removeMarkdown(
|
|
||||||
text: string,
|
|
||||||
options?: {
|
|
||||||
stripHTML: boolean;
|
|
||||||
}
|
|
||||||
): string;
|
|
||||||
}
|
|
||||||
@@ -2958,11 +2958,6 @@
|
|||||||
dependencies:
|
dependencies:
|
||||||
tippy.js "^6.3.1"
|
tippy.js "^6.3.1"
|
||||||
|
|
||||||
"@tommoor/remove-markdown@^0.3.2":
|
|
||||||
version "0.3.2"
|
|
||||||
resolved "https://registry.yarnpkg.com/@tommoor/remove-markdown/-/remove-markdown-0.3.2.tgz#5288ddd0e26b6b173e76ebb31c94653b0dcff45d"
|
|
||||||
integrity "sha1-Uojd0OJraxc+duuzHJRlOw3P9F0= sha512-awcc9hfLZqyyZHOGzAHbnjgZJpQGS1W1oZZ5GXOTTnbKVdKQ4OWYbrRWPUvXI2YAKJazrcS8rxPh67PX3rpGkQ=="
|
|
||||||
|
|
||||||
"@tootallnate/once@2":
|
"@tootallnate/once@2":
|
||||||
version "2.0.0"
|
version "2.0.0"
|
||||||
resolved "https://registry.yarnpkg.com/@tootallnate/once/-/once-2.0.0.tgz#f544a148d3ab35801c1f633a7441fd87c2e484bf"
|
resolved "https://registry.yarnpkg.com/@tootallnate/once/-/once-2.0.0.tgz#f544a148d3ab35801c1f633a7441fd87c2e484bf"
|
||||||
|
|||||||
Reference in New Issue
Block a user