From c6408f7b3f055b575dd047a0a8a07da3e53c6a51 Mon Sep 17 00:00:00 2001
From: Tom Moor
Date: Tue, 2 Jul 2024 19:33:30 -0400
Subject: [PATCH] fix: CJK content results in long context strings in search results

closes #7183
---
 app/components/DocumentListItem.tsx        |  2 +
 server/models/helpers/SearchHelper.test.ts |  2 +-
 server/models/helpers/SearchHelper.ts      | 28 +++++++++-
 shared/utils/string.ts                     | 65 ++++++++++++++++++++++
 4 files changed, 94 insertions(+), 3 deletions(-)
 create mode 100644 shared/utils/string.ts

diff --git a/app/components/DocumentListItem.tsx b/app/components/DocumentListItem.tsx
index b797c485e..a33ec43f4 100644
--- a/app/components/DocumentListItem.tsx
+++ b/app/components/DocumentListItem.tsx
@@ -275,6 +275,8 @@ const ResultContext = styled(Highlight)`
   font-size: 15px;
   margin-top: -0.25em;
   margin-bottom: 0.25em;
+  max-height: 90px;
+  overflow: hidden;
 `;
 
 export default observer(React.forwardRef(DocumentListItem));
diff --git a/server/models/helpers/SearchHelper.test.ts b/server/models/helpers/SearchHelper.test.ts
index 4976fdeed..dd54dcfa9 100644
--- a/server/models/helpers/SearchHelper.test.ts
+++ b/server/models/helpers/SearchHelper.test.ts
@@ -465,7 +465,7 @@ describe("SearchHelper", () => {
       expect(totalCount).toBe(0);
     });
 
-    test("should find extact phrases", async () => {
+    test("should find exact phrases", async () => {
       const team = await buildTeam();
       const user = await buildUser({ teamId: team.id });
       const collection = await buildCollection({
diff --git a/server/models/helpers/SearchHelper.ts b/server/models/helpers/SearchHelper.ts
index 796c75610..c24c451ee 100644
--- a/server/models/helpers/SearchHelper.ts
+++ b/server/models/helpers/SearchHelper.ts
@@ -5,6 +5,7 @@ import map from "lodash/map";
 import queryParser from "pg-tsquery";
 import { Op, Sequelize, WhereOptions } from "sequelize";
 import { DateFilter, StatusFilter } from "@shared/types";
+import { regexIndexOf, regexLastIndexOf } from "@shared/utils/string";
 import { getUrls } from "@shared/utils/urls";
 import Collection from "@server/models/Collection";
 import Document from "@server/models/Document";
@@ -304,16 +305,39 @@ export default class SearchHelper {
       "gi"
     );
 
+    // Characters the context may be cut at, including CJK punctuation
+    const breakChars = [
+      " ",
+      ".",
+      ",",
+      `"`,
+      "'",
+      "\n",
+      "。",
+      "！",
+      "？",
+      "!",
+      "?",
+      "…",
+    ];
+    const breakCharsRegex = new RegExp(`[${breakChars.join("")}]`, "g");
+
     // chop text around the first match, prefer the first full match if possible.
     const fullMatchIndex = text.search(fullMatchRegex);
     const offsetStartIndex =
       (fullMatchIndex >= 0 ? fullMatchIndex : text.search(highlightRegex)) - 65;
     const startIndex = Math.max(
       0,
-      offsetStartIndex <= 0 ? 0 : text.indexOf(" ", offsetStartIndex)
+      offsetStartIndex <= 0
+        ? 0
+        : regexIndexOf(text, breakCharsRegex, offsetStartIndex)
     );
     const context = text.replace(highlightRegex, "<b>$&</b>");
-    const endIndex = context.lastIndexOf(" ", startIndex + 250);
+    const endIndex = regexLastIndexOf(
+      context,
+      breakCharsRegex,
+      startIndex + 250
+    );
 
     return context.slice(startIndex, endIndex);
   }
diff --git a/shared/utils/string.ts b/shared/utils/string.ts
new file mode 100644
index 000000000..3079bb681
--- /dev/null
+++ b/shared/utils/string.ts
@@ -0,0 +1,65 @@
+/**
+ * Creates a private copy of a regular expression that is suitable for scanning
+ * a string from an arbitrary position. All flags of the original are kept so
+ * that `i`, `m`, `s`, `u` etc. keep their meaning, `g` is added so `lastIndex`
+ * is honoured, and `y` is dropped because a sticky expression can only match
+ * exactly at `lastIndex`. Using a copy leaves the caller's `lastIndex` alone.
+ *
+ * @param re The regular expression to copy.
+ * @returns A global, non-sticky regular expression with the same source.
+ */
+const toScanningRegex = (re: RegExp): RegExp =>
+  new RegExp(re.source, `${re.flags.replace(/[gy]/g, "")}g`);
+
+/**
+ * Returns the index of the first occurrence of a substring in a string that matches a regular expression.
+ *
+ * @param text The string to search in.
+ * @param re The regular expression to search for, it is not modified.
+ * @param startPos The position in the string at which to begin the search. Defaults to 0.
+ * @returns The index of the first match at or after startPos, or -1 if there is none.
+ */
+export const regexIndexOf = function (
+  text: string,
+  re: RegExp,
+  startPos?: number
+): number {
+  const matcher = toScanningRegex(re);
+  matcher.lastIndex = startPos || 0;
+
+  const match = matcher.exec(text);
+  return match ? match.index : -1;
+};
+
+/**
+ * Returns the index of the last occurrence of a substring in a string that matches a regular expression.
+ *
+ * @param text The string to search in.
+ * @param re The regular expression to search for, it is not modified.
+ * @param startPos The position in the string at which to begin the search. Defaults to the end of the string.
+ * @returns The index of the last match starting at or before startPos, or -1 if there is none (check before slicing).
+ */
+export const regexLastIndexOf = function (
+  text: string,
+  re: RegExp,
+  startPos?: number
+): number {
+  const limit = startPos === undefined ? text.length : startPos;
+  const matcher = toScanningRegex(re);
+
+  let lastSuccess = -1;
+  for (let pos = 0; pos <= limit; pos++) {
+    matcher.lastIndex = pos;
+
+    const match = matcher.exec(text);
+    if (!match || match.index > limit) {
+      break;
+    }
+
+    // Resume one past the start of this match so overlapping matches count.
+    pos = match.index;
+    lastSuccess = pos;
+  }
+
+  return lastSuccess;
+};