fix: CJK content results in long context strings in search results

closes #7183
2024-07-02 19:33:30 -04:00
parent 18f729b970
commit c6408f7b3f
4 changed files with 94 additions and 3 deletions
--- a/app/components/DocumentListItem.tsx
+++ b/app/components/DocumentListItem.tsx
@@ -275,6 +275,8 @@ const ResultContext = styled(Highlight)`
  font-size: 15px;
  margin-top: -0.25em;
  margin-bottom: 0.25em;
  max-height: 90px;
  overflow: hidden;
 `;
 export default observer(React.forwardRef(DocumentListItem));
--- a/server/models/helpers/SearchHelper.test.ts
+++ b/server/models/helpers/SearchHelper.test.ts
@@ -465,7 +465,7 @@ describe("SearchHelper", () => {
      expect(totalCount).toBe(0);
    });
-    test("should find extact phrases", async () => {
+    test("should find exact phrases", async () => {
      const team = await buildTeam();
      const user = await buildUser({ teamId: team.id });
      const collection = await buildCollection({
--- a/server/models/helpers/SearchHelper.ts
+++ b/server/models/helpers/SearchHelper.ts
@@ -5,6 +5,7 @@ import map from "lodash/map";
 import queryParser from "pg-tsquery";
 import { Op, Sequelize, WhereOptions } from "sequelize";
 import { DateFilter, StatusFilter } from "@shared/types";
 import { regexIndexOf, regexLastIndexOf } from "@shared/utils/string";
 import { getUrls } from "@shared/utils/urls";
 import Collection from "@server/models/Collection";
 import Document from "@server/models/Document";
@@ -304,16 +305,39 @@ export default class SearchHelper {
      "gi"
    );
    // Breaking characters
    const breakChars = [
      " ",
      ".",
      ",",
      `"`,
      "'",
      "\n",
      "。",
      "！",
      "？",
      "!",
      "?",
      "…",
    ];
    const breakCharsRegex = new RegExp(`[${breakChars.join("")}]`, "g");
    // chop text around the first match, prefer the first full match if possible.
    const fullMatchIndex = text.search(fullMatchRegex);
    const offsetStartIndex =
      (fullMatchIndex >= 0 ? fullMatchIndex : text.search(highlightRegex)) - 65;
    const startIndex = Math.max(
      0,
-      offsetStartIndex <= 0 ? 0 : text.indexOf(" ", offsetStartIndex)
+      offsetStartIndex <= 0
        ? 0
        : regexIndexOf(text, breakCharsRegex, offsetStartIndex)
    );
    const context = text.replace(highlightRegex, "<b>$&</b>");
-    const endIndex = context.lastIndexOf(" ", startIndex + 250);
+    const endIndex = regexLastIndexOf(
      context,
      breakCharsRegex,
      startIndex + 250
    );
    return context.slice(startIndex, endIndex);
  }
--- a/shared/utils/string.ts
+++ b/shared/utils/string.ts
@@ -0,0 +1,65 @@
 /**
  * Returns the index of the first match of a regular expression in a string,
  * searching forward from a given position.
  *
  * The search runs on a private copy of the expression, so the caller's
  * `lastIndex` is never modified. All flags of the supplied expression are
  * kept (e.g. `i`, `m`, `s`, `u`), except the sticky flag `y`, which is removed
  * because it would restrict matching to exactly `startPos`.
  *
  * @param text The string to search in.
  * @param re The regular expression to search for.
  * @param startPos The position in the string at which to begin the search. Defaults to 0.
  * @returns The index of the first match at or after `startPos`, or -1 if there is none.
  */
 export const regexIndexOf = function (
   text: string,
   re: RegExp,
   startPos?: number
 ): number {
   // exec() only honours lastIndex on a global (or sticky) expression, so the
   // copy always carries "g"; every other flag is taken over verbatim.
   const flags = re.flags.replace(/[gy]/g, "") + "g";
   const matcher = new RegExp(re.source, flags);
   matcher.lastIndex = startPos ?? 0;

   const match = matcher.exec(text);
   return match ? match.index : -1;
 };
 /**
  * Returns the index of the last match of a regular expression in a string
  * whose start position is not greater than a given position.
  *
  * The search runs on a private copy of the expression, so the caller's
  * `lastIndex` is never modified. All flags of the supplied expression are
  * kept (e.g. `i`, `m`, `s`, `u`), except the sticky flag `y`, which is removed
  * because it would prevent scanning forward through the text.
  *
  * Overlapping matches are considered: after each match the scan resumes one
  * character after the start of that match, not after its end.
  *
  * @param text The string to search in.
  * @param re The regular expression to search for.
  * @param startPos The highest index at which a match may start. Defaults to the end of the string.
  * @returns The start index of the last qualifying match, or -1 if there is none.
  */
 export const regexLastIndexOf = function (
   text: string,
   re: RegExp,
   startPos?: number
 ): number {
   const limit = startPos === undefined ? text.length : startPos;

   // exec() only honours lastIndex on a global (or sticky) expression, so the
   // copy always carries "g"; every other flag is taken over verbatim.
   const flags = re.flags.replace(/[gy]/g, "") + "g";
   const matcher = new RegExp(re.source, flags);
   const fullUnicode = /[uv]/.test(flags);

   let lastSuccess = -1;
   let match = matcher.exec(text);

   // `index <= limit` is false for a NaN or negative limit, yielding -1.
   while (match !== null && match.index <= limit) {
     lastSuccess = match.index;

     // Resume just past the start of this match. In Unicode mode step over a
     // whole code point so the next search never begins inside a surrogate
     // pair, which could otherwise report the same match again.
     const step =
       fullUnicode && (text.codePointAt(match.index) ?? 0) > 0xffff ? 2 : 1;
     matcher.lastIndex = match.index + step;
     match = matcher.exec(text);
   }

   return lastSuccess;
 };