fix: CJK content results in long context strings in search results

closes #7183
This commit is contained in:
Tom Moor
2024-07-02 19:33:30 -04:00
parent 18f729b970
commit c6408f7b3f
4 changed files with 94 additions and 3 deletions

View File

@@ -275,6 +275,8 @@ const ResultContext = styled(Highlight)`
font-size: 15px;
margin-top: -0.25em;
margin-bottom: 0.25em;
max-height: 90px;
overflow: hidden;
`;
export default observer(React.forwardRef(DocumentListItem));

View File

@@ -465,7 +465,7 @@ describe("SearchHelper", () => {
expect(totalCount).toBe(0);
});
test("should find extact phrases", async () => {
test("should find exact phrases", async () => {
const team = await buildTeam();
const user = await buildUser({ teamId: team.id });
const collection = await buildCollection({

View File

@@ -5,6 +5,7 @@ import map from "lodash/map";
import queryParser from "pg-tsquery";
import { Op, Sequelize, WhereOptions } from "sequelize";
import { DateFilter, StatusFilter } from "@shared/types";
import { regexIndexOf, regexLastIndexOf } from "@shared/utils/string";
import { getUrls } from "@shared/utils/urls";
import Collection from "@server/models/Collection";
import Document from "@server/models/Document";
@@ -304,16 +305,39 @@ export default class SearchHelper {
"gi"
);
// Breaking characters
const breakChars = [
" ",
".",
",",
`"`,
"'",
"\n",
"。",
"",
"",
"!",
"?",
"…",
];
const breakCharsRegex = new RegExp(`[${breakChars.join("")}]`, "g");
// chop text around the first match, prefer the first full match if possible.
const fullMatchIndex = text.search(fullMatchRegex);
const offsetStartIndex =
(fullMatchIndex >= 0 ? fullMatchIndex : text.search(highlightRegex)) - 65;
const startIndex = Math.max(
0,
offsetStartIndex <= 0 ? 0 : text.indexOf(" ", offsetStartIndex)
offsetStartIndex <= 0
? 0
: regexIndexOf(text, breakCharsRegex, offsetStartIndex)
);
const context = text.replace(highlightRegex, "<b>$&</b>");
const endIndex = context.lastIndexOf(" ", startIndex + 250);
const endIndex = regexLastIndexOf(
context,
breakCharsRegex,
startIndex + 250
);
return context.slice(startIndex, endIndex);
}

65
shared/utils/string.ts Normal file
View File

@@ -0,0 +1,65 @@
/**
 * Returns the index of the first match of a regular expression in a string,
 * searching forward from a given position.
 *
 * The search runs on a private copy of `re`, so the caller's regular expression
 * (including its `lastIndex`) is never modified. All flags of `re` are honoured
 * (e.g. `i`, `m`, `s`, `u`), except that the copy is always global and never
 * sticky so that it scans forward from `startPos`.
 *
 * @param text The string to search in.
 * @param re The regular expression to search for.
 * @param startPos The position in the string at which to begin the search. Defaults to 0.
 * @returns The index of the first match at or after `startPos`, or -1 if there is none.
 */
export const regexIndexOf = function (
  text: string,
  re: RegExp,
  startPos?: number
): number {
  // Clone rather than reuse: keeps every semantic flag, forces "g" so that
  // lastIndex is respected, and drops "y" which would pin the match to startPos.
  const matcher = new RegExp(re.source, re.flags.replace(/[gy]/g, "") + "g");
  matcher.lastIndex = startPos ?? 0;

  const match = matcher.exec(text);
  return match ? match.index : -1;
};
/**
 * Returns the index of the last match of a regular expression in a string that
 * starts at or before a given position.
 *
 * The search runs on a private copy of `re`, so the caller's regular expression
 * (including its `lastIndex`) is never modified. All flags of `re` are honoured
 * (e.g. `i`, `m`, `s`, `u`), except that the copy is always global and never
 * sticky. Overlapping matches are considered: after each match the scan resumes
 * one character after the start of that match.
 *
 * @param text The string to search in.
 * @param re The regular expression to search for.
 * @param startPos The highest index at which a match may start. Defaults to the end of the string.
 * @returns The index of the last match starting at or before `startPos`, or -1 if there is none.
 */
export const regexLastIndexOf = function (
  text: string,
  re: RegExp,
  startPos?: number
): number {
  const limit = startPos === undefined ? text.length : startPos;

  // Clone rather than reuse: keeps every semantic flag, forces "g" so that
  // lastIndex is respected, and drops "y" so the scan is not pinned in place.
  const flags = re.flags.replace(/[gy]/g, "") + "g";
  const matcher = new RegExp(re.source, flags);
  const fullUnicode = flags.includes("u") || flags.includes("v");

  let lastSuccess = -1;
  let pos = 0;

  while (pos <= limit) {
    matcher.lastIndex = pos;
    const match = matcher.exec(text);
    if (!match || match.index > limit) {
      break;
    }
    lastSuccess = match.index;

    // Resume just past the start of this match. In unicode mode step over a
    // whole code point: resuming inside a surrogate pair can make the engine
    // re-align to the pair's start and report the same match forever.
    const codePoint = text.codePointAt(match.index);
    const step =
      fullUnicode && codePoint !== undefined && codePoint > 0xffff ? 2 : 1;
    pos = match.index + step;
  }

  return lastSuccess;
};