// File-viewer header residue: 712 lines · 17 KiB · TypeScript
import invariant from "invariant";
|
||
import escapeRegExp from "lodash/escapeRegExp";
|
||
import find from "lodash/find";
|
||
import map from "lodash/map";
|
||
import queryParser from "pg-tsquery";
|
||
import { Op, Sequelize, WhereOptions } from "sequelize";
|
||
import { DateFilter, StatusFilter } from "@shared/types";
|
||
import { regexIndexOf, regexLastIndexOf } from "@shared/utils/string";
|
||
import { getUrls } from "@shared/utils/urls";
|
||
import Collection from "@server/models/Collection";
|
||
import Document from "@server/models/Document";
|
||
import Share from "@server/models/Share";
|
||
import Team from "@server/models/Team";
|
||
import User from "@server/models/User";
|
||
import { sequelize } from "@server/storage/database";
|
||
import { DocumentHelper } from "./DocumentHelper";
|
||
|
||
/**
 * Shape returned by the full-text search entry points.
 */
type SearchResponse = {
  /** One entry per matched document on the requested page. */
  results: Array<{
    /** The search ranking, used for sorting results */
    ranking: number;
    /** A snippet of contextual text surrounding the match */
    context: string;
    /** The matched document */
    document: Document;
  }>;
  /** The total number of matches for the query, ignoring pagination */
  totalCount: number;
};
|
||
|
||
/**
 * Optional filters and pagination settings accepted by the search helpers.
 */
type SearchOptions = {
  /** Pagination: maximum number of rows to return */
  limit?: number;
  /** Pagination: number of rows to skip */
  offset?: number;
  /**
   * Restrict results to one collection. Authorization is presumed to have
   * been done before the value is passed to this helper.
   */
  collectionId?: string | null;
  /** Restrict results to a shared document. */
  share?: Share;
  /** Restrict results to a date range. */
  dateFilter?: DateFilter;
  /** Statuses of the documents to return */
  statusFilter?: StatusFilter[];
  /** Restrict results to the listed documents. */
  documentIds?: string[];
  /** Restrict results to documents the listed users collaborated on. */
  collaboratorIds?: string[];
  /** Minimum number of words in the contextual snippet */
  snippetMinWords?: number;
  /** Maximum number of words in the contextual snippet */
  snippetMaxWords?: number;
};
|
||
|
||
/**
 * A document row as returned by the ranking query: the model instance plus
 * the computed `searchRanking` column exposed through `dataValues`.
 */
type RankedDocument = Document & {
  id: string;
  dataValues: Partial<Document> & { searchRanking: number };
};
|
||
|
||
export default class SearchHelper {
  /**
   * The maximum length of a search query.
   */
  public static maxQueryLength = 1000;

  /**
   * English stop words that are dropped before building per-word highlight
   * patterns. Entries are lower case; lookups lower-case the candidate word.
   */
  // prettier-ignore
  private static readonly stopWords: ReadonlySet<string> = new Set([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
    "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself",
    "she", "her", "hers", "herself", "it", "its", "itself", "they", "them",
    "their", "theirs", "themselves", "what", "which", "who", "whom", "this",
    "that", "these", "those", "am", "is", "are", "was", "were", "be", "been",
    "being", "have", "has", "had", "having", "do", "does", "did", "doing",
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until",
    "while", "of", "at", "by", "for", "with", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below", "to",
    "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
    "further", "then", "once", "here", "there", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other", "some",
    "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
    "very", "s", "t", "can", "will", "just", "don", "should", "now",
  ]);

  /**
   * Full-text search over the documents of a team.
   *
   * `StatusFilter.Published` is always appended to the requested status
   * filter. When `options.share` has `includeChildDocuments` set, results are
   * additionally restricted to the shared document and its non-archived
   * children.
   *
   * @param team The team to search within
   * @param query The user search query
   * @param options Filters, pagination and snippet settings
   * @returns The ranked page of results and the unpaginated total
   */
  public static async searchForTeam(
    team: Team,
    query: string,
    options: SearchOptions = {}
  ): Promise<SearchResponse> {
    const {
      snippetMinWords = 20,
      snippetMaxWords = 30,
      limit = 15,
      offset = 0,
    } = options;

    const where = await this.buildWhere(team, query, {
      ...options,
      statusFilter: [...(options.statusFilter ?? []), StatusFilter.Published],
    });

    if (options.share?.includeChildDocuments) {
      const sharedDocument = await options.share.$get("document");
      invariant(sharedDocument, "Cannot find document for share");

      const childDocumentIds = await sharedDocument.findAllChildDocumentIds({
        archivedAt: {
          [Op.is]: null,
        },
      });

      where[Op.and].push({
        id: [sharedDocument.id, ...childDocumentIds],
      });
    }

    const queryReplacements = this.buildQueryReplacements(
      query,
      snippetMinWords,
      snippetMaxWords
    );

    // First pass: ids + ranking only, so that ordering and pagination are
    // applied on the lightweight query.
    const resultsQuery = Document.unscoped().findAll({
      attributes: [
        "id",
        [
          Sequelize.literal(
            `ts_rank("searchVector", to_tsquery('english', :query))`
          ),
          "searchRanking",
        ],
      ],
      replacements: queryReplacements,
      where,
      order: [
        ["searchRanking", "DESC"],
        ["updatedAt", "DESC"],
      ],
      limit,
      offset,
    }) as any as Promise<RankedDocument[]>;

    const countQuery = Document.unscoped().count({
      // @ts-expect-error Types are incorrect for count
      replacements: queryReplacements,
      where,
    }) as any as Promise<number>;
    const [results, count] = await Promise.all([resultsQuery, countQuery]);

    // Final query to get associated document data
    const documents = await Document.findAll({
      where: {
        id: map(results, "id"),
        teamId: team.id,
      },
      include: [
        {
          model: Collection,
          as: "collection",
        },
      ],
    });

    return this.buildResponse(query, results, documents, count);
  }

  /**
   * Case-insensitive substring search over document titles visible to a user.
   *
   * The query is matched literally: LIKE metacharacters typed by the user are
   * escaped before being wrapped in `%…%`.
   *
   * @param user The user performing the search
   * @param query The text to look for in titles
   * @param options Filters and pagination settings
   * @returns Matching documents, most recently updated first
   */
  public static async searchTitlesForUser(
    user: User,
    query: string,
    options: SearchOptions = {}
  ): Promise<Document[]> {
    const { limit = 15, offset = 0 } = options;
    const where = await this.buildWhere(user, undefined, options);

    where[Op.and].push({
      title: {
        [Op.iLike]: `%${this.escapeLikePattern(query)}%`,
      },
    });

    const include = [
      {
        association: "memberships",
        where: {
          userId: user.id,
        },
        required: false,
        separate: false,
      },
      {
        model: User,
        as: "createdBy",
        paranoid: false,
      },
      {
        model: User,
        as: "updatedBy",
        paranoid: false,
      },
    ];

    return Document.scope([
      "withoutState",
      "withDrafts",
      {
        method: ["withViews", user.id],
      },
      {
        method: ["withCollectionPermissions", user.id],
      },
      {
        method: ["withMembership", user.id],
      },
    ]).findAll({
      where,
      subQuery: false,
      order: [["updatedAt", "DESC"]],
      include,
      offset,
      limit,
    });
  }

  /**
   * Full-text search over the documents a user can access.
   *
   * @param user The user performing the search
   * @param query The user search query
   * @param options Filters, pagination and snippet settings
   * @returns The ranked page of results and the unpaginated total
   */
  public static async searchForUser(
    user: User,
    query: string,
    options: SearchOptions = {}
  ): Promise<SearchResponse> {
    const {
      snippetMinWords = 20,
      snippetMaxWords = 30,
      limit = 15,
      offset = 0,
    } = options;

    const where = await this.buildWhere(user, query, options);

    const queryReplacements = this.buildQueryReplacements(
      query,
      snippetMinWords,
      snippetMaxWords
    );

    // The where clause built for a user references "$memberships.id$", so
    // the association has to be joined in both the results and count queries.
    const include = [
      {
        association: "memberships",
        where: {
          userId: user.id,
        },
        required: false,
        separate: false,
      },
    ];

    const resultsQuery = Document.unscoped().findAll({
      attributes: [
        "id",
        [
          Sequelize.literal(
            `ts_rank("searchVector", to_tsquery('english', :query))`
          ),
          "searchRanking",
        ],
      ],
      subQuery: false,
      include,
      replacements: queryReplacements,
      where,
      order: [
        ["searchRanking", "DESC"],
        ["updatedAt", "DESC"],
      ],
      limit,
      offset,
    }) as any as Promise<RankedDocument[]>;

    const countQuery = Document.unscoped().count({
      // @ts-expect-error Types are incorrect for count
      subQuery: false,
      include,
      replacements: queryReplacements,
      where,
    }) as any as Promise<number>;
    const [results, count] = await Promise.all([resultsQuery, countQuery]);

    // Final query to get associated document data
    const documents = await Document.scope([
      "withState",
      "withDrafts",
      {
        method: ["withViews", user.id],
      },
      {
        method: ["withCollectionPermissions", user.id],
      },
      {
        method: ["withMembership", user.id],
      },
    ]).findAll({
      where: {
        teamId: user.teamId,
        id: map(results, "id"),
      },
    });

    return this.buildResponse(query, results, documents, count);
  }

  /**
   * Builds the named replacements shared by the ranking and count queries.
   *
   * @param query The user search query
   * @param snippetMinWords Minimum number of words for the snippet
   * @param snippetMaxWords Maximum number of words for the snippet
   * @returns The `query` and `headlineOptions` replacement values
   */
  private static buildQueryReplacements(
    query: string,
    snippetMinWords: number,
    snippetMaxWords: number
  ) {
    return {
      query: this.webSearchQuery(query),
      headlineOptions: `MaxFragments=1, MinWords=${snippetMinWords}, MaxWords=${snippetMaxWords}`,
    };
  }

  /**
   * Produces an HTML snippet of the document text around the first match,
   * with matched terms wrapped in `<b>` tags.
   *
   * @param document The matched document
   * @param query The user search query
   * @returns The highlighted snippet
   */
  private static buildResultContext(document: Document, query: string) {
    const quotedQueries = Array.from(query.matchAll(/"([^"]*)"/g));
    const text = DocumentHelper.toPlainText(document);

    // Regex to highlight quoted queries as ts_headline will not do this by default due to stemming.
    const fullMatchRegex = new RegExp(escapeRegExp(query), "i");

    // Empty alternatives must never reach the highlight regex: an empty
    // alternative (or "\b\b") matches at every position / word boundary and
    // would litter the snippet with "<b></b>". They arise from stop-word-only
    // queries, consecutive spaces and empty quoted phrases.
    const highlightTerms = [
      query ? fullMatchRegex.source : "",
      ...(quotedQueries.length
        ? quotedQueries.map((match) => escapeRegExp(match[1]))
        : this.removeStopWords(query)
            .trim()
            .split(" ")
            .filter(Boolean)
            .map((word) => `\\b${escapeRegExp(word)}\\b`)),
    ].filter(Boolean);

    const highlightRegex = highlightTerms.length
      ? new RegExp(highlightTerms.join("|"), "gi")
      : undefined;

    // Breaking characters
    // NOTE(review): "!" and "?" each appeared twice in this list before this
    // edit; the duplicates may have been full-width variants lost in
    // transcoding — confirm against version control.
    const breakChars = [" ", ".", ",", `"`, "'", "\n", "。", "!", "?", "…"];
    const breakCharsRegex = new RegExp(`[${breakChars.join("")}]`, "g");

    // chop text around the first match, prefer the first full match if possible.
    const fullMatchIndex = text.search(fullMatchRegex);
    const firstMatchIndex =
      fullMatchIndex >= 0
        ? fullMatchIndex
        : highlightRegex
          ? text.search(highlightRegex)
          : -1;
    const offsetStartIndex = firstMatchIndex - 65;

    // Presumably regexIndexOf returns a negative value when nothing is
    // found; Math.max clamps that case to the start of the text.
    const startIndex = Math.max(
      0,
      offsetStartIndex <= 0
        ? 0
        : regexIndexOf(text, breakCharsRegex, offsetStartIndex)
    );
    const context = highlightRegex
      ? text.replace(highlightRegex, "<b>$&</b>")
      : text;
    const endIndex = regexLastIndexOf(
      context,
      breakCharsRegex,
      startIndex + 250
    );

    return context.slice(startIndex, endIndex);
  }

  /**
   * Builds the Sequelize where clause shared by all search entry points.
   *
   * @param model The user or team the search is performed for
   * @param query The user search query, or undefined to skip text matching
   * @param options Filters to apply
   * @returns The where clause, with `Op.and` / `Op.or` arrays that callers
   * may append further conditions to
   */
  private static async buildWhere(
    model: User | Team,
    query: string | undefined,
    options: SearchOptions
  ) {
    const teamId = model instanceof Team ? model.id : model.teamId;
    const where: WhereOptions<Document> = {
      teamId,
      // NOTE(review): this list is left empty when no access path applies;
      // Sequelize is expected to render an empty Op.or as a condition that
      // matches nothing — confirm.
      [Op.or]: [],
      [Op.and]: [
        {
          deletedAt: {
            [Op.eq]: null,
          },
        },
      ],
    };

    if (model instanceof User) {
      where[Op.or].push({ "$memberships.id$": { [Op.ne]: null } });
    }

    // Ensure we're filtering by the users accessible collections. If
    // collectionId is passed as an option it is assumed that the authorization
    // has already been done in the router
    const collectionIds = options.collectionId
      ? [options.collectionId]
      : await model.collectionIds();

    if (collectionIds.length) {
      where[Op.or].push({ collectionId: collectionIds });
    }

    if (options.dateFilter) {
      // The filter value is interpolated directly into SQL, so it must be a
      // validated DateFilter value by the time it reaches this helper.
      where[Op.and].push({
        updatedAt: {
          [Op.gt]: sequelize.literal(
            `now() - interval '1 ${options.dateFilter}'`
          ),
        },
      });
    }

    if (options.collaboratorIds) {
      where[Op.and].push({
        collaboratorIds: {
          [Op.contains]: options.collaboratorIds,
        },
      });
    }

    if (options.documentIds) {
      where[Op.and].push({
        id: options.documentIds,
      });
    }

    const statusQuery = [];
    if (options.statusFilter?.includes(StatusFilter.Published)) {
      statusQuery.push({
        [Op.and]: [
          {
            publishedAt: {
              [Op.ne]: null,
            },
            archivedAt: {
              [Op.eq]: null,
            },
          },
        ],
      });
    }

    if (
      options.statusFilter?.includes(StatusFilter.Draft) &&
      // Only ever include draft results for the user's own documents
      model instanceof User
    ) {
      statusQuery.push({
        [Op.and]: [
          {
            publishedAt: {
              [Op.eq]: null,
            },
            archivedAt: {
              [Op.eq]: null,
            },
            [Op.or]: [
              { createdById: model.id },
              { "$memberships.id$": { [Op.ne]: null } },
            ],
          },
        ],
      });
    }

    if (options.statusFilter?.includes(StatusFilter.Archived)) {
      statusQuery.push({
        archivedAt: {
          [Op.ne]: null,
        },
      });
    }

    if (statusQuery.length) {
      where[Op.and].push({
        [Op.or]: statusQuery,
      });
    }

    if (query) {
      // find words that look like urls, these should be treated separately as the postgres full-text
      // index will generally not match them.
      const likelyUrls = getUrls(query);

      // remove likely urls, and escape the rest of the query.
      const limitedQuery = this.escapeQuery(
        likelyUrls
          .reduce((q, url) => q.replace(url, ""), query)
          .slice(0, this.maxQueryLength)
          .trim()
      );

      // Extract quoted queries and add them to the where clause, up to a maximum of 3 total.
      const quotedQueries = Array.from(limitedQuery.matchAll(/"([^"]*)"/g)).map(
        (match) => match[1]
      );

      const iLikeQueries = [...quotedQueries, ...likelyUrls].slice(0, 3);

      for (const match of iLikeQueries) {
        where[Op.and].push({
          [Op.or]: [
            {
              title: {
                [Op.iLike]: `%${match}%`,
              },
            },
            {
              text: {
                [Op.iLike]: `%${match}%`,
              },
            },
          ],
        });
      }

      // The ":query" placeholder is filled in by the caller's replacements.
      if (limitedQuery || iLikeQueries.length === 0) {
        where[Op.and].push(
          Sequelize.fn(
            `"searchVector" @@ to_tsquery`,
            "english",
            Sequelize.literal(":query")
          )
        );
      }
    }

    return where;
  }

  /**
   * Joins the ranked ids with their hydrated documents, preserving the
   * ranking order.
   *
   * Ranked rows whose document is missing from `documents` are skipped rather
   * than passed on as `undefined`, which would otherwise throw while building
   * the snippet.
   *
   * @param query The user search query
   * @param results Ranked rows from the ranking query
   * @param documents Hydrated documents for those rows
   * @param count Total number of matches without pagination
   * @returns The search response
   */
  private static buildResponse(
    query: string,
    results: RankedDocument[],
    documents: Document[],
    count: number
  ): SearchResponse {
    const ranked: SearchResponse["results"] = [];

    for (const result of results) {
      const document = find(documents, {
        id: result.id,
      });

      if (!document) {
        continue;
      }

      ranked.push({
        ranking: result.dataValues.searchRanking,
        context: this.buildResultContext(document, query),
        document,
      });
    }

    return {
      results: ranked,
      totalCount: count,
    };
  }

  /**
   * Convert a user search query into a format that can be used by Postgres
   *
   * @param query The user search query
   * @returns The query formatted for Postgres ts_query
   */
  public static webSearchQuery(query: string): string {
    // limit length of search queries as we're using regex against untrusted input
    let limitedQuery = this.escapeQuery(query.slice(0, this.maxQueryLength));

    const quotedSearch =
      limitedQuery.startsWith('"') && limitedQuery.endsWith('"');

    // Replace single quote characters with &. Only the first character of a
    // run of quotes is replaced, and runs starting at the first or last
    // character are left alone. The replacement is one-for-one, so the
    // indices captured by matchAll stay valid while the string is rewritten.
    const singleQuotes = limitedQuery.matchAll(/'+/g);

    for (const match of singleQuotes) {
      const index = match.index ?? 0;

      if (index > 0 && index < limitedQuery.length - 1) {
        limitedQuery =
          limitedQuery.substring(0, index) +
          "&" +
          limitedQuery.substring(index + 1);
      }
    }

    return (
      queryParser()(quotedSearch ? limitedQuery : `${limitedQuery}*`)
        // Remove any trailing join characters
        .replace(/&$/, "")
    );
  }

  /**
   * Escapes characters that are special in a tsquery string.
   *
   * @param query The raw query text
   * @returns The query with backslashes doubled and colons backslash-escaped
   */
  private static escapeQuery(query: string): string {
    return (
      query
        // replace "\" with escaped "\\" because sequelize.escape doesn't do it
        // see: https://github.com/sequelize/sequelize/issues/2950
        .replace(/\\/g, "\\\\")
        // replace ":" with escaped "\:" because it's a reserved character in tsquery
        // see: https://github.com/outline/outline/issues/6542
        .replace(/:/g, "\\:")
    );
  }

  /**
   * Escapes LIKE / ILIKE metacharacters so that user input is matched
   * literally. Backslash is the escape character used.
   *
   * @param value The raw user input
   * @returns The input with `\`, `%` and `_` prefixed by a backslash
   */
  private static escapeLikePattern(value: string): string {
    return value.replace(/[\\%_]/g, "\\$&");
  }

  /**
   * Removes English stop words from a space separated query. The comparison
   * is case-insensitive; the remaining words keep their original casing.
   *
   * @param query The user search query
   * @returns The query without stop words, joined by single spaces
   */
  private static removeStopWords(query: string): string {
    return query
      .split(" ")
      .filter((word) => !this.stopWords.has(word.toLowerCase()))
      .join(" ");
  }
}
|