Files
outline/server/models/helpers/SearchHelper.ts

712 lines
17 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import invariant from "invariant";
import escapeRegExp from "lodash/escapeRegExp";
import find from "lodash/find";
import map from "lodash/map";
import queryParser from "pg-tsquery";
import { Op, Sequelize, WhereOptions } from "sequelize";
import { DateFilter, StatusFilter } from "@shared/types";
import { regexIndexOf, regexLastIndexOf } from "@shared/utils/string";
import { getUrls } from "@shared/utils/urls";
import Collection from "@server/models/Collection";
import Document from "@server/models/Document";
import Share from "@server/models/Share";
import Team from "@server/models/Team";
import User from "@server/models/User";
import { sequelize } from "@server/storage/database";
import { DocumentHelper } from "./DocumentHelper";
type SearchResponse = {
results: {
/** The search ranking, for sorting results */
ranking: number;
/** A snippet of contextual text around the search result */
context: string;
/** The document result */
document: Document;
}[];
/** The total number of results for the search query without pagination */
totalCount: number;
};
type SearchOptions = {
/** The query limit for pagination */
limit?: number;
/** The query offset for pagination */
offset?: number;
/** Limit results to a collection. Authorization is presumed to have been done before passing to this helper. */
collectionId?: string | null;
/** Limit results to a shared document. */
share?: Share;
/** Limit results to a date range. */
dateFilter?: DateFilter;
/** Status of the documents to return */
statusFilter?: StatusFilter[];
/** Limit results to a list of documents. */
documentIds?: string[];
/** Limit results to a list of users that collaborated on the document. */
collaboratorIds?: string[];
/** The minimum number of words to be returned in the contextual snippet */
snippetMinWords?: number;
/** The maximum number of words to be returned in the contextual snippet */
snippetMaxWords?: number;
};
type RankedDocument = Document & {
id: string;
dataValues: Partial<Document> & {
searchRanking: number;
};
};
export default class SearchHelper {
/**
* The maximum length of a search query.
*/
public static maxQueryLength = 1000;
public static async searchForTeam(
team: Team,
query: string,
options: SearchOptions = {}
): Promise<SearchResponse> {
const {
snippetMinWords = 20,
snippetMaxWords = 30,
limit = 15,
offset = 0,
} = options;
const where = await this.buildWhere(team, query, {
...options,
statusFilter: [...(options.statusFilter || []), StatusFilter.Published],
});
if (options.share?.includeChildDocuments) {
const sharedDocument = await options.share.$get("document");
invariant(sharedDocument, "Cannot find document for share");
const childDocumentIds = await sharedDocument.findAllChildDocumentIds({
archivedAt: {
[Op.is]: null,
},
});
where[Op.and].push({
id: [sharedDocument.id, ...childDocumentIds],
});
}
const queryReplacements = {
query: this.webSearchQuery(query),
headlineOptions: `MaxFragments=1, MinWords=${snippetMinWords}, MaxWords=${snippetMaxWords}`,
};
const resultsQuery = Document.unscoped().findAll({
attributes: [
"id",
[
Sequelize.literal(
`ts_rank("searchVector", to_tsquery('english', :query))`
),
"searchRanking",
],
],
replacements: queryReplacements,
where,
order: [
["searchRanking", "DESC"],
["updatedAt", "DESC"],
],
limit,
offset,
}) as any as Promise<RankedDocument[]>;
const countQuery = Document.unscoped().count({
// @ts-expect-error Types are incorrect for count
replacements: queryReplacements,
where,
}) as any as Promise<number>;
const [results, count] = await Promise.all([resultsQuery, countQuery]);
// Final query to get associated document data
const documents = await Document.findAll({
where: {
id: map(results, "id"),
teamId: team.id,
},
include: [
{
model: Collection,
as: "collection",
},
],
});
return this.buildResponse(query, results, documents, count);
}
public static async searchTitlesForUser(
user: User,
query: string,
options: SearchOptions = {}
): Promise<Document[]> {
const { limit = 15, offset = 0 } = options;
const where = await this.buildWhere(user, undefined, options);
where[Op.and].push({
title: {
[Op.iLike]: `%${query}%`,
},
});
const include = [
{
association: "memberships",
where: {
userId: user.id,
},
required: false,
separate: false,
},
{
model: User,
as: "createdBy",
paranoid: false,
},
{
model: User,
as: "updatedBy",
paranoid: false,
},
];
return Document.scope([
"withoutState",
"withDrafts",
{
method: ["withViews", user.id],
},
{
method: ["withCollectionPermissions", user.id],
},
{
method: ["withMembership", user.id],
},
]).findAll({
where,
subQuery: false,
order: [["updatedAt", "DESC"]],
include,
offset,
limit,
});
}
public static async searchForUser(
user: User,
query: string,
options: SearchOptions = {}
): Promise<SearchResponse> {
const {
snippetMinWords = 20,
snippetMaxWords = 30,
limit = 15,
offset = 0,
} = options;
const where = await this.buildWhere(user, query, options);
const queryReplacements = {
query: this.webSearchQuery(query),
headlineOptions: `MaxFragments=1, MinWords=${snippetMinWords}, MaxWords=${snippetMaxWords}`,
};
const include = [
{
association: "memberships",
where: {
userId: user.id,
},
required: false,
separate: false,
},
];
const resultsQuery = Document.unscoped().findAll({
attributes: [
"id",
[
Sequelize.literal(
`ts_rank("searchVector", to_tsquery('english', :query))`
),
"searchRanking",
],
],
subQuery: false,
include,
replacements: queryReplacements,
where,
order: [
["searchRanking", "DESC"],
["updatedAt", "DESC"],
],
limit,
offset,
}) as any as Promise<RankedDocument[]>;
const countQuery = Document.unscoped().count({
// @ts-expect-error Types are incorrect for count
subQuery: false,
include,
replacements: queryReplacements,
where,
}) as any as Promise<number>;
const [results, count] = await Promise.all([resultsQuery, countQuery]);
// Final query to get associated document data
const documents = await Document.scope([
"withState",
"withDrafts",
{
method: ["withViews", user.id],
},
{
method: ["withCollectionPermissions", user.id],
},
{
method: ["withMembership", user.id],
},
]).findAll({
where: {
teamId: user.teamId,
id: map(results, "id"),
},
});
return this.buildResponse(query, results, documents, count);
}
private static buildResultContext(document: Document, query: string) {
const quotedQueries = Array.from(query.matchAll(/"([^"]*)"/g));
const text = DocumentHelper.toPlainText(document);
// Regex to highlight quoted queries as ts_headline will not do this by default due to stemming.
const fullMatchRegex = new RegExp(escapeRegExp(query), "i");
const highlightRegex = new RegExp(
[
fullMatchRegex.source,
...(quotedQueries.length
? quotedQueries.map((match) => escapeRegExp(match[1]))
: this.removeStopWords(query)
.trim()
.split(" ")
.map((match) => `\\b${escapeRegExp(match)}\\b`)),
].join("|"),
"gi"
);
// Breaking characters
const breakChars = [
" ",
".",
",",
`"`,
"'",
"\n",
"。",
"",
"",
"!",
"?",
"…",
];
const breakCharsRegex = new RegExp(`[${breakChars.join("")}]`, "g");
// chop text around the first match, prefer the first full match if possible.
const fullMatchIndex = text.search(fullMatchRegex);
const offsetStartIndex =
(fullMatchIndex >= 0 ? fullMatchIndex : text.search(highlightRegex)) - 65;
const startIndex = Math.max(
0,
offsetStartIndex <= 0
? 0
: regexIndexOf(text, breakCharsRegex, offsetStartIndex)
);
const context = text.replace(highlightRegex, "<b>$&</b>");
const endIndex = regexLastIndexOf(
context,
breakCharsRegex,
startIndex + 250
);
return context.slice(startIndex, endIndex);
}
private static async buildWhere(
model: User | Team,
query: string | undefined,
options: SearchOptions
) {
const teamId = model instanceof Team ? model.id : model.teamId;
const where: WhereOptions<Document> = {
teamId,
[Op.or]: [],
[Op.and]: [
{
deletedAt: {
[Op.eq]: null,
},
},
],
};
if (model instanceof User) {
where[Op.or].push({ "$memberships.id$": { [Op.ne]: null } });
}
// Ensure we're filtering by the users accessible collections. If
// collectionId is passed as an option it is assumed that the authorization
// has already been done in the router
const collectionIds = options.collectionId
? [options.collectionId]
: await model.collectionIds();
if (collectionIds.length) {
where[Op.or].push({ collectionId: collectionIds });
}
if (options.dateFilter) {
where[Op.and].push({
updatedAt: {
[Op.gt]: sequelize.literal(
`now() - interval '1 ${options.dateFilter}'`
),
},
});
}
if (options.collaboratorIds) {
where[Op.and].push({
collaboratorIds: {
[Op.contains]: options.collaboratorIds,
},
});
}
if (options.documentIds) {
where[Op.and].push({
id: options.documentIds,
});
}
const statusQuery = [];
if (options.statusFilter?.includes(StatusFilter.Published)) {
statusQuery.push({
[Op.and]: [
{
publishedAt: {
[Op.ne]: null,
},
archivedAt: {
[Op.eq]: null,
},
},
],
});
}
if (
options.statusFilter?.includes(StatusFilter.Draft) &&
// Only ever include draft results for the user's own documents
model instanceof User
) {
statusQuery.push({
[Op.and]: [
{
publishedAt: {
[Op.eq]: null,
},
archivedAt: {
[Op.eq]: null,
},
[Op.or]: [
{ createdById: model.id },
{ "$memberships.id$": { [Op.ne]: null } },
],
},
],
});
}
if (options.statusFilter?.includes(StatusFilter.Archived)) {
statusQuery.push({
archivedAt: {
[Op.ne]: null,
},
});
}
if (statusQuery.length) {
where[Op.and].push({
[Op.or]: statusQuery,
});
}
if (query) {
// find words that look like urls, these should be treated separately as the postgres full-text
// index will generally not match them.
const likelyUrls = getUrls(query);
// remove likely urls, and escape the rest of the query.
const limitedQuery = this.escapeQuery(
likelyUrls
.reduce((q, url) => q.replace(url, ""), query)
.slice(0, this.maxQueryLength)
.trim()
);
// Extract quoted queries and add them to the where clause, up to a maximum of 3 total.
const quotedQueries = Array.from(limitedQuery.matchAll(/"([^"]*)"/g)).map(
(match) => match[1]
);
const iLikeQueries = [...quotedQueries, ...likelyUrls].slice(0, 3);
for (const match of iLikeQueries) {
where[Op.and].push({
[Op.or]: [
{
title: {
[Op.iLike]: `%${match}%`,
},
},
{
text: {
[Op.iLike]: `%${match}%`,
},
},
],
});
}
if (limitedQuery || iLikeQueries.length === 0) {
where[Op.and].push(
Sequelize.fn(
`"searchVector" @@ to_tsquery`,
"english",
Sequelize.literal(":query")
)
);
}
}
return where;
}
private static buildResponse(
query: string,
results: RankedDocument[],
documents: Document[],
count: number
): SearchResponse {
return {
results: map(results, (result) => {
const document = find(documents, {
id: result.id,
}) as Document;
return {
ranking: result.dataValues.searchRanking,
context: this.buildResultContext(document, query),
document,
};
}),
totalCount: count,
};
}
/**
* Convert a user search query into a format that can be used by Postgres
*
* @param query The user search query
* @returns The query formatted for Postgres ts_query
*/
public static webSearchQuery(query: string): string {
// limit length of search queries as we're using regex against untrusted input
let limitedQuery = this.escapeQuery(query.slice(0, this.maxQueryLength));
const quotedSearch =
limitedQuery.startsWith('"') && limitedQuery.endsWith('"');
// Replace single quote characters with &.
const singleQuotes = limitedQuery.matchAll(/'+/g);
for (const match of singleQuotes) {
if (
match.index &&
match.index > 0 &&
match.index < limitedQuery.length - 1
) {
limitedQuery =
limitedQuery.substring(0, match.index) +
"&" +
limitedQuery.substring(match.index + 1);
}
}
return (
queryParser()(quotedSearch ? limitedQuery : `${limitedQuery}*`)
// Remove any trailing join characters
.replace(/&$/, "")
);
}
private static escapeQuery(query: string): string {
return (
query
// replace "\" with escaped "\\" because sequelize.escape doesn't do it
// see: https://github.com/sequelize/sequelize/issues/2950
.replace(/\\/g, "\\\\")
// replace ":" with escaped "\:" because it's a reserved character in tsquery
// see: https://github.com/outline/outline/issues/6542
.replace(/:/g, "\\:")
);
}
private static removeStopWords(query: string): string {
const stopwords = [
"i",
"me",
"my",
"myself",
"we",
"our",
"ours",
"ourselves",
"you",
"your",
"yours",
"yourself",
"yourselves",
"he",
"him",
"his",
"himself",
"she",
"her",
"hers",
"herself",
"it",
"its",
"itself",
"they",
"them",
"their",
"theirs",
"themselves",
"what",
"which",
"who",
"whom",
"this",
"that",
"these",
"those",
"am",
"is",
"are",
"was",
"were",
"be",
"been",
"being",
"have",
"has",
"had",
"having",
"do",
"does",
"did",
"doing",
"a",
"an",
"the",
"and",
"but",
"if",
"or",
"because",
"as",
"until",
"while",
"of",
"at",
"by",
"for",
"with",
"about",
"against",
"between",
"into",
"through",
"during",
"before",
"after",
"above",
"below",
"to",
"from",
"up",
"down",
"in",
"out",
"on",
"off",
"over",
"under",
"again",
"further",
"then",
"once",
"here",
"there",
"when",
"where",
"why",
"how",
"all",
"any",
"both",
"each",
"few",
"more",
"most",
"other",
"some",
"such",
"no",
"nor",
"not",
"only",
"own",
"same",
"so",
"than",
"too",
"very",
"s",
"t",
"can",
"will",
"just",
"don",
"should",
"now",
];
return query
.split(" ")
.filter((word) => !stopwords.includes(word))
.join(" ");
}
}