688 lines
16 KiB
TypeScript
688 lines
16 KiB
TypeScript
import invariant from "invariant";
|
|
import escapeRegExp from "lodash/escapeRegExp";
|
|
import find from "lodash/find";
|
|
import map from "lodash/map";
|
|
import queryParser from "pg-tsquery";
|
|
import { Op, Sequelize, WhereOptions } from "sequelize";
|
|
import { DateFilter, StatusFilter } from "@shared/types";
|
|
import { getUrls } from "@shared/utils/urls";
|
|
import Collection from "@server/models/Collection";
|
|
import Document from "@server/models/Document";
|
|
import Share from "@server/models/Share";
|
|
import Team from "@server/models/Team";
|
|
import User from "@server/models/User";
|
|
import { sequelize } from "@server/storage/database";
|
|
import { DocumentHelper } from "./DocumentHelper";
|
|
|
|
/** The payload produced by the ranked search methods of SearchHelper. */
type SearchResponse = {
  results: Array<{
    /** Ranking score of this hit, used to order the results */
    ranking: number;
    /** Short excerpt of the document text surrounding the match */
    context: string;
    /** The document that matched */
    document: Document;
  }>;
  /** Number of matches for the query before pagination is applied */
  totalCount: number;
};
|
|
|
|
/** Optional filters and pagination settings accepted by the search methods. */
type SearchOptions = {
  /** Pagination: maximum number of results to return */
  limit?: number;
  /** Pagination: number of results to skip */
  offset?: number;
  /**
   * Restrict results to a single collection. Authorization is presumed to
   * have been done before passing to this helper.
   */
  collectionId?: null | string;
  /** Restrict results to a shared document. */
  share?: Share;
  /** Restrict results to a date range. */
  dateFilter?: DateFilter;
  /** Document statuses that should be included in the results */
  statusFilter?: Array<StatusFilter>;
  /** Restrict results to the given documents. */
  documentIds?: Array<string>;
  /** Restrict results to documents these users collaborated on. */
  collaboratorIds?: Array<string>;
  /** Lower bound on the number of words in the contextual snippet */
  snippetMinWords?: number;
  /** Upper bound on the number of words in the contextual snippet */
  snippetMaxWords?: number;
};
|
|
|
|
/**
 * A document row as returned by the ranking queries: only the id is selected,
 * together with the computed "searchRanking" attribute which is read back
 * from `dataValues`.
 */
type RankedDocument = Document & {
  /** Identifier of the matched document */
  id: string;
  dataValues: Partial<Document> & {
    /** The ts_rank score selected under the "searchRanking" alias */
    searchRanking: number;
  };
};
|
|
|
|
/**
 * Static helpers for searching documents.
 *
 * The ranked searches build a Sequelize `where` clause with `buildWhere`,
 * select matching ids ordered by `ts_rank` over the "searchVector" column,
 * and then load the full document rows for those ids in a follow-up query.
 */
export default class SearchHelper {
  /**
   * The maximum length of a search query.
   */
  public static maxQueryLength = 1000;

  /**
   * English stop words that are skipped when highlighting individual query
   * terms. Entries are lower-case; lookups lower-case the candidate word.
   */
  private static readonly stopWords: ReadonlySet<string> = new Set([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
    "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself",
    "she", "her", "hers", "herself", "it", "its", "itself", "they", "them",
    "their", "theirs", "themselves", "what", "which", "who", "whom", "this",
    "that", "these", "those", "am", "is", "are", "was", "were", "be", "been",
    "being", "have", "has", "had", "having", "do", "does", "did", "doing",
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until",
    "while", "of", "at", "by", "for", "with", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below", "to",
    "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
    "further", "then", "once", "here", "there", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other", "some",
    "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
    "very", "s", "t", "can", "will", "just", "don", "should", "now",
  ]);

  /**
   * Ranked full-text search scoped to a team rather than a user.
   *
   * `StatusFilter.Published` is always appended to the requested status
   * filter. When `options.share.includeChildDocuments` is set the results are
   * further restricted to the shared document and its non-archived children.
   *
   * @param team The team whose documents are searched
   * @param query The user search query
   * @param options Filters and pagination, see SearchOptions
   * @returns The ranked results and the unpaginated total count
   */
  public static async searchForTeam(
    team: Team,
    query: string,
    options: SearchOptions = {}
  ): Promise<SearchResponse> {
    const {
      snippetMinWords = 20,
      snippetMaxWords = 30,
      limit = 15,
      offset = 0,
    } = options;

    const where = await this.buildWhere(team, query, {
      ...options,
      statusFilter: [...(options.statusFilter || []), StatusFilter.Published],
    });

    if (options.share?.includeChildDocuments) {
      const sharedDocument = await options.share.$get("document");
      invariant(sharedDocument, "Cannot find document for share");

      const childDocumentIds = await sharedDocument.findAllChildDocumentIds({
        archivedAt: {
          [Op.is]: null,
        },
      });

      where[Op.and].push({
        id: [sharedDocument.id, ...childDocumentIds],
      });
    }

    // Named replacements shared by the results and count queries; ":query" is
    // referenced by the ts_rank literal below and by the where clause.
    const queryReplacements = {
      query: this.webSearchQuery(query),
      headlineOptions: `MaxFragments=1, MinWords=${snippetMinWords}, MaxWords=${snippetMaxWords}`,
    };

    const resultsQuery = Document.unscoped().findAll({
      attributes: [
        "id",
        [
          Sequelize.literal(
            `ts_rank("searchVector", to_tsquery('english', :query))`
          ),
          "searchRanking",
        ],
      ],
      replacements: queryReplacements,
      where,
      order: [
        ["searchRanking", "DESC"],
        ["updatedAt", "DESC"],
      ],
      limit,
      offset,
    }) as any as Promise<RankedDocument[]>;

    const countQuery = Document.unscoped().count({
      // @ts-expect-error Types are incorrect for count
      replacements: queryReplacements,
      where,
    }) as any as Promise<number>;
    const [results, count] = await Promise.all([resultsQuery, countQuery]);

    // Final query to get associated document data
    const documents = await Document.findAll({
      where: {
        id: map(results, "id"),
        teamId: team.id,
      },
      include: [
        {
          model: Collection,
          as: "collection",
        },
      ],
    });

    return this.buildResponse(query, results, documents, count);
  }

  /**
   * Search document titles for a user with a case-insensitive substring
   * match, ordered by most recently updated.
   *
   * @param user The user performing the search
   * @param query The text to look for in document titles
   * @param options Filters and pagination, see SearchOptions
   * @returns The matching documents
   */
  public static async searchTitlesForUser(
    user: User,
    query: string,
    options: SearchOptions = {}
  ): Promise<Document[]> {
    const { limit = 15, offset = 0 } = options;
    const where = await this.buildWhere(user, undefined, options);

    where[Op.and].push({
      title: {
        // Escape LIKE metacharacters so the user's text is matched literally
        // instead of "%" / "_" acting as wildcards.
        [Op.iLike]: `%${this.escapeLikePattern(query)}%`,
      },
    });

    const include = [
      {
        association: "memberships",
        where: {
          userId: user.id,
        },
        required: false,
        separate: false,
      },
      {
        model: User,
        as: "createdBy",
        paranoid: false,
      },
      {
        model: User,
        as: "updatedBy",
        paranoid: false,
      },
    ];

    return Document.scope([
      "withoutState",
      "withDrafts",
      {
        method: ["withViews", user.id],
      },
      {
        method: ["withCollectionPermissions", user.id],
      },
      {
        method: ["withMembership", user.id],
      },
    ]).findAll({
      where,
      subQuery: false,
      order: [["updatedAt", "DESC"]],
      include,
      offset,
      limit,
    });
  }

  /**
   * Ranked full-text search on behalf of a user.
   *
   * @param user The user performing the search
   * @param query The user search query
   * @param options Filters and pagination, see SearchOptions
   * @returns The ranked results and the unpaginated total count
   */
  public static async searchForUser(
    user: User,
    query: string,
    options: SearchOptions = {}
  ): Promise<SearchResponse> {
    const {
      snippetMinWords = 20,
      snippetMaxWords = 30,
      limit = 15,
      offset = 0,
    } = options;

    const where = await this.buildWhere(user, query, options);

    const queryReplacements = {
      query: this.webSearchQuery(query),
      headlineOptions: `MaxFragments=1, MinWords=${snippetMinWords}, MaxWords=${snippetMaxWords}`,
    };

    // The where clause references "$memberships.id$", so the association has
    // to be joined into both the results and the count query.
    const include = [
      {
        association: "memberships",
        where: {
          userId: user.id,
        },
        required: false,
        separate: false,
      },
    ];

    const resultsQuery = Document.unscoped().findAll({
      attributes: [
        "id",
        [
          Sequelize.literal(
            `ts_rank("searchVector", to_tsquery('english', :query))`
          ),
          "searchRanking",
        ],
      ],
      subQuery: false,
      include,
      replacements: queryReplacements,
      where,
      order: [
        ["searchRanking", "DESC"],
        ["updatedAt", "DESC"],
      ],
      limit,
      offset,
    }) as any as Promise<RankedDocument[]>;

    const countQuery = Document.unscoped().count({
      // @ts-expect-error Types are incorrect for count
      subQuery: false,
      include,
      replacements: queryReplacements,
      where,
    }) as any as Promise<number>;
    const [results, count] = await Promise.all([resultsQuery, countQuery]);

    // Final query to get associated document data
    const documents = await Document.scope([
      "withState",
      "withDrafts",
      {
        method: ["withViews", user.id],
      },
      {
        method: ["withCollectionPermissions", user.id],
      },
      {
        method: ["withMembership", user.id],
      },
    ]).findAll({
      where: {
        teamId: user.teamId,
        id: map(results, "id"),
      },
    });

    return this.buildResponse(query, results, documents, count);
  }

  /**
   * Build the contextual snippet for a search result: a window of the
   * document's plain text around the first match, with every match wrapped
   * in `<b>` tags.
   *
   * The window is cut from the plain text first and highlighted afterwards,
   * so the inserted tags can neither shift the window nor be cut in half.
   *
   * @param document The matched document
   * @param query The user search query
   * @returns The highlighted snippet
   */
  private static buildResultContext(document: Document, query: string) {
    // Characters of leading context to keep before the first match.
    const leadLength = 65;
    // Maximum length of the plain-text window, before tags are added.
    const windowLength = 250;

    const text = DocumentHelper.toPlainText(document);

    // Empty quoted phrases ("") are dropped; an empty alternative in the
    // regex would match at every position.
    const quotedTerms = Array.from(
      query.matchAll(/"([^"]*)"/g),
      (match) => match[1]
    ).filter((term) => term.length > 0);

    const fullMatchSource = query.trim() ? escapeRegExp(query) : "";

    // Regex to highlight quoted queries as ts_headline will not do this by default due to stemming.
    // Without quoted phrases, fall back to the individual non-stop-words.
    // Empty tokens (repeated spaces, stop-word-only queries) are removed as
    // "\b\b" would match the empty string at every word boundary.
    const termSources: string[] = quotedTerms.length
      ? quotedTerms.map((term) => escapeRegExp(term))
      : this.removeStopWords(query)
          .split(/\s+/)
          .filter((word) => word.length > 0)
          .map((word) => `\\b${escapeRegExp(word)}\\b`);

    const sources = [fullMatchSource, ...termSources].filter(
      (source) => source.length > 0
    );
    const highlightRegex = sources.length
      ? new RegExp(sources.join("|"), "gi")
      : undefined;

    // chop text around the first match, prefer the first full match if possible.
    const fullMatchIndex = fullMatchSource
      ? text.search(new RegExp(fullMatchSource, "i"))
      : -1;
    const firstMatchIndex =
      fullMatchIndex >= 0
        ? fullMatchIndex
        : highlightRegex
        ? text.search(highlightRegex)
        : -1;

    let startIndex = 0;
    if (firstMatchIndex > leadLength) {
      const offsetStartIndex = firstMatchIndex - leadLength;
      const spaceIndex = text.indexOf(" ", offsetStartIndex);
      // Start on a word break when there is one before the match, otherwise
      // cut mid-word rather than losing the match from the window.
      startIndex =
        spaceIndex >= 0 && spaceIndex <= firstMatchIndex
          ? spaceIndex
          : offsetStartIndex;
    }

    const maxEndIndex = startIndex + windowLength;
    let endIndex = text.length;
    if (text.length > maxEndIndex) {
      // Only trim back to a word break when the text actually overflows the
      // window, and never to a position at or before the start.
      const spaceIndex = text.lastIndexOf(" ", maxEndIndex);
      endIndex = spaceIndex > startIndex ? spaceIndex : maxEndIndex;
    }

    const snippet = text.slice(startIndex, endIndex);
    return highlightRegex
      ? snippet.replace(highlightRegex, "<b>$&</b>")
      : snippet;
  }

  /**
   * Build the Sequelize where clause shared by all searches.
   *
   * The clause always restricts to the model's team and to non-deleted
   * documents, then layers on collection access, the optional filters from
   * `options`, and finally the text conditions derived from `query`.
   *
   * @param model The user or team the search is performed for
   * @param query The user search query, or undefined to skip text matching
   * @param options Filters, see SearchOptions
   * @returns The where clause; callers may push further conditions onto Op.and
   */
  private static async buildWhere(
    model: User | Team,
    query: string | undefined,
    options: SearchOptions
  ) {
    const teamId = model instanceof Team ? model.id : model.teamId;
    const where: WhereOptions<Document> = {
      teamId,
      [Op.or]: [],
      [Op.and]: [
        {
          deletedAt: {
            [Op.eq]: null,
          },
        },
      ],
    };

    if (model instanceof User) {
      where[Op.or].push({ "$memberships.id$": { [Op.ne]: null } });
    }

    // Ensure we're filtering by the users accessible collections. If
    // collectionId is passed as an option it is assumed that the authorization
    // has already been done in the router
    const collectionIds = options.collectionId
      ? [options.collectionId]
      : await model.collectionIds();

    if (collectionIds.length) {
      where[Op.or].push({ collectionId: collectionIds });
    }

    if (options.dateFilter) {
      // The unit is interpolated into raw SQL below, so refuse anything that
      // is not a single alphabetic word.
      invariant(
        /^[a-z]+$/i.test(String(options.dateFilter)),
        "Invalid date filter"
      );

      where[Op.and].push({
        updatedAt: {
          [Op.gt]: sequelize.literal(
            `now() - interval '1 ${options.dateFilter}'`
          ),
        },
      });
    }

    if (options.collaboratorIds) {
      where[Op.and].push({
        collaboratorIds: {
          [Op.contains]: options.collaboratorIds,
        },
      });
    }

    if (options.documentIds) {
      where[Op.and].push({
        id: options.documentIds,
      });
    }

    // Each requested status contributes one alternative; they are OR-ed
    // together at the end.
    const statusQuery = [];
    if (options.statusFilter?.includes(StatusFilter.Published)) {
      statusQuery.push({
        [Op.and]: [
          {
            publishedAt: {
              [Op.ne]: null,
            },
            archivedAt: {
              [Op.eq]: null,
            },
          },
        ],
      });
    }

    if (
      options.statusFilter?.includes(StatusFilter.Draft) &&
      // Only ever include draft results for the user's own documents
      model instanceof User
    ) {
      statusQuery.push({
        [Op.and]: [
          {
            publishedAt: {
              [Op.eq]: null,
            },
            archivedAt: {
              [Op.eq]: null,
            },
            [Op.or]: [
              { createdById: model.id },
              { "$memberships.id$": { [Op.ne]: null } },
            ],
          },
        ],
      });
    }

    if (options.statusFilter?.includes(StatusFilter.Archived)) {
      statusQuery.push({
        archivedAt: {
          [Op.ne]: null,
        },
      });
    }

    if (statusQuery.length) {
      where[Op.and].push({
        [Op.or]: statusQuery,
      });
    }

    if (query) {
      // find words that look like urls, these should be treated separately as the postgres full-text
      // index will generally not match them.
      const likelyUrls = getUrls(query);

      // remove likely urls, and escape the rest of the query.
      const limitedQuery = this.escapeQuery(
        likelyUrls
          .reduce((q, url) => q.replace(url, ""), query)
          .slice(0, this.maxQueryLength)
          .trim()
      );

      // Extract quoted queries and add them to the where clause, up to a maximum of 3 total.
      const quotedQueries = Array.from(limitedQuery.matchAll(/"([^"]*)"/g)).map(
        (match) => match[1]
      );

      const iLikeQueries = [...quotedQueries, ...likelyUrls].slice(0, 3);

      for (const match of iLikeQueries) {
        where[Op.and].push({
          [Op.or]: [
            {
              title: {
                [Op.iLike]: `%${match}%`,
              },
            },
            {
              text: {
                [Op.iLike]: `%${match}%`,
              },
            },
          ],
        });
      }

      if (limitedQuery || iLikeQueries.length === 0) {
        where[Op.and].push(
          Sequelize.fn(
            `"searchVector" @@ to_tsquery`,
            "english",
            Sequelize.literal(":query")
          )
        );
      }
    }

    return where;
  }

  /**
   * Combine the ranked ids with the loaded document rows into the response.
   *
   * Ranked rows whose document is missing from `documents` are skipped, as
   * no context can be built for them.
   *
   * @param query The user search query, used to build each context snippet
   * @param results The ranked rows, in the order they should be returned
   * @param documents The loaded documents for the ranked ids
   * @param count The total number of matches without pagination
   * @returns The search response
   */
  private static buildResponse(
    query: string,
    results: RankedDocument[],
    documents: Document[],
    count: number
  ): SearchResponse {
    const ranked: SearchResponse["results"] = [];

    for (const result of results) {
      const document = find(documents, {
        id: result.id,
      }) as Document | undefined;

      if (!document) {
        continue;
      }

      ranked.push({
        ranking: result.dataValues.searchRanking,
        context: this.buildResultContext(document, query),
        document,
      });
    }

    return {
      results: ranked,
      totalCount: count,
    };
  }

  /**
   * Convert a user search query into a format that can be used by Postgres
   *
   * @param query The user search query
   * @returns The query formatted for Postgres ts_query
   */
  public static webSearchQuery(query: string): string {
    // limit length of search queries as we're using regex against untrusted input
    const escapedQuery = this.escapeQuery(query.slice(0, this.maxQueryLength));

    const quotedSearch =
      escapedQuery.startsWith('"') && escapedQuery.endsWith('"');

    // Replace each run of single quotes inside the query with one "&". Runs
    // touching the start or end of the query are left alone.
    const limitedQuery = escapedQuery.replace(
      /'+/g,
      (run: string, index: number) =>
        index > 0 && index + run.length < escapedQuery.length ? "&" : run
    );

    return (
      // Unquoted searches get a trailing "*" appended before parsing
      queryParser()(quotedSearch ? limitedQuery : `${limitedQuery}*`)
        // Remove any trailing join characters
        .replace(/&$/, "")
    );
  }

  /**
   * Escape characters in a query that would otherwise be misinterpreted
   * further down the pipeline.
   *
   * @param query The raw query
   * @returns The query with backslashes and colons escaped
   */
  private static escapeQuery(query: string): string {
    return (
      query
        // replace "\" with escaped "\\" because sequelize.escape doesn't do it
        // see: https://github.com/sequelize/sequelize/issues/2950
        .replace(/\\/g, "\\\\")
        // replace ":" with escaped "\:" because it's a reserved character in tsquery
        // see: https://github.com/outline/outline/issues/6542
        .replace(/:/g, "\\:")
    );
  }

  /**
   * Escape the characters that are special inside a LIKE / ILIKE pattern
   * (backslash, "%" and "_") by prefixing each with a backslash.
   *
   * @param value The text to embed in a pattern
   * @returns The text with pattern metacharacters escaped
   */
  private static escapeLikePattern(value: string): string {
    return value.replace(/[\\%_]/g, "\\$&");
  }

  /**
   * Remove stop words from a space separated query. The comparison ignores
   * case; the remaining words keep their original casing and order.
   *
   * @param query The user search query
   * @returns The query without stop words, joined by single spaces
   */
  private static removeStopWords(query: string): string {
    return query
      .split(" ")
      .filter((word) => !this.stopWords.has(word.toLowerCase()))
      .join(" ");
  }
}
|