import invariant from "invariant";
import escapeRegExp from "lodash/escapeRegExp";
import find from "lodash/find";
import map from "lodash/map";
import queryParser from "pg-tsquery";
import { Op, Sequelize, WhereOptions } from "sequelize";
import { DateFilter, StatusFilter } from "@shared/types";
import { regexIndexOf, regexLastIndexOf } from "@shared/utils/string";
import { getUrls } from "@shared/utils/urls";
import Collection from "@server/models/Collection";
import Document from "@server/models/Document";
import Share from "@server/models/Share";
import Team from "@server/models/Team";
import User from "@server/models/User";
import { sequelize } from "@server/storage/database";
import { DocumentHelper } from "./DocumentHelper";

type SearchResponse = {
  results: {
    /** The search ranking, for sorting results */
    ranking: number;
    /** A snippet of contextual text around the search result */
    context: string;
    /** The document result */
    document: Document;
  }[];
  /** The total number of results for the search query without pagination */
  totalCount: number;
};

type SearchOptions = {
  /** The query limit for pagination */
  limit?: number;
  /** The query offset for pagination */
  offset?: number;
  /** Limit results to a collection. Authorization is presumed to have been done before passing to this helper. */
  collectionId?: string | null;
  /** Limit results to a shared document. */
  share?: Share;
  /** Limit results to a date range. */
  dateFilter?: DateFilter;
  /** Status of the documents to return */
  statusFilter?: StatusFilter[];
  /** Limit results to a list of documents. */
  documentIds?: string[];
  /** Limit results to a list of users that collaborated on the document. */
  collaboratorIds?: string[];
  /** The minimum number of words to be returned in the contextual snippet */
  snippetMinWords?: number;
  /** The maximum number of words to be returned in the contextual snippet */
  snippetMaxWords?: number;
};

/** A row of the ranking query: a document id plus its computed search ranking. */
type RankedDocument = Document & {
  id: string;
  dataValues: Partial<Document> & {
    searchRanking: number;
  };
};

/**
 * Static helpers that run document searches against the database, either on
 * behalf of a whole team or of a single user.
 */
export default class SearchHelper {
  /**
   * The maximum length of a search query.
   */
  public static maxQueryLength = 1000;

  /**
   * Words dropped from a query before building per-word highlight patterns.
   * Kept as a Set so it is built once and membership checks are constant time.
   */
  private static readonly stopWords: ReadonlySet<string> = new Set([
    "i",
    "me",
    "my",
    "myself",
    "we",
    "our",
    "ours",
    "ourselves",
    "you",
    "your",
    "yours",
    "yourself",
    "yourselves",
    "he",
    "him",
    "his",
    "himself",
    "she",
    "her",
    "hers",
    "herself",
    "it",
    "its",
    "itself",
    "they",
    "them",
    "their",
    "theirs",
    "themselves",
    "what",
    "which",
    "who",
    "whom",
    "this",
    "that",
    "these",
    "those",
    "am",
    "is",
    "are",
    "was",
    "were",
    "be",
    "been",
    "being",
    "have",
    "has",
    "had",
    "having",
    "do",
    "does",
    "did",
    "doing",
    "a",
    "an",
    "the",
    "and",
    "but",
    "if",
    "or",
    "because",
    "as",
    "until",
    "while",
    "of",
    "at",
    "by",
    "for",
    "with",
    "about",
    "against",
    "between",
    "into",
    "through",
    "during",
    "before",
    "after",
    "above",
    "below",
    "to",
    "from",
    "up",
    "down",
    "in",
    "out",
    "on",
    "off",
    "over",
    "under",
    "again",
    "further",
    "then",
    "once",
    "here",
    "there",
    "when",
    "where",
    "why",
    "how",
    "all",
    "any",
    "both",
    "each",
    "few",
    "more",
    "most",
    "other",
    "some",
    "such",
    "no",
    "nor",
    "not",
    "only",
    "own",
    "same",
    "so",
    "than",
    "too",
    "very",
    "s",
    "t",
    "can",
    "will",
    "just",
    "don",
    "should",
    "now",
  ]);

  /**
   * Search the documents of a team. Published documents are always included
   * in the status filter; draft results are never returned for a team.
   *
   * @param team The team whose documents are searched
   * @param query The user search query
   * @param options Pagination, filtering and snippet options
   * @returns Ranked results with context snippets and the unpaginated total
   */
  public static async searchForTeam(
    team: Team,
    query: string,
    options: SearchOptions = {}
  ): Promise<SearchResponse> {
    const {
      snippetMinWords = 20,
      snippetMaxWords = 30,
      limit = 15,
      offset = 0,
    } = options;

    const where = await this.buildWhere(team, query, {
      ...options,
      statusFilter: [...(options.statusFilter || []), StatusFilter.Published],
    });

    // When searching within a share that exposes child documents, restrict
    // the results to the shared document and its non-archived descendants.
    if (options.share?.includeChildDocuments) {
      const sharedDocument = await options.share.$get("document");
      invariant(sharedDocument, "Cannot find document for share");

      const childDocumentIds = await sharedDocument.findAllChildDocumentIds({
        archivedAt: {
          [Op.is]: null,
        },
      });

      where[Op.and].push({
        id: [sharedDocument.id, ...childDocumentIds],
      });
    }

    const queryReplacements = {
      query: this.webSearchQuery(query),
      headlineOptions: `MaxFragments=1, MinWords=${snippetMinWords}, MaxWords=${snippetMaxWords}`,
    };

    // First pass: only ids and rankings, ordered and paginated.
    const resultsQuery = Document.unscoped().findAll({
      attributes: [
        "id",
        [
          Sequelize.literal(
            `ts_rank("searchVector", to_tsquery('english', :query))`
          ),
          "searchRanking",
        ],
      ],
      replacements: queryReplacements,
      where,
      order: [
        ["searchRanking", "DESC"],
        ["updatedAt", "DESC"],
      ],
      limit,
      offset,
    }) as any as Promise<RankedDocument[]>;

    const countQuery = Document.unscoped().count({
      // @ts-expect-error Types are incorrect for count
      replacements: queryReplacements,
      where,
    }) as any as Promise<number>;

    const [results, count] = await Promise.all([resultsQuery, countQuery]);

    // Final query to get associated document data
    const documents = await Document.findAll({
      where: {
        id: map(results, "id"),
        teamId: team.id,
      },
      include: [
        {
          model: Collection,
          as: "collection",
        },
      ],
    });

    return this.buildResponse(query, results, documents, count);
  }

  /**
   * Search documents visible to a user by title only, using a
   * case-insensitive substring match, most recently updated first.
   *
   * @param user The user performing the search
   * @param query The text to look for in document titles
   * @param options Pagination and filtering options
   * @returns The matching documents
   */
  public static async searchTitlesForUser(
    user: User,
    query: string,
    options: SearchOptions = {}
  ): Promise<Document[]> {
    const { limit = 15, offset = 0 } = options;
    const where = await this.buildWhere(user, undefined, options);

    where[Op.and].push({
      title: {
        [Op.iLike]: `%${query}%`,
      },
    });

    const include = [
      {
        association: "memberships",
        where: {
          userId: user.id,
        },
        required: false,
        separate: false,
      },
      {
        model: User,
        as: "createdBy",
        paranoid: false,
      },
      {
        model: User,
        as: "updatedBy",
        paranoid: false,
      },
    ];

    return Document.scope([
      "withoutState",
      "withDrafts",
      {
        method: ["withViews", user.id],
      },
      {
        method: ["withCollectionPermissions", user.id],
      },
      {
        method: ["withMembership", user.id],
      },
    ]).findAll({
      where,
      subQuery: false,
      order: [["updatedAt", "DESC"]],
      include,
      offset,
      limit,
    });
  }

  /**
   * Full-text search over the documents a user can access.
   *
   * @param user The user performing the search
   * @param query The user search query
   * @param options Pagination, filtering and snippet options
   * @returns Ranked results with context snippets and the unpaginated total
   */
  public static async searchForUser(
    user: User,
    query: string,
    options: SearchOptions = {}
  ): Promise<SearchResponse> {
    const {
      snippetMinWords = 20,
      snippetMaxWords = 30,
      limit = 15,
      offset = 0,
    } = options;

    const where = await this.buildWhere(user, query, options);

    const queryReplacements = {
      query: this.webSearchQuery(query),
      headlineOptions: `MaxFragments=1, MinWords=${snippetMinWords}, MaxWords=${snippetMaxWords}`,
    };

    // The where clause built for a user references "$memberships.id$", so the
    // memberships association has to be joined into both queries.
    const include = [
      {
        association: "memberships",
        where: {
          userId: user.id,
        },
        required: false,
        separate: false,
      },
    ];

    // First pass: only ids and rankings, ordered and paginated.
    const resultsQuery = Document.unscoped().findAll({
      attributes: [
        "id",
        [
          Sequelize.literal(
            `ts_rank("searchVector", to_tsquery('english', :query))`
          ),
          "searchRanking",
        ],
      ],
      subQuery: false,
      include,
      replacements: queryReplacements,
      where,
      order: [
        ["searchRanking", "DESC"],
        ["updatedAt", "DESC"],
      ],
      limit,
      offset,
    }) as any as Promise<RankedDocument[]>;

    const countQuery = Document.unscoped().count({
      // @ts-expect-error Types are incorrect for count
      subQuery: false,
      include,
      replacements: queryReplacements,
      where,
    }) as any as Promise<number>;

    const [results, count] = await Promise.all([resultsQuery, countQuery]);

    // Final query to get associated document data
    const documents = await Document.scope([
      "withState",
      "withDrafts",
      {
        method: ["withViews", user.id],
      },
      {
        method: ["withCollectionPermissions", user.id],
      },
      {
        method: ["withMembership", user.id],
      },
    ]).findAll({
      where: {
        teamId: user.teamId,
        id: map(results, "id"),
      },
    });

    return this.buildResponse(query, results, documents, count);
  }

  /**
   * Build the snippet of plain text shown next to a search result, cut around
   * the first place the query matches in the document.
   *
   * @param document The document to take the snippet from
   * @param query The user search query
   * @returns A slice of the document's plain text around the first match
   */
  private static buildResultContext(document: Document, query: string) {
    const quotedQueries = Array.from(query.matchAll(/"([^"]*)"/g));
    const text = DocumentHelper.toPlainText(document);

    // Regex to highlight quoted queries as ts_headline will not do this by default due to stemming.
    const fullMatchRegex = new RegExp(escapeRegExp(query), "i");

    // Empty terms (from `""`, repeated spaces, or a query made only of stop
    // words) are dropped: an empty alternative would match at the first word
    // boundary of the text and pull the snippet away from the real match.
    const termPatterns = quotedQueries.length
      ? quotedQueries
          .map((match) => match[1])
          .filter((phrase) => !!phrase)
          .map((phrase) => escapeRegExp(phrase))
      : this.removeStopWords(query)
          .trim()
          .split(" ")
          .filter((word) => word.length > 0)
          .map((word) => `\\b${escapeRegExp(word)}\\b`);

    const highlightRegex = new RegExp(
      [fullMatchRegex.source, ...termPatterns].join("|"),
      "gi"
    );

    // Breaking characters
    const breakChars = [
      " ",
      ".",
      ",",
      `"`,
      "'",
      "\n",
      "。",
      "！",
      "？",
      "!",
      "?",
      "…",
    ];
    const breakCharsRegex = new RegExp(`[${breakChars.join("")}]`, "g");

    // chop text around the first match, prefer the first full match if possible.
    const fullMatchIndex = text.search(fullMatchRegex);
    const offsetStartIndex =
      (fullMatchIndex >= 0 ? fullMatchIndex : text.search(highlightRegex)) - 65;
    const startIndex = Math.max(
      0,
      offsetStartIndex <= 0
        ? 0
        : regexIndexOf(text, breakCharsRegex, offsetStartIndex)
    );

    // NOTE(review): "$&" re-inserts each match unchanged, so this replace is
    // currently a no-op. Highlight markup around "$&" may have been lost —
    // confirm the intended replacement string.
    const context = text.replace(highlightRegex, "$&");
    const endIndex = regexLastIndexOf(
      context,
      breakCharsRegex,
      startIndex + 250
    );

    return context.slice(startIndex, endIndex);
  }

  /**
   * Build the where clause shared by all searches.
   *
   * @param model The user or team the search is performed for
   * @param query The user search query, or undefined to skip text matching
   * @param options Filtering options
   * @returns Sequelize where options with `Op.or` and `Op.and` arrays that
   * callers may push further conditions onto
   */
  private static async buildWhere(
    model: User | Team,
    query: string | undefined,
    options: SearchOptions
  ) {
    const teamId = model instanceof Team ? model.id : model.teamId;
    const where: WhereOptions = {
      teamId,
      [Op.or]: [],
      [Op.and]: [
        {
          deletedAt: {
            [Op.eq]: null,
          },
        },
      ],
    };

    if (model instanceof User) {
      where[Op.or].push({ "$memberships.id$": { [Op.ne]: null } });
    }

    // Ensure we're filtering by the users accessible collections. If
    // collectionId is passed as an option it is assumed that the authorization
    // has already been done in the router
    const collectionIds = options.collectionId
      ? [options.collectionId]
      : await model.collectionIds();

    if (collectionIds.length) {
      where[Op.or].push({ collectionId: collectionIds });
    }

    if (options.dateFilter) {
      // The value is interpolated into a raw SQL literal below, so refuse
      // anything that is not a plain alphabetic interval unit.
      invariant(/^[a-z]+$/i.test(options.dateFilter), "Invalid date filter");

      where[Op.and].push({
        updatedAt: {
          [Op.gt]: sequelize.literal(
            `now() - interval '1 ${options.dateFilter}'`
          ),
        },
      });
    }

    if (options.collaboratorIds) {
      where[Op.and].push({
        collaboratorIds: {
          [Op.contains]: options.collaboratorIds,
        },
      });
    }

    if (options.documentIds) {
      where[Op.and].push({
        id: options.documentIds,
      });
    }

    // Each requested status contributes one alternative; they are OR-ed
    // together and the group is AND-ed with the rest of the clause.
    const statusQuery = [];

    if (options.statusFilter?.includes(StatusFilter.Published)) {
      statusQuery.push({
        [Op.and]: [
          {
            publishedAt: {
              [Op.ne]: null,
            },
            archivedAt: {
              [Op.eq]: null,
            },
          },
        ],
      });
    }

    if (
      options.statusFilter?.includes(StatusFilter.Draft) &&
      // Only ever include draft results for the user's own documents
      model instanceof User
    ) {
      statusQuery.push({
        [Op.and]: [
          {
            publishedAt: {
              [Op.eq]: null,
            },
            archivedAt: {
              [Op.eq]: null,
            },
            [Op.or]: [
              { createdById: model.id },
              { "$memberships.id$": { [Op.ne]: null } },
            ],
          },
        ],
      });
    }

    if (options.statusFilter?.includes(StatusFilter.Archived)) {
      statusQuery.push({
        archivedAt: {
          [Op.ne]: null,
        },
      });
    }

    if (statusQuery.length) {
      where[Op.and].push({
        [Op.or]: statusQuery,
      });
    }

    if (query) {
      // find words that look like urls, these should be treated separately as the postgres full-text
      // index will generally not match them.
      const likelyUrls = getUrls(query);

      // remove likely urls, and escape the rest of the query.
      const limitedQuery = this.escapeQuery(
        likelyUrls
          .reduce((q, url) => q.replace(url, ""), query)
          .slice(0, this.maxQueryLength)
          .trim()
      );

      // Extract quoted queries and add them to the where clause, up to a maximum of 3 total.
      const quotedQueries = Array.from(limitedQuery.matchAll(/"([^"]*)"/g)).map(
        (match) => match[1]
      );

      const iLikeQueries = [...quotedQueries, ...likelyUrls].slice(0, 3);

      for (const match of iLikeQueries) {
        where[Op.and].push({
          [Op.or]: [
            {
              title: {
                [Op.iLike]: `%${match}%`,
              },
            },
            {
              text: {
                [Op.iLike]: `%${match}%`,
              },
            },
          ],
        });
      }

      // The ":query" placeholder is filled from the replacements that the
      // calling search method passes alongside this where clause.
      if (limitedQuery || iLikeQueries.length === 0) {
        where[Op.and].push(
          Sequelize.fn(
            `"searchVector" @@ to_tsquery`,
            "english",
            Sequelize.literal(":query")
          )
        );
      }
    }

    return where;
  }

  /**
   * Combine the ranked ids with the fully loaded documents into the response.
   *
   * @param query The user search query, used to build context snippets
   * @param results The ranked rows, in the order they should be returned
   * @param documents The loaded documents for those rows
   * @param count The total number of matches without pagination
   * @returns The search response
   */
  private static buildResponse(
    query: string,
    results: RankedDocument[],
    documents: Document[],
    count: number
  ): SearchResponse {
    const hits: SearchResponse["results"] = [];

    for (const result of results) {
      const document = find(documents, {
        id: result.id,
      });

      // A ranked id can be missing from the second query (for example if the
      // document was removed in between); skip it rather than crash while
      // building its snippet.
      if (!document) {
        continue;
      }

      hits.push({
        ranking: result.dataValues.searchRanking,
        context: this.buildResultContext(document, query),
        document,
      });
    }

    return {
      results: hits,
      totalCount: count,
    };
  }

  /**
   * Convert a user search query into a format that can be used by Postgres
   *
   * @param query The user search query
   * @returns The query formatted for Postgres ts_query
   */
  public static webSearchQuery(query: string): string {
    // limit length of search queries as we're using regex against untrusted input
    let limitedQuery = this.escapeQuery(query.slice(0, this.maxQueryLength));

    const quotedSearch =
      limitedQuery.startsWith('"') && limitedQuery.endsWith('"');

    // Replace single quote characters with &.
    const singleQuotes = limitedQuery.matchAll(/'+/g);

    for (const match of singleQuotes) {
      // Only quotes strictly inside the string are replaced. The swap is one
      // character for one character, so later match indexes stay valid.
      if (
        match.index &&
        match.index > 0 &&
        match.index < limitedQuery.length - 1
      ) {
        limitedQuery =
          limitedQuery.substring(0, match.index) +
          "&" +
          limitedQuery.substring(match.index + 1);
      }
    }

    return (
      queryParser()(quotedSearch ? limitedQuery : `${limitedQuery}*`)
        // Remove any trailing join characters
        .replace(/&$/, "")
    );
  }

  /**
   * Escape characters that would otherwise be misinterpreted in a tsquery.
   *
   * @param query The raw query text
   * @returns The query with backslashes and colons escaped
   */
  private static escapeQuery(query: string): string {
    return (
      query
        // replace "\" with escaped "\\" because sequelize.escape doesn't do it
        // see: https://github.com/sequelize/sequelize/issues/2950
        .replace(/\\/g, "\\\\")
        // replace ":" with escaped "\:" because it's a reserved character in tsquery
        // see: https://github.com/outline/outline/issues/6542
        .replace(/:/g, "\\:")
    );
  }

  /**
   * Remove stop words from a space separated query. Matching is exact and
   * case-sensitive against the lowercase stop word list.
   *
   * @param query The user search query
   * @returns The query without stop words, still separated by single spaces
   */
  private static removeStopWords(query: string): string {
    return query
      .split(" ")
      .filter((word) => !this.stopWords.has(word))
      .join(" ");
  }
}