From d22b44dcffeca97ba78a012c0706c6512d39c09d Mon Sep 17 00:00:00 2001 From: Tom Moor Date: Sun, 27 Nov 2022 10:46:54 -0500 Subject: [PATCH] Further improved search matches --- package.json | 1 + server/models/helpers/SearchHelper.ts | 47 +++++++++---- server/routes/api/documents/documents.test.ts | 68 ++++++++++++++----- yarn.lock | 5 ++ 4 files changed, 92 insertions(+), 29 deletions(-) diff --git a/package.json b/package.json index 62af1a0dd..e2468930b 100644 --- a/package.json +++ b/package.json @@ -150,6 +150,7 @@ "passport-slack-oauth2": "^1.1.1", "pg": "^8.5.1", "pg-hstore": "^2.3.4", + "pg-tsquery": "^8.4.0", "polished": "^3.7.2", "prosemirror-commands": "1.2.2", "prosemirror-dropcursor": "^1.4.0", diff --git a/server/models/helpers/SearchHelper.ts b/server/models/helpers/SearchHelper.ts index 96ed98b9f..c72ecf3b4 100644 --- a/server/models/helpers/SearchHelper.ts +++ b/server/models/helpers/SearchHelper.ts @@ -1,6 +1,7 @@ import removeMarkdown from "@tommoor/remove-markdown"; import invariant from "invariant"; import { find, map } from "lodash"; +import queryParser from "pg-tsquery"; import { Op, QueryTypes } from "sequelize"; import { DateFilter } from "@shared/types"; import unescape from "@shared/utils/unescape"; @@ -54,12 +55,16 @@ type Results = { }; export default class SearchHelper { + /** + * The maximum length of a search query. + */ + public static maxQueryLength = 1000; + public static async searchForTeam( team: Team, query: string, options: SearchOptions = {} ): Promise { - const wildcardQuery = `${this.escapeQuery(query)}:*`; const { snippetMinWords = 20, snippetMaxWords = 30, @@ -103,7 +108,7 @@ export default class SearchHelper { // Build the SQL query to get result documentIds, ranking, and search term context const whereClause = ` - "searchVector" @@ websearch_to_tsquery('english', :query) AND + "searchVector" @@ to_tsquery('english', :query) AND "teamId" = :teamId AND "collectionId" IN(:collectionIds) AND ${documentClause} @@ -113,8 +118,8 @@ export default class SearchHelper { const selectSql = ` SELECT id, - ts_rank(documents."searchVector", websearch_to_tsquery('english', :query)) as "searchRanking", - ts_headline('english', "text", websearch_to_tsquery('english', :query), :headlineOptions) as "searchContext" + ts_rank(documents."searchVector", to_tsquery('english', :query)) as "searchRanking", + ts_headline('english', "text", to_tsquery('english', :query), :headlineOptions) as "searchContext" FROM documents WHERE ${whereClause} ORDER BY @@ -130,7 +135,7 @@ export default class SearchHelper { `; const queryReplacements = { teamId: team.id, - query: wildcardQuery, + query: this.webSearchQuery(query), collectionIds, documentIds, headlineOptions: `MaxFragments=1, MinWords=${snippetMinWords}, MaxWords=${snippetMaxWords}`, @@ -176,8 +181,6 @@ export default class SearchHelper { limit = 15, offset = 0, } = options; - const wildcardQuery = `${SearchHelper.escapeQuery(query)}:*`; - // Ensure we're filtering by the users accessible collections. If // collectionId is passed as an option it is assumed that the authorization // has already been done in the router @@ -197,7 +200,7 @@ export default class SearchHelper { // Build the SQL query to get documentIds, ranking, and search term context const whereClause = ` - "searchVector" @@ websearch_to_tsquery('english', :query) AND + "searchVector" @@ to_tsquery('english', :query) AND "teamId" = :teamId AND ${ collectionIds.length @@ -226,8 +229,8 @@ export default class SearchHelper { const selectSql = ` SELECT id, - ts_rank(documents."searchVector", websearch_to_tsquery('english', :query)) as "searchRanking", - ts_headline('english', "text", websearch_to_tsquery('english', :query), :headlineOptions) as "searchContext" + ts_rank(documents."searchVector", to_tsquery('english', :query)) as "searchRanking", + ts_headline('english', "text", to_tsquery('english', :query), :headlineOptions) as "searchContext" FROM documents WHERE ${whereClause} ORDER BY @@ -245,7 +248,7 @@ export default class SearchHelper { teamId: user.teamId, userId: user.id, collaboratorIds: options.collaboratorIds, - query: wildcardQuery, + query: this.webSearchQuery(query), collectionIds, dateFilter, headlineOptions: `MaxFragments=1, MinWords=${snippetMinWords}, MaxWords=${snippetMaxWords}`, @@ -302,9 +305,29 @@ export default class SearchHelper { }; } + /** + * Convert a user search query into a format that can be used by Postgres + * + * @param query The user search query + * @returns The query formatted for Postgres ts_query + */ + private static webSearchQuery(query: string): string { + // limit length of search queries as we're using regex against untrusted input + const limitedQuery = this.escapeQuery(query.slice(0, this.maxQueryLength)); + + // if the search term is one unquoted word then allow partial matches automatically + const queryWordCount = limitedQuery.split(" ").length; + const singleUnquotedSearch = + queryWordCount === 1 && !limitedQuery.startsWith('"'); + + return queryParser({ singleQuoteReplacement: "&" })( + singleUnquotedSearch ? `${limitedQuery}*` : limitedQuery + ); + } + private static escapeQuery(query: string): string { // replace "\" with escaped "\\" because sequelize.escape doesn't do it // https://github.com/sequelize/sequelize/issues/2950 - return sequelize.escape(query).replace(/\\/g, "\\\\"); + return query.replace(/\\/g, "\\\\"); } } diff --git a/server/routes/api/documents/documents.test.ts b/server/routes/api/documents/documents.test.ts index 0a2b79618..a9ed8ac7f 100644 --- a/server/routes/api/documents/documents.test.ts +++ b/server/routes/api/documents/documents.test.ts @@ -1243,7 +1243,7 @@ describe("#documents.search", () => { const res = await server.post("/api/documents.search", { body: { token: user.getJwtToken(), - query: "sear &", + query: "sear", }, }); const body = await res.json(); @@ -1254,24 +1254,58 @@ describe("#documents.search", () => { expect(body.data[2].document.id).toEqual(thirdResult.id); }); - it("should strip junk from search term", async () => { - const user = await buildUser(); - const firstResult = await buildDocument({ - title: "search term", - text: "this is some random text of the document body", - userId: user.id, - teamId: user.teamId, + describe("search operators", () => { + it("negative search operator", async () => { + const { user } = await seed(); + await buildDocument({ + title: "search term", + text: "random text", + userId: user.id, + teamId: user.teamId, + }); + const firstResult = await buildDocument({ + title: "title text", + text: "search term", + userId: user.id, + teamId: user.teamId, + }); + const res = await server.post("/api/documents.search", { + body: { + token: user.getJwtToken(), + query: `search -random`, + }, + }); + const body = await res.json(); + expect(res.status).toEqual(200); + expect(body.data.length).toEqual(1); + expect(body.data[0].document.id).toEqual(firstResult.id); }); - const res = await server.post("/api/documents.search", { - body: { - token: user.getJwtToken(), - query: "rando &\\;:()", - }, + + it("quoted search operator", async () => { + const { user } = await seed(); + await buildDocument({ + title: "document one", + text: "term search", + userId: user.id, + teamId: user.teamId, + }); + const firstResult = await buildDocument({ + title: "search term", + text: "content", + userId: user.id, + teamId: user.teamId, + }); + const res = await server.post("/api/documents.search", { + body: { + token: user.getJwtToken(), + query: `"search term"`, + }, + }); + const body = await res.json(); + expect(res.status).toEqual(200); + expect(body.data.length).toEqual(1); + expect(body.data[0].document.id).toEqual(firstResult.id); }); - const body = await res.json(); - expect(res.status).toEqual(200); - expect(body.data.length).toEqual(1); - expect(body.data[0].document.id).toEqual(firstResult.id); }); it("should not return draft documents", async () => { diff --git a/yarn.lock b/yarn.lock index 1587608e9..0c960f8ab 100644 --- a/yarn.lock +++ b/yarn.lock @@ -12014,6 +12014,11 @@ pg-protocol@^1.4.0: resolved "https://registry.yarnpkg.com/pg-protocol/-/pg-protocol-1.4.0.tgz#43a71a92f6fe3ac559952555aa3335c8cb4908be" integrity sha512-El+aXWcwG/8wuFICMQjM5ZSAm6OWiJicFdNYo+VY3QP+8vI4SvLIWVe51PppTzMhikUJR+PsyIFKqfdXPz/yxA== +pg-tsquery@^8.4.0: + version "8.4.0" + resolved "https://registry.yarnpkg.com/pg-tsquery/-/pg-tsquery-8.4.0.tgz#411293cce23ca1eeb8c29109af9fadf28f20a7d9" + integrity sha512-m0jIxUVwLKSdmOAlqtlbo6K+EFIOZ/hyOMnoe8DmYFqEmOmvafIjGQFmcPP+z5MWd/p7ExxoKNIL31gmM+CwxQ== + pg-types@^2.1.0: version "2.2.0" resolved "https://registry.yarnpkg.com/pg-types/-/pg-types-2.2.0.tgz#2d0250d636454f7cfa3b6ae0382fdfa8063254a3"