From 21537b069b2a4054db8f13faf1e5130181d381a0 Mon Sep 17 00:00:00 2001 From: Tom Moor Date: Sun, 21 Apr 2024 11:51:52 -0400 Subject: [PATCH] Special-case searching for urls as these are not indexed in whole by postgres. closes OLN-276 --- server/models/helpers/SearchHelper.ts | 41 ++++++++++++++++++--------- shared/utils/urls.ts | 10 +++++++ 2 files changed, 37 insertions(+), 14 deletions(-) diff --git a/server/models/helpers/SearchHelper.ts b/server/models/helpers/SearchHelper.ts index f979a043a..ca5f3a32e 100644 --- a/server/models/helpers/SearchHelper.ts +++ b/server/models/helpers/SearchHelper.ts @@ -6,6 +6,7 @@ import map from "lodash/map"; import queryParser from "pg-tsquery"; import { Op, Sequelize, WhereOptions } from "sequelize"; import { DateFilter, StatusFilter } from "@shared/types"; +import { getUrls } from "@shared/utils/urls"; import Collection from "@server/models/Collection"; import Document from "@server/models/Document"; import Share from "@server/models/Share"; @@ -400,39 +401,51 @@ export default class SearchHelper { } if (query) { + // find words that look like urls, these should be treated separately as the postgres full-text + // index will generally not match them. + const likelyUrls = getUrls(query); + + // remove likely urls, and escape the rest of the query. const limitedQuery = this.escapeQuery( - query.slice(0, this.maxQueryLength) + likelyUrls + .reduce((q, url) => q.replace(url, ""), query) + .slice(0, this.maxQueryLength) + .trim() ); // Extract quoted queries and add them to the where clause, up to a maximum of 3 total. 
- const quotedQueries = Array.from( - limitedQuery.matchAll(/"([^"]*)"/g) - ).slice(0, 3); + const quotedQueries = Array.from(limitedQuery.matchAll(/"([^"]*)"/g)).map( + (match) => match[1] + ); - for (const match of quotedQueries) { + const iLikeQueries = [...quotedQueries, ...likelyUrls].slice(0, 3); + + for (const match of iLikeQueries) { where[Op.and].push({ [Op.or]: [ { title: { - [Op.iLike]: `%${match[1]}%`, + [Op.iLike]: `%${match}%`, }, }, { text: { - [Op.iLike]: `%${match[1]}%`, + [Op.iLike]: `%${match}%`, }, }, ], }); } - where[Op.and].push( - Sequelize.fn( - `"searchVector" @@ to_tsquery`, - "english", - Sequelize.literal(":query") - ) - ); + if (limitedQuery || iLikeQueries.length === 0) { + where[Op.and].push( + Sequelize.fn( + `"searchVector" @@ to_tsquery`, + "english", + Sequelize.literal(":query") + ) + ); + } } return where; diff --git a/shared/utils/urls.ts b/shared/utils/urls.ts index 4409eaabd..10f73b77d 100644 --- a/shared/utils/urls.ts +++ b/shared/utils/urls.ts @@ -154,3 +154,13 @@ export function urlRegex(url: string | null | undefined): RegExp | undefined { return new RegExp(escapeRegExp(`${urlObj.protocol}//${urlObj.host}`)); } + +/** + * Extracts LIKELY urls from the given text by matching an http or https scheme prefix (case-insensitive) followed by a run of non-whitespace characters. Matches are neither validated nor trimmed, so punctuation or a closing quote directly after a url is kept as part of it. + * + * @param text The text to extract urls from. + * @returns An array of likely urls in order of appearance, or an empty array when none are found. + */ +export function getUrls(text: string) { + return Array.from(text.match(/(?:https?):\/\/[^\s]+/gi) || []); +}