fix: Queue health monitor should only run on worker processes (#6228)
@@ -18,6 +18,7 @@ import {
   IsBoolean,
   MaxLength,
 } from "class-validator";
+import uniq from "lodash/uniq";
 import { languages } from "@shared/i18n";
 import { CannotUseWithout } from "@server/utils/validators";
 import Deprecated from "./models/decorators/Deprecated";
@@ -226,16 +227,20 @@ export class Environment {
   public DEFAULT_LANGUAGE = process.env.DEFAULT_LANGUAGE ?? "en_US";

   /**
-   * A comma separated list of which services should be enabled on this
-   * instance – defaults to all.
+   * A comma separated list of which services should be enabled on this instance – defaults to all.
    *
    * If a services flag is passed it takes priority over the environment variable
    * for example: --services=web,worker
    */
-  public SERVICES =
-    getArg("services") ??
-    process.env.SERVICES ??
-    "collaboration,websockets,worker,web";
+  public SERVICES = uniq(
+    (
+      getArg("services") ??
+      process.env.SERVICES ??
+      "collaboration,websockets,worker,web"
+    )
+      .split(",")
+      .map((service) => service.toLowerCase().trim())
+  );

   /**
    * Auto-redirect to https in production. The default is true but you may set
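The effect of the new parsing is easiest to see in isolation. A minimal sketch of the normalization `SERVICES` now goes through, assuming only lodash's `uniq`: the raw value is split on commas, lower-cased, trimmed, and de-duplicated, so downstream code always receives a clean string array instead of a raw comma separated string.

```ts
import uniq from "lodash/uniq";

// Standalone sketch of the normalization applied to SERVICES above.
const parseServices = (raw: string): string[] =>
  uniq(raw.split(",").map((service) => service.toLowerCase().trim()));

parseServices("Web, worker,WORKER"); // => ["web", "worker"]
```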
@@ -10,7 +10,6 @@ import Koa from "koa";
 import helmet from "koa-helmet";
 import logger from "koa-logger";
 import Router from "koa-router";
-import uniq from "lodash/uniq";
 import { AddressInfo } from "net";
 import stoppable from "stoppable";
 import throng from "throng";
@@ -27,17 +26,11 @@ import { checkConnection, sequelize } from "./storage/database";
 import RedisAdapter from "./storage/redis";
 import Metrics from "./logging/Metrics";

-// The default is to run all services to make development and OSS installations
-// easier to deal with. Separate services are only needed at scale.
-const serviceNames = uniq(
-  env.SERVICES.split(",").map((service) => service.trim())
-);
-
 // The number of processes to run, defaults to the number of CPU's available
 // for the web service, and 1 for collaboration during the beta period.
 let processCount = env.WEB_CONCURRENCY;

-if (serviceNames.includes("collaboration")) {
+if (env.SERVICES.includes("collaboration")) {
   if (processCount !== 1) {
     Logger.info(
       "lifecycle",
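Worth noting: `includes` means something different on the two shapes of `SERVICES`, which is part of why centralizing the parsing in env is the safer design. Illustrative only:

```ts
// String.prototype.includes does substring matching, so a raw SERVICES string
// can produce false positives; the array form checks whole elements.
const asString = "collaboration,websockets,worker";
const asArray = ["collaboration", "websockets", "worker"];

asString.includes("web"); // true – matched inside "websockets"
asArray.includes("web");  // false – "web" is not an element
```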
@@ -114,14 +107,14 @@ async function start(id: number, disconnect: () => void) {
   app.use(router.routes());

   // loop through requested services at startup
-  for (const name of serviceNames) {
+  for (const name of env.SERVICES) {
     if (!Object.keys(services).includes(name)) {
       throw new Error(`Unknown service ${name}`);
     }

     Logger.info("lifecycle", `Starting ${name} service`);
     const init = services[name];
-    await init(app, server, serviceNames);
+    await init(app, server, env.SERVICES);
   }

   server.on("error", (err) => {
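For context, each entry in `services` is an init function invoked by this loop. A hypothetical sketch of the shape it implies (the real signatures live in the server/services modules, and the `Server` type here is an assumption):

```ts
import Koa from "koa";
import { Server } from "http";

// Hypothetical type for illustration only – each service module exports an
// init with roughly this shape, receiving the full list of enabled services.
type ServiceInit = (
  app: Koa,
  server: Server,
  serviceNames: string[]
) => Promise<void>;
```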
server/queues/HealthMonitor.ts (new file, 40 lines)
@@ -0,0 +1,40 @@
+import { Queue } from "bull";
+import { Second } from "@shared/utils/time";
+import Logger from "@server/logging/Logger";
+
+/* eslint-disable @typescript-eslint/no-misused-promises */
+export default class HealthMonitor {
+  /**
+   * Starts a health monitor for the given queue. If the queue stops processing jobs then the
+   * process is exited.
+   *
+   * @param queue The queue to monitor
+   */
+  public static start(queue: Queue) {
+    let processedJobsSinceCheck = 0;
+
+    queue.on("active", () => {
+      processedJobsSinceCheck += 1;
+    });
+
+    setInterval(async () => {
+      if (processedJobsSinceCheck > 0) {
+        processedJobsSinceCheck = 0;
+        return;
+      }
+
+      processedJobsSinceCheck = 0;
+      const waiting = await queue.getWaitingCount();
+      if (waiting > 50) {
+        Logger.fatal(
+          "Queue has stopped processing jobs",
+          new Error(`Jobs are waiting in the ${queue.name} queue`),
+          {
+            queue: queue.name,
+            waiting,
+          }
+        );
+      }
+    }, 30 * Second);
+  }
+}
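The check is deliberately two-stage: each 30-second tick first asks whether any job went active since the last tick, and only if none did does a backlog of more than 50 waiting jobs count as a stalled queue, so an idle-but-healthy queue never trips it. A usage sketch, assuming a Bull queue constructed outside the codebase's own `createQueue` (the queue name and Redis URL are placeholders):

```ts
import Queue from "bull";
import HealthMonitor from "@server/queues/HealthMonitor";

// Placeholder queue for illustration; in this codebase queues come from createQueue.
const exampleQueue = new Queue(
  "example",
  process.env.REDIS_URL ?? "redis://localhost:6379"
);

// Attach the monitor; a stalled queue with a growing backlog logs a fatal error.
HealthMonitor.start(exampleQueue);
```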
@@ -1,4 +1,4 @@
-import { createQueue } from "@server/utils/queue";
+import { createQueue } from "@server/queues/queue";

 export const globalEventQueue = createQueue("globalEvents", {
   attempts: 5,
@@ -3,17 +3,15 @@ import Queue from "bull";
 import snakeCase from "lodash/snakeCase";
 import { Second } from "@shared/utils/time";
 import env from "@server/env";
-import Logger from "@server/logging/Logger";
 import Metrics from "@server/logging/Metrics";
 import Redis from "@server/storage/redis";
-import ShutdownHelper, { ShutdownOrder } from "./ShutdownHelper";
+import ShutdownHelper, { ShutdownOrder } from "@server/utils/ShutdownHelper";

 export function createQueue(
   name: string,
   defaultJobOptions?: Partial<Queue.JobOptions>
 ) {
   const prefix = `queue.${snakeCase(name)}`;
-  let processedJobsSinceCheck = 0;

   // Notes on reusing Redis connections for Bull:
   // https://github.com/OptimalBits/bull/blob/b6d530f72a774be0fd4936ddb4ad9df3b183f4b6/PATTERNS.md#reusing-redis-connections
@@ -54,31 +52,12 @@ export function createQueue(
   queue.on("failed", () => {
     Metrics.increment(`${prefix}.jobs.failed`);
   });
-  queue.on("active", () => {
-    processedJobsSinceCheck += 1;
-  });

   if (env.ENVIRONMENT !== "test") {
     setInterval(async () => {
       Metrics.gauge(`${prefix}.count`, await queue.count());
       Metrics.gauge(`${prefix}.delayed_count`, await queue.getDelayedCount());
     }, 5 * Second);
-
-    setInterval(async () => {
-      if (processedJobsSinceCheck > 0) {
-        processedJobsSinceCheck = 0;
-        return;
-      }
-
-      processedJobsSinceCheck = 0;
-      const waiting = await queue.getWaitingCount();
-      if (waiting > 50) {
-        Logger.fatal(
-          "Queue has stopped processing jobs",
-          new Error(`${waiting} jobs are waiting in the ${name} queue`)
-        );
-      }
-    }, 30 * Second);
   }

   ShutdownHelper.add(name, ShutdownOrder.normal, async () => {
@@ -1,6 +1,7 @@
 import Logger from "@server/logging/Logger";
 import { setResource } from "@server/logging/tracer";
 import { traceFunction } from "@server/logging/tracing";
+import HealthMonitor from "@server/queues/HealthMonitor";
 import { initI18n } from "@server/utils/i18n";
 import {
   globalEventQueue,
@@ -152,4 +153,8 @@ export default function init() {
     .catch((err) => {
       Logger.fatal("Error starting taskQueue", err);
     });
+
+  HealthMonitor.start(globalEventQueue);
+  HealthMonitor.start(processorEventQueue);
+  HealthMonitor.start(websocketQueue);
 }
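This is the heart of the fix: the watchdog moves out of `createQueue`, which every process calls (web and collaboration processes create queue handles just to enqueue jobs), and into the worker service's init. Since the startup loop shown earlier only invokes an init when its service is listed in SERVICES, the health check can no longer fire on non-worker processes that never consume jobs. A condensed sketch of the resulting gating, with the loop shape taken from the earlier hunk:

```ts
// Sketch only: the entry point's startup loop is what scopes the monitors.
// On a process started with --services=web the worker init never runs, so
// no HealthMonitor interval is created there.
for (const name of env.SERVICES) {
  const init = services[name];
  await init(app, server, env.SERVICES); // worker's init calls HealthMonitor.start
}
```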
@@ -42,7 +42,7 @@ export default class RedisAdapter extends Redis {
    */
   const connectionNamePrefix = env.isDevelopment ? process.pid : "outline";
   const connectionName =
-    `${connectionNamePrefix}:${env.SERVICES.replace(/,/g, "-")}` +
+    `${connectionNamePrefix}:${env.SERVICES.join("-")}` +
     (connectionNameSuffix ? `:${connectionNameSuffix}` : "");

   if (!url || !url.startsWith("ioredis://")) {
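Because SERVICES is now an array, the string `replace` becomes a `join`; the resulting Redis connection name is unchanged. Illustrative only:

```ts
// Old: "collaboration,websockets,worker,web".replace(/,/g, "-")
// New: the array form produces the identical string.
["collaboration", "websockets", "worker", "web"].join("-");
// => "collaboration-websockets-worker-web"
```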