chore: Refactor data import (#3434)

* Complete refactor of import

* feat: Notion data import (#3442)
Tom Moor
2022-04-23 10:07:35 -07:00
committed by GitHub
parent bdcfaae025
commit 33ce49cc33
45 changed files with 2217 additions and 1066 deletions
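Note: the diffs below replace bare string literals ("creating", "export", and so on) with typed enums imported from @server/models/FileOperation. The enum definitions themselves are not part of this diff; the following is a minimal sketch of what that module plausibly exports, with member names taken from the imports below and string values inferred from the literals they replace (the format values are assumptions):

// Sketch only, not part of the commit: values inferred from replaced literals.
export enum FileOperationType {
  Import = "import",
  Export = "export",
}

export enum FileOperationFormat {
  MarkdownZip = "outline-markdown", // assumed value
  Notion = "notion", // assumed value
}

export enum FileOperationState {
  Creating = "creating",
  Uploading = "uploading",
  Complete = "complete",
  Error = "error",
  Expired = "expired",
}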

View File

@@ -4,6 +4,7 @@ import ExportFailureEmail from "@server/emails/templates/ExportFailureEmail";
import ExportSuccessEmail from "@server/emails/templates/ExportSuccessEmail";
import Logger from "@server/logging/logger";
import { FileOperation, Collection, Event, Team, User } from "@server/models";
+import { FileOperationState } from "@server/models/FileOperation";
import { Event as TEvent } from "@server/types";
import { uploadToS3FromBuffer } from "@server/utils/s3";
import { archiveCollections } from "@server/utils/zip";
@@ -41,7 +42,7 @@ export default class ExportsProcessor extends BaseProcessor {
});
this.updateFileOperation(fileOperation, actorId, teamId, {
state: "creating",
state: FileOperationState.Creating,
});
// heavy lifting of creating the zip file
Logger.info(
@@ -50,7 +51,7 @@ export default class ExportsProcessor extends BaseProcessor {
);
const filePath = await archiveCollections(collections);
let url;
-let state: any = "creating";
+let state = FileOperationState.Creating;
try {
// @ts-expect-error ts-migrate(2769) FIXME: No overload matches this call.
@@ -58,7 +59,7 @@ export default class ExportsProcessor extends BaseProcessor {
// @ts-expect-error ts-migrate(2769) FIXME: No overload matches this call.
const stat = await fs.promises.stat(filePath);
this.updateFileOperation(fileOperation, actorId, teamId, {
state: "uploading",
state: FileOperationState.Uploading,
size: stat.size,
});
Logger.info(
@@ -75,12 +76,12 @@ export default class ExportsProcessor extends BaseProcessor {
"processor",
`Upload complete for file operation ${fileOperation.id}`
);
state = "complete";
state = FileOperationState.Complete;
} catch (error) {
Logger.error("Error exporting collection data", error, {
fileOperationId: fileOperation.id,
});
state = "error";
state = FileOperationState.Error;
url = undefined;
} finally {
this.updateFileOperation(fileOperation, actorId, teamId, {
@@ -88,7 +89,7 @@ export default class ExportsProcessor extends BaseProcessor {
url,
});
if (state === "error") {
if (state === FileOperationState.Error) {
await ExportFailureEmail.schedule({
to: user.email,
teamUrl: team.url,

View File

@@ -0,0 +1,40 @@
import invariant from "invariant";
import { FileOperation } from "@server/models";
import {
FileOperationFormat,
FileOperationType,
} from "@server/models/FileOperation";
import { Event as TEvent, FileOperationEvent } from "@server/types";
import ImportMarkdownZipTask from "../tasks/ImportMarkdownZipTask";
import ImportNotionTask from "../tasks/ImportNotionTask";
import BaseProcessor from "./BaseProcessor";
export default class FileOperationsProcessor extends BaseProcessor {
static applicableEvents: TEvent["name"][] = ["fileOperations.create"];
async perform(event: FileOperationEvent) {
if (event.name !== "fileOperations.create") {
return;
}
const fileOperation = await FileOperation.findByPk(event.modelId);
invariant(fileOperation, "fileOperation not found");
// map file operation type and format to the appropriate task
if (fileOperation.type === FileOperationType.Import) {
switch (fileOperation.format) {
case FileOperationFormat.MarkdownZip:
await ImportMarkdownZipTask.schedule({
fileOperationId: event.modelId,
});
break;
case FileOperationFormat.Notion:
await ImportNotionTask.schedule({
fileOperationId: event.modelId,
});
break;
default:
}
}
}
}
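For context, this processor is driven entirely by fileOperations.create events. A hedged sketch of how an import might be kicked off upstream, modeled on the deleted ImportsProcessor further down (the surrounding API handler and the format value are assumptions):

// Sketch: create the FileOperation record, then emit the event that
// FileOperationsProcessor reacts to. Field names follow the models used
// elsewhere in this commit.
const fileOperation = await FileOperation.create({
  type: FileOperationType.Import,
  format: FileOperationFormat.Notion,
  state: FileOperationState.Creating,
  size: attachment.size,
  key: attachment.key,
  userId: user.id,
  teamId: user.teamId,
});

await Event.schedule({
  name: "fileOperations.create",
  modelId: fileOperation.id,
  teamId: user.teamId,
  actorId: user.id,
});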

View File

@@ -1,79 +0,0 @@
import fs from "fs";
import os from "os";
import File from "formidable/lib/file";
import invariant from "invariant";
import collectionImporter from "@server/commands/collectionImporter";
import { Event, FileOperation, Attachment, User } from "@server/models";
import { Event as TEvent } from "@server/types";
import BaseProcessor from "./BaseProcessor";
export default class ImportsProcessor extends BaseProcessor {
static applicableEvents: TEvent["name"][] = ["collections.import"];
async perform(event: TEvent) {
switch (event.name) {
case "collections.import": {
let state, error;
const { type } = event.data;
const attachment = await Attachment.findByPk(event.modelId);
invariant(attachment, "attachment not found");
const user = await User.findByPk(event.actorId);
invariant(user, "user not found");
const fileOperation = await FileOperation.create({
type: "import",
state: "creating",
size: attachment.size,
key: attachment.key,
userId: user.id,
teamId: user.teamId,
});
await Event.schedule({
name: "fileOperations.create",
modelId: fileOperation.id,
teamId: user.teamId,
actorId: user.id,
});
try {
const buffer = await attachment.buffer;
const tmpDir = os.tmpdir();
const tmpFilePath = `${tmpDir}/upload-${event.modelId}`;
await fs.promises.writeFile(tmpFilePath, buffer as Uint8Array);
const file = new File({
name: attachment.name,
type: attachment.contentType,
path: tmpFilePath,
});
await collectionImporter({
file,
user,
type,
ip: event.ip,
});
await attachment.destroy();
state = "complete";
} catch (err) {
state = "error";
error = err.message;
} finally {
await fileOperation.update({ state, error });
await Event.schedule({
name: "fileOperations.update",
modelId: fileOperation.id,
teamId: user.teamId,
actorId: user.id,
});
}
return;
}
default:
}
}
}

View File

@@ -33,7 +33,7 @@ export default abstract class BaseTask<T> {
* @param props Properties to be used by the task
* @returns A promise that resolves once the task has completed.
*/
-public abstract perform(props: T): Promise<void>;
+public abstract perform(props: T): Promise<any>;
/**
* Job options such as priority and retry strategy, as defined by Bull.
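The widened return type matters because tasks can now hand a result back to callers; the import task tests below assert on the value returned from perform(). A minimal subclass sketch under that assumption (DemoTask and its props are hypothetical, and BaseTask is assumed to supply default job options):

// Sketch: a task whose perform() returns a value for callers and tests.
type DemoProps = {
  fileOperationId: string;
};

class DemoTask extends BaseTask<DemoProps> {
  public async perform(props: DemoProps) {
    // ...do the actual work here...
    return { fileOperationId: props.fileOperationId, ok: true };
  }
}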

View File

@@ -1,5 +1,9 @@
import { subDays } from "date-fns";
import { FileOperation } from "@server/models";
+import {
+FileOperationState,
+FileOperationType,
+} from "@server/models/FileOperation";
import { buildFileOperation } from "@server/test/factories";
import { flushdb } from "@server/test/support";
import CleanupExpiredFileOperationsTask from "./CleanupExpiredFileOperationsTask";
@@ -9,13 +13,13 @@ beforeEach(() => flushdb());
describe("CleanupExpiredFileOperationsTask", () => {
it("should expire exports older than 30 days ago", async () => {
await buildFileOperation({
type: "export",
state: "complete",
type: FileOperationType.Export,
state: FileOperationState.Complete,
createdAt: subDays(new Date(), 30),
});
await buildFileOperation({
type: "export",
state: "complete",
type: FileOperationType.Export,
state: FileOperationState.Complete,
});
/* This is a test helper that creates a new task and runs it. */
@@ -24,8 +28,8 @@ describe("CleanupExpiredFileOperationsTask", () => {
const data = await FileOperation.count({
where: {
type: "export",
state: "expired",
type: FileOperationType.Export,
state: FileOperationState.Expired,
},
});
expect(data).toEqual(1);
@@ -33,13 +37,13 @@ describe("CleanupExpiredFileOperationsTask", () => {
it("should not expire exports made less than 30 days ago", async () => {
await buildFileOperation({
type: "export",
state: "complete",
type: FileOperationType.Export,
state: FileOperationState.Complete,
createdAt: subDays(new Date(), 29),
});
await buildFileOperation({
type: "export",
state: "complete",
type: FileOperationType.Export,
state: FileOperationState.Complete,
});
const task = new CleanupExpiredFileOperationsTask();
@@ -47,8 +51,8 @@ describe("CleanupExpiredFileOperationsTask", () => {
const data = await FileOperation.count({
where: {
type: "export",
state: "expired",
type: FileOperationType.Export,
state: FileOperationState.Expired,
},
});
expect(data).toEqual(0);

View File

@@ -2,6 +2,10 @@ import { subDays } from "date-fns";
import { Op } from "sequelize";
import Logger from "@server/logging/logger";
import { FileOperation } from "@server/models";
+import {
+FileOperationState,
+FileOperationType,
+} from "@server/models/FileOperation";
import BaseTask, { TaskPriority } from "./BaseTask";
type Props = {
@@ -13,12 +17,12 @@ export default class CleanupExpiredFileOperationsTask extends BaseTask<Props> {
Logger.info("task", `Expiring export file operations older than 30 days…`);
const fileOperations = await FileOperation.unscoped().findAll({
where: {
type: "export",
type: FileOperationType.Export,
createdAt: {
[Op.lt]: subDays(new Date(), 30),
},
state: {
[Op.ne]: "expired",
[Op.ne]: FileOperationState.Expired,
},
},
limit,

View File

@@ -0,0 +1,87 @@
import fs from "fs";
import path from "path";
import { FileOperation } from "@server/models";
import { buildFileOperation } from "@server/test/factories";
import { flushdb } from "@server/test/support";
import ImportMarkdownZipTask from "./ImportMarkdownZipTask";
beforeEach(() => flushdb());
describe("ImportMarkdownZipTask", () => {
it("should import the documents, attachments", async () => {
const fileOperation = await buildFileOperation();
Object.defineProperty(fileOperation, "buffer", {
get() {
return fs.readFileSync(
path.resolve(__dirname, "..", "..", "test", "fixtures", "outline.zip")
);
},
});
jest.spyOn(FileOperation, "findByPk").mockResolvedValue(fileOperation);
const props = {
fileOperationId: fileOperation.id,
};
const task = new ImportMarkdownZipTask();
const response = await task.perform(props);
expect(response.collections.size).toEqual(1);
expect(response.documents.size).toEqual(8);
expect(response.attachments.size).toEqual(6);
});
it("should throw an error with corrupt zip", async () => {
const fileOperation = await buildFileOperation();
Object.defineProperty(fileOperation, "buffer", {
get() {
return fs.readFileSync(
path.resolve(__dirname, "..", "..", "test", "fixtures", "corrupt.zip")
);
},
});
jest.spyOn(FileOperation, "findByPk").mockResolvedValue(fileOperation);
const props = {
fileOperationId: fileOperation.id,
};
let error;
try {
const task = new ImportMarkdownZipTask();
await task.perform(props);
} catch (err) {
error = err;
}
expect(error && error.message).toBeTruthy();
});
it("should throw an error with empty collection in zip", async () => {
const fileOperation = await buildFileOperation();
Object.defineProperty(fileOperation, "buffer", {
get() {
return fs.readFileSync(
path.resolve(__dirname, "..", "..", "test", "fixtures", "empty.zip")
);
},
});
jest.spyOn(FileOperation, "findByPk").mockResolvedValue(fileOperation);
const props = {
fileOperationId: fileOperation.id,
};
let error;
try {
const task = new ImportMarkdownZipTask();
await task.perform(props);
} catch (err) {
error = err;
}
expect(error && error.message).toBe(
"Uploaded file does not contain any valid documents"
);
});
});

View File

@@ -0,0 +1,171 @@
import JSZip from "jszip";
import mime from "mime-types";
import { v4 as uuidv4 } from "uuid";
import documentImporter from "@server/commands/documentImporter";
import Logger from "@server/logging/logger";
import { FileOperation, User } from "@server/models";
import { zipAsFileTree, FileTreeNode } from "@server/utils/zip";
import ImportTask, { StructuredImportData } from "./ImportTask";
export default class ImportMarkdownZipTask extends ImportTask {
public async parseData(
buffer: Buffer,
fileOperation: FileOperation
): Promise<StructuredImportData> {
const zip = await JSZip.loadAsync(buffer);
const tree = zipAsFileTree(zip);
return this.parseFileTree({ fileOperation, zip, tree });
}
/**
* Converts the file structure from zipAsFileTree into documents,
* collections, and attachments.
*
* @param tree An array of FileTreeNode representing root files in the zip
* @returns A StructuredImportData object
*/
private async parseFileTree({
zip,
tree,
fileOperation,
}: {
zip: JSZip;
fileOperation: FileOperation;
tree: FileTreeNode[];
}): Promise<StructuredImportData> {
const user = await User.findByPk(fileOperation.userId);
const output: StructuredImportData = {
collections: [],
documents: [],
attachments: [],
};
async function parseNodeChildren(
children: FileTreeNode[],
collectionId: string,
parentDocumentId?: string
): Promise<void> {
if (!user) {
throw new Error("User not found");
}
await Promise.all(
children.map(async (child) => {
// special case for folders of attachments
if (
child.name === "uploads" ||
(child.children.length > 0 && child.path.includes("/uploads/"))
) {
return parseNodeChildren(child.children, collectionId);
}
const zipObject = zip.files[child.path];
const id = uuidv4();
// this is an attachment
if (child.path.includes("/uploads/") && child.children.length === 0) {
output.attachments.push({
id,
name: child.name,
path: child.path,
mimeType: mime.lookup(child.path) || "application/octet-stream",
buffer: await zipObject.async("nodebuffer"),
});
return;
}
const { title, text } = await documentImporter({
mimeType: "text/markdown",
fileName: child.name,
content: await zipObject.async("string"),
user,
ip: user.lastActiveIp || undefined,
});
// Default to an empty object so a malformed ZIP comment cannot crash below
let metadata: Record<string, any> = {};
try {
metadata = zipObject.comment ? JSON.parse(zipObject.comment) : {};
} catch (err) {
Logger.debug(
"task",
`ZIP comment found for ${child.name}, but could not be parsed as metadata: ${zipObject.comment}`
);
}
const createdAt = metadata.createdAt
? new Date(metadata.createdAt)
: zipObject.date;
const updatedAt = metadata.updatedAt
? new Date(metadata.updatedAt)
: zipObject.date;
const existingEmptyDocumentIndex = output.documents.findIndex(
(doc) =>
doc.title === title &&
doc.collectionId === collectionId &&
doc.parentDocumentId === parentDocumentId &&
doc.text === ""
);
// When there is a file and a folder with the same name, this handles
// the case by combining the two into one document with nested children
if (existingEmptyDocumentIndex !== -1) {
output.documents[existingEmptyDocumentIndex].text = text;
} else {
output.documents.push({
id,
title,
text,
updatedAt,
createdAt,
collectionId,
parentDocumentId,
path: child.path,
});
}
await parseNodeChildren(child.children, collectionId, id);
})
);
}
// All nodes in the root level should be collections
for (const node of tree) {
if (node.path.endsWith("/")) {
const collectionId = uuidv4();
output.collections.push({
id: collectionId,
name: node.title,
});
await parseNodeChildren(node.children, collectionId);
} else {
Logger.debug("task", `Unhandled file in zip: ${node.path}`, {
fileOperationId: fileOperation.id,
});
}
}
// Check all of the attachments we've created against urls in the text
// and replace them with attachment redirect urls before continuing.
for (const document of output.documents) {
for (const attachment of output.attachments) {
// Pull the collection and subdirectory out of the path name; upload
// folders in an export are relative to the document itself
const normalizedAttachmentPath = attachment.path.replace(
/(.*)uploads\//,
"uploads/"
);
const reference = `<<${attachment.id}>>`;
document.text = document.text
.replace(new RegExp(attachment.path, "g"), reference)
.replace(new RegExp(normalizedAttachmentPath, "g"), reference)
.replace(new RegExp(`/${normalizedAttachmentPath}`, "g"), reference);
}
}
return output;
}
}
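To make the final replacement loop concrete: an attachment stored at "My Collection/uploads/image.png" in the zip is normalized to "uploads/image.png", so absolute and document-relative references in the markdown both collapse to the same <<attachmentId>> token. A standalone sketch with hypothetical paths and ids:

// Sketch of the normalization above, with made-up values.
const attachmentPath = "My Collection/uploads/image.png";
const normalized = attachmentPath.replace(/(.*)uploads\//, "uploads/");
// normalized === "uploads/image.png"

let text = "![diagram](uploads/image.png)";
text = text.replace(new RegExp(normalized, "g"), "<<attachment-id>>");
// text === "![diagram](<<attachment-id>>)"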

View File

@@ -0,0 +1,80 @@
import fs from "fs";
import path from "path";
import { FileOperation } from "@server/models";
import { buildFileOperation } from "@server/test/factories";
import { flushdb } from "@server/test/support";
import ImportNotionTask from "./ImportNotionTask";
beforeEach(() => flushdb());
describe("ImportNotionTask", () => {
it("should import successfully from a Markdown export", async () => {
const fileOperation = await buildFileOperation();
Object.defineProperty(fileOperation, "buffer", {
get() {
return fs.readFileSync(
path.resolve(
__dirname,
"..",
"..",
"test",
"fixtures",
"notion-markdown.zip"
)
);
},
});
jest.spyOn(FileOperation, "findByPk").mockResolvedValue(fileOperation);
const props = {
fileOperationId: fileOperation.id,
};
const task = new ImportNotionTask();
const response = await task.perform(props);
expect(response.collections.size).toEqual(2);
expect(response.documents.size).toEqual(6);
expect(response.attachments.size).toEqual(1);
// Check that the image url was replaced in the text with a redirect
const attachments = Array.from(response.attachments.values());
const documents = Array.from(response.documents.values());
expect(documents[2].text).toContain(attachments[0].redirectUrl);
});
it("should import successfully from a HTML export", async () => {
const fileOperation = await buildFileOperation();
Object.defineProperty(fileOperation, "buffer", {
get() {
return fs.readFileSync(
path.resolve(
__dirname,
"..",
"..",
"test",
"fixtures",
"notion-html.zip"
)
);
},
});
jest.spyOn(FileOperation, "findByPk").mockResolvedValue(fileOperation);
const props = {
fileOperationId: fileOperation.id,
};
const task = new ImportNotionTask();
const response = await task.perform(props);
expect(response.collections.size).toEqual(2);
expect(response.documents.size).toEqual(6);
expect(response.attachments.size).toEqual(4);
// Check that the image url was replaced in the text with a redirect
const attachments = Array.from(response.attachments.values());
const documents = Array.from(response.documents.values());
expect(documents[1].text).toContain(attachments[1].redirectUrl);
});
});

View File

@@ -0,0 +1,301 @@
import path from "path";
import JSZip from "jszip";
import { compact } from "lodash";
import mime from "mime-types";
import { v4 as uuidv4 } from "uuid";
import documentImporter from "@server/commands/documentImporter";
import Logger from "@server/logging/logger";
import { FileOperation, User } from "@server/models";
import { zipAsFileTree, FileTreeNode } from "@server/utils/zip";
import ImportTask, { StructuredImportData } from "./ImportTask";
export default class ImportNotionTask extends ImportTask {
public async parseData(
buffer: Buffer,
fileOperation: FileOperation
): Promise<StructuredImportData> {
const zip = await JSZip.loadAsync(buffer);
const tree = zipAsFileTree(zip);
return this.parseFileTree({ fileOperation, zip, tree });
}
/**
* Converts the file structure from zipAsFileTree into documents,
* collections, and attachments.
*
* @param tree An array of FileTreeNode representing root files in the zip
* @returns A StructuredImportData object
*/
private async parseFileTree({
zip,
tree,
fileOperation,
}: {
zip: JSZip;
fileOperation: FileOperation;
tree: FileTreeNode[];
}): Promise<StructuredImportData> {
const user = await User.findByPk(fileOperation.userId);
if (!user) {
throw new Error("User not found");
}
const output: StructuredImportData = {
collections: [],
documents: [],
attachments: [],
};
const parseNodeChildren = async (
children: FileTreeNode[],
collectionId: string,
parentDocumentId?: string
): Promise<void> => {
if (!user) {
throw new Error("User not found");
}
await Promise.all(
children.map(async (child) => {
// Ignore the CSVs for databases upfront
if (child.path.endsWith(".csv")) {
return;
}
const zipObject = zip.files[child.path];
const id = uuidv4();
const match = child.title.match(this.NotionUUIDRegex);
const name = child.title.replace(this.NotionUUIDRegex, "");
const sourceId = match ? match[0].trim() : undefined;
// If it's not a text file we're going to treat it as an attachment.
const mimeType = mime.lookup(child.name);
const isDocument =
mimeType === "text/markdown" ||
mimeType === "text/plain" ||
mimeType === "text/html";
// If it's not a document and not a folder, treat it as an attachment
if (!isDocument && mimeType) {
output.attachments.push({
id,
name: child.name,
path: child.path,
mimeType,
buffer: await zipObject.async("nodebuffer"),
sourceId,
});
return;
}
Logger.debug("task", `Processing ${name} as ${mimeType}`);
const { title, text } = await documentImporter({
mimeType: mimeType || "text/markdown",
fileName: name,
content: await zipObject.async("string"),
user,
ip: user.lastActiveIp || undefined,
});
const existingDocumentIndex = output.documents.findIndex(
(doc) => doc.sourceId === sourceId
);
const existingDocument = output.documents[existingDocumentIndex];
// If there is an existing document with the same sourceId that means
// we've already parsed either a folder or a file referencing the same
// document, as such we should merge.
if (existingDocument) {
if (existingDocument.text === "") {
output.documents[existingDocumentIndex].text = text;
}
await parseNodeChildren(
child.children,
collectionId,
existingDocument.id
);
} else {
output.documents.push({
id,
title,
text,
collectionId,
parentDocumentId,
path: child.path,
sourceId,
});
await parseNodeChildren(child.children, collectionId, id);
}
})
);
};
const replaceInternalLinksAndImages = (text: string) => {
// Find if there are any images in this document
const imagesInText = this.parseImages(text);
for (const image of imagesInText) {
const name = path.basename(image.src);
const attachment = output.attachments.find((att) => att.name === name);
if (!attachment) {
Logger.info(
"task",
`Could not find referenced attachment with name ${name} and src ${image.src}`
);
} else {
text = text.replace(
new RegExp(image.src, "g"),
`<<${attachment.id}>>`
);
}
}
// With Notion's HTML import, images sometimes come wrapped in anchor tags
// This isn't supported in Outline's editor, so we need to strip them.
text = text.replace(/\[!\[([^[]+)]/g, "![]");
// Find if there are any links in this document pointing to other documents
const internalLinksInText = this.parseInternalLinks(text);
// For each link, update to the standardized format of <<documentId>>
// instead of a relative or absolute URL within the original zip file.
for (const link of internalLinksInText) {
const doc = output.documents.find(
(doc) => doc.sourceId === link.sourceId
);
if (!doc) {
Logger.info(
"task",
`Could not find referenced document with sourceId ${link.sourceId}`
);
} else {
text = text.replace(link.href, `<<${doc.id}>>`);
}
}
return text;
};
// All nodes in the root level should become collections
for (const node of tree) {
const match = node.title.match(this.NotionUUIDRegex);
const name = node.title.replace(this.NotionUUIDRegex, "");
const sourceId = match ? match[0].trim() : undefined;
const mimeType = mime.lookup(node.name);
const existingCollectionIndex = output.collections.findIndex(
(collection) => collection.sourceId === sourceId
);
const existingCollection = output.collections[existingCollectionIndex];
const collectionId = existingCollection?.id || uuidv4();
let description;
// Root level docs become the descriptions of collections
if (
mimeType === "text/markdown" ||
mimeType === "text/plain" ||
mimeType === "text/html"
) {
const zipObject = zip.files[node.path];
const { text } = await documentImporter({
mimeType,
fileName: name,
content: await zipObject.async("string"),
user,
ip: user.lastActiveIp || undefined,
});
description = text;
} else if (node.children.length > 0) {
await parseNodeChildren(node.children, collectionId);
} else {
Logger.debug("task", `Unhandled file in zip: ${node.path}`, {
fileOperationId: fileOperation.id,
});
continue;
}
if (existingCollectionIndex !== -1) {
if (description) {
output.collections[existingCollectionIndex].description = description;
}
} else {
output.collections.push({
id: collectionId,
name,
description,
sourceId,
});
}
}
for (const document of output.documents) {
document.text = replaceInternalLinksAndImages(document.text);
}
for (const collection of output.collections) {
if (collection.description) {
collection.description = replaceInternalLinksAndImages(
collection.description
);
}
}
return output;
}
/**
* Extracts internal links from a markdown document, taking into account the
* sourceId of the document, which is part of the link title.
*
* @param text The markdown text to parse
* @returns An array of internal links
*/
private parseInternalLinks(
text: string
): { title: string; href: string; sourceId: string }[] {
return compact(
[...text.matchAll(this.NotionLinkRegex)].map((match) => ({
title: match[1],
href: match[2],
sourceId: match[3],
}))
);
}
/**
* Extracts images from the markdown document
*
* @param text The markdown text to parse
* @returns An array of images
*/
private parseImages(text: string): { alt: string; src: string }[] {
return compact(
[...text.matchAll(this.ImageRegex)].map((match) => ({
alt: match[1],
src: match[2],
}))
);
}
/**
* Regex to find markdown images of all types
*/
private ImageRegex = /!\[(?<alt>[^\][]*?)]\((?<filename>[^\][]*?)(?="|\))"?(?<title>[^\]["]+)?"?\)/g;
/**
* Regex to find markdown links containing IDs that look like UUIDs with the
* hyphens removed, Notion's sourceId format.
*/
private NotionLinkRegex = /\[([^[]+)]\((.*?([0-9a-fA-F]{32})\..*?)\)/g;
/**
* Regex to find Notion document UUIDs in the title of a document.
*/
private NotionUUIDRegex = /\s([0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12}|[0-9a-fA-F]{32})$/;
}
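Worked examples of what these three patterns match, using made-up Notion-style names (the hex ids are purely illustrative):

// Sketch: illustrative inputs only.
const title = "Meeting Notes 0123456789abcdef0123456789abcdef";
// NotionUUIDRegex matches the trailing space + 32-char hex id, so after
// replace() the name is "Meeting Notes" and the trimmed match is the sourceId.

const link = "[Roadmap](Workspace/Roadmap%200123456789abcdef0123456789abcdef.md)";
// NotionLinkRegex captures the title "Roadmap", the full href, and the
// 32-character hex id preceding the file extension.

const image = "![diagram](uploads/diagram.png)";
// ImageRegex captures alt "diagram" and filename "uploads/diagram.png".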

View File

@@ -0,0 +1,379 @@
import invariant from "invariant";
import attachmentCreator from "@server/commands/attachmentCreator";
import documentCreator from "@server/commands/documentCreator";
import { sequelize } from "@server/database/sequelize";
import { ValidationError } from "@server/errors";
import logger from "@server/logging/logger";
import {
User,
Event,
Document,
Collection,
FileOperation,
Attachment,
} from "@server/models";
import { FileOperationState } from "@server/models/FileOperation";
import BaseTask, { TaskPriority } from "./BaseTask";
type Props = {
fileOperationId: string;
};
/**
* Standardized format for data importing, to be used by all import tasks.
*/
export type StructuredImportData = {
collections: {
id: string;
name: string;
/**
* The collection description. To reference an attachment or image use the
* special formatting <<attachmentId>>. It will be replaced with a reference
* to the actual attachment as part of persistData.
*
* To reference a document use <<documentId>>, it will be replaced with a
* link to the document as part of persistData once the document url is
* generated.
*/
description?: string;
/** Optional id from import source, useful for mapping */
sourceId?: string;
}[];
documents: {
id: string;
title: string;
/**
* The document text. To reference an attachment or image use the special
* formatting <<attachmentId>>. It will be replaced with a reference to the
* actual attachment as part of persistData.
*
* To reference another document use <<documentId>>, it will be replaced
* with a link to the document as part of persistData once the document url
* is generated.
*/
text: string;
collectionId: string;
updatedAt?: Date;
createdAt?: Date;
parentDocumentId?: string;
path: string;
/** Optional id from import source, useful for mapping */
sourceId?: string;
}[];
attachments: {
id: string;
name: string;
path: string;
mimeType: string;
buffer: Buffer;
/** Optional id from import source, useful for mapping */
sourceId?: string;
}[];
};
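An illustrative, entirely made-up instance of this shape, showing the <<id>> reference convention described in the comments above:

// Sketch: hypothetical ids and content only.
const example: StructuredImportData = {
  collections: [
    { id: "c1", name: "Engineering", description: "Start with <<d1>>" },
  ],
  documents: [
    {
      id: "d1",
      title: "Welcome",
      text: "![logo](<<a1>>)",
      collectionId: "c1",
      path: "Engineering/Welcome.md",
    },
  ],
  attachments: [
    {
      id: "a1",
      name: "logo.png",
      path: "Engineering/uploads/logo.png",
      mimeType: "image/png",
      buffer: Buffer.alloc(0),
    },
  ],
};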
export default abstract class ImportTask extends BaseTask<Props> {
/**
* Runs the import task.
*
* @param props The props
*/
public async perform({ fileOperationId }: Props) {
const fileOperation = await FileOperation.findByPk(fileOperationId);
invariant(fileOperation, "fileOperation not found");
try {
logger.info("task", `ImportTask fetching data for ${fileOperationId}`);
const data = await this.fetchData(fileOperation);
logger.info("task", `ImportTask parsing data for ${fileOperationId}`);
const parsed = await this.parseData(data, fileOperation);
if (parsed.collections.length === 0) {
throw ValidationError(
"Uploaded file does not contain any collections. The root of the zip file must contain folders representing collections."
);
}
if (parsed.documents.length === 0) {
throw ValidationError(
"Uploaded file does not contain any valid documents"
);
}
let result;
try {
logger.info(
"task",
`ImportTask persisting data for ${fileOperationId}`
);
result = await this.persistData(parsed, fileOperation);
} catch (error) {
logger.error(
`ImportTask failed to persist data for ${fileOperationId}`,
error
);
throw new Error("Sorry, an internal error occurred during import");
}
await this.updateFileOperation(
fileOperation,
FileOperationState.Complete
);
return result;
} catch (error) {
await this.updateFileOperation(
fileOperation,
FileOperationState.Error,
error
);
throw error;
}
}
/**
* Update the state of the underlying FileOperation in the database and send
* an event to the client.
*
* @param fileOperation The FileOperation to update
* @param state The new state of the FileOperation
* @param error An optional error to record on the FileOperation
*/
private async updateFileOperation(
fileOperation: FileOperation,
state: FileOperationState,
error?: Error
) {
await fileOperation.update({ state, error: error?.message });
await Event.schedule({
name: "fileOperations.update",
modelId: fileOperation.id,
teamId: fileOperation.teamId,
actorId: fileOperation.userId,
});
}
/**
* Fetch the remote data needed for the import. By default this loads the
* file associated with the FileOperation into memory and returns it as a
* buffer.
*
* @param fileOperation The FileOperation to fetch data for
* @returns A promise resolving to a buffer of the file data
*/
protected async fetchData(fileOperation: FileOperation) {
return fileOperation.buffer;
}
/**
* Parse the data loaded from fetchData into a consistent structured format
* that represents collections, documents, and the relationships between them.
*
* @param data The data loaded from fetchData
* @returns A promise that resolves to the structured data
*/
protected abstract parseData(
data: any,
fileOperation: FileOperation
): Promise<StructuredImportData>;
/**
* Persist the data that was already fetched and parsed into the consistent
* structured data.
*
* @param data The parsed structured import data
* @param fileOperation The FileOperation being processed
*/
protected async persistData(
data: StructuredImportData,
fileOperation: FileOperation
): Promise<{
collections: Map<string, Collection>;
documents: Map<string, Document>;
attachments: Map<string, Attachment>;
}> {
const collections = new Map<string, Collection>();
const documents = new Map<string, Document>();
const attachments = new Map<string, Attachment>();
return sequelize.transaction(async (transaction) => {
const user = await User.findByPk(fileOperation.userId, {
transaction,
});
invariant(user, "User not found");
const ip = user.lastActiveIp || undefined;
// Attachments
for (const item of data.attachments) {
const attachment = await attachmentCreator({
source: "import",
id: item.id,
name: item.name,
type: item.mimeType,
buffer: item.buffer,
user,
ip,
transaction,
});
attachments.set(item.id, attachment);
}
// Collections
for (const item of data.collections) {
let description = item.description;
if (description) {
// Check all of the attachments we've created against urls in the text
// and replace them with attachment redirect urls before saving.
for (const aitem of data.attachments) {
const attachment = attachments.get(aitem.id);
if (!attachment) {
continue;
}
description = description.replace(
new RegExp(`<<${attachment.id}>>`, "g"),
attachment.redirectUrl
);
}
// Check all of the documents we've created against urls in the text
// and replace them with valid internal links. Because we are doing
// this before saving, we can't use the document slug, but we can take
// advantage of the fact that the document id will redirect in the client
for (const ditem of data.documents) {
description = description.replace(
new RegExp(`<<${ditem.id}>>`, "g"),
`/doc/${ditem.id}`
);
}
}
// check if collection with name exists
const response = await Collection.findOrCreate({
where: {
teamId: fileOperation.teamId,
name: item.name,
},
defaults: {
id: item.id,
description,
createdById: fileOperation.userId,
permission: "read_write",
},
transaction,
});
let collection = response[0];
const isCreated = response[1];
// Create a new collection if the name already exists. Yes, it's possible
// that there is also a "Name (Imported)", but this is a case not worth
// dealing with right now
if (!isCreated) {
const name = `${item.name} (Imported)`;
collection = await Collection.create(
{
id: item.id,
description,
teamId: fileOperation.teamId,
createdById: fileOperation.userId,
name,
permission: "read_write",
},
{ transaction }
);
}
await Event.create(
{
name: "collections.create",
collectionId: collection.id,
teamId: collection.teamId,
actorId: fileOperation.userId,
data: {
name: collection.name,
},
ip,
},
{
transaction,
}
);
collections.set(item.id, collection);
}
// Documents
for (const item of data.documents) {
let text = item.text;
// Check all of the attachments we've created against urls in the text
// and replace them with attachment redirect urls before saving.
for (const aitem of data.attachments) {
const attachment = attachments.get(aitem.id);
if (!attachment) {
continue;
}
text = text.replace(
new RegExp(`<<${attachment.id}>>`, "g"),
attachment.redirectUrl
);
}
// Check all of the documents we've created against urls in the text
// and replace them with valid internal links. Because we are doing
// this before saving, we can't use the document slug, but we can take
// advantage of the fact that the document id will redirect in the client
for (const ditem of data.documents) {
text = text.replace(
new RegExp(`<<${ditem.id}>>`, "g"),
`/doc/${ditem.id}`
);
}
const document = await documentCreator({
source: "import",
id: item.id,
title: item.title,
text,
collectionId: item.collectionId,
createdAt: item.createdAt,
updatedAt: item.updatedAt ?? item.createdAt,
publishedAt: item.updatedAt ?? item.createdAt ?? new Date(),
parentDocumentId: item.parentDocumentId,
user,
ip,
transaction,
});
documents.set(item.id, document);
const collection = collections.get(item.collectionId);
if (collection) {
await collection.addDocumentToStructure(document, 0, { transaction });
}
}
// Return value is only used for testing
return {
collections,
documents,
attachments,
};
});
}
/**
* Optional hook to remove any temporary files that were created
*/
protected async cleanupData() {
// noop
}
/**
* Job options such as priority and retry strategy, as defined by Bull.
*/
public get options() {
return {
priority: TaskPriority.Low,
attempts: 1,
};
}
}
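Taken together, adding support for a new import format now means subclassing ImportTask and implementing parseData, exactly as ImportMarkdownZipTask and ImportNotionTask do above; fetching, validation, persistence, and state management come for free. A skeletal sketch for a hypothetical single-file JSON format:

// Sketch: ImportJSONTask and its input format are hypothetical.
class ImportJSONTask extends ImportTask {
  protected async parseData(
    buffer: Buffer,
    fileOperation: FileOperation
  ): Promise<StructuredImportData> {
    const input = JSON.parse(buffer.toString("utf8"));
    return {
      collections: input.collections,
      documents: input.documents,
      attachments: [],
    };
  }
}

// Scheduling would follow the same pattern as the tasks above:
// await ImportJSONTask.schedule({ fileOperationId });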