fix: Improve logic for word import (#6361)

* Refactor DocumentConverter
* Support parsing images from Confluence exported .doc files
* fix: Bring across 2 fixes from enterprise codebase
* Bust dependency cache
2024-01-09 20:29:47 -08:00
parent a032f2e7e5
commit 7d7d0fd9ca
6 changed files with 297 additions and 227 deletions
--- a/server/commands/documentImporter.ts
+++ b/server/commands/documentImporter.ts
@@ -1,136 +1,15 @@
-import path from "path";
 import emojiRegex from "emoji-regex";
 import escapeRegExp from "lodash/escapeRegExp";
 import truncate from "lodash/truncate";
-import mammoth from "mammoth";
-import quotedPrintable from "quoted-printable";
 import { Transaction } from "sequelize";
-import utf8 from "utf8";
 import parseTitle from "@shared/utils/parseTitle";
 import { DocumentValidation } from "@shared/validations";
 import { traceFunction } from "@server/logging/tracing";
 import { User } from "@server/models";
 import ProsemirrorHelper from "@server/models/helpers/ProsemirrorHelper";
 import TextHelper from "@server/models/helpers/TextHelper";
-import turndownService from "@server/utils/turndown";
-import { FileImportError, InvalidRequestError } from "../errors";
-
-interface ImportableFile {
-  type: string;
-  getMarkdown: (content: Buffer | string) => Promise<string>;
-}
-
-const importMapping: ImportableFile[] = [
-  {
-    type: "application/msword",
-    getMarkdown: confluenceToMarkdown,
-  },
-  {
-    type: "application/octet-stream",
-    getMarkdown: docxToMarkdown,
-  },
-  {
-    type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-    getMarkdown: docxToMarkdown,
-  },
-  {
-    type: "text/html",
-    getMarkdown: htmlToMarkdown,
-  },
-  {
-    type: "text/plain",
-    getMarkdown: fileToMarkdown,
-  },
-  {
-    type: "text/markdown",
-    getMarkdown: fileToMarkdown,
-  },
-];
-
-async function fileToMarkdown(content: Buffer | string): Promise<string> {
-  if (content instanceof Buffer) {
-    content = content.toString("utf8");
-  }
-  return content;
-}
-
-async function docxToMarkdown(content: Buffer | string): Promise<string> {
-  if (content instanceof Buffer) {
-    const { value: html } = await mammoth.convertToHtml({
-      buffer: content,
-    });
-
-    return turndownService.turndown(html);
-  }
-
-  throw new Error("docxToMarkdown: content must be a Buffer");
-}
-
-async function htmlToMarkdown(content: Buffer | string): Promise<string> {
-  if (content instanceof Buffer) {
-    content = content.toString("utf8");
-  }
-
-  return turndownService.turndown(content);
-}
-
-async function confluenceToMarkdown(value: Buffer | string): Promise<string> {
-  if (value instanceof Buffer) {
-    value = value.toString("utf8");
-  }
-
-  // We're only supporting the ridiculous output from Confluence here, regular
-  // Word documents should call into the docxToMarkdown importer.
-  // See: https://jira.atlassian.com/browse/CONFSERVER-38237
-  if (!value.includes("Content-Type: multipart/related")) {
-    throw FileImportError("Unsupported Word file");
-  }
-
-  // get boundary marker
-  const boundaryMarker = value.match(/boundary="(.+)"/);
-
-  if (!boundaryMarker) {
-    throw FileImportError("Unsupported Word file (No boundary marker)");
-  }
-
-  // get content between multipart boundaries
-  let boundaryReached = 0;
-  const lines = value.split("\n").filter((line) => {
-    if (line.includes(boundaryMarker[1])) {
-      boundaryReached++;
-      return false;
-    }
-
-    if (line.startsWith("Content-")) {
-      return false;
-    }
-
-    // 1 == definition
-    // 2 == content
-    // 3 == ending
-    if (boundaryReached === 2) {
-      return true;
-    }
-
-    return false;
-  });
-
-  if (!lines.length) {
-    throw FileImportError("Unsupported Word file (No content found)");
-  }
-
-  // Mime attachment is "quoted printable" encoded, must be decoded first
-  // https://en.wikipedia.org/wiki/Quoted-printable
-  value = utf8.decode(quotedPrintable.decode(lines.join("\n")));
-
-  // If we don't remove the title here it becomes printed in the document
-  // body by turndown
-  turndownService.remove(["style", "title"]);
-
-  // Now we should have something that looks like HTML
-  const html = turndownService.turndown(value);
-  return html.replace(/<br>/g, " \\n ");
-}
+import { DocumentConverter } from "@server/utils/DocumentConverter";
+import { InvalidRequestError } from "../errors";

 type Props = {
  user: User;
@@ -154,31 +33,12 @@ async function documentImporter({
  title: string;
  state: Buffer;
 }> {
-  const fileInfo = importMapping.filter((item) => {
-    if (item.type === mimeType) {
-      if (
-        mimeType === "application/octet-stream" &&
-        path.extname(fileName) !== ".docx"
-      ) {
-        return false;
-      }
-
-      return true;
-    }
-
-    if (item.type === "text/markdown" && path.extname(fileName) === ".md") {
-      return true;
-    }
-
-    return false;
-  })[0];
-
-  if (!fileInfo) {
-    throw InvalidRequestError(`File type ${mimeType} not supported`);
-  }
-
+  let text = await DocumentConverter.convertToMarkdown(
+    content,
+    fileName,
+    mimeType
+  );
  let title = fileName.replace(/\.[^/.]+$/, "");
-  let text = await fileInfo.getMarkdown(content);

  // find and extract emoji near the beginning of the document.
  const regex = emojiRegex();
@@ -203,6 +63,13 @@ async function documentImporter({
  // to match our hardbreak parser.
  text = text.trim().replace(/<br>/gi, "\\n");

+  // Escape any dollar signs in the text to prevent them being interpreted as
+  // math blocks
+  text = text.replace(/\$/g, "\\$");
+
+  // Remove any closed and immediately reopened formatting marks
+  text = text.replace(/\*\*\*\*/gi, "").replace(/____/gi, "");
+
  text = await TextHelper.replaceImagesWithAttachments(
    text,
    user,
--- a/server/utils/DocumentConverter.ts
+++ b/server/utils/DocumentConverter.ts
@@ -0,0 +1,125 @@
+import escapeRegExp from "lodash/escapeRegExp";
+import { simpleParser } from "mailparser";
+import mammoth from "mammoth";
+import { FileImportError } from "@server/errors";
+import turndownService from "@server/utils/turndown";
+
+/** Static helpers that convert supported file formats to markdown. */
+export class DocumentConverter {
+  /**
+   * Convert an incoming file to markdown. The mime type is tried first; when it
+   * is generic or unrecognized the lowercased file extension decides instead.
+   *
+   * @param content The content of the file.
+   * @param fileName The name of the file, including extension.
+   * @param mimeType The mime type of the file.
+   * @returns The markdown representation of the file.
+   * @throws FileImportError if neither mime type nor extension is supported.
+   */
+  public static async convertToMarkdown(
+    content: Buffer | string,
+    fileName: string,
+    mimeType: string
+  ) {
+    switch (mimeType) {
+      case "application/msword":
+        return this.confluenceToMarkdown(content);
+      case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+        return this.docXToMarkdown(content);
+      case "text/html":
+        return this.htmlToMarkdown(content);
+      case "text/plain":
+      case "text/markdown":
+        return this.fileToMarkdown(content);
+      default:
+        // "application/octet-stream" and unknown types say nothing about the
+        // format, so defer to the extension-based dispatch below.
+        break;
+    }
+
+    const extension = fileName.split(".").pop()?.toLowerCase();
+    switch (extension) {
+      case "docx":
+        return this.docXToMarkdown(content);
+      case "html":
+        return this.htmlToMarkdown(content);
+      case "md":
+      case "markdown":
+        return this.fileToMarkdown(content);
+      default:
+        throw FileImportError(`File type ${mimeType} not supported`);
+    }
+  }
+
+  /**
+   * Convert a .docx file to markdown, going through HTML.
+   * @throws FileImportError if content is not a Buffer.
+   */
+  public static async docXToMarkdown(content: Buffer | string) {
+    if (!(content instanceof Buffer)) {
+      throw FileImportError("Unsupported Word file");
+    }
+    const { value } = await mammoth.convertToHtml({ buffer: content });
+    return turndownService.turndown(value);
+  }
+
+  /** Convert HTML, given as a string or UTF-8 encoded Buffer, to markdown. */
+  public static async htmlToMarkdown(content: Buffer | string) {
+    const html = content instanceof Buffer ? content.toString("utf8") : content;
+    return turndownService.turndown(html);
+  }
+
+  /** Return text content unchanged, decoding Buffers as UTF-8. */
+  public static async fileToMarkdown(content: Buffer | string) {
+    return content instanceof Buffer ? content.toString("utf8") : content;
+  }
+
+  /**
+   * Convert a Confluence "Word" export to markdown. Only the multipart output
+   * from Confluence is supported here; regular Word documents should go
+   * through docXToMarkdown. See: https://jira.atlassian.com/browse/CONFSERVER-38237
+   *
+   * @param value The exported file content.
+   * @returns The markdown, with attachments inlined as data URIs.
+   * @throws FileImportError if this is not a multipart export with an HTML part.
+   */
+  public static async confluenceToMarkdown(value: Buffer | string) {
+    const source = value instanceof Buffer ? value.toString("utf8") : value;
+    if (!source.includes("Content-Type: multipart/related")) {
+      throw FileImportError("Unsupported Word file");
+    }
+
+    // Confluence "Word" documents are actually just multi-part email messages,
+    // so we can use mailparser to parse the content.
+    const parsed = await simpleParser(source);
+    if (!parsed.html) {
+      throw FileImportError("Unsupported Word file (No content found)");
+    }
+
+    // Replace the content-location with a data URI for each attachment.
+    let html = parsed.html;
+    for (const attachment of parsed.attachments) {
+      const locationHeader = attachment.headers.get("content-location") ?? "";
+      const id = String(locationHeader).split("/").pop();
+      if (!id) {
+        continue;
+      }
+
+      // Use the attachment's own mime type rather than assuming PNG; the replacer
+      // function keeps any "$" in the URI from acting as a replacement pattern.
+      const contentType = attachment.contentType || "image/png";
+      const base64 = attachment.content.toString("base64");
+      const dataUri = `data:${contentType};base64,${base64}`;
+      html = html.replace(new RegExp(escapeRegExp(id), "g"), () => dataUri);
+    }
+
+    // If we don't remove the title here it becomes printed in the document
+    // body by turndown. NOTE(review): this re-registers the rule on the shared
+    // turndownService on every call — confirm that is intended.
+    turndownService.remove(["style", "title"]);
+
+    // Now we should have something that looks like HTML
+    const markdown = turndownService.turndown(html);
+    return markdown.replace(/<br>/g, " \\n ");
+  }
+}