chore: Refactor data import (#3434)

* Complete refactor of import * feat: Notion data import (#3442)
2022-04-23 10:07:35 -07:00
parent bdcfaae025
commit 33ce49cc33
45 changed files with 2217 additions and 1066 deletions
--- a/server/commands/documentImporter.ts
+++ b/server/commands/documentImporter.ts
@@ -1,9 +1,10 @@
-import fs from "fs";
 import path from "path";
+import emojiRegex from "emoji-regex";
 import { strikethrough, tables } from "joplin-turndown-plugin-gfm";
 import { truncate } from "lodash";
 import mammoth from "mammoth";
 import quotedPrintable from "quoted-printable";
+import { Transaction } from "sequelize";
 import TurndownService from "turndown";
 import utf8 from "utf8";
 import { MAX_TITLE_LENGTH } from "@shared/constants";
@@ -21,7 +22,7 @@ const turndownService = new TurndownService({
  hr: "---",
  bulletListMarker: "-",
  headingStyle: "atx",
-});
+}).remove(["script", "style", "title", "head"]);

 // Use the GitHub-flavored markdown plugin to parse
 // strikethoughs and tables
@@ -37,7 +38,7 @@ turndownService

 interface ImportableFile {
  type: string;
-  getMarkdown: (file: any) => Promise<string>;
+  getMarkdown: (content: Buffer | string) => Promise<string>;
 }

 const importMapping: ImportableFile[] = [
@@ -68,26 +69,34 @@ const importMapping: ImportableFile[] = [
  },
 ];

-// @ts-expect-error ts-migrate(7006) FIXME: Parameter 'file' implicitly has an 'any' type.
-async function fileToMarkdown(file): Promise<string> {
-  return fs.promises.readFile(file.path, "utf8");
+async function fileToMarkdown(content: Buffer | string): Promise<string> {
+  if (content instanceof Buffer) {
+    content = content.toString("utf8");
+  }
+  return content;
 }

-// @ts-expect-error ts-migrate(7006) FIXME: Parameter 'file' implicitly has an 'any' type.
-async function docxToMarkdown(file): Promise<string> {
-  const { value } = await mammoth.convertToHtml(file);
-  return turndownService.turndown(value);
+async function docxToMarkdown(content: Buffer | string): Promise<string> {
+  if (content instanceof Buffer) {
+    const { value: html } = await mammoth.convertToHtml({ buffer: content });
+    return turndownService.turndown(html);
+  }
+
+  throw new Error("docxToMarkdown: content must be a Buffer");
 }

-// @ts-expect-error ts-migrate(7006) FIXME: Parameter 'file' implicitly has an 'any' type.
-async function htmlToMarkdown(file): Promise<string> {
-  const value = await fs.promises.readFile(file.path, "utf8");
-  return turndownService.turndown(value);
+async function htmlToMarkdown(content: Buffer | string): Promise<string> {
+  if (content instanceof Buffer) {
+    content = content.toString("utf8");
+  }
+
+  return turndownService.turndown(content);
 }

-// @ts-expect-error ts-migrate(7006) FIXME: Parameter 'file' implicitly has an 'any' type.
-async function confluenceToMarkdown(file): Promise<string> {
-  let value = await fs.promises.readFile(file.path, "utf8");
+async function confluenceToMarkdown(value: Buffer | string): Promise<string> {
+  if (value instanceof Buffer) {
+    value = value.toString("utf8");
+  }

  // We're only supporting the ridiculous output from Confluence here, regular
  // Word documents should call into the docxToMarkdown importer.
@@ -143,22 +152,28 @@ async function confluenceToMarkdown(file): Promise<string> {
 }

 async function documentImporter({
-  file,
+  mimeType,
+  fileName,
+  content,
  user,
  ip,
+  transaction,
 }: {
  user: User;
-  file: File;
-  ip: string;
+  mimeType: string;
+  fileName: string;
+  content: Buffer | string;
+  ip?: string;
+  transaction?: Transaction;
 }): Promise<{
  text: string;
  title: string;
 }> {
  const fileInfo = importMapping.filter((item) => {
-    if (item.type === file.type) {
+    if (item.type === mimeType) {
      if (
-        file.type === "application/octet-stream" &&
-        path.extname(file.name) !== ".docx"
+        mimeType === "application/octet-stream" &&
+        path.extname(fileName) !== ".docx"
      ) {
        return false;
      }
@@ -166,7 +181,7 @@ async function documentImporter({
      return true;
    }

-    if (item.type === "text/markdown" && path.extname(file.name) === ".md") {
+    if (item.type === "text/markdown" && path.extname(fileName) === ".md") {
      return true;
    }

@@ -174,20 +189,35 @@ async function documentImporter({
  })[0];

  if (!fileInfo) {
-    throw InvalidRequestError(`File type ${file.type} not supported`);
+    throw InvalidRequestError(`File type ${mimeType} not supported`);
  }

-  let title = deserializeFilename(file.name.replace(/\.[^/.]+$/, ""));
-  let text = await fileInfo.getMarkdown(file);
+  let title = deserializeFilename(fileName.replace(/\.[^/.]+$/, ""));
+  let text = await fileInfo.getMarkdown(content);
+  text = text.trim();
+
+  // find and extract first emoji, in the case of some imports it can be outside
+  // of the title, at the top of the document.
+  const regex = emojiRegex();
+  const matches = regex.exec(text);
+  const firstEmoji = matches ? matches[0] : undefined;
+  if (firstEmoji && text.startsWith(firstEmoji)) {
+    text = text.replace(firstEmoji, "").trim();
+  }

  // If the first line of the imported text looks like a markdown heading
  // then we can use this as the document title
-  if (text.trim().startsWith("# ")) {
+  if (text.startsWith("# ")) {
    const result = parseTitle(text);
    title = result.title;
    text = text.replace(`# ${title}\n`, "");
  }

+  // If we parsed an emoji from _above_ the title then add it back at prefixing
+  if (firstEmoji) {
+    title = `${firstEmoji} ${title}`;
+  }
+
  // find data urls, convert to blobs, upload and write attachments
  const images = parseImages(text);
  const dataURIs = images.filter((href) => href.startsWith("data:"));
@@ -201,6 +231,7 @@ async function documentImporter({
      buffer,
      user,
      ip,
+      transaction,
    });
    text = text.replace(uri, attachment.redirectUrl);
  }