chore: Update documentImporter with changes from enterprise, improved Confluence compat

This commit is contained in:
Tom Moor
2022-06-02 21:42:32 +02:00
parent 9113989635
commit 68dd76cfa3
8 changed files with 144 additions and 31 deletions

View File

@@ -1,11 +1,9 @@
import path from "path";
import emojiRegex from "emoji-regex";
import { strikethrough, tables } from "joplin-turndown-plugin-gfm";
import { truncate } from "lodash";
import mammoth from "mammoth";
import quotedPrintable from "quoted-printable";
import { Transaction } from "sequelize";
import TurndownService from "turndown";
import utf8 from "utf8";
import { MAX_TITLE_LENGTH } from "@shared/constants";
import parseTitle from "@shared/utils/parseTitle";
@@ -13,28 +11,10 @@ import { APM } from "@server/logging/tracing";
import { User } from "@server/models";
import dataURItoBuffer from "@server/utils/dataURItoBuffer";
import parseImages from "@server/utils/parseImages";
import turndownService from "@server/utils/turndown";
import { FileImportError, InvalidRequestError } from "../errors";
import attachmentCreator from "./attachmentCreator";
// https://github.com/domchristie/turndown#options
const turndownService = new TurndownService({
hr: "---",
bulletListMarker: "-",
headingStyle: "atx",
}).remove(["script", "style", "title", "head"]);
// Use the GitHub-flavored markdown plugin to parse
// strikethroughs and tables
turndownService
.use(strikethrough)
.use(tables)
.addRule("breaks", {
filter: ["br"],
replacement: function () {
return "\n";
},
});
interface ImportableFile {
type: string;
getMarkdown: (content: Buffer | string) => Promise<string>;
@@ -200,7 +180,8 @@ async function documentImporter({
const regex = emojiRegex();
const matches = regex.exec(text);
const firstEmoji = matches ? matches[0] : undefined;
if (firstEmoji && text.startsWith(firstEmoji)) {
const textStartsWithEmoji = firstEmoji && text.startsWith(firstEmoji);
if (textStartsWithEmoji) {
text = text.replace(firstEmoji, "").trim();
}
@@ -213,10 +194,14 @@ async function documentImporter({
}
// If we parsed an emoji from _above_ the title then add it back as a prefix
if (firstEmoji) {
if (textStartsWithEmoji) {
title = `${firstEmoji} ${title}`;
}
// Replace any <br> generated by the turndown plugin with escaped newlines
// to match our hardbreak parser.
text = text.replace(/<br>/gi, "\\n");
// find data urls, convert to blobs, upload and write attachments
const images = parseImages(text);
const dataURIs = images.filter((href) => href.startsWith("data:"));

View File

@@ -12,10 +12,11 @@ declare module "oy-vey";
declare module "fetch-test-server";
declare module "joplin-turndown-plugin-gfm" {
declare module "@joplin/turndown-plugin-gfm" {
import { Plugin } from "turndown";
export const strikethrough: Plugin;
export const tables: Plugin;
export const taskListItems: Plugin;
export const gfm: Plugin;
}

View File

@@ -0,0 +1,15 @@
import TurndownService from "turndown";
/**
 * Turndown plugin that registers a "breaks" rule, rendering every `br`
 * element as a single newline character.
 *
 * @param turndownService The TurndownService instance to register the rule on.
 */
export default function breaks(turndownService: TurndownService) {
  // The replacement ignores the element's content and always yields "\n"
  const toNewline = () => "\n";

  turndownService.addRule("breaks", {
    filter: ["br"],
    replacement: toNewline,
  });
}

View File

@@ -0,0 +1,57 @@
import { repeat } from "lodash";
import TurndownService from "turndown";
// Captures the language name from a Confluence "brush: <language>;" parameter
const highlightRegExp = /brush: ([a-z0-9]+);/;

/**
 * Turndown plugin that converts a Confluence syntax-highlighted code block
 * (a DIV whose first child is a PRE with the class "syntaxhighlighter-pre")
 * into a fenced markdown code block.
 *
 * @param turndownService The TurndownService instance.
 */
export default function confluenceCodeBlock(turndownService: TurndownService) {
  turndownService.addRule("fencedConfluenceHighlightedCodeBlock", {
    filter: (node) => {
      const pre = node.firstChild;
      if (node.nodeName !== "DIV" || pre?.nodeName !== "PRE") {
        return false;
      }
      // @ts-expect-error className exists
      return pre.className === "syntaxhighlighter-pre";
    },
    replacement: (content, node) => {
      const pre = node.firstChild;
      const params =
        // @ts-expect-error getAttribute exists
        pre?.getAttribute("data-syntaxhighlighter-params") ?? "";
      // Falls back to an empty language when no "brush" parameter is present
      const language = params.match(highlightRegExp)?.[1] ?? "";
      const code = pre?.textContent ?? "";

      // The fence must be one character longer than the longest run of three
      // or more backticks that starts a line of the code, and never shorter
      // than three characters.
      const fenceChar = "`";
      const backtickRuns: string[] = code.match(/^`{3,}/gm) ?? [];
      const fenceSize = backtickRuns.reduce(
        (size, run) => Math.max(size, run.length + 1),
        3
      );
      const fence = repeat(fenceChar, fenceSize);

      // Only a single trailing newline is stripped from the code
      const body = code.replace(/\n$/, "");
      return `\n\n${fence}${language}\n${body}\n${fence}\n\n`;
    },
  });
}

View File

@@ -0,0 +1,25 @@
import TurndownService from "turndown";
/**
 * Turndown plugin that converts the items of a Confluence task list (LI
 * elements whose parent is a UL with a class name containing
 * "inline-task-list") into markdown checkboxes.
 *
 * @param turndownService The TurndownService instance.
 */
export default function confluenceTaskList(turndownService: TurndownService) {
  turndownService.addRule("confluenceTaskList", {
    filter: (node) => {
      const list = node.parentNode;
      if (node.nodeName !== "LI" || list?.nodeName !== "UL") {
        return false;
      }
      // @ts-expect-error className exists
      return list.className.includes("inline-task-list");
    },
    replacement: (content, node) => {
      // Only an item whose class name is exactly "checked" is rendered as ticked
      // @ts-expect-error className exists
      const checkbox = node.className === "checked" ? "- [x]" : "- [ ]";
      return `${checkbox} ${content} \n`;
    },
  });
}

View File

@@ -0,0 +1,30 @@
import { gfm } from "@joplin/turndown-plugin-gfm";
import TurndownService from "turndown";
import breaks from "./breaks";
import confluenceCodeBlock from "./confluence-code-block";
import confluenceTaskList from "./confluence-task-list";
/**
 * Shared Turndown instance that converts HTML to Markdown; it is used in the
 * importer code.
 *
 * For options, see: https://github.com/domchristie/turndown#options
 */
const service = new TurndownService({
  hr: "---",
  bulletListMarker: "-",
  headingStyle: "atx",
  codeBlockStyle: "fenced",
  // A blank paragraph becomes a lone backslash line; other blank nodes are dropped
  blankReplacement: (content, node) =>
    node.nodeName === "P" ? "\n\n\\\n" : "",
});

// Elements whose content must never reach the markdown output
service.remove(["script", "style", "title", "head"]);

// Plugins are registered in the same order as before: gfm, task list,
// code block, breaks.
for (const plugin of [gfm, confluenceTaskList, confluenceCodeBlock, breaks]) {
  service.use(plugin);
}

export default service;