diff --git a/package.json b/package.json index 05d1cb1ff..4cec5aba8 100644 --- a/package.json +++ b/package.json @@ -56,6 +56,7 @@ "@getoutline/y-prosemirror": "^1.0.18", "@hocuspocus/provider": "^1.0.0-alpha.36", "@hocuspocus/server": "^1.0.0-alpha.102", + "@joplin/turndown-plugin-gfm": "^1.0.44", "@outlinewiki/koa-passport": "^4.1.4", "@outlinewiki/passport-azure-ad-oauth2": "^0.1.0", "@renderlesskit/react": "^0.6.0", @@ -100,7 +101,6 @@ "invariant": "^2.2.4", "ioredis": "^4.28.0", "is-printable-key-event": "^1.0.0", - "joplin-turndown-plugin-gfm": "^1.0.12", "json-loader": "0.5.4", "jsonwebtoken": "^8.5.0", "jszip": "^3.7.1", diff --git a/server/commands/documentImporter.ts b/server/commands/documentImporter.ts index ed06e1487..5e761d0d8 100644 --- a/server/commands/documentImporter.ts +++ b/server/commands/documentImporter.ts @@ -1,11 +1,9 @@ import path from "path"; import emojiRegex from "emoji-regex"; -import { strikethrough, tables } from "joplin-turndown-plugin-gfm"; import { truncate } from "lodash"; import mammoth from "mammoth"; import quotedPrintable from "quoted-printable"; import { Transaction } from "sequelize"; -import TurndownService from "turndown"; import utf8 from "utf8"; import { MAX_TITLE_LENGTH } from "@shared/constants"; import parseTitle from "@shared/utils/parseTitle"; @@ -13,28 +11,10 @@ import { APM } from "@server/logging/tracing"; import { User } from "@server/models"; import dataURItoBuffer from "@server/utils/dataURItoBuffer"; import parseImages from "@server/utils/parseImages"; +import turndownService from "@server/utils/turndown"; import { FileImportError, InvalidRequestError } from "../errors"; import attachmentCreator from "./attachmentCreator"; -// https://github.com/domchristie/turndown#options -const turndownService = new TurndownService({ - hr: "---", - bulletListMarker: "-", - headingStyle: "atx", -}).remove(["script", "style", "title", "head"]); - -// Use the GitHub-flavored markdown plugin to parse -// strikethoughs and 
tables -turndownService - .use(strikethrough) - .use(tables) - .addRule("breaks", { - filter: ["br"], - replacement: function () { - return "\n"; - }, - }); - interface ImportableFile { type: string; getMarkdown: (content: Buffer | string) => Promise<string>; @@ -200,7 +180,8 @@ async function documentImporter({ const regex = emojiRegex(); const matches = regex.exec(text); const firstEmoji = matches ? matches[0] : undefined; - if (firstEmoji && text.startsWith(firstEmoji)) { + const textStartsWithEmoji = firstEmoji && text.startsWith(firstEmoji); + if (textStartsWithEmoji) { text = text.replace(firstEmoji, "").trim(); } @@ -213,10 +194,14 @@ async function documentImporter({ } // If we parsed an emoji from _above_ the title then add it back at prefixing - if (firstEmoji) { + if (textStartsWithEmoji) { title = `${firstEmoji} ${title}`; } + // Replace any
<br> generated by the turndown plugin with escaped newlines + // to match our hardbreak parser. + text = text.replace(/<br>
/gi, "\\n"); + // find data urls, convert to blobs, upload and write attachments const images = parseImages(text); const dataURIs = images.filter((href) => href.startsWith("data:")); diff --git a/server/typings/index.d.ts b/server/typings/index.d.ts index cf3a204d5..d966c5a67 100644 --- a/server/typings/index.d.ts +++ b/server/typings/index.d.ts @@ -12,10 +12,11 @@ declare module "oy-vey"; declare module "fetch-test-server"; -declare module "joplin-turndown-plugin-gfm" { +declare module "@joplin/turndown-plugin-gfm" { import { Plugin } from "turndown"; export const strikethrough: Plugin; - export const tables: Plugin; + export const taskListItems: Plugin; + export const gfm: Plugin; } diff --git a/server/utils/turndown/breaks.ts b/server/utils/turndown/breaks.ts new file mode 100644 index 000000000..82494d504 --- /dev/null +++ b/server/utils/turndown/breaks.ts @@ -0,0 +1,15 @@ +import TurndownService from "turndown"; + +/** + * A turndown plugin for converting break tags to newlines. + * + * @param turndownService The TurndownService instance. + */ +export default function breaks(turndownService: TurndownService) { + turndownService.addRule("breaks", { + filter: ["br"], + replacement: function () { + return "\n"; + }, + }); +} diff --git a/server/utils/turndown/confluence-code-block.ts b/server/utils/turndown/confluence-code-block.ts new file mode 100644 index 000000000..52b53f1f5 --- /dev/null +++ b/server/utils/turndown/confluence-code-block.ts @@ -0,0 +1,57 @@ +import { repeat } from "lodash"; +import TurndownService from "turndown"; + +const highlightRegExp = /brush: ([a-z0-9]+);/; + +/** + * A turndown plugin for converting a confluence code block to markdown. + * + * @param turndownService The TurndownService instance. 
+ */ +export default function confluenceCodeBlock(turndownService: TurndownService) { + turndownService.addRule("fencedConfluenceHighlightedCodeBlock", { + filter: function (node) { + const firstChild = node.firstChild; + return ( + node.nodeName === "DIV" && + firstChild?.nodeName === "PRE" && + // @ts-expect-error className exists + firstChild.className === "syntaxhighlighter-pre" + ); + }, + replacement: function (content, node) { + const dataSyntaxhighlighterParams = + // @ts-expect-error getAttribute exists + node.firstChild?.getAttribute("data-syntaxhighlighter-params") ?? ""; + const language = (dataSyntaxhighlighterParams.match(highlightRegExp) || [ + null, + "", + ])[1]; + const code = node.firstChild?.textContent ?? ""; + + const fenceChar = "`"; + let fenceSize = 3; + const fenceInCodeRegex = new RegExp("^" + fenceChar + "{3,}", "gm"); + + let match; + while ((match = fenceInCodeRegex.exec(code))) { + if (match[0].length >= fenceSize) { + fenceSize = match[0].length + 1; + } + } + + const fence = repeat(fenceChar, fenceSize); + + return ( + "\n\n" + + fence + + language + + "\n" + + code.replace(/\n$/, "") + + "\n" + + fence + + "\n\n" + ); + }, + }); +} diff --git a/server/utils/turndown/confluence-task-list.ts b/server/utils/turndown/confluence-task-list.ts new file mode 100644 index 000000000..b8de96d86 --- /dev/null +++ b/server/utils/turndown/confluence-task-list.ts @@ -0,0 +1,25 @@ +import TurndownService from "turndown"; + +/** + * A turndown plugin for converting a confluence task list to markdown. + * + * @param turndownService The TurndownService instance. 
+ */ +export default function confluenceTaskList(turndownService: TurndownService) { + turndownService.addRule("confluenceTaskList", { + filter: function (node) { + return ( + node.nodeName === "LI" && + node.parentNode?.nodeName === "UL" && + // @ts-expect-error className exists + node.parentNode?.className.includes("inline-task-list") + ); + }, + replacement: function (content, node) { + return ( + // @ts-expect-error className exists + (node.className === "checked" ? "- [x]" : "- [ ]") + ` ${content} \n` + ); + }, + }); +} diff --git a/server/utils/turndown/index.ts b/server/utils/turndown/index.ts new file mode 100644 index 000000000..1f5b810d5 --- /dev/null +++ b/server/utils/turndown/index.ts @@ -0,0 +1,30 @@ +import { gfm } from "@joplin/turndown-plugin-gfm"; +import TurndownService from "turndown"; +import breaks from "./breaks"; +import confluenceCodeBlock from "./confluence-code-block"; +import confluenceTaskList from "./confluence-task-list"; + +/** + * Turndown converts HTML to Markdown and is used in the importer code. 
+ * + * For options, see: https://github.com/domchristie/turndown#options + */ +const service = new TurndownService({ + hr: "---", + bulletListMarker: "-", + headingStyle: "atx", + codeBlockStyle: "fenced", + blankReplacement: (content, node) => { + if (node.nodeName === "P") { + return "\n\n\\\n"; + } + return ""; + }, +}) + .remove(["script", "style", "title", "head"]) + .use(gfm) + .use(confluenceTaskList) + .use(confluenceCodeBlock) + .use(breaks); + +export default service; diff --git a/yarn.lock b/yarn.lock index 0d9e835f4..6c249f7dc 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1813,6 +1813,11 @@ "@babel/runtime" "^7.7.2" regenerator-runtime "^0.13.3" +"@joplin/turndown-plugin-gfm@^1.0.44": + version "1.0.44" + resolved "https://registry.yarnpkg.com/@joplin/turndown-plugin-gfm/-/turndown-plugin-gfm-1.0.44.tgz#028e4c56bf58e57a4d0d923bb7d10a21c30d5c7d" + integrity sha512-lpVI/fpj0CKzWzpsOxsmqwjWlIrw+IZlIEz3h8Vqoviz8dCYbqSSY/4VxpiUDmBpxX/3Xk73R5BfzqiAHBmYqA== + "@lifeomic/attempt@^3.0.2": version "3.0.3" resolved "https://registry.yarnpkg.com/@lifeomic/attempt/-/attempt-3.0.3.tgz#e742a5b85eb673e2f1746b0f39cb932cbc6145bb" @@ -9509,11 +9514,6 @@ jmespath@0.15.0: resolved "https://registry.yarnpkg.com/jmespath/-/jmespath-0.15.0.tgz#a3f222a9aae9f966f5d27c796510e28091764217" integrity sha1-o/Iiqarp+Wb10nx5ZRDigJF2Qhc= -joplin-turndown-plugin-gfm@^1.0.12: - version "1.0.12" - resolved "https://registry.yarnpkg.com/joplin-turndown-plugin-gfm/-/joplin-turndown-plugin-gfm-1.0.12.tgz#f0774183177895c6fedeec951053cab6046dede8" - integrity sha512-qL4+1iycQjZ1fs8zk3jSRk7cg3ROBUHk7GKtiLAQLFzLPKErnILUvz5DLszSQvz3s1sTjPbywLDISVUtBY6HaA== - jpeg-js@0.4.2: version "0.4.2" resolved "https://registry.yarnpkg.com/jpeg-js/-/jpeg-js-0.4.2.tgz#8b345b1ae4abde64c2da2fe67ea216a114ac279d"