From d5bac6cbcaebd5f3d29772e3f5dad462d8fb0da3 Mon Sep 17 00:00:00 2001
From: Tom Moor
Date: Sun, 15 Oct 2023 10:51:50 -0400
Subject: [PATCH] fix: Paragraphs in table cells skipped in import

Port HTML importer rules from enterprise fork
---
Review notes (this section is not part of the commit message):

- emptyParagraph.ts: Element.children only lists element children, so the
  original filter also matched a paragraph that holds text plus a single BR,
  and the replacement (which ignores the content) dropped that text. The
  filter now also requires the paragraph to have no non-whitespace text.
- The blob id on the emptyParagraph.ts index line predates that change.
- Line breaks were restored from a collapsed copy of this patch; indentation
  is reconstructed (2 spaces) and should be checked against the tree before
  applying the two file deletions.

 .../utils/turndown/confluence-code-block.ts   | 57 -------------------
 server/utils/turndown/confluence-task-list.ts | 23 --------
 .../{empty-lists.ts => emptyLists.ts}         |  0
 server/utils/turndown/emptyParagraph.ts       | 24 ++++++++
 server/utils/turndown/index.ts                | 12 ++--
 server/utils/turndown/sanitizeTables.ts       | 43 ++++++++++++++
 server/utils/turndown/underlines.ts           | 15 +++++
 7 files changed, 89 insertions(+), 85 deletions(-)
 delete mode 100644 server/utils/turndown/confluence-code-block.ts
 delete mode 100644 server/utils/turndown/confluence-task-list.ts
 rename server/utils/turndown/{empty-lists.ts => emptyLists.ts} (100%)
 create mode 100644 server/utils/turndown/emptyParagraph.ts
 create mode 100644 server/utils/turndown/sanitizeTables.ts
 create mode 100644 server/utils/turndown/underlines.ts

diff --git a/server/utils/turndown/confluence-code-block.ts b/server/utils/turndown/confluence-code-block.ts
deleted file mode 100644
index 986a5df51..000000000
--- a/server/utils/turndown/confluence-code-block.ts
+++ /dev/null
@@ -1,57 +0,0 @@
-import repeat from "lodash/repeat";
-import TurndownService from "turndown";
-
-const highlightRegExp = /brush: ([a-z0-9]+);/;
-
-/**
- * A turndown plugin for converting a confluence code block to markdown.
- *
- * @param turndownService The TurndownService instance.
- */
-export default function confluenceCodeBlock(turndownService: TurndownService) {
-  turndownService.addRule("fencedConfluenceHighlightedCodeBlock", {
-    filter(node) {
-      const firstChild = node.firstChild;
-      return (
-        node.nodeName === "DIV" &&
-        firstChild?.nodeName === "PRE" &&
-        // @ts-expect-error className exists
-        firstChild.className === "syntaxhighlighter-pre"
-      );
-    },
-    replacement(content, node) {
-      const dataSyntaxhighlighterParams =
-        // @ts-expect-error getAttribute exists
-        node.firstChild?.getAttribute("data-syntaxhighlighter-params") ?? "";
-      const language = (dataSyntaxhighlighterParams.match(highlightRegExp) || [
-        null,
-        "",
-      ])[1];
-      const code = node.firstChild?.textContent ?? "";
-
-      const fenceChar = "`";
-      let fenceSize = 3;
-      const fenceInCodeRegex = new RegExp("^" + fenceChar + "{3,}", "gm");
-
-      let match;
-      while ((match = fenceInCodeRegex.exec(code))) {
-        if (match[0].length >= fenceSize) {
-          fenceSize = match[0].length + 1;
-        }
-      }
-
-      const fence = repeat(fenceChar, fenceSize);
-
-      return (
-        "\n\n" +
-        fence +
-        language +
-        "\n" +
-        code.replace(/\n$/, "") +
-        "\n" +
-        fence +
-        "\n\n"
-      );
-    },
-  });
-}
diff --git a/server/utils/turndown/confluence-task-list.ts b/server/utils/turndown/confluence-task-list.ts
deleted file mode 100644
index 8ecaac15a..000000000
--- a/server/utils/turndown/confluence-task-list.ts
+++ /dev/null
@@ -1,23 +0,0 @@
-import TurndownService from "turndown";
-
-/**
- * A turndown plugin for converting a confluence task list to markdown.
- *
- * @param turndownService The TurndownService instance.
- */
-export default function confluenceTaskList(turndownService: TurndownService) {
-  turndownService.addRule("confluenceTaskList", {
-    filter(node) {
-      return (
-        node.nodeName === "LI" &&
-        node.parentElement?.nodeName === "UL" &&
-        node.parentElement?.className.includes("inline-task-list")
-      );
-    },
-    replacement(content, node) {
-      return "className" in node
-        ? (node.className === "checked" ? "- [x]" : "- [ ]") + ` ${content} \n`
-        : content;
-    },
-  });
-}
diff --git a/server/utils/turndown/empty-lists.ts b/server/utils/turndown/emptyLists.ts
similarity index 100%
rename from server/utils/turndown/empty-lists.ts
rename to server/utils/turndown/emptyLists.ts
diff --git a/server/utils/turndown/emptyParagraph.ts b/server/utils/turndown/emptyParagraph.ts
new file mode 100644
index 000000000..1b2017876
--- /dev/null
+++ b/server/utils/turndown/emptyParagraph.ts
@@ -0,0 +1,24 @@
+import TurndownService from "turndown";
+
+/**
+ * A turndown plugin for converting paragraphs with only a break to newlines.
+ *
+ * @param turndownService The TurndownService instance.
+ */
+export default function emptyParagraphs(turndownService: TurndownService) {
+  turndownService.addRule("emptyParagraphs", {
+    filter(node) {
+      return (
+        node.nodeName === "P" &&
+        node.children.length === 1 &&
+        node.children[0].nodeName === "BR" &&
+        // `children` only lists elements, so also require that there is no
+        // text next to the break, otherwise that text would be discarded.
+        !node.textContent?.trim()
+      );
+    },
+    replacement() {
+      return "\n\n\\\n";
+    },
+  });
+}
diff --git a/server/utils/turndown/index.ts b/server/utils/turndown/index.ts
index 56b093602..8efc8d9de 100644
--- a/server/utils/turndown/index.ts
+++ b/server/utils/turndown/index.ts
@@ -1,11 +1,12 @@
 import { gfm } from "@joplin/turndown-plugin-gfm";
 import TurndownService from "turndown";
 import breaks from "./breaks";
-import confluenceCodeBlock from "./confluence-code-block";
-import confluenceTaskList from "./confluence-task-list";
-import emptyLists from "./empty-lists";
+import emptyLists from "./emptyLists";
+import emptyParagraphs from "./emptyParagraph";
 import frames from "./frames";
 import images from "./images";
+import sanitizeTables from "./sanitizeTables";
+import underlines from "./underlines";
 
 /**
  * Turndown converts HTML to Markdown and is used in the importer code.
@@ -26,9 +27,10 @@ const service = new TurndownService({
 })
   .remove(["script", "style", "title", "head"])
   .use(gfm)
+  .use(emptyParagraphs)
+  .use(sanitizeTables)
+  .use(underlines)
   .use(frames)
-  .use(confluenceTaskList)
-  .use(confluenceCodeBlock)
   .use(images)
   .use(breaks)
   .use(emptyLists);
diff --git a/server/utils/turndown/sanitizeTables.ts b/server/utils/turndown/sanitizeTables.ts
new file mode 100644
index 000000000..70c86bf8d
--- /dev/null
+++ b/server/utils/turndown/sanitizeTables.ts
@@ -0,0 +1,43 @@
+import TurndownService from "turndown";
+
+/**
+ * A turndown plugin for removing incompatible nodes from tables.
+ *
+ * @param turndownService The TurndownService instance.
+ */
+export default function sanitizeTables(turndownService: TurndownService) {
+  function inHtmlContext(node: HTMLElement, selector: string) {
+    let currentNode = node;
+    // start at the closest element
+    while (currentNode !== null && currentNode.nodeType !== 1) {
+      currentNode = (currentNode.parentElement ||
+        currentNode.parentNode) as HTMLElement;
+    }
+    return (
+      currentNode !== null &&
+      currentNode.nodeType === 1 &&
+      currentNode.closest(selector) !== null
+    );
+  }
+
+  turndownService.addRule("headingsInTables", {
+    filter(node) {
+      return (
+        ["H1", "H2", "H3", "H4", "H5", "H6"].includes(node.nodeName) &&
+        inHtmlContext(node, "table")
+      );
+    },
+    replacement(content) {
+      return `**${content.trim()}**`;
+    },
+  });
+
+  turndownService.addRule("paragraphsInCells", {
+    filter(node) {
+      return node.nodeName === "P" && inHtmlContext(node, "table");
+    },
+    replacement(content) {
+      return content.trim();
+    },
+  });
+}
diff --git a/server/utils/turndown/underlines.ts b/server/utils/turndown/underlines.ts
new file mode 100644
index 000000000..be6b2dfac
--- /dev/null
+++ b/server/utils/turndown/underlines.ts
@@ -0,0 +1,15 @@
+import TurndownService from "turndown";
+
+/**
+ * A turndown plugin for converting u tags to underlines.
+ *
+ * @param turndownService The TurndownService instance.
+ */
+export default function underlines(turndownService: TurndownService) {
+  turndownService.addRule("underlines", {
+    filter: ["u"],
+    replacement(content) {
+      return `__${content.trim()}__`;
+    },
+  });
+}