fix: Paragraphs in table cells skipped in import

Port HTML importer rules from enterprise fork
This commit is contained in:
Tom Moor
2023-10-15 10:51:50 -04:00
parent 00ee8729ec
commit d5bac6cbca
7 changed files with 86 additions and 85 deletions

View File

@@ -1,57 +0,0 @@
import repeat from "lodash/repeat";
import TurndownService from "turndown";
const highlightRegExp = /brush: ([a-z0-9]+);/;
/**
 * Turndown plugin that rewrites a Confluence syntax-highlighter block into a
 * fenced Markdown code block.
 *
 * The rule matches a DIV whose first child is a PRE with the exact class name
 * "syntaxhighlighter-pre". The language is read from the "brush: …;" entry of
 * the PRE's data-syntaxhighlighter-params attribute when present.
 *
 * @param turndownService The TurndownService instance to register the rule on.
 */
export default function confluenceCodeBlock(turndownService: TurndownService) {
  turndownService.addRule("fencedConfluenceHighlightedCodeBlock", {
    filter(node) {
      if (node.nodeName !== "DIV") {
        return false;
      }
      const pre = node.firstChild as HTMLElement | null;
      return (
        pre?.nodeName === "PRE" && pre.className === "syntaxhighlighter-pre"
      );
    },
    replacement(content, node) {
      const pre = node.firstChild as HTMLElement | null;
      const params = pre?.getAttribute("data-syntaxhighlighter-params") ?? "";
      const language = params.match(highlightRegExp)?.[1] ?? "";
      const code = pre?.textContent ?? "";

      // The fence has to be longer than any run of backticks that starts a
      // line inside the code, otherwise the block would be closed early.
      const fenceChar = "`";
      const fenceRuns =
        code.match(new RegExp("^" + fenceChar + "{3,}", "gm")) ?? [];
      const fenceSize = fenceRuns.reduce(
        (size, run) => (run.length >= size ? run.length + 1 : size),
        3
      );
      const fence = repeat(fenceChar, fenceSize);

      return [
        "\n\n",
        fence,
        language,
        "\n",
        // Drop a single trailing newline so the closing fence follows the
        // last line of code directly.
        code.replace(/\n$/, ""),
        "\n",
        fence,
        "\n\n",
      ].join("");
    },
  });
}

View File

@@ -1,23 +0,0 @@
import TurndownService from "turndown";
/**
 * Turndown plugin that renders Confluence inline task list items as Markdown
 * checkboxes.
 *
 * @param turndownService The TurndownService instance to register the rule on.
 */
export default function confluenceTaskList(turndownService: TurndownService) {
  turndownService.addRule("confluenceTaskList", {
    // Matches LI elements whose parent is a UL with a class name containing
    // "inline-task-list".
    filter(node) {
      if (node.nodeName !== "LI") {
        return false;
      }
      const list = node.parentElement;
      if (list?.nodeName !== "UL") {
        return false;
      }
      return list.className.includes("inline-task-list");
    },
    // An item whose class name is exactly "checked" becomes a ticked box,
    // anything else an empty one. Nodes without a className pass through.
    replacement(content, node) {
      if (!("className" in node)) {
        return content;
      }
      const checkbox = node.className === "checked" ? "- [x]" : "- [ ]";
      return checkbox + ` ${content} \n`;
    },
  });
}

View File

@@ -0,0 +1,21 @@
import TurndownService from "turndown";
/**
 * A turndown plugin for converting paragraphs that contain only a line break
 * into an escaped blank line.
 *
 * @param turndownService The TurndownService instance.
 */
export default function emptyParagraphs(turndownService: TurndownService) {
  turndownService.addRule("emptyParagraphs", {
    filter(node) {
      if (node.nodeName !== "P" || node.children.length !== 1) {
        return false;
      }
      const onlyChild = node.children[0];
      // `children` lists element children only, so a paragraph such as
      // `<p>Some text<br></p>` also has exactly one child. Require the text
      // content to be blank so that paragraphs with real text are not
      // swallowed by this rule (the replacement discards the content).
      return (
        onlyChild?.nodeName === "BR" && (node.textContent ?? "").trim() === ""
      );
    },
    replacement() {
      // Blank line followed by a backslash hard break.
      return "\n\n\\\n";
    },
  });
}

View File

@@ -1,11 +1,12 @@
import { gfm } from "@joplin/turndown-plugin-gfm"; import { gfm } from "@joplin/turndown-plugin-gfm";
import TurndownService from "turndown"; import TurndownService from "turndown";
import breaks from "./breaks"; import breaks from "./breaks";
import confluenceCodeBlock from "./confluence-code-block"; import emptyLists from "./emptyLists";
import confluenceTaskList from "./confluence-task-list"; import emptyParagraphs from "./emptyParagraph";
import emptyLists from "./empty-lists";
import frames from "./frames"; import frames from "./frames";
import images from "./images"; import images from "./images";
import sanitizeTables from "./sanitizeTables";
import underlines from "./underlines";
/** /**
* Turndown converts HTML to Markdown and is used in the importer code. * Turndown converts HTML to Markdown and is used in the importer code.
@@ -26,9 +27,10 @@ const service = new TurndownService({
}) })
.remove(["script", "style", "title", "head"]) .remove(["script", "style", "title", "head"])
.use(gfm) .use(gfm)
.use(emptyParagraphs)
.use(sanitizeTables)
.use(underlines)
.use(frames) .use(frames)
.use(confluenceTaskList)
.use(confluenceCodeBlock)
.use(images) .use(images)
.use(breaks) .use(breaks)
.use(emptyLists); .use(emptyLists);

View File

@@ -0,0 +1,43 @@
import TurndownService from "turndown";
/**
 * Turndown plugin that flattens headings and paragraphs found inside tables
 * into inline content.
 *
 * @param turndownService The TurndownService instance to register the rules on.
 */
export default function sanitizeTables(turndownService: TurndownService) {
  const headingNames = new Set(["H1", "H2", "H3", "H4", "H5", "H6"]);

  // Reports whether the closest element to `node` (the node itself when it is
  // an element) matches `selector` or has an ancestor that does.
  const inHtmlContext = (node: HTMLElement, selector: string) => {
    let element = node;
    // Climb until an element node (nodeType 1) is found or the tree runs out.
    while (element !== null && element.nodeType !== 1) {
      element = (element.parentElement || element.parentNode) as HTMLElement;
    }
    if (element === null || element.nodeType !== 1) {
      return false;
    }
    return element.closest(selector) !== null;
  };

  // Headings inside a table are rendered as bold text.
  turndownService.addRule("headingsInTables", {
    filter(node) {
      return headingNames.has(node.nodeName) && inHtmlContext(node, "table");
    },
    replacement(content) {
      const text = content.trim();
      return `**${text}**`;
    },
  });

  // Paragraphs inside a table are reduced to their trimmed content.
  turndownService.addRule("paragraphsInCells", {
    filter(node) {
      if (node.nodeName !== "P") {
        return false;
      }
      return inHtmlContext(node, "table");
    },
    replacement(content) {
      return content.trim();
    },
  });
}

View File

@@ -0,0 +1,15 @@
import TurndownService from "turndown";
/**
 * A turndown plugin for converting u tags to underlines.
 *
 * @param turndownService The TurndownService instance.
 */
export default function underlines(turndownService: TurndownService) {
  turndownService.addRule("underlines", {
    filter: ["u"],
    replacement(content) {
      const text = content.trim();
      // Wrapping nothing would emit a bare "____", which is not an underline
      // and can be read as a thematic break, so emit nothing instead.
      if (text === "") {
        return "";
      }
      return `__${text}__`;
    },
  });
}