From b7f0af9b856dded2e46bd84aa72fc6905a7f0bdb Mon Sep 17 00:00:00 2001
From: Tom Moor
Date: Sat, 24 Feb 2024 13:01:04 -0500
Subject: [PATCH] fix: Vendorized turndown-gfm-plugin and fixed performance issue with parsing tables

closes OLN-277
---
 server/utils/turndown/index.ts  |   7 +-
 server/utils/turndown/tables.ts | 425 ++++++++++++++++++++++++++++++++
 2 files changed, 430 insertions(+), 2 deletions(-)
 create mode 100644 server/utils/turndown/tables.ts

diff --git a/server/utils/turndown/index.ts b/server/utils/turndown/index.ts
index 649129d5b..9e8e0ab38 100644
--- a/server/utils/turndown/index.ts
+++ b/server/utils/turndown/index.ts
@@ -1,4 +1,4 @@
-import { gfm } from "@joplin/turndown-plugin-gfm";
+import { taskListItems, strikethrough } from "@joplin/turndown-plugin-gfm";
 import TurndownService from "turndown";
 import breaks from "./breaks";
 import emptyLists from "./emptyLists";
@@ -8,6 +8,7 @@ import images from "./images";
 import inlineLink from "./inlineLink";
 import sanitizeLists from "./sanitizeLists";
 import sanitizeTables from "./sanitizeTables";
+import tables from "./tables";
 import underlines from "./underlines";
 import { inHtmlContext } from "./utils";
 
@@ -27,7 +28,9 @@ const service = new TurndownService({
       : "",
 })
   .remove(["script", "style", "title", "head"])
-  .use(gfm)
+  .use(taskListItems)
+  .use(strikethrough)
+  .use(tables)
   .use(inlineLink)
   .use(emptyParagraph)
   .use(sanitizeTables)
diff --git a/server/utils/turndown/tables.ts b/server/utils/turndown/tables.ts
new file mode 100644
index 000000000..4290418fc
--- /dev/null
+++ b/server/utils/turndown/tables.ts
@@ -0,0 +1,425 @@
+// Based on https://www.npmjs.com/package/joplin-turndown-plugin-gfm
+import type TurndownService from "turndown";
+import { inHtmlContext } from "./utils";
+
+const rules: Record<string, TurndownService.Rule> = {};
+const alignMap = { left: ":---", right: "---:", center: ":---:" };
+
+// Note use of WeakMap to enable garbage collection
+const tableShouldBeSkippedCache = new WeakMap<HTMLTableElement, boolean>();
+
+/**
+ * Reads the alignment declared on a cell, preferring the `align` attribute
+ * over the inline `text-align` style.
+ *
+ * @param node The cell to inspect.
+ * @returns The lower-cased alignment, or an empty string if none is declared.
+ */
+function getAlignment(node: HTMLElement) {
+  return node
+    ? (node.getAttribute("align") || node.style.textAlign || "").toLowerCase()
+    : "";
+}
+
+/**
+ * Maps an alignment to the matching Markdown divider cell.
+ *
+ * @param alignment The column alignment.
+ * @returns The divider for the alignment, or `---` when it is empty.
+ */
+function getBorder(alignment: keyof typeof alignMap) {
+  return alignment ? alignMap[alignment] : "---";
+}
+
+/**
+ * Picks the alignment of a column by counting the alignment declared on the
+ * cell at that index in every row. The result starts out as `left` and is
+ * only replaced by an alignment that has an entry in `alignMap`.
+ *
+ * @param table The table the column belongs to.
+ * @param columnIndex The zero-based index of the column.
+ * @returns The alignment to use for the column.
+ */
+function getColumnAlignment(
+  table: HTMLTableElement | null,
+  columnIndex: number
+) {
+  const votes = {
+    left: 0,
+    right: 0,
+    center: 0,
+    "": 0,
+  };
+
+  let align: keyof typeof alignMap = "left";
+  if (!table) {
+    return align;
+  }
+
+  // Reference is important as .rows is an expensive getter.
+  const rows = table.rows;
+
+  for (let i = 0; i < rows.length; ++i) {
+    const row = rows[i];
+    if (columnIndex < row.childNodes.length) {
+      const cellAlignment = getAlignment(
+        row.childNodes[columnIndex] as HTMLElement
+      );
+      ++votes[cellAlignment];
+
+      if (
+        votes[cellAlignment] > votes[align] &&
+        Object.keys(alignMap).includes(cellAlignment)
+      ) {
+        align = cellAlignment as keyof typeof alignMap;
+      }
+    }
+  }
+
+  return align;
+}
+
+rules.tableCell = {
+  filter: ["th", "td"],
+  replacement(content, node: HTMLTableCellElement) {
+    if (tableShouldBeSkipped(nodeParentTable(node))) {
+      return content;
+    }
+    return cell(content, node);
+  },
+};
+
+rules.tableRow = {
+  filter: "tr",
+  replacement(content, node: HTMLTableRowElement) {
+    const parentTable = nodeParentTable(node);
+    if (tableShouldBeSkipped(parentTable)) {
+      return content;
+    }
+
+    let borderCells = "";
+
+    if (isHeadingRow(node)) {
+      const colCount = tableColCount(parentTable);
+      for (let i = 0; i < colCount; i++) {
+        const childNode =
+          i < node.childNodes.length ? node.childNodes[i] : null;
+        const border = getBorder(getColumnAlignment(parentTable, i));
+        borderCells += cell(border, childNode, i);
+      }
+    }
+    return "\n" + content + (borderCells ? "\n" + borderCells : "");
+  },
+};
+
+rules.table = {
+  // Only convert tables that can result in valid Markdown
+  // Other tables are kept as HTML using `keep` (see below).
+  filter(node) {
+    return node.nodeName === "TABLE" && !tableShouldBeHtml(node);
+  },
+
+  replacement(content, node: HTMLTableElement) {
+    if (tableShouldBeSkipped(node)) {
+      return content;
+    }
+
+    // Ensure there are no blank lines
+    content = content.replace(/\n+/g, "\n");
+
+    // If table has no heading, add an empty one so as to get a valid Markdown table
+    const secondLineParts = content.trim().split("\n");
+    let secondLine = "";
+    if (secondLineParts.length >= 2) {
+      secondLine = secondLineParts[1];
+    }
+    const secondLineIsDivider = /\| :?---/.test(secondLine);
+
+    const columnCount = tableColCount(node);
+    let emptyHeader = "";
+    if (columnCount && !secondLineIsDivider) {
+      emptyHeader = "|" + " |".repeat(columnCount) + "\n" + "|";
+      for (let columnIndex = 0; columnIndex < columnCount; ++columnIndex) {
+        emptyHeader +=
+          " " + getBorder(getColumnAlignment(node, columnIndex)) + " |";
+      }
+    }
+
+    return "\n\n" + emptyHeader + content + "\n\n";
+  },
+};
+
+rules.tableSection = {
+  filter: ["thead", "tbody", "tfoot"],
+  replacement(content) {
+    return content;
+  },
+};
+
+/**
+ * A tr is a heading row if the parent is a THEAD or it's the first child of the TABLE or the first
+ * TBODY (possibly following a blank THEAD) and every cell is a TH.
+ *
+ * @param tr The tr node to check
+ * @returns Whether the tr is a heading row
+ */
+function isHeadingRow(tr: Node) {
+  const parentNode = tr.parentNode;
+  if (!parentNode) {
+    return false;
+  }
+
+  return (
+    parentNode.nodeName === "THEAD" ||
+    (parentNode.firstChild === tr &&
+      (parentNode.nodeName === "TABLE" || isFirstTbody(parentNode)) &&
+      Array.from(tr.childNodes).every((n) => n.nodeName === "TH"))
+  );
+}
+
+/**
+ * Checks whether a node is the first TBODY of its table: a TBODY that has no
+ * previous sibling, or whose previous sibling is a THEAD with no text in it.
+ *
+ * @param element The node to check
+ * @returns Whether the node is the first TBODY
+ */
+function isFirstTbody(element: Node) {
+  if (element?.nodeName !== "TBODY") {
+    return false;
+  }
+
+  // A TBODY with no previous sibling is the first section of its table, so a
+  // missing sibling must not be treated as a mismatch.
+  const previousSibling = element.previousSibling;
+  return (
+    !previousSibling ||
+    (previousSibling.nodeName === "THEAD" &&
+      /^\s*$/i.test(previousSibling.textContent ?? ""))
+  );
+}
+
+/**
+ * Renders a single Markdown table cell.
+ *
+ * @param content The text of the cell.
+ * @param node The cell node, used to find the index and colspan if given.
+ * @param index The index of the cell in its row, looked up when omitted.
+ * @returns The cell, prefixed with a pipe when it is first in the row.
+ */
+function cell(
+  content: string,
+  node: ChildNode | null = null,
+  index: number | null = null
+) {
+  if (index === null && node) {
+    index = Array.from(node?.parentNode?.childNodes ?? []).indexOf(node);
+  }
+  let prefix = " ";
+  if (index === 0) {
+    prefix = "| ";
+  }
+  // Line breaks are not allowed inside a Markdown table row. CRLF is matched
+  // first so that it becomes a single <br> with no carriage return left over.
+  let filteredContent = content.trim().replace(/\r\n|\n\r|\n/g, "<br>");
+  // Escape every pipe individually so that none are dropped from the text.
+  filteredContent = filteredContent.replace(/\|/g, "\\|");
+  while (filteredContent.length < 3) {
+    filteredContent += " ";
+  }
+  if (node) {
+    filteredContent = handleColSpan(filteredContent, node, " ");
+  }
+  return prefix + filteredContent + " |";
+}
+
+/**
+ * Checks whether a node has a TABLE anywhere among its descendants.
+ *
+ * @param node The node to search.
+ * @returns Whether a nested table was found.
+ */
+function nodeContainsTable(node: Node) {
+  if (!node?.childNodes) {
+    return false;
+  }
+
+  for (let i = 0; i < node.childNodes.length; i++) {
+    const child = node.childNodes[i];
+    if (child.nodeName === "TABLE") {
+      return true;
+    }
+    if (nodeContainsTable(child)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/**
+ * Checks whether any descendant of a node matches the given types.
+ *
+ * @param node The node to search.
+ * @param types Node names to look for, or "code" to look for descendants
+ * that `inHtmlContext` reports as being in a CODE context.
+ * @returns Whether a matching descendant was found.
+ */
+const nodeContains = (node: HTMLElement, types: string | string[]) => {
+  if (!node?.childNodes) {
+    return false;
+  }
+
+  for (let i = 0; i < node.childNodes.length; i++) {
+    const child = node.childNodes[i] as HTMLElement;
+    if (types === "code" && inHtmlContext(child, "CODE")) {
+      return true;
+    }
+    if (types.includes(child.nodeName)) {
+      return true;
+    }
+    if (nodeContains(child, types)) {
+      return true;
+    }
+  }
+
+  return false;
+};
+
+/**
+ * Tables containing code, lists, headings, horizontal rules or blockquotes
+ * are kept as HTML instead of being converted to Markdown.
+ *
+ * @param tableNode The table to check.
+ * @returns Whether the table should be kept as HTML.
+ */
+const tableShouldBeHtml = (tableNode: HTMLElement) =>
+  nodeContains(tableNode, "code") ||
+  nodeContains(tableNode, [
+    "UL",
+    "OL",
+    "H1",
+    "H2",
+    "H3",
+    "H4",
+    "H5",
+    "H6",
+    "HR",
+    "BLOCKQUOTE",
+  ]);
+
+// Various conditions under which a table should be skipped - i.e. each cell
+// will be rendered one after the other as if they were paragraphs.
+function tableShouldBeSkipped(tableNode: HTMLTableElement | null) {
+  if (!tableNode) {
+    return true;
+  }
+
+  const cached = tableShouldBeSkippedCache.get(tableNode);
+  if (cached !== undefined) {
+    return cached;
+  }
+
+  // Reference is important as .rows is an expensive getter.
+  const rows = tableNode.rows;
+
+  // Skipped when the rows collection is missing, when the only row has at most
+  // one child node or when another table is nested inside.
+  const result =
+    !rows ||
+    (rows.length === 1 && rows[0].childNodes.length <= 1) ||
+    nodeContainsTable(tableNode);
+
+  tableShouldBeSkippedCache.set(tableNode, result);
+  return result;
+}
+
+/**
+ * Walks up the ancestors of a cell or row to find the table it belongs to.
+ *
+ * @param node The cell or row to start from.
+ * @returns The closest TABLE ancestor, or null if there is none.
+ */
+function nodeParentTable(
+  node: HTMLTableCellElement | HTMLTableRowElement
+): HTMLTableElement | null {
+  let parent = node.parentNode;
+  if (!parent) {
+    return null;
+  }
+
+  while (parent.nodeName !== "TABLE") {
+    parent = parent.parentNode;
+    if (!parent) {
+      return null;
+    }
+  }
+
+  return parent as HTMLTableElement;
+}
+
+/**
+ * Pads a cell with one extra empty cell for every additional column that the
+ * node spans according to its `colspan` attribute.
+ *
+ * @param content The rendered content of the cell.
+ * @param node The cell node to read `colspan` from.
+ * @param emptyChar The character used to fill the added cells.
+ * @returns The content followed by the added cells.
+ */
+function handleColSpan(content: string, node: ChildNode, emptyChar: string) {
+  if (!node) {
+    return content;
+  }
+
+  const colspan = Number((node as HTMLElement).getAttribute("colspan") || 1);
+  for (let i = 1; i < colspan; i++) {
+    content += " | " + emptyChar.repeat(3);
+  }
+  return content;
+}
+
+/**
+ * Counts the columns of a table as the largest number of child nodes found
+ * in any of its rows.
+ *
+ * @param node The table to measure.
+ * @returns The column count, or 0 when there is no table.
+ */
+function tableColCount(node: HTMLTableElement | null) {
+  if (!node) {
+    return 0;
+  }
+
+  let maxColCount = 0;
+
+  // Reference is important as .rows is an expensive getter.
+  const rows = node.rows;
+
+  for (let i = 0; i < rows.length; i++) {
+    const row = rows[i];
+    const colCount = row.childNodes.length;
+    if (colCount > maxColCount) {
+      maxColCount = colCount;
+    }
+  }
+  return maxColCount;
+}
+
+/**
+ * Turndown plugin that registers the table rules defined in this file and
+ * keeps tables for which `tableShouldBeHtml` is true as HTML.
+ *
+ * @param turndownService The service to register the rules on.
+ */
+export default function tables(turndownService: TurndownService) {
+  turndownService.keep(function (node) {
+    if (node.nodeName === "TABLE" && tableShouldBeHtml(node)) {
+      return true;
+    }
+    return false;
+  });
+
+  for (const key in rules) {
+    turndownService.addRule(key, rules[key]);
+  }
+}