fix: Vendorized turndown-gfm-plugin and fixed performance issue with parsing tables
closes OLN-277
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
import { gfm } from "@joplin/turndown-plugin-gfm";
|
||||
import { taskListItems, strikethrough } from "@joplin/turndown-plugin-gfm";
|
||||
import TurndownService from "turndown";
|
||||
import breaks from "./breaks";
|
||||
import emptyLists from "./emptyLists";
|
||||
@@ -8,6 +8,7 @@ import images from "./images";
|
||||
import inlineLink from "./inlineLink";
|
||||
import sanitizeLists from "./sanitizeLists";
|
||||
import sanitizeTables from "./sanitizeTables";
|
||||
import tables from "./tables";
|
||||
import underlines from "./underlines";
|
||||
import { inHtmlContext } from "./utils";
|
||||
|
||||
@@ -27,7 +28,9 @@ const service = new TurndownService({
|
||||
: "",
|
||||
})
|
||||
.remove(["script", "style", "title", "head"])
|
||||
.use(gfm)
|
||||
.use(taskListItems)
|
||||
.use(strikethrough)
|
||||
.use(tables)
|
||||
.use(inlineLink)
|
||||
.use(emptyParagraph)
|
||||
.use(sanitizeTables)
|
||||
|
||||
349
server/utils/turndown/tables.ts
Normal file
349
server/utils/turndown/tables.ts
Normal file
@@ -0,0 +1,349 @@
|
||||
// Based on https://www.npmjs.com/package/joplin-turndown-plugin-gfm
|
||||
import type TurndownService from "turndown";
|
||||
import { inHtmlContext } from "./utils";
|
||||
|
||||
const rules: Record<string, TurndownService.Rule> = {};
|
||||
const alignMap = { left: ":---", right: "---:", center: ":---:" };
|
||||
|
||||
// Note use of WeakMap to enable garbage collection
|
||||
const tableShouldBeSkippedCache = new WeakMap<HTMLTableElement, boolean>();
|
||||
|
||||
function getAlignment(node: HTMLElement) {
|
||||
return node
|
||||
? (node.getAttribute("align") || node.style.textAlign || "").toLowerCase()
|
||||
: "";
|
||||
}
|
||||
|
||||
function getBorder(alignment: keyof typeof alignMap) {
|
||||
return alignment ? alignMap[alignment] : "---";
|
||||
}
|
||||
|
||||
function getColumnAlignment(
|
||||
table: HTMLTableElement | null,
|
||||
columnIndex: number
|
||||
) {
|
||||
const votes = {
|
||||
left: 0,
|
||||
right: 0,
|
||||
center: 0,
|
||||
"": 0,
|
||||
};
|
||||
|
||||
let align: keyof typeof alignMap = "left";
|
||||
if (!table) {
|
||||
return align;
|
||||
}
|
||||
|
||||
// Reference is important as .rows is an expensive getter.
|
||||
const rows = table.rows;
|
||||
|
||||
for (let i = 0; i < rows.length; ++i) {
|
||||
const row = rows[i];
|
||||
if (columnIndex < row.childNodes.length) {
|
||||
const cellAlignment = getAlignment(
|
||||
row.childNodes[columnIndex] as HTMLElement
|
||||
);
|
||||
++votes[cellAlignment];
|
||||
|
||||
if (
|
||||
votes[cellAlignment] > votes[align] &&
|
||||
Object.keys(alignMap).includes(cellAlignment)
|
||||
) {
|
||||
align = cellAlignment as keyof typeof alignMap;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return align;
|
||||
}
|
||||
|
||||
rules.tableCell = {
|
||||
filter: ["th", "td"],
|
||||
replacement(content, node: HTMLTableCellElement) {
|
||||
if (tableShouldBeSkipped(nodeParentTable(node))) {
|
||||
return content;
|
||||
}
|
||||
return cell(content, node);
|
||||
},
|
||||
};
|
||||
|
||||
rules.tableRow = {
|
||||
filter: "tr",
|
||||
replacement(content, node: HTMLTableRowElement) {
|
||||
const parentTable = nodeParentTable(node);
|
||||
if (tableShouldBeSkipped(parentTable)) {
|
||||
return content;
|
||||
}
|
||||
|
||||
let borderCells = "";
|
||||
|
||||
if (isHeadingRow(node)) {
|
||||
const colCount = tableColCount(parentTable);
|
||||
for (let i = 0; i < colCount; i++) {
|
||||
const childNode =
|
||||
i < node.childNodes.length ? node.childNodes[i] : null;
|
||||
const border = getBorder(getColumnAlignment(parentTable, i));
|
||||
borderCells += cell(border, childNode, i);
|
||||
}
|
||||
}
|
||||
return "\n" + content + (borderCells ? "\n" + borderCells : "");
|
||||
},
|
||||
};
|
||||
|
||||
rules.table = {
|
||||
// Only convert tables that can result in valid Markdown
|
||||
// Other tables are kept as HTML using `keep` (see below).
|
||||
filter(node) {
|
||||
return node.nodeName === "TABLE" && !tableShouldBeHtml(node);
|
||||
},
|
||||
|
||||
replacement(content, node: HTMLTableElement) {
|
||||
if (tableShouldBeSkipped(node)) {
|
||||
return content;
|
||||
}
|
||||
|
||||
// Ensure there are no blank lines
|
||||
content = content.replace(/\n+/g, "\n");
|
||||
|
||||
// If table has no heading, add an empty one so as to get a valid Markdown table
|
||||
const secondLineParts = content.trim().split("\n");
|
||||
let secondLine = "";
|
||||
if (secondLineParts.length >= 2) {
|
||||
secondLine = secondLineParts[1];
|
||||
}
|
||||
const secondLineIsDivider = /\| :?---/.test(secondLine);
|
||||
|
||||
const columnCount = tableColCount(node);
|
||||
let emptyHeader = "";
|
||||
if (columnCount && !secondLineIsDivider) {
|
||||
emptyHeader = "|" + " |".repeat(columnCount) + "\n" + "|";
|
||||
for (let columnIndex = 0; columnIndex < columnCount; ++columnIndex) {
|
||||
emptyHeader +=
|
||||
" " + getBorder(getColumnAlignment(node, columnIndex)) + " |";
|
||||
}
|
||||
}
|
||||
|
||||
return "\n\n" + emptyHeader + content + "\n\n";
|
||||
},
|
||||
};
|
||||
|
||||
rules.tableSection = {
|
||||
filter: ["thead", "tbody", "tfoot"],
|
||||
replacement(content) {
|
||||
return content;
|
||||
},
|
||||
};
|
||||
|
||||
/**
|
||||
* A tr is a heading row if the parent is a THEAD or its the first child of the TABLE or the first
|
||||
* TBODY (possibly following a blank THEAD) and every cell is a TH.
|
||||
*
|
||||
* @param tr The tr node to check
|
||||
* @returns Whether the tr is a heading row
|
||||
*/
|
||||
function isHeadingRow(tr: Node) {
|
||||
const parentNode = tr.parentNode;
|
||||
if (!parentNode) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return (
|
||||
parentNode.nodeName === "THEAD" ||
|
||||
(parentNode.firstChild === tr &&
|
||||
(parentNode.nodeName === "TABLE" || isFirstTbody(parentNode)) &&
|
||||
Array.from(tr.childNodes).every((n) => n.nodeName === "TH"))
|
||||
);
|
||||
}
|
||||
|
||||
function isFirstTbody(element: Node) {
|
||||
const previousSibling = element?.previousSibling;
|
||||
if (!previousSibling) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return (
|
||||
element.nodeName === "TBODY" &&
|
||||
(!previousSibling ||
|
||||
(previousSibling.nodeName === "THEAD" &&
|
||||
/^\s*$/i.test(previousSibling.textContent ?? "")))
|
||||
);
|
||||
}
|
||||
|
||||
function cell(
|
||||
content: string,
|
||||
node: ChildNode | null = null,
|
||||
index: number | null = null
|
||||
) {
|
||||
if (index === null && node) {
|
||||
index = Array.from(node?.parentNode?.childNodes ?? []).indexOf(node);
|
||||
}
|
||||
let prefix = " ";
|
||||
if (index === 0) {
|
||||
prefix = "| ";
|
||||
}
|
||||
let filteredContent = content
|
||||
.trim()
|
||||
.replace(/\n\r/g, "<br>")
|
||||
.replace(/\n/g, "<br>");
|
||||
filteredContent = filteredContent.replace(/\|+/g, "\\|");
|
||||
while (filteredContent.length < 3) {
|
||||
filteredContent += " ";
|
||||
}
|
||||
if (node) {
|
||||
filteredContent = handleColSpan(filteredContent, node, " ");
|
||||
}
|
||||
return prefix + filteredContent + " |";
|
||||
}
|
||||
|
||||
function nodeContainsTable(node: Node) {
|
||||
if (!node?.childNodes) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (let i = 0; i < node.childNodes.length; i++) {
|
||||
const child = node.childNodes[i];
|
||||
if (child.nodeName === "TABLE") {
|
||||
return true;
|
||||
}
|
||||
if (nodeContainsTable(child)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
const nodeContains = (node: HTMLElement, types: string | string[]) => {
|
||||
if (!node?.childNodes) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (let i = 0; i < node.childNodes.length; i++) {
|
||||
const child = node.childNodes[i] as HTMLElement;
|
||||
if (types === "code" && inHtmlContext(child, "CODE")) {
|
||||
return true;
|
||||
}
|
||||
if (types.includes(child.nodeName)) {
|
||||
return true;
|
||||
}
|
||||
if (nodeContains(child, types)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
};
|
||||
|
||||
const tableShouldBeHtml = (tableNode: HTMLElement) =>
|
||||
nodeContains(tableNode, "code") ||
|
||||
nodeContains(tableNode, [
|
||||
"UL",
|
||||
"OL",
|
||||
"H1",
|
||||
"H2",
|
||||
"H3",
|
||||
"H4",
|
||||
"H5",
|
||||
"H6",
|
||||
"HR",
|
||||
"BLOCKQUOTE",
|
||||
]);
|
||||
|
||||
// Various conditions under which a table should be skipped - i.e. each cell
|
||||
// will be rendered one after the other as if they were paragraphs.
|
||||
function tableShouldBeSkipped(tableNode: HTMLTableElement | null) {
|
||||
if (!tableNode) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const cached = tableShouldBeSkippedCache.get(tableNode);
|
||||
if (cached !== undefined) {
|
||||
return cached;
|
||||
}
|
||||
|
||||
const process = () => {
|
||||
if (!tableNode) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Reference is important as .rows is an expensive getter.
|
||||
const rows = tableNode.rows;
|
||||
|
||||
if (!rows) {
|
||||
return true;
|
||||
}
|
||||
if (rows.length === 1 && rows[0].childNodes.length <= 1) {
|
||||
return true;
|
||||
}
|
||||
if (nodeContainsTable(tableNode)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
const result = process();
|
||||
tableShouldBeSkippedCache.set(tableNode, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
function nodeParentTable(
|
||||
node: HTMLTableCellElement | HTMLTableRowElement
|
||||
): HTMLTableElement | null {
|
||||
let parent = node.parentNode;
|
||||
if (!parent) {
|
||||
return null;
|
||||
}
|
||||
|
||||
while (parent.nodeName !== "TABLE") {
|
||||
parent = parent.parentNode;
|
||||
if (!parent) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
return parent as HTMLTableElement;
|
||||
}
|
||||
|
||||
function handleColSpan(content: string, node: ChildNode, emptyChar: string) {
|
||||
if (!node) {
|
||||
return content;
|
||||
}
|
||||
|
||||
const colspan = Number((node as HTMLElement).getAttribute("colspan") || 1);
|
||||
for (let i = 1; i < colspan; i++) {
|
||||
content += " | " + emptyChar.repeat(3);
|
||||
}
|
||||
return content;
|
||||
}
|
||||
|
||||
function tableColCount(node: HTMLTableElement | null) {
|
||||
if (!node) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
let maxColCount = 0;
|
||||
|
||||
// Reference is important as .rows is an expensive getter.
|
||||
const rows = node.rows;
|
||||
|
||||
for (let i = 0; i < rows.length; i++) {
|
||||
const row = rows[i];
|
||||
const colCount = row.childNodes.length;
|
||||
if (colCount > maxColCount) {
|
||||
maxColCount = colCount;
|
||||
}
|
||||
}
|
||||
return maxColCount;
|
||||
}
|
||||
|
||||
export default function tables(turndownService: TurndownService) {
|
||||
turndownService.keep(function (node) {
|
||||
if (node.nodeName === "TABLE" && tableShouldBeHtml(node)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
|
||||
for (const key in rules) {
|
||||
turndownService.addRule(key, rules[key]);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user