fix: Improve logic for word import (#6361)
* Refactor DocumentConverter
* Support parsing images from Confluence exported .doc files
* fix: Bring across 2 fixes from enterprise codebase
* Bust dependency cache
This commit is contained in:
@@ -1,136 +1,15 @@
|
||||
import path from "path";
|
||||
import emojiRegex from "emoji-regex";
|
||||
import escapeRegExp from "lodash/escapeRegExp";
|
||||
import truncate from "lodash/truncate";
|
||||
import mammoth from "mammoth";
|
||||
import quotedPrintable from "quoted-printable";
|
||||
import { Transaction } from "sequelize";
|
||||
import utf8 from "utf8";
|
||||
import parseTitle from "@shared/utils/parseTitle";
|
||||
import { DocumentValidation } from "@shared/validations";
|
||||
import { traceFunction } from "@server/logging/tracing";
|
||||
import { User } from "@server/models";
|
||||
import ProsemirrorHelper from "@server/models/helpers/ProsemirrorHelper";
|
||||
import TextHelper from "@server/models/helpers/TextHelper";
|
||||
import turndownService from "@server/utils/turndown";
|
||||
import { FileImportError, InvalidRequestError } from "../errors";
|
||||
|
||||
/**
 * Describes one importable file format: the mime type it is registered under
 * and the converter used to turn the raw file content into markdown.
 */
interface ImportableFile {
  /** Mime type that this entry is matched against. */
  type: string;
  /** Converts the raw file content into a markdown string. */
  getMarkdown: (content: Buffer | string) => Promise<string>;
}
|
||||
|
||||
/**
 * Table of supported import formats, pairing each mime type with the
 * converter function that produces markdown for it.
 */
const importMapping: ImportableFile[] = [
  {
    type: "application/msword",
    getMarkdown: confluenceToMarkdown,
  },
  {
    type: "application/octet-stream",
    getMarkdown: docxToMarkdown,
  },
  {
    type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    getMarkdown: docxToMarkdown,
  },
  {
    type: "text/html",
    getMarkdown: htmlToMarkdown,
  },
  {
    type: "text/plain",
    getMarkdown: fileToMarkdown,
  },
  {
    type: "text/markdown",
    getMarkdown: fileToMarkdown,
  },
];
|
||||
|
||||
/**
 * Pass-through converter for files that are already plain text or markdown.
 *
 * @param content The file content; a Buffer is decoded as UTF-8.
 * @returns The content as a string, otherwise unchanged.
 */
async function fileToMarkdown(content: Buffer | string): Promise<string> {
  return content instanceof Buffer ? content.toString("utf8") : content;
}
|
||||
|
||||
/**
 * Converts a .docx file to markdown by first rendering it to HTML with
 * mammoth and then passing that HTML through turndown.
 *
 * @param content The binary file content; must be a Buffer.
 * @returns The markdown produced by turndown.
 * @throws Error if the content is not a Buffer.
 */
async function docxToMarkdown(content: Buffer | string): Promise<string> {
  // A string cannot be handed to mammoth as a buffer, so reject it up front.
  if (!(content instanceof Buffer)) {
    throw new Error("docxToMarkdown: content must be a Buffer");
  }

  const { value: html } = await mammoth.convertToHtml({
    buffer: content,
  });

  return turndownService.turndown(html);
}
|
||||
|
||||
/**
 * Converts an HTML document to markdown using turndown.
 *
 * @param content The HTML content; a Buffer is decoded as UTF-8.
 * @returns The markdown produced by turndown.
 */
async function htmlToMarkdown(content: Buffer | string): Promise<string> {
  const html = content instanceof Buffer ? content.toString("utf8") : content;
  return turndownService.turndown(html);
}
|
||||
|
||||
/**
 * Converts a Confluence "Word" export to markdown. These files are
 * multipart/related MIME messages rather than real Word documents, so the
 * content part is extracted by hand, decoded and then passed to turndown.
 *
 * @param value The file content; a Buffer is decoded as UTF-8.
 * @returns The markdown representation of the content part.
 * @throws FileImportError if the input is not multipart/related, has no
 * boundary marker, or yields no content lines.
 */
async function confluenceToMarkdown(value: Buffer | string): Promise<string> {
  const raw = value instanceof Buffer ? value.toString("utf8") : value;

  // We're only supporting the output from Confluence here, regular Word
  // documents should call into the docxToMarkdown importer.
  // See: https://jira.atlassian.com/browse/CONFSERVER-38237
  if (!raw.includes("Content-Type: multipart/related")) {
    throw FileImportError("Unsupported Word file");
  }

  // Get the boundary marker. The capture stops at the closing quote so that a
  // further quoted parameter on the same header line is not swallowed into
  // the boundary value.
  const boundaryMarker = raw.match(/boundary="([^"]+)"/);

  if (!boundaryMarker) {
    throw FileImportError("Unsupported Word file (No boundary marker)");
  }

  const boundary = boundaryMarker[1];

  // Keep only the lines between the second and third boundary occurrences.
  // Counter meaning: 1 == definition, 2 == content, 3 == ending.
  let boundaryReached = 0;
  const lines = raw.split("\n").filter((line) => {
    if (line.includes(boundary)) {
      boundaryReached++;
      return false;
    }

    // Drop the MIME part headers.
    if (line.startsWith("Content-")) {
      return false;
    }

    return boundaryReached === 2;
  });

  if (!lines.length) {
    throw FileImportError("Unsupported Word file (No content found)");
  }

  // Mime attachment is "quoted printable" encoded, must be decoded first
  // https://en.wikipedia.org/wiki/Quoted-printable
  const decoded = utf8.decode(quotedPrintable.decode(lines.join("\n")));

  // If we don't remove the title here it becomes printed in the document
  // body by turndown
  turndownService.remove(["style", "title"]);

  // Now we should have something that looks like HTML
  const markdown = turndownService.turndown(decoded);

  // Any <br> left in the output is replaced with a literal backslash-n marker.
  return markdown.replace(/<br>/g, " \\n ");
}
|
||||
import { DocumentConverter } from "@server/utils/DocumentConverter";
|
||||
import { InvalidRequestError } from "../errors";
|
||||
|
||||
type Props = {
|
||||
user: User;
|
||||
@@ -154,31 +33,12 @@ async function documentImporter({
|
||||
title: string;
|
||||
state: Buffer;
|
||||
}> {
|
||||
const fileInfo = importMapping.filter((item) => {
|
||||
if (item.type === mimeType) {
|
||||
if (
|
||||
mimeType === "application/octet-stream" &&
|
||||
path.extname(fileName) !== ".docx"
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
if (item.type === "text/markdown" && path.extname(fileName) === ".md") {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
})[0];
|
||||
|
||||
if (!fileInfo) {
|
||||
throw InvalidRequestError(`File type ${mimeType} not supported`);
|
||||
}
|
||||
|
||||
let text = await DocumentConverter.convertToMarkdown(
|
||||
content,
|
||||
fileName,
|
||||
mimeType
|
||||
);
|
||||
let title = fileName.replace(/\.[^/.]+$/, "");
|
||||
let text = await fileInfo.getMarkdown(content);
|
||||
|
||||
// find and extract emoji near the beginning of the document.
|
||||
const regex = emojiRegex();
|
||||
@@ -203,6 +63,13 @@ async function documentImporter({
|
||||
// to match our hardbreak parser.
|
||||
text = text.trim().replace(/<br>/gi, "\\n");
|
||||
|
||||
// Escape any dollar signs in the text to prevent them being interpreted as
|
||||
// math blocks
|
||||
text = text.replace(/\$/g, "\\$");
|
||||
|
||||
// Remove any closed and immediately reopened formatting marks
|
||||
text = text.replace(/\*\*\*\*/gi, "").replace(/____/gi, "");
|
||||
|
||||
text = await TextHelper.replaceImagesWithAttachments(
|
||||
text,
|
||||
user,
|
||||
|
||||
125
server/utils/DocumentConverter.ts
Normal file
125
server/utils/DocumentConverter.ts
Normal file
@@ -0,0 +1,125 @@
|
||||
import escapeRegExp from "lodash/escapeRegExp";
|
||||
import { simpleParser } from "mailparser";
|
||||
import mammoth from "mammoth";
|
||||
import { FileImportError } from "@server/errors";
|
||||
import turndownService from "@server/utils/turndown";
|
||||
|
||||
/**
 * Static helpers that convert uploaded files of various formats to markdown.
 */
export class DocumentConverter {
  /**
   * Convert an incoming file to markdown.
   *
   * The converter is chosen by mime type first. When the mime type is not one
   * of the specifically handled types (this includes the generic
   * "application/octet-stream"), the file extension is used instead, compared
   * case-insensitively.
   *
   * @param content The content of the file.
   * @param fileName The name of the file, including extension.
   * @param mimeType The mime type of the file.
   * @returns The markdown representation of the file.
   * @throws FileImportError if neither the mime type nor the extension is
   * supported.
   */
  public static async convertToMarkdown(
    content: Buffer | string,
    fileName: string,
    mimeType: string
  ): Promise<string> {
    // First try to convert the file based on the mime type.
    switch (mimeType) {
      case "application/msword":
        return this.confluenceToMarkdown(content);
      case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return this.docXToMarkdown(content);
      case "text/html":
        return this.htmlToMarkdown(content);
      case "text/plain":
      case "text/markdown":
        return this.fileToMarkdown(content);
      default:
        // "application/octet-stream" carries no format information, so it is
        // resolved by extension below together with any unknown mime type.
        break;
    }

    // If the mime type doesn't work, try to convert based on the file
    // extension. Lower-cased so that e.g. ".DOCX" and ".MD" are accepted.
    const extension = fileName.split(".").pop()?.toLowerCase();
    switch (extension) {
      case "docx":
        return this.docXToMarkdown(content);
      case "html":
        return this.htmlToMarkdown(content);
      case "md":
      case "markdown":
        return this.fileToMarkdown(content);
      default:
        throw FileImportError(`File type ${mimeType} not supported`);
    }
  }

  /**
   * Convert a .docx file to markdown by rendering it to HTML with mammoth and
   * passing that HTML through turndown.
   *
   * @param content The binary file content; must be a Buffer.
   * @returns The markdown produced by turndown.
   * @throws FileImportError if the content is not a Buffer.
   */
  public static async docXToMarkdown(
    content: Buffer | string
  ): Promise<string> {
    if (content instanceof Buffer) {
      const { value } = await mammoth.convertToHtml({
        buffer: content,
      });

      return turndownService.turndown(value);
    }

    throw FileImportError("Unsupported Word file");
  }

  /**
   * Convert an HTML document to markdown using turndown.
   *
   * @param content The HTML content; a Buffer is decoded as UTF-8.
   * @returns The markdown produced by turndown.
   */
  public static async htmlToMarkdown(
    content: Buffer | string
  ): Promise<string> {
    if (content instanceof Buffer) {
      content = content.toString("utf8");
    }

    return turndownService.turndown(content);
  }

  /**
   * Pass-through for files that are already plain text or markdown.
   *
   * @param content The file content; a Buffer is decoded as UTF-8.
   * @returns The content as a string, otherwise unchanged.
   */
  public static async fileToMarkdown(
    content: Buffer | string
  ): Promise<string> {
    if (content instanceof Buffer) {
      content = content.toString("utf8");
    }
    return content;
  }

  /**
   * Convert a Confluence "Word" export to markdown. Attachments are inlined
   * into the HTML as data URIs before the HTML is passed to turndown.
   *
   * @param value The file content; a Buffer is decoded as UTF-8.
   * @returns The markdown representation of the document.
   * @throws FileImportError if the input is not multipart/related or the
   * parsed message has no HTML part.
   */
  public static async confluenceToMarkdown(
    value: Buffer | string
  ): Promise<string> {
    if (value instanceof Buffer) {
      value = value.toString("utf8");
    }

    // We're only supporting the output from Confluence here, regular Word documents should call
    // into the docxToMarkdown importer. See: https://jira.atlassian.com/browse/CONFSERVER-38237
    if (!value.includes("Content-Type: multipart/related")) {
      throw FileImportError("Unsupported Word file");
    }

    // Confluence "Word" documents are actually just multi-part email messages, so we can use
    // mailparser to parse the content.
    const parsed = await simpleParser(value);
    if (!parsed.html) {
      throw FileImportError("Unsupported Word file (No content found)");
    }

    let html: string = parsed.html;

    // Replace the content-location with a data URI for each attachment.
    for (const attachment of parsed.attachments) {
      const contentLocation = String(
        attachment.headers.get("content-location") ?? ""
      );

      // The last path segment of the content-location is what gets replaced
      // in the HTML.
      const id = contentLocation.split("/").pop();
      if (!id) {
        continue;
      }

      // Label the data URI with the attachment's own content type, falling
      // back to PNG when none is reported.
      const attachmentType = attachment.contentType || "image/png";
      const dataUri = `data:${attachmentType};base64,${attachment.content.toString(
        "base64"
      )}`;

      // A replacer function is used so that "$" sequences in the data URI are
      // never interpreted as replacement patterns.
      html = html.replace(new RegExp(escapeRegExp(id), "g"), () => dataUri);
    }

    // If we don't remove the title here it becomes printed in the document
    // body by turndown
    // NOTE(review): this appears to reconfigure the shared turndownService
    // instance on every call — confirm that repeated registration is harmless.
    turndownService.remove(["style", "title"]);

    // Now we should have something that looks like HTML
    const markdown = turndownService.turndown(html);

    // Any <br> left in the output is replaced with a literal backslash-n marker.
    return markdown.replace(/<br>/g, " \\n ");
  }
}
|
||||
Reference in New Issue
Block a user