Files
outline/server/commands/documentImporter.ts
dependabot[bot] fbd16d4b9a chore(deps-dev): bump prettier from 2.1.2 to 2.8.8 (#5372)
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Tom Moor <tom.moor@gmail.com>
2023-05-22 19:14:56 -07:00

234 lines
6.2 KiB
TypeScript

import path from "path";
import emojiRegex from "emoji-regex";
import { truncate } from "lodash";
import mammoth from "mammoth";
import quotedPrintable from "quoted-printable";
import { Transaction } from "sequelize";
import utf8 from "utf8";
import parseTitle from "@shared/utils/parseTitle";
import { DocumentValidation } from "@shared/validations";
import { traceFunction } from "@server/logging/tracing";
import { User } from "@server/models";
import dataURItoBuffer from "@server/utils/dataURItoBuffer";
import parseImages from "@server/utils/parseImages";
import turndownService from "@server/utils/turndown";
import { FileImportError, InvalidRequestError } from "../errors";
import attachmentCreator from "./attachmentCreator";
interface ImportableFile {
type: string;
getMarkdown: (content: Buffer | string) => Promise<string>;
}
const importMapping: ImportableFile[] = [
{
type: "application/msword",
getMarkdown: confluenceToMarkdown,
},
{
type: "application/octet-stream",
getMarkdown: docxToMarkdown,
},
{
type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
getMarkdown: docxToMarkdown,
},
{
type: "text/html",
getMarkdown: htmlToMarkdown,
},
{
type: "text/plain",
getMarkdown: fileToMarkdown,
},
{
type: "text/markdown",
getMarkdown: fileToMarkdown,
},
];
async function fileToMarkdown(content: Buffer | string): Promise<string> {
if (content instanceof Buffer) {
content = content.toString("utf8");
}
return content;
}
async function docxToMarkdown(content: Buffer | string): Promise<string> {
if (content instanceof Buffer) {
const { value: html } = await mammoth.convertToHtml({ buffer: content });
return turndownService.turndown(html);
}
throw new Error("docxToMarkdown: content must be a Buffer");
}
async function htmlToMarkdown(content: Buffer | string): Promise<string> {
if (content instanceof Buffer) {
content = content.toString("utf8");
}
return turndownService.turndown(content);
}
async function confluenceToMarkdown(value: Buffer | string): Promise<string> {
if (value instanceof Buffer) {
value = value.toString("utf8");
}
// We're only supporting the ridiculous output from Confluence here, regular
// Word documents should call into the docxToMarkdown importer.
// See: https://jira.atlassian.com/browse/CONFSERVER-38237
if (!value.includes("Content-Type: multipart/related")) {
throw FileImportError("Unsupported Word file");
}
// get boundary marker
const boundaryMarker = value.match(/boundary="(.+)"/);
if (!boundaryMarker) {
throw FileImportError("Unsupported Word file (No boundary marker)");
}
// get content between multipart boundaries
let boundaryReached = 0;
const lines = value.split("\n").filter((line) => {
if (line.includes(boundaryMarker[1])) {
boundaryReached++;
return false;
}
if (line.startsWith("Content-")) {
return false;
}
// 1 == definition
// 2 == content
// 3 == ending
if (boundaryReached === 2) {
return true;
}
return false;
});
if (!lines.length) {
throw FileImportError("Unsupported Word file (No content found)");
}
// Mime attachment is "quoted printable" encoded, must be decoded first
// https://en.wikipedia.org/wiki/Quoted-printable
value = utf8.decode(quotedPrintable.decode(lines.join("\n")));
// If we don't remove the title here it becomes printed in the document
// body by turndown
turndownService.remove(["style", "title"]);
// Now we should have something that looks like HTML
const html = turndownService.turndown(value);
return html.replace(/<br>/g, " \\n ");
}
async function documentImporter({
mimeType,
fileName,
content,
user,
ip,
transaction,
}: {
user: User;
mimeType: string;
fileName: string;
content: Buffer | string;
ip?: string;
transaction?: Transaction;
}): Promise<{
text: string;
title: string;
}> {
const fileInfo = importMapping.filter((item) => {
if (item.type === mimeType) {
if (
mimeType === "application/octet-stream" &&
path.extname(fileName) !== ".docx"
) {
return false;
}
return true;
}
if (item.type === "text/markdown" && path.extname(fileName) === ".md") {
return true;
}
return false;
})[0];
if (!fileInfo) {
throw InvalidRequestError(`File type ${mimeType} not supported`);
}
let title = fileName.replace(/\.[^/.]+$/, "");
let text = await fileInfo.getMarkdown(content);
text = text.trim();
// find and extract first emoji, in the case of some imports it can be outside
// of the title, at the top of the document.
const regex = emojiRegex();
const matches = regex.exec(text);
const firstEmoji = matches ? matches[0] : undefined;
const textStartsWithEmoji = firstEmoji && text.startsWith(firstEmoji);
if (textStartsWithEmoji) {
text = text.replace(firstEmoji, "").trim();
}
// If the first line of the imported text looks like a markdown heading
// then we can use this as the document title
if (text.startsWith("# ")) {
const result = parseTitle(text);
title = result.title;
text = text.replace(`# ${title}`, "").trimStart();
}
// If we parsed an emoji from _above_ the title then add it back at prefixing
if (textStartsWithEmoji) {
title = `${firstEmoji} ${title}`;
}
// Replace any <br> generated by the turndown plugin with escaped newlines
// to match our hardbreak parser.
text = text.replace(/<br>/gi, "\\n");
// find data urls, convert to blobs, upload and write attachments
const images = parseImages(text);
const dataURIs = images.filter((href) => href.startsWith("data:"));
for (const uri of dataURIs) {
const name = "imported";
const { buffer, type } = dataURItoBuffer(uri);
const attachment = await attachmentCreator({
name,
type,
buffer,
user,
ip,
transaction,
});
text = text.replace(uri, attachment.redirectUrl);
}
// It's better to truncate particularly long titles than fail the import
title = truncate(title, { length: DocumentValidation.maxTitleLength });
return {
text,
title,
};
}
export default traceFunction({
spanName: "documentImporter",
})(documentImporter);