import path from "path";
import JSZip from "jszip";
import { compact, escapeRegExp } from "lodash";
import mime from "mime-types";
import { v4 as uuidv4 } from "uuid";
import documentImporter from "@server/commands/documentImporter";
import Logger from "@server/logging/Logger";
import { FileOperation, User } from "@server/models";
import ZipHelper, { FileTreeNode } from "@server/utils/ZipHelper";
import ImportTask, { StructuredImportData } from "./ImportTask";

export default class ImportNotionTask extends ImportTask {
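  /**
   * Parses a Notion export zip from the given stream into structured import
   * data: loads the zip, converts it to a file tree, and walks that tree.
   *
   * @param stream A readable stream of the zip file contents
   * @param fileOperation The FileOperation that triggered this import
   * @returns A promise resolving to a StructuredImportData object
   */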
  public async parseData(
    stream: NodeJS.ReadableStream,
    fileOperation: FileOperation
  ): Promise<StructuredImportData> {
    const zip = await JSZip.loadAsync(stream);
    const tree = ZipHelper.toFileTree(zip);
    return this.parseFileTree({ fileOperation, zip, tree });
  }

  /**
   * Converts the file structure from ZipHelper.toFileTree into documents,
   * collections, and attachments.
   *
   * @param tree An array of FileTreeNode representing root files in the zip
   * @returns A StructuredImportData object
   */
  private async parseFileTree({
    zip,
    tree,
    fileOperation,
  }: {
    zip: JSZip;
    fileOperation: FileOperation;
    tree: FileTreeNode[];
  }): Promise<StructuredImportData> {
    const user = await User.findByPk(fileOperation.userId, {
      rejectOnEmpty: true,
    });

    const output: StructuredImportData = {
      collections: [],
      documents: [],
      attachments: [],
    };

    const parseNodeChildren = async (
      children: FileTreeNode[],
      collectionId: string,
      parentDocumentId?: string
    ): Promise<void> => {
      await Promise.all(
        children.map(async (child) => {
          // Ignore the CSVs for databases upfront
          if (child.path.endsWith(".csv")) {
            return;
          }

          const zipObject = zip.files[child.path];
          const id = uuidv4();
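          // Split the Notion UUID suffix out of the title, e.g. a child
          // titled "My Page 0123456789abcdef0123456789abcdef" yields the
          // name "My Page" and sourceId "0123456789abcdef0123456789abcdef".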
          const match = child.title.match(this.NotionUUIDRegex);
          const name = child.title.replace(this.NotionUUIDRegex, "");
          const sourceId = match ? match[0].trim() : undefined;

          // If it's not a text file, we're going to treat it as an attachment.
          const mimeType = mime.lookup(child.name);
          const isDocument =
            mimeType === "text/markdown" ||
            mimeType === "text/plain" ||
            mimeType === "text/html";

          // If it's not a document and not a folder, treat it as an attachment
          if (!isDocument && mimeType) {
            output.attachments.push({
              id,
              name: child.name,
              path: child.path,
              mimeType,
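              // Defer reading the file out of the zip until the buffer is
              // actually needed, rather than loading every attachment eagerly.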
              buffer: () => zipObject.async("nodebuffer"),
              sourceId,
            });
            return;
          }

          Logger.debug("task", `Processing ${name} as ${mimeType}`);

          const { title, text } = await documentImporter({
            mimeType: mimeType || "text/markdown",
            fileName: name,
            content: zipObject ? await zipObject.async("string") : "",
            user,
            ip: user.lastActiveIp || undefined,
          });

          const existingDocumentIndex = output.documents.findIndex(
            (doc) => doc.sourceId === sourceId
          );

          const existingDocument = output.documents[existingDocumentIndex];

          // If there is an existing document with the same sourceId, it means
          // we've already parsed either a folder or a file referencing the
          // same document, so we should merge the two.
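          // (Notion's export represents a page that has sub-pages as both a
          // markdown file and a folder of the same name, so both can map to
          // one document.)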
          if (existingDocument) {
            if (existingDocument.text === "") {
              output.documents[existingDocumentIndex].text = text;
            }

            await parseNodeChildren(
              child.children,
              collectionId,
              existingDocument.id
            );
          } else {
            output.documents.push({
              id,
              title,
              text,
              collectionId,
              parentDocumentId,
              path: child.path,
              sourceId,
            });
            await parseNodeChildren(child.children, collectionId, id);
          }
        })
      );
    };
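
    // Rewrites image sources and internal links in a document's text to the
    // standardized <<id>> placeholder format, replacing paths that only made
    // sense inside the original zip.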
    const replaceInternalLinksAndImages = (text: string) => {
      // Find if there are any images in this document
      const imagesInText = this.parseImages(text);

      for (const image of imagesInText) {
        const name = path.basename(image.src);
        const attachment = output.attachments.find(
          (att) =>
            att.path.endsWith(image.src) ||
            encodeURI(att.path).endsWith(image.src)
        );

        if (!attachment) {
          if (!image.src.startsWith("http")) {
            Logger.info(
              "task",
              `Could not find referenced attachment with name ${name} and src ${image.src}`
            );
          }
        } else {
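          // Swap the original src for a placeholder in the same <<id>> format
          // used for internal document links below.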
          text = text.replace(
            new RegExp(escapeRegExp(image.src), "g"),
            `<<${attachment.id}>>`
          );
        }
      }

      // With Notion's HTML import, images sometimes come wrapped in anchor
      // tags. This isn't supported in Outline's editor, so we need to strip them.
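      // e.g. the "[![alt]" prefix of "[![alt](image.png)](page.html)"
      // becomes "![]".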
      text = text.replace(/\[!\[([^[]+)]/g, "![]");

      // Find if there are any links in this document pointing to other documents
      const internalLinksInText = this.parseInternalLinks(text);

      // For each link, update to the standardized format of <<documentId>>
      // instead of a relative or absolute URL within the original zip file.
      for (const link of internalLinksInText) {
        const doc = output.documents.find(
          (doc) => doc.sourceId === link.sourceId
        );

        if (!doc) {
          Logger.info(
            "task",
            `Could not find referenced document with sourceId ${link.sourceId}`
          );
        } else {
          text = text.replace(link.href, `<<${doc.id}>>`);
        }
      }

      return text;
    };

    // All nodes in the root level should become collections
    for (const node of tree) {
      const match = node.title.match(this.NotionUUIDRegex);
      const name = node.title.replace(this.NotionUUIDRegex, "");
      const sourceId = match ? match[0].trim() : undefined;
      const mimeType = mime.lookup(node.name);

      const existingCollectionIndex = output.collections.findIndex(
        (collection) => collection.sourceId === sourceId
      );
      const existingCollection = output.collections[existingCollectionIndex];
      const collectionId = existingCollection?.id || uuidv4();
      let description;

      // Root level docs become the descriptions of collections
      if (
        mimeType === "text/markdown" ||
        mimeType === "text/plain" ||
        mimeType === "text/html"
      ) {
        const zipObject = zip.files[node.path];
        const { text } = await documentImporter({
          mimeType,
          fileName: name,
          content: await zipObject.async("string"),
          user,
          ip: user.lastActiveIp || undefined,
        });

        description = text;
      } else if (node.children.length > 0) {
        await parseNodeChildren(node.children, collectionId);
      } else {
        Logger.debug("task", `Unhandled file in zip: ${node.path}`, {
          fileOperationId: fileOperation.id,
        });
        continue;
      }

      if (existingCollectionIndex !== -1) {
        if (description) {
          output.collections[existingCollectionIndex].description =
            description;
        }
      } else {
        output.collections.push({
          id: collectionId,
          name,
          description,
          sourceId,
        });
      }
    }
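
    // Now that every document, collection, and attachment has been collected,
    // make a second pass to rewrite links and image sources to placeholders.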
    for (const document of output.documents) {
      document.text = replaceInternalLinksAndImages(document.text);
    }

    for (const collection of output.collections) {
      if (typeof collection.description === "string") {
        collection.description = replaceInternalLinksAndImages(
          collection.description
        );
      }
    }

    return output;
  }

  /**
   * Extracts internal links from a markdown document, taking into account the
   * sourceId of the linked document, which is embedded in the link href.
   *
   * @param text The markdown text to parse
   * @returns An array of internal links
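   * @example
   * // An illustrative Notion-style link (hypothetical file name):
   * // parseInternalLinks("[My Page](My%20Page%200123456789abcdef0123456789abcdef.md)")
   * // returns [{ title: "My Page",
   * //   href: "My%20Page%200123456789abcdef0123456789abcdef.md",
   * //   sourceId: "0123456789abcdef0123456789abcdef" }]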
   */
  private parseInternalLinks(
    text: string
  ): { title: string; href: string; sourceId: string }[] {
    return compact(
      [...text.matchAll(this.NotionLinkRegex)].map((match) => ({
        title: match[1],
        href: match[2],
        sourceId: match[3],
      }))
    );
  }

  /**
   * Extracts images from the markdown document.
   *
   * @param text The markdown text to parse
   * @returns An array of images
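   * @example
   * // e.g. parseImages("![diagram](diagram.png)") returns
   * // [{ alt: "diagram", src: "diagram.png" }]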
   */
  private parseImages(text: string): { alt: string; src: string }[] {
    return compact(
      [...text.matchAll(this.ImageRegex)].map((match) => ({
        alt: match[1],
        src: match[2],
      }))
    );
  }

  /**
   * Regex to find markdown images of all types
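   * e.g. ![alt](filename) or ![alt](filename “title”); note the regex
   * expects curly quotes around the optional title.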
   */
  private ImageRegex =
    /!\[(?<alt>[^\][]*?)]\((?<filename>[^\][]*?)(?=“|\))“?(?<title>[^\][”]+)?”?\)/g;

  /**
   * Regex to find markdown links containing IDs that look like UUIDs with
   * the dashes removed, Notion's sourceId format.
   */
  private NotionLinkRegex = /\[([^[]+)]\((.*?([0-9a-fA-F]{32})\..*?)\)/g;

  /**
   * Regex to find Notion document UUIDs in the title of a document.
   */
  private NotionUUIDRegex =
    /\s([0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12}|[0-9a-fA-F]{32})$/;
}