diff --git a/server/queues/tasks/ExportDocumentTreeTask.ts b/server/queues/tasks/ExportDocumentTreeTask.ts
new file mode 100644
index 000000000..edaa0ed90
--- /dev/null
+++ b/server/queues/tasks/ExportDocumentTreeTask.ts
@@ -0,0 +1,231 @@
+import path from "path";
+import JSZip from "jszip";
+import { FileOperationFormat } from "@shared/types";
+import Logger from "@server/logging/Logger";
+import { Collection } from "@server/models";
+import Attachment from "@server/models/Attachment";
+import Document from "@server/models/Document";
+import DocumentHelper from "@server/models/helpers/DocumentHelper";
+import ZipHelper from "@server/utils/ZipHelper";
+import { serializeFilename } from "@server/utils/fs";
+import parseAttachmentIds from "@server/utils/parseAttachmentIds";
+import { getFileByKey } from "@server/utils/s3";
+import { NavigationNode } from "~/types";
+import ExportTask from "./ExportTask";
+
+/**
+ * Base class for export tasks that write a tree of documents, along with the
+ * attachments they reference, into a zip archive.
+ */
+export default abstract class ExportDocumentTreeTask extends ExportTask {
+  /**
+   * Adds a single document, and the attachments it references, to the given
+   * zip instance.
+   *
+   * @param zip The JSZip instance to add files to
+   * @param pathInZip The path in the zip to add the document to
+   * @param documentId The document ID to export
+   * @param format The format to export in, defaults to Markdown
+   * @param pathMap Map of document urls to their path in the zip, used to
+   * rewrite internal links as relative paths
+   */
+  protected async addDocumentToArchive({
+    zip,
+    pathInZip,
+    documentId,
+    format = FileOperationFormat.MarkdownZip,
+    pathMap,
+  }: {
+    zip: JSZip;
+    pathInZip: string;
+    documentId: string;
+    format?: FileOperationFormat;
+    pathMap: Map<string, string>;
+  }) {
+    const document = await Document.findByPk(documentId);
+    if (!document) {
+      return;
+    }
+
+    let text =
+      format === FileOperationFormat.HTMLZip
+        ? await DocumentHelper.toHTML(document, { centered: true })
+        : await DocumentHelper.toMarkdown(document);
+    const attachments = await Attachment.findAll({
+      where: {
+        teamId: document.teamId,
+        id: parseAttachmentIds(document.text),
+      },
+    });
+
+    // Add any referenced attachments to the zip file, stored relative to the
+    // directory that contains the document.
+    const dir = path.dirname(pathInZip);
+    await Promise.all(
+      attachments.map(async (attachment) => {
+        try {
+          const img = await getFileByKey(attachment.key);
+          if (img) {
+            zip.file(path.join(dir, attachment.key), img as Blob, {
+              createFolders: true,
+            });
+          }
+        } catch (err) {
+          Logger.error(
+            `Failed to add attachment to archive: ${attachment.key}`,
+            err
+          );
+        }
+      })
+    );
+
+    // Replace every reference to an attachment with its path in the zip. A
+    // string pattern passed to `replace` only swaps the first occurrence, so
+    // split/join is used to catch attachments referenced more than once.
+    for (const attachment of attachments) {
+      text = text
+        .split(attachment.redirectUrl)
+        .join(encodeURI(attachment.key));
+    }
+
+    // Replace any internal links with relative paths to the document in the
+    // zip. Links to documents that are not in the path map are left as-is.
+    text = text.replace(
+      /\/doc\/(?:[0-9a-zA-Z-_~]*-)?([a-zA-Z0-9]{10,15})/g,
+      (matchedLink) => {
+        const matchedDocPath = pathMap.get(matchedLink);
+        if (!matchedDocPath) {
+          return matchedLink;
+        }
+
+        // pathInZip is a file rather than a directory, so the path to any
+        // other file begins with "../". Dropping the first character leaves
+        // "./", which is relative to the directory containing this document.
+        const relativePath = path.relative(pathInZip, matchedDocPath);
+        return relativePath.startsWith(".")
+          ? encodeURI(relativePath.substring(1))
+          : matchedLink;
+      }
+    );
+
+    // Finally, add the document to the zip file
+    zip.file(pathInZip, text, {
+      date: document.updatedAt,
+      createFolders: true,
+      comment: JSON.stringify({
+        createdAt: document.createdAt,
+        updatedAt: document.updatedAt,
+      }),
+    });
+  }
+
+  /**
+   * Exports the documents and attachments in the given collections to a zip file
+   * and returns the path to the zip file in tmp.
+   *
+   * @param zip The JSZip instance to add files to
+   * @param collections The collections to export
+   * @param format The format to export in
+   *
+   * @returns The path to the zip file in tmp.
+   */
+  protected async addCollectionsToArchive(
+    zip: JSZip,
+    collections: Collection[],
+    format: FileOperationFormat
+  ) {
+    const pathMap = this.createPathMap(collections, format);
+
+    // Keys of the path map are document urls, removing the "/doc/" prefix
+    // leaves the identifier that the document is looked up by.
+    for (const [url, pathInZip] of pathMap) {
+      await this.addDocumentToArchive({
+        zip,
+        pathInZip,
+        documentId: url.replace("/doc/", ""),
+        format,
+        pathMap,
+      });
+    }
+
+    return ZipHelper.toTmpFile(zip);
+  }
+
+  /**
+   * Generates a map of document urls to their path in the zip file.
+   *
+   * @param collections The collections to generate paths for
+   * @param format The format to export in, determines the file extension
+   * @returns A map of document url to path in the zip
+   */
+  private createPathMap(
+    collections: Collection[],
+    format: FileOperationFormat
+  ) {
+    const map = new Map<string, string>();
+    const usedPaths = new Set<string>();
+
+    for (const collection of collections) {
+      if (collection.documentStructure) {
+        this.addDocumentTreeToPathMap(
+          map,
+          collection.documentStructure,
+          serializeFilename(collection.name),
+          format,
+          usedPaths
+        );
+      }
+    }
+
+    return map;
+  }
+
+  /**
+   * Recursively adds the given navigation nodes, and their children, to the
+   * path map.
+   *
+   * @param map The map of document urls to paths in the zip, mutated in place
+   * @param nodes The navigation nodes to add
+   * @param root The directory in the zip that the nodes are placed in
+   * @param format The format to export in, determines the file extension
+   * @param usedPaths The paths in the zip that are taken, mutated in place
+   */
+  private addDocumentTreeToPathMap(
+    map: Map<string, string>,
+    nodes: NavigationNode[],
+    root: string,
+    format: FileOperationFormat,
+    usedPaths: Set<string> = new Set(map.values())
+  ) {
+    const extension = format === FileOperationFormat.HTMLZip ? "html" : "md";
+
+    for (const node of nodes) {
+      const title = serializeFilename(node.title) || "Untitled";
+
+      // Ensure the document is given a unique path in zip, even if it has
+      // the same title as another document in the same collection.
+      let i = 0;
+      let uniqueTitle = title;
+      let filePath = path.join(root, `${uniqueTitle}.${extension}`);
+      while (usedPaths.has(filePath)) {
+        uniqueTitle = `${title} (${++i})`;
+        filePath = path.join(root, `${uniqueTitle}.${extension}`);
+      }
+
+      usedPaths.add(filePath);
+      map.set(node.url, filePath);
+
+      if (node.children?.length) {
+        // Nest children under the de-duplicated title so that documents
+        // sharing a title do not also share a folder for their children.
+        this.addDocumentTreeToPathMap(
+          map,
+          node.children,
+          path.join(root, uniqueTitle),
+          format,
+          usedPaths
+        );
+      }
+    }
+  }
+}
diff --git a/server/queues/tasks/ExportHTMLZipTask.ts b/server/queues/tasks/ExportHTMLZipTask.ts
index aabdfafa6..fc6a84ef9 100644
--- a/server/queues/tasks/ExportHTMLZipTask.ts
+++ b/server/queues/tasks/ExportHTMLZipTask.ts
@@ -1,10 +1,16 @@
+import JSZip from "jszip";
 import { FileOperationFormat } from "@shared/types";
 import { Collection } from "@server/models";
-import { archiveCollections } from "@server/utils/zip";
-import ExportTask from "./ExportTask";
+import ExportDocumentTreeTask from "./ExportDocumentTreeTask";
 
-export default class ExportHTMLZipTask extends ExportTask {
+export default class ExportHTMLZipTask extends ExportDocumentTreeTask {
   public async export(collections: Collection[]) {
-    return await archiveCollections(collections, FileOperationFormat.HTMLZip);
+    const zip = new JSZip();
+
+    return await this.addCollectionsToArchive(
+      zip,
+      collections,
+      FileOperationFormat.HTMLZip
+    );
   }
 }
diff --git a/server/queues/tasks/ExportMarkdownZipTask.ts b/server/queues/tasks/ExportMarkdownZipTask.ts
index a84a93cb6..6ea69eefe 100644
--- a/server/queues/tasks/ExportMarkdownZipTask.ts
+++ b/server/queues/tasks/ExportMarkdownZipTask.ts
@@ -1,11 +1,14 @@
+import JSZip from "jszip";
 import { FileOperationFormat } from "@shared/types";
 import { Collection } from "@server/models";
-import { archiveCollections } from "@server/utils/zip";
-import ExportTask from "./ExportTask";
+import ExportDocumentTreeTask from "./ExportDocumentTreeTask";
 
-export default class ExportMarkdownZipTask extends ExportTask {
+export default class ExportMarkdownZipTask extends ExportDocumentTreeTask {
   public async
export(collections: Collection[]) { - return await archiveCollections( + const zip = new JSZip(); + + return await this.addCollectionsToArchive( + zip, collections, FileOperationFormat.MarkdownZip ); diff --git a/server/queues/tasks/ImportMarkdownZipTask.ts b/server/queues/tasks/ImportMarkdownZipTask.ts index 4053619fd..966f8240e 100644 --- a/server/queues/tasks/ImportMarkdownZipTask.ts +++ b/server/queues/tasks/ImportMarkdownZipTask.ts @@ -5,7 +5,7 @@ import { v4 as uuidv4 } from "uuid"; import documentImporter from "@server/commands/documentImporter"; import Logger from "@server/logging/Logger"; import { FileOperation, User } from "@server/models"; -import { zipAsFileTree, FileTreeNode } from "@server/utils/zip"; +import ZipHelper, { FileTreeNode } from "@server/utils/ZipHelper"; import ImportTask, { StructuredImportData } from "./ImportTask"; export default class ImportMarkdownZipTask extends ImportTask { @@ -14,7 +14,7 @@ export default class ImportMarkdownZipTask extends ImportTask { fileOperation: FileOperation ): Promise { const zip = await JSZip.loadAsync(buffer); - const tree = zipAsFileTree(zip); + const tree = ZipHelper.toFileTree(zip); return this.parseFileTree({ fileOperation, zip, tree }); } diff --git a/server/queues/tasks/ImportNotionTask.ts b/server/queues/tasks/ImportNotionTask.ts index bcbaa8c25..7b67523e2 100644 --- a/server/queues/tasks/ImportNotionTask.ts +++ b/server/queues/tasks/ImportNotionTask.ts @@ -6,7 +6,7 @@ import { v4 as uuidv4 } from "uuid"; import documentImporter from "@server/commands/documentImporter"; import Logger from "@server/logging/Logger"; import { FileOperation, User } from "@server/models"; -import { zipAsFileTree, FileTreeNode } from "@server/utils/zip"; +import ZipHelper, { FileTreeNode } from "@server/utils/ZipHelper"; import ImportTask, { StructuredImportData } from "./ImportTask"; export default class ImportNotionTask extends ImportTask { @@ -15,7 +15,7 @@ export default class ImportNotionTask extends ImportTask { 
fileOperation: FileOperation
   ): Promise {
     const zip = await JSZip.loadAsync(buffer);
-    const tree = zipAsFileTree(zip);
+    const tree = ZipHelper.toFileTree(zip);
 
     return this.parseFileTree({ fileOperation, zip, tree });
   }
diff --git a/server/utils/ZipHelper.ts b/server/utils/ZipHelper.ts
new file mode 100644
index 000000000..bc15f267d
--- /dev/null
+++ b/server/utils/ZipHelper.ts
@@ -0,0 +1,116 @@
+import fs from "fs";
+import path from "path";
+import JSZip from "jszip";
+import { find } from "lodash";
+import tmp from "tmp";
+import { ValidationError } from "@server/errors";
+import { trace } from "@server/logging/tracing";
+import { deserializeFilename } from "./fs";
+
+export type FileTreeNode = {
+  /** The title, extracted from the file name */
+  title: string;
+  /** The file name including extension */
+  name: string;
+  /** Full path to the file within the zip file */
+  path: string;
+  /** Any nested children */
+  children: FileTreeNode[];
+};
+
+/**
+ * Static helpers for reading the contents of, and writing, zip archives.
+ */
+@trace()
+export default class ZipHelper {
+  /**
+   * Converts the flat structure returned by JSZIP into a nested file structure
+   * for easier processing.
+   *
+   * @param zip The JSZip instance
+   * @param maxFiles The maximum number of files to unzip (Prevent zip bombs)
+   * @returns The nodes at the root of the file tree
+   * @throws ValidationError if the zip has more than maxFiles entries
+   */
+  public static toFileTree(
+    zip: JSZip,
+    /** The maximum number of files to unzip */
+    maxFiles = 10000
+  ): FileTreeNode[] {
+    const filePaths = Object.keys(zip.files);
+    if (filePaths.length > maxFiles) {
+      throw ValidationError("Too many files in zip");
+    }
+
+    const tree: FileTreeNode[] = [];
+
+    for (const filePath of filePaths) {
+      // Ignore entries whose path starts with __MACOSX
+      if (filePath.startsWith("__MACOSX")) {
+        continue;
+      }
+
+      let currentLevel = tree; // initialize currentLevel to root
+
+      for (const name of filePath.split("/")) {
+        // check to see if the path already exists.
+        const existingPath = find(currentLevel, { name });
+
+        if (existingPath) {
+          // The path to this item was already in the tree, so don't add again.
+          // Set the current level to this path's children
+          currentLevel = existingPath.children;
+        } else if (name.endsWith(".DS_Store") || !name) {
+          // Skip .DS_Store files and empty segments, e.g. from a trailing "/"
+          continue;
+        } else {
+          const newPart: FileTreeNode = {
+            name,
+            path: filePath,
+            title: deserializeFilename(path.parse(path.basename(name)).name),
+            children: [],
+          };
+
+          currentLevel.push(newPart);
+          currentLevel = newPart.children;
+        }
+      }
+    }
+
+    return tree;
+  }
+
+  /**
+   * Write a zip file to a temporary disk location
+   *
+   * @param zip JSZip object
+   * @returns pathname of the temporary file where the zip was written to disk
+   */
+  public static async toTmpFile(zip: JSZip): Promise<string> {
+    return new Promise((resolve, reject) => {
+      tmp.file(
+        {
+          prefix: "export-",
+          postfix: ".zip",
+        },
+        (err, filePath) => {
+          if (err) {
+            return reject(err);
+          }
+
+          // pipe() does not forward errors from the source stream to the
+          // destination, so both streams have to be watched for failures.
+          zip
+            .generateNodeStream({
+              type: "nodebuffer",
+              streamFiles: true,
+            })
+            .on("error", reject)
+            .pipe(fs.createWriteStream(filePath))
+            .on("finish", () => resolve(filePath))
+            .on("error", reject);
+        }
+      );
+    });
+  }
+}
diff --git a/server/utils/zip.ts b/server/utils/zip.ts
deleted file mode 100644
index 8882b216c..000000000
--- a/server/utils/zip.ts
+++ /dev/null
@@ -1,266 +0,0 @@
-import fs from "fs";
-import path from "path";
-import JSZip, { JSZipObject } from "jszip";
-import { find } from "lodash";
-import tmp from "tmp";
-import { FileOperationFormat } from "@shared/types";
-import { ValidationError } from "@server/errors";
-import Logger from "@server/logging/Logger";
-import Attachment from "@server/models/Attachment";
-import Collection from "@server/models/Collection";
-import Document from "@server/models/Document";
-import DocumentHelper from "@server/models/helpers/DocumentHelper";
-import { NavigationNode } from "~/types";
-import { deserializeFilename, serializeFilename } from "./fs";
-import parseAttachmentIds from
"./parseAttachmentIds"; -import { getFileByKey } from "./s3"; - -type ItemType = "collection" | "document" | "attachment"; - -export type Item = { - path: string; - dir: string; - name: string; - depth: number; - metadata: Record; - type: ItemType; - item: JSZipObject; -}; - -export type FileTreeNode = { - /** The title, extracted from the file name */ - title: string; - /** The file name including extension */ - name: string; - /** The full path to within the zip file */ - path: string; - /** The nested children */ - children: FileTreeNode[]; -}; - -async function addDocumentTreeToArchive( - zip: JSZip, - documents: NavigationNode[], - format = FileOperationFormat.MarkdownZip -) { - for (const doc of documents) { - const document = await Document.findByPk(doc.id); - - if (!document) { - continue; - } - - let text = - format === FileOperationFormat.HTMLZip - ? await DocumentHelper.toHTML(document, { centered: true }) - : await DocumentHelper.toMarkdown(document); - const attachments = await Attachment.findAll({ - where: { - teamId: document.teamId, - id: parseAttachmentIds(document.text), - }, - }); - - for (const attachment of attachments) { - await addImageToArchive(zip, attachment.key); - text = text.replace(attachment.redirectUrl, encodeURI(attachment.key)); - } - - let title = serializeFilename(document.title) || "Untitled"; - - const extension = format === FileOperationFormat.HTMLZip ? "html" : "md"; - - title = safeAddFileToArchive(zip, `${title}.${extension}`, text, { - date: document.updatedAt, - comment: JSON.stringify({ - createdAt: document.createdAt, - updatedAt: document.updatedAt, - }), - }); - - if (doc.children && doc.children.length) { - const folder = zip.folder(path.parse(title).name); - - if (folder) { - await addDocumentTreeToArchive(folder, doc.children, format); - } - } - } -} - -/** - * Adds the content of a file in remote storage to the given zip file. 
- * - * @param zip JSZip object to add to - * @param key path to file in S3 storage - */ -async function addImageToArchive(zip: JSZip, key: string) { - try { - const img = await getFileByKey(key); - - // @ts-expect-error Blob - zip.file(key, img, { - createFolders: true, - }); - } catch (err) { - Logger.error("Error loading image attachment from S3", err, { - key, - }); - } -} - -/** - * Adds content to a zip file, if the given filename already exists in the zip - * then it will automatically increment numbers at the end of the filename. - * - * @param zip JSZip object to add to - * @param key filename with extension - * @param content the content to add - * @param options options for added content - * @returns The new title - */ -function safeAddFileToArchive( - zip: JSZip, - key: string, - content: string | Uint8Array | ArrayBuffer | Blob, - options: JSZip.JSZipFileOptions -) { - // @ts-expect-error root exists - const root = zip.root; - - // Filenames in the directory already - const keysInDirectory = Object.keys(zip.files) - .filter((k) => k.includes(root)) - .filter((k) => !k.endsWith("/")) - .map((k) => path.basename(k).replace(/\s\((\d+)\)\./, ".")); - - // The number of duplicate filenames - const existingKeysCount = keysInDirectory.filter((t) => t === key).length; - const filename = path.parse(key).name; - const extension = path.extname(key); - - // Construct the new de-duplicated filename (if any) - const safeKey = - existingKeysCount > 0 - ? 
`${filename} (${existingKeysCount})${extension}` - : key; - - zip.file(safeKey, content, options); - return safeKey; -} - -/** - * Write a zip file to a temporary disk location - * - * @param zip JSZip object - * @returns pathname of the temporary file where the zip was written to disk - */ -async function archiveToPath(zip: JSZip): Promise { - return new Promise((resolve, reject) => { - tmp.file( - { - prefix: "export-", - postfix: ".zip", - }, - (err, path) => { - if (err) { - return reject(err); - } - zip - .generateNodeStream({ - type: "nodebuffer", - streamFiles: true, - }) - .pipe(fs.createWriteStream(path)) - .on("finish", () => resolve(path)) - .on("error", reject); - } - ); - }); -} - -export async function archiveCollections( - collections: Collection[], - format: FileOperationFormat -) { - const zip = new JSZip(); - - for (const collection of collections) { - if (collection.documentStructure) { - const folder = zip.folder(serializeFilename(collection.name)); - - if (folder) { - await addDocumentTreeToArchive( - folder, - collection.documentStructure, - format - ); - } - } - } - - return archiveToPath(zip); -} - -/** - * Converts the flat structure returned by JSZIP into a nested file structure - * for easier processing. - * - * @param paths An array of paths to files in the zip - * @returns - */ -export function zipAsFileTree( - zip: JSZip, - /** The maximum number of files to unzip */ - maxFiles = 10000 -) { - let fileCount = 0; - const paths = Object.keys(zip.files).map((filePath) => { - if (++fileCount > maxFiles) { - throw ValidationError("Too many files in zip"); - } - - return `/${filePath}`; - }); - const tree: FileTreeNode[] = []; - - paths.forEach(function (filePath) { - if (filePath.startsWith("/__MACOSX")) { - return; - } - - const pathParts = filePath.split("/"); - - // Remove first blank element from the parts array. 
- pathParts.shift(); - - let currentLevel = tree; // initialize currentLevel to root - - pathParts.forEach(function (name) { - // check to see if the path already exists. - const existingPath = find(currentLevel, { - name, - }); - - if (existingPath) { - // The path to this item was already in the tree, so don't add again. - // Set the current level to this path's children - currentLevel = existingPath.children; - } else if (name.endsWith(".DS_Store") || !name) { - return; - } else { - const newPart = { - name, - path: filePath.replace(/^\//, ""), - title: deserializeFilename(path.parse(path.basename(name)).name), - children: [], - }; - - currentLevel.push(newPart); - currentLevel = newPart.children; - } - }); - }); - - return tree; -}