chore: Refactor data import (#3434)

* Complete refactor of import
* feat: Notion data import (#3442)
server/queues/processors/ExportsProcessor.ts

@@ -4,6 +4,7 @@ import ExportFailureEmail from "@server/emails/templates/ExportFailureEmail";
 import ExportSuccessEmail from "@server/emails/templates/ExportSuccessEmail";
 import Logger from "@server/logging/logger";
 import { FileOperation, Collection, Event, Team, User } from "@server/models";
+import { FileOperationState } from "@server/models/FileOperation";
 import { Event as TEvent } from "@server/types";
 import { uploadToS3FromBuffer } from "@server/utils/s3";
 import { archiveCollections } from "@server/utils/zip";
@@ -41,7 +42,7 @@ export default class ExportsProcessor extends BaseProcessor {
     });

     this.updateFileOperation(fileOperation, actorId, teamId, {
-      state: "creating",
+      state: FileOperationState.Creating,
     });
     // heavy lifting of creating the zip file
     Logger.info(
@@ -50,7 +51,7 @@ export default class ExportsProcessor extends BaseProcessor {
     );
     const filePath = await archiveCollections(collections);
     let url;
-    let state: any = "creating";
+    let state = FileOperationState.Creating;

     try {
       // @ts-expect-error ts-migrate(2769) FIXME: No overload matches this call.
@@ -58,7 +59,7 @@ export default class ExportsProcessor extends BaseProcessor {
       // @ts-expect-error ts-migrate(2769) FIXME: No overload matches this call.
       const stat = await fs.promises.stat(filePath);
       this.updateFileOperation(fileOperation, actorId, teamId, {
-        state: "uploading",
+        state: FileOperationState.Uploading,
         size: stat.size,
       });
       Logger.info(
@@ -75,12 +76,12 @@ export default class ExportsProcessor extends BaseProcessor {
         "processor",
         `Upload complete for file operation ${fileOperation.id}`
       );
-      state = "complete";
+      state = FileOperationState.Complete;
     } catch (error) {
       Logger.error("Error exporting collection data", error, {
         fileOperationId: fileOperation.id,
       });
-      state = "error";
+      state = FileOperationState.Error;
       url = undefined;
     } finally {
       this.updateFileOperation(fileOperation, actorId, teamId, {
@@ -88,7 +89,7 @@ export default class ExportsProcessor extends BaseProcessor {
         url,
       });

-      if (state === "error") {
+      if (state === FileOperationState.Error) {
         await ExportFailureEmail.schedule({
           to: user.email,
           teamUrl: team.url,
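Note: the enum replacing the string literals above lives in @server/models/FileOperation and its definition is not part of this diff. A minimal sketch of what it plausibly contains, inferred from the members referenced throughout the commit (the string values are assumptions, chosen to stay compatible with the literals they replace):

// Hypothetical sketch, not part of the commit.
export enum FileOperationType {
  Import = "import", // assumed to match the old literal
  Export = "export",
}

export enum FileOperationState {
  Creating = "creating",
  Uploading = "uploading",
  Complete = "complete",
  Error = "error",
  Expired = "expired",
}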
server/queues/processors/FileOperationsProcessor.ts (new file, 40 lines)

@@ -0,0 +1,40 @@
+import invariant from "invariant";
+import { FileOperation } from "@server/models";
+import {
+  FileOperationFormat,
+  FileOperationType,
+} from "@server/models/FileOperation";
+import { Event as TEvent, FileOperationEvent } from "@server/types";
+import ImportMarkdownZipTask from "../tasks/ImportMarkdownZipTask";
+import ImportNotionTask from "../tasks/ImportNotionTask";
+import BaseProcessor from "./BaseProcessor";
+
+export default class FileOperationsProcessor extends BaseProcessor {
+  static applicableEvents: TEvent["name"][] = ["fileOperations.create"];
+
+  async perform(event: FileOperationEvent) {
+    if (event.name !== "fileOperations.create") {
+      return;
+    }
+
+    const fileOperation = await FileOperation.findByPk(event.modelId);
+    invariant(fileOperation, "fileOperation not found");
+
+    // map file operation type and format to the appropriate task
+    if (fileOperation.type === FileOperationType.Import) {
+      switch (fileOperation.format) {
+        case FileOperationFormat.MarkdownZip:
+          await ImportMarkdownZipTask.schedule({
+            fileOperationId: event.modelId,
+          });
+          break;
+        case FileOperationFormat.Notion:
+          await ImportNotionTask.schedule({
+            fileOperationId: event.modelId,
+          });
+          break;
+        default:
+      }
+    }
+  }
+}
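For orientation, this processor is the receiving end of the "fileOperations.create" event that the (now deleted) ImportsProcessor below used to both emit and handle itself. A hedged sketch of the producing side, using only calls that appear elsewhere in this commit (the format field value is an assumption):

// Hypothetical sketch: create an import FileOperation, then emit the event
// that FileOperationsProcessor routes to the matching import task.
const fileOperation = await FileOperation.create({
  type: FileOperationType.Import,
  format: FileOperationFormat.Notion, // assumed value
  state: FileOperationState.Creating,
  userId: user.id,
  teamId: user.teamId,
});

await Event.schedule({
  name: "fileOperations.create",
  modelId: fileOperation.id,
  teamId: user.teamId,
  actorId: user.id,
});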
server/queues/processors/ImportsProcessor.ts (deleted)

@@ -1,79 +0,0 @@
-import fs from "fs";
-import os from "os";
-import File from "formidable/lib/file";
-import invariant from "invariant";
-import collectionImporter from "@server/commands/collectionImporter";
-import { Event, FileOperation, Attachment, User } from "@server/models";
-import { Event as TEvent } from "@server/types";
-import BaseProcessor from "./BaseProcessor";
-
-export default class ImportsProcessor extends BaseProcessor {
-  static applicableEvents: TEvent["name"][] = ["collections.import"];
-
-  async perform(event: TEvent) {
-    switch (event.name) {
-      case "collections.import": {
-        let state, error;
-        const { type } = event.data;
-        const attachment = await Attachment.findByPk(event.modelId);
-        invariant(attachment, "attachment not found");
-
-        const user = await User.findByPk(event.actorId);
-        invariant(user, "user not found");
-
-        const fileOperation = await FileOperation.create({
-          type: "import",
-          state: "creating",
-          size: attachment.size,
-          key: attachment.key,
-          userId: user.id,
-          teamId: user.teamId,
-        });
-
-        await Event.schedule({
-          name: "fileOperations.create",
-          modelId: fileOperation.id,
-          teamId: user.teamId,
-          actorId: user.id,
-        });
-
-        try {
-          const buffer = await attachment.buffer;
-          const tmpDir = os.tmpdir();
-          const tmpFilePath = `${tmpDir}/upload-${event.modelId}`;
-          await fs.promises.writeFile(tmpFilePath, buffer as Uint8Array);
-          const file = new File({
-            name: attachment.name,
-            type: attachment.contentType,
-            path: tmpFilePath,
-          });
-
-          await collectionImporter({
-            file,
-            user,
-            type,
-            ip: event.ip,
-          });
-          await attachment.destroy();
-
-          state = "complete";
-        } catch (err) {
-          state = "error";
-          error = err.message;
-        } finally {
-          await fileOperation.update({ state, error });
-          await Event.schedule({
-            name: "fileOperations.update",
-            modelId: fileOperation.id,
-            teamId: user.teamId,
-            actorId: user.id,
-          });
-        }
-
-        return;
-      }
-
-      default:
-    }
-  }
-}
server/queues/tasks/BaseTask.ts

@@ -33,7 +33,7 @@ export default abstract class BaseTask<T> {
    * @param props Properties to be used by the task
    * @returns A promise that resolves once the task has completed.
    */
-  public abstract perform(props: T): Promise<void>;
+  public abstract perform(props: T): Promise<any>;

   /**
    * Job options such as priority and retry strategy, as defined by Bull.
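The widened return type matters for the new import pipeline: ImportTask.perform (below) resolves with the persisted collections, documents, and attachments, and the new test files assert against that value, which the declared Promise<void> previously hid from callers. A tiny sketch under that reading:

// Hypothetical sketch: subclasses may now resolve with data.
class ExampleTask extends BaseTask<{ id: string }> {
  public async perform({ id }: { id: string }) {
    return { id }; // legal against Promise<any>; consumed only by tests
  }
}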
server/queues/tasks/CleanupExpiredFileOperationsTask.test.ts

@@ -1,5 +1,9 @@
 import { subDays } from "date-fns";
 import { FileOperation } from "@server/models";
+import {
+  FileOperationState,
+  FileOperationType,
+} from "@server/models/FileOperation";
 import { buildFileOperation } from "@server/test/factories";
 import { flushdb } from "@server/test/support";
 import CleanupExpiredFileOperationsTask from "./CleanupExpiredFileOperationsTask";
@@ -9,13 +13,13 @@ beforeEach(() => flushdb());

 describe("CleanupExpiredFileOperationsTask", () => {
   it("should expire exports older than 30 days ago", async () => {
     await buildFileOperation({
-      type: "export",
-      state: "complete",
+      type: FileOperationType.Export,
+      state: FileOperationState.Complete,
       createdAt: subDays(new Date(), 30),
     });
     await buildFileOperation({
-      type: "export",
-      state: "complete",
+      type: FileOperationType.Export,
+      state: FileOperationState.Complete,
     });

     /* This is a test helper that creates a new task and runs it. */
@@ -24,8 +28,8 @@ describe("CleanupExpiredFileOperationsTask", () => {

     const data = await FileOperation.count({
       where: {
-        type: "export",
-        state: "expired",
+        type: FileOperationType.Export,
+        state: FileOperationState.Expired,
       },
     });
     expect(data).toEqual(1);
@@ -33,13 +37,13 @@ describe("CleanupExpiredFileOperationsTask", () => {

   it("should not expire exports made less than 30 days ago", async () => {
     await buildFileOperation({
-      type: "export",
-      state: "complete",
+      type: FileOperationType.Export,
+      state: FileOperationState.Complete,
       createdAt: subDays(new Date(), 29),
     });
     await buildFileOperation({
-      type: "export",
-      state: "complete",
+      type: FileOperationType.Export,
+      state: FileOperationState.Complete,
     });

     const task = new CleanupExpiredFileOperationsTask();
@@ -47,8 +51,8 @@ describe("CleanupExpiredFileOperationsTask", () => {

     const data = await FileOperation.count({
       where: {
-        type: "export",
-        state: "expired",
+        type: FileOperationType.Export,
+        state: FileOperationState.Expired,
       },
     });
     expect(data).toEqual(0);
server/queues/tasks/CleanupExpiredFileOperationsTask.ts

@@ -2,6 +2,10 @@ import { subDays } from "date-fns";
 import { Op } from "sequelize";
 import Logger from "@server/logging/logger";
 import { FileOperation } from "@server/models";
+import {
+  FileOperationState,
+  FileOperationType,
+} from "@server/models/FileOperation";
 import BaseTask, { TaskPriority } from "./BaseTask";

 type Props = {
@@ -13,12 +17,12 @@ export default class CleanupExpiredFileOperationsTask extends BaseTask<Props> {
     Logger.info("task", `Expiring export file operations older than 30 days…`);
     const fileOperations = await FileOperation.unscoped().findAll({
       where: {
-        type: "export",
+        type: FileOperationType.Export,
         createdAt: {
           [Op.lt]: subDays(new Date(), 30),
         },
         state: {
-          [Op.ne]: "expired",
+          [Op.ne]: FileOperationState.Expired,
         },
       },
       limit,
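Like the other BaseTask subclasses in this commit, the cleanup task is queued through the static schedule helper; a sketch of invoking it with the limit prop from the type above (the value 100 is an arbitrary example):

// Hypothetical usage sketch.
await CleanupExpiredFileOperationsTask.schedule({ limit: 100 });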
server/queues/tasks/ImportMarkdownZipTask.test.ts (new file, 87 lines)

@@ -0,0 +1,87 @@
+import fs from "fs";
+import path from "path";
+import { FileOperation } from "@server/models";
+import { buildFileOperation } from "@server/test/factories";
+import { flushdb } from "@server/test/support";
+import ImportMarkdownZipTask from "./ImportMarkdownZipTask";
+
+beforeEach(() => flushdb());
+
+describe("ImportMarkdownZipTask", () => {
+  it("should import the documents, attachments", async () => {
+    const fileOperation = await buildFileOperation();
+    Object.defineProperty(fileOperation, "buffer", {
+      get() {
+        return fs.readFileSync(
+          path.resolve(__dirname, "..", "..", "test", "fixtures", "outline.zip")
+        );
+      },
+    });
+    jest.spyOn(FileOperation, "findByPk").mockResolvedValue(fileOperation);
+
+    const props = {
+      fileOperationId: fileOperation.id,
+    };
+
+    const task = new ImportMarkdownZipTask();
+    const response = await task.perform(props);
+
+    expect(response.collections.size).toEqual(1);
+    expect(response.documents.size).toEqual(8);
+    expect(response.attachments.size).toEqual(6);
+  });
+
+  it("should throw an error with corrupt zip", async () => {
+    const fileOperation = await buildFileOperation();
+    Object.defineProperty(fileOperation, "buffer", {
+      get() {
+        return fs.readFileSync(
+          path.resolve(__dirname, "..", "..", "test", "fixtures", "corrupt.zip")
+        );
+      },
+    });
+    jest.spyOn(FileOperation, "findByPk").mockResolvedValue(fileOperation);
+
+    const props = {
+      fileOperationId: fileOperation.id,
+    };
+
+    let error;
+    try {
+      const task = new ImportMarkdownZipTask();
+      await task.perform(props);
+    } catch (err) {
+      error = err;
+    }
+
+    expect(error && error.message).toBeTruthy();
+  });
+
+  it("should throw an error with empty collection in zip", async () => {
+    const fileOperation = await buildFileOperation();
+    Object.defineProperty(fileOperation, "buffer", {
+      get() {
+        return fs.readFileSync(
+          path.resolve(__dirname, "..", "..", "test", "fixtures", "empty.zip")
+        );
+      },
+    });
+    jest.spyOn(FileOperation, "findByPk").mockResolvedValue(fileOperation);
+
+    const props = {
+      fileOperationId: fileOperation.id,
+    };
+
+    let error;
+    try {
+      const task = new ImportMarkdownZipTask();
+      await task.perform(props);
+    } catch (err) {
+      error = err;
+    }
+
+    expect(error && error.message).toBe(
+      "Uploaded file does not contain any valid documents"
+    );
+  });
+});
server/queues/tasks/ImportMarkdownZipTask.ts (new file, 171 lines)

@@ -0,0 +1,171 @@
+import JSZip from "jszip";
+import mime from "mime-types";
+import { v4 as uuidv4 } from "uuid";
+import documentImporter from "@server/commands/documentImporter";
+import Logger from "@server/logging/logger";
+import { FileOperation, User } from "@server/models";
+import { zipAsFileTree, FileTreeNode } from "@server/utils/zip";
+import ImportTask, { StructuredImportData } from "./ImportTask";
+
+export default class ImportMarkdownZipTask extends ImportTask {
+  public async parseData(
+    buffer: Buffer,
+    fileOperation: FileOperation
+  ): Promise<StructuredImportData> {
+    const zip = await JSZip.loadAsync(buffer);
+    const tree = zipAsFileTree(zip);
+
+    return this.parseFileTree({ fileOperation, zip, tree });
+  }
+
+  /**
+   * Converts the file structure from zipAsFileTree into documents,
+   * collections, and attachments.
+   *
+   * @param tree An array of FileTreeNode representing root files in the zip
+   * @returns A StructuredImportData object
+   */
+  private async parseFileTree({
+    zip,
+    tree,
+    fileOperation,
+  }: {
+    zip: JSZip;
+    fileOperation: FileOperation;
+    tree: FileTreeNode[];
+  }): Promise<StructuredImportData> {
+    const user = await User.findByPk(fileOperation.userId);
+    const output: StructuredImportData = {
+      collections: [],
+      documents: [],
+      attachments: [],
+    };
+
+    async function parseNodeChildren(
+      children: FileTreeNode[],
+      collectionId: string,
+      parentDocumentId?: string
+    ): Promise<void> {
+      if (!user) {
+        throw new Error("User not found");
+      }
+
+      await Promise.all(
+        children.map(async (child) => {
+          // special case for folders of attachments
+          if (
+            child.name === "uploads" ||
+            (child.children.length > 0 && child.path.includes("/uploads/"))
+          ) {
+            return parseNodeChildren(child.children, collectionId);
+          }
+
+          const zipObject = zip.files[child.path];
+          const id = uuidv4();
+
+          // this is an attachment
+          if (child.path.includes("/uploads/") && child.children.length === 0) {
+            output.attachments.push({
+              id,
+              name: child.name,
+              path: child.path,
+              mimeType: mime.lookup(child.path) || "application/octet-stream",
+              buffer: await zipObject.async("nodebuffer"),
+            });
+            return;
+          }
+
+          const { title, text } = await documentImporter({
+            mimeType: "text/markdown",
+            fileName: child.name,
+            content: await zipObject.async("string"),
+            user,
+            ip: user.lastActiveIp || undefined,
+          });
+
+          let metadata;
+          try {
+            metadata = zipObject.comment ? JSON.parse(zipObject.comment) : {};
+          } catch (err) {
+            Logger.debug(
+              "task",
+              `ZIP comment found for ${child.name}, but could not be parsed as metadata: ${zipObject.comment}`
+            );
+          }
+
+          const createdAt = metadata.createdAt
+            ? new Date(metadata.createdAt)
+            : zipObject.date;
+
+          const updatedAt = metadata.updatedAt
+            ? new Date(metadata.updatedAt)
+            : zipObject.date;
+
+          const existingEmptyDocumentIndex = output.documents.findIndex(
+            (doc) =>
+              doc.title === title &&
+              doc.collectionId === collectionId &&
+              doc.parentDocumentId === parentDocumentId &&
+              doc.text === ""
+          );
+
+          // When there is a file and a folder with the same name this handles
+          // the case by combining the two into one document with nested children
+          if (existingEmptyDocumentIndex !== -1) {
+            output.documents[existingEmptyDocumentIndex].text = text;
+          } else {
+            output.documents.push({
+              id,
+              title,
+              text,
+              updatedAt,
+              createdAt,
+              collectionId,
+              parentDocumentId,
+              path: child.path,
+            });
+          }
+
+          await parseNodeChildren(child.children, collectionId, id);
+        })
+      );
+    }
+
+    // All nodes in the root level should be collections
+    for (const node of tree) {
+      if (node.path.endsWith("/")) {
+        const collectionId = uuidv4();
+        output.collections.push({
+          id: collectionId,
+          name: node.title,
+        });
+        await parseNodeChildren(node.children, collectionId);
+      } else {
+        Logger.debug("task", `Unhandled file in zip: ${node.path}`, {
+          fileOperationId: fileOperation.id,
+        });
+      }
+    }
+
+    // Check all of the attachments we've created against urls in the text
+    // and replace them out with attachment redirect urls before continuing.
+    for (const document of output.documents) {
+      for (const attachment of output.attachments) {
+        // Pull the collection and subdirectory out of the path name, upload
+        // folders in an export are relative to the document itself
+        const normalizedAttachmentPath = attachment.path.replace(
+          /(.*)uploads\//,
+          "uploads/"
+        );
+
+        const reference = `<<${attachment.id}>>`;
+        document.text = document.text
+          .replace(new RegExp(attachment.path, "g"), reference)
+          .replace(new RegExp(normalizedAttachmentPath, "g"), reference)
+          .replace(new RegExp(`/${normalizedAttachmentPath}`, "g"), reference);
+      }
+    }
+
+    return output;
+  }
+}
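parseFileTree leans on the FileTreeNode shape exported from @server/utils/zip, which this diff does not include. Judging only from the fields accessed above, it plausibly looks like the following (a sketch, not the actual definition):

// Hypothetical sketch of FileTreeNode, inferred from usage above.
type FileTreeNode = {
  name: string; // file or folder name including extension
  title: string; // display name, assumed to be the name without extension
  path: string; // full path inside the zip; folders appear to end with "/"
  children: FileTreeNode[]; // empty for plain files
};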
server/queues/tasks/ImportNotionTask.test.ts (new file, 80 lines)

@@ -0,0 +1,80 @@
+import fs from "fs";
+import path from "path";
+import { FileOperation } from "@server/models";
+import { buildFileOperation } from "@server/test/factories";
+import { flushdb } from "@server/test/support";
+import ImportNotionTask from "./ImportNotionTask";
+
+beforeEach(() => flushdb());
+
+describe("ImportNotionTask", () => {
+  it("should import successfully from a Markdown export", async () => {
+    const fileOperation = await buildFileOperation();
+    Object.defineProperty(fileOperation, "buffer", {
+      get() {
+        return fs.readFileSync(
+          path.resolve(
+            __dirname,
+            "..",
+            "..",
+            "test",
+            "fixtures",
+            "notion-markdown.zip"
+          )
+        );
+      },
+    });
+    jest.spyOn(FileOperation, "findByPk").mockResolvedValue(fileOperation);
+
+    const props = {
+      fileOperationId: fileOperation.id,
+    };
+
+    const task = new ImportNotionTask();
+    const response = await task.perform(props);
+
+    expect(response.collections.size).toEqual(2);
+    expect(response.documents.size).toEqual(6);
+    expect(response.attachments.size).toEqual(1);
+
+    // Check that the image url was replaced in the text with a redirect
+    const attachments = Array.from(response.attachments.values());
+    const documents = Array.from(response.documents.values());
+    expect(documents[2].text).toContain(attachments[0].redirectUrl);
+  });
+
+  it("should import successfully from a HTML export", async () => {
+    const fileOperation = await buildFileOperation();
+    Object.defineProperty(fileOperation, "buffer", {
+      get() {
+        return fs.readFileSync(
+          path.resolve(
+            __dirname,
+            "..",
+            "..",
+            "test",
+            "fixtures",
+            "notion-html.zip"
+          )
+        );
+      },
+    });
+    jest.spyOn(FileOperation, "findByPk").mockResolvedValue(fileOperation);
+
+    const props = {
+      fileOperationId: fileOperation.id,
+    };
+
+    const task = new ImportNotionTask();
+    const response = await task.perform(props);
+
+    expect(response.collections.size).toEqual(2);
+    expect(response.documents.size).toEqual(6);
+    expect(response.attachments.size).toEqual(4);
+
+    // Check that the image url was replaced in the text with a redirect
+    const attachments = Array.from(response.attachments.values());
+    const documents = Array.from(response.documents.values());
+    expect(documents[1].text).toContain(attachments[1].redirectUrl);
+  });
+});
server/queues/tasks/ImportNotionTask.ts (new file, 301 lines)

@@ -0,0 +1,301 @@
+import path from "path";
+import JSZip from "jszip";
+import { compact } from "lodash";
+import mime from "mime-types";
+import { v4 as uuidv4 } from "uuid";
+import documentImporter from "@server/commands/documentImporter";
+import Logger from "@server/logging/logger";
+import { FileOperation, User } from "@server/models";
+import { zipAsFileTree, FileTreeNode } from "@server/utils/zip";
+import ImportTask, { StructuredImportData } from "./ImportTask";
+
+export default class ImportNotionTask extends ImportTask {
+  public async parseData(
+    buffer: Buffer,
+    fileOperation: FileOperation
+  ): Promise<StructuredImportData> {
+    const zip = await JSZip.loadAsync(buffer);
+    const tree = zipAsFileTree(zip);
+    return this.parseFileTree({ fileOperation, zip, tree });
+  }
+
+  /**
+   * Converts the file structure from zipAsFileTree into documents,
+   * collections, and attachments.
+   *
+   * @param tree An array of FileTreeNode representing root files in the zip
+   * @returns A StructuredImportData object
+   */
+  private async parseFileTree({
+    zip,
+    tree,
+    fileOperation,
+  }: {
+    zip: JSZip;
+    fileOperation: FileOperation;
+    tree: FileTreeNode[];
+  }): Promise<StructuredImportData> {
+    const user = await User.findByPk(fileOperation.userId);
+    if (!user) {
+      throw new Error("User not found");
+    }
+
+    const output: StructuredImportData = {
+      collections: [],
+      documents: [],
+      attachments: [],
+    };
+
+    const parseNodeChildren = async (
+      children: FileTreeNode[],
+      collectionId: string,
+      parentDocumentId?: string
+    ): Promise<void> => {
+      if (!user) {
+        throw new Error("User not found");
+      }
+
+      await Promise.all(
+        children.map(async (child) => {
+          // Ignore the CSV's for databases upfront
+          if (child.path.endsWith(".csv")) {
+            return;
+          }
+
+          const zipObject = zip.files[child.path];
+          const id = uuidv4();
+          const match = child.title.match(this.NotionUUIDRegex);
+          const name = child.title.replace(this.NotionUUIDRegex, "");
+          const sourceId = match ? match[0].trim() : undefined;
+
+          // If it's not a text file we're going to treat it as an attachment.
+          const mimeType = mime.lookup(child.name);
+          const isDocument =
+            mimeType === "text/markdown" ||
+            mimeType === "text/plain" ||
+            mimeType === "text/html";
+
+          // If it's not a document and not a folder, treat it as an attachment
+          if (!isDocument && mimeType) {
+            output.attachments.push({
+              id,
+              name: child.name,
+              path: child.path,
+              mimeType,
+              buffer: await zipObject.async("nodebuffer"),
+              sourceId,
+            });
+            return;
+          }
+
+          Logger.debug("task", `Processing ${name} as ${mimeType}`);
+
+          const { title, text } = await documentImporter({
+            mimeType: mimeType || "text/markdown",
+            fileName: name,
+            content: await zipObject.async("string"),
+            user,
+            ip: user.lastActiveIp || undefined,
+          });
+
+          const existingDocumentIndex = output.documents.findIndex(
+            (doc) => doc.sourceId === sourceId
+          );
+
+          const existingDocument = output.documents[existingDocumentIndex];
+
+          // If there is an existing document with the same sourceId that means
+          // we've already parsed either a folder or a file referencing the same
+          // document, as such we should merge.
+          if (existingDocument) {
+            if (existingDocument.text === "") {
+              output.documents[existingDocumentIndex].text = text;
+            }
+
+            await parseNodeChildren(
+              child.children,
+              collectionId,
+              existingDocument.id
+            );
+          } else {
+            output.documents.push({
+              id,
+              title,
+              text,
+              collectionId,
+              parentDocumentId,
+              path: child.path,
+              sourceId,
+            });
+            await parseNodeChildren(child.children, collectionId, id);
+          }
+        })
+      );
+    };
+
+    const replaceInternalLinksAndImages = (text: string) => {
+      // Find if there are any images in this document
+      const imagesInText = this.parseImages(text);
+
+      for (const image of imagesInText) {
+        const name = path.basename(image.src);
+        const attachment = output.attachments.find((att) => att.name === name);
+
+        if (!attachment) {
+          Logger.info(
+            "task",
+            `Could not find referenced attachment with name ${name} and src ${image.src}`
+          );
+        } else {
+          text = text.replace(
+            new RegExp(image.src, "g"),
+            `<<${attachment.id}>>`
+          );
+        }
+      }
+
+      // With Notion's HTML import, images sometimes come wrapped in anchor tags
+      // This isn't supported in Outline's editor, so we need to strip them.
+      text = text.replace(/\[!\[([^[]+)]/g, "![]");
+
+      // Find if there are any links in this document pointing to other documents
+      const internalLinksInText = this.parseInternalLinks(text);
+
+      // For each link update to the standardized format of <<documentId>>
+      // instead of a relative or absolute URL within the original zip file.
+      for (const link of internalLinksInText) {
+        const doc = output.documents.find(
+          (doc) => doc.sourceId === link.sourceId
+        );
+
+        if (!doc) {
+          Logger.info(
+            "task",
+            `Could not find referenced document with sourceId ${link.sourceId}`
+          );
+        } else {
+          text = text.replace(link.href, `<<${doc.id}>>`);
+        }
+      }
+
+      return text;
+    };
+
+    // All nodes in the root level should become collections
+    for (const node of tree) {
+      const match = node.title.match(this.NotionUUIDRegex);
+      const name = node.title.replace(this.NotionUUIDRegex, "");
+      const sourceId = match ? match[0].trim() : undefined;
+      const mimeType = mime.lookup(node.name);
+
+      const existingCollectionIndex = output.collections.findIndex(
+        (collection) => collection.sourceId === sourceId
+      );
+      const existingCollection = output.collections[existingCollectionIndex];
+      const collectionId = existingCollection?.id || uuidv4();
+      let description;
+
+      // Root level docs become the descriptions of collections
+      if (
+        mimeType === "text/markdown" ||
+        mimeType === "text/plain" ||
+        mimeType === "text/html"
+      ) {
+        const zipObject = zip.files[node.path];
+        const { text } = await documentImporter({
+          mimeType,
+          fileName: name,
+          content: await zipObject.async("string"),
+          user,
+          ip: user.lastActiveIp || undefined,
+        });
+
+        description = text;
+      } else if (node.children.length > 0) {
+        await parseNodeChildren(node.children, collectionId);
+      } else {
+        Logger.debug("task", `Unhandled file in zip: ${node.path}`, {
+          fileOperationId: fileOperation.id,
+        });
+        continue;
+      }
+
+      if (existingCollectionIndex !== -1) {
+        if (description) {
+          output.collections[existingCollectionIndex].description = description;
+        }
+      } else {
+        output.collections.push({
+          id: collectionId,
+          name,
+          description,
+          sourceId,
+        });
+      }
+    }
+
+    for (const document of output.documents) {
+      document.text = replaceInternalLinksAndImages(document.text);
+    }
+
+    for (const collection of output.collections) {
+      if (collection.description) {
+        collection.description = replaceInternalLinksAndImages(
+          collection.description
+        );
+      }
+    }
+
+    return output;
+  }
+
+  /**
+   * Extracts internal links from a markdown document, taking into account the
+   * sourceId of the document, which is part of the link title.
+   *
+   * @param text The markdown text to parse
+   * @returns An array of internal links
+   */
+  private parseInternalLinks(
+    text: string
+  ): { title: string; href: string; sourceId: string }[] {
+    return compact(
+      [...text.matchAll(this.NotionLinkRegex)].map((match) => ({
+        title: match[1],
+        href: match[2],
+        sourceId: match[3],
+      }))
+    );
+  }
+
+  /**
+   * Extracts images from the markdown document
+   *
+   * @param text The markdown text to parse
+   * @returns An array of internal links
+   */
+  private parseImages(text: string): { alt: string; src: string }[] {
+    return compact(
+      [...text.matchAll(this.ImageRegex)].map((match) => ({
+        alt: match[1],
+        src: match[2],
+      }))
+    );
+  }
+
+  /**
+   * Regex to find markdown images of all types
+   */
+  private ImageRegex = /!\[(?<alt>[^\][]*?)]\((?<filename>[^\][]*?)(?="|\))"?(?<title>[^\]["]+)?"?\)/g;
+
+  /**
+   * Regex to find markdown links containing ID's that look like UUID's with the
+   * "-"'s removed, Notion's sourceId format.
+   */
+  private NotionLinkRegex = /\[([^[]+)]\((.*?([0-9a-fA-F]{32})\..*?)\)/g;
+
+  /**
+   * Regex to find Notion document UUID's in the title of a document.
+   */
+  private NotionUUIDRegex = /\s([0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12}|[0-9a-fA-F]{32})$/;
+}
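The NotionUUIDRegex above is what splits Notion's "Title <32-char-hex>" naming into a display name and a stable sourceId. A quick worked example with a made-up title (the hex value is illustrative only):

// Hypothetical illustration of the regex defined above.
const NotionUUIDRegex = /\s([0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12}|[0-9a-fA-F]{32})$/;

const title = "Getting Started 0123456789abcdef0123456789abcdef";
const name = title.replace(NotionUUIDRegex, ""); // "Getting Started"
const match = title.match(NotionUUIDRegex);
const sourceId = match ? match[0].trim() : undefined;
// sourceId === "0123456789abcdef0123456789abcdef"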
server/queues/tasks/ImportTask.ts (new file, 379 lines)

@@ -0,0 +1,379 @@
+import invariant from "invariant";
+import attachmentCreator from "@server/commands/attachmentCreator";
+import documentCreator from "@server/commands/documentCreator";
+import { sequelize } from "@server/database/sequelize";
+import { ValidationError } from "@server/errors";
+import logger from "@server/logging/logger";
+import {
+  User,
+  Event,
+  Document,
+  Collection,
+  FileOperation,
+  Attachment,
+} from "@server/models";
+import { FileOperationState } from "@server/models/FileOperation";
+import BaseTask, { TaskPriority } from "./BaseTask";
+
+type Props = {
+  fileOperationId: string;
+};
+
+/**
+ * Standardized format for data importing, to be used by all import tasks.
+ */
+export type StructuredImportData = {
+  collections: {
+    id: string;
+    name: string;
+    /**
+     * The collection description. To reference an attachment or image use the
+     * special formatting <<attachmentId>>. It will be replaced with a reference
+     * to the actual attachment as part of persistData.
+     *
+     * To reference a document use <<documentId>>, it will be replaced with a
+     * link to the document as part of persistData once the document url is
+     * generated.
+     */
+    description?: string;
+    /** Optional id from import source, useful for mapping */
+    sourceId?: string;
+  }[];
+  documents: {
+    id: string;
+    title: string;
+    /**
+     * The document text. To reference an attachment or image use the special
+     * formatting <<attachmentId>>. It will be replaced with a reference to the
+     * actual attachment as part of persistData.
+     *
+     * To reference another document use <<documentId>>, it will be replaced
+     * with a link to the document as part of persistData once the document url
+     * is generated.
+     */
+    text: string;
+    collectionId: string;
+    updatedAt?: Date;
+    createdAt?: Date;
+    parentDocumentId?: string;
+    path: string;
+    /** Optional id from import source, useful for mapping */
+    sourceId?: string;
+  }[];
+  attachments: {
+    id: string;
+    name: string;
+    path: string;
+    mimeType: string;
+    buffer: Buffer;
+    /** Optional id from import source, useful for mapping */
+    sourceId?: string;
+  }[];
+};
+
+export default abstract class ImportTask extends BaseTask<Props> {
+  /**
+   * Runs the import task.
+   *
+   * @param props The props
+   */
+  public async perform({ fileOperationId }: Props) {
+    const fileOperation = await FileOperation.findByPk(fileOperationId);
+    invariant(fileOperation, "fileOperation not found");
+
+    try {
+      logger.info("task", `ImportTask fetching data for ${fileOperationId}`);
+      const data = await this.fetchData(fileOperation);
+
+      logger.info("task", `ImportTask parsing data for ${fileOperationId}`);
+      const parsed = await this.parseData(data, fileOperation);
+
+      if (parsed.collections.length === 0) {
+        throw ValidationError(
+          "Uploaded file does not contain any collections. The root of the zip file must contain folders representing collections."
+        );
+      }
+
+      if (parsed.documents.length === 0) {
+        throw ValidationError(
+          "Uploaded file does not contain any valid documents"
+        );
+      }
+
+      let result;
+      try {
+        logger.info(
+          "task",
+          `ImportTask persisting data for ${fileOperationId}`
+        );
+        result = await this.persistData(parsed, fileOperation);
+      } catch (error) {
+        logger.error(
+          `ImportTask failed to persist data for ${fileOperationId}`,
+          error
+        );
+        throw new Error("Sorry, an internal error occurred during import");
+      }
+
+      await this.updateFileOperation(
+        fileOperation,
+        FileOperationState.Complete
+      );
+
+      return result;
+    } catch (error) {
+      await this.updateFileOperation(
+        fileOperation,
+        FileOperationState.Error,
+        error
+      );
+      throw error;
+    }
+  }
+
+  /**
+   * Update the state of the underlying FileOperation in the database and send
+   * an event to the client.
+   *
+   * @param fileOperation The FileOperation to update
+   */
+  private async updateFileOperation(
+    fileOperation: FileOperation,
+    state: FileOperationState,
+    error?: Error
+  ) {
+    await fileOperation.update({ state, error: error?.message });
+    await Event.schedule({
+      name: "fileOperations.update",
+      modelId: fileOperation.id,
+      teamId: fileOperation.teamId,
+      actorId: fileOperation.userId,
+    });
+  }
+
+  /**
+   * Fetch the remote data needed for the import, by default this will download
+   * any file associated with the FileOperation, save it to a temporary file,
+   * and return the path.
+   *
+   * @param fileOperation The FileOperation to fetch data for
+   * @returns string
+   */
+  protected async fetchData(fileOperation: FileOperation) {
+    return fileOperation.buffer;
+  }
+
+  /**
+   * Parse the data loaded from fetchData into a consistent structured format
+   * that represents collections, documents, and the relationships between them.
+   *
+   * @param data The data loaded from fetchData
+   * @returns A promise that resolves to the structured data
+   */
+  protected abstract parseData(
+    data: any,
+    fileOperation: FileOperation
+  ): Promise<StructuredImportData>;
+
+  /**
+   * Persist the data that was already fetched and parsed into the consistent
+   * structured data.
+   *
+   * @param props The props
+   */
+  protected async persistData(
+    data: StructuredImportData,
+    fileOperation: FileOperation
+  ): Promise<{
+    collections: Map<string, Collection>;
+    documents: Map<string, Document>;
+    attachments: Map<string, Attachment>;
+  }> {
+    const collections = new Map<string, Collection>();
+    const documents = new Map<string, Document>();
+    const attachments = new Map<string, Attachment>();
+
+    return sequelize.transaction(async (transaction) => {
+      const user = await User.findByPk(fileOperation.userId, {
+        transaction,
+      });
+      invariant(user, "User not found");
+
+      const ip = user.lastActiveIp || undefined;
+
+      // Attachments
+      for (const item of data.attachments) {
+        const attachment = await attachmentCreator({
+          source: "import",
+          id: item.id,
+          name: item.name,
+          type: item.mimeType,
+          buffer: item.buffer,
+          user,
+          ip,
+          transaction,
+        });
+        attachments.set(item.id, attachment);
+      }
+
+      // Collections
+      for (const item of data.collections) {
+        let description = item.description;
+
+        if (description) {
+          // Check all of the attachments we've created against urls in the text
+          // and replace them out with attachment redirect urls before saving.
+          for (const aitem of data.attachments) {
+            const attachment = attachments.get(aitem.id);
+            if (!attachment) {
+              continue;
+            }
+            description = description.replace(
+              new RegExp(`<<${attachment.id}>>`, "g"),
+              attachment.redirectUrl
+            );
+          }
+
+          // Check all of the document we've created against urls in the text
+          // and replace them out with a valid internal link. Because we are doing
+          // this before saving, we can't use the document slug, but we can take
+          // advantage of the fact that the document id will redirect in the client
+          for (const ditem of data.documents) {
+            description = description.replace(
+              new RegExp(`<<${ditem.id}>>`, "g"),
+              `/doc/${ditem.id}`
+            );
+          }
+        }
+
+        // check if collection with name exists
+        const response = await Collection.findOrCreate({
+          where: {
+            teamId: fileOperation.teamId,
+            name: item.name,
+          },
+          defaults: {
+            id: item.id,
+            description,
+            createdById: fileOperation.userId,
+            permission: "read_write",
+          },
+          transaction,
+        });
+
+        let collection = response[0];
+        const isCreated = response[1];
+
+        // create new collection if name already exists, yes it's possible that
+        // there is also a "Name (Imported)" but this is a case not worth dealing
+        // with right now
+        if (!isCreated) {
+          const name = `${item.name} (Imported)`;
+          collection = await Collection.create(
+            {
+              id: item.id,
+              description,
+              teamId: fileOperation.teamId,
+              createdById: fileOperation.userId,
+              name,
+              permission: "read_write",
+            },
+            { transaction }
+          );
+        }
+
+        await Event.create(
+          {
+            name: "collections.create",
+            collectionId: collection.id,
+            teamId: collection.teamId,
+            actorId: fileOperation.userId,
+            data: {
+              name: collection.name,
+            },
+            ip,
+          },
+          {
+            transaction,
+          }
+        );
+
+        collections.set(item.id, collection);
+      }
+
+      // Documents
+      for (const item of data.documents) {
+        let text = item.text;
+
+        // Check all of the attachments we've created against urls in the text
+        // and replace them out with attachment redirect urls before saving.
+        for (const aitem of data.attachments) {
+          const attachment = attachments.get(aitem.id);
+          if (!attachment) {
+            continue;
+          }
+          text = text.replace(
+            new RegExp(`<<${attachment.id}>>`, "g"),
+            attachment.redirectUrl
+          );
+        }
+
+        // Check all of the document we've created against urls in the text
+        // and replace them out with a valid internal link. Because we are doing
+        // this before saving, we can't use the document slug, but we can take
+        // advantage of the fact that the document id will redirect in the client
+        for (const ditem of data.documents) {
+          text = text.replace(
+            new RegExp(`<<${ditem.id}>>`, "g"),
+            `/doc/${ditem.id}`
+          );
+        }
+
+        const document = await documentCreator({
+          source: "import",
+          id: item.id,
+          title: item.title,
+          text,
+          collectionId: item.collectionId,
+          createdAt: item.createdAt,
+          updatedAt: item.updatedAt ?? item.createdAt,
+          publishedAt: item.updatedAt ?? item.createdAt ?? new Date(),
+          parentDocumentId: item.parentDocumentId,
+          user,
+          ip,
+          transaction,
+        });
+        documents.set(item.id, document);
+
+        const collection = collections.get(item.collectionId);
+        if (collection) {
+          await collection.addDocumentToStructure(document, 0, { transaction });
+        }
+      }
+
+      // Return value is only used for testing
+      return {
+        collections,
+        documents,
+        attachments,
+      };
+    });
+  }
+
+  /**
+   * Optional hook to remove any temporary files that were created
+   */
+  protected async cleanupData() {
+    // noop
+  }
+
+  /**
+   * Job options such as priority and retry strategy, as defined by Bull.
+   */
+  public get options() {
+    return {
+      priority: TaskPriority.Low,
+      attempts: 1,
+    };
+  }
+}
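Concrete importers only have to implement parseData; fetching the upload, validating the parsed structure, persisting it in a transaction, and moving the FileOperation through its states are all inherited. A minimal hypothetical subclass to show the contract (ImportPlainTextTask and its behavior are invented for illustration):

import { v4 as uuidv4 } from "uuid";
import { FileOperation } from "@server/models";
import ImportTask, { StructuredImportData } from "./ImportTask";

// Hypothetical: treat the entire uploaded file as one document in one collection.
export default class ImportPlainTextTask extends ImportTask {
  public async parseData(
    buffer: Buffer,
    fileOperation: FileOperation
  ): Promise<StructuredImportData> {
    const collectionId = uuidv4();
    return {
      collections: [{ id: collectionId, name: "Imported" }],
      documents: [
        {
          id: uuidv4(),
          title: "Imported document",
          text: buffer.toString("utf8"),
          collectionId,
          path: "",
        },
      ],
      attachments: [],
    };
  }
}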