Store source metadata for imported documents (#6136)

This commit is contained in:
Tom Moor
2023-11-11 10:52:29 -05:00
committed by GitHub
parent 90605e110a
commit 48d688c0a5
16 changed files with 178 additions and 48 deletions

View File

@@ -75,11 +75,12 @@ export default class ImportJSONTask extends ImportTask {
updatedAt: node.updatedAt ? new Date(node.updatedAt) : undefined,
publishedAt: node.publishedAt ? new Date(node.publishedAt) : null,
collectionId,
sourceId: node.id,
externalId: node.id,
mimeType: "application/json",
parentDocumentId: node.parentDocumentId
? find(
output.documents,
(d) => d.sourceId === node.parentDocumentId
(d) => d.externalId === node.parentDocumentId
)?.id
: null,
id,
@@ -101,7 +102,7 @@ export default class ImportJSONTask extends ImportTask {
buffer: () => zipObject.async("nodebuffer"),
mimeType,
path: node.key,
sourceId: node.id,
externalId: node.id,
});
});
}
@@ -132,7 +133,7 @@ export default class ImportJSONTask extends ImportTask {
)
: item.collection.description,
id: collectionId,
sourceId: item.collection.id,
externalId: item.collection.id,
});
if (Object.values(item.documents).length) {
@@ -149,7 +150,7 @@ export default class ImportJSONTask extends ImportTask {
for (const document of output.documents) {
for (const attachment of output.attachments) {
const encodedPath = encodeURI(
`/api/attachments.redirect?id=${attachment.sourceId}`
`/api/attachments.redirect?id=${attachment.externalId}`
);
document.text = document.text.replace(

View File

@@ -139,6 +139,7 @@ export default class ImportMarkdownZipTask extends ImportTask {
collectionId,
parentDocumentId,
path: child.path,
mimeType: "text/markdown",
});
await parseNodeChildren(child.children, collectionId, id);

View File

@@ -62,7 +62,7 @@ export default class ImportNotionTask extends ImportTask {
const id = uuidv4();
const match = child.title.match(this.NotionUUIDRegex);
const name = child.title.replace(this.NotionUUIDRegex, "");
const sourceId = match ? match[0].trim() : undefined;
const externalId = match ? match[0].trim() : undefined;
// If it's not a text file we're going to treat it as an attachment.
const mimeType = mime.lookup(child.name);
@@ -79,7 +79,7 @@ export default class ImportNotionTask extends ImportTask {
path: child.path,
mimeType,
buffer: () => zipObject.async("nodebuffer"),
sourceId,
externalId,
});
return;
}
@@ -95,12 +95,12 @@ export default class ImportNotionTask extends ImportTask {
});
const existingDocumentIndex = output.documents.findIndex(
(doc) => doc.sourceId === sourceId
(doc) => doc.externalId === externalId
);
const existingDocument = output.documents[existingDocumentIndex];
// If there is an existing document with the same sourceId that means
// If there is an existing document with the same externalId that means
// we've already parsed either a folder or a file referencing the same
// document, so we should merge.
if (existingDocument) {
@@ -122,7 +122,8 @@ export default class ImportNotionTask extends ImportTask {
collectionId,
parentDocumentId,
path: child.path,
sourceId,
mimeType: mimeType || "text/markdown",
externalId,
});
await parseNodeChildren(child.children, collectionId, id);
}
@@ -168,13 +169,13 @@ export default class ImportNotionTask extends ImportTask {
// instead of a relative or absolute URL within the original zip file.
for (const link of internalLinksInText) {
const doc = output.documents.find(
(doc) => doc.sourceId === link.sourceId
(doc) => doc.externalId === link.externalId
);
if (!doc) {
Logger.info(
"task",
`Could not find referenced document with sourceId ${link.sourceId}`
`Could not find referenced document with externalId ${link.externalId}`
);
} else {
text = text.replace(link.href, `<<${doc.id}>>`);
@@ -188,11 +189,11 @@ export default class ImportNotionTask extends ImportTask {
for (const node of tree) {
const match = node.title.match(this.NotionUUIDRegex);
const name = node.title.replace(this.NotionUUIDRegex, "");
const sourceId = match ? match[0].trim() : undefined;
const externalId = match ? match[0].trim() : undefined;
const mimeType = mime.lookup(node.name);
const existingCollectionIndex = output.collections.findIndex(
(collection) => collection.sourceId === sourceId
(collection) => collection.externalId === externalId
);
const existingCollection = output.collections[existingCollectionIndex];
const collectionId = existingCollection?.id || uuidv4();
@@ -232,7 +233,7 @@ export default class ImportNotionTask extends ImportTask {
id: collectionId,
name,
description,
sourceId,
externalId,
});
}
}
@@ -254,19 +255,19 @@ export default class ImportNotionTask extends ImportTask {
/**
* Extracts internal links from a markdown document, taking into account the
* sourceId of the document, which is part of the link title.
* externalId of the document, which is part of the link href.
*
* @param text The markdown text to parse
* @returns An array of internal links
*/
private parseInternalLinks(
text: string
): { title: string; href: string; sourceId: string }[] {
): { title: string; href: string; externalId: string }[] {
return compact(
[...text.matchAll(this.NotionLinkRegex)].map((match) => ({
title: match[1],
href: match[2],
sourceId: match[3],
externalId: match[3],
}))
);
}
@@ -294,7 +295,7 @@ export default class ImportNotionTask extends ImportTask {
/**
* Regex to find markdown links containing IDs that look like UUIDs with the
* "-"'s removed, Notion's sourceId format.
* "-"'s removed, Notion's externalId format.
*/
private NotionLinkRegex = /\[([^[]+)]\((.*?([0-9a-fA-F]{32})\..*?)\)/g;

View File

@@ -1,3 +1,4 @@
import path from "path";
import truncate from "lodash/truncate";
import {
AttachmentPreset,
@@ -49,7 +50,7 @@ export type StructuredImportData = {
*/
description?: string | Record<string, any> | null;
/** Optional id from import source, useful for mapping */
sourceId?: string;
externalId?: string;
}[];
documents: {
id: string;
@@ -75,8 +76,9 @@ export type StructuredImportData = {
createdById?: string;
createdByEmail?: string | null;
path: string;
mimeType: string;
/** Optional id from import source, useful for mapping */
sourceId?: string;
externalId?: string;
}[];
attachments: {
id: string;
@@ -85,7 +87,7 @@ export type StructuredImportData = {
mimeType: string;
buffer: () => Promise<Buffer>;
/** Optional id from import source, useful for mapping */
sourceId?: string;
externalId?: string;
}[];
};
@@ -428,7 +430,11 @@ export default abstract class ImportTask extends BaseTask<Props> {
const document = await documentCreator({
...options,
source: "import",
sourceMetadata: {
fileName: path.basename(item.path),
mimeType: item.mimeType,
externalId: item.externalId,
},
id: item.id,
title: item.title,
text,