chore: Refactor data import (#3434)
* Complete refactor of import * feat: Notion data import (#3442)
This commit is contained in:
@@ -1,9 +1,10 @@
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import emojiRegex from "emoji-regex";
|
||||
import { strikethrough, tables } from "joplin-turndown-plugin-gfm";
|
||||
import { truncate } from "lodash";
|
||||
import mammoth from "mammoth";
|
||||
import quotedPrintable from "quoted-printable";
|
||||
import { Transaction } from "sequelize";
|
||||
import TurndownService from "turndown";
|
||||
import utf8 from "utf8";
|
||||
import { MAX_TITLE_LENGTH } from "@shared/constants";
|
||||
@@ -21,7 +22,7 @@ const turndownService = new TurndownService({
|
||||
hr: "---",
|
||||
bulletListMarker: "-",
|
||||
headingStyle: "atx",
|
||||
});
|
||||
}).remove(["script", "style", "title", "head"]);
|
||||
|
||||
// Use the GitHub-flavored markdown plugin to parse
|
||||
// strikethroughs and tables
|
||||
@@ -37,7 +38,7 @@ turndownService
|
||||
|
||||
interface ImportableFile {
|
||||
type: string;
|
||||
getMarkdown: (file: any) => Promise<string>;
|
||||
getMarkdown: (content: Buffer | string) => Promise<string>;
|
||||
}
|
||||
|
||||
const importMapping: ImportableFile[] = [
|
||||
@@ -68,26 +69,34 @@ const importMapping: ImportableFile[] = [
|
||||
},
|
||||
];
|
||||
|
||||
// @ts-expect-error ts-migrate(7006) FIXME: Parameter 'file' implicitly has an 'any' type.
|
||||
async function fileToMarkdown(file): Promise<string> {
|
||||
return fs.promises.readFile(file.path, "utf8");
|
||||
async function fileToMarkdown(content: Buffer | string): Promise<string> {
|
||||
if (content instanceof Buffer) {
|
||||
content = content.toString("utf8");
|
||||
}
|
||||
return content;
|
||||
}
|
||||
|
||||
// @ts-expect-error ts-migrate(7006) FIXME: Parameter 'file' implicitly has an 'any' type.
|
||||
async function docxToMarkdown(file): Promise<string> {
|
||||
const { value } = await mammoth.convertToHtml(file);
|
||||
return turndownService.turndown(value);
|
||||
async function docxToMarkdown(content: Buffer | string): Promise<string> {
|
||||
if (content instanceof Buffer) {
|
||||
const { value: html } = await mammoth.convertToHtml({ buffer: content });
|
||||
return turndownService.turndown(html);
|
||||
}
|
||||
|
||||
throw new Error("docxToMarkdown: content must be a Buffer");
|
||||
}
|
||||
|
||||
// @ts-expect-error ts-migrate(7006) FIXME: Parameter 'file' implicitly has an 'any' type.
|
||||
async function htmlToMarkdown(file): Promise<string> {
|
||||
const value = await fs.promises.readFile(file.path, "utf8");
|
||||
return turndownService.turndown(value);
|
||||
async function htmlToMarkdown(content: Buffer | string): Promise<string> {
|
||||
if (content instanceof Buffer) {
|
||||
content = content.toString("utf8");
|
||||
}
|
||||
|
||||
return turndownService.turndown(content);
|
||||
}
|
||||
|
||||
// @ts-expect-error ts-migrate(7006) FIXME: Parameter 'file' implicitly has an 'any' type.
|
||||
async function confluenceToMarkdown(file): Promise<string> {
|
||||
let value = await fs.promises.readFile(file.path, "utf8");
|
||||
async function confluenceToMarkdown(value: Buffer | string): Promise<string> {
|
||||
if (value instanceof Buffer) {
|
||||
value = value.toString("utf8");
|
||||
}
|
||||
|
||||
// We're only supporting the ridiculous output from Confluence here; regular
|
||||
// Word documents should call into the docxToMarkdown importer.
|
||||
@@ -143,22 +152,28 @@ async function confluenceToMarkdown(file): Promise<string> {
|
||||
}
|
||||
|
||||
async function documentImporter({
|
||||
file,
|
||||
mimeType,
|
||||
fileName,
|
||||
content,
|
||||
user,
|
||||
ip,
|
||||
transaction,
|
||||
}: {
|
||||
user: User;
|
||||
file: File;
|
||||
ip: string;
|
||||
mimeType: string;
|
||||
fileName: string;
|
||||
content: Buffer | string;
|
||||
ip?: string;
|
||||
transaction?: Transaction;
|
||||
}): Promise<{
|
||||
text: string;
|
||||
title: string;
|
||||
}> {
|
||||
const fileInfo = importMapping.filter((item) => {
|
||||
if (item.type === file.type) {
|
||||
if (item.type === mimeType) {
|
||||
if (
|
||||
file.type === "application/octet-stream" &&
|
||||
path.extname(file.name) !== ".docx"
|
||||
mimeType === "application/octet-stream" &&
|
||||
path.extname(fileName) !== ".docx"
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
@@ -166,7 +181,7 @@ async function documentImporter({
|
||||
return true;
|
||||
}
|
||||
|
||||
if (item.type === "text/markdown" && path.extname(file.name) === ".md") {
|
||||
if (item.type === "text/markdown" && path.extname(fileName) === ".md") {
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -174,20 +189,35 @@ async function documentImporter({
|
||||
})[0];
|
||||
|
||||
if (!fileInfo) {
|
||||
throw InvalidRequestError(`File type ${file.type} not supported`);
|
||||
throw InvalidRequestError(`File type ${mimeType} not supported`);
|
||||
}
|
||||
|
||||
let title = deserializeFilename(file.name.replace(/\.[^/.]+$/, ""));
|
||||
let text = await fileInfo.getMarkdown(file);
|
||||
let title = deserializeFilename(fileName.replace(/\.[^/.]+$/, ""));
|
||||
let text = await fileInfo.getMarkdown(content);
|
||||
text = text.trim();
|
||||
|
||||
// find and extract first emoji, in the case of some imports it can be outside
|
||||
// of the title, at the top of the document.
|
||||
const regex = emojiRegex();
|
||||
const matches = regex.exec(text);
|
||||
const firstEmoji = matches ? matches[0] : undefined;
|
||||
if (firstEmoji && text.startsWith(firstEmoji)) {
|
||||
text = text.replace(firstEmoji, "").trim();
|
||||
}
|
||||
|
||||
// If the first line of the imported text looks like a markdown heading
|
||||
// then we can use this as the document title
|
||||
if (text.trim().startsWith("# ")) {
|
||||
if (text.startsWith("# ")) {
|
||||
const result = parseTitle(text);
|
||||
title = result.title;
|
||||
text = text.replace(`# ${title}\n`, "");
|
||||
}
|
||||
|
||||
// If we parsed an emoji from _above_ the title then add it back as a prefix
|
||||
if (firstEmoji) {
|
||||
title = `${firstEmoji} ${title}`;
|
||||
}
|
||||
|
||||
// find data urls, convert to blobs, upload and write attachments
|
||||
const images = parseImages(text);
|
||||
const dataURIs = images.filter((href) => href.startsWith("data:"));
|
||||
@@ -201,6 +231,7 @@ async function documentImporter({
|
||||
buffer,
|
||||
user,
|
||||
ip,
|
||||
transaction,
|
||||
});
|
||||
text = text.replace(uri, attachment.redirectUrl);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user