From ae457d9c98274e049f00c0c44d234ba6b9bdc321 Mon Sep 17 00:00:00 2001 From: kasperstorgaard Date: Thu, 2 Nov 2023 08:19:41 +0100 Subject: [PATCH 1/3] add gsheet extractor plugin --- package-lock.json | 232 ++++++++++++++++++++- plugins/gsheet-extractor/CHANGELOG.md | 1 + plugins/gsheet-extractor/README.md | 20 ++ plugins/gsheet-extractor/package.json | 42 ++++ plugins/gsheet-extractor/src/extractor.ts | 234 ++++++++++++++++++++++ plugins/gsheet-extractor/src/index.ts | 9 + plugins/gsheet-extractor/src/parser.ts | 63 ++++++ 7 files changed, 596 insertions(+), 5 deletions(-) create mode 100644 plugins/gsheet-extractor/CHANGELOG.md create mode 100644 plugins/gsheet-extractor/README.md create mode 100644 plugins/gsheet-extractor/package.json create mode 100644 plugins/gsheet-extractor/src/extractor.ts create mode 100644 plugins/gsheet-extractor/src/index.ts create mode 100644 plugins/gsheet-extractor/src/parser.ts diff --git a/package-lock.json b/package-lock.json index e42ded02c..eae4ccbb9 100644 --- a/package-lock.json +++ b/package-lock.json @@ -908,6 +908,10 @@ "resolved": "plugins/graphql", "link": true }, + "node_modules/@flatfile/plugin-gsheet-extractor": { + "resolved": "plugins/gsheet-extractor", + "link": true + }, "node_modules/@flatfile/plugin-job-handler": { "resolved": "plugins/job-handler", "link": true @@ -3573,8 +3577,12 @@ "license": "MIT" }, "node_modules/@types/node": { - "version": "20.5.9", - "license": "MIT" + "version": "20.8.10", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.8.10.tgz", + "integrity": "sha512-TlgT8JntpcbmKUFzjhsyhGfP2fsiz1Mv56im6enJ905xG1DAYesxJaeSbGqQmAw8OWPdhyJGhGSQGKRNJ45u9w==", + "dependencies": { + "undici-types": "~5.26.4" + } }, "node_modules/@types/normalize-package-data": { "version": "2.4.1", @@ -4044,6 +4052,14 @@ "node": ">=4" } }, + "node_modules/bignumber.js": { + "version": "9.1.2", + "resolved": "https://registry.npmjs.org/bignumber.js/-/bignumber.js-9.1.2.tgz", + "integrity": 
"sha512-2/mKyZH9K85bzOEfhXDBFZTGd1CTs+5IHpeFQo9luiBG7hghdC851Pj2WAhb6E3R6b9tZj/XKhbg4fum+Kepug==", + "engines": { + "node": "*" + } + }, "node_modules/boolbase": { "version": "1.0.0", "dev": true, @@ -4145,6 +4161,11 @@ "ieee754": "^1.2.1" } }, + "node_modules/buffer-equal-constant-time": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/buffer-equal-constant-time/-/buffer-equal-constant-time-1.0.1.tgz", + "integrity": "sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==" + }, "node_modules/buffer-from": { "version": "1.1.2", "dev": true, @@ -4874,6 +4895,14 @@ "node": ">=10" } }, + "node_modules/ecdsa-sig-formatter": { + "version": "1.0.11", + "resolved": "https://registry.npmjs.org/ecdsa-sig-formatter/-/ecdsa-sig-formatter-1.0.11.tgz", + "integrity": "sha512-nagl3RYrbNv6kQkeJIpt6NJZy8twLB/2vtz6yN9Z4vRKHN4/QZJIEbqohALSgwKdnksuY3k5Addp5lg8sVoVcQ==", + "dependencies": { + "safe-buffer": "^5.0.1" + } + }, "node_modules/effect": { "version": "2.0.0-next.31", "resolved": "https://registry.npmjs.org/effect/-/effect-2.0.0-next.31.tgz", @@ -5134,6 +5163,11 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, + "node_modules/extend": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz", + "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==" + }, "node_modules/extendable-error": { "version": "0.1.7", "dev": true, @@ -5335,6 +5369,32 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/gaxios": { + "version": "6.1.1", + "resolved": "https://registry.npmjs.org/gaxios/-/gaxios-6.1.1.tgz", + "integrity": "sha512-bw8smrX+XlAoo9o1JAksBwX+hi/RG15J+NTSxmNPIclKC3ZVK6C2afwY8OSdRvOK0+ZLecUJYtj2MmjOt3Dm0w==", + "dependencies": { + "extend": "^3.0.2", + "https-proxy-agent": "^7.0.1", + "is-stream": "^2.0.0", + "node-fetch": "^2.6.9" + }, + "engines": { + "node": ">=14" + } + }, + "node_modules/gcp-metadata": { + 
"version": "6.0.0", + "resolved": "https://registry.npmjs.org/gcp-metadata/-/gcp-metadata-6.0.0.tgz", + "integrity": "sha512-Ozxyi23/1Ar51wjUT2RDklK+3HxqDr8TLBNK8rBBFQ7T85iIGnXnVusauj06QyqCXRFZig8LZC+TUddWbndlpQ==", + "dependencies": { + "gaxios": "^6.0.0", + "json-bigint": "^1.0.0" + }, + "engines": { + "node": ">=14" + } + }, "node_modules/gensync": { "version": "1.0.0-beta.2", "license": "MIT", @@ -5498,6 +5558,50 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/google-auth-library": { + "version": "9.2.0", + "resolved": "https://registry.npmjs.org/google-auth-library/-/google-auth-library-9.2.0.tgz", + "integrity": "sha512-1oV3p0JhNEhVbj26eF3FAJcv9MXXQt4S0wcvKZaDbl4oHq5V3UJoSbsGZGQNcjoCdhW4kDSwOs11wLlHog3fgQ==", + "dependencies": { + "base64-js": "^1.3.0", + "ecdsa-sig-formatter": "^1.0.11", + "gaxios": "^6.0.0", + "gcp-metadata": "^6.0.0", + "gtoken": "^7.0.0", + "jws": "^4.0.0" + }, + "engines": { + "node": ">=14" + } + }, + "node_modules/googleapis": { + "version": "128.0.0", + "resolved": "https://registry.npmjs.org/googleapis/-/googleapis-128.0.0.tgz", + "integrity": "sha512-+sLtVYNazcxaSD84N6rihVX4QiGoqRdnlz2SwmQQkadF31XonDfy4ufk3maMg27+FiySrH0rd7V8p+YJG6cknA==", + "dependencies": { + "google-auth-library": "^9.0.0", + "googleapis-common": "^7.0.0" + }, + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/googleapis-common": { + "version": "7.0.1", + "resolved": "https://registry.npmjs.org/googleapis-common/-/googleapis-common-7.0.1.tgz", + "integrity": "sha512-mgt5zsd7zj5t5QXvDanjWguMdHAcJmmDrF9RkInCecNsyV7S7YtGqm5v2IWONNID88osb7zmx5FtrAP12JfD0w==", + "dependencies": { + "extend": "^3.0.2", + "gaxios": "^6.0.3", + "google-auth-library": "^9.0.0", + "qs": "^6.7.0", + "url-template": "^2.0.8", + "uuid": "^9.0.0" + }, + "engines": { + "node": ">=14.0.0" + } + }, "node_modules/gopd": { "version": "1.0.1", "dev": true, @@ -5518,6 +5622,18 @@ "dev": true, "license": "MIT" }, + "node_modules/gtoken": { + "version": "7.0.1", + 
"resolved": "https://registry.npmjs.org/gtoken/-/gtoken-7.0.1.tgz", + "integrity": "sha512-KcFVtoP1CVFtQu0aSk3AyAt2og66PFhZAlkUOuWKwzMLoulHXG5W5wE5xAnHb+yl3/wEFoqGW7/cDGMU8igDZQ==", + "dependencies": { + "gaxios": "^6.0.0", + "jws": "^4.0.0" + }, + "engines": { + "node": ">=14.0.0" + } + }, "node_modules/handlebars": { "version": "4.7.8", "dev": true, @@ -6068,7 +6184,6 @@ }, "node_modules/is-stream": { "version": "2.0.1", - "dev": true, "license": "MIT", "engines": { "node": ">=8" @@ -7699,6 +7814,14 @@ "node": ">=4" } }, + "node_modules/json-bigint": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/json-bigint/-/json-bigint-1.0.0.tgz", + "integrity": "sha512-SiPv/8VpZuWbvLSMtTDU8hEfrZWg/mH/nV/b4o0CYbSxu1UIQPLdwKOCIyLQX+VIPO5vrLX3i8qtqFyhdPSUSQ==", + "dependencies": { + "bignumber.js": "^9.0.0" + } + }, "node_modules/json-parse-even-better-errors": { "version": "2.3.1", "dev": true, @@ -7732,6 +7855,25 @@ "graceful-fs": "^4.1.6" } }, + "node_modules/jwa": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/jwa/-/jwa-2.0.0.tgz", + "integrity": "sha512-jrZ2Qx916EA+fq9cEAeCROWPTfCwi1IVHqT2tapuqLEVVDKFDENFw1oL+MwrTvH6msKxsd1YTDVw6uKEcsrLEA==", + "dependencies": { + "buffer-equal-constant-time": "1.0.1", + "ecdsa-sig-formatter": "1.0.11", + "safe-buffer": "^5.0.1" + } + }, + "node_modules/jws": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/jws/-/jws-4.0.0.tgz", + "integrity": "sha512-KDncfTmOZoOMTFG4mBlG0qUIOlc03fmzH+ru6RgYVZhPkyiy/92Owlt/8UEN+a4TXR1FQetfIpJE8ApdvdVxTg==", + "dependencies": { + "jwa": "^2.0.0", + "safe-buffer": "^5.0.1" + } + }, "node_modules/kind-of": { "version": "6.0.3", "dev": true, @@ -8152,6 +8294,25 @@ "dev": true, "license": "MIT" }, + "node_modules/node-fetch": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", + "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", + "dependencies": { 
+ "whatwg-url": "^5.0.0" + }, + "engines": { + "node": "4.x || >=6.0.0" + }, + "peerDependencies": { + "encoding": "^0.1.0" + }, + "peerDependenciesMeta": { + "encoding": { + "optional": true + } + } + }, "node_modules/node-gyp-build-optional-packages": { "version": "5.0.6", "dev": true, @@ -9148,7 +9309,6 @@ }, "node_modules/safe-buffer": { "version": "5.2.1", - "dev": true, "funding": [ { "type": "github", @@ -9804,6 +9964,11 @@ "node": ">=8.0" } }, + "node_modules/tr46": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", + "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==" + }, "node_modules/trim-newlines": { "version": "3.0.1", "dev": true, @@ -10107,6 +10272,11 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==" + }, "node_modules/universalify": { "version": "0.1.2", "license": "MIT", @@ -10146,6 +10316,11 @@ "version": "4.0.1", "license": "MIT" }, + "node_modules/url-template": { + "version": "2.0.8", + "resolved": "https://registry.npmjs.org/url-template/-/url-template-2.0.8.tgz", + "integrity": "sha512-XdVKMF4SJ0nP/O7XIPB0JwAEuT9lDIYnNsK8yGVe43y0AWoKeJNdv3ZNWh7ksJ6KqQFjOO6ox/VEitLnaVNufw==" + }, "node_modules/utility-types": { "version": "3.10.0", "dev": true, @@ -10154,6 +10329,18 @@ "node": ">= 4" } }, + "node_modules/uuid": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", + "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/v8-to-istanbul": { "version": 
"9.1.0", "dev": true, @@ -10201,6 +10388,20 @@ "dev": true, "license": "MIT" }, + "node_modules/webidl-conversions": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", + "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==" + }, + "node_modules/whatwg-url": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", + "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", + "dependencies": { + "tr46": "~0.0.3", + "webidl-conversions": "^3.0.0" + } + }, "node_modules/which": { "version": "2.0.2", "dev": true, @@ -10555,6 +10756,27 @@ "version": "0.0.2", "license": "ISC" }, + "plugins/gsheet-extractor": { + "name": "@flatfile/plugin-gsheet-extractor", + "version": "0.1.0", + "license": "ISC", + "dependencies": { + "@flatfile/api": "^1.5.30", + "@flatfile/hooks": "^1.3.0", + "@flatfile/listener": "^0.3.15", + "@flatfile/util-common": "^0.2.2", + "@flatfile/util-extractor": "^0.4.6", + "@flatfile/util-file-buffer": "^0.1.2", + "googleapis": "^128.0.0", + "remeda": "^1.14.0" + }, + "devDependencies": { + "@types/node": "^20.8.10" + }, + "engines": { + "node": ">= 12" + } + }, "plugins/job-handler": { "name": "@flatfile/plugin-job-handler", "version": "0.1.4", @@ -10679,7 +10901,7 @@ }, "plugins/record-hook": { "name": "@flatfile/plugin-record-hook", - "version": "1.1.8", + "version": "1.1.9", "license": "ISC", "dependencies": { "@flatfile/api": "^1.5.33", diff --git a/plugins/gsheet-extractor/CHANGELOG.md b/plugins/gsheet-extractor/CHANGELOG.md new file mode 100644 index 000000000..653759bc8 --- /dev/null +++ b/plugins/gsheet-extractor/CHANGELOG.md @@ -0,0 +1 @@ +# @flatfile/gsheet-extractor \ No newline at end of file diff --git a/plugins/gsheet-extractor/README.md b/plugins/gsheet-extractor/README.md new file mode 100644 index 000000000..78275c08f 
--- /dev/null +++ b/plugins/gsheet-extractor/README.md @@ -0,0 +1,20 @@ +# @flatfile/plugin-gsheet-extractor + +This package parses all Google Sheets files and extracts them into Flatfile. + +`npm i @flatfile/plugin-gsheet-extractor` + +## Prerequisites (WIP) +1. Create a Google service account, and save the JSON file with private keys locally. +2. Create the following Flatfile secrets using the Google service account JSON: + - google-cloud-project-id + - google-cloud-private-key-id + - google-cloud-private-key-1 (too long for one secret, split it in half) + - google-cloud-private-key-2 + - google-cloud-client-email + - google-cloud-client-id + - google-cloud-client-cert-url +3. Share one or more folders with your Google service account, to enable access (this can be the root folder). + +## Get Started +TODO \ No newline at end of file diff --git a/plugins/gsheet-extractor/package.json b/plugins/gsheet-extractor/package.json new file mode 100644 index 000000000..035a1589d --- /dev/null +++ b/plugins/gsheet-extractor/package.json @@ -0,0 +1,42 @@ +{ + "name": "@flatfile/plugin-gsheet-extractor", + "version": "0.1.0", + "description": "A plugin for parsing gsheet files in Flatfile.", + "registryMetadata": { + "category": "extractors" + }, + "engines": { + "node": ">= 12" + }, + "source": "src/index.ts", + "main": "dist/main.js", + "module": "dist/module.mjs", + "types": "dist/types.d.ts", + "scripts": { + "build": "parcel build", + "dev": "parcel watch", + "check": "tsc ./**/*.ts --noEmit --esModuleInterop", + "test": "jest ./**/*.spec.ts --config=../../jest.config.js --runInBand" + }, + "keywords": [], + "author": "David Boskovic", + "repository": { + "type": "git", + "url": "https://github.com/FlatFilers/flatfile-plugins.git", + "directory": "plugins/gsheet-extractor" + }, + "license": "ISC", + "dependencies": { + "@flatfile/api": "^1.5.30", + "@flatfile/hooks": "^1.3.0", + "@flatfile/listener": "^0.3.15", + "@flatfile/util-common": "^0.2.2", + "@flatfile/util-extractor": 
"^0.4.6", + "@flatfile/util-file-buffer": "^0.1.2", + "googleapis": "^128.0.0", + "remeda": "^1.14.0" + }, + "devDependencies": { + "@types/node": "^20.8.10" + } +} diff --git a/plugins/gsheet-extractor/src/extractor.ts b/plugins/gsheet-extractor/src/extractor.ts new file mode 100644 index 000000000..a93da249c --- /dev/null +++ b/plugins/gsheet-extractor/src/extractor.ts @@ -0,0 +1,234 @@ +import api, { Flatfile } from "@flatfile/api"; +import { JobStatus, JobType } from "@flatfile/api/api"; +import type { FlatfileEvent, FlatfileListener } from "@flatfile/listener"; +import { asyncBatch } from "@flatfile/util-common"; +import { SheetCapture, WorkbookCapture } from "@flatfile/util-extractor"; +import { getFileBuffer } from "@flatfile/util-file-buffer"; +import { mapValues } from "remeda"; + +type Config = { + sheetName: string; + range: string; +}; + +/** + * File extractor, adapted heavily from flatfile extractor plugin. + * + * Main differences: + * - parseBuffer is async + * - get google cloud service account secrets from flatfile + */ +export const Extractor = ( + fileExt: string | RegExp, + parseBuffer: ( + buffer: Buffer, + options: Config & { + serviceAccount: Record; + }, + ) => Promise, + options?: Config, +) => { + return (listener: FlatfileListener) => { + listener.on("file:created", async event => { + const { data: file } = await api.files.get(event.context.fileId); + if (file.mode === "export") return false; + + if (typeof fileExt === "string" && !file.name.endsWith(fileExt)) { + return false; + } + + if (fileExt instanceof RegExp && !fileExt.test(file.name)) return false; + + const jobs = await api.jobs.create({ + type: JobType.File, + operation: `extract-plugin-gsheet`, + status: JobStatus.Ready, + source: event.context.fileId, + }); + + await api.jobs.execute(jobs.data.id); + }); + + listener.on( + "job:ready", + { operation: `extract-plugin-gsheet` }, + async event => { + const { data: file } = await api.files.get(event.context.fileId); + + const 
buffer = await getFileBuffer(event); + + const { jobId } = event.context; + + try { + await api.jobs.ack(jobId, { progress: 3, info: "Parsing Sheets" }); + + /** + * TODO: figure out a way to get this service account data from secrets earlier, + * not as part of the extractor... + */ + const serviceAccount = await getServiceAccount(event); + + const capture = await parseBuffer(buffer, { + ...options, + serviceAccount, + }); + + const workbook = await createWorkbook( + event.context.environmentId, + file, + capture, + ); + + await api.jobs.ack(jobId, { + progress: 10, + info: "Adding records to Sheets", + }); + + let processedRecords = 0; + + const totalLength = Object.values(capture).reduce( + ( + acc: number, + sheet: { + data: unknown[]; + }, + ) => acc + (sheet?.data?.length || 0), + 0, + ); + + for (const sheet of workbook.sheets) { + if (!capture[sheet.name]) continue; + + await asyncBatch( + capture[sheet.name].data, + async chunk => { + await api.records.insert(sheet.id, chunk); + + processedRecords += chunk.length; + + const progress = Math.min( + 99, + Math.round(10 + (90 * processedRecords) / totalLength), + ); + + await api.jobs.ack(jobId, { + progress, + info: "Adding records to Sheets", + }); + }, + { chunkSize: 10000, parallel: 1, debug: false }, + ); + } + + await api.files.update(file.id, { + workbookId: workbook.id, + }); + + await api.jobs.complete(jobId, { + info: "Extraction complete", + outcome: { + message: "Extracted file", + }, + }); + } catch (error) { + await api.jobs.fail(jobId, { + info: `Extraction failed ${error.message}`, + }); + } + }, + ); + }; +}; + +async function createWorkbook( + environmentId: string, + file: Flatfile.File_, + workbookCapture: WorkbookCapture, +): Promise { + const workbookConfig = getWorkbookConfig( + file.name, + file.spaceId, + environmentId, + workbookCapture, + ); + const workbook = await api.workbooks.create(workbookConfig); + + if (!workbook.data.sheets || workbook.data.sheets.length === 0) { + throw new 
Error("No sheets found"); + } + + return workbook.data; +} + +function getWorkbookConfig( + name: string, + spaceId: string, + environmentId: string, + workbookCapture: WorkbookCapture, +): Flatfile.CreateWorkbookConfig { + const sheets = Object.values( + mapValues(workbookCapture, (sheet, sheetName) => { + return getSheetConfig(sheetName, sheet); + }), + ); + + return { + name: `[file] ${name}`, + labels: ["file"], + spaceId, + environmentId, + sheets, + }; +} + +function getSheetConfig( + name: string, + { headers, required, descriptions }: SheetCapture, +): Flatfile.SheetConfig { + return { + name, + fields: headers.map(key => ({ + key, + label: key, + description: descriptions?.[key] || "", + type: "string", + constraints: required?.[key] ? [{ type: "required" }] : [], + })), + }; +} + +async function getServiceAccount(event: FlatfileEvent) { + const [ + projectId, + privateKeyId, + privateKey1, + privateKey2, + clientEmail, + clientId, + clientCertUrl, + ] = await Promise.all([ + event.secrets("google-cloud-project-id"), + event.secrets("google-cloud-private-key-id"), + // Flatfile secrets can "only" hold 1024 characters, so had to cut it in half. + event.secrets("google-cloud-private-key-1"), + event.secrets("google-cloud-private-key-2"), + event.secrets("google-cloud-client-email"), + event.secrets("google-cloud-client-id"), + event.secrets("google-cloud-client-cert-url"), + ]); + + return { + type: "service_account", + project_id: projectId, + private_key_id: privateKeyId, + // Seems that flatfile are escaping newlines in a weird way, so we remove it again. 
+ private_key: (privateKey1 + privateKey2).replace(/\\n/g, "\n"), + client_email: clientEmail, + client_id: clientId, + auth_uri: "https://accounts.google.com/o/oauth2/auth", + token_uri: "https://oauth2.googleapis.com/token", + auth_provider_x509_cert_url: "https://www.googleapis.com/oauth2/v1/certs", + client_x509_cert_url: clientCertUrl, + universe_domain: "googleapis.com", + }; +} diff --git a/plugins/gsheet-extractor/src/index.ts b/plugins/gsheet-extractor/src/index.ts new file mode 100644 index 000000000..224ca5f9e --- /dev/null +++ b/plugins/gsheet-extractor/src/index.ts @@ -0,0 +1,9 @@ +import { Extractor } from "./extractor"; +import { parseBuffer } from "./parser"; + +export const GSheetExtractor = (options?: { + sheetName: string; + range: string; +}) => { + return Extractor(/\.gsheet$/i, parseBuffer, options); +}; diff --git a/plugins/gsheet-extractor/src/parser.ts b/plugins/gsheet-extractor/src/parser.ts new file mode 100644 index 000000000..2523ba2e4 --- /dev/null +++ b/plugins/gsheet-extractor/src/parser.ts @@ -0,0 +1,63 @@ +import { RecordData } from "@flatfile/api/api"; +import { WorkbookCapture } from "@flatfile/util-extractor"; +import { google } from "googleapis"; + +type GsheetFile = { + doc_id: string; + resource_key: string; // probably empty + email: string; +}; + +const sheets = google.sheets("v4"); + +export async function parseBuffer( + buffer: Buffer, + options: { + sheetName: string; + range: string; + serviceAccount: Record; + }, +): Promise { + const auth = getAuth(options.serviceAccount); + + const data = JSON.parse(buffer.toString()) as GsheetFile; + + const response = await sheets.spreadsheets.values.get({ + auth, + spreadsheetId: data.doc_id, + range: `${options.sheetName}!${options.range}`, + }); + + const headers = response.data.values[0] as string[]; + + const values = [] as RecordData[]; + + for (const row of response.data.values.slice(1)) { + const value = {} as RecordData; + + for (let index = 0; index < row.length; index++) 
{ + // Don't want to save empty headers. + if (!headers[index]) continue; + + value[headers[index]] = { + value: row[index], + }; + } + + values.push(value); + } + + return { + [options.sheetName]: { + headers: response.data.values[0] as string[], + data: values, + }, + }; +} + +function getAuth(serviceAccount: Record) { + return new google.auth.GoogleAuth({ + credentials: serviceAccount, + scopes: ["https://www.googleapis.com/auth/spreadsheets.readonly"], + }); +} From 7caad86d8c7deee4ff03ba08c2059656d33acbe0 Mon Sep 17 00:00:00 2001 From: kasperstorgaard Date: Thu, 2 Nov 2023 08:48:15 +0100 Subject: [PATCH 2/3] make the plugin more generic --- plugins/gsheet-extractor/src/extractor.ts | 195 +++++++++------------- plugins/gsheet-extractor/src/index.ts | 18 +- plugins/gsheet-extractor/src/parser.ts | 143 +++++++++++----- 3 files changed, 195 insertions(+), 161 deletions(-) diff --git a/plugins/gsheet-extractor/src/extractor.ts b/plugins/gsheet-extractor/src/extractor.ts index a93da249c..d603223fd 100644 --- a/plugins/gsheet-extractor/src/extractor.ts +++ b/plugins/gsheet-extractor/src/extractor.ts @@ -1,15 +1,14 @@ -import api, { Flatfile } from "@flatfile/api"; -import { JobStatus, JobType } from "@flatfile/api/api"; -import type { FlatfileEvent, FlatfileListener } from "@flatfile/listener"; -import { asyncBatch } from "@flatfile/util-common"; -import { SheetCapture, WorkbookCapture } from "@flatfile/util-extractor"; -import { getFileBuffer } from "@flatfile/util-file-buffer"; -import { mapValues } from "remeda"; +import api, { Flatfile } from '@flatfile/api' +import { JobStatus, JobType } from '@flatfile/api/api' +import type { FlatfileEvent, FlatfileListener } from '@flatfile/listener' +import { asyncBatch } from '@flatfile/util-common' +import { SheetCapture, WorkbookCapture } from '@flatfile/util-extractor' +import { getFileBuffer } from '@flatfile/util-file-buffer' +import { mapValues } from 'remeda' type Config = { - sheetName: string; - range: string; -}; + 
sheetRange: Record +} /** * File extractor, adapted heavily from flatfile extractor plugin. @@ -23,212 +22,172 @@ export const Extractor = ( parseBuffer: ( buffer: Buffer, options: Config & { - serviceAccount: Record; + getSecret: (key: string) => Promise }, ) => Promise, - options?: Config, + config: Config, ) => { return (listener: FlatfileListener) => { - listener.on("file:created", async event => { - const { data: file } = await api.files.get(event.context.fileId); - if (file.mode === "export") return false; + listener.on('file:created', async (event) => { + const { data: file } = await api.files.get(event.context.fileId) + if (file.mode === 'export') return false - if (typeof fileExt === "string" && !file.name.endsWith(fileExt)) { - return false; + if (typeof fileExt === 'string' && !file.name.endsWith(fileExt)) { + return false } - if (fileExt instanceof RegExp && !fileExt.test(file.name)) return false; + if (fileExt instanceof RegExp && !fileExt.test(file.name)) return false const jobs = await api.jobs.create({ type: JobType.File, operation: `extract-plugin-gsheet`, status: JobStatus.Ready, source: event.context.fileId, - }); + }) - await api.jobs.execute(jobs.data.id); - }); + await api.jobs.execute(jobs.data.id) + }) listener.on( - "job:ready", + 'job:ready', { operation: `extract-plugin-gsheet` }, - async event => { - const { data: file } = await api.files.get(event.context.fileId); + async (event) => { + const { data: file } = await api.files.get(event.context.fileId) - const buffer = await getFileBuffer(event); + const buffer = await getFileBuffer(event) - const { jobId } = event.context; + const { jobId } = event.context try { - await api.jobs.ack(jobId, { progress: 3, info: "Parsing Sheets" }); - - /** - * TODO: figure out a way to get this service account data from secrets earlier, - * not as part of the extractor... 
- */ - const serviceAccount = await getServiceAccount(event); + await api.jobs.ack(jobId, { progress: 3, info: 'Parsing Sheets' }) const capture = await parseBuffer(buffer, { - ...options, - serviceAccount, - }); + ...config, + getSecret: (key: string) => event.secrets(key), + }) const workbook = await createWorkbook( event.context.environmentId, file, - capture, - ); + capture + ) await api.jobs.ack(jobId, { progress: 10, - info: "Adding records to Sheets", - }); + info: 'Adding records to Sheets', + }) - let processedRecords = 0; + let processedRecords = 0 const totalLength = Object.values(capture).reduce( ( acc: number, sheet: { - data: unknown[]; - }, + data: unknown[] + } ) => acc + (sheet?.data?.length || 0), - 0, - ); + 0 + ) for (const sheet of workbook.sheets) { - if (!capture[sheet.name]) continue; + if (!capture[sheet.name]) continue await asyncBatch( capture[sheet.name].data, - async chunk => { - await api.records.insert(sheet.id, chunk); + async (chunk) => { + await api.records.insert(sheet.id, chunk) - processedRecords += chunk.length; + processedRecords += chunk.length const progress = Math.min( 99, - Math.round(10 + (90 * processedRecords) / totalLength), - ); + Math.round(10 + (90 * processedRecords) / totalLength) + ) await api.jobs.ack(jobId, { progress, - info: "Adding records to Sheets", - }); + info: 'Adding records to Sheets', + }) }, - { chunkSize: 10000, parallel: 1, debug: false }, - ); + { chunkSize: 10000, parallel: 1, debug: false } + ) } await api.files.update(file.id, { workbookId: workbook.id, - }); + }) await api.jobs.complete(jobId, { - info: "Extraction complete", + info: 'Extraction complete', outcome: { - message: "Extracted file", + message: 'Extracted file', }, - }); + }) } catch (error) { + console.error(error.message) + await api.jobs.fail(jobId, { info: `Extraction failed ${error.message}`, - }); + }) } - }, - ); - }; -}; + } + ) + } +} async function createWorkbook( environmentId: string, file: Flatfile.File_, - 
workbookCapture: WorkbookCapture, + workbookCapture: WorkbookCapture ): Promise { const workbookConfig = getWorkbookConfig( file.name, file.spaceId, environmentId, - workbookCapture, - ); - const workbook = await api.workbooks.create(workbookConfig); + workbookCapture + ) + const workbook = await api.workbooks.create(workbookConfig) if (!workbook.data.sheets || workbook.data.sheets.length === 0) { - throw new Error("No sheets found"); + throw new Error('No sheets found') } - return workbook.data; + return workbook.data } function getWorkbookConfig( name: string, spaceId: string, environmentId: string, - workbookCapture: WorkbookCapture, + workbookCapture: WorkbookCapture ): Flatfile.CreateWorkbookConfig { const sheets = Object.values( mapValues(workbookCapture, (sheet, sheetName) => { - return getSheetConfig(sheetName, sheet); - }), - ); + return getSheetConfig(sheetName, sheet) + }) + ) return { name: `[file] ${name}`, - labels: ["file"], + labels: ['file'], spaceId, environmentId, sheets, - }; + } } function getSheetConfig( name: string, - { headers, required, descriptions }: SheetCapture, + { headers, required, descriptions }: SheetCapture ): Flatfile.SheetConfig { return { name, - fields: headers.map(key => ({ + fields: headers.map((key) => ({ key, label: key, - description: descriptions?.[key] || "", - type: "string", - constraints: required?.[key] ? [{ type: "required" }] : [], + description: descriptions?.[key] || '', + type: 'string', + constraints: required?.[key] ? [{ type: 'required' }] : [], })), - }; -} - -async function getServiceAccount(event: FlatfileEvent) { - const [ - projectId, - privateKeyId, - privateKey1, - privateKey2, - clientEmail, - clientId, - clientCertUrl, - ] = await Promise.all([ - event.secrets("google-cloud-project-id"), - event.secrets("google-cloud-private-key-id"), - // Flatfile secrets can "only" hold 1024 characters, so had to cut it in half. 
- event.secrets("google-cloud-private-key-1"), - event.secrets("google-cloud-private-key-2"), - event.secrets("google-cloud-client-email"), - event.secrets("google-cloud-client-id"), - event.secrets("google-cloud-client-cert-url"), - ]); - - return { - type: "service_account", - project_id: projectId, - private_key_id: privateKeyId, - // Seems that flatfile are escaping newlines in a weird way, so we remove it again. - private_key: (privateKey1 + privateKey2).replace(/\\n/g, "\n"), - client_email: clientEmail, - client_id: clientId, - auth_uri: "https://accounts.google.com/o/oauth2/auth", - token_uri: "https://oauth2.googleapis.com/token", - auth_provider_x509_cert_url: "https://www.googleapis.com/oauth2/v1/certs", - client_x509_cert_url: clientCertUrl, - universe_domain: "googleapis.com", - }; + } } diff --git a/plugins/gsheet-extractor/src/index.ts b/plugins/gsheet-extractor/src/index.ts index 224ca5f9e..fd043d3ad 100644 --- a/plugins/gsheet-extractor/src/index.ts +++ b/plugins/gsheet-extractor/src/index.ts @@ -1,9 +1,19 @@ import { Extractor } from "./extractor"; import { parseBuffer } from "./parser"; -export const GSheetExtractor = (options?: { - sheetName: string; - range: string; -}) => { +type Config = { + sheetRange?: string; +} + +/** + * Plugin config options. 
+ * + * @property {string} sheetRange - use if you need a custom subset of columns + rows (example C4:Z) + */ +export interface GsheetExtractorOptions { + readonly sheetRange?: string +} + +export const GSheetExtractor = (options: GsheetExtractorOptions) => { return Extractor(/\.gsheet$/i, parseBuffer, options); }; diff --git a/plugins/gsheet-extractor/src/parser.ts b/plugins/gsheet-extractor/src/parser.ts index 2523ba2e4..98fa27662 100644 --- a/plugins/gsheet-extractor/src/parser.ts +++ b/plugins/gsheet-extractor/src/parser.ts @@ -1,63 +1,128 @@ -import { RecordData } from "@flatfile/api/api"; -import { WorkbookCapture } from "@flatfile/util-extractor"; -import { google } from "googleapis"; +import { RecordData } from '@flatfile/api/api' +import { WorkbookCapture } from '@flatfile/util-extractor' +import { google } from 'googleapis' type GsheetFile = { - doc_id: string; - resource_key: string; // probably empty - email: string; -}; + doc_id: string + resource_key: string // probably empty + email: string +} -const sheets = google.sheets("v4"); +const sheets = google.sheets('v4') export async function parseBuffer( buffer: Buffer, options: { - sheetName: string; - range: string; - serviceAccount: Record; - }, + getSecret: (key: string) => Promise + sheetRange?: string; + } ): Promise { - const auth = getAuth(options.serviceAccount); + const serviceAccount = await getServiceAccount(options) + + const auth = new google.auth.GoogleAuth({ + credentials: serviceAccount, + scopes: ['https://www.googleapis.com/auth/spreadsheets.readonly'], + }) - const data = JSON.parse(buffer.toString()) as GsheetFile; + const data = JSON.parse(buffer.toString()) as GsheetFile - const response = await sheets.spreadsheets.values.get({ + const sheetsResponse = await sheets.spreadsheets.get({ auth, spreadsheetId: data.doc_id, - range: `${options.sheetName}!${options.range}`, - }); + }) + + const workbooks: WorkbookCapture = {} + + for (const sheet of sheetsResponse.data.sheets) { + const 
title = sheet.properties.title - const headers = response.data.values[0] as string[]; + const valuesResponse = await sheets.spreadsheets.values.get({ + auth, + spreadsheetId: data.doc_id, + range: options.sheetRange ? `${title}!${options.sheetRange}` : title, + }) - const values = [] as RecordData[]; + const headers: string[] = [] - for (const row of response.data.values.slice(1)) { - const value = {} as RecordData; + for (let header of valuesResponse.data.values[0]) { + let renameCount = 0 - for (let index = 0; index < row.length; index++) { - // Don't want to save empty headers. - if (!headers[index]) continue; + // Empty headers can happen, use "EMPTY" instead when encountered. + if (header === '') { + header = 'EMPTY' + } - value[headers[index]] = { - value: row[index], - }; + // Make sure we do not have duplicate header names + while (headers.includes(header)) { + if (renameCount === 0) { + header = `${header}--${renameCount + 1}` + } else { + header = `${header.slice(0, header.length - 1)}${renameCount + 1}` + } + + renameCount++ + } + + headers.push(header) } - values.push(value); - } + const values: RecordData[] = [] - return { - [options.sheetName]: { - headers: response.data.values[0] as string[], + for (const row of valuesResponse.data.values.slice(1)) { + const value: RecordData = {} + + for (let index = 0; index < row.length; index++) { + value[headers[index]] = { + value: row[index], + } + } + + values.push(value) + } + + workbooks[title] = { + headers, data: values, - }, - }; + } + } + + return workbooks } -function getAuth(serviceAccount: Record) { - return new google.auth.GoogleAuth({ - credentials: serviceAccount, - scopes: ["https://www.googleapis.com/auth/spreadsheets.readonly"], - }); +async function getServiceAccount(options: { + getSecret: (key: string) => Promise +}) { + const [ + projectId, + privateKeyId, + privateKey1, + privateKey2, + clientEmail, + clientId, + clientCertUrl, + ] = await Promise.all([ + 
options.getSecret('google-cloud-project-id'), + options.getSecret('google-cloud-private-key-id'), + // Flatfile secrets can "only" hold 1024 characters, so had to cut it in half. + options.getSecret('google-cloud-private-key-1'), + options.getSecret('google-cloud-private-key-2'), + options.getSecret('google-cloud-client-email'), + options.getSecret('google-cloud-client-id'), + options.getSecret('google-cloud-client-cert-url'), + ]) + + return { + type: 'service_account', + project_id: projectId, + private_key_id: privateKeyId, + // Seems that flatfile are escaping newlines in a weird way, so we remove it again. + private_key: (privateKey1 + privateKey2).replace(/\\n/g, '\n'), + client_email: clientEmail, + client_id: clientId, + auth_uri: 'https://accounts.google.com/o/oauth2/auth', + token_uri: 'https://oauth2.googleapis.com/token', + auth_provider_x509_cert_url: 'https://www.googleapis.com/oauth2/v1/certs', + client_x509_cert_url: clientCertUrl, + universe_domain: 'googleapis.com', + } } From 51b2c9141834b153df13e73e51bc8f13dbe877f7 Mon Sep 17 00:00:00 2001 From: kasperstorgaard Date: Thu, 2 Nov 2023 09:10:50 +0100 Subject: [PATCH 3/3] add author --- plugins/gsheet-extractor/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/gsheet-extractor/package.json b/plugins/gsheet-extractor/package.json index 035a1589d..333b133d2 100644 --- a/plugins/gsheet-extractor/package.json +++ b/plugins/gsheet-extractor/package.json @@ -19,7 +19,7 @@ "test": "jest ./**/*.spec.ts --config=../../jest.config.js --runInBand" }, "keywords": [], - "author": "David Boskovic", + "author": "Kasper Storgaard", "repository": { "type": "git", "url": "https://github.com/FlatFilers/flatfile-plugins.git",