From 08d0296fee6a695c8312dec7d3bed648f10c7acb Mon Sep 17 00:00:00 2001 From: Matthew Runyon Date: Thu, 31 Aug 2023 15:02:37 -0500 Subject: [PATCH] fix: Zip CSV uploads not working (#1457) Fixes #1080. Fixes #1416 Updates jszip and uses the internal stream helper instead of nodestream which we would need a polyfill for. The progress on zip uploads is a bit odd since we use the unzip progress. It seems papaparse loads more from the stream while processing/before processing and can finish reading the zip before it's done processing chunks. Also, uploading the tables seems to be a blocking operation. Not sure if that's an easy fix, but after the 50% mark on uploading a large zip, there are some blocks on the main thread (marching ants freezes as an indicator). There is a ~2GB limit for JSZip it seems https://github.com/Stuk/jszip/issues/777 --- package-lock.json | 49 ++++++++------------- package.json | 1 - packages/code-studio/package.json | 6 +-- packages/console/src/csv/CsvInputBar.tsx | 2 +- packages/console/src/csv/CsvParser.ts | 41 +++++++++++------ packages/console/src/csv/CsvTypeParser.ts | 36 +++++++++------ packages/console/src/csv/ZipStreamHelper.ts | 33 ++++++++++++++ 7 files changed, 103 insertions(+), 65 deletions(-) create mode 100644 packages/console/src/csv/ZipStreamHelper.ts diff --git a/package-lock.json b/package-lock.json index e5f9b4a07d..e32f6bc3e5 100644 --- a/package-lock.json +++ b/package-lock.json @@ -70,7 +70,6 @@ "@types/eslint": "^8.4.10", "@types/jest": "^29.2.5", "@types/jquery": "^3.5.14", - "@types/jszip": "3.1.7", "@types/lodash": "^4.14.182", "@types/lodash.clamp": "^4.0.6", "@types/lodash.debounce": "^4.0.6", @@ -5633,14 +5632,6 @@ "version": "0.0.29", "license": "MIT" }, - "node_modules/@types/jszip": { - "version": "3.1.7", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/node": "*" - } - }, "node_modules/@types/lodash": { "version": "4.14.182", "dev": true, @@ -16487,13 +16478,14 @@ } }, "node_modules/jszip": { - "version": "3.2.2", - "license": "(MIT OR GPL-3.0)", + "version": "3.10.1", + "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.10.1.tgz", + "integrity": "sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==", "dependencies": { "lie": "~3.3.0", "pako": "~1.0.2", "readable-stream": "~2.3.6", - "set-immediate-shim": "~1.0.1" + "setimmediate": "^1.0.5" } }, "node_modules/karma": { @@ -22034,12 +22026,10 @@ "dev": true, "license": "ISC" }, - "node_modules/set-immediate-shim": { - "version": "1.0.1", - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } + "node_modules/setimmediate": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/setimmediate/-/setimmediate-1.0.5.tgz", + "integrity": "sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA==" }, "node_modules/setprototypeof": { "version": "1.2.0", @@ -25233,7 +25223,7 @@ "@fortawesome/react-fontawesome": "^0.2.0", "classnames": "^2.3.1", "event-target-shim": "^6.0.2", - "jszip": "3.2.2", + "jszip": "3.10.1", "lodash.debounce": "^4.0.8", "lodash.throttle": "^4.1.1", "memoize-one": "^5.1.1", @@ -27186,7 +27176,7 @@ "autoprefixer": "^10.4.8", "classnames": "^2.3.1", "event-target-shim": "^6.0.2", - "jszip": "3.2.2", + "jszip": "3.10.1", "lodash.debounce": "^4.0.8", "lodash.throttle": "^4.1.1", "memoize-one": "^5.1.1", @@ -30080,13 +30070,6 @@ "@types/json5": { "version": "0.0.29" }, - "@types/jszip": { - "version": "3.1.7", - "dev": true, - "requires": { - "@types/node": "*" - } - }, "@types/lodash": { "version": "4.14.182", "dev": true @@ -37791,12 +37774,14 @@ } }, "jszip": { - "version": "3.2.2", + "version": "3.10.1", + "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.10.1.tgz", + "integrity": "sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==", "requires": { "lie": "~3.3.0", "pako": "~1.0.2", "readable-stream": "~2.3.6", - "set-immediate-shim": "~1.0.1" + "setimmediate": "^1.0.5" } }, "karma": { @@ -41785,8 +41770,10 @@ "version": "2.0.0", "dev": true }, - "set-immediate-shim": { - "version": "1.0.1" + "setimmediate": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/setimmediate/-/setimmediate-1.0.5.tgz", + "integrity": "sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA==" }, "setprototypeof": { "version": "1.2.0", diff --git a/package.json b/package.json index e614f8ebe0..ff17b0b5d5 100644 --- a/package.json +++ b/package.json @@ -82,7 +82,6 @@ "@types/eslint": "^8.4.10", "@types/jest": "^29.2.5", "@types/jquery": "^3.5.14", - "@types/jszip": "3.1.7", "@types/lodash": "^4.14.182", "@types/lodash.clamp": "^4.0.6", "@types/lodash.debounce": "^4.0.6", diff --git a/packages/code-studio/package.json b/packages/code-studio/package.json index 7933ca3008..539642128c 100644 --- a/packages/code-studio/package.json +++ b/packages/code-studio/package.json @@ -39,7 +39,7 @@ "@fortawesome/react-fontawesome": "^0.2.0", "classnames": "^2.3.1", "event-target-shim": "^6.0.2", - "jszip": "3.2.2", + "jszip": "3.10.1", "lodash.debounce": "^4.0.8", "lodash.throttle": "^4.1.1", "memoize-one": "^5.1.1", @@ -58,10 +58,6 @@ "redux-thunk": "^2.4.1", "shortid": "^2.2.16" }, - "dependenciesComments": { - "@types/jszip": "3.1.7 is the closest to 3.2.2 available. JSZip adds official typings in 3.4.0, so remove if going past JSZip 3.4.0", - "jszip": "Pinned to 3.2.2 b/c 3.3.0+ breaks nodestream usage. Not fixed as of 3.5.0. https://github.com/Stuk/jszip/issues/663" - }, "homepage": ".", "main": "build/index.html", "files": [ diff --git a/packages/console/src/csv/CsvInputBar.tsx b/packages/console/src/csv/CsvInputBar.tsx index 3753d3462a..f0aa97a747 100644 --- a/packages/console/src/csv/CsvInputBar.tsx +++ b/packages/console/src/csv/CsvInputBar.tsx @@ -178,7 +178,7 @@ class CsvInputBar extends Component { } } - handleFile(file: Blob | JSZipObject, isZip = false): void { + handleFile(file: File | Blob | JSZipObject, isZip = false): void { log.info( `Starting CSV parser for ${ file instanceof File ? file.name : 'pasted values' diff --git a/packages/console/src/csv/CsvParser.ts b/packages/console/src/csv/CsvParser.ts index d34a652b80..79e40aec01 100644 --- a/packages/console/src/csv/CsvParser.ts +++ b/packages/console/src/csv/CsvParser.ts @@ -5,6 +5,7 @@ import type { IdeSession, Table } from '@deephaven/jsapi-types'; import type { JSZipObject } from 'jszip'; import CsvTypeParser from './CsvTypeParser'; import { CsvTypes } from './CsvFormats'; +import makeZipStreamHelper from './ZipStreamHelper'; const log = Log.module('CsvParser'); @@ -15,7 +16,7 @@ const ZIP_CONSOLIDATE_CHUNKS = 650; interface CsvParserConstructor { onFileCompleted: (tables: Table[]) => void; session: IdeSession; - file: Blob | JSZipObject; + file: File | Blob | JSZipObject; type: CsvTypes; readHeaders: boolean; onProgress: (progressValue: number) => boolean; @@ -70,6 +71,8 @@ class CsvParser { this.onProgress = onProgress; this.onError = onError; this.tables = []; + this.rowCount = 0; + this.rowsProcessed = 0; this.chunks = 0; this.totalChunks = isZip ? 0 @@ -102,7 +105,7 @@ class CsvParser { session: IdeSession; - file: Blob | JSZipObject; + file: File | Blob | JSZipObject; isZip: boolean; @@ -122,6 +125,10 @@ class CsvParser { types?: string[]; + rowCount: number; + + rowsProcessed: number; + chunks: number; totalChunks: number; @@ -167,17 +174,20 @@ class CsvParser { } parse(): void { - const handleParseDone = (types: string[]) => { - const toParse = this.isZip - ? (this.file as JSZipObject).nodeStream( - // JsZip types are incorrect, thus the funny casting - // Actual parameter is 'nodebuffer' - 'nodebuffer' as 'nodestream', - this.handleNodeUpdate - ) - : (this.file as Blob); + const handleParseDone = (types: string[], rowCount: number) => { this.types = types; - Papa.parse(toParse, this.config); + this.rowCount = rowCount; + + if (this.file instanceof File || this.file instanceof Blob) { + Papa.parse(this.file, this.config); + } else { + const zipStream = makeZipStreamHelper(this.file, this.handleNodeUpdate); + // This is actually a stream, but papaparse TS doesn't like it + Papa.parse(zipStream as unknown as Blob, this.config); + // The stream needs to be manually resumed since jszip starts paused + // Papaparse does not call resume and assumes the stream is already reading + zipStream.resume(); + } }; const typeParser = new CsvTypeParser( handleParseDone, @@ -274,6 +284,9 @@ class CsvParser { } assertNotNull(this.headers); assertNotNull(types); + + this.rowsProcessed += columns[0].length; + session .newTable(this.headers, types, columns, this.timeZone) .then(table => { @@ -294,7 +307,9 @@ class CsvParser { if (totalChunks > 0) { progress = Math.round((tables.length / totalChunks) * 50) + 50; } else { - progress = Math.round(50 + this.zipProgress / 2); + // The zip file can be read entirely while in the middle of parsing + // Since we know the number of rows from the type parsing, use that for progress + progress = Math.round((this.rowsProcessed / this.rowCount) * 50) + 50; } log.debug2(`CSV parser progress ${progress}`); onProgress(progress); diff --git a/packages/console/src/csv/CsvTypeParser.ts b/packages/console/src/csv/CsvTypeParser.ts index ab4131b445..b332a80591 100644 --- a/packages/console/src/csv/CsvTypeParser.ts +++ b/packages/console/src/csv/CsvTypeParser.ts @@ -4,6 +4,7 @@ import Papa, { Parser, ParseResult, ParseLocalConfig } from 'papaparse'; // Intentionally using isNaN rather than Number.isNaN /* eslint-disable no-restricted-globals */ import NewTableColumnTypes from './NewTableColumnTypes'; +import makeZipStreamHelper from './ZipStreamHelper'; // Initially column types start as unknown const UNKNOWN = 'unknown'; @@ -156,8 +157,8 @@ class CsvTypeParser { } constructor( - onFileCompleted: (types: string[]) => void, - file: Blob | JSZipObject, + onFileCompleted: (types: string[], rowCount: number) => void, + file: File | Blob | JSZipObject, readHeaders: boolean, parentConfig: ParseLocalConfig, nullString: string | null, @@ -175,6 +176,7 @@ class CsvTypeParser { this.onError = onError; this.chunks = 0; this.totalChunks = totalChunks; + this.rowCount = 0; this.isZip = isZip; this.shouldTrim = shouldTrim; this.zipProgress = 0; @@ -192,9 +194,9 @@ class CsvTypeParser { }; } - onFileCompleted: (types: string[]) => void; + onFileCompleted: (types: string[], rowCount: number) => void; - file: Blob | JSZipObject; + file: File | Blob | JSZipObject; readHeaders: boolean; @@ -210,6 +212,8 @@ class CsvTypeParser { totalChunks: number; + rowCount: number; + isZip: boolean; shouldTrim: boolean; @@ -219,15 +223,16 @@ class CsvTypeParser { config: ParseLocalConfig; parse(): void { - const toParse = this.isZip - ? (this.file as JSZipObject).nodeStream( - // JsZip types are incorrect, thus the funny casting - // Actual parameter is 'nodebuffer' - 'nodebuffer' as 'nodestream', - this.handleNodeUpdate - ) - : (this.file as Blob); - Papa.parse(toParse, this.config); + if (this.file instanceof File || this.file instanceof Blob) { + Papa.parse(this.file, this.config); + } else { + const zipStream = makeZipStreamHelper(this.file, this.handleNodeUpdate); + // This is actually a stream, but papaparse TS doesn't like it + Papa.parse(zipStream as unknown as Blob, this.config); + // The stream needs to be manually resumed since jszip starts paused + // Papaparse does not call resume and assumes the stream is already reading + zipStream.resume(); + } } handleChunk(result: ParseResult, parser: Parser): void { @@ -245,6 +250,8 @@ class CsvTypeParser { } } + this.rowCount += data.length; + assertNotNull(this.types); const cloneTypes = [...this.types]; @@ -294,7 +301,8 @@ class CsvTypeParser { type === UNKNOWN || type === NewTableColumnTypes.LOCAL_TIME ? NewTableColumnTypes.STRING : type - ) + ), + this.rowCount ); } } diff --git a/packages/console/src/csv/ZipStreamHelper.ts b/packages/console/src/csv/ZipStreamHelper.ts new file mode 100644 index 0000000000..1af2f2a048 --- /dev/null +++ b/packages/console/src/csv/ZipStreamHelper.ts @@ -0,0 +1,33 @@ +import type { JSZipObject, OnUpdateCallback, JSZipStreamHelper } from 'jszip'; + +/** + * This is used to help papaparse understand our stream. + * It uses these fields for feature detection, but never actually calls read() + * https://github.com/mholt/PapaParse/blob/master/papaparse.js#L244 + */ +interface ZipStreamHelper extends JSZipStreamHelper { + readable: boolean; + read(): void; + removeListener(): void; +} + +export default function makeZipStreamHelper( + zipObj: JSZipObject, + onUpdate: OnUpdateCallback +) { + const helper: ZipStreamHelper = ( + zipObj as JSZipObject & { + // The type could be anything except nodebuffer from https://stuk.github.io/jszip/documentation/api_zipobject/internal_stream.html + // We only need it as a string though + // JSZip types don't include this method for some reason + internalStream(type: 'string'): JSZipStreamHelper; + } + ).internalStream('string') as ZipStreamHelper; + + helper.readable = true; + helper.read = () => false; + helper.removeListener = () => false; + helper.on('data', (_, metadata) => onUpdate(metadata)); + + return helper; +}