Skip to content

Commit

Permalink
fix: Zip CSV uploads not working (#1457)
Browse files Browse the repository at this point in the history
Fixes #1080. Fixes #1416 

Updates jszip and uses the internal stream helper instead of nodestream
which we would need a polyfill for. The progress on zip uploads is a bit
odd since we use the unzip progress. It seems papaparse loads more from
the stream while processing/before processing and can finish reading the
zip before it's done processing chunks.

Also, uploading the tables seems to be a blocking operation. Not sure if
that's an easy fix, but after the 50% mark on uploading a large zip,
there are some blocks on the main thread (marching ants freezes as an
indicator).

There is a ~2GB limit for JSZip it seems
Stuk/jszip#777
  • Loading branch information
mattrunyon committed Aug 31, 2023
1 parent 6ff27a6 commit 08d0296
Show file tree
Hide file tree
Showing 7 changed files with 103 additions and 65 deletions.
49 changes: 18 additions & 31 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@
"@types/eslint": "^8.4.10",
"@types/jest": "^29.2.5",
"@types/jquery": "^3.5.14",
"@types/jszip": "3.1.7",
"@types/lodash": "^4.14.182",
"@types/lodash.clamp": "^4.0.6",
"@types/lodash.debounce": "^4.0.6",
Expand Down
6 changes: 1 addition & 5 deletions packages/code-studio/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
"@fortawesome/react-fontawesome": "^0.2.0",
"classnames": "^2.3.1",
"event-target-shim": "^6.0.2",
"jszip": "3.2.2",
"jszip": "3.10.1",
"lodash.debounce": "^4.0.8",
"lodash.throttle": "^4.1.1",
"memoize-one": "^5.1.1",
Expand All @@ -58,10 +58,6 @@
"redux-thunk": "^2.4.1",
"shortid": "^2.2.16"
},
"dependenciesComments": {
"@types/jszip": "3.1.7 is the closest to 3.2.2 available. JSZip adds official typings in 3.4.0, so remove if going past JSZip 3.4.0",
"jszip": "Pinned to 3.2.2 b/c 3.3.0+ breaks nodestream usage. Not fixed as of 3.5.0. https://github.com/Stuk/jszip/issues/663"
},
"homepage": ".",
"main": "build/index.html",
"files": [
Expand Down
2 changes: 1 addition & 1 deletion packages/console/src/csv/CsvInputBar.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ class CsvInputBar extends Component<CsvInputBarProps, CsvInputBarState> {
}
}

handleFile(file: Blob | JSZipObject, isZip = false): void {
handleFile(file: File | Blob | JSZipObject, isZip = false): void {
log.info(
`Starting CSV parser for ${
file instanceof File ? file.name : 'pasted values'
Expand Down
41 changes: 28 additions & 13 deletions packages/console/src/csv/CsvParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import type { IdeSession, Table } from '@deephaven/jsapi-types';
import type { JSZipObject } from 'jszip';
import CsvTypeParser from './CsvTypeParser';
import { CsvTypes } from './CsvFormats';
import makeZipStreamHelper from './ZipStreamHelper';

const log = Log.module('CsvParser');

Expand All @@ -15,7 +16,7 @@ const ZIP_CONSOLIDATE_CHUNKS = 650;
interface CsvParserConstructor {
onFileCompleted: (tables: Table[]) => void;
session: IdeSession;
file: Blob | JSZipObject;
file: File | Blob | JSZipObject;
type: CsvTypes;
readHeaders: boolean;
onProgress: (progressValue: number) => boolean;
Expand Down Expand Up @@ -70,6 +71,8 @@ class CsvParser {
this.onProgress = onProgress;
this.onError = onError;
this.tables = [];
this.rowCount = 0;
this.rowsProcessed = 0;
this.chunks = 0;
this.totalChunks = isZip
? 0
Expand Down Expand Up @@ -102,7 +105,7 @@ class CsvParser {

session: IdeSession;

file: Blob | JSZipObject;
file: File | Blob | JSZipObject;

isZip: boolean;

Expand All @@ -122,6 +125,10 @@ class CsvParser {

types?: string[];

rowCount: number;

rowsProcessed: number;

chunks: number;

totalChunks: number;
Expand Down Expand Up @@ -167,17 +174,20 @@ class CsvParser {
}

parse(): void {
const handleParseDone = (types: string[]) => {
const toParse = this.isZip
? (this.file as JSZipObject).nodeStream(
// JsZip types are incorrect, thus the funny casting
// Actual parameter is 'nodebuffer'
'nodebuffer' as 'nodestream',
this.handleNodeUpdate
)
: (this.file as Blob);
const handleParseDone = (types: string[], rowCount: number) => {
this.types = types;
Papa.parse(toParse, this.config);
this.rowCount = rowCount;

if (this.file instanceof File || this.file instanceof Blob) {
Papa.parse(this.file, this.config);
} else {
const zipStream = makeZipStreamHelper(this.file, this.handleNodeUpdate);
// This is actually a stream, but papaparse TS doesn't like it
Papa.parse(zipStream as unknown as Blob, this.config);
// The stream needs to be manually resumed since jszip starts paused
// Papaparse does not call resume and assumes the stream is already reading
zipStream.resume();
}
};
const typeParser = new CsvTypeParser(
handleParseDone,
Expand Down Expand Up @@ -274,6 +284,9 @@ class CsvParser {
}
assertNotNull(this.headers);
assertNotNull(types);

this.rowsProcessed += columns[0].length;

session
.newTable(this.headers, types, columns, this.timeZone)
.then(table => {
Expand All @@ -294,7 +307,9 @@ class CsvParser {
if (totalChunks > 0) {
progress = Math.round((tables.length / totalChunks) * 50) + 50;
} else {
progress = Math.round(50 + this.zipProgress / 2);
// The zip file can be read entirely while in the middle of parsing
// Since we know the number of rows from the type parsing, use that for progress
progress = Math.round((this.rowsProcessed / this.rowCount) * 50) + 50;
}
log.debug2(`CSV parser progress ${progress}`);
onProgress(progress);
Expand Down
36 changes: 22 additions & 14 deletions packages/console/src/csv/CsvTypeParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import Papa, { Parser, ParseResult, ParseLocalConfig } from 'papaparse';
// Intentionally using isNaN rather than Number.isNaN
/* eslint-disable no-restricted-globals */
import NewTableColumnTypes from './NewTableColumnTypes';
import makeZipStreamHelper from './ZipStreamHelper';

// Initially column types start as unknown
const UNKNOWN = 'unknown';
Expand Down Expand Up @@ -156,8 +157,8 @@ class CsvTypeParser {
}

constructor(
onFileCompleted: (types: string[]) => void,
file: Blob | JSZipObject,
onFileCompleted: (types: string[], rowCount: number) => void,
file: File | Blob | JSZipObject,
readHeaders: boolean,
parentConfig: ParseLocalConfig<unknown, Blob | NodeJS.ReadableStream>,
nullString: string | null,
Expand All @@ -175,6 +176,7 @@ class CsvTypeParser {
this.onError = onError;
this.chunks = 0;
this.totalChunks = totalChunks;
this.rowCount = 0;
this.isZip = isZip;
this.shouldTrim = shouldTrim;
this.zipProgress = 0;
Expand All @@ -192,9 +194,9 @@ class CsvTypeParser {
};
}

onFileCompleted: (types: string[]) => void;
onFileCompleted: (types: string[], rowCount: number) => void;

file: Blob | JSZipObject;
file: File | Blob | JSZipObject;

readHeaders: boolean;

Expand All @@ -210,6 +212,8 @@ class CsvTypeParser {

totalChunks: number;

rowCount: number;

isZip: boolean;

shouldTrim: boolean;
Expand All @@ -219,15 +223,16 @@ class CsvTypeParser {
config: ParseLocalConfig<unknown, Blob | NodeJS.ReadableStream>;

parse(): void {
const toParse = this.isZip
? (this.file as JSZipObject).nodeStream(
// JsZip types are incorrect, thus the funny casting
// Actual parameter is 'nodebuffer'
'nodebuffer' as 'nodestream',
this.handleNodeUpdate
)
: (this.file as Blob);
Papa.parse(toParse, this.config);
if (this.file instanceof File || this.file instanceof Blob) {
Papa.parse(this.file, this.config);
} else {
const zipStream = makeZipStreamHelper(this.file, this.handleNodeUpdate);
// This is actually a stream, but papaparse TS doesn't like it
Papa.parse(zipStream as unknown as Blob, this.config);
// The stream needs to be manually resumed since jszip starts paused
// Papaparse does not call resume and assumes the stream is already reading
zipStream.resume();
}
}

handleChunk(result: ParseResult<string[]>, parser: Parser): void {
Expand All @@ -245,6 +250,8 @@ class CsvTypeParser {
}
}

this.rowCount += data.length;

assertNotNull(this.types);

const cloneTypes = [...this.types];
Expand Down Expand Up @@ -294,7 +301,8 @@ class CsvTypeParser {
type === UNKNOWN || type === NewTableColumnTypes.LOCAL_TIME
? NewTableColumnTypes.STRING
: type
)
),
this.rowCount
);
}
}
Expand Down
33 changes: 33 additions & 0 deletions packages/console/src/csv/ZipStreamHelper.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import type { JSZipObject, OnUpdateCallback, JSZipStreamHelper } from 'jszip';

/**
* This is used to help papaparse understand our stream.
* It uses these fields for feature detection, but never actually calls read()
* https://github.com/mholt/PapaParse/blob/master/papaparse.js#L244
*/
interface ZipStreamHelper extends JSZipStreamHelper<string> {
readable: boolean;
read(): void;
removeListener(): void;
}

export default function makeZipStreamHelper(
zipObj: JSZipObject,
onUpdate: OnUpdateCallback
) {
const helper: ZipStreamHelper = (
zipObj as JSZipObject & {
// The type could be anything except nodebuffer from https://stuk.github.io/jszip/documentation/api_zipobject/internal_stream.html
// We only need it as a string though
// JSZip types don't include this method for some reason
internalStream(type: 'string'): JSZipStreamHelper<string>;
}
).internalStream('string') as ZipStreamHelper;

helper.readable = true;
helper.read = () => false;
helper.removeListener = () => false;
helper.on('data', (_, metadata) => onUpdate(metadata));

return helper;
}

0 comments on commit 08d0296

Please sign in to comment.