Skip to content

Commit

Permalink
refactor: move JSON creation to base class
Browse files Browse the repository at this point in the history
  • Loading branch information
dvirtz committed Feb 4, 2024
1 parent b902827 commit c7d381c
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 24 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ The following setting options are available:
|`parquet-viewer.logging.panel`|`false`|Whether to write diagnostic logs to an output panel|
|`parquet-viewer.logging.folder`|empty|Write diagnostic logs under the given directory|
|`parquet-viewer.logging.level`|info|Diagnostic log level. Choose between: `off`, `fatal`, `error`, `warn`, `info`, `debug` or `trace`|
|`parquet-viewer.jsonSpace`|0|JSON indentation space, passed to `JSON.stringify` as is, see [mdn](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/JSON/stringify#parameters) for details. Doesn't apply when `parquet-viewer.backend` is `parquet-tools`.|
|`parquet-viewer.jsonSpace`|0|JSON indentation space, passed to `JSON.stringify` as is, see [mdn](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/JSON/stringify#parameters) for details|
|`parquet-viewer.parquetToolsPath`|`parquet-tools`|The name of the parquet-tools executable or a path to the parquet-tools jar|

## Notes
Expand Down
17 changes: 2 additions & 15 deletions src/backends/arrow-backend.ts
Original file line number Diff line number Diff line change
@@ -1,30 +1,17 @@
import { RecordBatchReader } from 'apache-arrow/Arrow';
import { CancellationToken } from 'vscode';
import { ParquetBackend } from './parquet-backend';
import { jsonSpace } from '../settings';

/**
 * Maps a bigint onto a value that JSON can represent.
 *
 * @param value the bigint to convert
 * @returns a number when the value lies within the safe-integer range
 *          (inclusive on both ends), otherwise its decimal string form
 */
function bigIntToJson(value: bigint) {
  const upper = BigInt(Number.MAX_SAFE_INTEGER);
  const lower = BigInt(Number.MIN_SAFE_INTEGER);
  // only values inside [lower, upper] survive a round trip through Number
  const fitsInNumber = value <= upper && value >= lower;
  return fitsInNumber ? Number(value) : value.toString();
}

export abstract class ArrowBackend extends ParquetBackend {
abstract readParquet(path: string): Promise<RecordBatchReader>;

public async * toJsonImpl(parquetPath: string, _token?: CancellationToken): AsyncGenerator<string> {
public async * toJsonImpl(parquetPath: string, _token?: CancellationToken): AsyncGenerator<object> {
const batches = await this.readParquet(parquetPath);

// read all records from the file and print them
for await (const batch of batches) {
for await (const row of batch) {
yield JSON.stringify(row, (key, value) => {
return typeof value === 'bigint'
? bigIntToJson(value)
: value // return everything else unchanged
}, jsonSpace());
yield row;
}
}
}
Expand Down
18 changes: 16 additions & 2 deletions src/backends/parquet-backend.ts
Original file line number Diff line number Diff line change
@@ -1,17 +1,31 @@
import { CancellationToken } from 'vscode';
import { getLogger } from '../logger';
import { jsonSpace } from '../settings';

export abstract class ParquetBackend {
public async* toJson(parquetPath: string, token?: CancellationToken) {

/**
 * Turns a bigint into something JSON can carry.
 *
 * @param value the bigint to convert
 * @returns the decimal string when the value falls outside the safe-integer
 *          range, otherwise the equivalent number
 */
function bigIntToJson(value: bigint) {
  const aboveSafeRange = value > BigInt(Number.MAX_SAFE_INTEGER);
  const belowSafeRange = value < BigInt(Number.MIN_SAFE_INTEGER);
  if (aboveSafeRange || belowSafeRange) {
    // out of bounds for Number: keep every digit by emitting a string
    return value.toString();
  }
  return Number(value);
}

getLogger().info(`opening ${parquetPath}`)
for await (const line of this.toJsonImpl(parquetPath, token)) {
if (token?.isCancellationRequested) {
getLogger().info(`parsing ${parquetPath} was cancelled by user`);
break;
}
yield line;
yield JSON.stringify(line, (key, value) => {
return typeof value === 'bigint'
? bigIntToJson(value)
: value // return everything else unchanged
}, jsonSpace());
}
}

abstract toJsonImpl(parquetPath: string, token?: CancellationToken): AsyncGenerator<string>;
abstract toJsonImpl(parquetPath: string, token?: CancellationToken): AsyncGenerator<object>;
}
6 changes: 3 additions & 3 deletions src/backends/parquet-tools-backend.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { spawn } from "child_process";
import * as path from 'path';
import { strict as assert } from 'assert';
import { getLogger } from '../logger';
import { parquetTools as getParquetTools, jsonSpace } from '../settings';
import { parquetTools as getParquetTools } from '../settings';
import { createInterface } from 'readline';
import { ParquetBackend } from './parquet-backend';

Expand Down Expand Up @@ -54,9 +54,9 @@ export class ParquetToolsBackend extends ParquetBackend {
return [parquetTools];
}

public async * toJsonImpl(parquetPath: string, token?: vscode.CancellationToken): AsyncGenerator<string> {
public async * toJsonImpl(parquetPath: string, token?: vscode.CancellationToken): AsyncGenerator<object> {
for await (const line of ParquetToolsBackend.spawnParquetTools(['cat', '-j', parquetPath], token)) {
yield JSON.stringify(JSON.parse(line), null, jsonSpace());
yield JSON.parse(line);
}
}
}
5 changes: 2 additions & 3 deletions src/backends/parquets-backend.ts
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
import { CancellationToken } from 'vscode';
import { ParquetReader } from '@dvirtz/parquets';
import { ParquetBackend } from './parquet-backend';
import { jsonSpace } from '../settings';

export class ParquetsBackend extends ParquetBackend {
public async * toJsonImpl(parquetPath: string, _token?: CancellationToken): AsyncGenerator<string> {
public async * toJsonImpl(parquetPath: string, _token?: CancellationToken): AsyncGenerator<object> {
const reader = await ParquetReader.openFile(parquetPath);
const cursor = reader.getCursor();

// read all records from the file and print them
let record = null;
while ((record = await cursor.next())) {
yield JSON.stringify(record, null, jsonSpace());
yield record;
}

await reader.close();
Expand Down

0 comments on commit c7d381c

Please sign in to comment.