Skip to content

Commit

Permalink
PARQUET-1365: Don't write page level statistics (#549)
Browse files Browse the repository at this point in the history
Page-level statistics were never used in production and became pointless once column indexes were added.
  • Loading branch information
gszadovszky authored and zivanfi committed Nov 19, 2018
1 parent 3201bd1 commit a69f2b3
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1334,12 +1334,13 @@ public void writeDataPageHeader(
writePageHeader(newDataPageHeader(uncompressedSize,
compressedSize,
valueCount,
new org.apache.parquet.column.statistics.BooleanStatistics(),
rlEncoding,
dlEncoding,
valuesEncoding), to);
}

// Statistics are no longer saved in page headers
@Deprecated
public void writeDataPageHeader(
int uncompressedSize,
int compressedSize,
Expand All @@ -1350,15 +1351,14 @@ public void writeDataPageHeader(
org.apache.parquet.column.Encoding valuesEncoding,
OutputStream to) throws IOException {
writePageHeader(
newDataPageHeader(uncompressedSize, compressedSize, valueCount, statistics,
newDataPageHeader(uncompressedSize, compressedSize, valueCount,
rlEncoding, dlEncoding, valuesEncoding),
to);
}

private PageHeader newDataPageHeader(
int uncompressedSize, int compressedSize,
int valueCount,
org.apache.parquet.column.statistics.Statistics statistics,
org.apache.parquet.column.Encoding rlEncoding,
org.apache.parquet.column.Encoding dlEncoding,
org.apache.parquet.column.Encoding valuesEncoding) {
Expand All @@ -1369,12 +1369,11 @@ private PageHeader newDataPageHeader(
getEncoding(valuesEncoding),
getEncoding(dlEncoding),
getEncoding(rlEncoding)));
if (!statistics.isEmpty()) {
pageHeader.getData_page_header().setStatistics(toParquetStatistics(statistics));
}
return pageHeader;
}

// Statistics are no longer saved in page headers
@Deprecated
public void writeDataPageV2Header(
int uncompressedSize, int compressedSize,
int valueCount, int nullCount, int rowCount,
Expand All @@ -1386,26 +1385,50 @@ public void writeDataPageV2Header(
newDataPageV2Header(
uncompressedSize, compressedSize,
valueCount, nullCount, rowCount,
statistics,
dataEncoding,
rlByteLength, dlByteLength), to);
}

/**
 * Builds a data page header from the given sizes, value count and encodings
 * and hands it to {@code writePageHeader} together with the target stream.
 * No page-level statistics are passed to the header builder.
 *
 * @param uncompressedSize uncompressed size of the page
 * @param compressedSize compressed size of the page
 * @param valueCount number of values in the page
 * @param rlEncoding encoding used for the repetition levels
 * @param dlEncoding encoding used for the definition levels
 * @param valuesEncoding encoding used for the values
 * @param to output stream handed to {@code writePageHeader}
 * @throws IOException if the underlying header write fails
 */
public void writeDataPageV1Header(
    int uncompressedSize,
    int compressedSize,
    int valueCount,
    org.apache.parquet.column.Encoding rlEncoding,
    org.apache.parquet.column.Encoding dlEncoding,
    org.apache.parquet.column.Encoding valuesEncoding,
    OutputStream to) throws IOException {
  writePageHeader(
      newDataPageHeader(
          uncompressedSize, compressedSize, valueCount,
          rlEncoding, dlEncoding, valuesEncoding),
      to);
}

/**
 * Builds a V2 data page header from the given sizes, counts, encoding and
 * level byte lengths and hands it to {@code writePageHeader} together with
 * the target stream. This overload takes no statistics argument, so none
 * are passed to the header builder.
 *
 * @param uncompressedSize uncompressed size of the page
 * @param compressedSize compressed size of the page
 * @param valueCount number of values in the page
 * @param nullCount number of nulls in the page
 * @param rowCount number of rows in the page
 * @param dataEncoding encoding used for the data
 * @param rlByteLength byte length of the repetition levels
 * @param dlByteLength byte length of the definition levels
 * @param to output stream handed to {@code writePageHeader}
 * @throws IOException if the underlying header write fails
 */
public void writeDataPageV2Header(
    int uncompressedSize,
    int compressedSize,
    int valueCount,
    int nullCount,
    int rowCount,
    org.apache.parquet.column.Encoding dataEncoding,
    int rlByteLength,
    int dlByteLength,
    OutputStream to) throws IOException {
  writePageHeader(
      newDataPageV2Header(
          uncompressedSize, compressedSize,
          valueCount, nullCount, rowCount,
          dataEncoding,
          rlByteLength, dlByteLength),
      to);
}

private PageHeader newDataPageV2Header(
int uncompressedSize, int compressedSize,
int valueCount, int nullCount, int rowCount,
org.apache.parquet.column.statistics.Statistics<?> statistics,
org.apache.parquet.column.Encoding dataEncoding,
int rlByteLength, int dlByteLength) {
// TODO: pageHeader.crc = ...;
DataPageHeaderV2 dataPageHeaderV2 = new DataPageHeaderV2(
valueCount, nullCount, rowCount,
getEncoding(dataEncoding),
dlByteLength, rlByteLength);
if (!statistics.isEmpty()) {
dataPageHeaderV2.setStatistics(
toParquetStatistics(statistics));
}
PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE_V2, uncompressedSize, compressedSize);
pageHeader.setData_page_header_v2(dataPageHeaderV2);
return pageHeader;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,11 +119,10 @@ public void writePage(BytesInput bytes,
+ compressedSize);
}
tempOutputStream.reset();
parquetMetadataConverter.writeDataPageHeader(
parquetMetadataConverter.writeDataPageV1Header(
(int)uncompressedSize,
(int)compressedSize,
valueCount,
statistics,
rlEncoding,
dlEncoding,
valuesEncoding,
Expand Down Expand Up @@ -171,7 +170,6 @@ public void writePageV2(
parquetMetadataConverter.writeDataPageV2Header(
uncompressedSize, compressedSize,
valueCount, nullCount, rowCount,
statistics,
dataEncoding,
rlByteLength,
dlByteLength,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,7 @@ public void writeDataPage(
long beforeHeader = out.getPos();
LOG.debug("{}: write data page: {} values", beforeHeader, valueCount);
int compressedPageSize = (int)bytes.size();
metadataConverter.writeDataPageHeader(
metadataConverter.writeDataPageV1Header(
uncompressedPageSize, compressedPageSize,
valueCount,
rlEncoding,
Expand Down Expand Up @@ -518,10 +518,9 @@ private void innerWriteDataPage(
}
LOG.debug("{}: write data page: {} values", beforeHeader, valueCount);
int compressedPageSize = (int) bytes.size();
metadataConverter.writeDataPageHeader(
metadataConverter.writeDataPageV1Header(
uncompressedPageSize, compressedPageSize,
valueCount,
statistics,
rlEncoding,
dlEncoding,
valuesEncoding,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,6 @@ public void test() throws Exception {
assertEquals(r, intValue(page.getRepetitionLevels()));
assertEquals(dataEncoding, page.getDataEncoding());
assertEquals(v, intValue(page.getData()));
assertEquals(statistics.toString(), page.getStatistics().toString());

// Checking column/offset indexes for the one page
ColumnChunkMetaData column = footer.getBlocks().get(0).getColumns().get(0);
Expand Down

0 comments on commit a69f2b3

Please sign in to comment.