Skip to content

Commit

Permalink
PARQUET-416: C++11 compilation, code reorg, libparquet and installati…
Browse files Browse the repository at this point in the history
…on targets

Reorganize code into a top-level src/parquet directory, add a libparquet shared library, and add install targets for libparquet and its header files. Add a cpplint script and a `make lint` target for code linting.

Replaces earlier PR apache#13

Author: Wes McKinney <wes@cloudera.com>

Closes apache#14 from wesm/libparquet-library and squashes the following commits:

2e356fd [Wes McKinney] PARQUET-416: Compile with C++11 and replace usages of boost::shared_ptr with std::shared_ptr, plus other C++11 fixes. Reorganize code into a top-level src/parquet directory, add a libparquet shared library, and add install targets for libparquet and its header files. Add a cpplint script and a `make lint` target for code linting.

Change-Id: I4f9d8a35fc5878c621dfa94149dc5e99bf38e803
  • Loading branch information
wesm authored and nongli committed Jan 8, 2016
1 parent 974e111 commit 6119554
Show file tree
Hide file tree
Showing 27 changed files with 5,109 additions and 12 deletions.
18 changes: 18 additions & 0 deletions cpp/src/parquet/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Copyright 2015 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Install the top-level public header under include/parquet.
install(FILES parquet.h DESTINATION include/parquet)
30 changes: 30 additions & 0 deletions cpp/src/parquet/compression/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright 2012 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Static library containing the LZ4 and Snappy codec implementations, linked
# against the lz4static and snappystatic targets.
add_library(parquet_compression STATIC lz4-codec.cc snappy-codec.cc)
target_link_libraries(parquet_compression lz4static snappystatic)

# parquet_compression is a STATIC library, which CMake treats as an archive
# artifact: its output location is controlled by ARCHIVE_OUTPUT_DIRECTORY.
# LIBRARY_OUTPUT_DIRECTORY only applies to shared/module libraries; it is kept
# so the setting still holds if the library type is ever changed.
set_target_properties(parquet_compression
  PROPERTIES
  ARCHIVE_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}"
  LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}")

# Install the public compression header under include/parquet/compression.
install(FILES codec.h DESTINATION include/parquet/compression)
71 changes: 71 additions & 0 deletions cpp/src/parquet/compression/codec.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef PARQUET_COMPRESSION_CODEC_H
#define PARQUET_COMPRESSION_CODEC_H

#include "parquet/parquet.h"

#include <cstdint>
#include "parquet/thrift/parquet_constants.h"
#include "parquet/thrift/parquet_types.h"

namespace parquet_cpp {

// Abstract interface shared by all block compression codecs.
class Codec {
 public:
  virtual ~Codec() = default;

  // Decompresses the input_len bytes at input into output_buffer. output_len
  // is the length the caller supplies for the output side.
  virtual void Decompress(int input_len, const uint8_t* input,
      int output_len, uint8_t* output_buffer) = 0;

  // Compresses the input_len bytes at input into output_buffer, whose length
  // is passed as output_buffer_len. Returns an int (presumably the number of
  // compressed bytes written -- confirm against the implementations).
  virtual int Compress(int input_len, const uint8_t* input,
      int output_buffer_len, uint8_t* output_buffer) = 0;

  // Returns an int bound for compressing input_len bytes (presumably the
  // largest possible compressed size -- confirm against the implementations).
  virtual int MaxCompressedLen(int input_len, const uint8_t* input) = 0;

  // Name identifying the codec.
  virtual const char* name() const = 0;
};


// Codec implementation registered under the name "snappy". The non-inline
// members are defined out of line.
class SnappyCodec : public Codec {
 public:
  // Identifies this codec as "snappy".
  virtual const char* name() const { return "snappy"; }

  // Overrides Codec::MaxCompressedLen.
  virtual int MaxCompressedLen(int input_len, const uint8_t* input);

  // Overrides Codec::Compress.
  virtual int Compress(int input_len, const uint8_t* input,
      int output_buffer_len, uint8_t* output_buffer);

  // Overrides Codec::Decompress.
  virtual void Decompress(int input_len, const uint8_t* input,
      int output_len, uint8_t* output_buffer);
};

// Codec implementation registered under the name "lz4". The non-inline
// members are defined out of line.
class Lz4Codec : public Codec {
 public:
  // Identifies this codec as "lz4".
  virtual const char* name() const { return "lz4"; }

  // Overrides Codec::MaxCompressedLen.
  virtual int MaxCompressedLen(int input_len, const uint8_t* input);

  // Overrides Codec::Compress.
  virtual int Compress(int input_len, const uint8_t* input,
      int output_buffer_len, uint8_t* output_buffer);

  // Overrides Codec::Decompress.
  virtual void Decompress(int input_len, const uint8_t* input,
      int output_len, uint8_t* output_buffer);
};

} // namespace parquet_cpp

#endif
40 changes: 40 additions & 0 deletions cpp/src/parquet/compression/lz4-codec.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "parquet/compression/codec.h"

#include <lz4.h>

namespace parquet_cpp {

void Lz4Codec::Decompress(int input_len, const uint8_t* input,
int output_len, uint8_t* output_buffer) {
int n = LZ4_uncompress(reinterpret_cast<const char*>(input),
reinterpret_cast<char*>(output_buffer), output_len);
if (n != input_len) {
throw parquet_cpp::ParquetException("Corrupt lz4 compressed data.");
}
}

// Returns LZ4_compressBound(input_len). The input pointer is not examined.
int Lz4Codec::MaxCompressedLen(int input_len, const uint8_t* input) {
  const int bound = LZ4_compressBound(input_len);
  return bound;
}

// Compresses the input_len bytes at input into output_buffer with
// LZ4_compress and returns that call's result.
//
// NOTE(review): output_buffer_len is not consulted here, so the caller has to
// size output_buffer appropriately (presumably via MaxCompressedLen) -- confirm
// against callers.
int Lz4Codec::Compress(int input_len, const uint8_t* input,
    int output_buffer_len, uint8_t* output_buffer) {
  const char* src = reinterpret_cast<const char*>(input);
  char* dst = reinterpret_cast<char*>(output_buffer);
  const int compressed_len = LZ4_compress(src, dst, input_len);
  return compressed_len;
}

} // namespace parquet_cpp
42 changes: 42 additions & 0 deletions cpp/src/parquet/compression/snappy-codec.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "parquet/compression/codec.h"

#include <snappy.h>

namespace parquet_cpp {

// Decompresses the snappy stream of input_len bytes at input into
// output_buffer, which holds output_len bytes.
//
// The uncompressed length encoded in the stream is checked against output_len
// before decompressing, because RawUncompress writes that many bytes without
// knowing the size of the destination buffer.
//
// Throws ParquetException when a length is negative, the stream header cannot
// be parsed, the decoded data would not fit in output_buffer, or the stream is
// corrupt.
void SnappyCodec::Decompress(int input_len, const uint8_t* input,
    int output_len, uint8_t* output_buffer) {
  const char* src = reinterpret_cast<const char*>(input);
  const size_t src_len = static_cast<size_t>(input_len);

  size_t uncompressed_len = 0;
  if (input_len < 0 || output_len < 0 ||
      !snappy::GetUncompressedLength(src, src_len, &uncompressed_len) ||
      uncompressed_len > static_cast<size_t>(output_len)) {
    throw parquet_cpp::ParquetException("Corrupt snappy compressed data.");
  }

  if (!snappy::RawUncompress(src, src_len,
      reinterpret_cast<char*>(output_buffer))) {
    throw parquet_cpp::ParquetException("Corrupt snappy compressed data.");
  }
}

// Returns snappy::MaxCompressedLength(input_len) narrowed to int. The input
// pointer is not examined.
int SnappyCodec::MaxCompressedLen(int input_len, const uint8_t* input) {
  const size_t bound = snappy::MaxCompressedLength(input_len);
  return static_cast<int>(bound);
}

// Compresses the input_len bytes at input into output_buffer with
// snappy::RawCompress and returns the compressed length it reports, narrowed
// to int.
//
// NOTE(review): output_buffer_len is not consulted here, so the caller has to
// size output_buffer appropriately (presumably via MaxCompressedLen) -- confirm
// against callers.
int SnappyCodec::Compress(int input_len, const uint8_t* input,
    int output_buffer_len, uint8_t* output_buffer) {
  const char* src = reinterpret_cast<const char*>(input);
  char* dst = reinterpret_cast<char*>(output_buffer);

  size_t compressed_len;
  snappy::RawCompress(src, static_cast<size_t>(input_len), dst, &compressed_len);
  return static_cast<int>(compressed_len);
}

} // namespace parquet_cpp
24 changes: 24 additions & 0 deletions cpp/src/parquet/encodings/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Copyright 2015 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Install the public encoding headers under include/parquet/encodings.
install(
  FILES
    encodings.h
    bool-encoding.h
    delta-bit-pack-encoding.h
    delta-byte-array-encoding.h
    delta-length-byte-array-encoding.h
    dictionary-encoding.h
    plain-encoding.h
  DESTINATION
    include/parquet/encodings)
48 changes: 48 additions & 0 deletions cpp/src/parquet/encodings/bool-encoding.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef PARQUET_BOOL_ENCODING_H
#define PARQUET_BOOL_ENCODING_H

#include "parquet/encodings/encodings.h"

#include <algorithm>

namespace parquet_cpp {

// Decoder registered for BOOLEAN columns with PLAIN encoding. Values are read
// through an RleDecoder constructed with 1 as its third argument (presumably
// the bit width -- confirm against RleDecoder).
class BoolDecoder : public Decoder {
 public:
  BoolDecoder() : Decoder(parquet::Type::BOOLEAN, parquet::Encoding::PLAIN) { }

  // Points the decoder at a new buffer of len bytes holding num_values values.
  virtual void SetData(int num_values, const uint8_t* data, int len) {
    num_values_ = num_values;
    decoder_ = RleDecoder(data, len, 1);
  }

  // Reads up to max_values booleans into buffer, capped by the number of
  // values still pending, and returns how many were read. Calls
  // ParquetException::EofException() when the underlying reader runs dry.
  virtual int GetBool(bool* buffer, int max_values) {
    const int count = std::min(max_values, num_values_);
    int idx = 0;
    while (idx < count) {
      if (!decoder_.Get(buffer + idx)) ParquetException::EofException();
      ++idx;
    }
    num_values_ -= count;
    return count;
  }

 private:
  RleDecoder decoder_;
};

} // namespace parquet_cpp

#endif
116 changes: 116 additions & 0 deletions cpp/src/parquet/encodings/delta-bit-pack-encoding.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef PARQUET_DELTA_BIT_PACK_ENCODING_H
#define PARQUET_DELTA_BIT_PACK_ENCODING_H

#include "parquet/encodings/encodings.h"

#include <algorithm>
#include <vector>

namespace parquet_cpp {

class DeltaBitPackDecoder : public Decoder {
public:
explicit DeltaBitPackDecoder(const parquet::Type::type& type)
: Decoder(type, parquet::Encoding::DELTA_BINARY_PACKED) {
if (type != parquet::Type::INT32 && type != parquet::Type::INT64) {
throw ParquetException("Delta bit pack encoding should only be for integer data.");
}
}

virtual void SetData(int num_values, const uint8_t* data, int len) {
num_values_ = num_values;
decoder_ = BitReader(data, len);
values_current_block_ = 0;
values_current_mini_block_ = 0;
}

virtual int GetInt32(int32_t* buffer, int max_values) {
return GetInternal(buffer, max_values);
}

virtual int GetInt64(int64_t* buffer, int max_values) {
return GetInternal(buffer, max_values);
}

private:
void InitBlock() {
uint64_t block_size;
if (!decoder_.GetVlqInt(&block_size)) ParquetException::EofException();
if (!decoder_.GetVlqInt(&num_mini_blocks_)) ParquetException::EofException();
if (!decoder_.GetVlqInt(&values_current_block_)) {
ParquetException::EofException();
}
if (!decoder_.GetZigZagVlqInt(&last_value_)) ParquetException::EofException();
delta_bit_widths_.resize(num_mini_blocks_);

if (!decoder_.GetZigZagVlqInt(&min_delta_)) ParquetException::EofException();
for (int i = 0; i < num_mini_blocks_; ++i) {
if (!decoder_.GetAligned<uint8_t>(1, &delta_bit_widths_[i])) {
ParquetException::EofException();
}
}
values_per_mini_block_ = block_size / num_mini_blocks_;
mini_block_idx_ = 0;
delta_bit_width_ = delta_bit_widths_[0];
values_current_mini_block_ = values_per_mini_block_;
}

template <typename T>
int GetInternal(T* buffer, int max_values) {
max_values = std::min(max_values, num_values_);
for (int i = 0; i < max_values; ++i) {
if (UNLIKELY(values_current_mini_block_ == 0)) {
++mini_block_idx_;
if (mini_block_idx_ < delta_bit_widths_.size()) {
delta_bit_width_ = delta_bit_widths_[mini_block_idx_];
values_current_mini_block_ = values_per_mini_block_;
} else {
InitBlock();
buffer[i] = last_value_;
continue;
}
}

// TODO: the key to this algorithm is to decode the entire miniblock at once.
int64_t delta;
if (!decoder_.GetValue(delta_bit_width_, &delta)) ParquetException::EofException();
delta += min_delta_;
last_value_ += delta;
buffer[i] = last_value_;
--values_current_mini_block_;
}
num_values_ -= max_values;
return max_values;
}

BitReader decoder_;
uint64_t values_current_block_;
uint64_t num_mini_blocks_;
uint64_t values_per_mini_block_;
uint64_t values_current_mini_block_;

int64_t min_delta_;
int mini_block_idx_;
std::vector<uint8_t> delta_bit_widths_;
int delta_bit_width_;

int64_t last_value_;
};

} // namespace parquet_cpp

#endif
Loading

0 comments on commit 6119554

Please sign in to comment.