Skip to content

Commit

Permalink
ARROW-4028: [Rust] Merge parquet-rs codebase
Browse files Browse the repository at this point in the history
This imports the parquet-rs source code into the Apache Arrow Rust implementation. Most of the source code is included, except for a few things such as `fuzz` and the benchmarks, which may be added later.

The module hierarchy now looks like:
- arrow: all the arrow code
- parquet: all the parquet code (in future, parquet-arrow integration will live here)
- util: common utility libraries shared between arrow and parquet (the utilities currently living in parquet are planned to move here in the future).

Author: Chao Sun <sunchao@uber.com>
Author: Chao Sun <sunchao@apache.org>

Closes #3050 from sunchao/import-parquet and squashes the following commits:

2ce98bd2a <Chao Sun> Update git submodule
2d296f8f7 <Chao Sun> ARROW-4028:  Merge parquet-rs codebase
  • Loading branch information
sunchao authored and wesm committed Dec 17, 2018
1 parent 660eb0c commit 6d12823
Show file tree
Hide file tree
Showing 52 changed files with 26,756 additions and 19 deletions.
3 changes: 3 additions & 0 deletions ci/rust-build-main.bat
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@

@rem The "main" Rust build script for Windows CI

@rem Retrieve git submodules, configure env var for Parquet unit tests
git submodule update --init || exit /B
set PARQUET_TEST_DATA=%CD%\cpp\submodules\parquet-testing\data
pushd rust

@echo ===================================
Expand Down
2 changes: 2 additions & 0 deletions ci/travis_script_rust.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

set -e

source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh

RUST_DIR=${TRAVIS_BUILD_DIR}/rust

pushd $RUST_DIR
Expand Down
2 changes: 2 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,8 @@ services:
build:
context: .
dockerfile: rust/Dockerfile
environment:
PARQUET_TEST_DATA: /arrow/cpp/submodules/parquet-testing/data
volumes: *ubuntu-volumes

r:
Expand Down
12 changes: 12 additions & 0 deletions rust/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,22 @@ serde_derive = "1.0.80"
serde_json = "1.0.13"
rand = "0.5"
csv = "1.0.0"
parquet-format = "2.5.0"
quick-error = "1.2.2"
byteorder = "1"
thrift = "0.0.4"
snap = "0.2"
brotli = "2.5"
flate2 = "1.0.2"
lz4 = "1.23"
zstd = "0.4"
chrono = "0.4"
num-bigint = "0.2"
num = "0.2"

[dev-dependencies]
criterion = "0.2"
lazy_static = "1"

[[bench]]
name = "array_from_vec"
Expand Down
1 change: 0 additions & 1 deletion rust/benches/array_from_vec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

#[macro_use]
extern crate criterion;

use criterion::Criterion;

extern crate arrow;
Expand Down
6 changes: 4 additions & 2 deletions rust/benches/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,13 @@ extern crate arrow;
extern crate criterion;
extern crate rand;

use arrow::builder::*;
use std::mem::size_of;

use criterion::*;
use rand::distributions::Standard;
use rand::{thread_rng, Rng};
use std::mem::size_of;

use arrow::builder::*;

// Build arrays with 512k elements.
const BATCH_SIZE: usize = 8 << 10;
Expand Down
43 changes: 43 additions & 0 deletions rust/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::process::Command;

/// Build-script entry point.
///
/// Emits `cargo:rustc-env` directives for `PARQUET_VERSION` and
/// `PARQUET_CREATED_BY`, plus `PARQUET_BUILD` when `git rev-parse HEAD`
/// succeeds. When the git lookup fails, the "created by" string simply
/// omits the build hash.
fn main() {
    let pkg_version = env!("CARGO_PKG_VERSION");

    // Ask git for the current commit; failure is tolerated.
    let mut git_rev_parse = Command::new("git");
    git_rev_parse.args(&["rev-parse", "HEAD"]);

    let created_by = match run(&mut git_rev_parse) {
        Ok(hash) => {
            println!("cargo:rustc-env=PARQUET_BUILD={}", hash);
            format!("parquet-rs version {} (build {})", pkg_version, hash)
        }
        Err(_) => format!("parquet-rs version {}", pkg_version),
    };

    println!("cargo:rustc-env=PARQUET_VERSION={}", pkg_version);
    println!("cargo:rustc-env=PARQUET_CREATED_BY={}", created_by);
}

/// Executes `command` and collects its output.
///
/// Returns the command's stdout (lossily decoded as UTF-8 and trimmed) when
/// the exit status reports success. Otherwise returns an error message that
/// includes the command and either its exit status or the error returned by
/// `Command::output`.
fn run(command: &mut Command) -> Result<String, String> {
    println!("Running: `{:?}`", command);

    let finished = command
        .output()
        .map_err(|error| format!("Failed: `{:?}` ({})", command, error))?;

    // The process ran but reported a non-success status.
    if !finished.status.success() {
        return Err(format!("Failed: `{:?}` ({})", command, finished.status));
    }

    let stdout = String::from_utf8_lossy(&finished.stdout);
    Ok(stdout.trim().to_string())
}
5 changes: 3 additions & 2 deletions rust/examples/read_csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@

extern crate arrow;

use std::fs::File;
use std::sync::Arc;

use arrow::array::{BinaryArray, Float64Array};
use arrow::csv;
use arrow::datatypes::{DataType, Field, Schema};
use std::fs::File;
use std::sync::Arc;

fn main() {
let schema = Schema::new(vec![
Expand Down
18 changes: 18 additions & 0 deletions rust/rustfmt.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

format_doc_comments = true
8 changes: 5 additions & 3 deletions rust/src/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -657,12 +657,14 @@ impl From<Vec<(Field, ArrayRef)>> for StructArray {
#[cfg(test)]
mod tests {
use super::*;

use std::sync::Arc;
use std::thread;

use crate::array_data::ArrayData;
use crate::buffer::Buffer;
use crate::datatypes::{DataType, Field, ToByteSlice};
use crate::datatypes::{DataType, Field};
use crate::memory;
use std::sync::Arc;
use std::thread;

#[test]
fn test_primitive_array_from_vec() {
Expand Down
3 changes: 2 additions & 1 deletion rust/src/array_data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -225,9 +225,10 @@ impl ArrayDataBuilder {

#[cfg(test)]
mod tests {
use super::*;

use std::sync::Arc;

use super::{ArrayData, DataType};
use crate::buffer::Buffer;
use crate::util::bit_util;

Expand Down
6 changes: 2 additions & 4 deletions rust/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -456,10 +456,10 @@ impl BinaryArrayBuilder {

#[cfg(test)]
mod tests {
use crate::array::Array;

use super::*;

use crate::array::Array;

#[test]
fn test_builder_i32_empty() {
let b = Int32BufferBuilder::new(5);
Expand Down Expand Up @@ -825,7 +825,6 @@ mod tests {

#[test]
fn test_binary_array_builder() {
use crate::array::BinaryArray;
let mut builder = BinaryArrayBuilder::new(20);

builder.push(b'h').unwrap();
Expand Down Expand Up @@ -860,7 +859,6 @@ mod tests {

#[test]
fn test_binary_array_builder_push_string() {
use crate::array::BinaryArray;
let mut builder = BinaryArrayBuilder::new(20);

let var = "hello".to_owned();
Expand Down
10 changes: 5 additions & 5 deletions rust/src/csv/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,16 @@
//! use std::sync::Arc;
//!
//! let schema = Schema::new(vec![
//! Field::new("city", DataType::Utf8, false),
//! Field::new("lat", DataType::Float64, false),
//! Field::new("lng", DataType::Float64, false),
//! Field::new("city", DataType::Utf8, false),
//! Field::new("lat", DataType::Float64, false),
//! Field::new("lng", DataType::Float64, false),
//! ]);
//!
//! let file = File::open("test/data/uk_cities.csv").unwrap();
//!
//! let mut csv = csv::Reader::new(file, Arc::new(schema), false, 1024, None);
//! let batch = csv.next().unwrap().unwrap();
//!```
//! ```

use std::fs::File;
use std::io::BufReader;
Expand Down Expand Up @@ -195,8 +195,8 @@ impl Reader {

#[cfg(test)]
mod tests {

use super::*;

use crate::array::*;
use crate::datatypes::Field;

Expand Down
6 changes: 6 additions & 0 deletions rust/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,12 @@
// specific language governing permissions and limitations
// under the License.

#![feature(type_ascription)]
#![feature(rustc_private)]
#![feature(specialization)]
#![feature(try_from)]
#![allow(dead_code)]
#![allow(non_camel_case_types)]

pub mod array;
pub mod array_data;
Expand All @@ -27,6 +32,7 @@ pub mod csv;
pub mod datatypes;
pub mod error;
pub mod memory;
pub mod parquet;
pub mod record_batch;
pub mod tensor;
pub mod util;
28 changes: 28 additions & 0 deletions rust/src/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Public submodule declarations.
// NOTE(review): `parquet` and `util`, which lib.rs declares, are not listed
// here — confirm whether this file is still needed alongside lib.rs.
pub mod array;
pub mod array_data;
pub mod bitmap;
pub mod buffer;
pub mod builder;
pub mod csv;
pub mod datatypes;
pub mod error;
pub mod memory;
pub mod record_batch;
pub mod tensor;
Loading

0 comments on commit 6d12823

Please sign in to comment.