From ee8b5bf9c482e8af1a0fc6ce6bbd9d32b0949d89 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Mon, 24 May 2021 21:25:46 +0800 Subject: [PATCH] use prettier to format md files (#367) * use prettier to format md files * apply prettier * update ballista --- CODE_OF_CONDUCT.md | 4 +- DEVELOPERS.md | 72 +++++------ README.md | 115 ++++++++---------- ballista/README.md | 11 +- ballista/docs/README.md | 8 +- ballista/docs/architecture.md | 30 ++--- ballista/docs/dev-env.md | 5 +- ballista/docs/integration-testing.md | 5 +- ballista/rust/client/README.md | 2 +- ballista/rust/core/README.md | 1 + ballista/rust/executor/README.md | 3 +- ballista/rust/scheduler/README.md | 9 +- ballista/ui/scheduler/README.md | 4 + datafusion/docs/cli.md | 13 +- docs/user-guide/README.md | 3 +- docs/user-guide/src/SUMMARY.md | 6 +- .../src/distributed/client-python.md | 3 +- .../user-guide/src/distributed/client-rust.md | 5 +- docs/user-guide/src/distributed/clients.md | 1 + .../src/distributed/configuration.md | 12 +- docs/user-guide/src/distributed/deployment.md | 2 +- .../src/distributed/docker-compose.md | 10 +- .../src/distributed/introduction.md | 12 +- docs/user-guide/src/distributed/kubernetes.md | 49 ++++---- .../user-guide/src/distributed/raspberrypi.md | 31 ++--- docs/user-guide/src/distributed/standalone.md | 9 +- docs/user-guide/src/example-usage.md | 1 + docs/user-guide/src/faq.md | 9 +- docs/user-guide/src/introduction.md | 9 +- docs/user-guide/src/library.md | 1 + docs/user-guide/src/sql/select.md | 13 +- 31 files changed, 239 insertions(+), 219 deletions(-) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 2efe740b77c5..9a24b9b8a110 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -19,6 +19,6 @@ # Code of Conduct -* [Code of Conduct for The Apache Software Foundation][1] +- [Code of Conduct for The Apache Software Foundation][1] -[1]: https://www.apache.org/foundation/policies/conduct.html \ No newline at end of file +[1]: https://www.apache.org/foundation/policies/conduct.html diff --git a/DEVELOPERS.md b/DEVELOPERS.md index be8bb61b148f..60048c868e6c 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -21,57 +21,57 @@ This section describes how you can get started at developing DataFusion. -For information on developing with Ballista, see the -[Ballista developer documentation](ballista/docs/README.md). +For information on developing with Ballista, see the +[Ballista developer documentation](ballista/docs/README.md). ### Bootstrap environment DataFusion is written in Rust and it uses a standard rust toolkit: -* `cargo build` -* `cargo fmt` to format the code -* `cargo test` to test -* etc. +- `cargo build` +- `cargo fmt` to format the code +- `cargo test` to test +- etc. 
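For a first pass, the whole loop looks like this (a sketch; the clone URL is the project's GitHub repository, so adjust it to your fork as needed):

```bash
# Fetch the sources and run the standard checks
git clone https://github.com/apache/arrow-datafusion.git
cd arrow-datafusion

cargo fmt --all -- --check   # verify formatting without rewriting files
cargo build                  # compile the workspace
cargo test                   # run the unit and integration tests
```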
## How to add a new scalar function Below is a checklist of what you need to do to add a new scalar function to DataFusion: -* Add the actual implementation of the function: - * [here](datafusion/src/physical_plan/string_expressions.rs) for string functions - * [here](datafusion/src/physical_plan/math_expressions.rs) for math functions - * [here](datafusion/src/physical_plan/datetime_expressions.rs) for datetime functions - * create a new module [here](datafusion/src/physical_plan) for other functions -* In [src/physical_plan/functions](datafusion/src/physical_plan/functions.rs), add: - * a new variant to `BuiltinScalarFunction` - * a new entry to `FromStr` with the name of the function as called by SQL - * a new line in `return_type` with the expected return type of the function, given an incoming type - * a new line in `signature` with the signature of the function (number and types of its arguments) - * a new line in `create_physical_expr` mapping the built-in to the implementation - * tests to the function. -* In [tests/sql.rs](datafusion/tests/sql.rs), add a new test where the function is called through SQL against well known data and returns the expected result. -* In [src/logical_plan/expr](datafusion/src/logical_plan/expr.rs), add: - * a new entry of the `unary_scalar_expr!` macro for the new function. -* In [src/logical_plan/mod](datafusion/src/logical_plan/mod.rs), add: - * a new entry in the `pub use expr::{}` set. +- Add the actual implementation of the function: + - [here](datafusion/src/physical_plan/string_expressions.rs) for string functions + - [here](datafusion/src/physical_plan/math_expressions.rs) for math functions + - [here](datafusion/src/physical_plan/datetime_expressions.rs) for datetime functions + - create a new module [here](datafusion/src/physical_plan) for other functions +- In [src/physical_plan/functions](datafusion/src/physical_plan/functions.rs), add: + - a new variant to `BuiltinScalarFunction` + - a new entry to `FromStr` with the name of the function as called by SQL + - a new line in `return_type` with the expected return type of the function, given an incoming type + - a new line in `signature` with the signature of the function (number and types of its arguments) + - a new line in `create_physical_expr` mapping the built-in to the implementation + - tests to the function. +- In [tests/sql.rs](datafusion/tests/sql.rs), add a new test where the function is called through SQL against well known data and returns the expected result. +- In [src/logical_plan/expr](datafusion/src/logical_plan/expr.rs), add: + - a new entry of the `unary_scalar_expr!` macro for the new function. +- In [src/logical_plan/mod](datafusion/src/logical_plan/mod.rs), add: + - a new entry in the `pub use expr::{}` set. 
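To make the shape of these changes concrete, the following self-contained sketch shows the name-to-variant mapping step using stand-in types. The real `BuiltinScalarFunction` enum and the helpers in `functions.rs` are richer than this; it illustrates the pattern only, and `my_func` is a hypothetical name.

```rust
use std::str::FromStr;

// Stand-in for the real `BuiltinScalarFunction` enum; the new variant is
// added alongside the existing ones.
#[derive(Debug, PartialEq)]
enum BuiltinScalarFunction {
    Sqrt,
    MyFunc, // new variant for the function being added
}

// Mirrors the `FromStr` entry that maps the SQL-level name to the variant.
impl FromStr for BuiltinScalarFunction {
    type Err = String;

    fn from_str(name: &str) -> Result<Self, Self::Err> {
        match name {
            "sqrt" => Ok(BuiltinScalarFunction::Sqrt),
            "my_func" => Ok(BuiltinScalarFunction::MyFunc), // name as called by SQL
            other => Err(format!("unknown built-in function: {}", other)),
        }
    }
}

fn main() {
    // The SQL planner resolves function names through this mapping.
    assert_eq!(
        "my_func".parse::<BuiltinScalarFunction>(),
        Ok(BuiltinScalarFunction::MyFunc)
    );
}
```

The `return_type`, `signature`, and `create_physical_expr` entries follow the same shape: one new match arm per function. The aggregate-function checklist in the next section is wired up the same way against `BuiltinAggregateFunction` in `aggregates.rs`.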
## How to add a new aggregate function Below is a checklist of what you need to do to add a new aggregate function to DataFusion: -* Add the actual implementation of an `Accumulator` and `AggregateExpr`: - * [here](datafusion/src/physical_plan/string_expressions.rs) for string functions - * [here](datafusion/src/physical_plan/math_expressions.rs) for math functions - * [here](datafusion/src/physical_plan/datetime_expressions.rs) for datetime functions - * create a new module [here](datafusion/src/physical_plan) for other functions -* In [src/physical_plan/aggregates](datafusion/src/physical_plan/aggregates.rs), add: - * a new variant to `BuiltinAggregateFunction` - * a new entry to `FromStr` with the name of the function as called by SQL - * a new line in `return_type` with the expected return type of the function, given an incoming type - * a new line in `signature` with the signature of the function (number and types of its arguments) - * a new line in `create_aggregate_expr` mapping the built-in to the implementation - * tests to the function. -* In [tests/sql.rs](datafusion/tests/sql.rs), add a new test where the function is called through SQL against well known data and returns the expected result. +- Add the actual implementation of an `Accumulator` and `AggregateExpr`: + - [here](datafusion/src/physical_plan/string_expressions.rs) for string functions + - [here](datafusion/src/physical_plan/math_expressions.rs) for math functions + - [here](datafusion/src/physical_plan/datetime_expressions.rs) for datetime functions + - create a new module [here](datafusion/src/physical_plan) for other functions +- In [src/physical_plan/aggregates](datafusion/src/physical_plan/aggregates.rs), add: + - a new variant to `BuiltinAggregateFunction` + - a new entry to `FromStr` with the name of the function as called by SQL + - a new line in `return_type` with the expected return type of the function, given an incoming type + - a new line in `signature` with the signature of the function (number and types of its arguments) + - a new line in `create_aggregate_expr` mapping the built-in to the implementation + - tests to the function. +- In [tests/sql.rs](datafusion/tests/sql.rs), add a new test where the function is called through SQL against well known data and returns the expected result. ## How to display plans graphically diff --git a/README.md b/README.md index f72c73bb8037..f3ae412fde94 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ logical query plans as well as a query optimizer and execution engine capable of parallel execution against partitioned data sources (CSV and Parquet) using threads. -DataFusion also supports distributed query execution via the +DataFusion also supports distributed query execution via the [Ballista](ballista/README.md) crate. ## Use Cases @@ -42,24 +42,24 @@ the convenience of an SQL interface or a DataFrame API. ## Why DataFusion? -* *High Performance*: Leveraging Rust and Arrow's memory model, DataFusion achieves very high performance -* *Easy to Connect*: Being part of the Apache Arrow ecosystem (Arrow, Parquet and Flight), DataFusion works well with the rest of the big data ecosystem -* *Easy to Embed*: Allowing extension at almost any point in its design, DataFusion can be tailored for your specific usecase -* *High Quality*: Extensively tested, both by itself and with the rest of the Arrow ecosystem, DataFusion can be used as the foundation for production systems. 
+- _High Performance_: Leveraging Rust and Arrow's memory model, DataFusion achieves very high performance +- _Easy to Connect_: Being part of the Apache Arrow ecosystem (Arrow, Parquet and Flight), DataFusion works well with the rest of the big data ecosystem +- _Easy to Embed_: Allowing extension at almost any point in its design, DataFusion can be tailored for your specific usecase +- _High Quality_: Extensively tested, both by itself and with the rest of the Arrow ecosystem, DataFusion can be used as the foundation for production systems. ## Known Uses Here are some of the projects known to use DataFusion: -* [Ballista](ballista) Distributed Compute Platform -* [Cloudfuse Buzz](https://github.com/cloudfuse-io/buzz-rust) -* [Cube Store](https://github.com/cube-js/cube.js/tree/master/rust) -* [datafusion-python](https://pypi.org/project/datafusion) -* [delta-rs](https://github.com/delta-io/delta-rs) -* [InfluxDB IOx](https://github.com/influxdata/influxdb_iox) Time Series Database -* [ROAPI](https://github.com/roapi/roapi) -* [Tensorbase](https://github.com/tensorbase/tensorbase) -* [Squirtle](https://github.com/DSLAM-UMD/Squirtle) +- [Ballista](ballista) Distributed Compute Platform +- [Cloudfuse Buzz](https://github.com/cloudfuse-io/buzz-rust) +- [Cube Store](https://github.com/cube-js/cube.js/tree/master/rust) +- [datafusion-python](https://pypi.org/project/datafusion) +- [delta-rs](https://github.com/delta-io/delta-rs) +- [InfluxDB IOx](https://github.com/influxdata/influxdb_iox) Time Series Database +- [ROAPI](https://github.com/roapi/roapi) +- [Tensorbase](https://github.com/tensorbase/tensorbase) +- [Squirtle](https://github.com/DSLAM-UMD/Squirtle) (if you know of another project, please submit a PR to add a link!) @@ -122,8 +122,6 @@ Both of these examples will produce +---+--------+ ``` - - ## Using DataFusion as a library DataFusion is [published on crates.io](https://crates.io/crates/datafusion), and is [well documented on docs.rs](https://docs.rs/datafusion/). @@ -230,7 +228,6 @@ DataFusion also includes a simple command-line interactive SQL utility. See the - [x] Parquet primitive types - [ ] Parquet nested types - ## Extensibility DataFusion is designed to be extensible at all points. To that end, you can provide your own custom: @@ -242,26 +239,24 @@ DataFusion is designed to be extensible at all points. To that end, you can prov - [x] User Defined `LogicalPlan` nodes - [x] User Defined `ExecutionPlan` nodes - # Supported SQL This library currently supports many SQL constructs, including -* `CREATE EXTERNAL TABLE X STORED AS PARQUET LOCATION '...';` to register a table's locations -* `SELECT ... FROM ...` together with any expression -* `ALIAS` to name an expression -* `CAST` to change types, including e.g. `Timestamp(Nanosecond, None)` -* most mathematical unary and binary expressions such as `+`, `/`, `sqrt`, `tan`, `>=`. -* `WHERE` to filter -* `GROUP BY` together with one of the following aggregations: `MIN`, `MAX`, `COUNT`, `SUM`, `AVG` -* `ORDER BY` together with an expression and optional `ASC` or `DESC` and also optional `NULLS FIRST` or `NULLS LAST` - +- `CREATE EXTERNAL TABLE X STORED AS PARQUET LOCATION '...';` to register a table's locations +- `SELECT ... FROM ...` together with any expression +- `ALIAS` to name an expression +- `CAST` to change types, including e.g. `Timestamp(Nanosecond, None)` +- most mathematical unary and binary expressions such as `+`, `/`, `sqrt`, `tan`, `>=`. 
+- `WHERE` to filter
+- `GROUP BY` together with one of the following aggregations: `MIN`, `MAX`, `COUNT`, `SUM`, `AVG`
+- `ORDER BY` together with an expression and optional `ASC` or `DESC` and also optional `NULLS FIRST` or `NULLS LAST`

## Supported Functions

DataFusion strives to implement a subset of the [PostgreSQL SQL dialect](https://www.postgresql.org/docs/current/functions.html) where possible. We explicitly choose a single dialect to maximize interoperability with other tools and allow reuse of the PostgreSQL documents and tutorials as much as possible.

-Currently, only a subset of the PosgreSQL dialect is implemented, and we will document any deviations.
+Currently, only a subset of the PostgreSQL dialect is implemented, and we will document any deviations.

## Schema Metadata / Information Schema Support

@@ -269,8 +264,7 @@ DataFusion supports the showing metadata about the tables available. This inform

More information can be found in the [Postgres docs](https://www.postgresql.org/docs/13/infoschema-schema.html)).

-
-To show tables available for use in DataFusion, use the `SHOW TABLES` command or the `information_schema.tables` view:
+To show tables available for use in DataFusion, use the `SHOW TABLES` command or the `information_schema.tables` view:

```sql
> show tables;
+---------------+--------------------+------------+--------------+
@@ -291,7 +285,7 @@ To show tables available for use in DataFusion, use the `SHOW TABLES` command o
+---------------+--------------------+------------+--------------+
```

-To show the schema of a table in DataFusion, use the `SHOW COLUMNS` command or the or `information_schema.columns` view:
+To show the schema of a table in DataFusion, use the `SHOW COLUMNS` command or the `information_schema.columns` view:

```sql
> show columns from t;
@@ -313,8 +307,6 @@ To show the schema of a table in DataFusion, use the `SHOW COLUMNS` command or
+------------+-------------+------------------+-------------+-----------+
```

-
-
## Supported Data Types

DataFusion uses Arrow, and thus the Arrow type system, for query
@@ -322,41 +314,38 @@ execution.
The SQL types from
[sqlparser-rs](https://github.com/ballista-compute/sqlparser-rs/blob/main/src/ast/data_type.rs#L57)
are mapped to Arrow types according to the following table

-
-| SQL Data Type | Arrow DataType |
-| --------------- | -------------------------------- |
-| `CHAR` | `Utf8` |
-| `VARCHAR` | `Utf8` |
-| `UUID` | *Not yet supported* |
-| `CLOB` | *Not yet supported* |
-| `BINARY` | *Not yet supported* |
-| `VARBINARY` | *Not yet supported* |
-| `DECIMAL` | `Float64` |
-| `FLOAT` | `Float32` |
-| `SMALLINT` | `Int16` |
-| `INT` | `Int32` |
-| `BIGINT` | `Int64` |
-| `REAL` | `Float64` |
-| `DOUBLE` | `Float64` |
-| `BOOLEAN` | `Boolean` |
-| `DATE` | `Date32` |
-| `TIME` | `Time64(TimeUnit::Millisecond)` |
-| `TIMESTAMP` | `Date64` |
-| `INTERVAL` | *Not yet supported* |
-| `REGCLASS` | *Not yet supported* |
-| `TEXT` | *Not yet supported* |
-| `BYTEA` | *Not yet supported* |
-| `CUSTOM` | *Not yet supported* |
-| `ARRAY` | *Not yet supported* |
-
+| SQL Data Type | Arrow DataType |
+| ------------- | ------------------------------- |
+| `CHAR` | `Utf8` |
+| `VARCHAR` | `Utf8` |
+| `UUID` | _Not yet supported_ |
+| `CLOB` | _Not yet supported_ |
+| `BINARY` | _Not yet supported_ |
+| `VARBINARY` | _Not yet supported_ |
+| `DECIMAL` | `Float64` |
+| `FLOAT` | `Float32` |
+| `SMALLINT` | `Int16` |
+| `INT` | `Int32` |
+| `BIGINT` | `Int64` |
+| `REAL` | `Float64` |
+| `DOUBLE` | `Float64` |
+| `BOOLEAN` | `Boolean` |
+| `DATE` | `Date32` |
+| `TIME` | `Time64(TimeUnit::Millisecond)` |
+| `TIMESTAMP` | `Date64` |
+| `INTERVAL` | _Not yet supported_ |
+| `REGCLASS` | _Not yet supported_ |
+| `TEXT` | _Not yet supported_ |
+| `BYTEA` | _Not yet supported_ |
+| `CUSTOM` | _Not yet supported_ |
+| `ARRAY` | _Not yet supported_ |

# Architecture Overview

There is no formal document describing DataFusion's architecture yet, but the following presentations offer a good overview of its different components and how they interact together.

-* (March 2021): The DataFusion architecture is described in *Query Engine Design and the Rust-Based DataFusion in Apache Arrow*: [recording](https://www.youtube.com/watch?v=K6eCAVEk4kU) (DataFusion content starts ~ 15 minutes in) and [slides](https://www.slideshare.net/influxdata/influxdb-iox-tech-talks-query-engine-design-and-the-rustbased-datafusion-in-apache-arrow-244161934)
-* (Feburary 2021): How DataFusion is used within the Ballista Project is described in *Ballista: Distributed Compute with Rust and Apache Arrow: [recording](https://www.youtube.com/watch?v=ZZHQaOap9pQ)
-
+- (March 2021): The DataFusion architecture is described in _Query Engine Design and the Rust-Based DataFusion in Apache Arrow_: [recording](https://www.youtube.com/watch?v=K6eCAVEk4kU) (DataFusion content starts ~ 15 minutes in) and [slides](https://www.slideshare.net/influxdata/influxdb-iox-tech-talks-query-engine-design-and-the-rustbased-datafusion-in-apache-arrow-244161934)
+- (February 2021): How DataFusion is used within the Ballista Project is described in _Ballista: Distributed Compute with Rust and Apache Arrow_: [recording](https://www.youtube.com/watch?v=ZZHQaOap9pQ)

# Developer's guide

diff --git a/ballista/README.md b/ballista/README.md
index 276af3c4d9b2..038146a639ed 100644
--- a/ballista/README.md
+++ b/ballista/README.md
@@ -19,14 +19,14 @@

# Ballista: Distributed Compute with Apache Arrow

-Ballista is a distributed compute platform primarily implemented in Rust, and powered by Apache Arrow.
It is built -on an architecture that allows other programming languages (such as Python, C++, and Java) to be supported as +Ballista is a distributed compute platform primarily implemented in Rust, and powered by Apache Arrow. It is built +on an architecture that allows other programming languages (such as Python, C++, and Java) to be supported as first-class citizens without paying a penalty for serialization costs. The foundational technologies in Ballista are: - [Apache Arrow](https://arrow.apache.org/) memory model and compute kernels for efficient processing of data. -- [Apache Arrow Flight Protocol](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for efficient +- [Apache Arrow Flight Protocol](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for efficient data transfer between processes. - [Google Protocol Buffers](https://developers.google.com/protocol-buffers) for serializing query plans. - [Docker](https://www.docker.com/) for packaging up executors along with user-defined code. @@ -57,7 +57,6 @@ April 2021 and should be considered experimental. ## Getting Started -The [Ballista Developer Documentation](docs/README.md) and the -[DataFusion User Guide](https://github.com/apache/arrow-datafusion/tree/master/docs/user-guide) are currently the +The [Ballista Developer Documentation](docs/README.md) and the +[DataFusion User Guide](https://github.com/apache/arrow-datafusion/tree/master/docs/user-guide) are currently the best sources of information for getting started with Ballista. - diff --git a/ballista/docs/README.md b/ballista/docs/README.md index 6588c1d4d37b..38d3db5dff39 100644 --- a/ballista/docs/README.md +++ b/ballista/docs/README.md @@ -16,19 +16,19 @@ specific language governing permissions and limitations under the License. --> + # Ballista Developer Documentation -This directory contains documentation for developers that are contributing to Ballista. If you are looking for -end-user documentation for a published release, please start with the +This directory contains documentation for developers that are contributing to Ballista. If you are looking for +end-user documentation for a published release, please start with the [DataFusion User Guide](../../docs/user-guide) instead. ## Architecture & Design -- Read the [Architecture Overview](architecture.md) to get an understanding of the scheduler and executor +- Read the [Architecture Overview](architecture.md) to get an understanding of the scheduler and executor processes and how distributed query execution works. ## Build, Test, Release - Setting up a [development environment](dev-env.md). - [Integration Testing](integration-testing.md) - diff --git a/ballista/docs/architecture.md b/ballista/docs/architecture.md index a73b53a08701..04e1dc26bac1 100644 --- a/ballista/docs/architecture.md +++ b/ballista/docs/architecture.md @@ -16,37 +16,38 @@ specific language governing permissions and limitations under the License. --> + # Ballista Architecture ## Overview -Ballista allows queries to be executed in a distributed cluster. A cluster consists of one or +Ballista allows queries to be executed in a distributed cluster. A cluster consists of one or more scheduler processes and one or more executor processes. See the following sections in this document for more details about these components. 
-The scheduler accepts logical query plans and translates them into physical query plans using DataFusion and then
-runs a secondary planning/optimization process to translate the physical query plan into a distributed physical
-query plan.
+The scheduler accepts logical query plans and translates them into physical query plans using DataFusion and then
+runs a secondary planning/optimization process to translate the physical query plan into a distributed physical
+query plan.

-This process breaks a query down into a number of query stages that can be executed independently. There are
-dependencies between query stages and these dependencies form a directionally-acyclic graph (DAG) because a query
+This process breaks a query down into a number of query stages that can be executed independently. There are
+dependencies between query stages and these dependencies form a directed acyclic graph (DAG) because a query
 stage cannot start until its child query stages have completed.

-Each query stage has one or more partitions that can be processed in parallel by the available
+Each query stage has one or more partitions that can be processed in parallel by the available
 executors in the cluster. This is the basic unit of scalability in Ballista.

-The following diagram shows the flow of requests and responses between the client, scheduler, and executor
-processes.
+The following diagram shows the flow of requests and responses between the client, scheduler, and executor
+processes.

![Query Execution Flow](images/query-execution.png)

## Scheduler Process

-The scheduler process implements a gRPC interface (defined in
+The scheduler process implements a gRPC interface (defined in
[ballista.proto](../rust/ballista/proto/ballista.proto)). The interface provides the following methods:

| Method | Description |
-|----------------------|----------------------------------------------------------------------|
+| -------------------- | -------------------------------------------------------------------- |
| ExecuteQuery | Submit a logical query plan or SQL query for execution |
| GetExecutorsMetadata | Retrieves a list of executors that have registered with a scheduler |
| GetFileMetadata | Retrieve metadata about files available in the cluster file system |
@@ -60,7 +61,7 @@ The scheduler can run in standalone mode, or can be run in clustered mode using

The executor process implements the Apache Arrow Flight gRPC interface and is responsible for:

- Executing query stages and persisting the results to disk in Apache Arrow IPC Format
-- Making query stage results available as Flights so that they can be retrieved by other executors as well as by
+- Making query stage results available as Flights so that they can be retrieved by other executors as well as by
 clients

## Rust Client

@@ -69,7 +70,6 @@ The Rust client provides a DataFrame API that is a thin wrapper around the DataF
the means for a client to build a query plan for execution.

The client executes the query plan by submitting an `ExecuteLogicalPlan` request to the scheduler and then calls
-`GetJobStatus` to check for completion. On completion, the client receives a list of locations for the Flights
-containing the results for the query and will then connect to the appropriate executor processes to retrieve
+`GetJobStatus` to check for completion. On completion, the client receives a list of locations for the Flights
+containing the results for the query and will then connect to the appropriate executor processes to retrieve
 those results.
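The request/response flow described above can be sketched as follows. This is an illustrative model only: the struct, trait, and method names are hypothetical stand-ins for the generated gRPC client types, not the verbatim Ballista API.

```rust
use std::{thread, time::Duration};

// Hypothetical stand-ins for the scheduler's gRPC interface.
struct JobStatus {
    completed: bool,
    flight_locations: Vec<String>, // executor Flight endpoints holding result partitions
}

trait SchedulerClient {
    fn execute_logical_plan(&self, plan: &str) -> String; // returns a job id
    fn get_job_status(&self, job_id: &str) -> JobStatus;
}

// Submit a plan, poll until the job completes, then return the Flight
// locations from which the result partitions can be fetched.
fn run_query(client: &dyn SchedulerClient, plan: &str) -> Vec<String> {
    let job_id = client.execute_logical_plan(plan);
    loop {
        let status = client.get_job_status(&job_id);
        if status.completed {
            return status.flight_locations;
        }
        thread::sleep(Duration::from_millis(100));
    }
}

// A trivial mock so the sketch runs end to end.
struct MockScheduler;

impl SchedulerClient for MockScheduler {
    fn execute_logical_plan(&self, _plan: &str) -> String {
        "job-1".to_string()
    }
    fn get_job_status(&self, _job_id: &str) -> JobStatus {
        JobStatus {
            completed: true,
            flight_locations: vec!["grpc://executor-1:50051".to_string()],
        }
    }
}

fn main() {
    let locations = run_query(&MockScheduler, "SELECT a FROM t");
    println!("fetch result partitions from: {:?}", locations);
}
```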
- diff --git a/ballista/docs/dev-env.md b/ballista/docs/dev-env.md index bf50c9d9c913..f02d156d1ec8 100644 --- a/ballista/docs/dev-env.md +++ b/ballista/docs/dev-env.md @@ -16,13 +16,14 @@ specific language governing permissions and limitations under the License. --> + # Setting up a Rust development environment You will need a standard Rust development environment. The easiest way to achieve this is by using rustup: https://rustup.rs/ ## Install OpenSSL -Follow instructions for [setting up OpenSSL](https://docs.rs/openssl/0.10.28/openssl/). For Ubuntu users, the following +Follow instructions for [setting up OpenSSL](https://docs.rs/openssl/0.10.28/openssl/). For Ubuntu users, the following command works. ```bash @@ -35,4 +36,4 @@ You'll need cmake in order to compile some of ballista's dependencies. Ubuntu us ```bash sudo apt-get install cmake -``` \ No newline at end of file +``` diff --git a/ballista/docs/integration-testing.md b/ballista/docs/integration-testing.md index 3f818a4596b0..5c5b9ee9b656 100644 --- a/ballista/docs/integration-testing.md +++ b/ballista/docs/integration-testing.md @@ -16,10 +16,11 @@ specific language governing permissions and limitations under the License. --> + # Integration Testing -We use the [DataFusion Benchmarks](https://github.com/apache/arrow-datafusion/tree/master/benchmarks) for integration -testing. +We use the [DataFusion Benchmarks](https://github.com/apache/arrow-datafusion/tree/master/benchmarks) for integration +testing. The integration tests can be executed by running the following command from the root of the DataFusion repository. diff --git a/ballista/rust/client/README.md b/ballista/rust/client/README.md index 00bf3ea5ec6c..a9fbd8efa374 100644 --- a/ballista/rust/client/README.md +++ b/ballista/rust/client/README.md @@ -18,5 +18,5 @@ --> # Ballista - Rust -This crate contains the Ballista client library. For an example usage, please refer [here](../benchmarks/tpch/README.md). +This crate contains the Ballista client library. For an example usage, please refer [here](../benchmarks/tpch/README.md). diff --git a/ballista/rust/core/README.md b/ballista/rust/core/README.md index f97952b3f702..d51ae2fddbf6 100644 --- a/ballista/rust/core/README.md +++ b/ballista/rust/core/README.md @@ -18,4 +18,5 @@ --> # Ballista - Rust + This crate contains the core Ballista types. diff --git a/ballista/rust/executor/README.md b/ballista/rust/executor/README.md index c0824e639fdc..105e027c4887 100644 --- a/ballista/rust/executor/README.md +++ b/ballista/rust/executor/README.md @@ -18,6 +18,7 @@ --> # Ballista Executor - Rust + This crate contains the Ballista Executor. It can be used both as a library or as a binary. ## Run @@ -28,4 +29,4 @@ RUST_LOG=info cargo run --release [2021-02-11T05:30:13Z INFO executor] Running with config: ExecutorConfig { host: "localhost", port: 50051, work_dir: "/var/folders/y8/fc61kyjd4n53tn444n72rjrm0000gn/T/.tmpv1LjN0", concurrent_tasks: 4 } ``` -By default, the executor will bind to `localhost` and listen on port `50051`. \ No newline at end of file +By default, the executor will bind to `localhost` and listen on port `50051`. diff --git a/ballista/rust/scheduler/README.md b/ballista/rust/scheduler/README.md index d87eec30e23b..78a800092fda 100644 --- a/ballista/rust/scheduler/README.md +++ b/ballista/rust/scheduler/README.md @@ -18,6 +18,7 @@ --> # Ballista Scheduler + This crate contains the Ballista Scheduler. It can be used both as a library or as a binary. 
## Run

@@ -32,8 +33,9 @@ $ RUST_LOG=info cargo run --release

By default, the scheduler will bind to `localhost` and listen on port `50051`.

## Connecting to Scheduler
-Scheduler supports REST model also using content negotiation.
-For e.x if you want to get list of executors connected to the scheduler,
+
+The scheduler also supports a REST interface via content negotiation.
+For example, to get the list of executors connected to the scheduler,
 you can do (assuming you use default config)

```bash
curl --request GET \
  --url http://localhost:50050/executors
```

## Scheduler UI
-A basic ui for the scheduler is in `ui/scheduler` of the ballista repo.
+
+A basic UI for the scheduler is in `ui/scheduler` of the ballista repo.
 It can be started using the following [yarn](https://yarnpkg.com/) command

```bash
diff --git a/ballista/ui/scheduler/README.md b/ballista/ui/scheduler/README.md
index 90bc2bface4c..3fee499ead9c 100644
--- a/ballista/ui/scheduler/README.md
+++ b/ballista/ui/scheduler/README.md
@@ -22,7 +22,9 @@

## Start project from source

### Run scheduler/executor
+
First, run scheduler from project:
+
```shell
$ cd rust/scheduler
$ RUST_LOG=info cargo run --release
@@ -34,6 +36,7 @@ $ RUST_LOG=info cargo run --release
```

and run executor in new terminal:
+
```shell
$ cd rust/executor
$ RUST_LOG=info cargo run --release
@@ -44,6 +47,7 @@ $ RUST_LOG=info cargo run --release
```

### Run Client project
+
```shell
$ cd ui/scheduler
$ yarn
diff --git a/datafusion/docs/cli.md b/datafusion/docs/cli.md
index 27605b2e98c0..db3e0b939a81 100644
--- a/datafusion/docs/cli.md
+++ b/datafusion/docs/cli.md
@@ -45,16 +45,23 @@ docker run -it -v $(your_data_location):/data datafusion-cli

## Usage

```
+DataFusion 4.0.0-SNAPSHOT
+DataFusion is an in-memory query engine that uses Apache Arrow as the memory model. It supports executing SQL queries
+against CSV and Parquet files as well as querying directly against in-memory data.
+
USAGE:
-    datafusion-cli [OPTIONS]
+    datafusion-cli [FLAGS] [OPTIONS]

FLAGS:
    -h, --help       Prints help information
+    -q, --quiet      Reduce printing other than the results and work quietly
    -V, --version    Prints version information

OPTIONS:
-    -c, --batch-size <batch-size>    The batch size of each query, default value is 1048576
+    -c, --batch-size <batch-size>    The batch size of each query, or use DataFusion default
    -p, --data-path <data-path>      Path to your data, default to current directory
+    -f, --file <file>                Execute commands from file, then exit
+        --format <format>            Output format (possible values: table, csv, tsv, json) [default: table]
```

Type `exit` or `quit` to exit the CLI.

@@ -64,7 +71,7 @@ Type `exit` or `quit` to exit the CLI.

Parquet data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement. It is not necessary to provide schema information for Parquet files.

```sql
-CREATE EXTERNAL TABLE taxi
+CREATE EXTERNAL TABLE taxi
STORED AS PARQUET LOCATION '/mnt/nyctaxi/tripdata.parquet';
```
diff --git a/docs/user-guide/README.md b/docs/user-guide/README.md
index 0b9278c593b1..6698e5631d93 100644
--- a/docs/user-guide/README.md
+++ b/docs/user-guide/README.md
@@ -16,6 +16,7 @@ specific language governing permissions and limitations
under the License.
-->
+
# DataFusion User Guide Source

This directory contains the sources for the DataFusion user guide.
@@ -27,4 +28,4 @@ To generate the user guide in HTML format, run the following commands: ```bash cargo install mdbook mdbook build -``` \ No newline at end of file +``` diff --git a/docs/user-guide/src/SUMMARY.md b/docs/user-guide/src/SUMMARY.md index 903dad1732a1..aa101b3de117 100644 --- a/docs/user-guide/src/SUMMARY.md +++ b/docs/user-guide/src/SUMMARY.md @@ -16,12 +16,14 @@ specific language governing permissions and limitations under the License. --> + # Summary - [Introduction](introduction.md) -- [Example Usage](example-usage.md) +- [Example Usage](example-usage.md) - [Use as a Library](library.md) - [SQL Reference](sql/introduction.md) + - [SELECT](sql/select.md) - [DDL](sql/ddl.md) - [CREATE EXTERNAL TABLE](sql/ddl.md) @@ -36,4 +38,4 @@ - [Clients](distributed/clients.md) - [Rust](distributed/client-rust.md) - [Python](distributed/client-python.md) -- [Frequently Asked Questions](faq.md) \ No newline at end of file +- [Frequently Asked Questions](faq.md) diff --git a/docs/user-guide/src/distributed/client-python.md b/docs/user-guide/src/distributed/client-python.md index 7525c608ad23..dac06408bed0 100644 --- a/docs/user-guide/src/distributed/client-python.md +++ b/docs/user-guide/src/distributed/client-python.md @@ -16,6 +16,7 @@ specific language governing permissions and limitations under the License. --> + # Python -Coming soon. \ No newline at end of file +Coming soon. diff --git a/docs/user-guide/src/distributed/client-rust.md b/docs/user-guide/src/distributed/client-rust.md index 048c10fc9263..7f7ffcb95b82 100644 --- a/docs/user-guide/src/distributed/client-rust.md +++ b/docs/user-guide/src/distributed/client-rust.md @@ -16,7 +16,8 @@ specific language governing permissions and limitations under the License. --> + ## Ballista Rust Client -The Rust client supports a `DataFrame` API as well as SQL. See the -[TPC-H Benchmark Client](https://github.com/ballista-compute/ballista/tree/main/rust/benchmarks/tpch) for an example. \ No newline at end of file +The Rust client supports a `DataFrame` API as well as SQL. See the +[TPC-H Benchmark Client](https://github.com/ballista-compute/ballista/tree/main/rust/benchmarks/tpch) for an example. diff --git a/docs/user-guide/src/distributed/clients.md b/docs/user-guide/src/distributed/clients.md index 1e223dd8eb05..7b69f195b1a2 100644 --- a/docs/user-guide/src/distributed/clients.md +++ b/docs/user-guide/src/distributed/clients.md @@ -16,6 +16,7 @@ specific language governing permissions and limitations under the License. --> + ## Clients - [Rust](client-rust.md) diff --git a/docs/user-guide/src/distributed/configuration.md b/docs/user-guide/src/distributed/configuration.md index 52b05b0e9167..56ca9289511a 100644 --- a/docs/user-guide/src/distributed/configuration.md +++ b/docs/user-guide/src/distributed/configuration.md @@ -16,8 +16,10 @@ specific language governing permissions and limitations under the License. --> -# Configuration -The rust executor and scheduler can be configured using toml files, environment variables and command line arguments. The specification for config options can be found in `rust/ballista/src/bin/[executor|scheduler]_config_spec.toml`. + +# Configuration + +The rust executor and scheduler can be configured using toml files, environment variables and command line arguments. The specification for config options can be found in `rust/ballista/src/bin/[executor|scheduler]_config_spec.toml`. Those files fully define Ballista's configuration. 
If there is a discrepancy between this documentation and the files, assume those files are correct.

@@ -25,8 +27,8 @@ To get a list of command line arguments, run the binary with `--help`

There is an example config file at `ballista/rust/ballista/examples/example_executor_config.toml`

-The order of precedence for arguments is: default config file < environment variables < specified config file < command line arguments.
+The order of precedence for arguments is: default config file < environment variables < specified config file < command line arguments.

-The executor and scheduler will look for the default config file at `/etc/ballista/[executor|scheduler].toml` To specify a config file use the `--config-file` argument.
+The executor and scheduler will look for the default config file at `/etc/ballista/[executor|scheduler].toml`. To specify a config file, use the `--config-file` argument.

-Environment variables are prefixed by `BALLISTA_EXECUTOR` or `BALLISTA_SCHEDULER` for the executor and scheduler respectively. Hyphens in command line arguments become underscores. For example, the `--scheduler-host` argument for the executor becomes `BALLISTA_EXECUTOR_SCHEDULER_HOST` \ No newline at end of file
+Environment variables are prefixed by `BALLISTA_EXECUTOR` or `BALLISTA_SCHEDULER` for the executor and scheduler, respectively. Hyphens in command line arguments become underscores. For example, the `--scheduler-host` argument for the executor becomes `BALLISTA_EXECUTOR_SCHEDULER_HOST`
diff --git a/docs/user-guide/src/distributed/deployment.md b/docs/user-guide/src/distributed/deployment.md
index 2432f2bebb1a..3a00c96822e6 100644
--- a/docs/user-guide/src/distributed/deployment.md
+++ b/docs/user-guide/src/distributed/deployment.md
@@ -16,6 +16,7 @@ specific language governing permissions and limitations
under the License.
-->
+
# Deployment

Ballista is packaged as Docker images. Refer to the following guides to create a Ballista cluster:

- [Create a cluster using Docker](standalone.md)
- [Create a cluster using Docker Compose](docker-compose.md)
- [Create a cluster using Kubernetes](kubernetes.md)
-
diff --git a/docs/user-guide/src/distributed/docker-compose.md b/docs/user-guide/src/distributed/docker-compose.md
index 2548e57e5a75..de27364fc252 100644
--- a/docs/user-guide/src/distributed/docker-compose.md
+++ b/docs/user-guide/src/distributed/docker-compose.md
@@ -19,12 +19,12 @@

# Installing Ballista with Docker Compose

-Docker Compose is a convenient way to launch a cluister when testing locally. The following Docker Compose example
-demonstrates how to start a cluster using a single process that acts as both a scheduler and an executor, with a data
+Docker Compose is a convenient way to launch a cluster when testing locally. The following Docker Compose example
+demonstrates how to start a cluster using a single process that acts as both a scheduler and an executor, with a data
 volume mounted into the container so that Ballista can access the host file system.

```yaml
-version: '2.0'
+version: "2.0"
services:
  etcd:
    image: quay.io/coreos/etcd:v3.4.9
@@ -41,11 +41,9 @@ services:
    - "50051:50051"
    volumes:
    - ./data:/data
-
-
```

-With the above content saved to a `docker-compose.yaml` file, the following command can be used to start the single
+With the above content saved to a `docker-compose.yaml` file, the following command can be used to start the single
node cluster.
```bash diff --git a/docs/user-guide/src/distributed/introduction.md b/docs/user-guide/src/distributed/introduction.md index 59d7a1a2a5c1..0d96c2621958 100644 --- a/docs/user-guide/src/distributed/introduction.md +++ b/docs/user-guide/src/distributed/introduction.md @@ -19,7 +19,7 @@ ## Overview -Ballista is a distributed compute platform primarily implemented in Rust, and powered by Apache Arrow. It is +Ballista is a distributed compute platform primarily implemented in Rust, and powered by Apache Arrow. It is built on an architecture that allows other programming languages to be supported as first-class citizens without paying a penalty for serialization costs. @@ -41,12 +41,12 @@ The following diagram highlights some of the integrations that will be possible Although Ballista is largely inspired by Apache Spark, there are some key differences. - The choice of Rust as the main execution language means that memory usage is deterministic and avoids the overhead of GC pauses. -- Ballista is designed from the ground up to use columnar data, enabling a number of efficiencies such as vectorized -processing (SIMD and GPU) and efficient compression. Although Spark does have some columnar support, it is still -largely row-based today. +- Ballista is designed from the ground up to use columnar data, enabling a number of efficiencies such as vectorized + processing (SIMD and GPU) and efficient compression. Although Spark does have some columnar support, it is still + largely row-based today. - The combination of Rust and Arrow provides excellent memory efficiency and memory usage can be 5x - 10x lower than Apache Spark in some cases, which means that more processing can fit on a single node, reducing the overhead of distributed compute. - The use of Apache Arrow as the memory model and network protocol means that data can be exchanged between executors in any programming language with minimal serialization overhead. - + ## Status -Ballista is at the proof-of-concept phase currently but is under active development by a growing community. \ No newline at end of file +Ballista is at the proof-of-concept phase currently but is under active development by a growing community. diff --git a/docs/user-guide/src/distributed/kubernetes.md b/docs/user-guide/src/distributed/kubernetes.md index 027a44d46968..7b9b356dfa42 100644 --- a/docs/user-guide/src/distributed/kubernetes.md +++ b/docs/user-guide/src/distributed/kubernetes.md @@ -16,6 +16,7 @@ specific language governing permissions and limitations under the License. --> + # Deploying Ballista with Kubernetes Ballista can be deployed to any Kubernetes cluster using the following instructions. These instructions assume that @@ -32,15 +33,15 @@ The k8s deployment consists of: Ballista is at an early stage of development and therefore has some significant limitations: -- There is no support for shared object stores such as S3. All data must exist locally on each node in the +- There is no support for shared object stores such as S3. All data must exist locally on each node in the cluster, including where any client process runs. -- Only a single scheduler instance is currently supported unless the scheduler is configured to use `etcd` as a +- Only a single scheduler instance is currently supported unless the scheduler is configured to use `etcd` as a backing store. 
-## Create Persistent Volume and Persistent Volume Claim +## Create Persistent Volume and Persistent Volume Claim -Copy the following yaml to a `pv.yaml` file and apply to the cluster to create a persistent volume and a persistent -volume claim so that the specified host directory is available to the containers. This is where any data should be +Copy the following yaml to a `pv.yaml` file and apply to the cluster to create a persistent volume and a persistent +volume claim so that the specified host directory is available to the containers. This is where any data should be located so that Ballista can execute queries against it. ```yaml @@ -121,20 +122,20 @@ spec: ballista-cluster: ballista spec: containers: - - name: ballista-scheduler - image: ballistacompute/ballista-rust:0.4.2-SNAPSHOT - command: ["/scheduler"] - args: ["--port=50050"] - ports: - - containerPort: 50050 - name: flight - volumeMounts: - - mountPath: /mnt - name: data + - name: ballista-scheduler + image: ballistacompute/ballista-rust:0.4.2-SNAPSHOT + command: ["/scheduler"] + args: ["--port=50050"] + ports: + - containerPort: 50050 + name: flight + volumeMounts: + - mountPath: /mnt + name: data volumes: - - name: data - persistentVolumeClaim: - claimName: data-pv-claim + - name: data + persistentVolumeClaim: + claimName: data-pv-claim --- apiVersion: apps/v1 kind: StatefulSet @@ -156,12 +157,18 @@ spec: - name: ballista-executor image: ballistacompute/ballista-rust:0.4.2-SNAPSHOT command: ["/executor"] - args: ["--port=50051", "--scheduler-host=ballista-scheduler", "--scheduler-port=50050", "--external-host=$(MY_POD_IP)"] + args: + [ + "--port=50051", + "--scheduler-host=ballista-scheduler", + "--scheduler-port=50050", + "--external-host=$(MY_POD_IP)", + ] env: - name: MY_POD_IP valueFrom: fieldRef: - fieldPath: status.podIP + fieldPath: status.podIP ports: - containerPort: 50051 name: flight @@ -212,4 +219,4 @@ Run the following kubectl command to delete the cluster. ```bash kubectl delete -f cluster.yaml -``` \ No newline at end of file +``` diff --git a/docs/user-guide/src/distributed/raspberrypi.md b/docs/user-guide/src/distributed/raspberrypi.md index d4d2079bb5cc..c7e429aec3c8 100644 --- a/docs/user-guide/src/distributed/raspberrypi.md +++ b/docs/user-guide/src/distributed/raspberrypi.md @@ -16,31 +16,32 @@ specific language governing permissions and limitations under the License. --> + # Running Ballista on Raspberry Pi The Raspberry Pi single-board computer provides a fun and relatively inexpensive way to get started with distributed computing. -These instructions have been tested using an Ubuntu Linux desktop as the host, and a +These instructions have been tested using an Ubuntu Linux desktop as the host, and a [Raspberry Pi 4 Model B](https://www.raspberrypi.org/products/raspberry-pi-4-model-b/) with 4 GB RAM as the target. ## Preparing the Raspberry Pi We recommend installing the 64-bit version of [Ubuntu for Raspberry Pi](https://ubuntu.com/raspberry-pi). -The Rust implementation of Arrow does not work correctly on 32-bit ARM architectures +The Rust implementation of Arrow does not work correctly on 32-bit ARM architectures ([issue](https://github.com/apache/arrow-rs/issues/109)). ## Cross Compiling DataFusion for the Raspberry Pi -We do not yet publish official Docker images as part of the release process, although we do plan to do this in the -future ([issue #228](https://github.com/apache/arrow-datafusion/issues/228)). 
+We do not yet publish official Docker images as part of the release process, although we do plan to do this in the +future ([issue #228](https://github.com/apache/arrow-datafusion/issues/228)). -Although it is technically possible to build DataFusion directly on a Raspberry Pi, it really isn't very practical. -It is much faster to use [cross](https://github.com/rust-embedded/cross) to cross-compile from a more powerful +Although it is technically possible to build DataFusion directly on a Raspberry Pi, it really isn't very practical. +It is much faster to use [cross](https://github.com/rust-embedded/cross) to cross-compile from a more powerful desktop computer. -Docker must be installed and the Docker daemon must be running before cross-compiling with cross. See the +Docker must be installed and the Docker daemon must be running before cross-compiling with cross. See the [cross](https://github.com/rust-embedded/cross) project for more detailed instructions. Run the following command to install cross. @@ -63,9 +64,9 @@ cross test --target aarch64-unknown-linux-gnu ## Deploying the binaries to Raspberry Pi -You should now be able to copy the executable to the Raspberry Pi using scp on Linux. You will need to change the IP -address in these commands to be the IP address for your Raspberry Pi. The easiest way to find this is to connect a -keyboard and monitor to the Pi and run `ifconfig`. +You should now be able to copy the executable to the Raspberry Pi using scp on Linux. You will need to change the IP +address in these commands to be the IP address for your Raspberry Pi. The easiest way to find this is to connect a +keyboard and monitor to the Pi and run `ifconfig`. ```bash scp ./target/aarch64-unknown-linux-gnu/release/ballista-scheduler ubuntu@10.0.0.186: @@ -83,9 +84,9 @@ It is now possible to run the Ballista scheduler and executor natively on the Pi ## Docker -Using Docker's `buildx` cross-platform functionality, we can also build a docker image targeting ARM64 -from any desktop environment. This will require write access to a Docker repository -on [Docker Hub](https://hub.docker.com/) because the resulting Docker image will be pushed directly +Using Docker's `buildx` cross-platform functionality, we can also build a docker image targeting ARM64 +from any desktop environment. This will require write access to a Docker repository +on [Docker Hub](https://hub.docker.com/) because the resulting Docker image will be pushed directly to the repo. ```bash @@ -118,11 +119,11 @@ docker run -it myrepo/ballista-arm64 \ --concurrency=24 --iterations=1 --debug --host=ballista-scheduler --port=50050 ``` -Note that it will be necessary to mount appropriate volumes into the containers and also configure networking +Note that it will be necessary to mount appropriate volumes into the containers and also configure networking so that the Docker containers can communicate with each other. This can be achieved using Docker compose or Kubernetes. ## Kubernetes With Docker images built using the instructions above, it is now possible to deploy Ballista to a Kubernetes cluster running on one of more Raspberry Pi computers. Refer to the instructions in the [Kubernetes](kubernetes.md) chapter -for more information, and remember to change the Docker image name to `myrepo/ballista-arm64`. \ No newline at end of file +for more information, and remember to change the Docker image name to `myrepo/ballista-arm64`. 
diff --git a/docs/user-guide/src/distributed/standalone.md b/docs/user-guide/src/distributed/standalone.md index e4c24fedd319..e9db425dc111 100644 --- a/docs/user-guide/src/distributed/standalone.md +++ b/docs/user-guide/src/distributed/standalone.md @@ -16,6 +16,7 @@ specific language governing permissions and limitations under the License. --> + ## Deploying a standalone Ballista cluster ### Start a Scheduler @@ -50,7 +51,7 @@ Start one or more executor processes. Each executor process will need to listen ```bash docker run --network=host \ -d ballistacompute/ballista-rust:0.4.2-SNAPSHOT \ - /executor --external-host localhost --port 50051 + /executor --external-host localhost --port 50051 ``` Use `docker ps` to check that both the scheduer and executor(s) are now running: @@ -71,14 +72,14 @@ $ docker logs 0746ce262a19 [2021-02-14T18:36:25Z INFO executor] Starting registration with scheduler ``` -The external host and port will be registered with the scheduler. The executors will discover other executors by +The external host and port will be registered with the scheduler. The executors will discover other executors by requesting a list of executors from the scheduler. ### Using etcd as backing store _NOTE: This functionality is currently experimental_ -Ballista can optionally use [etcd](https://etcd.io/) as a backing store for the scheduler. +Ballista can optionally use [etcd](https://etcd.io/) as a backing store for the scheduler. ```bash docker run --network=host \ @@ -88,5 +89,5 @@ docker run --network=host \ --etcd-urls etcd:2379 ``` -Please refer to the [etcd](https://etcd.io/) web site for installation instructions. Etcd version 3.4.9 or later is +Please refer to the [etcd](https://etcd.io/) web site for installation instructions. Etcd version 3.4.9 or later is recommended. diff --git a/docs/user-guide/src/example-usage.md b/docs/user-guide/src/example-usage.md index ff23c96de362..2ea7dca9bded 100644 --- a/docs/user-guide/src/example-usage.md +++ b/docs/user-guide/src/example-usage.md @@ -16,6 +16,7 @@ specific language governing permissions and limitations under the License. --> + # Example Usage Run a SQL query against data stored in a CSV: diff --git a/docs/user-guide/src/faq.md b/docs/user-guide/src/faq.md index b73a376988b5..5e1a72c0d033 100644 --- a/docs/user-guide/src/faq.md +++ b/docs/user-guide/src/faq.md @@ -16,6 +16,7 @@ specific language governing permissions and limitations under the License. --> + # Frequently Asked Questions ## What is the relationship between Apache Arrow, DataFusion, and Ballista? @@ -23,9 +24,9 @@ Apache Arrow is a library which provides a standardized memory representation for columnar data. It also provides "kernels" for performing common operations on this data. -DataFusion is a library for executing queries in-process using the Apache Arrow memory -model and computational kernels. It is designed to run within a single process, using threads -for parallel query execution. +DataFusion is a library for executing queries in-process using the Apache Arrow memory +model and computational kernels. It is designed to run within a single process, using threads +for parallel query execution. Ballista is a distributed compute platform design to leverage DataFusion and other query -execution libraries. \ No newline at end of file +execution libraries. 
diff --git a/docs/user-guide/src/introduction.md b/docs/user-guide/src/introduction.md index c67fb90103d8..7ba3c963cc86 100644 --- a/docs/user-guide/src/introduction.md +++ b/docs/user-guide/src/introduction.md @@ -37,8 +37,7 @@ the convenience of an SQL interface or a DataFrame API. ## Why DataFusion? -* *High Performance*: Leveraging Rust and Arrow's memory model, DataFusion achieves very high performance -* *Easy to Connect*: Being part of the Apache Arrow ecosystem (Arrow, Parquet and Flight), DataFusion works well with the rest of the big data ecosystem -* *Easy to Embed*: Allowing extension at almost any point in its design, DataFusion can be tailored for your specific usecase -* *High Quality*: Extensively tested, both by itself and with the rest of the Arrow ecosystem, DataFusion can be used as the foundation for production systems. - +- _High Performance_: Leveraging Rust and Arrow's memory model, DataFusion achieves very high performance +- _Easy to Connect_: Being part of the Apache Arrow ecosystem (Arrow, Parquet and Flight), DataFusion works well with the rest of the big data ecosystem +- _Easy to Embed_: Allowing extension at almost any point in its design, DataFusion can be tailored for your specific usecase +- _High Quality_: Extensively tested, both by itself and with the rest of the Arrow ecosystem, DataFusion can be used as the foundation for production systems. diff --git a/docs/user-guide/src/library.md b/docs/user-guide/src/library.md index 12879b160c8f..d35a4b74bbb8 100644 --- a/docs/user-guide/src/library.md +++ b/docs/user-guide/src/library.md @@ -16,6 +16,7 @@ specific language governing permissions and limitations under the License. --> + # Using DataFusion as a library DataFusion is [published on crates.io](https://crates.io/crates/datafusion), and is [well documented on docs.rs](https://docs.rs/datafusion/). diff --git a/docs/user-guide/src/sql/select.md b/docs/user-guide/src/sql/select.md index 78d0cb58531d..348ffff2887f 100644 --- a/docs/user-guide/src/sql/select.md +++ b/docs/user-guide/src/sql/select.md @@ -20,7 +20,7 @@ # SELECT syntax The queries in DataFusion scan data from tables and return 0 or more rows. -In this documentation we describe the SQL syntax in DataFusion. +In this documentation we describe the SQL syntax in DataFusion. DataFusion supports the following syntax for queries: @@ -32,7 +32,7 @@ DataFusion supports the following syntax for queries: [ [GROUP BY](#group-by-clause) grouping_element [, ...] ]
[ [HAVING](#having-clause) condition]
[ [UNION](#union-clause) [ ALL ] select ]
-[ [ORDER BY](#order-by-clause) expression [ ASC | DESC ] [, ...] ]
+[ [ORDER BY](#order-by-clause) expression [ ASC | DESC ][, ...] ]
[ [LIMIT](#limit-clause) count ]
@@ -48,11 +48,10 @@ SELECT a, b FROM x; # SELECT clause - Example: ```sql -SELECT a, b, a + b FROM table +SELECT a, b, a + b FROM table ``` The `DISTINCT` quantifier can be added to make the query return all distinct rows. @@ -65,11 +64,11 @@ SELECT DISTINCT person, age FROM employees # FROM clause Example: + ```sql SELECT t.a FROM table AS t ``` - # WHERE clause Example: @@ -86,7 +85,6 @@ Example: SELECT a, b, MAX(c) FROM table GROUP BY a, b ``` - # HAVING clause Example: @@ -126,7 +124,6 @@ SELECT age, person FROM table ORDER BY age DESC; SELECT age, person FROM table ORDER BY age, person DESC; ``` - # LIMIT clause Limits the number of rows to be a maximum of `count` rows. `count` should be a non-negative integer. @@ -136,4 +133,4 @@ Example: ```sql SELECT age, person FROM table LIMIT 10 -``` \ No newline at end of file +```
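As a closing illustration, several of the clauses above combine naturally in a single query (a sketch that reuses the hypothetical `employees` table from the earlier examples):

```sql
SELECT person, MAX(age) AS oldest
FROM employees
WHERE age >= 21
GROUP BY person
HAVING MAX(age) < 90
ORDER BY oldest DESC
LIMIT 10;
```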