Skip to content

Commit

Permalink
Merge pull request #110 from samansmink/attach-delta-squashed
Browse files Browse the repository at this point in the history
Support attaching delta tables as catalogs
  • Loading branch information
samansmink authored Nov 8, 2024
2 parents 8642253 + ca12e53 commit 49c902b
Show file tree
Hide file tree
Showing 258 changed files with 2,823 additions and 52 deletions.
5 changes: 5 additions & 0 deletions .github/regression/micro.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
benchmark/micro/snapshot_performance/delta_scan.benchmark
benchmark/micro/snapshot_performance/snapshot_no_pin.benchmark
benchmark/micro/snapshot_performance/snapshot_no_pin_filter.benchmark
benchmark/micro/snapshot_performance/snapshot_pin.benchmark
benchmark/micro/snapshot_performance/snapshot_pin_filter.benchmark
16 changes: 12 additions & 4 deletions .github/workflows/LocalTesting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -210,12 +210,14 @@ jobs:
- name: Build
shell: bash
run: make generate-data
run: |
make generate-data
make release
- name: Test
shell: bash
run: |
GENERATED_DATA_AVAILABLE=1 make test
GENERATED_DATA_AVAILABLE=1 make test_release
regression-test-benchmark-runner:
name: Performance Regression Tests
Expand Down Expand Up @@ -280,13 +282,19 @@ jobs:
if: always()
shell: bash
run: |
python3 ./duckdb/scripts/regression_test_runner.py --old=duckdb_delta/build/release/benchmark/benchmark_runner --new=build/release/benchmark/benchmark_runner --benchmarks=.github/regression/tpch_sf1_local.csv --verbose --threads=2 --root-dir=.
python3 ./duckdb/scripts/regression/test_runner.py --old=duckdb_delta/build/release/benchmark/benchmark_runner --new=build/release/benchmark/benchmark_runner --benchmarks=.github/regression/tpch_sf1_local.csv --verbose --threads=2 --root-dir=.
- name: Regression Test TPC-DS
if: always()
shell: bash
run: |
python ./duckdb/scripts/regression_test_runner.py --old=duckdb_delta/build/release/benchmark/benchmark_runner --new=build/release/benchmark/benchmark_runner --benchmarks=.github/regression/tpcds_sf1_local.csv --verbose --threads=2 --root-dir=.
python ./duckdb/scripts/regression/test_runner.py --old=duckdb_delta/build/release/benchmark/benchmark_runner --new=build/release/benchmark/benchmark_runner --benchmarks=.github/regression/tpcds_sf1_local.csv --verbose --threads=2 --root-dir=.
- name: Regression Test Micro
if: always()
shell: bash
run: |
python ./duckdb/scripts/regression/test_runner.py --old=duckdb_delta/build/release/benchmark/benchmark_runner --new=build/release/benchmark/benchmark_runner --benchmarks=.github/regression/micro.csv --verbose --threads=2 --root-dir=.
- name: Test benchmark makefile
shell: bash
Expand Down
17 changes: 10 additions & 7 deletions .github/workflows/MainDistributionPipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,24 @@ concurrency:
jobs:
duckdb-stable-build:
name: Build extension binaries
uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v1.1.2
uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main
with:
duckdb_version: v1.1.2
ci_tools_version: v1.1.2
# pip install duckdb==1.1.4.dev1594
duckdb_version: 0ccf3c25cc
ci_tools_version: main
extension_name: delta
enable_rust: true
exclude_archs: 'wasm_mvp;wasm_eh;wasm_threads;windows_amd64_rtools'
exclude_archs: 'wasm_mvp;wasm_eh;wasm_threads;windows_amd64_mingw'
extra_toolchains: 'python3'
vcpkg_commit: c82f74667287d3dc386bce81e44964370c91a289

duckdb-stable-deploy:
name: Deploy extension binaries
needs: duckdb-stable-build
uses: duckdb/extension-ci-tools/.github/workflows/_extension_deploy.yml@v1.1.2
uses: duckdb/extension-ci-tools/.github/workflows/_extension_deploy.yml@main
secrets: inherit
with:
extension_name: delta
duckdb_version: v1.1.2
exclude_archs: 'wasm_mvp;wasm_eh;wasm_threads;windows_amd64_rtools'
duckdb_version: 0ccf3c25cc
exclude_archs: 'wasm_mvp;wasm_eh;wasm_threads;windows_amd64_mingw'
deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }}
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@ test/python/__pycache__/
data/generated
__azurite*__.json
__blobstorage__
.venv
venv
.vscode
8 changes: 7 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,13 @@ set(EXTENSION_SOURCES
src/delta_extension.cpp
src/delta_functions.cpp
src/delta_utils.cpp
src/functions/delta_scan.cpp)
src/functions/delta_scan.cpp
src/storage/delta_catalog.cpp
src/storage/delta_schema_entry.cpp
src/storage/delta_table_entry.cpp
src/storage/delta_transaction.cpp
src/storage/delta_transaction_manager.cpp
)

### Custom config
# TODO: figure out if we really need this?
Expand Down
17 changes: 15 additions & 2 deletions benchmark/benchmark.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,16 @@ plot:
# TPCH SF1 on delta table
bench-run-tpch-sf1-delta: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1/local/delta/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpch-sf1-delta.csv
bench-run-tpch-sf1-delta-attach: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1/local/delta_attach/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpch-sf1-delta-attach.csv
# TPCH SF1 on parquet files
bench-run-tpch-sf1-parquet: bench-output-dir
./build/release/benchmark/benchmark_runner 'benchmark/tpch/sf1-parquet/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpch-sf1-parquet.csv
# TPCH SF1 on duckdb file
bench-run-tpch-sf1-duckdb: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1/local/duckdb/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpch-sf1-duckdb.csv
# Compares TPCH SF1 on delta files vs on parquet files vs on attached delta tables
bench-run-tpch-sf1: bench-run-tpch-sf1-delta bench-run-tpch-sf1-parquet
bench-run-tpch-sf1: bench-run-tpch-sf1-delta bench-run-tpch-sf1-parquet bench-run-tpch-sf1-delta-attach

###
# TPCDS
Expand All @@ -42,6 +44,10 @@ bench-run-tpch-sf1: bench-run-tpch-sf1-delta bench-run-tpch-sf1-parquet
# TPCDS SF1 on delta table
bench-run-tpcds-sf1-delta: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1/$(IO_MODE)/delta/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpcds-sf1-delta-$(IO_MODE).csv
bench-run-tpcds-sf1-delta-attach: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1/$(IO_MODE)/delta_attach/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpcds-sf1-delta-attach-$(IO_MODE).csv
bench-run-tpcds-sf1-delta-attach-pin: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1/$(IO_MODE)/delta_attach_pin/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpcds-sf1-delta-attach-pin-$(IO_MODE).csv
# TPCDS SF1 on parquet files
bench-run-tpcds-sf1-parquet: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1/$(IO_MODE)/parquet/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpcds-sf1-parquet-$(IO_MODE).csv
Expand All @@ -50,4 +56,11 @@ bench-run-tpcds-sf1-duckdb: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1/$(IO_MODE)/duckdb/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpcds-sf1-duckdb-$(IO_MODE).csv

# Compares TPCDS SF1 on delta files vs parquet files vs duckdb files vs attached delta tables (with and without pinned snapshot)
bench-run-tpcds-sf1: bench-run-tpcds-sf1-delta bench-run-tpcds-sf1-parquet bench-run-tpcds-sf1-duckdb
bench-run-tpcds-sf1: bench-run-tpcds-sf1-delta bench-run-tpcds-sf1-parquet bench-run-tpcds-sf1-duckdb bench-run-tpcds-sf1-delta-attach bench-run-tpcds-sf1-delta-attach-pin

###
# MICRO
###

bench-run-snapshot-performance: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/micro/snapshot_performance/.*' 2>&1 | tee benchmark_results/snapshot-performance.csv
16 changes: 16 additions & 0 deletions benchmark/micro/snapshot_performance/delta_scan.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# name: benchmark/micro/snapshot_performance/delta_scan.benchmark
# description: Reference result to compare attach functions to
# group: [aggregate]

name delta_scan reference
group snapshot_performance

require delta

require parquet

run
SELECT COUNT(*) FROM delta_scan('./data/generated/delta_rs_tpch_sf1_100_splits/lineitem/delta_lake')

result I
6001215
16 changes: 16 additions & 0 deletions benchmark/micro/snapshot_performance/delta_scan_filter.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# name: benchmark/micro/snapshot_performance/delta_scan_filter.benchmark
# description: Reference result to compare attach functions to
# group: [aggregate]

name delta_scan reference
group snapshot_performance

require delta

require parquet

run
SELECT COUNT(*) FROM delta_scan('./data/generated/delta_rs_tpch_sf1_100_splits/lineitem/delta_lake') where l_orderkey is not null

result I
6001215
19 changes: 19 additions & 0 deletions benchmark/micro/snapshot_performance/snapshot_no_pin.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# name: benchmark/micro/snapshot_performance/snapshot_no_pin.benchmark
# description: Performance of reading from a table with many log entries
# group: [aggregate]

name Snapshot no pin
group snapshot_performance

require delta

require parquet

load
ATTACH './data/generated/delta_rs_tpch_sf1_100_splits/lineitem/delta_lake' as lineitem_no_pin (TYPE delta);

run
SELECT COUNT(*) FROM lineitem_no_pin

result I
6001215
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# name: benchmark/micro/snapshot_performance/snapshot_no_pin_filter.benchmark
# description: Performance of a filtered read from an attached table with many log entries, without PIN_SNAPSHOT
# group: [aggregate]

name Snapshot no pin filter
group snapshot_performance

require delta

require parquet

load
ATTACH './data/generated/delta_rs_tpch_sf1_100_splits/lineitem/delta_lake' as lineitem_no_pin (TYPE delta);

run
SELECT COUNT(*) FROM lineitem_no_pin where l_orderkey is not null

result I
6001215
19 changes: 19 additions & 0 deletions benchmark/micro/snapshot_performance/snapshot_pin.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# name: benchmark/micro/snapshot_performance/snapshot_pin.benchmark
# description: Performance of reading from a table with many log entries
# group: [aggregate]

name Snapshot pin
group snapshot_performance

require delta

require parquet

load
ATTACH './data/generated/delta_rs_tpch_sf1_100_splits/lineitem/delta_lake' as lineitem_pin (TYPE delta, PIN_SNAPSHOT);

run
SELECT COUNT(*) FROM lineitem_pin

result I
6001215
19 changes: 19 additions & 0 deletions benchmark/micro/snapshot_performance/snapshot_pin_filter.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# name: benchmark/micro/snapshot_performance/snapshot_pin_filter.benchmark
# description: Performance of a filtered read from an attached table with many log entries, with PIN_SNAPSHOT
# group: [aggregate]

name Snapshot pin filter
group snapshot_performance

require delta

require parquet

load
ATTACH './data/generated/delta_rs_tpch_sf1_100_splits/lineitem/delta_lake' as lineitem_pin (TYPE delta, PIN_SNAPSHOT);

run
SELECT COUNT(*) FROM lineitem_pin where l_orderkey is not null

result I
6001215
24 changes: 24 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/load.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
ATTACH './data/generated/tpcds_sf1/call_center/delta_lake' as call_center (TYPE delta);
ATTACH './data/generated/tpcds_sf1/catalog_page/delta_lake' as catalog_page (TYPE delta);
ATTACH './data/generated/tpcds_sf1/catalog_returns/delta_lake' as catalog_returns (TYPE delta);
ATTACH './data/generated/tpcds_sf1/catalog_sales/delta_lake' as catalog_sales (TYPE delta);
ATTACH './data/generated/tpcds_sf1/customer/delta_lake' as customer (TYPE delta);
ATTACH './data/generated/tpcds_sf1/customer_demographics/delta_lake' as customer_demographics (TYPE delta);
ATTACH './data/generated/tpcds_sf1/customer_address/delta_lake' as customer_address (TYPE delta);
ATTACH './data/generated/tpcds_sf1/date_dim/delta_lake' as date_dim (TYPE delta);
ATTACH './data/generated/tpcds_sf1/household_demographics/delta_lake' as household_demographics (TYPE delta);
ATTACH './data/generated/tpcds_sf1/inventory/delta_lake' as inventory (TYPE delta);
ATTACH './data/generated/tpcds_sf1/income_band/delta_lake' as income_band (TYPE delta);
ATTACH './data/generated/tpcds_sf1/item/delta_lake' as item (TYPE delta);
ATTACH './data/generated/tpcds_sf1/promotion/delta_lake' as promotion (TYPE delta);
ATTACH './data/generated/tpcds_sf1/reason/delta_lake' as reason (TYPE delta);
ATTACH './data/generated/tpcds_sf1/ship_mode/delta_lake' as ship_mode (TYPE delta);
ATTACH './data/generated/tpcds_sf1/store/delta_lake' as store (TYPE delta);
ATTACH './data/generated/tpcds_sf1/store_returns/delta_lake' as store_returns (TYPE delta);
ATTACH './data/generated/tpcds_sf1/store_sales/delta_lake' as store_sales (TYPE delta);
ATTACH './data/generated/tpcds_sf1/time_dim/delta_lake' as time_dim (TYPE delta);
ATTACH './data/generated/tpcds_sf1/warehouse/delta_lake' as warehouse (TYPE delta);
ATTACH './data/generated/tpcds_sf1/web_page/delta_lake' as web_page (TYPE delta);
ATTACH './data/generated/tpcds_sf1/web_returns/delta_lake' as web_returns (TYPE delta);
ATTACH './data/generated/tpcds_sf1/web_sales/delta_lake' as web_sales (TYPE delta);
ATTACH './data/generated/tpcds_sf1/web_site/delta_lake' as web_site (TYPE delta);
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q01.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q01.benchmark
# description: Run query 01 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=1
QUERY_NUMBER_PADDED=01
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q02.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q02.benchmark
# description: Run query 02 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=2
QUERY_NUMBER_PADDED=02
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q03.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q03.benchmark
# description: Run query 03 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=3
QUERY_NUMBER_PADDED=03
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q04.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q04.benchmark
# description: Run query 04 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=4
QUERY_NUMBER_PADDED=04
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q05.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q05.benchmark
# description: Run query 05 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=5
QUERY_NUMBER_PADDED=05
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q06.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q06.benchmark
# description: Run query 06 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=6
QUERY_NUMBER_PADDED=06
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q07.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q07.benchmark
# description: Run query 07 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=7
QUERY_NUMBER_PADDED=07
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q08.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q08.benchmark
# description: Run query 08 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=8
QUERY_NUMBER_PADDED=08
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q09.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q09.benchmark
# description: Run query 09 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=9
QUERY_NUMBER_PADDED=09
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q10.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q10.benchmark
# description: Run query 10 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=10
QUERY_NUMBER_PADDED=10
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q11.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q11.benchmark
# description: Run query 11 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=11
QUERY_NUMBER_PADDED=11
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q12.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q12.benchmark
# description: Run query 12 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=12
QUERY_NUMBER_PADDED=12
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q13.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q13.benchmark
# description: Run query 13 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=13
QUERY_NUMBER_PADDED=13
Loading

0 comments on commit 49c902b

Please sign in to comment.