Skip to content

Commit

Permalink
Merge branch 'devel' into exp/1095-expose-readable-datasets
Browse files Browse the repository at this point in the history
# Conflicts:
#	dlt/common/destination/reference.py
#	dlt/destinations/sql_client.py
#	dlt/pipeline/pipeline.py
  • Loading branch information
sh-rp committed Jul 17, 2024
2 parents 20bf9ce + e0b0465 commit 6dce626
Show file tree
Hide file tree
Showing 418 changed files with 18,711 additions and 5,824 deletions.
48 changes: 43 additions & 5 deletions .github/workflows/test_destination_clickhouse.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

name: test | clickhouse

on:
Expand All @@ -8,7 +7,7 @@ on:
- devel
workflow_dispatch:
schedule:
- cron: '0 2 * * *'
- cron: '0 2 * * *'

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
Expand All @@ -20,7 +19,7 @@ env:
DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }}

ACTIVE_DESTINATIONS: "[\"clickhouse\"]"
ALL_FILESYSTEM_DRIVERS: "[\"memory\"]"
ALL_FILESYSTEM_DRIVERS: "[\"memory\", \"file\"]"

jobs:
get_docs_changes:
Expand Down Expand Up @@ -67,12 +66,51 @@ jobs:
- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml

# OSS ClickHouse
- run: |
docker-compose -f "tests/load/clickhouse/clickhouse-compose.yml" up -d
echo "Waiting for ClickHouse to be healthy..."
timeout 30s bash -c 'until docker-compose -f "tests/load/clickhouse/clickhouse-compose.yml" ps | grep -q "healthy"; do sleep 1; done'
echo "ClickHouse is up and running"
name: Start ClickHouse OSS
- run: poetry run pytest tests/load -m "essential"
name: Run essential tests Linux (ClickHouse OSS)
if: ${{ ! (contains(github.event.pull_request.labels.*.name, 'ci full') || github.event_name == 'schedule')}}
env:
DESTINATION__CLICKHOUSE__CREDENTIALS__HOST: localhost
DESTINATION__CLICKHOUSE__CREDENTIALS__DATABASE: dlt_data
DESTINATION__CLICKHOUSE__CREDENTIALS__USERNAME: loader
DESTINATION__CLICKHOUSE__CREDENTIALS__PASSWORD: loader
DESTINATION__CLICKHOUSE__CREDENTIALS__PORT: 9000
DESTINATION__CLICKHOUSE__CREDENTIALS__HTTP_PORT: 8123
DESTINATION__CLICKHOUSE__CREDENTIALS__SECURE: 0

- run: poetry run pytest tests/load
name: Run all tests Linux (ClickHouse OSS)
if: ${{ contains(github.event.pull_request.labels.*.name, 'ci full') || github.event_name == 'schedule'}}
env:
DESTINATION__CLICKHOUSE__CREDENTIALS__HOST: localhost
DESTINATION__CLICKHOUSE__CREDENTIALS__DATABASE: dlt_data
DESTINATION__CLICKHOUSE__CREDENTIALS__USERNAME: loader
DESTINATION__CLICKHOUSE__CREDENTIALS__PASSWORD: loader
DESTINATION__CLICKHOUSE__CREDENTIALS__PORT: 9000
DESTINATION__CLICKHOUSE__CREDENTIALS__HTTP_PORT: 8123
DESTINATION__CLICKHOUSE__CREDENTIALS__SECURE: 0

- name: Stop ClickHouse OSS
if: always()
run: docker-compose -f "tests/load/clickhouse/clickhouse-compose.yml" down -v

# ClickHouse Cloud
- run: |
poetry run pytest tests/load -m "essential"
name: Run essential tests Linux
name: Run essential tests Linux (ClickHouse Cloud)
if: ${{ ! (contains(github.event.pull_request.labels.*.name, 'ci full') || github.event_name == 'schedule')}}
- run: |
poetry run pytest tests/load
name: Run all tests Linux
name: Run all tests Linux (ClickHouse Cloud)
if: ${{ contains(github.event.pull_request.labels.*.name, 'ci full') || github.event_name == 'schedule'}}
81 changes: 81 additions & 0 deletions .github/workflows/test_destination_lancedb.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
name: dest | lancedb

on:
pull_request:
branches:
- master
- devel
workflow_dispatch:
schedule:
- cron: '0 2 * * *'

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

env:
DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }}

RUNTIME__SENTRY_DSN: https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752
RUNTIME__LOG_LEVEL: ERROR
RUNTIME__DLTHUB_TELEMETRY_ENDPOINT: ${{ secrets.RUNTIME__DLTHUB_TELEMETRY_ENDPOINT }}

ACTIVE_DESTINATIONS: "[\"lancedb\"]"
ALL_FILESYSTEM_DRIVERS: "[\"memory\"]"

jobs:
get_docs_changes:
name: docs changes
uses: ./.github/workflows/get_docs_changes.yml
if: ${{ !github.event.pull_request.head.repo.fork || contains(github.event.pull_request.labels.*.name, 'ci from fork')}}

run_loader:
name: dest | lancedb tests
needs: get_docs_changes
if: needs.get_docs_changes.outputs.changes_outside_docs == 'true'
defaults:
run:
shell: bash
runs-on: "ubuntu-latest"

steps:
- name: Check out
uses: actions/checkout@master

- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: "3.11.x"

- name: Install Poetry
uses: snok/install-poetry@v1.3.2
with:
virtualenvs-create: true
virtualenvs-in-project: true
installer-parallel: true

- name: Load cached venv
id: cached-poetry-dependencies
uses: actions/cache@v3
with:
path: .venv
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp

- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml

- name: Install dependencies
run: poetry install --no-interaction -E lancedb -E parquet --with sentry-sdk --with pipeline

- name: Install embedding provider dependencies
run: poetry run pip install openai

- run: |
poetry run pytest tests/load -m "essential"
name: Run essential tests Linux
if: ${{ ! (contains(github.event.pull_request.labels.*.name, 'ci full') || github.event_name == 'schedule')}}
- run: |
poetry run pytest tests/load
name: Run all tests Linux
if: ${{ contains(github.event.pull_request.labels.*.name, 'ci full') || github.event_name == 'schedule'}}
24 changes: 23 additions & 1 deletion .github/workflows/test_doc_snippets.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ env:

# Slack hook for chess in production example
RUNTIME__SLACK_INCOMING_HOOK: ${{ secrets.RUNTIME__SLACK_INCOMING_HOOK }}
# Path to local qdrant database
DESTINATION__QDRANT__CREDENTIALS__PATH: zendesk.qdb
# detect if the workflow is executed in a repo fork
IS_FORK: ${{ github.event.pull_request.head.repo.fork }}

Expand All @@ -32,6 +34,26 @@ jobs:
# Do not run on forks, unless allowed, secrets are used here
if: ${{ !github.event.pull_request.head.repo.fork || contains(github.event.pull_request.labels.*.name, 'ci from fork')}}

# Service containers to run with `container-job`
services:
# Label used to access the service container
postgres:
# Docker Hub image
image: postgres
# Provide the password for postgres
env:
POSTGRES_DB: dlt_data
POSTGRES_USER: loader
POSTGRES_PASSWORD: loader
ports:
- 5432:5432
# Set health checks to wait until postgres has started
options: >-
--health-cmd pg_isready
--health-interval 10s
--health-timeout 5s
--health-retries 5
steps:

- name: Check out
Expand Down Expand Up @@ -61,7 +83,7 @@ jobs:

- name: Install dependencies
# if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
run: poetry install --no-interaction -E duckdb -E weaviate -E parquet -E qdrant -E bigquery -E postgres --with docs,sentry-sdk --without airflow
run: poetry install --no-interaction -E duckdb -E weaviate -E parquet -E qdrant -E bigquery -E postgres -E lancedb --with docs,sentry-sdk --without airflow

- name: create secrets.toml for examples
run: pwd && echo "$DLT_SECRETS_TOML" > docs/examples/.dlt/secrets.toml
Expand Down
10 changes: 8 additions & 2 deletions .github/workflows/test_local_destinations.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ env:
RUNTIME__SENTRY_DSN: https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752
RUNTIME__LOG_LEVEL: ERROR
RUNTIME__DLTHUB_TELEMETRY_ENDPOINT: ${{ secrets.RUNTIME__DLTHUB_TELEMETRY_ENDPOINT }}
ACTIVE_DESTINATIONS: "[\"duckdb\", \"postgres\", \"filesystem\", \"weaviate\"]"
ACTIVE_DESTINATIONS: "[\"duckdb\", \"postgres\", \"filesystem\", \"weaviate\", \"qdrant\"]"
ALL_FILESYSTEM_DRIVERS: "[\"memory\", \"file\"]"

DESTINATION__WEAVIATE__VECTORIZER: text2vec-contextionary
Expand Down Expand Up @@ -63,6 +63,11 @@ jobs:
--health-timeout 5s
--health-retries 5
qdrant:
image: qdrant/qdrant:v1.8.4
ports:
- 6333:6333

steps:
- name: Check out
uses: actions/checkout@master
Expand Down Expand Up @@ -90,7 +95,7 @@ jobs:
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations

- name: Install dependencies
run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate --with sentry-sdk --with pipeline -E deltalake
run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant --with sentry-sdk --with pipeline -E deltalake

- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
Expand All @@ -100,6 +105,7 @@ jobs:
name: Run tests Linux
env:
DESTINATION__POSTGRES__CREDENTIALS: postgresql://loader:loader@localhost:5432/dlt_data
DESTINATION__QDRANT__CREDENTIALS__location: http://localhost:6333

- name: Stop weaviate
if: always()
Expand Down
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,9 @@ lint-and-test-snippets:
cd docs/website/docs && poetry run pytest --ignore=node_modules

lint-and-test-examples:
poetry run mypy --config-file mypy.ini docs/examples
poetry run flake8 --max-line-length=200 docs/examples
cd docs/tools && poetry run python prepare_examples_tests.py
poetry run flake8 --max-line-length=200 docs/examples
poetry run mypy --config-file mypy.ini docs/examples
cd docs/examples && poetry run pytest


Expand Down
10 changes: 1 addition & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,12 @@ Be it a Google Colab notebook, AWS Lambda function, an Airflow DAG, your local l

dlt supports Python 3.8+.

**pip:**
```sh
pip install dlt
```

**pixi:**
```sh
pixi add dlt
```
More options: [Install via Conda or Pixi](https://dlthub.com/docs/reference/installation#install-dlt-via-pixi-and-conda)

**conda:**
```sh
conda install -c conda-forge dlt
```

## Quick Start

Expand Down
2 changes: 1 addition & 1 deletion deploy/dlt/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ RUN apk update &&\
# add build labels and envs
ARG COMMIT_SHA=""
ARG IMAGE_VERSION=""
LABEL commit_sha = ${COMMIT_SHA}
LABEL commit_sha=${COMMIT_SHA}
LABEL version=${IMAGE_VERSION}
ENV COMMIT_SHA=${COMMIT_SHA}
ENV IMAGE_VERSION=${IMAGE_VERSION}
Expand Down
2 changes: 1 addition & 1 deletion deploy/dlt/Dockerfile.airflow
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ WORKDIR /tmp/pydlt
# add build labels and envs
ARG COMMIT_SHA=""
ARG IMAGE_VERSION=""
LABEL commit_sha = ${COMMIT_SHA}
LABEL commit_sha=${COMMIT_SHA}
LABEL version=${IMAGE_VERSION}
ENV COMMIT_SHA=${COMMIT_SHA}
ENV IMAGE_VERSION=${IMAGE_VERSION}
Expand Down
2 changes: 1 addition & 1 deletion dlt/cli/_dlt.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def schema_command_wrapper(file_path: str, format_: str, remove_defaults: bool)
schema_str = json.dumps(s.to_dict(remove_defaults=remove_defaults), pretty=True)
else:
schema_str = s.to_pretty_yaml(remove_defaults=remove_defaults)
print(schema_str)
fmt.echo(schema_str)
return 0


Expand Down
44 changes: 41 additions & 3 deletions dlt/cli/pipeline_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@
from dlt.common.destination.reference import TDestinationReferenceArg
from dlt.common.runners import Venv
from dlt.common.runners.stdout import iter_stdout
from dlt.common.schema.utils import group_tables_by_resource, remove_defaults
from dlt.common.schema.utils import (
group_tables_by_resource,
has_table_seen_data,
is_complete_column,
remove_defaults,
)
from dlt.common.storages import FileStorage, PackageStorage
from dlt.pipeline.helpers import DropCommand
from dlt.pipeline.exceptions import CannotRestorePipelineException
Expand Down Expand Up @@ -180,6 +185,35 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]:
fmt.bold(str(res_state_slots)),
)
)
if verbosity > 0:
for table in tables:
incomplete_columns = len(
[
col
for col in table["columns"].values()
if not is_complete_column(col)
]
)
fmt.echo(
"\t%s table %s column(s) %s %s"
% (
fmt.bold(table["name"]),
fmt.bold(str(len(table["columns"]))),
(
fmt.style("received data", fg="green")
if has_table_seen_data(table)
else fmt.style("not yet received data", fg="yellow")
),
(
fmt.style(
f"{incomplete_columns} incomplete column(s)",
fg="yellow",
)
if incomplete_columns > 0
else ""
),
)
)
fmt.echo()
fmt.echo("Working dir content:")
_display_pending_packages()
Expand Down Expand Up @@ -272,7 +306,7 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]:
fmt.echo(package_info.asstr(verbosity))
if len(package_info.schema_update) > 0:
if verbosity == 0:
print("Add -v option to see schema update. Note that it could be large.")
fmt.echo("Add -v option to see schema update. Note that it could be large.")
else:
tables = remove_defaults({"tables": package_info.schema_update}) # type: ignore
fmt.echo(fmt.bold("Schema update:"))
Expand Down Expand Up @@ -316,7 +350,7 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]:
fmt.echo(
"About to drop the following data in dataset %s in destination %s:"
% (
fmt.bold(drop.info["dataset_name"]),
fmt.bold(p.dataset_name),
fmt.bold(p.destination.destination_name),
)
)
Expand All @@ -329,6 +363,10 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]:
)
)
fmt.echo("%s: %s" % (fmt.style("Table(s) to drop", fg="green"), drop.info["tables"]))
fmt.echo(
"%s: %s"
% (fmt.style("\twith data in destination", fg="green"), drop.info["tables_with_data"])
)
fmt.echo(
"%s: %s"
% (
Expand Down
Loading

0 comments on commit 6dce626

Please sign in to comment.