Merge remote-tracking branch 'origin/main' into itzhaks/sparse-tensors-indices-to-offsets
Itzhak Stern committed Feb 6, 2025
2 parents 983996f + 86adc44 commit 5629ece
Showing 307 changed files with 7,753 additions and 11,092 deletions.
11 changes: 7 additions & 4 deletions .github/workflows/broken-link-checker.yml
@@ -23,9 +23,13 @@ jobs:
max_attempts: 3
retry_on: error
command: blc https://www.getdaft.io -ro --exclude www.pytorch.org/ --exclude https://github.com/Eventual-Inc/Daft/ --exclude https://twitter.com/daft_dataframe --exclude https://www.linkedin.com/company/eventualcomputing/
notify_on_failure:
runs-on: self-hosted
if: failure()
needs: check-links
steps:
- name: Send Slack notification on failure
uses: slackapi/slack-github-action@v2.0.0
if: failure()
with:
payload: |
{
@@ -39,6 +43,5 @@ jobs:
}
]
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
webhook: ${{ secrets.SLACK_WEBHOOK_URL }}
webhook-type: incoming-webhook
1 change: 1 addition & 0 deletions .github/workflows/build-docs.yml
@@ -44,6 +44,7 @@ jobs:
pip install uv
- name: Install dependencies
run: |
source venv/bin/activate
uv pip install -r requirements-dev.txt
- name: Build Daft in development mode and generate docs
# TODO: Break on any Sphinx warnings with nitpicks
19 changes: 11 additions & 8 deletions .github/workflows/python-publish.yml
@@ -13,7 +13,9 @@ on:
push:
tags:
- v*
# NOTE: Using workflow_dispatch will skip the actual publishing of the package!
workflow_dispatch:

env:
PYTHON_VERSION: 3.11
DAFT_ANALYTICS_ENABLED: '0'
@@ -110,14 +112,15 @@ jobs:
RUSTFLAGS: -Ctarget-cpu=apple-m1
CFLAGS: -mtune=apple-m1

- name: Install and test built wheel - Linux and Mac x86_64
if: ${{ (matrix.os == 'ubuntu') && (matrix.compile_arch == 'x86_64') }}
run: |
uv pip install -r requirements-dev.txt dist/*-*x86_64*.whl --force-reinstall
rm -rf daft
pytest -v
env:
DAFT_RUNNER: native
# NOTE: Skip running tests entirely for this workflow
# - name: Install and test built wheel - Linux and Mac x86_64
# if: ${{ (matrix.os == 'ubuntu') && (matrix.compile_arch == 'x86_64') }}
# run: |
# uv pip install -r requirements-dev.txt dist/*-*x86_64*.whl --force-reinstall
# rm -rf daft
# pytest -v
# env:
# DAFT_RUNNER: native

- name: Upload wheels
uses: actions/upload-artifact@v4
17 changes: 14 additions & 3 deletions .gitignore
@@ -31,9 +31,6 @@ log/
# pytest benchmarks
.benchmarks

# docs autogen
/docs/source/api_docs/doc_gen/

# Added by pyenv
.python-version

@@ -46,3 +43,17 @@

# uv
uv.lock

# dependencies
node_modules

# next.js
.next
out

# production
/build

# typescript
*.tsbuildinfo
next-env.d.ts
10 changes: 7 additions & 3 deletions .readthedocs.yaml
@@ -1,16 +1,20 @@
version: 2

build:
os: ubuntu-22.04
os: "ubuntu-24.04"
tools:
python: '3.10'
rust: '1.64'
jobs:
pre_build:
- VIRTUAL_ENV=${READTHEDOCS_VIRTUALENV_PATH} make VENV=${READTHEDOCS_VIRTUALENV_PATH} build

sphinx:
configuration: docs/source/conf.py
python:
install:
- requirements: requirements-docs.txt

mkdocs:
configuration: docs/mkdocs.yml

formats:
- htmlzip
4 changes: 4 additions & 0 deletions Cargo.lock

Generated file; diff not rendered.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -218,6 +218,7 @@ jaq-parse = "1.0.0"
jaq-std = "1.2.0"
mur3 = "0.1.0"
num-derive = "0.3.3"
num-format = "0.4.4"
num-traits = "0.2"
once_cell = "1.19.0"
path_macro = "1.0.0"
8 changes: 6 additions & 2 deletions Makefile
@@ -71,8 +71,12 @@ dsdgen: .venv ## Generate TPC-DS data
$(VENV_BIN)/python benchmarking/tpcds/datagen.py --scale-factor=$(SCALE_FACTOR) --tpcds-gen-folder=$(OUTPUT_DIR)

.PHONY: docs
docs: .venv ## Serve docs
uv run --with-requirements requirements-docs.txt mkdocs serve
docs: .venv sphinx-docs ## Build both MkDocs and Sphinx documentation
JUPYTER_PLATFORM_DIRS=1 uv run --with-requirements requirements-docs.txt mkdocs build -f docs/mkdocs.yml

.PHONY: sphinx-docs
sphinx-docs: .venv ## Build Sphinx API documentation
uv run --with-requirements requirements-docs.txt sphinx-build -b html "docs/sphinx/source" "docs/sphinx/_build"

.PHONY: clean
clean:
11 changes: 9 additions & 2 deletions daft/__init__.py
@@ -58,7 +58,11 @@ def refresh_logger() -> None:
# Daft top-level imports
###

from daft.catalog import read_table, register_table
from daft.catalog import (
Identifier,
read_table,
register_table,
)
from daft.context import set_execution_config, set_planning_config, execution_config_ctx, planning_config_ctx
from daft.convert import (
from_arrow,
@@ -72,7 +76,7 @@ def refresh_logger() -> None:
from daft.dataframe import DataFrame
from daft.logical.schema import Schema
from daft.datatype import DataType, TimeUnit
from daft.expressions import Expression, col, lit, interval, coalesce
from daft.expressions import Expression, col, list_, lit, interval, struct, coalesce
from daft.io import (
DataCatalogTable,
DataCatalogType,
@@ -99,6 +103,7 @@ def refresh_logger() -> None:
"DataFrame",
"DataType",
"Expression",
"Identifier",
"ImageFormat",
"ImageMode",
"ResourceRequest",
@@ -116,6 +121,7 @@ def refresh_logger() -> None:
"from_pylist",
"from_ray_dataset",
"interval",
"list_",
"lit",
"planning_config_ctx",
"read_csv",
@@ -134,6 +140,7 @@ def refresh_logger() -> None:
"set_planning_config",
"sql",
"sql_expr",
"struct",
"to_struct",
"udf",
]
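Not part of the commit: a minimal usage sketch of the newly exported list_ and struct expression constructors, assuming they build list and struct columns from the given column expressions (column names here are illustrative).

import daft
from daft import col, list_, struct

df = daft.from_pydict({"a": [1, 2], "b": [3, 4]})
df = df.select(
    list_(col("a"), col("b")).alias("pair"),     # assumed: list column built from two columns
    struct(col("a"), col("b")).alias("record"),  # assumed: struct column built from two columns
)
df.show()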
59 changes: 59 additions & 0 deletions daft/catalog/__init__.py
@@ -40,6 +40,7 @@

from __future__ import annotations

from collections.abc import Sequence
from daft.daft import catalog as native_catalog
from daft.logical.builder import LogicalPlanBuilder

@@ -53,6 +54,7 @@


__all__ = [
"Identifier",
"read_table",
"register_python_catalog",
"register_table",
@@ -156,3 +158,60 @@ def register_python_catalog(catalog: PyIcebergCatalog | UnityCatalog, name: str
raise ValueError(f"Unsupported catalog type: {type(catalog)}")

return native_catalog.register_python_catalog(python_catalog, name)


class Identifier(Sequence):
    """A reference (path) to a catalog object.
    Example:
        >>> id1 = Identifier("a", "b")
        >>> id2 = Identifier.parse("a.b")
    """

    _identifier: native_catalog.PyIdentifier

    def __init__(self, *parts: str):
        """Creates an Identifier from its parts.
        Example:
            >>> id = Identifier("schema", "table")
            >>> id  # Identifier('schema.table')
        Returns:
            Identifier: A new identifier.
        """
        if len(parts) < 1:
            raise ValueError("Identifier requires at least one part.")
        self._identifier = native_catalog.PyIdentifier(parts[:-1], parts[-1])

    @staticmethod
    def parse(input: str) -> Identifier:
        """Parses an Identifier from an SQL string.
        Example:
            >>> id = Identifier.parse("schema.table")
            >>> assert len(id) == 2
        Returns:
            Identifier: A new identifier.
        """
        i = Identifier.__new__(Identifier)
        i._identifier = native_catalog.PyIdentifier.parse(input)
        return i

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Identifier):
            return False
        return self._identifier.eq(other._identifier)

    def __getitem__(self, index: int | slice) -> str | Sequence[str]:
        if isinstance(index, slice):
            raise IndexError("slicing not supported")
        if isinstance(index, int):
            return self._identifier.getitem(index)

    def __len__(self) -> int:
        return self._identifier.__len__()

    def __repr__(self) -> str:
        return f"Identifier('{self._identifier.__repr__()}')"
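Not part of the commit: a brief usage sketch of the new Identifier class, based on the doctests above. It assumes Identifier.parse splits on the dot and that an identifier constructed from the same parts compares equal.

from daft.catalog import Identifier

ident = Identifier("schema", "table")      # construct from parts
parsed = Identifier.parse("schema.table")  # parse an SQL-style dotted string

assert len(ident) == 2
assert ident[0] == "schema" and ident[1] == "table"  # assumed: index returns the part at that position
assert ident == parsed                                # assumed: equality by parts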
5 changes: 4 additions & 1 deletion daft/context.py
@@ -331,6 +331,7 @@ def set_execution_config(
config: PyDaftExecutionConfig | None = None,
scan_tasks_min_size_bytes: int | None = None,
scan_tasks_max_size_bytes: int | None = None,
max_sources_per_scan_task: int | None = None,
broadcast_join_size_bytes_threshold: int | None = None,
parquet_split_row_groups_max_files: int | None = None,
sort_merge_join_sort_with_aligned_boundaries: bool | None = None,
@@ -368,6 +369,7 @@ def set_execution_config(
scan_tasks_max_size_bytes: Maximum size in bytes when merging ScanTasks when reading files from storage.
Increasing this value will increase the upper bound of the size of merged ScanTasks, which leads to bigger but
fewer partitions. (Defaults to 384 MiB)
max_sources_per_scan_task: Maximum number of sources in a single ScanTask. (Defaults to 10)
broadcast_join_size_bytes_threshold: If one side of a join is smaller than this threshold, a broadcast join will be used.
Default is 10 MiB.
parquet_split_row_groups_max_files: Maximum number of files to read in which the row group splitting should happen. (Defaults to 10)
@@ -393,7 +395,7 @@
enable_aqe: Enables Adaptive Query Execution, Defaults to False
enable_native_executor: Enables the native executor, Defaults to False
default_morsel_size: Default size of morsels used for the new local executor. Defaults to 131072 rows.
shuffle_algorithm: The shuffle algorithm to use. Defaults to "map_reduce". Other options are "pre_shuffle_merge".
shuffle_algorithm: The shuffle algorithm to use. Defaults to "auto", which will let Daft determine the algorithm. Options are "map_reduce" and "pre_shuffle_merge".
pre_shuffle_merge_threshold: Memory threshold in bytes for pre-shuffle merge. Defaults to 1GB
enable_ray_tracing: Enable tracing for Ray. Accessible in `/tmp/ray/session_latest/logs/daft` after the run completes. Defaults to False.
scantask_splitting_level: How aggressively to split scan tasks. Setting this to `2` will use a more aggressive ScanTask splitting algorithm which might be more expensive to run but results in more even splits of partitions. Defaults to 1.
@@ -406,6 +408,7 @@
new_daft_execution_config = old_daft_execution_config.with_config_values(
scan_tasks_min_size_bytes=scan_tasks_min_size_bytes,
scan_tasks_max_size_bytes=scan_tasks_max_size_bytes,
max_sources_per_scan_task=max_sources_per_scan_task,
broadcast_join_size_bytes_threshold=broadcast_join_size_bytes_threshold,
parquet_split_row_groups_max_files=parquet_split_row_groups_max_files,
sort_merge_join_sort_with_aligned_boundaries=sort_merge_join_sort_with_aligned_boundaries,
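Not part of the commit: an illustrative sketch of setting the new and updated execution options documented above. The parameter names and option strings come from the diff; the values are arbitrary examples, not recommendations.

import daft

daft.set_execution_config(
    max_sources_per_scan_task=10,           # new knob: cap on sources merged into a single ScanTask (default 10)
    shuffle_algorithm="pre_shuffle_merge",  # or "map_reduce"; leaving it unset keeps the "auto" default
)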
(Remaining file diffs not rendered.)