Merge remote-tracking branch 'origin/main' into itzhaks/sparse-tensors-indices-to-offsets
Itzhak Stern committed Feb 6, 2025
2 parents 983996f + 86adc44 commit 5629ece
Showing 307 changed files with 7,753 additions and 11,092 deletions.
11 changes: 7 additions & 4 deletions .github/workflows/broken-link-checker.yml
@@ -23,9 +23,13 @@ jobs:
max_attempts: 3
retry_on: error
command: blc https://www.getdaft.io -ro --exclude www.pytorch.org/ --exclude https://github.com/Eventual-Inc/Daft/ --exclude https://twitter.com/daft_dataframe --exclude https://www.linkedin.com/company/eventualcomputing/
notify_on_failure:
runs-on: self-hosted
if: failure()
needs: check-links
steps:
- name: Send Slack notification on failure
uses: slackapi/slack-github-action@v2.0.0
if: failure()
with:
payload: |
{
@@ -39,6 +43,5 @@ jobs:
}
]
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
webhook: ${{ secrets.SLACK_WEBHOOK_URL }}
webhook-type: incoming-webhook
1 change: 1 addition & 0 deletions .github/workflows/build-docs.yml
@@ -44,6 +44,7 @@ jobs:
pip install uv
- name: Install dependencies
run: |
source venv/bin/activate
uv pip install -r requirements-dev.txt
- name: Build Daft in development mode and generate docs
# TODO: Break on any Sphinx warnings with nitpicks
19 changes: 11 additions & 8 deletions .github/workflows/python-publish.yml
@@ -13,7 +13,9 @@ on:
push:
tags:
- v*
# NOTE: Using workflow_dispatch will skip the actual publishing of the package!
workflow_dispatch:

env:
PYTHON_VERSION: 3.11
DAFT_ANALYTICS_ENABLED: '0'
@@ -110,14 +112,15 @@ jobs:
RUSTFLAGS: -Ctarget-cpu=apple-m1
CFLAGS: -mtune=apple-m1

- name: Install and test built wheel - Linux and Mac x86_64
if: ${{ (matrix.os == 'ubuntu') && (matrix.compile_arch == 'x86_64') }}
run: |
uv pip install -r requirements-dev.txt dist/*-*x86_64*.whl --force-reinstall
rm -rf daft
pytest -v
env:
DAFT_RUNNER: native
# NOTE: Skip running tests entirely for this workflow
# - name: Install and test built wheel - Linux and Mac x86_64
# if: ${{ (matrix.os == 'ubuntu') && (matrix.compile_arch == 'x86_64') }}
# run: |
# uv pip install -r requirements-dev.txt dist/*-*x86_64*.whl --force-reinstall
# rm -rf daft
# pytest -v
# env:
# DAFT_RUNNER: native

- name: Upload wheels
uses: actions/upload-artifact@v4
17 changes: 14 additions & 3 deletions .gitignore
@@ -31,9 +31,6 @@ log/
# pytest benchmarks
.benchmarks

# docs autogen
/docs/source/api_docs/doc_gen/

# Added by pyenv
.python-version

@@ -46,3 +43,17 @@

# uv
uv.lock

# dependencies
node_modules

# next.js
.next
out

# production
/build

# typescript
*.tsbuildinfo
next-env.d.ts
10 changes: 7 additions & 3 deletions .readthedocs.yaml
@@ -1,16 +1,20 @@
version: 2

build:
os: ubuntu-22.04
os: "ubuntu-24.04"
tools:
python: '3.10'
rust: '1.64'
jobs:
pre_build:
- VIRTUAL_ENV=${READTHEDOCS_VIRTUALENV_PATH} make VENV=${READTHEDOCS_VIRTUALENV_PATH} build

sphinx:
configuration: docs/source/conf.py
python:
install:
- requirements: requirements-docs.txt

mkdocs:
configuration: docs/mkdocs.yml

formats:
- htmlzip
4 changes: 4 additions & 0 deletions Cargo.lock

Generated file; diff not rendered.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -218,6 +218,7 @@ jaq-parse = "1.0.0"
jaq-std = "1.2.0"
mur3 = "0.1.0"
num-derive = "0.3.3"
num-format = "0.4.4"
num-traits = "0.2"
once_cell = "1.19.0"
path_macro = "1.0.0"
8 changes: 6 additions & 2 deletions Makefile
@@ -71,8 +71,12 @@ dsdgen: .venv ## Generate TPC-DS data
$(VENV_BIN)/python benchmarking/tpcds/datagen.py --scale-factor=$(SCALE_FACTOR) --tpcds-gen-folder=$(OUTPUT_DIR)

.PHONY: docs
docs: .venv ## Serve docs
uv run --with-requirements requirements-docs.txt mkdocs serve
docs: .venv sphinx-docs ## Build both MkDocs and Sphinx documentation
JUPYTER_PLATFORM_DIRS=1 uv run --with-requirements requirements-docs.txt mkdocs build -f docs/mkdocs.yml

.PHONY: sphinx-docs
sphinx-docs: .venv ## Build Sphinx API documentation
uv run --with-requirements requirements-docs.txt sphinx-build -b html "docs/sphinx/source" "docs/sphinx/_build"

.PHONY: clean
clean:
11 changes: 9 additions & 2 deletions daft/__init__.py
@@ -58,7 +58,11 @@ def refresh_logger() -> None:
# Daft top-level imports
###

from daft.catalog import read_table, register_table
from daft.catalog import (
Identifier,
read_table,
register_table,
)
from daft.context import set_execution_config, set_planning_config, execution_config_ctx, planning_config_ctx
from daft.convert import (
from_arrow,
@@ -72,7 +76,7 @@ def refresh_logger() -> None:
from daft.dataframe import DataFrame
from daft.logical.schema import Schema
from daft.datatype import DataType, TimeUnit
from daft.expressions import Expression, col, lit, interval, coalesce
from daft.expressions import Expression, col, list_, lit, interval, struct, coalesce
from daft.io import (
DataCatalogTable,
DataCatalogType,
@@ -99,6 +103,7 @@ def refresh_logger() -> None:
"DataFrame",
"DataType",
"Expression",
"Identifier",
"ImageFormat",
"ImageMode",
"ResourceRequest",
@@ -116,6 +121,7 @@ def refresh_logger() -> None:
"from_pylist",
"from_ray_dataset",
"interval",
"list_",
"lit",
"planning_config_ctx",
"read_csv",
@@ -134,6 +140,7 @@ def refresh_logger() -> None:
"set_planning_config",
"sql",
"sql_expr",
"struct",
"to_struct",
"udf",
]
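Not part of the commit: a minimal usage sketch of the newly exported list_ and struct expression constructors, assuming they build list and struct columns from the given column expressions (column names here are illustrative).

import daft
from daft import col, list_, struct

df = daft.from_pydict({"a": [1, 2], "b": [3, 4]})
df = df.select(
    list_(col("a"), col("b")).alias("pair"),     # assumed: list column built from two columns
    struct(col("a"), col("b")).alias("record"),  # assumed: struct column built from two columns
)
df.show()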
59 changes: 59 additions & 0 deletions daft/catalog/__init__.py
@@ -40,6 +40,7 @@

from __future__ import annotations

from collections.abc import Sequence
from daft.daft import catalog as native_catalog
from daft.logical.builder import LogicalPlanBuilder

@@ -53,6 +54,7 @@


__all__ = [
"Identifier",
"read_table",
"register_python_catalog",
"register_table",
@@ -156,3 +158,60 @@ def register_python_catalog(catalog: PyIcebergCatalog | UnityCatalog, name: str
raise ValueError(f"Unsupported catalog type: {type(catalog)}")

return native_catalog.register_python_catalog(python_catalog, name)


class Identifier(Sequence):
    """A reference (path) to a catalog object.
    Example:
        >>> id1 = Identifier("a", "b")
        >>> id2 = Identifier.parse("a.b")
    """

    _identifier: native_catalog.PyIdentifier

    def __init__(self, *parts: str):
        """Creates an Identifier from its parts.
        Example:
            >>> id = Identifier("schema", "table")
            >>> id  # Identifier('schema.table')
        Returns:
            Identifier: A new identifier.
        """
        if len(parts) < 1:
            raise ValueError("Identifier requires at least one part.")
        self._identifier = native_catalog.PyIdentifier(parts[:-1], parts[-1])

    @staticmethod
    def parse(input: str) -> Identifier:
        """Parses an Identifier from an SQL string.
        Example:
            >>> id = Identifier.parse("schema.table")
            >>> assert len(id) == 2
        Returns:
            Identifier: A new identifier.
        """
        i = Identifier.__new__(Identifier)
        i._identifier = native_catalog.PyIdentifier.parse(input)
        return i

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Identifier):
            return False
        return self._identifier.eq(other._identifier)

    def __getitem__(self, index: int | slice) -> str | Sequence[str]:
        if isinstance(index, slice):
            raise IndexError("slicing not supported")
        if isinstance(index, int):
            return self._identifier.getitem(index)

    def __len__(self) -> int:
        return self._identifier.__len__()

    def __repr__(self) -> str:
        return f"Identifier('{self._identifier.__repr__()}')"
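Not part of the commit: a brief usage sketch of the new Identifier class, based on the doctests above. It assumes Identifier.parse splits on the dot and that an identifier constructed from the same parts compares equal.

from daft.catalog import Identifier

ident = Identifier("schema", "table")      # construct from parts
parsed = Identifier.parse("schema.table")  # parse an SQL-style dotted string

assert len(ident) == 2
assert ident[0] == "schema" and ident[1] == "table"  # assumed: index returns the part at that position
assert ident == parsed                                # assumed: equality by parts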
5 changes: 4 additions & 1 deletion daft/context.py
@@ -331,6 +331,7 @@ def set_execution_config(
config: PyDaftExecutionConfig | None = None,
scan_tasks_min_size_bytes: int | None = None,
scan_tasks_max_size_bytes: int | None = None,
max_sources_per_scan_task: int | None = None,
broadcast_join_size_bytes_threshold: int | None = None,
parquet_split_row_groups_max_files: int | None = None,
sort_merge_join_sort_with_aligned_boundaries: bool | None = None,
@@ -368,6 +369,7 @@ def set_execution_config(
scan_tasks_max_size_bytes: Maximum size in bytes when merging ScanTasks when reading files from storage.
Increasing this value will increase the upper bound of the size of merged ScanTasks, which leads to bigger but
fewer partitions. (Defaults to 384 MiB)
max_sources_per_scan_task: Maximum number of sources in a single ScanTask. (Defaults to 10)
broadcast_join_size_bytes_threshold: If one side of a join is smaller than this threshold, a broadcast join will be used.
Default is 10 MiB.
parquet_split_row_groups_max_files: Maximum number of files to read in which the row group splitting should happen. (Defaults to 10)
@@ -393,7 +395,7 @@
enable_aqe: Enables Adaptive Query Execution, Defaults to False
enable_native_executor: Enables the native executor, Defaults to False
default_morsel_size: Default size of morsels used for the new local executor. Defaults to 131072 rows.
shuffle_algorithm: The shuffle algorithm to use. Defaults to "map_reduce". Other options are "pre_shuffle_merge".
shuffle_algorithm: The shuffle algorithm to use. Defaults to "auto", which will let Daft determine the algorithm. Options are "map_reduce" and "pre_shuffle_merge".
pre_shuffle_merge_threshold: Memory threshold in bytes for pre-shuffle merge. Defaults to 1GB
enable_ray_tracing: Enable tracing for Ray. Accessible in `/tmp/ray/session_latest/logs/daft` after the run completes. Defaults to False.
scantask_splitting_level: How aggressively to split scan tasks. Setting this to `2` will use a more aggressive ScanTask splitting algorithm which might be more expensive to run but results in more even splits of partitions. Defaults to 1.
@@ -406,6 +408,7 @@
new_daft_execution_config = old_daft_execution_config.with_config_values(
scan_tasks_min_size_bytes=scan_tasks_min_size_bytes,
scan_tasks_max_size_bytes=scan_tasks_max_size_bytes,
max_sources_per_scan_task=max_sources_per_scan_task,
broadcast_join_size_bytes_threshold=broadcast_join_size_bytes_threshold,
parquet_split_row_groups_max_files=parquet_split_row_groups_max_files,
sort_merge_join_sort_with_aligned_boundaries=sort_merge_join_sort_with_aligned_boundaries,
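Not part of the commit: an illustrative sketch of setting the new and updated execution options documented above. The parameter names and option strings come from the diff; the values are arbitrary examples, not recommendations.

import daft

daft.set_execution_config(
    max_sources_per_scan_task=10,           # new knob: cap on sources merged into a single ScanTask (default 10)
    shuffle_algorithm="pre_shuffle_merge",  # or "map_reduce"; leaving it unset keeps the "auto" default
)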
(Remaining file diffs not rendered.)