feat(lance): add lance

hongbo-miao · Dec 27, 2024 · 6fee042 · 6fee042
1 parent 5b25d9a
commit 6fee042
Show file tree

Hide file tree

Showing 8 changed files with 496 additions and 0 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -102,6 +102,7 @@ jobs:
       iads-iads-data-producer: ${{ steps.filter.outputs.iads-iads-data-producer }}
       iads-zeromq-iads-bridge: ${{ steps.filter.outputs.iads-zeromq-iads-bridge }}
       hugging-face-analyze-sentiment: ${{ steps.filter.outputs.hugging-face-analyze-sentiment }}
+      lance: ${{ steps.filter.outputs.lance }}
       kafka-rust-proto-consumer: ${{ steps.filter.outputs.kafka-rust-proto-consumer }}
       kafka-rust-proto-producer: ${{ steps.filter.outputs.kafka-rust-proto-producer }}
       kafka-rust-udp-kafka-bridge: ${{ steps.filter.outputs.kafka-rust-udp-kafka-bridge }}
@@ -383,6 +384,9 @@ jobs:
             iads-zeromq-iads-bridge:
               - '.github/workflows/test.yml'
               - 'data-visualization/iads/iads-rtstation/zeromq-iads-bridge/**'
+            lance:
+              - '.github/workflows/test.yml'
+              - 'data-storage/lance/**'
             mineru:
               - '.github/workflows/test.yml'
               - 'machine-learning/mineru/**'
@@ -1406,6 +1410,39 @@ jobs:
         with:
           directory: data-storage/hm-duckdb/query-protobuf
 
+  # Runs the Lance project's tests with coverage. Gated on the `lance` output of
+  # detect-changes, whose filter watches this workflow file and data-storage/lance/**.
+  lance-test:
+    name: Lance | Test
+    needs: detect-changes
+    if: ${{ needs.detect-changes.outputs.lance == 'true' }}
+    runs-on: ubuntu-24.04
+    environment: test
+    timeout-minutes: 10
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4.2.2
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5.1.0
+        with:
+          version: 0.5.11
+          enable-cache: true
+          cache-dependency-glob: data-storage/lance/uv.lock
+      - name: Set up Python
+        uses: actions/setup-python@v5.3.0
+        with:
+          python-version-file: data-storage/lance/pyproject.toml
+      - name: Install dependencies
+        working-directory: data-storage/lance
+        run: |
+          uv sync --dev
+      - name: Test
+        working-directory: data-storage/lance
+        run: |
+          uv run poe test-coverage
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v5.1.2
+        with:
+          directory: data-storage/lance
+
   lancedb-test:
     name: LanceDB | Test
     needs: detect-changes

diff --git a/.mergify.yml b/.mergify.yml
@@ -263,6 +263,9 @@ pull_request_rules:
       - or:
           - check-success=DuckDB (query-protobuf) | Test
           - check-skipped=DuckDB (query-protobuf) | Test
+      - or:
+          - check-success=Lance | Test
+          - check-skipped=Lance | Test
       - or:
           - check-success=LanceDB | Test
           - check-skipped=LanceDB | Test

diff --git a/README.md b/README.md
@@ -231,6 +231,12 @@ The diagram illustrates the repository's architecture, which is considered overl
 
 ## Data
 
+### Data Format
+
+- **Protocol Buffers (Protobuf)** - Data serialization format
+- **Apache Parquet** - Columnar file format for big data
+- **Lance** - Columnar file format for machine learning
+
 ### Database, Data Warehouse, Data Lakehouse
 
 - **Trino** - Distributed SQL query engine

diff --git a/data-storage/lance/Makefile b/data-storage/lance/Makefile
@@ -0,0 +1,13 @@
+# uv helpers for setting up the project environment
+uv-install-python:
+	uv python install
+uv-update-lock-file:
+	uv lock
+uv-install-dependencies:
+	uv sync --dev
+
+# Wrappers around the poe tasks declared in pyproject.toml ([tool.poe.tasks])
+uv-run-dev:
+	uv run poe dev
+uv-run-test:
+	uv run poe test
+uv-run-test-coverage:
+	uv run poe test-coverage
diff --git a/data-storage/lance/pyproject.toml b/data-storage/lance/pyproject.toml
@@ -0,0 +1,25 @@
+[project]
+name = "lance"
+version = "1.0.0"
+requires-python = "~=3.13.0"
+# NOTE(review): src/main.py imports numpy directly, but numpy is not pinned here;
+# presumably it is pulled in transitively by pandas/pylance — confirm or pin explicitly.
+# NOTE(review): polars and tqdm are not imported by src/main.py — confirm they are needed.
+dependencies = [
+  "pandas==2.2.3",
+  "polars==1.18.0",
+  "pylance==0.21.0",
+  "tqdm==4.67.1",
+]
+
+[dependency-groups]
+dev = [
+  "poethepoet==0.31.1",
+  "pytest==8.3.4",
+  "pytest-cov==6.0.0",
+]
+
+[tool.uv]
+package = false
+
+# Tasks invoked as `uv run poe <task>` by the Makefile and the CI workflow.
+[tool.poe.tasks]
+dev = "python src/main.py"
+test = "pytest --verbose --verbose"
+test-coverage = "pytest --cov=. --cov-report=xml"
diff --git a/data-storage/lance/src/dummy_test.py b/data-storage/lance/src/dummy_test.py
@@ -0,0 +1,3 @@
+class TestDummy:
+    def test_dummy(self):
+        assert 1 + 1 == 2
diff --git a/data-storage/lance/src/main.py b/data-storage/lance/src/main.py
@@ -0,0 +1,85 @@
+import logging
+
+import lance
+import numpy as np
+from lance.vector import vec_to_table
+
+
+def main() -> None:
+    # Create sample vectors (minimum 5000 recommended for meaningful indexing)
+    num_vectors = 5000  # Increased from 1000 to meet minimum recommendation
+    vector_dim = 128  # Dimension of each vector (common for embeddings)
+    vectors = np.random.randn(num_vectors, vector_dim)
+
+    # Create some distinct vectors at the beginning for demonstration
+    # Make the first vector have a clear pattern
+    vectors[0] = np.array([1.0] * 32 + [2.0] * 32 + [3.0] * 32 + [4.0] * 32)
+    # Make the second vector similar to the first but with some variation
+    vectors[1] = vectors[0] + np.random.randn(vector_dim) * 0.1
+
+    # Convert to Lance table
+    vector_table = vec_to_table(vectors)
+
+    # Save to Lance dataset
+    uri = "/tmp/lancedb/vectors.lance"
+    dataset = lance.write_dataset(vector_table, uri, mode="overwrite")
+    logging.info(
+        "Dataset saved to %s with %d vectors of dimension %d",
+        uri,
+        num_vectors,
+        vector_dim,
+    )
+
+    # https://lancedb.github.io/lancedb/concepts/index_ivfpq/
+    # Create an index for vector similarity search
+    # IVF-PQ is a composite index that combines inverted file index (IVF) and product quantization (PQ)
+    # - IVF divides the vector space into Voronoi cells using K-means clustering
+    # - PQ reduces dimensionality by dividing vectors into sub-vectors and quantizing them
+    dataset.create_index(
+        "vector",
+        index_type="IVF_PQ",
+        # num_partitions: The number of partitions (Voronoi cells) in the IVF portion
+        # - Controls how the vector space is divided
+        # - Higher values increase query throughput but may reduce recall
+        # - Should be chosen to target a particular number of vectors per partition
+        # - For 5000 vectors, we use 64 partitions (~78 vectors per partition)
+        num_partitions=64,
+        # num_sub_vectors: The number of sub-vectors created during Product Quantization (PQ)
+        # - Controls the compression level and search accuracy
+        # - Chosen based on desired recall and vector dimensionality
+        # - Trade-off: more sub-vectors = better compression but potentially lower accuracy
+        num_sub_vectors=16,
+    )
+    logging.info("Created vector similarity index")
+
+    # Read back the dataset
+    dataset = lance.dataset(uri)
+
+    # Perform vector similarity search
+    query_vector = vectors[1]
+    logging.info(
+        "Performing similarity search for vector with pattern [1.0]*32 + [2.0]*32 + [3.0]*32 + [4.0]*32"
+    )
+
+    # Find 5 nearest neighbors
+    # Note: For better accuracy, you can use nprobes (5-10% of dataset) and refine_factor
+    k = 5
+    results = dataset.to_table(
+        nearest={
+            "column": "vector",
+            "k": k,
+            "q": query_vector,
+        }
+    ).to_pandas()
+
+    logging.info("Nearest neighbors (distances show similarity, lower = more similar):")
+    for idx, row in results.iterrows():
+        vector_preview = np.array(row["vector"])
+        logging.info(
+            f"Result {idx + 1}/{k}: Distance: {row['_distance']:.4f}, Vector preview: {vector_preview[:8]}..."
+        )
+
+
+# Script entry point: configure INFO-level logging so main()'s progress messages are emitted.
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    main()