Skip to content

Commit

Permalink
Merge bf7af03 into 5b25d9a
Browse files Browse the repository at this point in the history
  • Loading branch information
hongbo-miao authored Dec 27, 2024
2 parents 5b25d9a + bf7af03 commit a7b832b
Show file tree
Hide file tree
Showing 10 changed files with 498 additions and 0 deletions.
1 change: 1 addition & 0 deletions .github/workflows/.static-type-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ jobs:
uv run poe static-type-check-python --package=data-storage.hm-duckdb.query-protobuf
uv run poe static-type-check-python --package=data-storage.hm-lancedb
uv run poe static-type-check-python --package=data-storage.hm-protobuf
uv run poe static-type-check-python --package=data-storage.lance
uv run poe static-type-check-python --package=data-visualization.grafana.hm-dashboard
uv run poe static-type-check-python --package=data-visualization.iads.iads-data-manager.iads-config-reader
uv run poe static-type-check-python --package=data-visualization.iads.iads-data-manager.iads-data-reader
Expand Down
37 changes: 37 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ jobs:
iads-iads-data-producer: ${{ steps.filter.outputs.iads-iads-data-producer }}
iads-zeromq-iads-bridge: ${{ steps.filter.outputs.iads-zeromq-iads-bridge }}
hugging-face-analyze-sentiment: ${{ steps.filter.outputs.hugging-face-analyze-sentiment }}
lance: ${{ steps.filter.outputs.lance }}
kafka-rust-proto-consumer: ${{ steps.filter.outputs.kafka-rust-proto-consumer }}
kafka-rust-proto-producer: ${{ steps.filter.outputs.kafka-rust-proto-producer }}
kafka-rust-udp-kafka-bridge: ${{ steps.filter.outputs.kafka-rust-udp-kafka-bridge }}
Expand Down Expand Up @@ -383,6 +384,9 @@ jobs:
iads-zeromq-iads-bridge:
- '.github/workflows/test.yml'
- 'data-visualization/iads/iads-rtstation/zeromq-iads-bridge/**'
lance:
- '.github/workflows/test.yml'
- 'data-storage/lance/**'
mineru:
- '.github/workflows/test.yml'
- 'machine-learning/mineru/**'
Expand Down Expand Up @@ -1406,6 +1410,39 @@ jobs:
with:
directory: data-storage/hm-duckdb/query-protobuf

lance-test:
name: Lance | Test
needs: detect-changes
if: ${{ needs.detect-changes.outputs.lance == 'true' }}
runs-on: ubuntu-24.04
environment: test
timeout-minutes: 10
steps:
- name: Checkout
uses: actions/checkout@v4.2.2
- name: Install uv
uses: astral-sh/setup-uv@v5.1.0
with:
version: 0.5.11
enable-cache: true
cache-dependency-glob: data-storage/lance/uv.lock
- name: Set up Python
uses: actions/setup-python@v5.3.0
with:
python-version-file: data-storage/lance/pyproject.toml
- name: Install dependencies
working-directory: data-storage/lance
run: |
uv sync --dev
- name: Test
working-directory: data-storage/lance
run: |
uv run poe test-coverage
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v5.1.2
with:
directory: data-storage/lance

lancedb-test:
name: LanceDB | Test
needs: detect-changes
Expand Down
3 changes: 3 additions & 0 deletions .mergify.yml
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,9 @@ pull_request_rules:
- or:
- check-success=DuckDB (query-protobuf) | Test
- check-skipped=DuckDB (query-protobuf) | Test
- or:
- check-success=Lance | Test
- check-skipped=Lance | Test
- or:
- check-success=LanceDB | Test
- check-skipped=LanceDB | Test
Expand Down
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,7 @@ static-type-check-python:
uv run poe static-type-check-python --package=data-storage.hm-duckdb.query-protobuf
uv run poe static-type-check-python --package=data-storage.hm-lancedb
uv run poe static-type-check-python --package=data-storage.hm-protobuf
uv run poe static-type-check-python --package=data-storage.lance
uv run poe static-type-check-python --package=data-visualization.grafana.hm-dashboard
uv run poe static-type-check-python --package=data-visualization.iads.iads-data-manager.iads-config-reader
uv run poe static-type-check-python --package=data-visualization.iads.iads-data-manager.iads-data-reader
Expand Down
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,12 @@ The diagram illustrates the repository's architecture, which is considered overl

## Data

### Data Format

- **Protocol Buffers (Protobuf)** - Data serialization format
- **Apache Parquet** - Columnar file format for big data
- **Lance** - Columnar file format for machine learning

### Database, Data Warehouse, Data Lakehouse

- **Trino** - Distributed SQL query engine
Expand Down
13 changes: 13 additions & 0 deletions data-storage/lance/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
uv-install-python::
uv python install
uv-update-lock-file:
uv lock
uv-install-dependencies:
uv sync --dev

uv-run-dev:
uv run poe dev
uv-run-test:
uv run poe test
uv-run-test-coverage:
uv run poe test-coverage
25 changes: 25 additions & 0 deletions data-storage/lance/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
[project]
name = "lance"
version = "1.0.0"
requires-python = "~=3.13.0"
dependencies = [
"pandas==2.2.3",
"polars==1.18.0",
"pylance==0.21.0",
"tqdm==4.67.1",
]

[dependency-groups]
dev = [
"poethepoet==0.31.1",
"pytest==8.3.4",
"pytest-cov==6.0.0",
]

[tool.uv]
package = false

[tool.poe.tasks]
dev = "python src/main.py"
test = "pytest --verbose --verbose"
test-coverage = "pytest --cov=. --cov-report=xml"
3 changes: 3 additions & 0 deletions data-storage/lance/src/dummy_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
class TestDummy:
    """Placeholder suite so the package's pytest run has at least one test."""

    def test_dummy(self) -> None:
        """Sanity check: integer addition behaves as expected."""
        expected = 2
        assert 1 + 1 == expected
85 changes: 85 additions & 0 deletions data-storage/lance/src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import logging

import lance
import numpy as np
from lance.vector import vec_to_table


def main() -> None:
    """Build a sample Lance vector dataset, index it with IVF-PQ, and run a
    k-nearest-neighbor search demo, logging the results.

    Side effects: writes (overwrites) a Lance dataset at
    /tmp/lancedb/vectors.lance and emits INFO-level log records.
    """
    # Create sample vectors (minimum 5000 recommended for meaningful indexing)
    num_vectors = 5000  # increased from 1000 to meet the minimum recommendation
    vector_dim = 128  # dimension of each vector (common for embeddings)
    vectors = np.random.randn(num_vectors, vector_dim)

    # Plant two related vectors at the front so the search result is predictable:
    # the first has a clear step pattern, the second is a lightly-perturbed copy.
    vectors[0] = np.array([1.0] * 32 + [2.0] * 32 + [3.0] * 32 + [4.0] * 32)
    vectors[1] = vectors[0] + np.random.randn(vector_dim) * 0.1

    # Convert to a Lance-ingestible Arrow table
    vector_table = vec_to_table(vectors)

    # Save to Lance dataset
    uri = "/tmp/lancedb/vectors.lance"
    dataset = lance.write_dataset(vector_table, uri, mode="overwrite")
    logging.info(
        "Dataset saved to %s with %d vectors of dimension %d",
        uri,
        num_vectors,
        vector_dim,
    )

    # https://lancedb.github.io/lancedb/concepts/index_ivfpq/
    # Create an index for vector similarity search
    # IVF-PQ is a composite index that combines inverted file index (IVF) and product quantization (PQ)
    # - IVF divides the vector space into Voronoi cells using K-means clustering
    # - PQ reduces dimensionality by dividing vectors into sub-vectors and quantizing them
    dataset.create_index(
        "vector",
        index_type="IVF_PQ",
        # num_partitions: The number of partitions (Voronoi cells) in the IVF portion
        # - Controls how the vector space is divided
        # - Higher values increase query throughput but may reduce recall
        # - Should be chosen to target a particular number of vectors per partition
        # - For 5000 vectors, we use 64 partitions (~78 vectors per partition)
        num_partitions=64,
        # num_sub_vectors: The number of sub-vectors created during Product Quantization (PQ)
        # - Controls the compression level and search accuracy
        # - Chosen based on desired recall and vector dimensionality
        # - Trade-off: more sub-vectors = better compression but potentially lower accuracy
        num_sub_vectors=16,
    )
    logging.info("Created vector similarity index")

    # Read back the dataset
    dataset = lance.dataset(uri)

    # Perform vector similarity search
    query_vector = vectors[1]
    logging.info(
        "Performing similarity search for vector with pattern [1.0]*32 + [2.0]*32 + [3.0]*32 + [4.0]*32"
    )

    # Find 5 nearest neighbors
    # Note: For better accuracy, you can use nprobes (5-10% of dataset) and refine_factor
    k = 5
    results = dataset.to_table(
        nearest={
            "column": "vector",
            "k": k,
            "q": query_vector,
        }
    ).to_pandas()

    logging.info("Nearest neighbors (distances show similarity, lower = more similar):")
    for idx, row in results.iterrows():
        vector_preview = np.array(row["vector"])
        # Use lazy %-style logging arguments (consistent with the other logging
        # calls in this module) instead of an eager f-string, so formatting is
        # deferred until the record is actually emitted.
        logging.info(
            "Result %d/%d: Distance: %.4f, Vector preview: %s...",
            idx + 1,
            k,
            row["_distance"],
            vector_preview[:8],
        )


if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
main()
Loading

0 comments on commit a7b832b

Please sign in to comment.