-
Notifications
You must be signed in to change notification settings - Fork 39
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
5b25d9a
commit 6fee042
Showing
8 changed files
with
496 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# None of these targets produce a file of the same name, so mark them phony;
# otherwise a stray file called e.g. "uv-run-test" would suppress the recipe.
.PHONY: uv-install-python uv-update-lock-file uv-install-dependencies \
	uv-run-dev uv-run-test uv-run-test-coverage

# Install the Python interpreter via uv.
uv-install-python:
	uv python install

# Refresh the uv lock file.
uv-update-lock-file:
	uv lock

# Sync the environment, including the dev dependency group.
uv-install-dependencies:
	uv sync --dev

# Run the "dev" poe task.
uv-run-dev:
	uv run poe dev

# Run the "test" poe task.
uv-run-test:
	uv run poe test

# Run the "test-coverage" poe task.
uv-run-test-coverage:
	uv run poe test-coverage
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
[project] | ||
name = "lance" | ||
version = "1.0.0" | ||
requires-python = "~=3.13.0" | ||
dependencies = [ | ||
"pandas==2.2.3", | ||
"polars==1.18.0", | ||
"pylance==0.21.0", | ||
"tqdm==4.67.1", | ||
] | ||
|
||
[dependency-groups] | ||
dev = [ | ||
"poethepoet==0.31.1", | ||
"pytest==8.3.4", | ||
"pytest-cov==6.0.0", | ||
] | ||
|
||
[tool.uv] | ||
package = false | ||
|
||
[tool.poe.tasks] | ||
dev = "python src/main.py" | ||
test = "pytest --verbose --verbose" | ||
test-coverage = "pytest --cov=. --cov-report=xml" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
class TestDummy:
    """Placeholder test case that checks trivial integer arithmetic."""

    def test_dummy(self) -> None:
        """Assert that adding one and one yields two."""
        total = 1 + 1
        assert total == 2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
import logging | ||
|
||
import lance | ||
import numpy as np | ||
from lance.vector import vec_to_table | ||
|
||
|
||
def main() -> None:
    """Build a demo Lance vector dataset, index it with IVF_PQ, and query it.

    The function:

    1. Generates ``num_vectors`` random vectors and plants a recognisable
       pattern in row 0 plus a slightly perturbed copy of it in row 1.
    2. Writes the vectors to a Lance dataset at a fixed path, overwriting
       any previous dataset there.
    3. Builds an IVF_PQ index on the ``"vector"`` column.
    4. Reopens the dataset and logs the ``k`` nearest neighbours of row 1.

    Progress and results are reported through the root ``logging`` logger;
    nothing is returned.
    """
    # Create sample vectors (minimum 5000 recommended for meaningful indexing)
    num_vectors = 5000
    vector_dim = 128  # Dimension of each vector (common for embeddings)
    vectors = np.random.randn(num_vectors, vector_dim)

    # Plant two distinct vectors at the beginning for demonstration.
    # Row 0 has a clear pattern: 32 ones, 32 twos, 32 threes, 32 fours.
    vectors[0] = np.repeat([1.0, 2.0, 3.0, 4.0], 32)
    # Row 1 is row 0 plus small Gaussian noise, so the two are near neighbours.
    vectors[1] = vectors[0] + np.random.randn(vector_dim) * 0.1

    # Convert to Lance table
    vector_table = vec_to_table(vectors)

    # Save to Lance dataset
    uri = "/tmp/lancedb/vectors.lance"
    dataset = lance.write_dataset(vector_table, uri, mode="overwrite")
    logging.info(
        "Dataset saved to %s with %d vectors of dimension %d",
        uri,
        num_vectors,
        vector_dim,
    )

    # https://lancedb.github.io/lancedb/concepts/index_ivfpq/
    # Create an index for vector similarity search.
    # IVF-PQ is a composite index that combines an inverted file index (IVF)
    # with product quantization (PQ):
    # - IVF divides the vector space into Voronoi cells using K-means clustering
    # - PQ compresses vectors by splitting them into sub-vectors and
    #   quantizing each sub-vector
    dataset.create_index(
        "vector",
        index_type="IVF_PQ",
        # num_partitions: The number of partitions (Voronoi cells) in the IVF portion
        # - Controls how the vector space is divided
        # - Higher values increase query throughput but may reduce recall
        # - Should be chosen to target a particular number of vectors per partition
        # - For 5000 vectors, we use 64 partitions (~78 vectors per partition)
        num_partitions=64,
        # num_sub_vectors: The number of sub-vectors created during Product Quantization (PQ)
        # - Controls the compression level and search accuracy
        # - Chosen based on desired recall and vector dimensionality
        #   (here 128 dims / 16 sub-vectors = 8 dims per sub-vector)
        # - Trade-off: more sub-vectors = finer quantization and better accuracy
        #   but a larger index; fewer sub-vectors = stronger compression but
        #   potentially lower accuracy
        num_sub_vectors=16,
    )
    logging.info("Created vector similarity index")

    # Read back the dataset
    dataset = lance.dataset(uri)

    # Perform vector similarity search. The query is row 1, i.e. the noisy
    # copy of the planted pattern, not the exact pattern stored in row 0.
    query_vector = vectors[1]
    logging.info(
        "Performing similarity search for a noisy copy of the pattern "
        "[1.0]*32 + [2.0]*32 + [3.0]*32 + [4.0]*32"
    )

    # Find 5 nearest neighbors
    # Note: For better accuracy, you can use nprobes (5-10% of dataset) and refine_factor
    k = 5
    results = dataset.to_table(
        nearest={
            "column": "vector",
            "k": k,
            "q": query_vector,
        }
    ).to_pandas()

    logging.info("Nearest neighbors (distances show similarity, lower = more similar):")
    # Rank results by position rather than by DataFrame index so the numbering
    # is always 1..k, and pass lazy %-args like the other logging calls here.
    ranked = enumerate(zip(results["_distance"], results["vector"]), start=1)
    for rank, (distance, vector) in ranked:
        vector_preview = np.asarray(vector)
        logging.info(
            "Result %d/%d: Distance: %.4f, Vector preview: %s...",
            rank,
            k,
            distance,
            vector_preview[:8],
        )
|
||
|
||
if __name__ == "__main__":
    # Configure the root logger at INFO level before running the demo.
    logging.basicConfig(level=logging.INFO)
    main()
Oops, something went wrong.