Add ruff rules for pylint (#556)
cbornet authored Jul 10, 2024
1 parent 1546775 commit b1d2468
Showing 26 changed files with 95 additions and 71 deletions.
2 changes: 1 addition & 1 deletion examples/notebooks/llama-parse-astra.ipynb
@@ -151,7 +151,7 @@
"response = requests.get(url, timeout=30)\n",
"\n",
"# Check if the request was successful\n",
"if response.status_code == 200:\n",
"if response.status_code == requests.codes.ok:\n",
" # Open the file in binary write mode and save the content\n",
" with open(file_path, \"wb\") as file:\n",
" file.write(response.content)\n",
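The change above replaces the literal 200 with requests.codes.ok, the usual way to satisfy ruff's magic-value-comparison rule (PLR2004) for HTTP status checks. A minimal sketch of the pattern, assuming a hypothetical URL and file path:

import requests

url = "https://example.com/report.pdf"  # hypothetical URL, for illustration only
file_path = "report.pdf"

response = requests.get(url, timeout=30)

# requests.codes.ok is the named constant for HTTP 200, so no bare magic
# number appears in the comparison.
if response.status_code == requests.codes.ok:
    # Open the file in binary write mode and save the content
    with open(file_path, "wb") as file:
        file.write(response.content)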
2 changes: 0 additions & 2 deletions examples/notebooks/qa-maximal-marginal-relevance.ipynb
@@ -648,7 +648,6 @@
" llm,\n",
" retriever=retriever,\n",
")\n",
"#\n",
"response = chain({chain.question_key: QUESTION})\n",
"print(\"Similarity-based chain:\")\n",
"print(f' ANSWER : {response[\"answer\"].strip()}')\n",
@@ -690,7 +689,6 @@
" llm,\n",
" retriever=retriever,\n",
")\n",
"#\n",
"response = chain({chain.question_key: QUESTION})\n",
"print(\"MMR-based chain:\")\n",
"print(f' ANSWER : {response[\"answer\"].strip()}')\n",
7 changes: 5 additions & 2 deletions libs/colbert/ragstack_colbert/colbert_retriever.py
@@ -37,8 +37,11 @@ def all_gpus_support_fp16(is_cuda: Optional[bool] = False):
for device_id in range(torch.cuda.device_count()):
compute_capability = torch.cuda.get_device_capability(device_id)
# FP16 support requires compute capability of 5.3 or higher
if compute_capability[0] < 5 or (
compute_capability[0] == 5 and compute_capability[1] < 3
min_compute_capability_major = 5
min_compute_capability_minor = 3
if compute_capability[0] < min_compute_capability_major or (
compute_capability[0] == min_compute_capability_major
and compute_capability[1] < min_compute_capability_minor
):
logging.info(
"Device %s with compute capability %s does not support FP16 "
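Rather than a per-line suppression, this change names the magic numbers so the FP16 requirement (compute capability 5.3 or higher) reads directly from the code. A minimal sketch of the same check, using a hypothetical helper gpu_supports_fp16 in place of the module's all_gpus_support_fp16:

import torch

MIN_COMPUTE_CAPABILITY_MAJOR = 5
MIN_COMPUTE_CAPABILITY_MINOR = 3


def gpu_supports_fp16(device_id: int = 0) -> bool:
    """Hypothetical helper: True if the device's compute capability is >= 5.3."""
    major, minor = torch.cuda.get_device_capability(device_id)
    # Tuple comparison expresses "major.minor >= 5.3" without magic values.
    return (major, minor) >= (MIN_COMPUTE_CAPABILITY_MAJOR, MIN_COMPUTE_CAPABILITY_MINOR)


if torch.cuda.is_available():
    print(all(gpu_supports_fp16(i) for i in range(torch.cuda.device_count())))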
4 changes: 2 additions & 2 deletions libs/colbert/tests/integration_tests/test_database.py
@@ -51,7 +51,7 @@ def test_database_sync(request, vector_store: str):

results = database.add_chunks(chunks=[chunk_0, chunk_1])

assert len(results) == 2
assert len(results) == 2 # noqa: PLR2004
assert results[0] == (doc_id, 0)
assert results[1] == (doc_id, 1)

@@ -94,7 +94,7 @@ async def test_database_async(request, vector_store: str):
)

results = await database.aadd_chunks(chunks=[chunk_0, chunk_1])
assert len(results) == 2
assert len(results) == 2 # noqa: PLR2004
assert results[0] == (doc_id, 0)
assert results[1] == (doc_id, 1)

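In the test suites the commit takes the other route for PLR2004: small literal counts stay in the assertions and the rule is silenced per line with a targeted noqa. A minimal sketch of the pattern, with a stand-in list in place of the real add_chunks result:

# Stand-in for the (doc_id, chunk_index) pairs returned by add_chunks().
results = [("doc-1", 0), ("doc-1", 1)]

# The literal 2 is self-explanatory here, so the magic-value rule is
# suppressed for this line only instead of introducing a constant.
assert len(results) == 2  # noqa: PLR2004
assert results[0] == ("doc-1", 0)
assert results[1] == ("doc-1", 1)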
@@ -73,10 +73,11 @@ def chunk_texts(text, chunk_size, overlap_size):

retriever = store.as_retriever()

k = 5
chunk_scores = retriever.text_search(
query_text="what kind fish lives shallow coral reefs", k=5
query_text="what kind fish lives shallow coral reefs", k=k
)
assert len(chunk_scores) == 5
assert len(chunk_scores) == k
for chunk, score in chunk_scores:
logging.info("got chunk_id %s with score %s", chunk.chunk_id, score)

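This retrieval test avoids the magic value differently: the expected count is bound to a variable that is passed to the call and reused in the assertion. A self-contained sketch of that shape, with a hypothetical text_search function standing in for the retriever:

def text_search(query_text: str, k: int) -> list[tuple[str, float]]:
    """Hypothetical stand-in that returns k (chunk_id, score) pairs."""
    return [(f"chunk-{i}", 1.0 - i / k) for i in range(k)]


k = 5
chunk_scores = text_search(query_text="what kind fish lives shallow coral reefs", k=k)

# Asserting against k means the number 5 appears only once, as an input.
assert len(chunk_scores) == k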
31 changes: 18 additions & 13 deletions libs/colbert/tests/unit_tests/test_colbert_baseline_embeddings.py
@@ -6,7 +6,7 @@
from colbert.infra.config import ColBERTConfig
from colbert.modeling.checkpoint import Checkpoint
from ragstack_colbert import ColbertEmbeddingModel, Embedding
from ragstack_colbert.constant import DEFAULT_COLBERT_MODEL
from ragstack_colbert.constant import DEFAULT_COLBERT_DIM, DEFAULT_COLBERT_MODEL
from torch import Tensor
from torch.nn.functional import cosine_similarity

@@ -37,6 +37,10 @@
arctic_botany_chunks = list(arctic_botany_dict.values())


MINIMUM_SIMILARITY = 0.999
MAXIMUM_DISTANCE = 0.001


# a utility function to evaluate similarity of two embeddings at the per-token level
def are_they_similar(embedded_chunks: List[Embedding], tensors: List[Tensor]):
n = 0
@@ -47,12 +51,12 @@ def are_they_similar(embedded_chunks: List[Embedding], tensors: List[Tensor]):
assert vector_tensor.shape == tensors[n].shape

# we still have outliers over the specified limit, but they are almost 0
assert pdist(vector_tensor, tensors[n]).item() < 0.0001
assert pdist(vector_tensor, tensors[n]).item() < MAXIMUM_DISTANCE

similarity = cosine_similarity(
vector_tensor.unsqueeze(0), tensors[n].unsqueeze(0)
)
assert similarity.item() > 0.999
assert similarity.item() > MINIMUM_SIMILARITY
n = n + 1

assert n == len(tensors)
@@ -84,22 +88,22 @@ def test_embeddings_with_baseline():
vector_tensor = torch.tensor(vector)
embedded_tensors.append(vector_tensor)
distance = torch.norm(vector_tensor - baseline_tensors[n])
assert abs(distance) < 0.001
assert abs(distance) < MAXIMUM_DISTANCE
# another way to measure pairwise distance
# it must be positive since it comes from a square root
assert pdist(vector_tensor, baseline_tensors[n]).item() < 0.001
assert pdist(vector_tensor, baseline_tensors[n]).item() < MAXIMUM_DISTANCE

similarity = cosine_similarity(
vector_tensor.unsqueeze(0), baseline_tensors[n].unsqueeze(0)
)
assert similarity.shape == torch.Size([1]) # this has to be scalar
# debug code to identify which token deviates
if similarity.item() < 0.99:
if similarity.item() < MINIMUM_SIMILARITY:
logging.warning("n = %s, similarity = %s", n, similarity.item())
assert similarity.item() > 0.99
assert similarity.item() > MINIMUM_SIMILARITY
n = n + 1

assert len(embedded_tensors) == 645
assert len(embedded_tensors) == 645 # noqa: PLR2004

# test against the same function to make sure it generates the same embeddings.
# use the same ColBERT configurations but reload the checkpoint with the default
@@ -138,22 +142,23 @@ def model_embedding(model: str):
)
embeddings = colbert_svc.embed_texts(arctic_botany_chunks)

assert len(embeddings) == 8
assert len(embeddings) == len(arctic_botany_chunks)
n = 0
for embedding in embeddings:
for vector in embedding:
assert len(vector) == 128
assert len(vector) == DEFAULT_COLBERT_DIM
n = n + 1

assert n == 645
assert n == 645 # noqa: PLR2004

# recall embeddings test
query_maxlen = 32
embedding = colbert_svc.embed_query(
query="What adaptations enable Arctic plants to survive and thrive "
"in extremely cold temperatures and minimal sunlight?",
query_maxlen=32,
query_maxlen=query_maxlen,
)
assert len(embedding) == 32
assert len(embedding) == query_maxlen


def test_compatible_models():
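This module hoists the repeated similarity and distance thresholds (0.999 and 0.001) into module-level constants so every per-token check compares against a single named tolerance. A small sketch of the idea with synthetic tensors rather than real ColBERT embeddings:

import torch
from torch.nn.functional import cosine_similarity

MINIMUM_SIMILARITY = 0.999
MAXIMUM_DISTANCE = 0.001

# Two nearly identical batches of 128-dimensional "token embeddings".
baseline = torch.randn(8, 128)
candidate = baseline + 1e-6 * torch.randn(8, 128)

for expected, actual in zip(baseline, candidate):
    # Euclidean distance and cosine similarity checked against named tolerances.
    assert torch.dist(expected, actual).item() < MAXIMUM_DISTANCE
    similarity = cosine_similarity(expected.unsqueeze(0), actual.unsqueeze(0))
    assert similarity.item() > MINIMUM_SIMILARITY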
9 changes: 5 additions & 4 deletions libs/colbert/tests/unit_tests/test_colbert_embeddings.py
@@ -10,7 +10,7 @@ def test_colbert_token_embeddings():

embeddings = colbert.embed_texts(texts=texts)

assert len(embeddings) == 2
assert len(embeddings) == 2 # noqa: PLR2004
assert len(embeddings[0][0]) == DEFAULT_COLBERT_DIM


@@ -27,7 +27,7 @@ def test_colbert_token_embeddings_with_params():

embeddings = colbert.embed_texts(texts=texts)

assert len(embeddings) == 3
assert len(embeddings) == 3 # noqa: PLR2004

assert len(embeddings[0][0]) == DEFAULT_COLBERT_DIM

@@ -40,5 +40,6 @@ def test_colbert_query_embeddings():
assert query_tensor.shape == (12, 128)

# test query encoding
embedding = colbert.embed_query("test-query", query_maxlen=512)
assert len(embedding) == 512
query_maxlen = 512
embedding = colbert.embed_query("test-query", query_maxlen=query_maxlen)
assert len(embedding) == query_maxlen
4 changes: 2 additions & 2 deletions libs/colbert/tests/unit_tests/test_colbert_retriever.py
@@ -30,7 +30,7 @@ def test_max_similarity_torch():

def test_query_maxlen_calculation():
tokens = [["word1"], ["word2", "word3"]]
assert calculate_query_maxlen(tokens) == 5
assert calculate_query_maxlen(tokens) == 5 # noqa: PLR2004

tokens = [["word1", "word2", "word3"], ["word1", "word2"]]
assert calculate_query_maxlen(tokens) == 6
assert calculate_query_maxlen(tokens) == 6 # noqa: PLR2004
20 changes: 11 additions & 9 deletions libs/e2e-tests/e2e_tests/langchain/test_astra.py
@@ -18,6 +18,8 @@
from e2e_tests.test_utils.astradb_vector_store_handler import AstraDBVectorStoreHandler
from e2e_tests.test_utils.vector_store_handler import VectorStoreImplementation

MINIMUM_ACCEPTABLE_SCORE = 0.1


def test_basic_vector_search(vectorstore: AstraDBVectorStore):
print("Running test_basic_vector_search")
@@ -236,7 +238,7 @@ def test_vector_search_with_metadata(vectorstore: VectorStore):
)

documents = vectorstore.search("RAGStack", "similarity")
assert len(documents) == 2
assert len(documents) == 2 # noqa: PLR2004

documents = vectorstore.search(
"RAGStack", "similarity", filter={"context": "homepage"}
@@ -249,7 +251,7 @@ def test_vector_search_with_metadata(vectorstore: VectorStore):
)

documents = vectorstore.search("RAGStack", "mmr")
assert len(documents) == 2
assert len(documents) == 2 # noqa: PLR2004

documents = vectorstore.search("RAGStack", "mmr", filter={"context": "homepage"})
assert len(documents) == 1
@@ -287,7 +289,7 @@ def test_vector_search_with_metadata(vectorstore: VectorStore):
assert len(documents_with_score) == 1
# the elements are Tuple(document, score)
score = documents_with_score[0][1]
assert score > 0.1
assert score > MINIMUM_ACCEPTABLE_SCORE

verify_document(
documents_with_score[0][0],
@@ -303,7 +305,7 @@ def test_vector_search_with_metadata(vectorstore: VectorStore):
assert len(documents_with_score) == 1
# the elements are Tuple(document, score)
score = documents_with_score[0][1]
assert score > 0.1
assert score > MINIMUM_ACCEPTABLE_SCORE

verify_document(
documents_with_score[0][0],
Expand All @@ -317,7 +319,7 @@ def test_vector_search_with_metadata(vectorstore: VectorStore):
assert len(documents_with_score) == 1
# the elements are Tuple(document, score)
score = documents_with_score[0][1]
assert score > 0.1
assert score > MINIMUM_ACCEPTABLE_SCORE

# test for max_marginal_relevance_search_by_vector

@@ -349,7 +351,7 @@ def test_vector_search_with_metadata(vectorstore: VectorStore):
assert len(documents) == 1

documents = vectorstore.similarity_search_by_vector(embedding=vector, k=2)
assert len(documents) == 2
assert len(documents) == 2 # noqa: PLR2004

documents = vectorstore.similarity_search_by_vector(
embedding=vector, k=1, filter={"context": "none"}
Expand Down Expand Up @@ -382,18 +384,18 @@ def test_vector_search_with_metadata(vectorstore: VectorStore):

retriever = vectorstore.as_retriever()
documents = retriever.get_relevant_documents("RAGStack")
assert len(documents) == 2
assert len(documents) == 2 # noqa: PLR2004

documents = retriever.invoke("RAGStack", RunnableConfig(tags=["custom_retriever"]))
assert len(documents) == 2
assert len(documents) == 2 # noqa: PLR2004

retriever = vectorstore.as_retriever(search_kwargs={"k": 1})
documents = retriever.get_relevant_documents("RAGStack")
assert len(documents) == 1

retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
documents = retriever.get_relevant_documents("RAGStack")
assert len(documents) == 2
assert len(documents) == 2 # noqa: PLR2004

# delete all the documents
vectorstore.delete(document_ids)
10 changes: 6 additions & 4 deletions libs/e2e-tests/e2e_tests/langchain/test_document_loaders.py
@@ -34,7 +34,7 @@ def test_csv_loader():
]
)
docs = CSVLoader(file_path=temp_csv_file.name).load()
assert len(docs) == 3
assert len(docs) == 3 # noqa: PLR2004

doc1 = docs[0]
assert doc1.page_content == "column1: value1\ncolumn2: value2\ncolumn3: value3"
@@ -48,7 +48,7 @@ def test_web_based_loader():
)
loader.requests_per_second = 1
docs = loader.load()
assert len(docs) == 2
assert len(docs) == 2 # noqa: PLR2004

doc1 = docs[0]
assert "0.1.0 - Oct 4, 2023" in doc1.page_content
@@ -150,19 +150,21 @@ def test_astradb_loader() -> None:
[{"foo": "bar2", "baz": "qux"}] * 4 + [{"foo": "bar", "baz": "qux"}] * 4
)

doc_number = 22

loader = AstraDBLoader(
astra_ref.collection,
token=astra_ref.token,
api_endpoint=astra_ref.api_endpoint,
nb_prefetched=1,
projection={"foo": 1},
find_options={"limit": 22},
find_options={"limit": doc_number},
filter_criteria={"foo": "bar"},
page_content_mapper=lambda r: "Payload: " + json.dumps(r),
)
docs = loader.load()

assert len(docs) == 22
assert len(docs) == doc_number
ids = set()
for doc in docs:
assert doc.page_content.startswith("Payload: ")
2 changes: 1 addition & 1 deletion libs/e2e-tests/e2e_tests/llama_index/test_astra.py
@@ -186,7 +186,7 @@ def test_vector_search_with_metadata(environment: Environment):
)

documents = index.as_retriever().retrieve("RAGStack")
assert len(documents) == 2
assert len(documents) == 2 # noqa: PLR2004

# delete all the documents
for doc_id in document_ids:
@@ -65,7 +65,6 @@ def await_ongoing_deletions_completed(self):
"%s deletions still running, waiting to complete", pending_deletions
)
time.sleep(1)
return

def run_delete(self, collection: str):
"""Runs a delete_collection in the background, blocking if max_workers are
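The only change in this handler is dropping the bare return at the end of the polling loop, which is what the useless-return rule (ruff PLR1711) flags: a function that falls off the end already returns None. A minimal sketch of the shape, with a hypothetical pending_deletions callable in place of the handler's internals:

import logging
import time
from typing import Callable


def await_ongoing_deletions_completed(pending_deletions: Callable[[], int]) -> None:
    """Hypothetical sketch: poll a deletion counter until it reaches zero."""
    while (remaining := pending_deletions()) > 0:
        logging.info("%s deletions still running, waiting to complete", remaining)
        time.sleep(1)
    # No trailing bare "return": falling off the end already returns None.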
3 changes: 2 additions & 1 deletion libs/knowledge-store/notebooks/astra_support.ipynb
@@ -278,6 +278,7 @@
"source": [
"not_found = 0\n",
"found = 0\n",
"BATCH_SIZE = 50\n",
"\n",
"docs = []\n",
"async for doc in load_pages(URLS):\n",
@@ -288,7 +289,7 @@
" docs.append(doc)\n",
" found += 1\n",
"\n",
" if len(docs) >= 50:\n",
" if len(docs) >= BATCH_SIZE:\n",
" graph_store.add_documents(docs)\n",
" docs.clear()\n",
"\n",
4 changes: 2 additions & 2 deletions libs/knowledge-store/ragstack_knowledge_store/graph_store.py
@@ -330,11 +330,11 @@ def add_nodes(
link_from_tags = set() # link from these tags

for tag in links:
if tag.direction == "in" or tag.direction == "bidir":
if tag.direction in {"in", "bidir"}:
# An incoming link should be linked *from* nodes with the given
# tag.
link_from_tags.add((tag.kind, tag.tag))
if tag.direction == "out" or tag.direction == "bidir":
if tag.direction in {"out", "bidir"}:
link_to_tags.add((tag.kind, tag.tag))

metadata_blob = _serialize_metadata(metadata)
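In graph_store.py the chained equality checks on the link direction become set-membership tests, the rewrite suggested by ruff's repeated-equality-comparison rule (PLR1714). A minimal sketch with a hypothetical Link dataclass standing in for the real link objects:

from dataclasses import dataclass


@dataclass
class Link:
    """Hypothetical stand-in for the real link objects."""
    kind: str
    tag: str
    direction: str  # "in", "out", or "bidir"


def split_links(links: list[Link]) -> tuple[set[tuple[str, str]], set[tuple[str, str]]]:
    link_from_tags: set[tuple[str, str]] = set()  # link from these tags
    link_to_tags: set[tuple[str, str]] = set()    # link to these tags
    for link in links:
        # Membership tests replace chains like `x == "in" or x == "bidir"`.
        if link.direction in {"in", "bidir"}:
            link_from_tags.add((link.kind, link.tag))
        if link.direction in {"out", "bidir"}:
            link_to_tags.add((link.kind, link.tag))
    return link_from_tags, link_to_tags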
4 changes: 2 additions & 2 deletions libs/langchain/tests/integration_tests/test_colbert.py
@@ -112,7 +112,7 @@ def test_sync_from_docs(request, vector_store: str):
"What are Xenospheric Particulates?"
)

assert len(results) > 3
assert len(results) > 3 # noqa: PLR2004
assert results[1][1] > 0 # check score from result 2
assert results[2][1] > 0 # check score from result 3
assert results[1][1] > results[2][1] # check that scores are returned in order
@@ -171,7 +171,7 @@ async def test_async_from_docs(request, vector_store: str):
"What are Xenospheric Particulates?"
)

assert len(results) > 3
assert len(results) > 3 # noqa: PLR2004
assert results[1][1] > 0 # check score from result 2
assert results[2][1] > 0 # check score from result 3
assert results[1][1] > results[2][1] # check that scores are returned in order