Add ruff rules for pylint (#556)
cbornet authored Jul 10, 2024
1 parent 1546775 commit b1d2468
Showing 26 changed files with 95 additions and 71 deletions.
2 changes: 1 addition & 1 deletion examples/notebooks/llama-parse-astra.ipynb
@@ -151,7 +151,7 @@
"response = requests.get(url, timeout=30)\n",
"\n",
"# Check if the request was successful\n",
"if response.status_code == 200:\n",
"if response.status_code == requests.codes.ok:\n",
" # Open the file in binary write mode and save the content\n",
" with open(file_path, \"wb\") as file:\n",
" file.write(response.content)\n",
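The change above replaces the literal 200 with requests.codes.ok, the usual way to satisfy ruff's magic-value-comparison rule (PLR2004) for HTTP status checks. A minimal sketch of the pattern, assuming a hypothetical URL and file path:

import requests

url = "https://example.com/report.pdf"  # hypothetical URL, for illustration only
file_path = "report.pdf"

response = requests.get(url, timeout=30)

# requests.codes.ok is the named constant for HTTP 200, so no bare magic
# number appears in the comparison.
if response.status_code == requests.codes.ok:
    # Open the file in binary write mode and save the content
    with open(file_path, "wb") as file:
        file.write(response.content)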
2 changes: 0 additions & 2 deletions examples/notebooks/qa-maximal-marginal-relevance.ipynb
@@ -648,7 +648,6 @@
" llm,\n",
" retriever=retriever,\n",
")\n",
"#\n",
"response = chain({chain.question_key: QUESTION})\n",
"print(\"Similarity-based chain:\")\n",
"print(f' ANSWER : {response[\"answer\"].strip()}')\n",
@@ -690,7 +689,6 @@
" llm,\n",
" retriever=retriever,\n",
")\n",
"#\n",
"response = chain({chain.question_key: QUESTION})\n",
"print(\"MMR-based chain:\")\n",
"print(f' ANSWER : {response[\"answer\"].strip()}')\n",
7 changes: 5 additions & 2 deletions libs/colbert/ragstack_colbert/colbert_retriever.py
@@ -37,8 +37,11 @@ def all_gpus_support_fp16(is_cuda: Optional[bool] = False):
for device_id in range(torch.cuda.device_count()):
compute_capability = torch.cuda.get_device_capability(device_id)
# FP16 support requires compute capability of 5.3 or higher
if compute_capability[0] < 5 or (
compute_capability[0] == 5 and compute_capability[1] < 3
min_compute_capability_major = 5
min_compute_capability_minor = 3
if compute_capability[0] < min_compute_capability_major or (
compute_capability[0] == min_compute_capability_major
and compute_capability[1] < min_compute_capability_minor
):
logging.info(
"Device %s with compute capability %s does not support FP16 "
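Rather than a per-line suppression, this change names the magic numbers so the FP16 requirement (compute capability 5.3 or higher) reads directly from the code. A minimal sketch of the same check, using a hypothetical helper gpu_supports_fp16 in place of the module's all_gpus_support_fp16:

import torch

MIN_COMPUTE_CAPABILITY_MAJOR = 5
MIN_COMPUTE_CAPABILITY_MINOR = 3


def gpu_supports_fp16(device_id: int = 0) -> bool:
    """Hypothetical helper: True if the device's compute capability is >= 5.3."""
    major, minor = torch.cuda.get_device_capability(device_id)
    # Tuple comparison expresses "major.minor >= 5.3" without magic values.
    return (major, minor) >= (MIN_COMPUTE_CAPABILITY_MAJOR, MIN_COMPUTE_CAPABILITY_MINOR)


if torch.cuda.is_available():
    print(all(gpu_supports_fp16(i) for i in range(torch.cuda.device_count())))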
4 changes: 2 additions & 2 deletions libs/colbert/tests/integration_tests/test_database.py
@@ -51,7 +51,7 @@ def test_database_sync(request, vector_store: str):

results = database.add_chunks(chunks=[chunk_0, chunk_1])

assert len(results) == 2
assert len(results) == 2 # noqa: PLR2004
assert results[0] == (doc_id, 0)
assert results[1] == (doc_id, 1)

@@ -94,7 +94,7 @@ async def test_database_async(request, vector_store: str):
)

results = await database.aadd_chunks(chunks=[chunk_0, chunk_1])
assert len(results) == 2
assert len(results) == 2 # noqa: PLR2004
assert results[0] == (doc_id, 0)
assert results[1] == (doc_id, 1)

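In the test suites the commit takes the other route for PLR2004: small literal counts stay in the assertions and the rule is silenced per line with a targeted noqa. A minimal sketch of the pattern, with a stand-in list in place of the real add_chunks result:

# Stand-in for the (doc_id, chunk_index) pairs returned by add_chunks().
results = [("doc-1", 0), ("doc-1", 1)]

# The literal 2 is self-explanatory here, so the magic-value rule is
# suppressed for this line only instead of introducing a constant.
assert len(results) == 2  # noqa: PLR2004
assert results[0] == ("doc-1", 0)
assert results[1] == ("doc-1", 1)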
@@ -73,10 +73,11 @@ def chunk_texts(text, chunk_size, overlap_size):

retriever = store.as_retriever()

k = 5
chunk_scores = retriever.text_search(
query_text="what kind fish lives shallow coral reefs", k=5
query_text="what kind fish lives shallow coral reefs", k=k
)
assert len(chunk_scores) == 5
assert len(chunk_scores) == k
for chunk, score in chunk_scores:
logging.info("got chunk_id %s with score %s", chunk.chunk_id, score)

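This retrieval test avoids the magic value differently: the expected count is bound to a variable that is passed to the call and reused in the assertion. A self-contained sketch of that shape, with a hypothetical text_search function standing in for the retriever:

def text_search(query_text: str, k: int) -> list[tuple[str, float]]:
    """Hypothetical stand-in that returns k (chunk_id, score) pairs."""
    return [(f"chunk-{i}", 1.0 - i / k) for i in range(k)]


k = 5
chunk_scores = text_search(query_text="what kind fish lives shallow coral reefs", k=k)

# Asserting against k means the number 5 appears only once, as an input.
assert len(chunk_scores) == k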
31 changes: 18 additions & 13 deletions libs/colbert/tests/unit_tests/test_colbert_baseline_embeddings.py
@@ -6,7 +6,7 @@
from colbert.infra.config import ColBERTConfig
from colbert.modeling.checkpoint import Checkpoint
from ragstack_colbert import ColbertEmbeddingModel, Embedding
from ragstack_colbert.constant import DEFAULT_COLBERT_MODEL
from ragstack_colbert.constant import DEFAULT_COLBERT_DIM, DEFAULT_COLBERT_MODEL
from torch import Tensor
from torch.nn.functional import cosine_similarity

@@ -37,6 +37,10 @@
arctic_botany_chunks = list(arctic_botany_dict.values())


MINIMUM_SIMILARITY = 0.999
MAXIMUM_DISTANCE = 0.001


# a utility function to evaluate similarity of two embeddings at the per-token level
def are_they_similar(embedded_chunks: List[Embedding], tensors: List[Tensor]):
n = 0
@@ -47,12 +51,12 @@ def are_they_similar(embedded_chunks: List[Embedding], tensors: List[Tensor]):
assert vector_tensor.shape == tensors[n].shape

# we still have outliers over the specified limit, but they are almost 0
assert pdist(vector_tensor, tensors[n]).item() < 0.0001
assert pdist(vector_tensor, tensors[n]).item() < MAXIMUM_DISTANCE

similarity = cosine_similarity(
vector_tensor.unsqueeze(0), tensors[n].unsqueeze(0)
)
assert similarity.item() > 0.999
assert similarity.item() > MINIMUM_SIMILARITY
n = n + 1

assert n == len(tensors)
@@ -84,22 +88,22 @@ def test_embeddings_with_baseline():
vector_tensor = torch.tensor(vector)
embedded_tensors.append(vector_tensor)
distance = torch.norm(vector_tensor - baseline_tensors[n])
assert abs(distance) < 0.001
assert abs(distance) < MAXIMUM_DISTANCE
# another way to measure pairwise distance
# it must be positive since it comes from a square root
assert pdist(vector_tensor, baseline_tensors[n]).item() < 0.001
assert pdist(vector_tensor, baseline_tensors[n]).item() < MAXIMUM_DISTANCE

similarity = cosine_similarity(
vector_tensor.unsqueeze(0), baseline_tensors[n].unsqueeze(0)
)
assert similarity.shape == torch.Size([1]) # this has to be scalar
# debug code to identify which token deviates
if similarity.item() < 0.99:
if similarity.item() < MINIMUM_SIMILARITY:
logging.warning("n = %s, similarity = %s", n, similarity.item())
assert similarity.item() > 0.99
assert similarity.item() > MINIMUM_SIMILARITY
n = n + 1

assert len(embedded_tensors) == 645
assert len(embedded_tensors) == 645 # noqa: PLR2004

# test against the same function to make sure it generates the same embeddings.
# use the same ColBERT configurations but reload the checkpoint with the default
@@ -138,22 +142,23 @@ def model_embedding(model: str):
)
embeddings = colbert_svc.embed_texts(arctic_botany_chunks)

assert len(embeddings) == 8
assert len(embeddings) == len(arctic_botany_chunks)
n = 0
for embedding in embeddings:
for vector in embedding:
assert len(vector) == 128
assert len(vector) == DEFAULT_COLBERT_DIM
n = n + 1

assert n == 645
assert n == 645 # noqa: PLR2004

# recall embeddings test
query_maxlen = 32
embedding = colbert_svc.embed_query(
query="What adaptations enable Arctic plants to survive and thrive "
"in extremely cold temperatures and minimal sunlight?",
query_maxlen=32,
query_maxlen=query_maxlen,
)
assert len(embedding) == 32
assert len(embedding) == query_maxlen


def test_compatible_models():
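This module hoists the repeated similarity and distance thresholds (0.999 and 0.001) into module-level constants so every per-token check compares against a single named tolerance. A small sketch of the idea with synthetic tensors rather than real ColBERT embeddings:

import torch
from torch.nn.functional import cosine_similarity

MINIMUM_SIMILARITY = 0.999
MAXIMUM_DISTANCE = 0.001

# Two nearly identical batches of 128-dimensional "token embeddings".
baseline = torch.randn(8, 128)
candidate = baseline + 1e-6 * torch.randn(8, 128)

for expected, actual in zip(baseline, candidate):
    # Euclidean distance and cosine similarity checked against named tolerances.
    assert torch.dist(expected, actual).item() < MAXIMUM_DISTANCE
    similarity = cosine_similarity(expected.unsqueeze(0), actual.unsqueeze(0))
    assert similarity.item() > MINIMUM_SIMILARITY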
9 changes: 5 additions & 4 deletions libs/colbert/tests/unit_tests/test_colbert_embeddings.py
@@ -10,7 +10,7 @@ def test_colbert_token_embeddings():

embeddings = colbert.embed_texts(texts=texts)

assert len(embeddings) == 2
assert len(embeddings) == 2 # noqa: PLR2004
assert len(embeddings[0][0]) == DEFAULT_COLBERT_DIM


@@ -27,7 +27,7 @@ def test_colbert_token_embeddings_with_params():

embeddings = colbert.embed_texts(texts=texts)

assert len(embeddings) == 3
assert len(embeddings) == 3 # noqa: PLR2004

assert len(embeddings[0][0]) == DEFAULT_COLBERT_DIM

@@ -40,5 +40,6 @@ def test_colbert_query_embeddings():
assert query_tensor.shape == (12, 128)

# test query encoding
embedding = colbert.embed_query("test-query", query_maxlen=512)
assert len(embedding) == 512
query_maxlen = 512
embedding = colbert.embed_query("test-query", query_maxlen=query_maxlen)
assert len(embedding) == query_maxlen
4 changes: 2 additions & 2 deletions libs/colbert/tests/unit_tests/test_colbert_retriever.py
@@ -30,7 +30,7 @@ def test_max_similarity_torch():

def test_query_maxlen_calculation():
tokens = [["word1"], ["word2", "word3"]]
assert calculate_query_maxlen(tokens) == 5
assert calculate_query_maxlen(tokens) == 5 # noqa: PLR2004

tokens = [["word1", "word2", "word3"], ["word1", "word2"]]
assert calculate_query_maxlen(tokens) == 6
assert calculate_query_maxlen(tokens) == 6 # noqa: PLR2004
20 changes: 11 additions & 9 deletions libs/e2e-tests/e2e_tests/langchain/test_astra.py
@@ -18,6 +18,8 @@
from e2e_tests.test_utils.astradb_vector_store_handler import AstraDBVectorStoreHandler
from e2e_tests.test_utils.vector_store_handler import VectorStoreImplementation

MINIMUM_ACCEPTABLE_SCORE = 0.1


def test_basic_vector_search(vectorstore: AstraDBVectorStore):
print("Running test_basic_vector_search")
@@ -236,7 +238,7 @@ def test_vector_search_with_metadata(vectorstore: VectorStore):
)

documents = vectorstore.search("RAGStack", "similarity")
assert len(documents) == 2
assert len(documents) == 2 # noqa: PLR2004

documents = vectorstore.search(
"RAGStack", "similarity", filter={"context": "homepage"}
@@ -249,7 +251,7 @@ def test_vector_search_with_metadata(vectorstore: VectorStore):
)

documents = vectorstore.search("RAGStack", "mmr")
assert len(documents) == 2
assert len(documents) == 2 # noqa: PLR2004

documents = vectorstore.search("RAGStack", "mmr", filter={"context": "homepage"})
assert len(documents) == 1
@@ -287,7 +289,7 @@ def test_vector_search_with_metadata(vectorstore: VectorStore):
assert len(documents_with_score) == 1
# the elements are Tuple(document, score)
score = documents_with_score[0][1]
assert score > 0.1
assert score > MINIMUM_ACCEPTABLE_SCORE

verify_document(
documents_with_score[0][0],
@@ -303,7 +305,7 @@ def test_vector_search_with_metadata(vectorstore: VectorStore):
assert len(documents_with_score) == 1
# the elements are Tuple(document, score)
score = documents_with_score[0][1]
assert score > 0.1
assert score > MINIMUM_ACCEPTABLE_SCORE

verify_document(
documents_with_score[0][0],
Expand All @@ -317,7 +319,7 @@ def test_vector_search_with_metadata(vectorstore: VectorStore):
assert len(documents_with_score) == 1
# the elements are Tuple(document, score)
score = documents_with_score[0][1]
assert score > 0.1
assert score > MINIMUM_ACCEPTABLE_SCORE

# test for max_marginal_relevance_search_by_vector

@@ -349,7 +351,7 @@ def test_vector_search_with_metadata(vectorstore: VectorStore):
assert len(documents) == 1

documents = vectorstore.similarity_search_by_vector(embedding=vector, k=2)
assert len(documents) == 2
assert len(documents) == 2 # noqa: PLR2004

documents = vectorstore.similarity_search_by_vector(
embedding=vector, k=1, filter={"context": "none"}
Expand Down Expand Up @@ -382,18 +384,18 @@ def test_vector_search_with_metadata(vectorstore: VectorStore):

retriever = vectorstore.as_retriever()
documents = retriever.get_relevant_documents("RAGStack")
assert len(documents) == 2
assert len(documents) == 2 # noqa: PLR2004

documents = retriever.invoke("RAGStack", RunnableConfig(tags=["custom_retriever"]))
assert len(documents) == 2
assert len(documents) == 2 # noqa: PLR2004

retriever = vectorstore.as_retriever(search_kwargs={"k": 1})
documents = retriever.get_relevant_documents("RAGStack")
assert len(documents) == 1

retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
documents = retriever.get_relevant_documents("RAGStack")
assert len(documents) == 2
assert len(documents) == 2 # noqa: PLR2004

# delete all the documents
vectorstore.delete(document_ids)
10 changes: 6 additions & 4 deletions libs/e2e-tests/e2e_tests/langchain/test_document_loaders.py
@@ -34,7 +34,7 @@ def test_csv_loader():
]
)
docs = CSVLoader(file_path=temp_csv_file.name).load()
assert len(docs) == 3
assert len(docs) == 3 # noqa: PLR2004

doc1 = docs[0]
assert doc1.page_content == "column1: value1\ncolumn2: value2\ncolumn3: value3"
@@ -48,7 +48,7 @@ def test_web_based_loader():
)
loader.requests_per_second = 1
docs = loader.load()
assert len(docs) == 2
assert len(docs) == 2 # noqa: PLR2004

doc1 = docs[0]
assert "0.1.0 - Oct 4, 2023" in doc1.page_content
@@ -150,19 +150,21 @@ def test_astradb_loader() -> None:
[{"foo": "bar2", "baz": "qux"}] * 4 + [{"foo": "bar", "baz": "qux"}] * 4
)

doc_number = 22

loader = AstraDBLoader(
astra_ref.collection,
token=astra_ref.token,
api_endpoint=astra_ref.api_endpoint,
nb_prefetched=1,
projection={"foo": 1},
find_options={"limit": 22},
find_options={"limit": doc_number},
filter_criteria={"foo": "bar"},
page_content_mapper=lambda r: "Payload: " + json.dumps(r),
)
docs = loader.load()

assert len(docs) == 22
assert len(docs) == doc_number
ids = set()
for doc in docs:
assert doc.page_content.startswith("Payload: ")
2 changes: 1 addition & 1 deletion libs/e2e-tests/e2e_tests/llama_index/test_astra.py
@@ -186,7 +186,7 @@ def test_vector_search_with_metadata(environment: Environment):
)

documents = index.as_retriever().retrieve("RAGStack")
assert len(documents) == 2
assert len(documents) == 2 # noqa: PLR2004

# delete all the documents
for doc_id in document_ids:
@@ -65,7 +65,6 @@ def await_ongoing_deletions_completed(self):
"%s deletions still running, waiting to complete", pending_deletions
)
time.sleep(1)
return

def run_delete(self, collection: str):
"""Runs a delete_collection in the background, blocking if max_workers are
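The only change in this handler is dropping the bare return at the end of the polling loop, which is what the useless-return rule (ruff PLR1711) flags: a function that falls off the end already returns None. A minimal sketch of the shape, with a hypothetical pending_deletions callable in place of the handler's internals:

import logging
import time
from typing import Callable


def await_ongoing_deletions_completed(pending_deletions: Callable[[], int]) -> None:
    """Hypothetical sketch: poll a deletion counter until it reaches zero."""
    while (remaining := pending_deletions()) > 0:
        logging.info("%s deletions still running, waiting to complete", remaining)
        time.sleep(1)
    # No trailing bare "return": falling off the end already returns None.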
3 changes: 2 additions & 1 deletion libs/knowledge-store/notebooks/astra_support.ipynb
@@ -278,6 +278,7 @@
"source": [
"not_found = 0\n",
"found = 0\n",
"BATCH_SIZE = 50\n",
"\n",
"docs = []\n",
"async for doc in load_pages(URLS):\n",
@@ -288,7 +289,7 @@
" docs.append(doc)\n",
" found += 1\n",
"\n",
" if len(docs) >= 50:\n",
" if len(docs) >= BATCH_SIZE:\n",
" graph_store.add_documents(docs)\n",
" docs.clear()\n",
"\n",
4 changes: 2 additions & 2 deletions libs/knowledge-store/ragstack_knowledge_store/graph_store.py
@@ -330,11 +330,11 @@ def add_nodes(
link_from_tags = set() # link from these tags

for tag in links:
if tag.direction == "in" or tag.direction == "bidir":
if tag.direction in {"in", "bidir"}:
# An incoming link should be linked *from* nodes with the given
# tag.
link_from_tags.add((tag.kind, tag.tag))
if tag.direction == "out" or tag.direction == "bidir":
if tag.direction in {"out", "bidir"}:
link_to_tags.add((tag.kind, tag.tag))

metadata_blob = _serialize_metadata(metadata)
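In graph_store.py the chained equality checks on the link direction become set-membership tests, the rewrite suggested by ruff's repeated-equality-comparison rule (PLR1714). A minimal sketch with a hypothetical Link dataclass standing in for the real link objects:

from dataclasses import dataclass


@dataclass
class Link:
    """Hypothetical stand-in for the real link objects."""
    kind: str
    tag: str
    direction: str  # "in", "out", or "bidir"


def split_links(links: list[Link]) -> tuple[set[tuple[str, str]], set[tuple[str, str]]]:
    link_from_tags: set[tuple[str, str]] = set()  # link from these tags
    link_to_tags: set[tuple[str, str]] = set()    # link to these tags
    for link in links:
        # Membership tests replace chains like `x == "in" or x == "bidir"`.
        if link.direction in {"in", "bidir"}:
            link_from_tags.add((link.kind, link.tag))
        if link.direction in {"out", "bidir"}:
            link_to_tags.add((link.kind, link.tag))
    return link_from_tags, link_to_tags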
4 changes: 2 additions & 2 deletions libs/langchain/tests/integration_tests/test_colbert.py
@@ -112,7 +112,7 @@ def test_sync_from_docs(request, vector_store: str):
"What are Xenospheric Particulates?"
)

assert len(results) > 3
assert len(results) > 3 # noqa: PLR2004
assert results[1][1] > 0 # check score from result 2
assert results[2][1] > 0 # check score from result 3
assert results[1][1] > results[2][1] # check that scores are returned in order
@@ -171,7 +171,7 @@ async def test_async_from_docs(request, vector_store: str):
"What are Xenospheric Particulates?"
)

assert len(results) > 3
assert len(results) > 3 # noqa: PLR2004
assert results[1][1] > 0 # check score from result 2
assert results[2][1] > 0 # check score from result 3
assert results[1][1] > results[2][1] # check that scores are returned in order