Skip to content

Commit

Permalink
Add ruff rules for docstrings (#576)
Browse files Browse the repository at this point in the history
  • Loading branch information
cbornet authored Jul 4, 2024
1 parent f491241 commit a624749
Show file tree
Hide file tree
Showing 59 changed files with 434 additions and 374 deletions.
3 changes: 2 additions & 1 deletion libs/colbert/ragstack_colbert/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""
"""Ragstack Colbert: A ColBERT-based text retrieval system.
This package provides a suite of tools for encoding and retrieving text using the
ColBERT model, integrated with a Cassandra database for scalable storage and retrieval
operations. It includes classes for token embeddings, managing the vector store, and
Expand Down
43 changes: 18 additions & 25 deletions libs/colbert/ragstack_colbert/base_database.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
This module defines abstract base classes for implementing storage mechanisms for text
chunk embeddings, specifically designed to work with ColBERT or similar embedding
"""Base Database module.
This module defines abstract base classes for implementing storage mechanisms for
text chunk embeddings, specifically designed to work with ColBERT or similar embedding
models.
"""

Expand All @@ -11,7 +12,8 @@


class BaseDatabase(ABC):
"""
"""Base Database abstract class for ColBERT.
Abstract base class (ABC) for a storage system designed to hold vector
representations of text chunks, typically generated by a ColBERT model or similar
embedding model.
Expand All @@ -23,10 +25,9 @@ class BaseDatabase(ABC):

@abstractmethod
def add_chunks(self, chunks: List[Chunk]) -> List[Tuple[str, int]]:
"""
Stores a list of embedded text chunks in the vector store
"""Stores a list of embedded text chunks in the vector store.
Parameters:
Args:
chunks (List[Chunk]): A list of `Chunk` instances to be stored.
Returns:
Expand All @@ -35,10 +36,9 @@ def add_chunks(self, chunks: List[Chunk]) -> List[Tuple[str, int]]:

@abstractmethod
def delete_chunks(self, doc_ids: List[str]) -> bool:
"""
Deletes chunks from the vector store based on their document id.
"""Deletes chunks from the vector store based on their document id.
Parameters:
Args:
doc_ids (List[str]): A list of document identifiers specifying the chunks
to be deleted.
Expand All @@ -50,10 +50,9 @@ def delete_chunks(self, doc_ids: List[str]) -> bool:
async def aadd_chunks(
self, chunks: List[Chunk], concurrent_inserts: Optional[int] = 100
) -> List[Tuple[str, int]]:
"""
Stores a list of embedded text chunks in the vector store
"""Stores a list of embedded text chunks in the vector store.
Parameters:
Args:
chunks (List[Chunk]): A list of `Chunk` instances to be stored.
concurrent_inserts (Optional[int]): How many concurrent inserts to make to
the database. Defaults to 100.
Expand All @@ -66,10 +65,9 @@ async def aadd_chunks(
async def adelete_chunks(
self, doc_ids: List[str], concurrent_deletes: Optional[int] = 100
) -> bool:
"""
Deletes chunks from the vector store based on their document id.
"""Deletes chunks from the vector store based on their document id.
Parameters:
Args:
doc_ids (List[str]): A list of document identifiers specifying the chunks
to be deleted.
concurrent_deletes (Optional[int]): How many concurrent deletes to make
Expand All @@ -81,8 +79,7 @@ async def adelete_chunks(

@abstractmethod
async def search_relevant_chunks(self, vector: Vector, n: int) -> List[Chunk]:
"""
Retrieves 'n' ANN results for an embedded token vector.
"""Retrieves 'n' ANN results for an embedded token vector.
Returns:
A list of Chunks with only `doc_id` and `chunk_id` set.
Expand All @@ -91,8 +88,7 @@ async def search_relevant_chunks(self, vector: Vector, n: int) -> List[Chunk]:

@abstractmethod
async def get_chunk_embedding(self, doc_id: str, chunk_id: int) -> Chunk:
"""
Retrieve the embedding data for a chunk.
"""Retrieve the embedding data for a chunk.
Returns:
A chunk with `doc_id`, `chunk_id`, and `embedding` set.
Expand All @@ -102,8 +98,7 @@ async def get_chunk_embedding(self, doc_id: str, chunk_id: int) -> Chunk:
async def get_chunk_data(
self, doc_id: str, chunk_id: int, include_embedding: Optional[bool]
) -> Chunk:
"""
Retrieve the text and metadata for a chunk.
"""Retrieve the text and metadata for a chunk.
Returns:
A chunk with `doc_id`, `chunk_id`, `text`, `metadata`, and optionally
Expand All @@ -112,6 +107,4 @@ async def get_chunk_data(

@abstractmethod
def close(self) -> None:
"""
Cleans up any open resources.
"""
"""Cleans up any open resources."""
21 changes: 9 additions & 12 deletions libs/colbert/ragstack_colbert/base_embedding_model.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
This module defines an abstract base class (ABC) for generating token-based embeddings
for text.
"""Base embedding for ColBERT.
This module defines an abstract base class (ABC) for generating token-based
embeddings for text.
"""

from abc import ABC, abstractmethod
Expand All @@ -10,8 +11,7 @@


class BaseEmbeddingModel(ABC):
"""
Abstract base class (ABC) for token-based embedding models.
"""Abstract base class (ABC) for token-based embedding models.
This class defines the interface for models that generate embeddings for text
chunks and queries.
Expand All @@ -22,11 +22,9 @@ class BaseEmbeddingModel(ABC):

@abstractmethod
def embed_texts(self, texts: List[str]) -> List[Embedding]:
"""
Embeds a list of texts into their corresponding vector embedding
representations.
"""Embeds a list of texts into their vector embedding representations.
Parameters:
Args:
texts (List[str]): A list of string texts.
Returns:
Expand All @@ -40,13 +38,12 @@ def embed_query(
full_length_search: Optional[bool] = False,
query_maxlen: int = -1,
) -> Embedding:
"""
Embeds a single query text into its vector representation.
"""Embeds a single query text into its vector representation.
If the query has fewer than query_maxlen tokens it will be padded with BERT
special [mask] tokens.
Parameters:
Args:
query (str): The query text to encode.
full_length_search (Optional[bool]): Indicates whether to encode the
query for a full-length search. Defaults to False.
Expand Down
30 changes: 18 additions & 12 deletions libs/colbert/ragstack_colbert/base_retriever.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""
"""Base retriever module.
This module defines abstract base classes for implementing retrieval mechanisms for
text chunk embeddings, specifically designed to work with ColBERT or similar embedding
models.
Expand All @@ -11,9 +12,10 @@


class BaseRetriever(ABC):
"""
Abstract base class (ABC) for a retrieval system that operates on a ColBERT vector
store, facilitating the search and retrieval of text chunks based on query
"""Base Retriever abstract class for ColBERT.
Abstract base class (ABC) for a retrieval system that operates on a ColBERT
vector store, facilitating the search and retrieval of text chunks based on query
embeddings.
"""

Expand All @@ -26,11 +28,12 @@ def embedding_search(
include_embedding: Optional[bool] = False,
**kwargs: Any,
) -> List[Tuple[Chunk, float]]:
"""
"""Search for relevant text chunks based on a query embedding.
Retrieves a list of text chunks relevant to a given query from the vector
store, ranked by relevance or other metrics.
Parameters:
Args:
query_embedding (Embedding): The query embedding to search for relevant
text chunks.
k (Optional[int]): The number of top results to retrieve.
Expand All @@ -54,11 +57,12 @@ async def aembedding_search(
include_embedding: Optional[bool] = False,
**kwargs: Any,
) -> List[Tuple[Chunk, float]]:
"""
"""Search for relevant text chunks based on a query embedding.
Retrieves a list of text chunks relevant to a given query from the vector
store, ranked by relevance or other metrics.
Parameters:
Args:
query_embedding (Embedding): The query embedding to search for relevant
text chunks.
k (Optional[int]): The number of top results to retrieve.
Expand All @@ -83,11 +87,12 @@ def text_search(
include_embedding: Optional[bool] = False,
**kwargs: Any,
) -> List[Tuple[Chunk, float]]:
"""
"""Search for relevant text chunks based on a query text.
Retrieves a list of text chunks relevant to a given query from the vector
store, ranked by relevance or other metrics.
Parameters:
Args:
query_text (str): The query text to search for relevant text chunks.
k (Optional[int]): The number of top results to retrieve.
query_maxlen (Optional[int]): The maximum length of the query to consider.
Expand All @@ -113,11 +118,12 @@ async def atext_search(
include_embedding: Optional[bool] = False,
**kwargs: Any,
) -> List[Tuple[Chunk, float]]:
"""
"""Search for relevant text chunks based on a query text.
Retrieves a list of text chunks relevant to a given query from the vector
store, ranked by relevance or other metrics.
Parameters:
Args:
query_text (str): The query text to search for relevant text chunks.
k (Optional[int]): The number of top results to retrieve.
query_maxlen (Optional[int]): The maximum length of the query to consider.
Expand Down
40 changes: 19 additions & 21 deletions libs/colbert/ragstack_colbert/base_vector_store.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""
"""Base Vector Store module for ColBERT.
This module defines the abstract base class for a standard vector store
specifically designed to work with ColBERT or similar dense embedding models,
and can be used to create a LangChain or LlamaIndex ColBERT vector store.
Expand All @@ -23,7 +24,8 @@


class BaseVectorStore(ABC):
"""
"""Base Vector Store abstract class for ColBERT.
Abstract base class (ABC) for a storage system designed to hold vector
representations of text chunks, typically generated by a ColBERT model or similar
embedding model.
Expand All @@ -36,10 +38,9 @@ class BaseVectorStore(ABC):
# handles LlamaIndex add
@abstractmethod
def add_chunks(self, chunks: List[Chunk]) -> List[Tuple[str, int]]:
"""
Stores a list of embedded text chunks in the vector store
"""Stores a list of embedded text chunks in the vector store.
Parameters:
Args:
chunks (List[Chunk]): A list of `Chunk` instances to be stored.
Returns:
Expand All @@ -54,11 +55,12 @@ def add_texts(
metadatas: Optional[List[Metadata]],
doc_id: Optional[str] = None,
) -> List[Tuple[str, int]]:
"""
"""Adds text chunks to the vector store.
Embeds and stores a list of text chunks and optional metadata into the vector
store.
Parameters:
Args:
texts (List[str]): The list of text chunks to be embedded
metadatas (Optional[List[Metadata]]): An optional list of Metadata to be
stored. If provided, these are set 1 to 1 with the texts list.
Expand All @@ -72,10 +74,9 @@ def add_texts(
# handles LangChain and LlamaIndex delete
@abstractmethod
def delete_chunks(self, doc_ids: List[str]) -> bool:
"""
Deletes chunks from the vector store based on their document id.
"""Deletes chunks from the vector store based on their document id.
Parameters:
Args:
doc_ids (List[str]): A list of document identifiers specifying the chunks
to be deleted.
Expand All @@ -88,10 +89,9 @@ def delete_chunks(self, doc_ids: List[str]) -> bool:
async def aadd_chunks(
self, chunks: List[Chunk], concurrent_inserts: Optional[int] = 100
) -> List[Tuple[str, int]]:
"""
Stores a list of embedded text chunks in the vector store
"""Stores a list of embedded text chunks in the vector store.
Parameters:
Args:
chunks (List[Chunk]): A list of `Chunk` instances to be stored.
concurrent_inserts (Optional[int]): How many concurrent inserts to make to
the database. Defaults to 100.
Expand All @@ -109,11 +109,12 @@ async def aadd_texts(
doc_id: Optional[str] = None,
concurrent_inserts: Optional[int] = 100,
) -> List[Tuple[str, int]]:
"""
"""Adds text chunks to the vector store.
Embeds and stores a list of text chunks and optional metadata into the vector
store.
Parameters:
Args:
texts (List[str]): The list of text chunks to be embedded
metadatas (Optional[List[Metadata]]): An optional list of Metadata to be
stored. If provided, these are set 1 to 1 with the texts list.
Expand All @@ -131,10 +132,9 @@ async def aadd_texts(
async def adelete_chunks(
self, doc_ids: List[str], concurrent_deletes: Optional[int] = 100
) -> bool:
"""
Deletes chunks from the vector store based on their document id.
"""Deletes chunks from the vector store based on their document id.
Parameters:
Args:
doc_ids (List[str]): A list of document identifiers specifying the chunks
to be deleted.
concurrent_deletes (Optional[int]): How many concurrent deletes to make to
Expand All @@ -147,6 +147,4 @@ async def adelete_chunks(
# handles LangChain as_retriever
@abstractmethod
def as_retriever(self) -> BaseRetriever:
"""
Gets a retriever using the vector store.
"""
"""Gets a retriever using the vector store."""
Loading

0 comments on commit a624749

Please sign in to comment.