Skip to content

Commit

Permalink
Add ruff rules for docstrings (#576)
Browse files Browse the repository at this point in the history
  • Loading branch information
cbornet authored Jul 4, 2024
1 parent f491241 commit a624749
Show file tree
Hide file tree
Showing 59 changed files with 434 additions and 374 deletions.
3 changes: 2 additions & 1 deletion libs/colbert/ragstack_colbert/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""
"""Ragstack Colbert: A ColBERT-based text retrieval system.
This package provides a suite of tools for encoding and retrieving text using the
ColBERT model, integrated with a Cassandra database for scalable storage and retrieval
operations. It includes classes for token embeddings, managing the vector store, and
Expand Down
43 changes: 18 additions & 25 deletions libs/colbert/ragstack_colbert/base_database.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
This module defines abstract base classes for implementing storage mechanisms for text
chunk embeddings, specifically designed to work with ColBERT or similar embedding
"""Base Database module.
This module defines abstract base classes for implementing storage mechanisms for
text chunk embeddings, specifically designed to work with ColBERT or similar embedding
models.
"""

Expand All @@ -11,7 +12,8 @@


class BaseDatabase(ABC):
"""
"""Base Database abstract class for ColBERT.
Abstract base class (ABC) for a storage system designed to hold vector
representations of text chunks, typically generated by a ColBERT model or similar
embedding model.
Expand All @@ -23,10 +25,9 @@ class BaseDatabase(ABC):

@abstractmethod
def add_chunks(self, chunks: List[Chunk]) -> List[Tuple[str, int]]:
"""
Stores a list of embedded text chunks in the vector store
"""Stores a list of embedded text chunks in the vector store.
Parameters:
Args:
chunks (List[Chunk]): A list of `Chunk` instances to be stored.
Returns:
Expand All @@ -35,10 +36,9 @@ def add_chunks(self, chunks: List[Chunk]) -> List[Tuple[str, int]]:

@abstractmethod
def delete_chunks(self, doc_ids: List[str]) -> bool:
"""
Deletes chunks from the vector store based on their document id.
"""Deletes chunks from the vector store based on their document id.
Parameters:
Args:
doc_ids (List[str]): A list of document identifiers specifying the chunks
to be deleted.
Expand All @@ -50,10 +50,9 @@ def delete_chunks(self, doc_ids: List[str]) -> bool:
async def aadd_chunks(
self, chunks: List[Chunk], concurrent_inserts: Optional[int] = 100
) -> List[Tuple[str, int]]:
"""
Stores a list of embedded text chunks in the vector store
"""Stores a list of embedded text chunks in the vector store.
Parameters:
Args:
chunks (List[Chunk]): A list of `Chunk` instances to be stored.
concurrent_inserts (Optional[int]): How many concurrent inserts to make to
the database. Defaults to 100.
Expand All @@ -66,10 +65,9 @@ async def aadd_chunks(
async def adelete_chunks(
self, doc_ids: List[str], concurrent_deletes: Optional[int] = 100
) -> bool:
"""
Deletes chunks from the vector store based on their document id.
"""Deletes chunks from the vector store based on their document id.
Parameters:
Args:
doc_ids (List[str]): A list of document identifiers specifying the chunks
to be deleted.
concurrent_deletes (Optional[int]): How many concurrent deletes to make
Expand All @@ -81,8 +79,7 @@ async def adelete_chunks(

@abstractmethod
async def search_relevant_chunks(self, vector: Vector, n: int) -> List[Chunk]:
"""
Retrieves 'n' ANN results for an embedded token vector.
"""Retrieves 'n' ANN results for an embedded token vector.
Returns:
A list of Chunks with only `doc_id` and `chunk_id` set.
Expand All @@ -91,8 +88,7 @@ async def search_relevant_chunks(self, vector: Vector, n: int) -> List[Chunk]:

@abstractmethod
async def get_chunk_embedding(self, doc_id: str, chunk_id: int) -> Chunk:
"""
Retrieve the embedding data for a chunk.
"""Retrieve the embedding data for a chunk.
Returns:
A chunk with `doc_id`, `chunk_id`, and `embedding` set.
Expand All @@ -102,8 +98,7 @@ async def get_chunk_embedding(self, doc_id: str, chunk_id: int) -> Chunk:
async def get_chunk_data(
self, doc_id: str, chunk_id: int, include_embedding: Optional[bool]
) -> Chunk:
"""
Retrieve the text and metadata for a chunk.
"""Retrieve the text and metadata for a chunk.
Returns:
A chunk with `doc_id`, `chunk_id`, `text`, `metadata`, and optionally
Expand All @@ -112,6 +107,4 @@ async def get_chunk_data(

@abstractmethod
def close(self) -> None:
"""
Cleans up any open resources.
"""
"""Cleans up any open resources."""
21 changes: 9 additions & 12 deletions libs/colbert/ragstack_colbert/base_embedding_model.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
This module defines an abstract base class (ABC) for generating token-based embeddings
for text.
"""Base embedding for ColBERT.
This module defines an abstract base class (ABC) for generating token-based
embeddings for text.
"""

from abc import ABC, abstractmethod
Expand All @@ -10,8 +11,7 @@


class BaseEmbeddingModel(ABC):
"""
Abstract base class (ABC) for token-based embedding models.
"""Abstract base class (ABC) for token-based embedding models.
This class defines the interface for models that generate embeddings for text
chunks and queries.
Expand All @@ -22,11 +22,9 @@ class BaseEmbeddingModel(ABC):

@abstractmethod
def embed_texts(self, texts: List[str]) -> List[Embedding]:
"""
Embeds a list of texts into their corresponding vector embedding
representations.
"""Embeds a list of texts into their vector embedding representations.
Parameters:
Args:
texts (List[str]): A list of string texts.
Returns:
Expand All @@ -40,13 +38,12 @@ def embed_query(
full_length_search: Optional[bool] = False,
query_maxlen: int = -1,
) -> Embedding:
"""
Embeds a single query text into its vector representation.
"""Embeds a single query text into its vector representation.
If the query has fewer than query_maxlen tokens it will be padded with BERT
special [mask] tokens.
Parameters:
Args:
query (str): The query text to encode.
full_length_search (Optional[bool]): Indicates whether to encode the
query for a full-length search. Defaults to False.
Expand Down
30 changes: 18 additions & 12 deletions libs/colbert/ragstack_colbert/base_retriever.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""
"""Base retriever module.
This module defines abstract base classes for implementing retrieval mechanisms for
text chunk embeddings, specifically designed to work with ColBERT or similar embedding
models.
Expand All @@ -11,9 +12,10 @@


class BaseRetriever(ABC):
"""
Abstract base class (ABC) for a retrieval system that operates on a ColBERT vector
store, facilitating the search and retrieval of text chunks based on query
"""Base Retriever abstract class for ColBERT.
Abstract base class (ABC) for a retrieval system that operates on a ColBERT
vector store, facilitating the search and retrieval of text chunks based on query
embeddings.
"""

Expand All @@ -26,11 +28,12 @@ def embedding_search(
include_embedding: Optional[bool] = False,
**kwargs: Any,
) -> List[Tuple[Chunk, float]]:
"""
"""Search for relevant text chunks based on a query embedding.
Retrieves a list of text chunks relevant to a given query from the vector
store, ranked by relevance or other metrics.
Parameters:
Args:
query_embedding (Embedding): The query embedding to search for relevant
text chunks.
k (Optional[int]): The number of top results to retrieve.
Expand All @@ -54,11 +57,12 @@ async def aembedding_search(
include_embedding: Optional[bool] = False,
**kwargs: Any,
) -> List[Tuple[Chunk, float]]:
"""
"""Search for relevant text chunks based on a query embedding.
Retrieves a list of text chunks relevant to a given query from the vector
store, ranked by relevance or other metrics.
Parameters:
Args:
query_embedding (Embedding): The query embedding to search for relevant
text chunks.
k (Optional[int]): The number of top results to retrieve.
Expand All @@ -83,11 +87,12 @@ def text_search(
include_embedding: Optional[bool] = False,
**kwargs: Any,
) -> List[Tuple[Chunk, float]]:
"""
"""Search for relevant text chunks based on a query text.
Retrieves a list of text chunks relevant to a given query from the vector
store, ranked by relevance or other metrics.
Parameters:
Args:
query_text (str): The query text to search for relevant text chunks.
k (Optional[int]): The number of top results to retrieve.
query_maxlen (Optional[int]): The maximum length of the query to consider.
Expand All @@ -113,11 +118,12 @@ async def atext_search(
include_embedding: Optional[bool] = False,
**kwargs: Any,
) -> List[Tuple[Chunk, float]]:
"""
"""Search for relevant text chunks based on a query text.
Retrieves a list of text chunks relevant to a given query from the vector
store, ranked by relevance or other metrics.
Parameters:
Args:
query_text (str): The query text to search for relevant text chunks.
k (Optional[int]): The number of top results to retrieve.
query_maxlen (Optional[int]): The maximum length of the query to consider.
Expand Down
40 changes: 19 additions & 21 deletions libs/colbert/ragstack_colbert/base_vector_store.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""
"""Base Vector Store module for ColBERT.
This module defines the abstract base class for a standard vector store
specifically designed to work with ColBERT or similar dense embedding models,
and can be used to create a LangChain or LlamaIndex ColBERT vector store.
Expand All @@ -23,7 +24,8 @@


class BaseVectorStore(ABC):
"""
"""Base Vector Store abstract class for ColBERT.
Abstract base class (ABC) for a storage system designed to hold vector
representations of text chunks, typically generated by a ColBERT model or similar
embedding model.
Expand All @@ -36,10 +38,9 @@ class BaseVectorStore(ABC):
# handles LlamaIndex add
@abstractmethod
def add_chunks(self, chunks: List[Chunk]) -> List[Tuple[str, int]]:
"""
Stores a list of embedded text chunks in the vector store
"""Stores a list of embedded text chunks in the vector store.
Parameters:
Args:
chunks (List[Chunk]): A list of `Chunk` instances to be stored.
Returns:
Expand All @@ -54,11 +55,12 @@ def add_texts(
metadatas: Optional[List[Metadata]],
doc_id: Optional[str] = None,
) -> List[Tuple[str, int]]:
"""
"""Adds text chunks to the vector store.
Embeds and stores a list of text chunks and optional metadata into the vector
store.
Parameters:
Args:
texts (List[str]): The list of text chunks to be embedded
metadatas (Optional[List[Metadata]]): An optional list of Metadata to be
stored. If provided, these are set 1 to 1 with the texts list.
Expand All @@ -72,10 +74,9 @@ def add_texts(
# handles LangChain and LlamaIndex delete
@abstractmethod
def delete_chunks(self, doc_ids: List[str]) -> bool:
"""
Deletes chunks from the vector store based on their document id.
"""Deletes chunks from the vector store based on their document id.
Parameters:
Args:
doc_ids (List[str]): A list of document identifiers specifying the chunks
to be deleted.
Expand All @@ -88,10 +89,9 @@ def delete_chunks(self, doc_ids: List[str]) -> bool:
async def aadd_chunks(
self, chunks: List[Chunk], concurrent_inserts: Optional[int] = 100
) -> List[Tuple[str, int]]:
"""
Stores a list of embedded text chunks in the vector store
"""Stores a list of embedded text chunks in the vector store.
Parameters:
Args:
chunks (List[Chunk]): A list of `Chunk` instances to be stored.
concurrent_inserts (Optional[int]): How many concurrent inserts to make to
the database. Defaults to 100.
Expand All @@ -109,11 +109,12 @@ async def aadd_texts(
doc_id: Optional[str] = None,
concurrent_inserts: Optional[int] = 100,
) -> List[Tuple[str, int]]:
"""
"""Adds text chunks to the vector store.
Embeds and stores a list of text chunks and optional metadata into the vector
store.
Parameters:
Args:
texts (List[str]): The list of text chunks to be embedded
metadatas (Optional[List[Metadata]]): An optional list of Metadata to be
stored. If provided, these are set 1 to 1 with the texts list.
Expand All @@ -131,10 +132,9 @@ async def aadd_texts(
async def adelete_chunks(
self, doc_ids: List[str], concurrent_deletes: Optional[int] = 100
) -> bool:
"""
Deletes chunks from the vector store based on their document id.
"""Deletes chunks from the vector store based on their document id.
Parameters:
Args:
doc_ids (List[str]): A list of document identifiers specifying the chunks
to be deleted.
concurrent_deletes (Optional[int]): How many concurrent deletes to make to
Expand All @@ -147,6 +147,4 @@ async def adelete_chunks(
# handles LangChain as_retriever
@abstractmethod
def as_retriever(self) -> BaseRetriever:
"""
Gets a retriever using the vector store.
"""
"""Gets a retriever using the vector store."""
Loading

0 comments on commit a624749

Please sign in to comment.