Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(anthropic_engine): add support for attachments through URLs, various fixes #570

Merged
merged 11 commits into from
Mar 3, 2025
Merged
2 changes: 2 additions & 0 deletions basilisk/conversation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
AttachmentFileTypes,
ImageFile,
NotImageError,
build_from_url,
get_mime_type,
parse_supported_attachment_formats,
)
Expand All @@ -21,6 +22,7 @@
__all__ = [
"AttachmentFile",
"AttachmentFileTypes",
"build_from_url",
"Conversation",
"get_mime_type",
"ImageFile",
Expand Down
147 changes: 73 additions & 74 deletions basilisk/conversation/attached_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,7 @@

log = logging.getLogger(__name__)

URL_PATTERN = re.compile(
r'(https?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|data:image/\S+)',
re.IGNORECASE,
)
URL_PATTERN = re.compile(r'https?://[^\s<>"]+|data:\S+', re.IGNORECASE)


def get_image_dimensions(reader: BufferedReader) -> tuple[int, int]:
Expand Down Expand Up @@ -130,6 +127,50 @@ def get_mime_type(path: str) -> str | None:
return mimetypes.guess_type(path)[0]


@measure_time
def build_from_url(url: str) -> AttachmentFile:
    """Fetch a file from a URL and describe it as an attachment.

    The URL is requested with redirects followed. The response's
    ``content-type`` header decides the result: an ``image/*`` type yields
    an ImageFile (with dimensions read from the downloaded bytes), anything
    else yields a plain AttachmentFile.

    Args:
        url: The URL of the file to retrieve.

    Returns:
        An ImageFile for image content types, otherwise an AttachmentFile.
        Both reference the URL as their location rather than a local copy.

    Raises:
        httpx.HTTPError: If the request fails or returns an error status.
        NotImageError: If the response carries no ``content-type`` header.

    Example:
        file = build_from_url("https://example.com/file.pdf")
        image = build_from_url("https://example.com/image.jpg")
    """
    r = httpx.get(url, follow_redirects=True)
    r.raise_for_status()
    # Content-Length is optional and not guaranteed to be numeric. Only pass
    # a real integer on; otherwise use None so a raw header string never
    # ends up in the ``size`` field.
    content_length = r.headers.get("Content-Length")
    size = (
        int(content_length)
        if content_length
        and content_length.isascii()
        and content_length.isdigit()
        else None
    )
    mime_type = r.headers.get("content-type", None)
    if not mime_type:
        raise NotImageError("No MIME type found")
    if mime_type.startswith("image/"):
        dimensions = get_image_dimensions(BytesIO(r.content))
        return ImageFile(
            location=url,
            type=AttachmentFileTypes.URL,
            size=size,
            mime_type=mime_type,
            dimensions=dimensions,
        )
    return AttachmentFile(
        location=url,
        type=AttachmentFileTypes.URL,
        size=size,
        mime_type=mime_type,
    )

Comment on lines +130 to +172
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick (assertive)

Implement security and performance considerations in build_from_url.

Currently, the function fetches the file directly. Consider:
• Adding a maximum size check for large files.
• Handling domain whitelisting or advanced timeouts.
• Logging or raising a more detailed exception when the MIME type is missing, rather than NotImageError.

These measures can reduce risk from untrusted or slow endpoints.


class AttachmentFileTypes(enum.StrEnum):
"""Enumeration of file types based on their source location."""

Expand Down Expand Up @@ -181,6 +222,7 @@ class AttachmentFile(BaseModel):
name: str | None = None
description: str | None = None
size: int | None = None
mime_type: str | None = None

@field_serializer("location", mode="wrap")
@classmethod
Expand Down Expand Up @@ -242,19 +284,19 @@ def validate_location(
raise ValueError("Invalid location")
return value

def __init__(self, /, **data: Any) -> None:
def __init__(self, /, **kwargs: Any) -> None:
"""Initialize an AttachmentFile instance with optional data.

If no name is provided, automatically generates a name using the internal _get_name() method.
If no size is set, retrieves the file size using _get_size() method.

Args:
data: Keyword arguments for initializing the AttachmentFile instance. Can include optional attributes like name and size.
kwargs: Keyword arguments for initializing the AttachmentFile instance. Can include optional attributes like name, size, and description.
"""
super().__init__(**data)
if not self.name:
self.name = self._get_name()
self.size = self._get_size()
super().__init__(**kwargs)
self.name = self.name or self._get_name()
self.mime_type = kwargs.get("mime_type") or self._get_mime_type()
self.size = kwargs.get("size") or self._get_size()

__init__.__pydantic_base_init__ = True

Expand Down Expand Up @@ -326,8 +368,7 @@ def send_location(self) -> UPath:
"""
return getattr(self, "resize_location", None) or self.location

@property
def mime_type(self) -> str | None:
def _get_mime_type(self) -> str | None:
"""Get the MIME type of the file.

Returns:
Expand Down Expand Up @@ -363,11 +404,11 @@ def remove_location(location: UPath):
except Exception as e:
log.error(f"Error deleting file at {location}: {e}")

def read_as_str(self) -> str:
"""Read the file as a string.
def read_as_plain_text(self) -> str:
"""Read the file as a plain text string.

Returns:
The contents of the file as a string.
The contents of the file as a plain text string.
"""
with self.send_location.open(mode="r") as file:
return file.read()
Expand Down Expand Up @@ -396,66 +437,37 @@ def get_display_info(self) -> tuple[str, str, str]:
"""
return self.name, self.display_size, self.display_location

@property
def url(self) -> str:
    """Get a URL that references this file.

    Returns:
        The location as a string when the attachment type is URL;
        otherwise a base64 ``data:`` URI built from the file's contents.
    """
    if self.type == AttachmentFileTypes.URL:
        return str(self.location)
    # mime_type is optional; without a fallback the URI would read
    # "data:None;base64,...", which is not a valid media type.
    mime = self.mime_type or "application/octet-stream"
    return f"data:{mime};base64,{self.encode_base64()}"

Comment on lines +441 to +451
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick (assertive)

**Provide fallback MIME type for data URIs**

When mime_type is None, you'll form a data URI with data:None;base64,.... Consider defaulting to "application/octet-stream":

 if self.type == AttachmentFileTypes.URL:
   return str(self.location)
-base64_data = self.encode_base64()
-return f"data:{self.mime_type};base64,{base64_data}"
+mime = self.mime_type or "application/octet-stream"
+base64_data = self.encode_base64()
+return f"data:{mime};base64,{base64_data}"
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def url(self) -> str:
"""Get the URL of the file.
Returns:
The URL of the file, or the base64-encoded data if the file is in memory.
"""
if self.type == AttachmentFileTypes.URL:
return str(self.location)
base64_data = self.encode_base64()
return f"data:{self.mime_type};base64,{base64_data}"
def url(self) -> str:
"""Get the URL of the file.
Returns:
The URL of the file, or the base64-encoded data if the file is in memory.
"""
if self.type == AttachmentFileTypes.URL:
return str(self.location)
mime = self.mime_type or "application/octet-stream"
base64_data = self.encode_base64()
return f"data:{mime};base64,{base64_data}"

Comment on lines +440 to +451
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick (assertive)

Return base64 data for non-URL attachments.

This property is handy for referencing attachments with a unified interface. Ensure potential large attachments are handled gracefully or have a safeguard to avoid memory issues with base64 encoding.


class ImageFile(AttachmentFile):
"""Represents an image file in a conversation."""

dimensions: tuple[int, int] | None = None
resize_location: PydanticUPath | None = Field(default=None, exclude=True)

@classmethod
@measure_time
def build_from_url(cls, url: str) -> ImageFile:
"""Fetch an image from a given URL and create an ImageFile instance.

This class method retrieves an image from the specified URL, validates that it is an image,
and constructs an ImageFile with metadata about the image.

Args:
url: The URL of the image to retrieve.

Returns:
An instance of ImageFile with details about the retrieved image.

Raises:
httpx.HTTPError: If there is an error during the HTTP request.
NotImageError: If the URL does not point to an image (content type is not image/*).

Example:
image = ImageFile.build_from_url("https://example.com/image.jpg")
"""
r = httpx.get(url, follow_redirects=True)
r.raise_for_status()
content_type = r.headers.get("content-type", "")
if not content_type.startswith("image/"):
e = NotImageError("URL does not point to an image")
e.content_type = content_type
raise e
size = r.headers.get("Content-Length")
if size and size.isdigit():
size = int(size)
dimensions = get_image_dimensions(BytesIO(r.content))
return cls(
location=url,
type=AttachmentFileTypes.URL,
size=size,
description=content_type,
dimensions=dimensions,
)

def __init__(self, /, **data: Any) -> None:
def __init__(self, /, **kwargs: Any) -> None:
"""Initialize an ImageFile instance with optional data.

If no name is provided, automatically generates a name using the internal _get_name() method.
If no size is set, retrieves the file size using _get_size() method.
If no dimensions are specified, determines image dimensions using _get_dimensions() method.

Args:
data: Keyword arguments for initializing the ImageFile instance. Can include optional attributes like name, size, and dimensions.
kwargs: Keyword arguments for initializing the ImageFile instance. Can include optional attributes like name, size, and dimensions.
"""
super().__init__(**data)
if not self.dimensions:
self.dimensions = self._get_dimensions()
super().__init__(**kwargs)
self.dimensions = self.dimensions or self._get_dimensions()

__init__.__pydantic_base_init__ = True

Expand Down Expand Up @@ -511,7 +523,7 @@ def resize(
self.resize_location = resize_location if success else None

@measure_time
def encode_image(self) -> str:
def encode_base64(self) -> str:
"""Encode the image file as a base64 string.

Returns:
Expand All @@ -523,19 +535,6 @@ def encode_image(self) -> str:
)
return super().encode_base64()

@property
def url(self) -> str:
"""Get the URL of the image file.

Returns:
The URL of the image file, or the base64-encoded image data if the image is in memory.
"""
if not isinstance(self.type, AttachmentFileTypes):
raise ValueError("Invalid image type")
if self.type == AttachmentFileTypes.URL:
return str(self.location)
return f"data:{self.mime_type};base64,{self.encode_image()}"

@property
def display_location(self):
"""Get the display location of the image file.
Expand Down
Loading