diff --git a/autogpt_platform/backend/.env.example b/autogpt_platform/backend/.env.example
index 0ec84ca83e66..298178733453 100644
--- a/autogpt_platform/backend/.env.example
+++ b/autogpt_platform/backend/.env.example
@@ -95,6 +95,9 @@ REPLICATE_API_KEY=
 # Ideogram
 IDEOGRAM_API_KEY=
 
+# OpenRouter
+OPENROUTER_API_KEY=
+
 # Logging Configuration
 LOG_LEVEL=INFO
 ENABLE_CLOUD_LOGGING=false
diff --git a/autogpt_platform/backend/backend/blocks/multimodal_ai_block.py b/autogpt_platform/backend/backend/blocks/multimodal_ai_block.py
new file mode 100644
index 000000000000..04a484c7ed39
--- /dev/null
+++ b/autogpt_platform/backend/backend/blocks/multimodal_ai_block.py
@@ -0,0 +1,123 @@
+from enum import Enum
+
+from openai import OpenAI
+
+from backend.data.block import Block, BlockCategory, BlockOutput, BlockSchema
+from backend.data.model import BlockSecret, SchemaField, SecretField
+
+
+class MultimodalAIModelName(str, Enum):
+    """Identifiers of the vision-capable models this block can request."""
+
+    LLAMA_VISION_11B = "meta-llama/llama-3.2-11b-vision-instruct:free"
+    LLAMA_VISION_90B = "meta-llama/llama-3.2-90b-vision-instruct"
+    PIXTRAL_12B = "mistralai/pixtral-12b"
+    CHATGPT_4O_LATEST = "openai/chatgpt-4o-latest"
+
+
+class MultimodalAIBlock(Block):
+    """Send a text prompt and an image URL to a chat model via OpenRouter."""
+
+    class Input(BlockSchema):
+        api_key: BlockSecret = SecretField(
+            key="openrouter_api_key",
+            description="OpenRouter API Key",
+            advanced=False,
+        )
+        prompt: str = SchemaField(
+            description="Text prompt for multimodal AI response",
+            placeholder="e.g., 'Describe the contents of the image'",
+            title="Prompt",
+        )
+        image_url: str = SchemaField(
+            description="URL of the image to analyze",
+            placeholder="e.g., 'https://example.com/image.jpg'",
+            title="Image URL",
+        )
+        model_name: MultimodalAIModelName = SchemaField(
+            description="The name of the multimodal AI model",
+            default=MultimodalAIModelName.LLAMA_VISION_11B,
+            title="Multimodal AI Model",
+            advanced=False,
+        )
+
+    class Output(BlockSchema):
+        result: str = SchemaField(description="Generated output")
+        error: str = SchemaField(description="Error message if the model run failed")
+
+    def __init__(self):
+        super().__init__(
+            id="62f5be31-9896-40ed-bd71-2dda74459e20",
+            description="This block runs multimodal AI models on OpenRouter with advanced settings.",
+            categories={BlockCategory.AI},
+            input_schema=MultimodalAIBlock.Input,
+            output_schema=MultimodalAIBlock.Output,
+            test_input={
+                "api_key": "test_api_key",
+                "model_name": MultimodalAIModelName.LLAMA_VISION_11B,
+                "prompt": "Describe the contents of the image",
+                "image_url": "https://example.com/image.jpg",
+            },
+            test_output=[
+                (
+                    "result",
+                    "The image depicts a serene boardwalk surrounded by lush greenery.",
+                ),
+            ],
+            test_mock={
+                "run_model": lambda api_key, model_name, prompt, image_url: "The image depicts a serene boardwalk surrounded by lush greenery.",
+            },
+        )
+
+    def run(self, input_data: Input, **kwargs) -> BlockOutput:
+        """Yield ("result", text) on success, or ("error", message) on failure."""
+        try:
+            result = self.run_model(
+                api_key=input_data.api_key.get_secret_value(),
+                model_name=input_data.model_name,
+                prompt=input_data.prompt,
+                image_url=input_data.image_url,
+            )
+            yield "result", result
+        except Exception as e:
+            # Deliberately broad: any failure is reported on the "error" output
+            # instead of propagating out of the block.
+            yield "error", str(e)
+
+    def run_model(self, api_key, model_name, prompt, image_url):
+        """Request one completion for `prompt` + `image_url` and return its text.
+
+        `test_mock` in `__init__` supplies a stand-in for this method.
+
+        Raises:
+            ValueError: if the response has no choices or no text content.
+        """
+        client = OpenAI(
+            base_url="https://openrouter.ai/api/v1",
+            api_key=api_key,
+        )
+
+        # Send the plain model id string; callers may pass the enum or a str.
+        model_id = model_name.value if isinstance(model_name, Enum) else model_name
+
+        completion = client.chat.completions.create(
+            model=model_id,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": prompt},
+                        {"type": "image_url", "image_url": {"url": image_url}},
+                    ],
+                }
+            ],
+        )
+
+        # Fail with a clear message rather than an IndexError/TypeError (or a
+        # None "result") when the response carries no usable text.
+        if not completion.choices:
+            raise ValueError("OpenRouter returned no completion choices")
+        content = completion.choices[0].message.content
+        if content is None:
+            raise ValueError("OpenRouter returned a completion without text content")
+        return content
diff --git a/autogpt_platform/backend/backend/util/settings.py b/autogpt_platform/backend/backend/util/settings.py index f521bf104f7d..fde2e83465a3 100644 --- 
a/autogpt_platform/backend/backend/util/settings.py +++ b/autogpt_platform/backend/backend/util/settings.py @@ -250,7 +250,7 @@ class Secrets(UpdateTrackingModel["Secrets"], BaseSettings): replicate_api_key: str = Field(default="", description="Replicate API Key") unreal_speech_api_key: str = Field(default="", description="Unreal Speech API Key") ideogram_api_key: str = Field(default="", description="Ideogram API Key") - + openrouter_api_key: str = Field(default="", description="OpenRouter API Key") # Add more secret fields as needed model_config = SettingsConfigDict(