From 51fdd2c79bb727227ce709bb5ae71bb8f3219e93 Mon Sep 17 00:00:00 2001 From: Stephanie Ding Date: Thu, 12 Dec 2024 15:24:05 -0800 Subject: [PATCH] Support audio input for OpenAI models Reviewed By: laurendeason Differential Revision: D67126541 fbshipit-source-id: f6a229ae41a3fb127f9c00874609e9d2953bf23b --- CybersecurityBenchmarks/benchmark/benchmark_utils.py | 7 +++++++ CybersecurityBenchmarks/benchmark/llm.py | 12 ++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/CybersecurityBenchmarks/benchmark/benchmark_utils.py b/CybersecurityBenchmarks/benchmark/benchmark_utils.py index d1e78174..cddf6d3b 100644 --- a/CybersecurityBenchmarks/benchmark/benchmark_utils.py +++ b/CybersecurityBenchmarks/benchmark/benchmark_utils.py @@ -9,6 +9,7 @@ import io import re import subprocess +from pathlib import Path from typing import Optional @@ -93,6 +94,12 @@ def image_to_b64(image_path: str, image_format: str = "PNG") -> str: return img_base64.decode("utf-8") +def audio_to_b64(audio_path: str) -> str: + audio_bytes = Path(audio_path).read_bytes() + audio_b64 = base64.b64encode(audio_bytes) + return audio_b64.decode("utf-8") + + def get_mime_type(file_path: str) -> Optional[str]: result = subprocess.run( ["file", "--mime-type", "-b", file_path], capture_output=True, text=True diff --git a/CybersecurityBenchmarks/benchmark/llm.py b/CybersecurityBenchmarks/benchmark/llm.py index 4cb3cb63..a8b9c60d 100644 --- a/CybersecurityBenchmarks/benchmark/llm.py +++ b/CybersecurityBenchmarks/benchmark/llm.py @@ -19,7 +19,7 @@ from typing_extensions import override -from .benchmark_utils import image_to_b64 +from .benchmark_utils import audio_to_b64, image_to_b64 from .cache_handler import CacheHandler @@ -625,7 +625,15 @@ def query_multimodal( ) if audio_path: - raise NotImplementedError("Audio input in OpenAI is not yet supported") + user_message_content.append( + { + "type": "input_audio", + "input_audio": { + "data": audio_to_b64(str(audio_path)), + "format": "wav", + }, + } + ) messages.append( {