Support audio input for OpenAI models

Reviewed By: laurendeason
Differential Revision: D67126541
fbshipit-source-id: f6a229ae41a3fb127f9c00874609e9d2953bf23b
meta-llama · Dec 12, 2024 · 51fdd2c
1 parent e1db0e1
commit 51fdd2c
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 2 deletions.
diff --git a/CybersecurityBenchmarks/benchmark/benchmark_utils.py b/CybersecurityBenchmarks/benchmark/benchmark_utils.py
@@ -9,6 +9,7 @@
 import io
 import re
 import subprocess
+from pathlib import Path
 
 from typing import Optional
 
@@ -93,6 +94,12 @@ def image_to_b64(image_path: str, image_format: str = "PNG") -> str:
     return img_base64.decode("utf-8")
 
 
+def audio_to_b64(audio_path: str) -> str:
+    audio_bytes = Path(audio_path).read_bytes()
+    audio_b64 = base64.b64encode(audio_bytes)
+    return audio_b64.decode("utf-8")
+
+
 def get_mime_type(file_path: str) -> Optional[str]:
     result = subprocess.run(
         ["file", "--mime-type", "-b", file_path], capture_output=True, text=True

diff --git a/CybersecurityBenchmarks/benchmark/llm.py b/CybersecurityBenchmarks/benchmark/llm.py
@@ -19,7 +19,7 @@
 
 from typing_extensions import override
 
-from .benchmark_utils import image_to_b64
+from .benchmark_utils import audio_to_b64, image_to_b64
 from .cache_handler import CacheHandler
 
 
@@ -625,7 +625,15 @@ def query_multimodal(
             )
 
         if audio_path:
-            raise NotImplementedError("Audio input in OpenAI is not yet supported")
+            user_message_content.append(
+                {
+                    "type": "input_audio",
+                    "input_audio": {
+                        "data": audio_to_b64(str(audio_path)),
+                        "format": "wav",
+                    },
+                }
+            )
 
         messages.append(
             {