diff --git a/gemini-2/live_api_starter.py b/gemini-2/live_api_starter.py index df8ff0964..14e929c4e 100755 --- a/gemini-2/live_api_starter.py +++ b/gemini-2/live_api_starter.py @@ -13,13 +13,37 @@ # See the License for the specific language governing permissions and # limitations under the License. -# To install the dependencies for this script, run: -# pip install google-genai opencv-python pyaudio pillow mss -# And to run this script, ensure the GOOGLE_API_KEY environment -# variable is set to the key you obtained from Google AI Studio. +""" +## Setup -# Add the "--mode screen" if you want to share your screen to the model -# instead of your camera stream +To install the dependencies for this script, run: + +``` +pip install google-genai opencv-python pyaudio pillow mss +``` + +Before running this script, ensure the `GOOGLE_API_KEY` environment +variable is set to the API key you obtained from Google AI Studio. + +Important: **Use headphones**. This script uses the system default audio +input and output, which often won't include echo cancellation. So to prevent +the model from interrupting itself it is important that you use headphones. + +## Run + +To run the script: + +``` +python live_api_starter.py +``` + +The script takes a video-mode flag `--mode`; this can be "camera", "screen", or "none". +The default is "camera".
To share your screen run: + +``` +python live_api_starter.py --mode screen +``` +""" import asyncio import base64 @@ -35,16 +59,6 @@ import argparse -parser = argparse.ArgumentParser() -parser.add_argument( - "--mode", - type=str, - default="camera", - help="pixels to stream from", - choices=["camera", "screen"], -) -args = parser.parse_args() - from google import genai if sys.version_info < (3, 11, 0): @@ -61,7 +75,7 @@ MODEL = "models/gemini-2.0-flash-exp" -MODE = args.mode +DEFAULT_MODE = "camera" client = genai.Client(http_options={"api_version": "v1alpha"}) @@ -71,11 +85,13 @@ class AudioLoop: - def __init__(self): + def __init__(self, video_mode=DEFAULT_MODE): + self.video_mode = video_mode + self.audio_in_queue = None - self.audio_out_queue = None - self.video_out_queue = None + self.out_queue = None + self.session = None self.send_text_task = None @@ -228,10 +244,11 @@ async def run(self): send_text_task = tg.create_task(self.send_text()) tg.create_task(self.send_realtime()) tg.create_task(self.listen_audio()) - if MODE == "camera": + if self.video_mode == "camera": tg.create_task(self.get_frames()) - elif MODE == "screen": + elif self.video_mode == "screen": tg.create_task(self.get_screen()) + tg.create_task(self.receive_audio()) tg.create_task(self.play_audio()) @@ -246,5 +263,14 @@ async def run(self): if __name__ == "__main__": - main = AudioLoop() + parser = argparse.ArgumentParser() + parser.add_argument( + "--mode", + type=str, + default=DEFAULT_MODE, + help="pixels to stream from", + choices=["camera", "screen", "none"], + ) + args = parser.parse_args() + main = AudioLoop(video_mode=args.mode) asyncio.run(main.run()) diff --git a/gemini-2/websockets/live_api_starter.py b/gemini-2/websockets/live_api_starter.py index 42b4042aa..1b0e688c4 100755 --- a/gemini-2/websockets/live_api_starter.py +++ b/gemini-2/websockets/live_api_starter.py @@ -13,13 +13,37 @@ # See the License for the specific language governing permissions and # limitations under the 
License. -# To install the dependencies for this script, run: -# pip install opencv-python pyaudio pillow websockets -# And to run this script, ensure the GOOGLE_API_KEY environment -# variable is set to the key you obtained from Google AI Studio. +""" +## Setup -# Add the "--mode screen" if you want to share your screen to the model -# instead of your camera stream +To install the dependencies for this script, run: + +``` +pip install opencv-python pyaudio pillow websockets mss +``` + +Before running this script, ensure the `GOOGLE_API_KEY` environment +variable is set to the API key you obtained from Google AI Studio. + +Important: **Use headphones**. This script uses the system default audio +input and output, which often won't include echo cancellation. So to prevent +the model from interrupting itself it is important that you use headphones. + +## Run + +To run the script: + +``` +python live_api_starter.py +``` + +The script takes a video-mode flag `--mode`; this can be "camera", "screen", or "none". +The default is "camera".
To share your screen run: + +``` +python live_api_starter.py --mode screen +``` +""" import asyncio import base64 @@ -35,16 +59,6 @@ import mss import argparse -parser = argparse.ArgumentParser() -parser.add_argument( - "--mode", - type=str, - default="camera", - help="pixels to stream from", - choices=["camera", "screen"], -) -args = parser.parse_args() - from websockets.asyncio.client import connect if sys.version_info < (3, 11, 0): @@ -61,15 +75,16 @@ host = "generativelanguage.googleapis.com" model = "gemini-2.0-flash-exp" +DEFAULT_MODE="camera" -MODE = args.mode api_key = os.environ["GOOGLE_API_KEY"] uri = f"wss://{host}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={api_key}" class AudioLoop: - def __init__(self): + def __init__(self, video_mode=DEFAULT_MODE): + self.video_mode=video_mode self.audio_in_queue = None self.out_queue = None @@ -160,8 +175,9 @@ async def get_screen(self): break await asyncio.sleep(1.0) - - await self.out_queue.put(frame) + + msg = {"realtime_input": {"media_chunks": frame}} + await self.out_queue.put(msg) async def send_realtime(self): while True: @@ -254,9 +270,9 @@ async def run(self): tg.create_task(self.send_realtime()) tg.create_task(self.listen_audio()) - if MODE == "camera": + if self.video_mode == "camera": tg.create_task(self.get_frames()) - elif MODE == "screen": + elif self.video_mode == "screen": tg.create_task(self.get_screen()) tg.create_task(self.receive_audio()) tg.create_task(self.play_audio()) @@ -272,5 +288,15 @@ async def run(self): if __name__ == "__main__": - main = AudioLoop() + parser = argparse.ArgumentParser() + parser.add_argument( + "--mode", + type=str, + default=DEFAULT_MODE, + help="pixels to stream from", + choices=["camera", "screen", "none"], + ) + args = parser.parse_args() + + main = AudioLoop(video_mode=args.mode) asyncio.run(main.run())