Update live_api_starter scripts. (#377)
* Update live_api_starter.py

* Update live_api_starter.py

* Update live_api_starter.py

* Update live_api_starter.py

* fix video_mode
MarkDaoust authored Dec 20, 2024
1 parent 89a301b commit ace3178
Showing 2 changed files with 98 additions and 46 deletions.
72 changes: 49 additions & 23 deletions gemini-2/live_api_starter.py
@@ -13,13 +13,37 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-# To install the dependencies for this script, run:
-# pip install google-genai opencv-python pyaudio pillow mss
-# And to run this script, ensure the GOOGLE_API_KEY environment
-# variable is set to the key you obtained from Google AI Studio.
-# Add the "--mode screen" if you want to share your screen to the model
-# instead of your camera stream
+"""
+## Setup
+
+To install the dependencies for this script, run:
+
+```
+pip install google-genai opencv-python pyaudio pillow mss
+```
+
+Before running this script, ensure the `GOOGLE_API_KEY` environment
+variable is set to the api-key you obtained from Google AI Studio.
+
+Important: **Use headphones**. This script uses the system default audio
+input and output, which often won't include echo cancellation. So to prevent
+the model from interrupting itself it is important that you use headphones.
+
+## Run
+
+To run the script:
+
+```
+python live_api_starter.py
+```
+
+The script takes a video-mode flag `--mode`, this can be "camera", "screen", or "none".
+The default is "camera". To share your screen run:
+
+```
+python live_api_starter.py --mode screen
+```
+"""

import asyncio
import base64
@@ -35,16 +59,6 @@

import argparse

-parser = argparse.ArgumentParser()
-parser.add_argument(
-"--mode",
-type=str,
-default="camera",
-help="pixels to stream from",
-choices=["camera", "screen"],
-)
-args = parser.parse_args()

from google import genai

if sys.version_info < (3, 11, 0):
@@ -61,7 +75,7 @@

MODEL = "models/gemini-2.0-flash-exp"

-MODE = args.mode
+DEFAULT_MODE = "camera"

client = genai.Client(http_options={"api_version": "v1alpha"})

Expand All @@ -71,11 +85,13 @@


class AudioLoop:
-def __init__(self):
+def __init__(self, video_mode=DEFAULT_MODE):
+self.video_mode = video_mode

self.audio_in_queue = None
-self.audio_out_queue = None
-self.video_out_queue = None
+self.out_queue = None


self.session = None

self.send_text_task = None
@@ -228,10 +244,11 @@ async def run(self):
send_text_task = tg.create_task(self.send_text())
tg.create_task(self.send_realtime())
tg.create_task(self.listen_audio())
if MODE == "camera":
if self.video_mode == "camera":
tg.create_task(self.get_frames())
elif MODE == "screen":
elif self.video_mode == "screen":
tg.create_task(self.get_screen())

tg.create_task(self.receive_audio())
tg.create_task(self.play_audio())

@@ -246,5 +263,14 @@


if __name__ == "__main__":
-main = AudioLoop()
+parser = argparse.ArgumentParser()
+parser.add_argument(
+"--mode",
+type=str,
+default=DEFAULT_MODE,
+help="pixels to stream from",
+choices=["camera", "screen", "none"],
+)
+args = parser.parse_args()
+main = AudioLoop(video_mode=args.mode)
asyncio.run(main.run())
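
A minimal usage sketch (not part of the commit) of what this refactor enables: `AudioLoop` can now be constructed with a `video_mode` directly instead of reading a module-level flag. It assumes the script is importable as a module named `live_api_starter` and that `GOOGLE_API_KEY` is set, as the docstring above describes.

```
# Hypothetical sketch, not from the commit: drive AudioLoop programmatically
# rather than through the new --mode command-line flag.
# Assumes the script is importable as `live_api_starter` and GOOGLE_API_KEY is set.
import asyncio

from live_api_starter import AudioLoop

# video_mode mirrors the --mode flag: "camera" (default), "screen", or "none".
loop = AudioLoop(video_mode="none")
asyncio.run(loop.run())
```
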
72 changes: 49 additions & 23 deletions gemini-2/websockets/live_api_starter.py
@@ -13,13 +13,37 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-# To install the dependencies for this script, run:
-# pip install opencv-python pyaudio pillow websockets
-# And to run this script, ensure the GOOGLE_API_KEY environment
-# variable is set to the key you obtained from Google AI Studio.
-# Add the "--mode screen" if you want to share your screen to the model
-# instead of your camera stream
+"""
+## Setup
+
+To install the dependencies for this script, run:
+
+```
+pip install google-genai opencv-python pyaudio pillow mss
+```
+
+Before running this script, ensure the `GOOGLE_API_KEY` environment
+variable is set to the api-key you obtained from Google AI Studio.
+
+Important: **Use headphones**. This script uses the system default audio
+input and output, which often won't include echo cancellation. So to prevent
+the model from interrupting itself it is important that you use headphones.
+
+## Run
+
+To run the script:
+
+```
+python live_api_starter.py
+```
+
+The script takes a video-mode flag `--mode`, this can be "camera", "screen", or "none".
+The default is "camera". To share your screen run:
+
+```
+python live_api_starter.py --mode screen
+```
+"""

import asyncio
import base64
@@ -35,16 +59,6 @@
import mss
import argparse

-parser = argparse.ArgumentParser()
-parser.add_argument(
-"--mode",
-type=str,
-default="camera",
-help="pixels to stream from",
-choices=["camera", "screen"],
-)
-args = parser.parse_args()

from websockets.asyncio.client import connect

if sys.version_info < (3, 11, 0):
@@ -61,15 +75,16 @@

host = "generativelanguage.googleapis.com"
model = "gemini-2.0-flash-exp"
DEFAULT_MODE="camera"

-MODE = args.mode

api_key = os.environ["GOOGLE_API_KEY"]
uri = f"wss://{host}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={api_key}"


class AudioLoop:
-def __init__(self):
+def __init__(self, video_mode=DEFAULT_MODE):
+self.video_mode=video_mode
self.audio_in_queue = None
self.out_queue = None

@@ -160,8 +175,9 @@ async def get_screen(self):
break

await asyncio.sleep(1.0)

-await self.out_queue.put(frame)

msg = {"realtime_input": {"media_chunks": frame}}
await self.out_queue.put(msg)

async def send_realtime(self):
while True:
@@ -254,9 +270,9 @@ async def run(self):

tg.create_task(self.send_realtime())
tg.create_task(self.listen_audio())
if MODE == "camera":
if self.video_mode == "camera":
tg.create_task(self.get_frames())
elif MODE == "screen":
elif self.video_mode == "screen":
tg.create_task(self.get_screen())
tg.create_task(self.receive_audio())
tg.create_task(self.play_audio())
@@ -272,5 +288,15 @@


if __name__ == "__main__":
-main = AudioLoop()
+parser = argparse.ArgumentParser()
+parser.add_argument(
+"--mode",
+type=str,
+default=DEFAULT_MODE,
+help="pixels to stream from",
+choices=["camera", "screen", "none"],
+)
+args = parser.parse_args()
+
+main = AudioLoop(video_mode=args.mode)
asyncio.run(main.run())
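
For context on the `get_screen()` change above, here is a minimal sketch (not part of the commit) of the `realtime_input` message the websocket variant now queues for `send_realtime()`. The `realtime_input` and `media_chunks` keys come from the diff; the per-chunk `mime_type`/`data` layout of `frame` is an assumption about what the capture helper returns, not something this hunk shows.

```
# Hypothetical sketch, not from the commit: build the realtime_input message
# that send_realtime() would forward over the websocket as JSON.
# The "realtime_input" / "media_chunks" keys match the diff; the per-chunk
# {"mime_type": ..., "data": ...} shape of `frame` is an assumption.
import base64
import json

def build_realtime_input(jpeg_bytes: bytes) -> str:
    frame = [{
        "mime_type": "image/jpeg",
        "data": base64.b64encode(jpeg_bytes).decode(),
    }]
    msg = {"realtime_input": {"media_chunks": frame}}
    return json.dumps(msg)
```
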
