diff --git a/speech/api/README.md b/speech/api/README.md
index b2207f740162..cfeb46a356c7 100644
--- a/speech/api/README.md
+++ b/speech/api/README.md
@@ -52,6 +52,16 @@ for more information.
     $ pip install -r requirements-speech_streaming.txt
     ```
 
+    The sample uses the [PyAudio][pyaudio] library to stream audio from your
+    computer's microphone. PyAudio depends on [PortAudio][portaudio], which may
+    need to be compiled when you install PyAudio. If you run into compilation
+    issues that mention PortAudio, you may have to [install some
+    dependencies][pyaudio-install].
+
+[pyaudio]: https://people.csail.mit.edu/hubert/pyaudio/
+[portaudio]: http://www.portaudio.com/
+[pyaudio-install]: https://people.csail.mit.edu/hubert/pyaudio/#downloads
+
 ## Run the example
 
 * To run the `speech_rest.py` sample:
diff --git a/speech/api/speech_streaming.py b/speech/api/speech_streaming.py
index a860e37bf931..606a1c732f5d 100644
--- a/speech/api/speech_streaming.py
+++ b/speech/api/speech_streaming.py
@@ -1,6 +1,7 @@
 #!/usr/bin/python
 
 import contextlib
+import re
 import threading
 
 from gcloud.credentials import get_credentials
@@ -70,16 +71,27 @@ def request_stream(stop_audio, channels=CHANNELS, rate=RATE, chunk=CHUNK):
         # The initial request must contain metadata about the stream, so the
         # server knows how to interpret it.
         metadata = InitialRecognizeRequest(
-            encoding='LINEAR16', sample_rate=rate)
-        audio_request = AudioRequest(content=audio_stream.read(chunk))
+            encoding='LINEAR16', sample_rate=rate,
+            # Note that setting interim_results to True means that you'll
+            # likely get multiple results for the same bit of audio, as the
+            # system re-interprets audio in the context of subsequent audio.
+            # However, this will give us quick results without having to tell
+            # the server when to finalize a piece of audio.
+            interim_results=True, continuous=False,
+        )
+        data = audio_stream.read(chunk)
+        audio_request = AudioRequest(content=data)
 
         yield RecognizeRequest(
             initial_request=metadata, audio_request=audio_request)
 
         while not stop_audio.is_set():
+            data = audio_stream.read(chunk)
+            if not data:
+                raise StopIteration()
             # Subsequent requests can all just have the content
-            audio_request = AudioRequest(content=audio_stream.read(chunk))
+            audio_request = AudioRequest(content=data)
 
             yield RecognizeRequest(audio_request=audio_request)
 
 
@@ -95,8 +107,7 @@ def listen_print_loop(recognize_stream):
 
         # Exit recognition if any of the transcribed phrases could be
         # one of our keywords.
-        if any(alt.confidence > .5 and
-               (alt.transcript.strip() in ('exit', 'quit'))
+        if any(re.search(r'\b(exit|quit)\b', alt.transcript)
                for result in resp.results
                for alt in result.alternatives):
             print('Exiting..')
diff --git a/speech/api/speech_streaming_test.py b/speech/api/speech_streaming_test.py
index fe7ea375198e..e3b71b88ac99 100644
--- a/speech/api/speech_streaming_test.py
+++ b/speech/api/speech_streaming_test.py
@@ -15,8 +15,8 @@
 import io
 import re
 import sys
+import time
 
-from gcp.testing.flaky import flaky
 import pytest
 
 import speech_streaming
@@ -39,6 +39,9 @@ def __call__(self, *args):
         return self
 
     def read(self, num_frames):
+        # Approximate realtime by sleeping for the appropriate time for the
+        # requested number of frames
+        time.sleep(num_frames / float(speech_streaming.RATE))
         # audio is 16-bit samples, whereas python byte is 8-bit
         num_bytes = 2 * num_frames
         chunk = self.audio_file.read(num_bytes) or self.silence.read(num_bytes)
@@ -54,7 +57,6 @@ def mock_audio_stream(channels, rate, chunk):
     return mock_audio_stream
 
 
-@flaky
 @pytest.mark.skipif(
     sys.version_info >= (3, 0),
     reason=("grpc doesn't yet support python3 "
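
The shape of the `request_stream` generator in the diff above is: the first yielded request carries the stream metadata (encoding, sample rate, `interim_results`), every later request carries only audio content, and the stream ends when the stop event is set or a read returns no data. Here is a minimal sketch of that pattern using stand-in namedtuples rather than the generated `cloud_speech` protobuf classes (`InitialRecognizeRequest`, `AudioRequest`, `RecognizeRequest`) that the real sample imports; the names and defaults below are illustrative assumptions only:

```python
import collections

# Stand-in request types, for illustration only; the real sample uses the
# generated cloud_speech protobuf messages.
InitialRequest = collections.namedtuple(
    'InitialRequest', ['encoding', 'sample_rate', 'interim_results'])
Request = collections.namedtuple('Request', ['initial_request', 'content'])


def request_stream(audio_stream, stop_audio, rate=16000, chunk=1600):
    """Yield one metadata-bearing request, then audio-only requests."""
    # The first request tells the server how to decode the audio that follows.
    metadata = InitialRequest('LINEAR16', rate, True)
    yield Request(metadata, audio_stream.read(chunk))

    while not stop_audio.is_set():
        data = audio_stream.read(chunk)
        if not data:
            return  # no more audio ends the request stream
        # Subsequent requests carry only audio content.
        yield Request(None, data)
```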
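
The exit check in `listen_print_loop` switches from an exact comparison on the stripped transcript to a word-boundary regex, so "exit" or "quit" is caught anywhere in the phrase without matching substrings of longer words. A small self-contained illustration, with made-up transcripts:

```python
import re

KEYWORD_RE = re.compile(r'\b(exit|quit)\b')

# Hypothetical transcripts, for illustration only.
for transcript in ['quit', 'please quit now', 'requital', 'exited']:
    old_style = transcript.strip() in ('exit', 'quit')      # exact match only
    new_style = bool(KEYWORD_RE.search(transcript))          # keyword anywhere
    print('%-20r old=%-5s new=%s' % (transcript, old_style, new_style))
```

"please quit now" is only caught by the regex form, while "requital" and "exited" are rejected by both because the `\b` boundaries rule out substring matches.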
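
In the test, the mock `read` sleeps for `num_frames / RATE` seconds so the fake microphone delivers audio at roughly realtime, and it reads `2 * num_frames` bytes because each LINEAR16 frame is one 16-bit sample. A sketch of the same arithmetic; the rate, chunk size, and file name are assumptions for illustration:

```python
import time

RATE = 16000        # samples per second (16000 assumed here)
CHUNK = RATE // 10  # 100 ms of audio per read (an assumed chunk size)


def paced_read(audio_file, num_frames, rate=RATE):
    """Read 16-bit mono frames, pacing reads to approximate realtime."""
    time.sleep(num_frames / float(rate))     # num_frames frames span num_frames/rate seconds
    return audio_file.read(2 * num_frames)   # 2 bytes per 16-bit sample


with open('quit.raw', 'rb') as f:  # hypothetical raw LINEAR16 recording
    while True:
        chunk = paced_read(f, CHUNK)
        if not chunk:
            break
```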