Print intermediate speech transcriptions.

Also fix the test: the mock audio stream now sleeps to approximate real time, and the `@flaky` marker is removed.
GoogleCloudPlatform · May 10, 2016 · 1be89c3 · 1be89c3
1 parent 2a813b9
commit 1be89c3
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 7 deletions.
diff --git a/speech/api/speech_streaming.py b/speech/api/speech_streaming.py
@@ -1,6 +1,7 @@
 #!/usr/bin/python
 
 import contextlib
+import re
 import threading
 
 from gcloud.credentials import get_credentials
@@ -70,16 +71,27 @@ def request_stream(stop_audio, channels=CHANNELS, rate=RATE, chunk=CHUNK):
         # The initial request must contain metadata about the stream, so the
         # server knows how to interpret it.
         metadata = InitialRecognizeRequest(
-            encoding='LINEAR16', sample_rate=rate)
-        audio_request = AudioRequest(content=audio_stream.read(chunk))
+            encoding='LINEAR16', sample_rate=rate,
+            # Note that setting interim_results to True means that you'll
+            # likely get multiple results for the same bit of audio, as the
+            # system re-interprets audio in the context of subsequent audio.
+            # However, this will give us quick results without having to tell
+            # the server when to finalize a piece of audio.
+            interim_results=True, continuous=False,
+        )
+        data = audio_stream.read(chunk)
+        audio_request = AudioRequest(content=data)
 
         yield RecognizeRequest(
             initial_request=metadata,
             audio_request=audio_request)
 
         while not stop_audio.is_set():
+            data = audio_stream.read(chunk)
+            if not data:
+                raise StopIteration()
             # Subsequent requests can all just have the content
-            audio_request = AudioRequest(content=audio_stream.read(chunk))
+            audio_request = AudioRequest(content=data)
 
             yield RecognizeRequest(audio_request=audio_request)
 
@@ -95,8 +107,7 @@ def listen_print_loop(recognize_stream):
 
         # Exit recognition if any of the transcribed phrases could be
         # one of our keywords.
-        if any(alt.confidence > .5 and
-               (alt.transcript.strip() in ('exit', 'quit'))
+        if any(re.search(r'\b(exit|quit)\b', alt.transcript)
                for result in resp.results
                for alt in result.alternatives):
             print('Exiting..')

diff --git a/speech/api/speech_streaming_test.py b/speech/api/speech_streaming_test.py
@@ -15,8 +15,8 @@
 import io
 import re
 import sys
+import time
 
-from gcp.testing.flaky import flaky
 import pytest
 
 import speech_streaming
@@ -39,6 +39,9 @@ def __call__(self, *args):
         return self
 
     def read(self, num_frames):
+        # Approximate real-time capture by sleeping for as long as the
+        # requested number of frames would take to record at RATE.
+        time.sleep(num_frames / float(speech_streaming.RATE))
         # audio is 16-bit samples, whereas python byte is 8-bit
         num_bytes = 2 * num_frames
         chunk = self.audio_file.read(num_bytes) or self.silence.read(num_bytes)
@@ -54,7 +57,6 @@ def mock_audio_stream(channels, rate, chunk):
     return mock_audio_stream
 
 
-@flaky
 @pytest.mark.skipif(
         sys.version_info >= (3, 0),
         reason=("grpc doesn't yet support python3 "