diff --git a/docs/speech-usage.rst b/docs/speech-usage.rst index 6161f1925d09..3e1ada07b6bc 100644 --- a/docs/speech-usage.rst +++ b/docs/speech-usage.rst @@ -157,6 +157,8 @@ data to possible text alternatives on the fly. See: https://cloud.google.com/speech/limits#content +.. code-block:: python + >>> import io >>> from google.cloud import speech >>> from google.cloud.speech.encoding import Encoding @@ -164,17 +166,38 @@ data to possible text alternatives on the fly. >>> with io.open('./hello.wav', 'rb') as stream: >>> sample = client.sample(stream=stream, encoding=Encoding.LINEAR16, ... sample_rate=16000) - >>> stream_container = client.stream_recognize(sample) - >>> print(stream_container) - - >>> print(stream_container.responses) - {0: } - >>> print(stream_container.responses[0].results[0].alternatives[0].confidence) - 0.698092460632 - >>> print(stream_container.is_finished) + >>> for response in client.stream_recognize(sample): + ... print(response.transcript) + hello + ... print(response.is_final) True - >>> print stream_container.get_full_text() + + +By setting ``interim_results`` to true, interim results (tentative hypotheses) +may be returned as they become available (these interim results are indicated +with the is_final=false flag). If false or omitted, only is_final=true +result(s) are returned. + +.. code-block:: python + + >>> import io + >>> from google.cloud import speech + >>> from google.cloud.speech.encoding import Encoding + >>> client = speech.Client() + >>> with io.open('./hello.wav', 'rb') as stream: + >>> sample = client.sample(stream=stream, encoding=Encoding.LINEAR16, + ... sample_rate=16000) + >>> for response in client.stream_recognize(sample, + ... interim_results=True): + ... print(response.transcript) + hell + ... print(response.is_final) + False + ... print(response.transcript) hello + ... 
print(response.is_final) + True + By default the recognizer will perform continuous recognition (continuing to process audio even if the user pauses speaking) until the client @@ -195,32 +218,6 @@ See: `Single Utterance`_ >>> print(stream_container.get_full_text()) hello - -If ``interim_results`` is set to ``True``, interim results -(tentative hypotheses) may be returned as they become available. - - .. code-block:: python - - >>> with io.open('./hello_pause_goodbye.wav', 'rb') as stream: - >>> sample = client.sample(stream=stream, encoding=Encoding.LINEAR16, - ... sample_rate=16000) - >>> stream_container = client.stream_recognize(sample, - ... interim_results=True) - >>> print(stream_container.get_full_text()) - hello - - >>> sample = client.sample(source_uri='gs://my-bucket/recording.flac', - ... encoding=Encoding.FLAC, - ... sample_rate=44100) - >>> results = client.stream_recognize(sample, interim_results=True) - >>> print(stream_container.responses[0].results[0].alternatives[0].transcript) - how - print(stream_container.responses[1].results[0].alternatives[0].transcript) - hello - >>> print(stream_container.responses[1].results[2].is_final) - True - - .. _Single Utterance: https://cloud.google.com/speech/reference/rpc/google.cloud.speech.v1beta1#streamingrecognitionconfig .. _sync_recognize: https://cloud.google.com/speech/reference/rest/v1beta1/speech/syncrecognize .. 
_Speech Asynchronous Recognize: https://cloud.google.com/speech/reference/rest/v1beta1/speech/asyncrecognize diff --git a/speech/google/cloud/speech/client.py b/speech/google/cloud/speech/client.py index c9598d16cee4..8809b4eed540 100644 --- a/speech/google/cloud/speech/client.py +++ b/speech/google/cloud/speech/client.py @@ -26,7 +26,7 @@ from google.cloud.speech.operation import Operation from google.cloud.speech.streaming.request import _make_request_stream from google.cloud.speech.sample import Sample -from google.cloud.speech.streaming.container import StreamingResponseContainer +from google.cloud.speech.streaming.response import StreamingSpeechResponse try: from google.cloud.gapic.speech.v1beta1.speech_api import SpeechApi @@ -302,12 +302,9 @@ def stream_recognize(self, sample, language_code=None, single_utterance=single_utterance, interim_results=interim_results) - responses = StreamingResponseContainer() for response in self.speech_api.streaming_recognize(requests): - if response: - responses.add_response(response) - - return responses + if hasattr(response, 'results') or interim_results: + yield StreamingSpeechResponse.from_pb(response) @property def speech_api(self): diff --git a/speech/google/cloud/speech/streaming/container.py b/speech/google/cloud/speech/streaming/container.py deleted file mode 100644 index 52384b9597a2..000000000000 --- a/speech/google/cloud/speech/streaming/container.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright 2016 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Representation of a group of GAPIC Speech API responses.""" - -from google.cloud.speech.streaming.response import StreamingSpeechResponse - - -class StreamingResponseContainer(object): - """Response container to help manage streaming responses. - - :type responses: list of :class:`~response.StreamingSpeechResponse` - :param responses: List of ``StreamingSpeechResponse`` objects. - """ - def __init__(self, responses=None): - self._responses = responses or {} - - def add_response(self, response): - """Add/update response based on the ``result_index``. - - :type response: :class:`~response.StreamingSpeechResponse` - :param response: Instance of ``StreamingSpeechResponse``. - """ - self._responses.update({response.result_index: - StreamingSpeechResponse.from_pb(response)}) - - @property - def responses(self): - """All responses held in container. - - :rtype: list of :class:`~response.StreamingSpeechResponse` - :returns: List of ``StreamingSpeechResponse`` objects. - """ - return self._responses - - @property - def is_finished(self): - """Helper property to determin if all resuls are final. - - :rtype: bool - :returns: True of all captured results are final. - """ - finished = [] - for response in self.responses.values(): - for result in response.results: - finished.append(result.is_final) - return all(finished) - - def get_full_text(self): - """Parse together all transcript results to form complete text. - - :rtype: str - :returns: Complete transcription. 
- """ - text = None - if self.is_finished: - text = '' - for response in self.responses.values(): - for result in response.results: - text += result.alternatives[0].transcript - return text diff --git a/speech/google/cloud/speech/streaming/endpointer_type.py b/speech/google/cloud/speech/streaming/endpointer_type.py new file mode 100644 index 000000000000..987775a6a75d --- /dev/null +++ b/speech/google/cloud/speech/streaming/endpointer_type.py @@ -0,0 +1,14 @@ +class EndpointerType(object): + ENDPOINTER_EVENT_UNSPECIFIED = 0 + START_OF_SPEECH = 1 + END_OF_SPEECH = 2 + END_OF_AUDIO = 3 + END_OF_UTTERANCE = 4 + + reverse_map = { + 0: 'ENDPOINTER_EVENT_UNSPECIFIED', + 1: 'START_OF_SPEECH', + 2: 'END_OF_SPEECH', + 3: 'END_OF_AUDIO', + 4: 'END_OF_UTTERANCE' + } diff --git a/speech/google/cloud/speech/streaming/response.py b/speech/google/cloud/speech/streaming/response.py index ec9428985913..4caf39ba186c 100644 --- a/speech/google/cloud/speech/streaming/response.py +++ b/speech/google/cloud/speech/streaming/response.py @@ -14,6 +14,7 @@ """Representation of a GAPIC Speech API response.""" +from google.cloud.speech.streaming.endpointer_type import EndpointerType from google.cloud.speech.streaming.result import StreamingSpeechResult @@ -34,9 +35,12 @@ class StreamingSpeechResponse(object): :param result_index: Index for specific result set. Used for updating with ``interim_results``. """ - def __init__(self, error, endpointer_type, results, result_index): + def __init__(self, error=None, endpointer_type=None, results=None, + result_index=None): + results = results or [] self._error = error - self._endpointer_type = endpointer_type # Should be enum. 
+ self._endpointer_type = EndpointerType.reverse_map.get( + endpointer_type, None) + self._result_index = result_index + self._results = [StreamingSpeechResult.from_pb(result) + for result in results] @@ -56,7 +60,41 @@ def from_pb(cls, pb_response): endpointer_type = pb_response.endpointer_type results = pb_response.results result_index = pb_response.result_index - return cls(error, endpointer_type, results, result_index) + return cls(error=error, endpointer_type=endpointer_type, + results=results, result_index=result_index) + + @property + def confidence(self): + """Confidence score for recognized speech. + + :rtype: float + :returns: Confidence score of recognized speech [0.0-1.0]. + """ + if self.results and self.results[0].alternatives: + return self.results[0].alternatives[0].confidence + else: + return 0.0 + + @property + def endpointer_type(self): + """Endpointer indicating the state of the speech detection. + + :rtype: str + :returns: String derived from :class:`~endpointer_type.EndpointerType`. + """ + return self._endpointer_type + + @property + def is_final(self): + """Represents an interim result that may change. + + :rtype: bool + :returns: True if the result has completed its processing. + """ + if len(self.results): + return self.results[0].is_final + else: + return False @property def result_index(self): @@ -75,3 +113,15 @@ def results(self): :returns: List of ``StreamingSpeechResult`` in this response. """ return self._results + + @property + def transcript(self): + """Get most likely transcript from response. + + :rtype: str + :returns: Transcript text from response.
+ """ + if self.results and self.results[0].alternatives: + return self.results[0].alternatives[0].transcript + else: + return '' diff --git a/speech/google/cloud/speech/streaming/result.py b/speech/google/cloud/speech/streaming/result.py index 104916eda9e0..6cfc37c18ad5 100644 --- a/speech/google/cloud/speech/streaming/result.py +++ b/speech/google/cloud/speech/streaming/result.py @@ -70,4 +70,4 @@ def is_final(self): :rtype: bool :returns: True if the result has completed it's processing. """ - return self._is_final + return bool(self._is_final) diff --git a/speech/unit_tests/streaming/test_container.py b/speech/unit_tests/streaming/test_container.py deleted file mode 100644 index 3d1d8bd13c35..000000000000 --- a/speech/unit_tests/streaming/test_container.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright 2016 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - - -class TestStreamingContainer(unittest.TestCase): - def _getTargetClass(self): - from google.cloud.speech.streaming.container import ( - StreamingResponseContainer) - return StreamingResponseContainer - - def _makeOne(self, *args, **kw): - return self._getTargetClass()(*args, **kw) - - def test_ctor(self): - streaming_container = self._makeOne() - self.assertEqual(streaming_container.responses, {}) - streaming_container.add_response(_MockGAPICSpeechResponse()) - self.assertEqual(len(streaming_container.responses), 1) - - def test_is_not_finished(self): - true_result = _MockGAPICSpeechResult() - true_result.is_final = True - - false_result = _MockGAPICSpeechResult() - false_result.is_final = False - - first_response = _MockGAPICSpeechResponse() - first_response.results.append(true_result) - first_response.results.append(false_result) - - second_response = _MockGAPICSpeechResponse() - second_response.results.append(true_result) - second_response.results.append(true_result) - - streaming_container = self._makeOne() - streaming_container.add_response(first_response) - streaming_container.add_response(second_response) - - self.assertFalse(streaming_container.is_finished) - - def test_is_finished(self): - true_result = _MockGAPICSpeechResult() - true_result.is_final = True - - first_response = _MockGAPICSpeechResponse() - first_response.results.append(true_result) - first_response.results.append(true_result) - - second_response = _MockGAPICSpeechResponse() - second_response.results.append(true_result) - second_response.results.append(true_result) - second_response.result_index = 1 - - streaming_container = self._makeOne() - streaming_container.add_response(first_response) - streaming_container.add_response(second_response) - - self.assertTrue(streaming_container.is_finished) - - def test_get_full_text(self): - first_part = _MockGAPICSpeechResultAlternative(transcript='testing') - second_part = _MockGAPICSpeechResultAlternative(transcript=' 1 2 3') 
- - first_result = _MockGAPICSpeechResult(alternatives=[first_part]) - first_result.is_final = True - - second_result = _MockGAPICSpeechResult(alternatives=[second_part]) - second_result.is_final = True - - response = _MockGAPICSpeechResponse() - response.results.append(first_result) - response.results.append(second_result) - - streaming_container = self._makeOne() - streaming_container.add_response(response) - - self.assertEqual(streaming_container.get_full_text(), 'testing 1 2 3') - - def test_unfinshed_full_test(self): - first_part = _MockGAPICSpeechResultAlternative(transcript='testing') - - first_result = _MockGAPICSpeechResult(alternatives=[first_part]) - first_result.is_final = False - - response = _MockGAPICSpeechResponse() - response.results.append(first_result) - - streaming_container = self._makeOne() - streaming_container.add_response(response) - - self.assertIsNone(streaming_container.get_full_text()) - - -class _MockGAPICSpeechResultAlternative(object): - def __init__(self, transcript='', confidence=0): - self.transcript = transcript - self.confidence = confidence - - -class _MockGAPICSpeechResult(object): - def __init__(self, alternatives=None): - self.alternatives = alternatives or [] - stability = 0 - is_final = False - - -class _MockGAPICSpeechResponse(object): - error = None - endpointer_type = None - results = [] - result_index = 0 diff --git a/speech/unit_tests/streaming/test_response.py b/speech/unit_tests/streaming/test_response.py index d31be50dd0f1..d2a695d3f14d 100644 --- a/speech/unit_tests/streaming/test_response.py +++ b/speech/unit_tests/streaming/test_response.py @@ -27,3 +27,34 @@ def _makeOne(self, *args, **kw): def test_ctor(self): response = self._makeOne({}, 'END_OF_UTTERANCE', [], 0) self.assertEqual(response.result_index, 0) + self.assertEqual(response.confidence, 0.0) + self.assertEqual(response.endpointer_type, None) + self.assertEqual(response.results, []) + self.assertEqual(response.transcript, '') + 
self.assertFalse(response.is_final) + + def test_from_pb(self): + response = self._makeOne() + res = response.from_pb(_MockSpeechPBResponse) + self.assertFalse(res.is_final) + self.assertEqual(res.endpointer_type, 'END_OF_AUDIO') + self.assertEqual(res.transcript, 'hello there!') + self.assertEqual(res.confidence, 0.9704365) + + +class _MockSpeechPBAlternative(object): + transcript = 'hello there!' + confidence = 0.9704365 + + +class _MockSpeechPBResult(object): + alternatives = [_MockSpeechPBAlternative()] + is_final = False + stability = 0.0 + + +class _MockSpeechPBResponse(object): + error = {} + endpointer_type = 3 + result_index = 0 + results = [_MockSpeechPBResult, _MockSpeechPBResult] diff --git a/speech/unit_tests/test_client.py b/speech/unit_tests/test_client.py index 09a67c1fb778..9d3176c56ba9 100644 --- a/speech/unit_tests/test_client.py +++ b/speech/unit_tests/test_client.py @@ -204,7 +204,7 @@ def test_streaming_depends_on_gax(self): with _Monkey(MUT, _USE_GAX=False): with self.assertRaises(EnvironmentError): - client.stream_recognize({}) + next(client.stream_recognize({})) def test_set_speech_api(self): from google.cloud.speech import client as MUT @@ -233,13 +233,31 @@ def test_streaming_closed_stream(self): encoding=Encoding.LINEAR16, sample_rate=self.SAMPLE_RATE) with self.assertRaises(ValueError): - client.stream_recognize(sample) + next(client.stream_recognize(sample)) + + def test_streaming_with_empty_response(self): + from io import BytesIO + from google.cloud.speech.encoding import Encoding + + stream = BytesIO(b'Some audio data...') + credentials = _Credentials() + client = self._makeOne(credentials=credentials) + client.connection = _Connection() + client._speech_api = _MockGAPICSpeechAPI() + client._speech_api._responses = [] + + sample = client.sample(stream=stream, + encoding=Encoding.LINEAR16, + sample_rate=self.SAMPLE_RATE) + results = client.stream_recognize(sample) + with self.assertRaises(StopIteration): + next(results) def 
test_stream_recognize(self): from io import BytesIO from google.cloud.speech.encoding import Encoding - from google.cloud.speech.streaming.container import ( - StreamingResponseContainer) + from google.cloud.speech.streaming.response import ( + StreamingSpeechResponse) stream = BytesIO(b'Some audio data...') credentials = _Credentials() @@ -252,7 +270,7 @@ def test_stream_recognize(self): sample_rate=self.SAMPLE_RATE) results = client.stream_recognize(sample) - self.assertIsInstance(results, StreamingResponseContainer) + self.assertIsInstance(next(results), StreamingSpeechResponse) requests = [] for req in client.speech_api._requests: requests.append(req) @@ -268,10 +286,11 @@ class _MockGAPICSpeechResponse(object): class _MockGAPICSpeechAPI(object): _requests = None + _responses = [None, _MockGAPICSpeechResponse()] def streaming_recognize(self, requests): self._requests = requests - return [None, _MockGAPICSpeechResponse()] + return self._responses class _Credentials(object):