Skip to content

Commit

Permalink
docs(samples): Update code samples for adaptation and VAD (#462)
Browse files Browse the repository at this point in the history
Co-authored-by: AJ Morozoff <amorozoff@google.com>
Co-authored-by: Anthonios Partheniou <partheniou@google.com>
  • Loading branch information
3 people authored Oct 20, 2022
1 parent a9d4879 commit 6e6b668
Show file tree
Hide file tree
Showing 15 changed files with 390 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def adaptation_v2_custom_class_reference(project_id, recognizer_id, phrase_set_i
request = cloud_speech.CreateCustomClassRequest(
parent=f"projects/{project_id}/locations/global",
custom_class_id=custom_class_id,
custom_class=cloud_speech.CustomClass(items=[{"value": "Keem"}]))
custom_class=cloud_speech.CustomClass(items=[{"value": "fare"}]))

operation = client.create_custom_class(request=request)
custom_class = operation.result()
Expand All @@ -70,10 +70,6 @@ def adaptation_v2_custom_class_reference(project_id, recognizer_id, phrase_set_i
auto_decoding_config={}, adaptation=adaptation
)

print(custom_class)
print(phrase_set)
print(config)

request = cloud_speech.RecognizeRequest(
recognizer=recognizer.name, config=config, content=content
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,11 @@ def test_adaptation_v2_custom_class_reference(capsys):
phrase_set_id = "phrase-set-" + str(uuid4())
custom_class_id = "custom-class-" + str(uuid4())
response = adaptation_v2_custom_class_reference.adaptation_v2_custom_class_reference(
project_id, recognizer_id, phrase_set_id, custom_class_id, os.path.join(RESOURCES, "baby_keem.wav")
project_id, recognizer_id, phrase_set_id, custom_class_id, os.path.join(RESOURCES, "fair.wav")
)

assert re.search(
r"play Baby Keem",
r"the word is fare",
response.results[0].alternatives[0].transcript,
re.DOTALL | re.I,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ def adaptation_v2_inline_custom_class(project_id, recognizer_id, audio_file):
content = f.read()

# Build inline phrase set to produce a more accurate transcript
phrase_set = cloud_speech.PhraseSet(phrases=[{"value": "${keem}", "boost": 20}])
custom_class = cloud_speech.CustomClass(name="keem", items=[{"value": "Keem"}])
phrase_set = cloud_speech.PhraseSet(phrases=[{"value": "${fare}", "boost": 20}])
custom_class = cloud_speech.CustomClass(name="fare", items=[{"value": "fare"}])
adaptation = cloud_speech.SpeechAdaptation(
phrase_sets=[
cloud_speech.SpeechAdaptation.AdaptationPhraseSet(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ def test_adaptation_v2_inline_custom_class(capsys):

recognizer_id = "recognizer-" + str(uuid4())
response = adaptation_v2_inline_custom_class.adaptation_v2_inline_custom_class(
project_id, recognizer_id, os.path.join(RESOURCES, "baby_keem.wav")
project_id, recognizer_id, os.path.join(RESOURCES, "fair.wav")
)

assert re.search(
r"play Baby Keem",
r"the word is fare",
response.results[0].alternatives[0].transcript,
re.DOTALL | re.I,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def adaptation_v2_inline_phrase_set(project_id, recognizer_id, audio_file):
content = f.read()

# Build inline phrase set to produce a more accurate transcript
phrase_set = cloud_speech.PhraseSet(phrases=[{"value": "Keem", "boost": 10}])
phrase_set = cloud_speech.PhraseSet(phrases=[{"value": "fare", "boost": 10}])
adaptation = cloud_speech.SpeechAdaptation(
phrase_sets=[
cloud_speech.SpeechAdaptation.AdaptationPhraseSet(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ def test_adaptation_v2_inline_phrase_set(capsys):

recognizer_id = "recognizer-" + str(uuid4())
response = adaptation_v2_inline_phrase_set.adaptation_v2_inline_phrase_set(
project_id, recognizer_id, os.path.join(RESOURCES, "baby_keem.wav")
project_id, recognizer_id, os.path.join(RESOURCES, "fair.wav")
)

assert re.search(
r"play Baby Keem",
r"the word is fare",
response.results[0].alternatives[0].transcript,
re.DOTALL | re.I,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def adaptation_v2_phrase_set_reference(project_id, recognizer_id, phrase_set_id,
request = cloud_speech.CreatePhraseSetRequest(
parent=f"projects/{project_id}/locations/global",
phrase_set_id=phrase_set_id,
phrase_set=cloud_speech.PhraseSet(phrases=[{"value": "Keem", "boost": 10}]))
phrase_set=cloud_speech.PhraseSet(phrases=[{"value": "fare", "boost": 10}]))

operation = client.create_phrase_set(request=request)
phrase_set = operation.result()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,11 @@ def test_adaptation_v2_phrase_set_reference(capsys):
recognizer_id = "recognizer-" + str(uuid4())
phrase_set_id = "phrase-set-" + str(uuid4())
response = adaptation_v2_phrase_set_reference.adaptation_v2_phrase_set_reference(
project_id, recognizer_id, phrase_set_id, os.path.join(RESOURCES, "baby_keem.wav")
project_id, recognizer_id, phrase_set_id, os.path.join(RESOURCES, "fair.wav")
)

assert re.search(
r"play Baby Keem",
r"the word is fare",
response.results[0].alternatives[0].transcript,
re.DOTALL | re.I,
)
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Copyright 2022 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import argparse

# [START speech_transcribe_streaming_voice_activity_events]
import io

from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech


def transcribe_streaming_voice_activity_events(project_id, recognizer_id, audio_file):
    """Transcribe an audio file via the Speech-to-Text v2 streaming API with
    voice activity events enabled, printing when speech starts and ends.

    Args:
        project_id: Google Cloud project in which to create the recognizer.
        recognizer_id: ID of the recognizer to create (under locations/global).
        audio_file: Path to the local audio file to stream.

    Returns:
        The list of StreamingRecognizeResponse messages received from the API.
    """
    # Instantiates a client
    client = SpeechClient()

    request = cloud_speech.CreateRecognizerRequest(
        parent=f"projects/{project_id}/locations/global",
        recognizer_id=recognizer_id,
        recognizer=cloud_speech.Recognizer(
            language_codes=["en-US"], model="latest_long"
        ),
    )

    # Creates a Recognizer
    operation = client.create_recognizer(request=request)
    recognizer = operation.result()

    # Reads a file as bytes
    with io.open(audio_file, "rb") as f:
        content = f.read()

    # In practice, stream should be a generator yielding chunks of audio data.
    # max(..., 1) guards against a zero step for files shorter than 5 bytes,
    # which would otherwise make range() raise ValueError.
    chunk_length = max(len(content) // 5, 1)
    stream = [
        content[start : start + chunk_length]
        for start in range(0, len(content), chunk_length)
    ]
    audio_requests = (
        cloud_speech.StreamingRecognizeRequest(audio=audio) for audio in stream
    )

    recognition_config = cloud_speech.RecognitionConfig(auto_decoding_config={})

    # Sets the flag to enable voice activity events
    streaming_features = cloud_speech.StreamingRecognitionFeatures(
        enable_voice_activity_events=True
    )
    streaming_config = cloud_speech.StreamingRecognitionConfig(
        config=recognition_config, streaming_features=streaming_features
    )

    config_request = cloud_speech.StreamingRecognizeRequest(
        recognizer=recognizer.name, streaming_config=streaming_config
    )

    def requests(config, audio):
        # The streaming API requires the config message first, then audio.
        yield config
        for message in audio:
            yield message

    # Transcribes the audio into text
    responses_iterator = client.streaming_recognize(
        requests=requests(config_request, audio_requests)
    )
    responses = []
    for response in responses_iterator:
        responses.append(response)
        if (
            response.speech_event_type
            == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
        ):
            print("Speech started.")
        if (
            response.speech_event_type
            == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
        ):
            print("Speech ended.")
        for result in response.results:
            print("Transcript: {}".format(result.alternatives[0].transcript))

    return responses
# [END speech_transcribe_streaming_voice_activity_events]


if __name__ == "__main__":
    # CLI entry point: three positional arguments mirroring the sample's signature.
    arg_parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    for arg_name, arg_help in (
        ("project_id", "project to create recognizer in"),
        ("recognizer_id", "name of recognizer to create"),
        ("audio_file", "audio file to stream"),
    ):
        arg_parser.add_argument(arg_name, help=arg_help)
    cli_args = arg_parser.parse_args()
    transcribe_streaming_voice_activity_events(
        cli_args.project_id, cli_args.recognizer_id, cli_args.audio_file
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Copyright 2022, Google, Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re
from uuid import uuid4

from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech

import transcribe_streaming_voice_activity_events

RESOURCES = os.path.join(os.path.dirname(__file__), "resources")


def delete_recognizer(name):
    """Delete the recognizer identified by its fully-qualified resource name."""
    delete_request = cloud_speech.DeleteRecognizerRequest(name=name)
    SpeechClient().delete_recognizer(request=delete_request)


def test_transcribe_streaming_voice_activity_events(capsys):
    """End-to-end check: stream a known clip, verify the first voice activity
    event and the expected transcript, then clean up the recognizer."""
    project_id = os.getenv("GOOGLE_CLOUD_PROJECT")

    recognizer_id = f"recognizer-{uuid4()}"
    responses = transcribe_streaming_voice_activity_events.transcribe_streaming_voice_activity_events(
        project_id, recognizer_id, os.path.join(RESOURCES, "audio.wav")
    )

    # Concatenate every top alternative's transcript into one string.
    transcript = "".join(
        result.alternatives[0].transcript
        for response in responses
        for result in response.results
    )

    first_event = responses[0].speech_event_type
    assert (
        first_event
        == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
    )

    assert re.search(
        r"how old is the Brooklyn Bridge",
        transcript,
        re.DOTALL | re.I,
    )

    delete_recognizer(
        f"projects/{project_id}/locations/global/recognizers/{recognizer_id}"
    )
Loading

0 comments on commit 6e6b668

Please sign in to comment.