From 7da4a408d135f63bf40379e83e167aa478e48e23 Mon Sep 17 00:00:00 2001
From: Tres Seaver
Date: Tue, 10 Sep 2019 19:39:46 -0400
Subject: [PATCH] Add speaker diarization configuration support (via synth). (#9202)

Also, exclude 'noxfile.py' from synth (to preserve 'samples').

Supersedes #9196.
---
 .../cloud/speech_v1/proto/cloud_speech.proto  |  40 +++
 .../cloud/speech_v1/proto/cloud_speech_pb2.py | 245 +++++++++++++++---
 speech/synth.metadata                         |  14 +-
 speech/synth.py                               |   2 +-
 4 files changed, 256 insertions(+), 45 deletions(-)

diff --git a/speech/google/cloud/speech_v1/proto/cloud_speech.proto b/speech/google/cloud/speech_v1/proto/cloud_speech.proto
index 90ff515b5f76..30e540cbb2ee 100644
--- a/speech/google/cloud/speech_v1/proto/cloud_speech.proto
+++ b/speech/google/cloud/speech_v1/proto/cloud_speech.proto
@@ -276,6 +276,16 @@ message RecognitionConfig {
   // premium feature.
   bool enable_automatic_punctuation = 11;
 
+  // *Optional* Config to enable speaker diarization and set additional
+  // parameters to make diarization better suited for your application.
+  // Note: When this is enabled, we send all the words from the beginning of the
+  // audio for the top alternative in every consecutive STREAMING responses.
+  // This is done in order to improve our speaker tags as our models learn to
+  // identify the speakers in the conversation over time.
+  // For non-streaming requests, the diarization results will be provided only
+  // in the top alternative of the FINAL SpeechRecognitionResult.
+  SpeakerDiarizationConfig diarization_config = 19;
+
   // *Optional* Metadata regarding this request.
   RecognitionMetadata metadata = 9;
 
@@ -324,6 +334,36 @@ message RecognitionConfig {
   bool use_enhanced = 14;
 }
 
+// *Optional* Config to enable speaker diarization.
+message SpeakerDiarizationConfig {
+  // *Optional* If 'true', enables speaker detection for each recognized word in
+  // the top alternative of the recognition result using a speaker_tag provided
+  // in the WordInfo.
+  bool enable_speaker_diarization = 1;
+
+  // Note: Set min_speaker_count = max_speaker_count to fix the number of
+  // speakers to be detected in the audio.
+
+  // *Optional*
+  // Minimum number of speakers in the conversation. This range gives you more
+  // flexibility by allowing the system to automatically determine the correct
+  // number of speakers. If not set, the default value is 2.
+  int32 min_speaker_count = 2;
+
+  // *Optional*
+  // Maximum number of speakers in the conversation. This range gives you more
+  // flexibility by allowing the system to automatically determine the correct
+  // number of speakers. If not set, the default value is 6.
+  int32 max_speaker_count = 3;
+
+  // Output only. A distinct integer value is assigned for every speaker within
+  // the audio. This field specifies which one of those speakers was detected to
+  // have spoken this word. Value ranges from '1' to diarization_speaker_count.
+  // speaker_tag is set if enable_speaker_diarization = 'true' and only in the
+  // top alternative.
+  int32 speaker_tag = 5;
+}
+
 // Description of audio data to be recognized.
 message RecognitionMetadata {
   // Use case categories that the audio recognition request can be described

diff --git a/speech/google/cloud/speech_v1/proto/cloud_speech_pb2.py b/speech/google/cloud/speech_v1/proto/cloud_speech_pb2.py
index e2667403c18c..7769b0b85df7 100644
--- a/speech/google/cloud/speech_v1/proto/cloud_speech_pb2.py
+++ b/speech/google/cloud/speech_v1/proto/cloud_speech_pb2.py
@@ -32,7 +32,7 @@
 [regenerated serialized protobuf descriptor elided: the escaped byte strings in this hunk were mangled during extraction and are not reproduced here; the regenerated descriptor adds the SpeakerDiarizationConfig message (fields enable_speaker_diarization = 1, min_speaker_count = 2, max_speaker_count = 3, speaker_tag = 5) and the RecognitionConfig.diarization_config field (number 19) shown in cloud_speech.proto above]
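For reference, below is a minimal sketch of how a caller might exercise the new diarization_config field through the Python client once this change ships. It is not part of the patch; it assumes the google-cloud-speech 1.x surface of this era (speech_v1.SpeechClient, speech_v1.types, speech_v1.enums) with this regeneration synthesized in, and the GCS URI is hypothetical.

# Sketch only: assumes google-cloud-speech 1.x with this change included.
from google.cloud import speech_v1

client = speech_v1.SpeechClient()

# New in this patch: RecognitionConfig.diarization_config (field 19).
diarization = speech_v1.types.SpeakerDiarizationConfig(
    enable_speaker_diarization=True,
    min_speaker_count=2,  # default is 2 when unset
    max_speaker_count=6,  # default is 6 when unset
)
config = speech_v1.types.RecognitionConfig(
    encoding=speech_v1.enums.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",
    diarization_config=diarization,
)
# Hypothetical audio location.
audio = speech_v1.types.RecognitionAudio(uri="gs://my-bucket/conversation.wav")

response = client.recognize(config, audio)

# Per the proto comments, for non-streaming requests the diarization results
# are provided only in the top alternative of the final result.
words = response.results[-1].alternatives[0].words
for word in words:
    # WordInfo.speaker_tag is not yet part of this v1 snapshot (it exists in
    # v1p1beta1 at this time), so read it defensively.
    print(word.word, getattr(word, "speaker_tag", None))

For streaming requests, the proto comment above notes that each consecutive streaming response repeats all words from the beginning of the audio in the top alternative, so speaker tags can be refined as the models learn the speakers over time.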