Add speaker diarization configuration support (via synth). (#9202)

Also, exclude 'noxfile.py' from synth (to preserve 'samples'). Supersedes #9196.
googleapis · Sep 10, 2019 · 33d27aa · 33d27aa
1 parent cae21b0
commit 33d27aa
Show file tree

Hide file tree

Showing 4 changed files with 256 additions and 45 deletions.
diff --git a/packages/google-cloud-python-speech/google/cloud/speech_v1/proto/cloud_speech.proto b/packages/google-cloud-python-speech/google/cloud/speech_v1/proto/cloud_speech.proto
@@ -276,6 +276,16 @@ message RecognitionConfig {
   // premium feature.
   bool enable_automatic_punctuation = 11;
 
+  // *Optional* Config to enable speaker diarization and set additional
+  // parameters to make diarization better suited for your application.
+  // Note: When this is enabled, we send all the words from the beginning of the
+  // audio for the top alternative in every consecutive STREAMING response.
+  // This is done in order to improve our speaker tags as our models learn to
+  // identify the speakers in the conversation over time.
+  // For non-streaming requests, the diarization results will be provided only
+  // in the top alternative of the FINAL SpeechRecognitionResult.
+  SpeakerDiarizationConfig diarization_config = 19;
+
   // *Optional* Metadata regarding this request.
   RecognitionMetadata metadata = 9;
 
@@ -324,6 +334,36 @@ message RecognitionConfig {
   bool use_enhanced = 14;
 }
 
+// *Optional* Config to enable speaker diarization and bound speaker count.
+message SpeakerDiarizationConfig {
+  // *Optional* If 'true', enables speaker detection for each recognized word in
+  // the top alternative of the recognition result using a speaker_tag provided
+  // in the WordInfo.
+  bool enable_speaker_diarization = 1;
+
+  // Note: Set min_speaker_count = max_speaker_count to fix the number of
+  // speakers to be detected in the audio.
+
+  // *Optional*
+  // Minimum number of speakers in the conversation. This range gives you more
+  // flexibility by allowing the system to automatically determine the correct
+  // number of speakers. If not set, the default value is 2.
+  int32 min_speaker_count = 2;
+
+  // *Optional*
+  // Maximum number of speakers in the conversation. This range gives you more
+  // flexibility by allowing the system to automatically determine the correct
+  // number of speakers. If not set, the default value is 6.
+  int32 max_speaker_count = 3;
+
+  // Output only. A distinct integer value is assigned for every speaker within
+  // the audio; this field identifies which speaker was detected to have spoken
+  // a word. Values start at '1'. NOTE(review): the upper bound was given as
+  // diarization_speaker_count, which is not defined in this message; confirm.
+  // Set only if enable_speaker_diarization = 'true', in the top alternative.
+  int32 speaker_tag = 5;
+}
+
 // Description of audio data to be recognized.
 message RecognitionMetadata {
   // Use case categories that the audio recognition request can be described