feat: [texttospeech] add multi-speaker markup, which allows generatin…

…g dialogue between multiple speakers (#5760) * feat: Add brand voice lite PiperOrigin-RevId: 687058189 Source-Link: googleapis/googleapis@5ebb5c1 Source-Link: googleapis/googleapis-gen@293badf Copy-Tag: eyJwIjoicGFja2FnZXMvZ29vZ2xlLWNsb3VkLXRleHR0b3NwZWVjaC8uT3dsQm90LnlhbWwiLCJoIjoiMjkzYmFkZmQ2MTQ1OTdiYTFkZDE1MTZkYjU4MDk0NTU3ZjdiNGM3ZSJ9 * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * feat: add brand voice lite, which lets you clone a voice with just 10 seconds of audio PiperOrigin-RevId: 688491221 Source-Link: googleapis/googleapis@60688dc Source-Link: googleapis/googleapis-gen@4072c99 Copy-Tag: eyJwIjoicGFja2FnZXMvZ29vZ2xlLWNsb3VkLXRleHR0b3NwZWVjaC8uT3dsQm90LnlhbWwiLCJoIjoiNDA3MmM5OTliMzZiYWEyMWNjYjI5YzVmNWQ1NjA4OTYyODQwYjMxYSJ9 * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * feat: add multi-speaker markup, which allows generating dialogue between multiple speakers PiperOrigin-RevId: 689444598 Source-Link: googleapis/googleapis@56f5fa4 Source-Link: googleapis/googleapis-gen@532d282 Copy-Tag: eyJwIjoicGFja2FnZXMvZ29vZ2xlLWNsb3VkLXRleHR0b3NwZWVjaC8uT3dsQm90LnlhbWwiLCJoIjoiNTMyZDI4MjkxMmIxZWFmMTQxNTgxN2MwNjhlY2UyZTM1NGYwMDU1ZSJ9 * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * feat: add multi-speaker markup, which allows generating dialogue between multiple speakers PiperOrigin-RevId: 690597076 Source-Link: googleapis/googleapis@3bc81b7 Source-Link: googleapis/googleapis-gen@a2253a0 Copy-Tag: eyJwIjoicGFja2FnZXMvZ29vZ2xlLWNsb3VkLXRleHR0b3NwZWVjaC8uT3dsQm90LnlhbWwiLCJoIjoiYTIyNTNhMDFlNmU5M2U3OWExZGVjN2UwYjJhZWVhNzA4YjYzNGQxZSJ9 * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot 
<gcf-owl-bot[bot]@users.noreply.github.com> Co-authored-by: sofisl <55454395+sofisl@users.noreply.github.com>
googleapis · Oct 30, 2024 · f9dea89 · f9dea89
1 parent 96bb1b5
commit f9dea89
Show file tree

Hide file tree

Showing 7 changed files with 2,353 additions and 129 deletions.
diff --git a/packages/google-cloud-texttospeech/protos/google/cloud/texttospeech/v1/cloud_tts.proto b/packages/google-cloud-texttospeech/protos/google/cloud/texttospeech/v1/cloud_tts.proto
@@ -21,7 +21,6 @@ import "google/api/client.proto";
 import "google/api/field_behavior.proto";
 import "google/api/resource.proto";
 
-option cc_enable_arenas = true;
 option csharp_namespace = "Google.Cloud.TextToSpeech.V1";
 option go_package = "cloud.google.com/go/texttospeech/apiv1/texttospeechpb;texttospeechpb";
 option java_multiple_files = true;
@@ -208,6 +207,22 @@ message CustomPronunciations {
   repeated CustomPronunciationParams pronunciations = 1;
 }
 
+// A collection of turns for multi-speaker synthesis.
+message MultiSpeakerMarkup {
+  // A single turn: one speaker and the text spoken by that speaker.
+  message Turn {
+    // Required. The speaker of the turn, for example, 'O' or 'Q'. Refer to
+    // the documentation for the available speakers.
+    string speaker = 1 [(google.api.field_behavior) = REQUIRED];
+
+    // Required. The text to speak.
+    string text = 2 [(google.api.field_behavior) = REQUIRED];
+  }
+
+  // Required. The speaker turns.
+  repeated Turn turns = 1 [(google.api.field_behavior) = REQUIRED];
+}
+
 // Contains text input to be synthesized. Either `text` or `ssml` must be
 // supplied. Supplying both or neither returns
 // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. The
@@ -224,6 +239,10 @@ message SynthesisInput {
     // more information, see
     // [SSML](https://cloud.google.com/text-to-speech/docs/ssml).
     string ssml = 2;
+
+    // The multi-speaker input to be synthesized. Only applicable for
+    // multi-speaker synthesis.
+    MultiSpeakerMarkup multi_speaker_markup = 4;
   }
 
   // Optional. The pronunciation customizations to be applied to the input. If
@@ -273,6 +292,11 @@ message VoiceSelectionParams {
   // the service will choose the custom voice matching the specified
   // configuration.
   CustomVoiceParams custom_voice = 4;
+
+  // Optional. The configuration for a voice clone. If
+  // [VoiceCloneParams.voice_cloning_key] is set, the service will choose the
+  // voice clone matching the specified configuration.
+  VoiceCloneParams voice_clone = 5 [(google.api.field_behavior) = OPTIONAL];
 }
 
 // Description of audio data to be synthesized.
@@ -360,6 +384,12 @@ message CustomVoiceParams {
       [deprecated = true, (google.api.field_behavior) = OPTIONAL];
 }
 
+// The configuration of the Voice Clone feature.
+message VoiceCloneParams {
+  // Required. The voice cloning key, created by GenerateVoiceCloningKey.
+  string voice_cloning_key = 1 [(google.api.field_behavior) = REQUIRED];
+}
+
 // The message returned to the client by the `SynthesizeSpeech` method.
 message SynthesizeSpeechResponse {
   // The audio data bytes encoded as specified in the request, including the

diff --git a/packages/google-cloud-texttospeech/protos/google/cloud/texttospeech/v1/cloud_tts_lrs.proto b/packages/google-cloud-texttospeech/protos/google/cloud/texttospeech/v1/cloud_tts_lrs.proto
@@ -23,7 +23,6 @@ import "google/cloud/texttospeech/v1/cloud_tts.proto";
 import "google/longrunning/operations.proto";
 import "google/protobuf/timestamp.proto";
 
-option cc_enable_arenas = true;
 option csharp_namespace = "Google.Cloud.TextToSpeech.V1";
 option go_package = "cloud.google.com/go/texttospeech/apiv1/texttospeechpb;texttospeechpb";
 option java_multiple_files = true;

diff --git a/packages/google-cloud-texttospeech/protos/google/cloud/texttospeech/v1beta1/cloud_tts.proto b/packages/google-cloud-texttospeech/protos/google/cloud/texttospeech/v1beta1/cloud_tts.proto
@@ -21,7 +21,6 @@ import "google/api/client.proto";
 import "google/api/field_behavior.proto";
 import "google/api/resource.proto";
 
-option cc_enable_arenas = true;
 option csharp_namespace = "Google.Cloud.TextToSpeech.V1Beta1";
 option go_package = "cloud.google.com/go/texttospeech/apiv1beta1/texttospeechpb;texttospeechpb";
 option java_multiple_files = true;
@@ -223,6 +222,22 @@ message CustomPronunciations {
   repeated CustomPronunciationParams pronunciations = 1;
 }
 
+// A collection of turns for multi-speaker synthesis.
+message MultiSpeakerMarkup {
+  // A single turn: one speaker and the text spoken by that speaker.
+  message Turn {
+    // Required. The speaker of the turn, for example, 'O' or 'Q'. Refer to
+    // the documentation for the available speakers.
+    string speaker = 1 [(google.api.field_behavior) = REQUIRED];
+
+    // Required. The text to speak.
+    string text = 2 [(google.api.field_behavior) = REQUIRED];
+  }
+
+  // Required. The speaker turns.
+  repeated Turn turns = 1 [(google.api.field_behavior) = REQUIRED];
+}
+
 // Contains text input to be synthesized. Either `text` or `ssml` must be
 // supplied. Supplying both or neither returns
 // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. The
@@ -239,6 +254,10 @@ message SynthesisInput {
     // more information, see
     // [SSML](https://cloud.google.com/text-to-speech/docs/ssml).
     string ssml = 2;
+
+    // The multi-speaker input to be synthesized. Only applicable for
+    // multi-speaker synthesis.
+    MultiSpeakerMarkup multi_speaker_markup = 4;
   }
 
   // Optional. The pronunciation customizations to be applied to the input. If
@@ -288,6 +307,11 @@ message VoiceSelectionParams {
   // the service will choose the custom voice matching the specified
   // configuration.
   CustomVoiceParams custom_voice = 4;
+
+  // Optional. The configuration for a voice clone. If
+  // [VoiceCloneParams.voice_cloning_key] is set, the service will choose the
+  // voice clone matching the specified configuration.
+  VoiceCloneParams voice_clone = 5 [(google.api.field_behavior) = OPTIONAL];
 }
 
 // Description of audio data to be synthesized.
@@ -375,6 +399,12 @@ message CustomVoiceParams {
       [deprecated = true, (google.api.field_behavior) = OPTIONAL];
 }
 
+// The configuration of the Voice Clone feature.
+message VoiceCloneParams {
+  // Required. The voice cloning key, created by GenerateVoiceCloningKey.
+  string voice_cloning_key = 1 [(google.api.field_behavior) = REQUIRED];
+}
+
 // The message returned to the client by the `SynthesizeSpeech` method.
 message SynthesizeSpeechResponse {
   // The audio data bytes encoded as specified in the request, including the

diff --git a/...es/google-cloud-texttospeech/protos/google/cloud/texttospeech/v1beta1/cloud_tts_lrs.proto b/...es/google-cloud-texttospeech/protos/google/cloud/texttospeech/v1beta1/cloud_tts_lrs.proto
@@ -23,7 +23,6 @@ import "google/cloud/texttospeech/v1beta1/cloud_tts.proto";
 import "google/longrunning/operations.proto";
 import "google/protobuf/timestamp.proto";
 
-option cc_enable_arenas = true;
 option csharp_namespace = "Google.Cloud.TextToSpeech.V1Beta1";
 option go_package = "cloud.google.com/go/texttospeech/apiv1beta1/texttospeechpb;texttospeechpb";
 option java_multiple_files = true;