feat: [texttospeech] Add low latency journey option to proto (#5742)

* feat: Add low latency journey option to proto feat: Add CustomPronunciationParams for upcoming feature work PiperOrigin-RevId: 684207737 Source-Link: googleapis/googleapis@d6f9dbc Source-Link: googleapis/googleapis-gen@42ac4d7 Copy-Tag: eyJwIjoicGFja2FnZXMvZ29vZ2xlLWNsb3VkLXRleHR0b3NwZWVjaC8uT3dsQm90LnlhbWwiLCJoIjoiNDJhYzRkNzViZWMxYjlkZWViYjA2OGZmODQyYzk1YTgxZGE5YjRkMCJ9 * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * feat: Add low latency journey option to proto feat: Add CustomPronunciationParams for upcoming feature work PiperOrigin-RevId: 684229364 Source-Link: googleapis/googleapis@10b8dc3 Source-Link: googleapis/googleapis-gen@deaa9f3 Copy-Tag: eyJwIjoicGFja2FnZXMvZ29vZ2xlLWNsb3VkLXRleHR0b3NwZWVjaC8uT3dsQm90LnlhbWwiLCJoIjoiZGVhYTlmMzE5ZjFjNmMxOWE5ZDM2ZWEyMzNhZTMwNzU2NmI2MGMwOSJ9 * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
googleapis · Oct 10, 2024 · d34249c · d34249c
1 parent cec22ab
commit d34249c
Show file tree

Hide file tree

Showing 12 changed files with 4,842 additions and 2,103 deletions.
diff --git a/packages/google-cloud-texttospeech/README.md b/packages/google-cloud-texttospeech/README.md
@@ -44,7 +44,7 @@ Google APIs Client Libraries, in [Client Libraries Explained][explained].
 1.  [Select or create a Cloud Platform project][projects].
 1.  [Enable billing for your project][billing].
 1.  [Enable the Google Cloud Text-to-Speech API][enable_api].
-1.  [Set up authentication with a service account][auth] so you can access the
+1.  [Set up authentication][auth] so you can access the
     API from your local workstation.
 
 ### Installing the client library
@@ -175,4 +175,4 @@ See [LICENSE](https://github.com/googleapis/google-cloud-node/blob/main/LICENSE)
 [projects]: https://console.cloud.google.com/project
 [billing]: https://support.google.com/cloud/answer/6293499#enable-billing
 [enable_api]: https://console.cloud.google.com/flows/enableapi?apiid=texttospeech.googleapis.com
-[auth]: https://cloud.google.com/docs/authentication/getting-started
+[auth]: https://cloud.google.com/docs/authentication/external/set-up-adc-local
diff --git a/packages/google-cloud-texttospeech/protos/google/cloud/texttospeech/v1/cloud_tts.proto b/packages/google-cloud-texttospeech/protos/google/cloud/texttospeech/v1/cloud_tts.proto
@@ -151,6 +151,13 @@ message Voice {
   int32 natural_sample_rate_hertz = 4;
 }
 
+// Advanced voice options that can be attached to a synthesis request.
+message AdvancedVoiceOptions {
+  // Only for Journey voices. If false, the synthesis will be context-aware
+  // and have higher latency.
+  optional bool low_latency_journey_synthesis = 1;
+}
+
 // The top-level message sent by the client for the `SynthesizeSpeech` method.
 message SynthesizeSpeechRequest {
   // Required. The Synthesizer requires either plain text or SSML as input.
@@ -161,6 +168,44 @@ message SynthesizeSpeechRequest {
 
   // Required. The configuration of the synthesized audio.
   AudioConfig audio_config = 3 [(google.api.field_behavior) = REQUIRED];
+
+  // Advanced voice options.
+  optional AdvancedVoiceOptions advanced_voice_options = 8;
+}
+
+// A pronunciation customization for a single phrase.
+message CustomPronunciationParams {
+  // The phonetic encoding in which a pronunciation is written.
+  enum PhoneticEncoding {
+    // Not specified.
+    PHONETIC_ENCODING_UNSPECIFIED = 0;
+
+    // IPA, the International Phonetic Alphabet (e.g. apple -> ˈæpəl).
+    // https://en.wikipedia.org/wiki/International_Phonetic_Alphabet
+    PHONETIC_ENCODING_IPA = 1;
+
+    // X-SAMPA (e.g. apple -> "{p@l").
+    // https://en.wikipedia.org/wiki/X-SAMPA
+    PHONETIC_ENCODING_X_SAMPA = 2;
+  }
+
+  // The phrase to which the customization will be applied.
+  // The phrase can be multiple words (in the case of proper nouns, etc.), but
+  // should not span a whole sentence.
+  optional string phrase = 1;
+
+  // The phonetic encoding in which `pronunciation` is written.
+  optional PhoneticEncoding phonetic_encoding = 2;
+
+  // The pronunciation of the phrase. This must be written in the phonetic
+  // encoding specified by `phonetic_encoding`.
+  optional string pronunciation = 3;
+}
+
+// A collection of per-phrase pronunciation customizations.
+message CustomPronunciations {
+  // The individual pronunciation customizations to be applied.
+  repeated CustomPronunciationParams pronunciations = 1;
 }
 
 // Contains text input to be synthesized. Either `text` or `ssml` must be
@@ -180,6 +225,21 @@ message SynthesisInput {
     // [SSML](https://cloud.google.com/text-to-speech/docs/ssml).
     string ssml = 2;
   }
+
+  // Optional. The pronunciation customizations to be applied to the input. If
+  // this is set, the input will be synthesized using the given pronunciation
+  // customizations.
+  //
+  // The initial support will be for EFIGS (English, French,
+  // Italian, German, Spanish) languages, as provided in
+  // VoiceSelectionParams. Journey and Instant Clone voices are
+  // not supported yet.
+  //
+  // In order to customize the pronunciation of a phrase, there must be an exact
+  // match of the phrase in the input types. If using SSML, the phrase must not
+  // be inside a phoneme tag (entirely or partially).
+  CustomPronunciations custom_pronunciations = 3
+      [(google.api.field_behavior) = OPTIONAL];
 }
 
 // Description of which voice to use for a synthesis request.

diff --git a/packages/google-cloud-texttospeech/protos/google/cloud/texttospeech/v1beta1/cloud_tts.proto b/packages/google-cloud-texttospeech/protos/google/cloud/texttospeech/v1beta1/cloud_tts.proto
@@ -154,6 +154,13 @@ message Voice {
   int32 natural_sample_rate_hertz = 4;
 }
 
+// Advanced voice options that can be attached to a synthesis request.
+message AdvancedVoiceOptions {
+  // Only for Journey voices. If false, the synthesis will be context-aware
+  // and have higher latency.
+  optional bool low_latency_journey_synthesis = 1;
+}
+
 // The top-level message sent by the client for the `SynthesizeSpeech` method.
 message SynthesizeSpeechRequest {
   // The type of timepoint information that is returned in the response.
@@ -176,6 +183,44 @@ message SynthesizeSpeechRequest {
 
   // Whether and what timepoints are returned in the response.
   repeated TimepointType enable_time_pointing = 4;
+
+  // Advanced voice options.
+  optional AdvancedVoiceOptions advanced_voice_options = 8;
+}
+
+// A pronunciation customization for a single phrase.
+message CustomPronunciationParams {
+  // The phonetic encoding in which a pronunciation is written.
+  enum PhoneticEncoding {
+    // Not specified.
+    PHONETIC_ENCODING_UNSPECIFIED = 0;
+
+    // IPA, the International Phonetic Alphabet (e.g. apple -> ˈæpəl).
+    // https://en.wikipedia.org/wiki/International_Phonetic_Alphabet
+    PHONETIC_ENCODING_IPA = 1;
+
+    // X-SAMPA (e.g. apple -> "{p@l").
+    // https://en.wikipedia.org/wiki/X-SAMPA
+    PHONETIC_ENCODING_X_SAMPA = 2;
+  }
+
+  // The phrase to which the customization will be applied.
+  // The phrase can be multiple words (in the case of proper nouns, etc.), but
+  // should not span a whole sentence.
+  optional string phrase = 1;
+
+  // The phonetic encoding in which `pronunciation` is written.
+  optional PhoneticEncoding phonetic_encoding = 2;
+
+  // The pronunciation of the phrase. This must be written in the phonetic
+  // encoding specified by `phonetic_encoding`.
+  optional string pronunciation = 3;
+}
+
+// A collection of per-phrase pronunciation customizations.
+message CustomPronunciations {
+  // The individual pronunciation customizations to be applied.
+  repeated CustomPronunciationParams pronunciations = 1;
 }
 
 // Contains text input to be synthesized. Either `text` or `ssml` must be
@@ -195,6 +240,21 @@ message SynthesisInput {
     // [SSML](https://cloud.google.com/text-to-speech/docs/ssml).
     string ssml = 2;
   }
+
+  // Optional. The pronunciation customizations to be applied to the input. If
+  // this is set, the input will be synthesized using the given pronunciation
+  // customizations.
+  //
+  // The initial support will be for EFIGS (English, French,
+  // Italian, German, Spanish) languages, as provided in
+  // VoiceSelectionParams. Journey and Instant Clone voices are
+  // not supported yet.
+  //
+  // In order to customize the pronunciation of a phrase, there must be an exact
+  // match of the phrase in the input types. If using SSML, the phrase must not
+  // be inside a phoneme tag (entirely or partially).
+  CustomPronunciations custom_pronunciations = 3
+      [(google.api.field_behavior) = OPTIONAL];
 }
 
 // Description of which voice to use for a synthesis request.