diff --git a/packages/google-cloud-speech/protos/google/cloud/speech/v1p1beta1/cloud_speech.proto b/packages/google-cloud-speech/protos/google/cloud/speech/v1p1beta1/cloud_speech.proto index 208f53633e0..53283d3b7d3 100644 --- a/packages/google-cloud-speech/protos/google/cloud/speech/v1p1beta1/cloud_speech.proto +++ b/packages/google-cloud-speech/protos/google/cloud/speech/v1p1beta1/cloud_speech.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +// syntax = "proto3"; @@ -20,6 +21,7 @@ import "google/api/annotations.proto"; import "google/longrunning/operations.proto"; import "google/protobuf/any.proto"; import "google/protobuf/duration.proto"; +import "google/protobuf/empty.proto"; import "google/protobuf/timestamp.proto"; import "google/rpc/status.proto"; @@ -54,7 +56,8 @@ service Speech { // Performs bidirectional streaming speech recognition: receive results while // sending audio. This method is only available via the gRPC API (not REST). - rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse); + rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse) { + } } // The top-level message sent by the client for the `Recognize` method. @@ -98,7 +101,7 @@ message StreamingRecognizeRequest { // `audio_content` data. The audio bytes must be encoded as specified in // `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a // pure binary representation (not base64). See - // [audio limits](https://cloud.google.com/speech/limits#content). + // [content limits](/speech-to-text/quotas#content). bytes audio_content = 2; } } @@ -218,36 +221,36 @@ message RecognitionConfig { // Valid values for OGG_OPUS are '1'-'254'. // Valid value for MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is only `1`. // If `0` or omitted, defaults to one channel (mono). - // NOTE: We only recognize the first channel by default. + // Note: We only recognize the first channel by default. // To perform independent recognition on each channel set - // enable_separate_recognition_per_channel to 'true'. + // `enable_separate_recognition_per_channel` to 'true'. int32 audio_channel_count = 7; - // This needs to be set to ‘true’ explicitly and audio_channel_count > 1 + // This needs to be set to ‘true’ explicitly and `audio_channel_count` > 1 // to get each channel recognized separately. The recognition result will - // contain a channel_tag field to state which channel that result belongs to. - // If this is not ‘true’, we will only recognize the first channel. - // NOTE: The request is also billed cumulatively for all channels recognized: - // (audio_channel_count times the audio length) + // contain a `channel_tag` field to state which channel that result belongs + // to. If this is not true, we will only recognize the first channel. The + // request is billed cumulatively for all channels recognized: + // `audio_channel_count` multiplied by the length of the audio. bool enable_separate_recognition_per_channel = 12; // *Required* The language of the supplied audio as a // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag. // Example: "en-US". 
- // See [Language Support](https://cloud.google.com/speech/docs/languages) + // See [Language Support](/speech-to-text/docs/languages) // for a list of the currently supported language codes. string language_code = 3; // *Optional* A list of up to 3 additional // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags, // listing possible alternative languages of the supplied audio. - // See [Language Support](https://cloud.google.com/speech/docs/languages) + // See [Language Support](/speech-to-text/docs/languages) // for a list of the currently supported language codes. // If alternative languages are listed, recognition result will contain // recognition in the most likely language detected including the main // language_code. The recognition result will include the language tag // of the language detected in the audio. - // NOTE: This feature is only supported for Voice Command and Voice Search + // Note: This feature is only supported for Voice Command and Voice Search // use cases and performance may vary for other use cases (e.g., phone call // transcription). repeated string alternative_language_codes = 18; @@ -266,7 +269,9 @@ message RecognitionConfig { // won't be filtered out. bool profanity_filter = 5; - // *Optional* A means to provide context to assist the speech recognition. + // *Optional* array of [SpeechContext][google.cloud.speech.v1p1beta1.SpeechContext]. + // A means to provide context to assist the speech recognition. For more + // information, see [Phrase Hints](/speech-to-text/docs/basics#phrase-hints). repeated SpeechContext speech_contexts = 6; // *Optional* If `true`, the top result includes a list of words and @@ -284,18 +289,20 @@ message RecognitionConfig { // This feature is only available in select languages. Setting this for // requests in other languages has no effect at all. // The default 'false' value does not add punctuation to result hypotheses. - // NOTE: "This is currently offered as an experimental service, complimentary + // Note: This is currently offered as an experimental service, complimentary // to all users. In the future this may be exclusively available as a - // premium feature." + // premium feature. bool enable_automatic_punctuation = 11; // *Optional* If 'true', enables speaker detection for each recognized word in // the top alternative of the recognition result using a speaker_tag provided // in the WordInfo. // Note: When this is true, we send all the words from the beginning of the - // audio for the top alternative in every consecutive responses. + // audio for the top alternative in every consecutive STREAMING responses. // This is done in order to improve our speaker tags as our models learn to // identify the speakers in the conversation over time. + // For non-streaming requests, the diarization results will be provided only + // in the top alternative of the FINAL SpeechRecognitionResult. bool enable_speaker_diarization = 16; // *Optional* @@ -342,14 +349,18 @@ message RecognitionConfig { string model = 13; // *Optional* Set to true to use an enhanced model for speech recognition. - // You must also set the `model` field to a valid, enhanced model. If - // `use_enhanced` is set to true and the `model` field is not set, then - // `use_enhanced` is ignored. If `use_enhanced` is true and an enhanced - // version of the specified model does not exist, then the speech is - // recognized using the standard version of the specified model. 
+ // If `use_enhanced` is set to true and the `model` field is not set, then + // an appropriate enhanced model is chosen if: + // 1. project is eligible for requesting enhanced models + // 2. an enhanced model exists for the audio + // + // If `use_enhanced` is true and an enhanced version of the specified model + // does not exist, then the speech is recognized using the standard version + // of the specified model. // - // Enhanced speech models require that you opt-in to the audio logging using - // instructions in the [alpha documentation](/speech/data-sharing). If you set + // Enhanced speech models require that you opt-in to data logging using + // instructions in the + // [documentation](/speech-to-text/docs/enable-data-logging). If you set // `use_enhanced` to true and you have not enabled audio logging, then you // will receive an error. bool use_enhanced = 14; @@ -494,14 +505,14 @@ message SpeechContext { // to improve the accuracy for specific words and phrases, for example, if // specific commands are typically spoken by the user. This can also be used // to add additional words to the vocabulary of the recognizer. See - // [usage limits](https://cloud.google.com/speech/limits#content). + // [usage limits](/speech-to-text/quotas#content). repeated string phrases = 1; } // Contains audio data in the encoding specified in the `RecognitionConfig`. // Either `content` or `uri` must be supplied. Supplying both or neither // returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See -// [audio limits](https://cloud.google.com/speech/limits#content). +// [content limits](/speech-to-text/quotas#content). message RecognitionAudio { // The audio source, which is either inline content or a Google Cloud // Storage uri. @@ -512,7 +523,8 @@ message RecognitionAudio { bytes content = 1; // URI that points to a file that contains audio data bytes as specified in - // `RecognitionConfig`. Currently, only Google Cloud Storage URIs are + // `RecognitionConfig`. The file must not be compressed (for example, gzip). + // Currently, only Google Cloud Storage URIs are // supported, which must be specified in the following format: // `gs://bucket_name/object_name` (other URI formats return // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see @@ -658,6 +670,10 @@ message StreamingRecognitionResult { // The default of 0.0 is a sentinel value indicating `stability` was not set. float stability = 3; + // Output only. Time offset of the end of this result relative to the + // beginning of the audio. + google.protobuf.Duration result_end_time = 4; + // For multi-channel audio, this is the channel number corresponding to the // recognized result for the audio from that channel. // For audio_channel_count = N, its output values can range from '1' to 'N'. @@ -705,7 +721,7 @@ message SpeechRecognitionAlternative { float confidence = 2; // Output only. A list of word-specific information for each recognized word. - // Note: When enable_speaker_diarization is true, you will see all the words + // Note: When `enable_speaker_diarization` is true, you will see all the words // from the beginning of the audio. repeated WordInfo words = 3; } @@ -746,5 +762,4 @@ message WordInfo { // speaker_tag is set if enable_speaker_diarization = 'true' and only in the // top alternative. 
int32 speaker_tag = 5; - } diff --git a/packages/google-cloud-speech/src/v1/speech_client_config.json b/packages/google-cloud-speech/src/v1/speech_client_config.json index a19dc8f9618..0f0346afb09 100644 --- a/packages/google-cloud-speech/src/v1/speech_client_config.json +++ b/packages/google-cloud-speech/src/v1/speech_client_config.json @@ -21,17 +21,17 @@ }, "methods": { "Recognize": { - "timeout_millis": 1000000, + "timeout_millis": 200000, "retry_codes_name": "idempotent", "retry_params_name": "default" }, "LongRunningRecognize": { - "timeout_millis": 60000, + "timeout_millis": 200000, "retry_codes_name": "non_idempotent", "retry_params_name": "default" }, "StreamingRecognize": { - "timeout_millis": 1000000, + "timeout_millis": 200000, "retry_codes_name": "idempotent", "retry_params_name": "default" } diff --git a/packages/google-cloud-speech/src/v1p1beta1/doc/google/cloud/speech/v1p1beta1/doc_cloud_speech.js b/packages/google-cloud-speech/src/v1p1beta1/doc/google/cloud/speech/v1p1beta1/doc_cloud_speech.js index 62ff732be6d..8c96144c58e 100644 --- a/packages/google-cloud-speech/src/v1p1beta1/doc/google/cloud/speech/v1p1beta1/doc_cloud_speech.js +++ b/packages/google-cloud-speech/src/v1p1beta1/doc/google/cloud/speech/v1p1beta1/doc_cloud_speech.js @@ -82,7 +82,7 @@ const LongRunningRecognizeRequest = { * `audio_content` data. The audio bytes must be encoded as specified in * `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a * pure binary representation (not base64). See - * [audio limits](https://cloud.google.com/speech/limits#content). + * [content limits](https://cloud.google.com/speech-to-text/quotas#content). * * @typedef StreamingRecognizeRequest * @memberof google.cloud.speech.v1p1beta1 @@ -156,36 +156,36 @@ const StreamingRecognitionConfig = { * Valid values for OGG_OPUS are '1'-'254'. * Valid value for MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is only `1`. * If `0` or omitted, defaults to one channel (mono). - * NOTE: We only recognize the first channel by default. + * Note: We only recognize the first channel by default. * To perform independent recognition on each channel set - * enable_separate_recognition_per_channel to 'true'. + * `enable_separate_recognition_per_channel` to 'true'. * * @property {boolean} enableSeparateRecognitionPerChannel - * This needs to be set to ‘true’ explicitly and audio_channel_count > 1 + * This needs to be set to ‘true’ explicitly and `audio_channel_count` > 1 * to get each channel recognized separately. The recognition result will - * contain a channel_tag field to state which channel that result belongs to. - * If this is not ‘true’, we will only recognize the first channel. - * NOTE: The request is also billed cumulatively for all channels recognized: - * (audio_channel_count times the audio length) + * contain a `channel_tag` field to state which channel that result belongs + * to. If this is not true, we will only recognize the first channel. The + * request is billed cumulatively for all channels recognized: + * `audio_channel_count` multiplied by the length of the audio. * * @property {string} languageCode * *Required* The language of the supplied audio as a * [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag. * Example: "en-US". - * See [Language Support](https://cloud.google.com/speech/docs/languages) + * See [Language Support](https://cloud.google.com/speech-to-text/docs/languages) * for a list of the currently supported language codes. 
* * @property {string[]} alternativeLanguageCodes * *Optional* A list of up to 3 additional * [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags, * listing possible alternative languages of the supplied audio. - * See [Language Support](https://cloud.google.com/speech/docs/languages) + * See [Language Support](https://cloud.google.com/speech-to-text/docs/languages) * for a list of the currently supported language codes. * If alternative languages are listed, recognition result will contain * recognition in the most likely language detected including the main * language_code. The recognition result will include the language tag * of the language detected in the audio. - * NOTE: This feature is only supported for Voice Command and Voice Search + * Note: This feature is only supported for Voice Command and Voice Search * use cases and performance may vary for other use cases (e.g., phone call * transcription). * @@ -204,7 +204,9 @@ const StreamingRecognitionConfig = { * won't be filtered out. * * @property {Object[]} speechContexts - * *Optional* A means to provide context to assist the speech recognition. + * *Optional* array of SpeechContext. + * A means to provide context to assist the speech recognition. For more + * information, see [Phrase Hints](https://cloud.google.com/speech-to-text/docs/basics#phrase-hints). * * This object should have the same structure as [SpeechContext]{@link google.cloud.speech.v1p1beta1.SpeechContext} * @@ -224,18 +226,20 @@ const StreamingRecognitionConfig = { * This feature is only available in select languages. Setting this for * requests in other languages has no effect at all. * The default 'false' value does not add punctuation to result hypotheses. - * NOTE: "This is currently offered as an experimental service, complimentary + * Note: This is currently offered as an experimental service, complimentary * to all users. In the future this may be exclusively available as a - * premium feature." + * premium feature. * * @property {boolean} enableSpeakerDiarization * *Optional* If 'true', enables speaker detection for each recognized word in * the top alternative of the recognition result using a speaker_tag provided * in the WordInfo. * Note: When this is true, we send all the words from the beginning of the - * audio for the top alternative in every consecutive responses. + * audio for the top alternative in every consecutive STREAMING responses. * This is done in order to improve our speaker tags as our models learn to * identify the speakers in the conversation over time. + * For non-streaming requests, the diarization results will be provided only + * in the top alternative of the FINAL SpeechRecognitionResult. * * @property {number} diarizationSpeakerCount * *Optional* @@ -284,14 +288,18 @@ const StreamingRecognitionConfig = { * * @property {boolean} useEnhanced * *Optional* Set to true to use an enhanced model for speech recognition. - * You must also set the `model` field to a valid, enhanced model. If - * `use_enhanced` is set to true and the `model` field is not set, then - * `use_enhanced` is ignored. If `use_enhanced` is true and an enhanced - * version of the specified model does not exist, then the speech is - * recognized using the standard version of the specified model. - * - * Enhanced speech models require that you opt-in to the audio logging using - * instructions in the [alpha documentation](https://cloud.google.com/speech/data-sharing). 
If you set + * If `use_enhanced` is set to true and the `model` field is not set, then + * an appropriate enhanced model is chosen if: + * 1. project is eligible for requesting enhanced models + * 2. an enhanced model exists for the audio + * + * If `use_enhanced` is true and an enhanced version of the specified model + * does not exist, then the speech is recognized using the standard version + * of the specified model. + * + * Enhanced speech models require that you opt-in to data logging using + * instructions in the + * [documentation](https://cloud.google.com/speech-to-text/docs/enable-data-logging). If you set * `use_enhanced` to true and you have not enabled audio logging, then you * will receive an error. * @@ -617,7 +625,7 @@ const RecognitionMetadata = { * to improve the accuracy for specific words and phrases, for example, if * specific commands are typically spoken by the user. This can also be used * to add additional words to the vocabulary of the recognizer. See - * [usage limits](https://cloud.google.com/speech/limits#content). + * [usage limits](https://cloud.google.com/speech-to-text/quotas#content). * * @typedef SpeechContext * @memberof google.cloud.speech.v1p1beta1 @@ -631,7 +639,7 @@ const SpeechContext = { * Contains audio data in the encoding specified in the `RecognitionConfig`. * Either `content` or `uri` must be supplied. Supplying both or neither * returns google.rpc.Code.INVALID_ARGUMENT. See - * [audio limits](https://cloud.google.com/speech/limits#content). + * [content limits](https://cloud.google.com/speech-to-text/quotas#content). * * @property {string} content * The audio data bytes encoded as specified in @@ -640,7 +648,8 @@ const SpeechContext = { * * @property {string} uri * URI that points to a file that contains audio data bytes as specified in - * `RecognitionConfig`. Currently, only Google Cloud Storage URIs are + * `RecognitionConfig`. The file must not be compressed (for example, gzip). + * Currently, only Google Cloud Storage URIs are * supported, which must be specified in the following format: * `gs://bucket_name/object_name` (other URI formats return * google.rpc.Code.INVALID_ARGUMENT). For more information, see @@ -850,6 +859,12 @@ const StreamingRecognizeResponse = { * This field is only provided for interim results (`is_final=false`). * The default of 0.0 is a sentinel value indicating `stability` was not set. * + * @property {Object} resultEndTime + * Output only. Time offset of the end of this result relative to the + * beginning of the audio. + * + * This object should have the same structure as [Duration]{@link google.protobuf.Duration} + * * @property {number} channelTag * For multi-channel audio, this is the channel number corresponding to the * recognized result for the audio from that channel. @@ -916,7 +931,7 @@ const SpeechRecognitionResult = { * * @property {Object[]} words * Output only. A list of word-specific information for each recognized word. - * Note: When enable_speaker_diarization is true, you will see all the words + * Note: When `enable_speaker_diarization` is true, you will see all the words * from the beginning of the audio. 
* * This object should have the same structure as [WordInfo]{@link google.cloud.speech.v1p1beta1.WordInfo} diff --git a/packages/google-cloud-speech/src/v1p1beta1/speech_client_config.json b/packages/google-cloud-speech/src/v1p1beta1/speech_client_config.json index 244513dc672..a71519f4637 100644 --- a/packages/google-cloud-speech/src/v1p1beta1/speech_client_config.json +++ b/packages/google-cloud-speech/src/v1p1beta1/speech_client_config.json @@ -21,17 +21,17 @@ }, "methods": { "Recognize": { - "timeout_millis": 1000000, + "timeout_millis": 200000, "retry_codes_name": "idempotent", "retry_params_name": "default" }, "LongRunningRecognize": { - "timeout_millis": 60000, + "timeout_millis": 200000, "retry_codes_name": "non_idempotent", "retry_params_name": "default" }, "StreamingRecognize": { - "timeout_millis": 1000000, + "timeout_millis": 905000, "retry_codes_name": "idempotent", "retry_params_name": "default" }
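
For reviewers, a minimal sketch of how the multi-channel fields documented above (`audio_channel_count`, `enable_separate_recognition_per_channel`, and the per-result `channel_tag`) might be exercised through this package's v1p1beta1 client. The bucket/object URI is a placeholder, and field names use the camelCase mapping the generated Node.js client applies to the proto.

// Sketch: separate per-channel recognition with the v1p1beta1 Node.js client.
// The gs:// URI is a placeholder; note the updated comment above — the request
// is billed for every channel recognized (audioChannelCount x audio length).
const speech = require('@google-cloud/speech');
const client = new speech.v1p1beta1.SpeechClient();

async function recognizePerChannel() {
  const [response] = await client.recognize({
    config: {
      encoding: 'LINEAR16',
      sampleRateHertz: 16000,
      languageCode: 'en-US',
      audioChannelCount: 2,
      enableSeparateRecognitionPerChannel: true,
    },
    audio: {uri: 'gs://my-bucket/two-channel-call.wav'},  // placeholder URI
  });
  for (const result of response.results) {
    // channelTag states which input channel produced this result.
    console.log(result.channelTag, result.alternatives[0].transcript);
  }
}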
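Similarly, a hedged sketch of a RecognitionConfig combining the `speech_contexts` (phrase hints) and `use_enhanced`/`model` fields whose comments change in this diff. The hint phrases are invented; 'phone_call' is chosen because an enhanced variant of it exists, and per the updated comments the request errors if data logging has not been enabled for the project.

// Sketch: phrase hints plus an enhanced model (assumed values, not a recipe).
const config = {
  encoding: 'MULAW',
  sampleRateHertz: 8000,
  languageCode: 'en-US',
  useEnhanced: true,           // requires the data-logging opt-in
  model: 'phone_call',         // model with an enhanced variant
  speechContexts: [{phrases: ['account number', 'routing number']}],
};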
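A sketch of the streaming path, mainly to show where the newly added `result_end_time` surfaces on StreamingRecognitionResult. The audio file name is a placeholder for 16 kHz LINEAR16 audio; the field arrives as a Duration-shaped object ({seconds, nanos}).

// Sketch: streaming recognition that logs the new resultEndTime field.
const fs = require('fs');
const speech = require('@google-cloud/speech');
const client = new speech.v1p1beta1.SpeechClient();

const recognizeStream = client
  .streamingRecognize({
    config: {encoding: 'LINEAR16', sampleRateHertz: 16000, languageCode: 'en-US'},
    interimResults: true,
  })
  .on('error', console.error)
  .on('data', response => {
    const result = response.results[0];
    if (result && result.alternatives.length) {
      // resultEndTime: end offset of this result relative to the beginning of
      // the audio, added in this change ({seconds, nanos}).
      console.log(result.resultEndTime, result.alternatives[0].transcript);
    }
  });

fs.createReadStream('audio.raw').pipe(recognizeStream);  // placeholder file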
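On the speech_client_config.json changes: the values above only adjust the library's per-method defaults. A caller who needs a different deadline can still pass a gax call-options timeout (in milliseconds) on an individual call, as sketched below — the override mechanism is the point here, not any specific number.

// Sketch: per-call timeout override layered on the defaults from
// speech_client_config.json; `request` is assumed to be a valid
// LongRunningRecognizeRequest built as in the examples above.
async function transcribeWithLongerTimeout(client, request) {
  const [operation] = await client.longRunningRecognize(request, {timeout: 600000});
  const [response] = await operation.promise();
  return response;
}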