huggingface · ydshieh · Oct 20, 2023 · Oct 17, 2023 · Oct 18, 2023 · Oct 18, 2023
diff --git a/src/transformers/models/speech_to_text/configuration_speech_to_text.py b/src/transformers/models/speech_to_text/configuration_speech_to_text.py
@@ -30,7 +30,7 @@
 
 class Speech2TextConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`Speech2TextModel`]. It is used to instantiate an
+    This is the configuration class to store the configuration of a [`Speech2TextModel`]. It is used to instantiate a
     Speech2Text model according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the Speech2Text
     [facebook/s2t-small-librispeech-asr](https://huggingface.co/facebook/s2t-small-librispeech-asr) architecture.
@@ -40,26 +40,36 @@ class Speech2TextConfig(PretrainedConfig):
 
 
     Args:
-        vocab_size (`int`, *optional*, defaults to 50265):
+        vocab_size (`int`, *optional*, defaults to 10000):
             Vocabulary size of the Speech2Text model. Defines the number of different tokens that can be represented by
             the `inputs_ids` passed when calling [`Speech2TextModel`]
-        d_model (`int`, *optional*, defaults to 1024):
-            Dimensionality of the layers and the pooler layer.
         encoder_layers (`int`, *optional*, defaults to 12):
             Number of encoder layers.
-        decoder_layers (`int`, *optional*, defaults to 12):
-            Number of decoder layers.
-        encoder_attention_heads (`int`, *optional*, defaults to 16):
+        encoder_ffn_dim (`int`, *optional*, defaults to 2048):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
+        encoder_attention_heads (`int`, *optional*, defaults to 4):
             Number of attention heads for each attention layer in the Transformer encoder.
-        decoder_attention_heads (`int`, *optional*, defaults to 16):
-            Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
-            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
+        decoder_layers (`int`, *optional*, defaults to 6):
+            Number of decoder layers.
+        decoder_ffn_dim (`int`, *optional*, defaults to 2048):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+        decoder_attention_heads (`int`, *optional*, defaults to 4):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for
+            more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for
+            more details.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether the model should return the last key/values attentions (not used by all models).
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+            Whether the model is set up as an encoder-decoder architecture for sequence-to-sequence tasks.
+        activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
             `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        d_model (`int`, *optional*, defaults to 256):
+            Dimensionality of the layers and the pooler layer.
         dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -68,18 +78,20 @@ class Speech2TextConfig(PretrainedConfig):
             The dropout ratio for activations inside the fully connected layer.
         init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
-            The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
-            for more details.
-        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
-            The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
-            for more details.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models).
+        decoder_start_token_id (`int`, *optional*, defaults to 2):
+            The initial token ID of the decoder when decoding sequences.
+        scale_embedding (`bool`, *optional*, defaults to `True`):
+            Whether the embeddings are scaled by the square root of `d_model`.
+        pad_token_id (`int`, *optional*, defaults to 1):
+            The padding token ID for padding sequences to the same length.
+        bos_token_id (`int`, *optional*, defaults to 0):
+            The beginning-of-sequence (BOS) token ID for indicating the start of a sequence.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The end-of-sequence (EOS) token ID for indicating the end of a sequence.
         max_source_positions (`int`, *optional*, defaults to 6000):
             The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
         max_target_positions (`int`, *optional*, defaults to 1024):
-            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            The maximum sequence length that this model might ever be used with. Typically, set this to something large
             just in case (e.g., 512 or 1024 or 2048).
         num_conv_layers (`int`, *optional*, defaults to 2):
             Number of 1D convolutional layers in the conv module.

diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py
@@ -473,7 +473,6 @@
     "SpecialTokensMixin",
     "Speech2Text2Config",
     "Speech2Text2Tokenizer",
-    "Speech2TextConfig",
     "Speech2TextTokenizer",
     "SpeechEncoderDecoderModel",
     "SpeechT5Config",