diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py
index fc36732cc420df..0b150832b755ce 100644
--- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py
+++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py
@@ -555,16 +555,16 @@ def __init__(
     def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
         return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
 
-    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
-        return tensor if position_embeddings is None else tensor + position_embeddings
+    def with_pos_embed(self, tensor: torch.Tensor, object_queries: Optional[Tensor]):
+        return tensor if object_queries is None else tensor + object_queries
 
     def forward(
         self,
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
-        position_embeddings: Optional[torch.Tensor] = None,
+        object_queries: Optional[torch.Tensor] = None,
         key_value_states: Optional[torch.Tensor] = None,
-        key_value_position_embeddings: Optional[torch.Tensor] = None,
+        spatial_position_embeddings: Optional[torch.Tensor] = None,
         output_attentions: bool = False,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         """Input shape: Batch x Time x Channel"""
@@ -575,14 +575,14 @@ def forward(
         batch_size, target_len, embed_dim = hidden_states.size()
 
         # add position embeddings to the hidden states before projecting to queries and keys
-        if position_embeddings is not None:
+        if object_queries is not None:
             hidden_states_original = hidden_states
-            hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
+            hidden_states = self.with_pos_embed(hidden_states, object_queries)
 
         # add key-value position embeddings to the key value states
-        if key_value_position_embeddings is not None:
+        if spatial_position_embeddings is not None:
             key_value_states_original = key_value_states
-            key_value_states = self.with_pos_embed(key_value_states, key_value_position_embeddings)
+            key_value_states = self.with_pos_embed(key_value_states, spatial_position_embeddings)
 
         # get query proj
         query_states = self.q_proj(hidden_states) * self.scaling
@@ -790,7 +790,7 @@ def forward(
         self,
         hidden_states: torch.Tensor,
         attention_mask: torch.Tensor,
-        position_embeddings: torch.Tensor = None,
+        object_queries: torch.Tensor = None,
         output_attentions: bool = False,
     ):
         """
@@ -799,7 +799,8 @@ def forward(
             attention_mask (`torch.FloatTensor`): attention mask of size
                 `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
                 values.
-            position_embeddings (`torch.FloatTensor`, *optional*): position embeddings, to be added to hidden_states.
+            object_queries (`torch.FloatTensor`, *optional*):
+                Object queries (also called content embeddings), to be added to the hidden states.
             output_attentions (`bool`, *optional*):
                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more detail.
@@ -808,7 +809,7 @@ def forward(
         hidden_states, attn_weights = self.self_attn(
             hidden_states=hidden_states,
             attention_mask=attention_mask,
-            position_embeddings=position_embeddings,
+            object_queries=object_queries,
             output_attentions=output_attentions,
         )
 
@@ -1150,7 +1151,7 @@ class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel):
 
     Small tweak for ConditionalDETR:
 
-    - position_embeddings are added to the forward pass.
+    - object_queries are added to the forward pass.
 
     Args:
         config: ConditionalDetrConfig
@@ -1173,7 +1174,7 @@ def forward(
         self,
         inputs_embeds=None,
         attention_mask=None,
-        position_embeddings=None,
+        object_queries=None,
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
@@ -1191,8 +1192,8 @@ def forward(
 
                 [What are attention masks?](../glossary#attention-mask)
 
-            position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-                Position embeddings that are added to the queries and keys in each self-attention layer.
+            object_queries (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Object queries that are added to the queries in each self-attention layer.
             output_attentions (`bool`, *optional*):
                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
@@ -1232,11 +1233,11 @@ def forward(
             if to_drop:
                 layer_outputs = (None, None)
             else:
-                # we add position_embeddings as extra input to the encoder_layer
+                # we add object_queries as extra input to the encoder_layer
                 layer_outputs = encoder_layer(
                     hidden_states,
                     attention_mask,
-                    position_embeddings=position_embeddings,
+                    object_queries=object_queries,
                     output_attentions=output_attentions,
                 )
 
diff --git a/src/transformers/models/maskformer/modeling_maskformer.py b/src/transformers/models/maskformer/modeling_maskformer.py
index 98f0cdb5044f46..1515c37151a141 100644
--- a/src/transformers/models/maskformer/modeling_maskformer.py
+++ b/src/transformers/models/maskformer/modeling_maskformer.py
@@ -437,16 +437,16 @@ def __init__(
     def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
         return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
 
-    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
-        return tensor if position_embeddings is None else tensor + position_embeddings
+    def with_pos_embed(self, tensor: torch.Tensor, object_queries: Optional[Tensor]):
+        return tensor if object_queries is None else tensor + object_queries
 
     def forward(
         self,
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
-        position_embeddings: Optional[torch.Tensor] = None,
+        object_queries: Optional[torch.Tensor] = None,
         key_value_states: Optional[torch.Tensor] = None,
-        key_value_position_embeddings: Optional[torch.Tensor] = None,
+        spatial_position_embeddings: Optional[torch.Tensor] = None,
         output_attentions: bool = False,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         """Input shape: Batch x Time x Channel"""
@@ -457,14 +457,14 @@ def forward(
         batch_size, target_len, embed_dim = hidden_states.size()
 
         # add position embeddings to the hidden states before projecting to queries and keys
-        if position_embeddings is not None:
+        if object_queries is not None:
             hidden_states_original = hidden_states
-            hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
+            hidden_states = self.with_pos_embed(hidden_states, object_queries)
 
         # add key-value position embeddings to the key value states
-        if key_value_position_embeddings is not None:
+        if spatial_position_embeddings is not None:
             key_value_states_original = key_value_states
-            key_value_states = self.with_pos_embed(key_value_states, key_value_position_embeddings)
+            key_value_states = self.with_pos_embed(key_value_states, spatial_position_embeddings)
 
         # get query proj
         query_states = self.q_proj(hidden_states) * self.scaling
@@ -563,7 +563,7 @@ def forward(
         self,
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
-        position_embeddings: Optional[torch.Tensor] = None,
+        object_queries: Optional[torch.Tensor] = None,
         query_position_embeddings: Optional[torch.Tensor] = None,
         encoder_hidden_states: Optional[torch.Tensor] = None,
         encoder_attention_mask: Optional[torch.Tensor] = None,
@@ -575,8 +575,8 @@ def forward(
             attention_mask (`torch.FloatTensor`): attention mask of size
                 `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
                 values.
-            position_embeddings (`torch.FloatTensor`, *optional*):
-                position embeddings that are added to the queries and keys
+            object_queries (`torch.FloatTensor`, *optional*):
+                object_queries that are added to the hidden states
                 in the cross-attention layer.
             query_position_embeddings (`torch.FloatTensor`, *optional*):
                 position embeddings that are added to the queries and keys
@@ -595,7 +595,7 @@ def forward(
         # Self Attention
         hidden_states, self_attn_weights = self.self_attn(
             hidden_states=hidden_states,
-            position_embeddings=query_position_embeddings,
+            object_queries=query_position_embeddings,
             attention_mask=attention_mask,
             output_attentions=output_attentions,
         )
@@ -611,10 +611,10 @@ def forward(
 
             hidden_states, cross_attn_weights = self.encoder_attn(
                 hidden_states=hidden_states,
-                position_embeddings=query_position_embeddings,
+                object_queries=query_position_embeddings,
                 key_value_states=encoder_hidden_states,
                 attention_mask=encoder_attention_mask,
-                key_value_position_embeddings=position_embeddings,
+                spatial_position_embeddings=object_queries,
                 output_attentions=output_attentions,
             )
 
diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py
index df2cf23b650d08..49178e060133b4 100644
--- a/src/transformers/models/table_transformer/modeling_table_transformer.py
+++ b/src/transformers/models/table_transformer/modeling_table_transformer.py
@@ -461,16 +461,16 @@ def __init__(
     def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
         return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
 
-    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
-        return tensor if position_embeddings is None else tensor + position_embeddings
+    def with_pos_embed(self, tensor: torch.Tensor, object_queries: Optional[Tensor]):
+        return tensor if object_queries is None else tensor + object_queries
 
     def forward(
         self,
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
-        position_embeddings: Optional[torch.Tensor] = None,
+        object_queries: Optional[torch.Tensor] = None,
         key_value_states: Optional[torch.Tensor] = None,
-        key_value_position_embeddings: Optional[torch.Tensor] = None,
+        spatial_position_embeddings: Optional[torch.Tensor] = None,
         output_attentions: bool = False,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         """Input shape: Batch x Time x Channel"""
@@ -481,14 +481,14 @@ def forward(
         batch_size, target_len, embed_dim = hidden_states.size()
 
         # add position embeddings to the hidden states before projecting to queries and keys
-        if position_embeddings is not None:
+        if object_queries is not None:
             hidden_states_original = hidden_states
-            hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
+            hidden_states = self.with_pos_embed(hidden_states, object_queries)
 
         # add key-value position embeddings to the key value states
-        if key_value_position_embeddings is not None:
+        if spatial_position_embeddings is not None:
             key_value_states_original = key_value_states
-            key_value_states = self.with_pos_embed(key_value_states, key_value_position_embeddings)
+            key_value_states = self.with_pos_embed(key_value_states, spatial_position_embeddings)
 
         # get query proj
         query_states = self.q_proj(hidden_states) * self.scaling
@@ -962,7 +962,7 @@ class TableTransformerDecoder(TableTransformerPreTrainedModel):
 
     Some small tweaks for TABLE_TRANSFORMER:
 
-    - position_embeddings and query_position_embeddings are added to the forward pass.
+    - object_queries and query_position_embeddings are added to the forward pass.
     - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers.
 
     Args:
@@ -988,7 +988,7 @@ def forward(
         attention_mask=None,
         encoder_hidden_states=None,
         encoder_attention_mask=None,
-        position_embeddings=None,
+        object_queries=None,
         query_position_embeddings=None,
         output_attentions=None,
         output_hidden_states=None,
@@ -1016,10 +1016,11 @@ def forward(
                 - 1 for pixels that are real (i.e. **not masked**),
                 - 0 for pixels that are padding (i.e. **masked**).
 
-            position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-                Position embeddings that are added to the queries and keys in each cross-attention layer.
+            object_queries (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Object queries that are added to the queries and keys in each cross-attention layer.
             query_position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
-                , *optional*): Position embeddings that are added to the queries and keys in each self-attention layer.
+                , *optional*): Position embeddings that are added to the values and keys in each self-attention layer.
+
             output_attentions (`bool`, *optional*):
                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more detail.
@@ -1091,7 +1092,7 @@ def custom_forward(*inputs):
                 layer_outputs = decoder_layer(
                     hidden_states,
                     attention_mask=combined_attention_mask,
-                    position_embeddings=position_embeddings,
+                    object_queries=object_queries,
                     query_position_embeddings=query_position_embeddings,
                     encoder_hidden_states=encoder_hidden_states,
                     encoder_attention_mask=encoder_attention_mask,
@@ -1150,8 +1151,8 @@ def __init__(self, config: TableTransformerConfig):
 
         # Create backbone + positional encoding
         backbone = TableTransformerConvEncoder(config)
-        position_embeddings = build_position_encoding(config)
-        self.backbone = TableTransformerConvModel(backbone, position_embeddings)
+        object_queries = build_position_encoding(config)
+        self.backbone = TableTransformerConvModel(backbone, object_queries)
 
         # Create projection layer
         self.input_projection = nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1)
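
Reviewer note: the sketch below is a minimal, self-contained illustration of the argument mapping the renamed keywords encode in cross-attention, and is not code from this patch or from the transformers library. The standalone with_pos_embed and build_cross_attention_inputs functions, the tensor shapes, and the variable names are illustrative assumptions only: the decoder states receive the learned query position embeddings (passed through the attention's `object_queries` keyword), while the encoder memory receives the image's spatial encodings (passed through `spatial_position_embeddings`), and the values are left untouched, as in DETR.

# Illustrative sketch only; hypothetical helpers, not part of transformers.
from typing import Optional, Tuple

import torch
from torch import Tensor


def with_pos_embed(tensor: Tensor, pos: Optional[Tensor]) -> Tensor:
    # Add an embedding to a tensor when one is provided (same pattern as the renamed helper).
    return tensor if pos is None else tensor + pos


def build_cross_attention_inputs(
    decoder_hidden_states: Tensor,        # (batch, num_queries, d_model)
    encoder_hidden_states: Tensor,        # (batch, seq_len, d_model)
    query_position_embeddings: Tensor,    # learned per-query embeddings
    spatial_position_embeddings: Tensor,  # 2D position encodings of the image features
) -> Tuple[Tensor, Tensor, Tensor]:
    # Queries: decoder states plus query position embeddings (the `object_queries` kwarg
    # of the renamed attention). Keys: encoder memory plus spatial encodings (the
    # `spatial_position_embeddings` kwarg). Values stay un-augmented.
    queries = with_pos_embed(decoder_hidden_states, query_position_embeddings)
    keys = with_pos_embed(encoder_hidden_states, spatial_position_embeddings)
    values = encoder_hidden_states
    return queries, keys, values


if __name__ == "__main__":
    batch, num_queries, seq_len, d_model = 2, 100, 49, 256
    q, k, v = build_cross_attention_inputs(
        torch.zeros(batch, num_queries, d_model),
        torch.randn(batch, seq_len, d_model),
        torch.randn(batch, num_queries, d_model),
        torch.randn(batch, seq_len, d_model),
    )
    print(q.shape, k.shape, v.shape)  # (2, 100, 256), (2, 49, 256), (2, 49, 256)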