Commit 24be98a

fix overflow when training mDeberta in fp16 (huggingface#24116)
* Porting changes from https://github.com/microsoft/DeBERTa/ that hopefully allow for fp16 training of mDeBERTa

* Updates to deberta modeling from microsoft repo

* Performing some cleanup

* Undoing changes that weren't necessary

* Undoing float calls

* Minimally change the p2c block

* Fix error

* Minimally changing the c2p block

* Switch to torch sqrt

* Remove math

* Adding back the .to() calls on scale

* Undoing attention_scores change

* Removing commented out code

* Updating modeling_sew_d.py to satisfy utils/check_copies.py

* Missed change

* Further reduce changes needed to get fp16 working

* Reverting changes to modeling_sew_d.py

* Make same change in TF
sjrl authored and novice03 committed Jun 23, 2023
1 parent 9321c92 commit 24be98a
Showing 3 changed files with 3 additions and 3 deletions.
2 changes: 1 addition & 1 deletion src/transformers/models/deberta_v2/modeling_deberta_v2.py
@@ -721,7 +721,7 @@ def forward(
         if "p2c" in self.pos_att_type:
             scale_factor += 1
         scale = torch.sqrt(torch.tensor(query_layer.size(-1), dtype=torch.float) * scale_factor)
-        attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2)) / scale.to(dtype=query_layer.dtype)
+        attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2) / scale.to(dtype=query_layer.dtype))
         if self.relative_attention:
             rel_embeddings = self.pos_dropout(rel_embeddings)
             rel_att = self.disentangled_attention_bias(
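
The same one-line reordering is applied in all three files: dividing key_layer by the scale before the matmul produces algebraically identical attention scores, but the unscaled QK^T product, which can exceed the fp16 maximum of roughly 65504, is never materialized in half precision. A minimal sketch of the effect (not part of the commit; head size, scale_factor, and activation magnitudes are made up for illustration):

import torch

# Run on GPU if available; CPU fp16 matmul needs a fairly recent PyTorch build.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.manual_seed(0)

# Hypothetical activations: batch 1, 128 tokens, head size 64, scale_factor 3.
query_layer = (torch.randn(1, 128, 64, device=device) * 64).half()
key_layer = (torch.randn(1, 128, 64, device=device) * 64).half()
scale = torch.sqrt(torch.tensor(query_layer.size(-1), dtype=torch.float) * 3)

# Old ordering: the unscaled QK^T product is stored in fp16 and can overflow to inf.
old = torch.bmm(query_layer, key_layer.transpose(-1, -2)) / scale.to(dtype=query_layer.dtype)

# New ordering: the key is divided by the scale first (entries ~14x smaller),
# so the product stays inside the fp16 range; the math is unchanged.
new = torch.bmm(query_layer, key_layer.transpose(-1, -2) / scale.to(dtype=query_layer.dtype))

print(torch.isinf(old).any())  # typically tensor(True) for activations this large
print(torch.isinf(new).any())  # tensor(False)
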
2 changes: 1 addition & 1 deletion src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
@@ -676,7 +676,7 @@ def call(
         if "p2c" in self.pos_att_type:
             scale_factor += 1
         scale = tf.math.sqrt(tf.cast(shape_list(query_layer)[-1] * scale_factor, tf.float32))
-        attention_scores = tf.matmul(query_layer, tf.transpose(key_layer, [0, 2, 1])) / scale
+        attention_scores = tf.matmul(query_layer, tf.transpose(key_layer, [0, 2, 1]) / scale)
         if self.relative_attention:
             rel_embeddings = self.pos_dropout(rel_embeddings)
             rel_att = self.disentangled_att_bias(query_layer, key_layer, relative_pos, rel_embeddings, scale_factor)
2 changes: 1 addition & 1 deletion src/transformers/models/sew_d/modeling_sew_d.py
@@ -789,7 +789,7 @@ def forward(
         if "p2c" in self.pos_att_type:
             scale_factor += 1
         scale = torch.sqrt(torch.tensor(query_layer.size(-1), dtype=torch.float) * scale_factor)
-        attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2)) / scale.to(dtype=query_layer.dtype)
+        attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2) / scale.to(dtype=query_layer.dtype))
         if self.relative_attention:
             rel_embeddings = self.pos_dropout(rel_embeddings)
             rel_att = self.disentangled_attention_bias(
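
For completeness, a quick check (again not from the commit; arbitrary small shapes) that the two orderings give the same scores up to floating-point rounding, which is why only the fp16 overflow behaviour changes:

import torch

torch.manual_seed(0)
q = torch.randn(1, 8, 4)
k = torch.randn(1, 8, 4)
scale = torch.sqrt(torch.tensor(q.size(-1), dtype=torch.float) * 3)

scores_divide_after = torch.bmm(q, k.transpose(-1, -2)) / scale   # pre-fix ordering
scores_divide_first = torch.bmm(q, k.transpose(-1, -2) / scale)   # post-fix ordering

print(torch.allclose(scores_divide_after, scores_divide_first, atol=1e-6))  # True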
