Commit

fix mask-tying to sequence length. WIP
erip committed Jan 30, 2023
1 parent: bc08bbc · commit: c29e380
Showing 1 changed file with 13 additions and 10 deletions.
xformers/components/attention/scaled_dot_product.py: 23 changes (13 additions & 10 deletions)
@@ -97,19 +97,22 @@ def forward(
             )
 
         # Handle a possibly deferred causal mask handling
-        if self.causal and self.mask is None:
-            self.mask = AttentionMask.make_causal(
-                seq_len=q.shape[-2],
-                to_seq_len=q.shape[-2],
-                device=q.device,
-                dtype=q.dtype,
-            )
+        if self.causal:
+            if self.mask is None:
+                mask = AttentionMask.make_causal(
+                    seq_len=q.shape[-2],
+                    to_seq_len=q.shape[-2],
+                    device=q.device,
+                    dtype=q.dtype,
+                )
+            else:
+                mask = self.mask
 
         # Merge the optional causal mask and the user-provided mask
-        if self.mask is not None:
-            self.mask = self.mask.to(dtype=q.dtype, device=q.device)
+        if mask is not None:
+            mask = mask.to(dtype=q.dtype, device=q.device)
 
-            att_mask = att_mask + self.mask if att_mask is not None else self.mask
+            att_mask = att_mask + mask if att_mask is not None else mask
 
         # Try to handle a case where the sequence is smaller than the mask
         if (
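For context on the fix: before this change, the lazily built causal mask was cached on `self.mask`, which tied its size to the first sequence length the module saw; the change keeps the per-call mask in a local variable so it can match the current input. A minimal, simplified sketch of that difference follows (the `make_causal` helper and the `CausalMaskSketch` class are hypothetical stand-ins for the real xformers `AttentionMask` API, and the attention logits are reduced to `q @ q.transpose(-2, -1)` for brevity):

import torch

def make_causal(seq_len, device, dtype):
    # Hypothetical stand-in for AttentionMask.make_causal: an additive mask
    # with -inf above the diagonal so attention cannot look ahead.
    return torch.triu(
        torch.full((seq_len, seq_len), float("-inf"), device=device, dtype=dtype),
        diagonal=1,
    )

class CausalMaskSketch:
    def __init__(self):
        self.mask = None  # may also be set by the user, as in the real module

    def old_scores(self, q):
        # Pre-commit behaviour: the causal mask is built once and cached on self,
        # so its size is frozen to the first sequence length ever seen.
        if self.mask is None:
            self.mask = make_causal(q.shape[-2], q.device, q.dtype)
        return q @ q.transpose(-2, -1) + self.mask  # shape mismatch if the length changes later

    def new_scores(self, q):
        # Post-commit behaviour: the mask lives in a local variable, so a fresh,
        # correctly sized causal mask is derived whenever none was provided.
        mask = self.mask if self.mask is not None else make_causal(
            q.shape[-2], q.device, q.dtype
        )
        return q @ q.transpose(-2, -1) + mask

# Usage: the local-variable version adapts to whatever length the call sees.
attn = CausalMaskSketch()
print(attn.new_scores(torch.randn(1, 4, 8)).shape)  # torch.Size([1, 4, 4])
print(attn.new_scores(torch.randn(1, 6, 8)).shape)  # torch.Size([1, 6, 6])

Keeping the mask in a local variable avoids permanently caching a causal mask sized to the first batch, so later calls with a different sequence length can derive a correctly sized mask while any user-provided `self.mask` is left untouched.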
