taylanbil
diff --git a/‎fairseq/criterions/wav2vec_criterion.py
+45-20 b/‎fairseq/criterions/wav2vec_criterion.py
+45-20
diff --git a/‎fairseq/data/audio/raw_audio_dataset.py
+121-2 b/‎fairseq/data/audio/raw_audio_dataset.py
+121-2
diff --git a/‎fairseq/data/bucket_pad_length_dataset.py
+24-22 b/‎fairseq/data/bucket_pad_length_dataset.py
+24-22
@@ -31,7 +31,7 @@ class Wav2VecCriterionConfig(FairseqDataclass):
         default_factory=lambda: [],
         metadata={"help": "output keys to log"},
     )
-
+from fairseq.utils import index_put, is_xla_tensor
 
 @register_criterion("wav2vec", dataclass=Wav2VecCriterionConfig)
 class Wav2vecCriterion(FairseqCriterion):
@@ -52,7 +52,9 @@ def forward(self, model, sample, reduce=True):
         net_output = model(**sample["net_input"])
         logits = model.get_logits(net_output).float()
         target = model.get_targets(sample, net_output)
+        self.xla = is_xla_tensor(logits)
 
+        # XXX: handle weights on xla.
         weights = None
         if hasattr(model, "get_target_weights") and not self.infonce:
             weights = model.get_target_weights(target, net_output)
@@ -61,21 +63,31 @@ def forward(self, model, sample, reduce=True):
 
         losses = []
 
+        reduction = "none" if ((not reduce) or self.xla) else "sum"
         if self.infonce:
-            loss = F.cross_entropy(
-                logits,
-                target,
-                reduction="sum" if reduce else "none",
-            )
+            loss = F.cross_entropy(logits, target, reduction=reduction)
         else:
             loss = F.binary_cross_entropy_with_logits(
-                logits,
-                target.float(),
-                weights,
-                reduction="sum" if reduce else "none",
+                logits, target.float(), weights, reduction=reduction
+            )
+
+        if self.xla:
+            # tpu-comment: since dynamic shapes lead to recompilations on xla,
+            # we don't shrink tensors using mask_indices.
+            # Instead, we use mask indices to adjust loss.
+            mi = (
+                sample['net_input']['mask_indices']
+                .transpose(0, 1)  # logits are transposed in `model.get_logits`
+                .reshape(logits.size(0))
             )
+            loss = (loss * mi).sum() if reduce else (loss * mi)
 
-        sample_size = target.numel() if self.infonce else target.long().sum().item()
+        if 'sample_size' in sample and self.infonce:
+            sample_size = sample['sample_size']
+        elif 'mask_indices' in sample['net_input']:
+            sample_size = sample['net_input']['mask_indices'].sum()
+        else:
+            sample_size = target.numel() if self.infonce else target.long().sum().item()
         losses.append(loss.detach().clone())
 
         if self.loss_weights is not None:
@@ -95,7 +107,7 @@ def forward(self, model, sample, reduce=True):
                     losses.append(p)
 
         logging_output = {
-            "loss": loss.item() if reduce else loss,
+            "loss": loss.item() if (reduce and not self.xla) else loss.detach(),
             "ntokens": sample_size,
             "nsentences": sample["id"].numel(),
             "sample_size": sample_size,
@@ -111,11 +123,14 @@ def forward(self, model, sample, reduce=True):
                 if not self.training:
                     logging_output["target"] = target.cpu().numpy()
             elif lk in net_output:
-                logging_output[lk] = float(net_output[lk])
+                value = net_output[lk]
+                if not is_xla_tensor(value):
+                    value = float(value)
+                logging_output[lk] = value
 
         if len(losses) > 1:
             for i, l in enumerate(losses):
-                logging_output[f"loss_{i}"] = l.item()
+                logging_output[f"loss_{i}"] = l.item() if not self.xla else l.detach()
 
         if self.infonce:
             with torch.no_grad():
@@ -126,9 +141,15 @@ def forward(self, model, sample, reduce=True):
                     assert logits.dim() > 1, logits.shape
                     max = logits.argmax(-1) == 0
                     min = logits.argmin(-1) == 0
-                    both = max & min
-                    corr = max.long().sum().item() - both.long().sum().item()
-                    count = max.numel()
+                    if is_xla_tensor(logits):
+                        max, min = max * mi, min * mi
+                        both = max & min
+                        corr = max.long().sum() - both.long().sum()
+                        count = mi.sum()
+                    else:
+                        both = max & min
+                        corr = max.long().sum().item() - both.long().sum().item()
+                        count = float(max.numel())
 
                 logging_output["correct"] = corr
                 logging_output["count"] = count
@@ -188,11 +209,15 @@ def reduce_metrics(logging_outputs) -> None:
                 else:
                     metrics.log_scalar(k, val / len(logging_outputs), round=3)
 
-    @staticmethod
-    def logging_outputs_can_be_summed() -> bool:
+    # FIXME: revert when gather based xla reduction is implemented
+    #@staticmethod
+    #def logging_outputs_can_be_summed() -> bool:
+    def logging_outputs_can_be_summed(self) -> bool:
         """
         Whether the logging outputs returned by `forward` can be summed
         across workers prior to calling `reduce_metrics`. Setting this
         to True will improves distributed training speed.
         """
-        return False
+        # XXX: Gather based reduction not implemented for xla yet.
+        # So we fall to sum based reduction for xla.
+        return self.xla
@@ -12,7 +12,8 @@
 import torch
 import torch.nn.functional as F
 
-from .. import FairseqDataset
+from .. import FairseqDataset, BaseWrapperDataset
+from ..data_utils import compute_mask_indices, get_buckets, get_bucketed_sizes
 
 
 logger = logging.getLogger(__name__)
@@ -28,6 +29,8 @@ def __init__(
         min_length=0,
         pad=False,
         normalize=False,
+        compute_mask_indices=False,
+        **mask_compute_kwargs,
     ):
         super().__init__()
 
@@ -41,6 +44,14 @@ def __init__(
         self.pad = pad
         self.shuffle = shuffle
         self.normalize = normalize
+        self.compute_mask_indices = compute_mask_indices
+        if self.compute_mask_indices:
+            self.mask_compute_kwargs = mask_compute_kwargs
+            self._features_size_map = {}
+            self._C = mask_compute_kwargs['encoder_embed_dim']
+            self._conv_feature_layers = eval(
+                mask_compute_kwargs['conv_feature_layers']
+            )
 
     def __getitem__(self, index):
         raise NotImplementedError()
@@ -72,6 +83,45 @@ def crop_to_max_size(self, wav, target_size):
         end = size - diff + start
         return wav[start:end]
 
+    def _compute_mask_indices(self, dims, padding_mask):
+        B, T, C = dims
+        mask_indices, mask_channel_indices = None, None
+        if self.mask_compute_kwargs['mask_prob'] > 0:
+            mask_indices = compute_mask_indices(
+                (B, T),
+                padding_mask,
+                self.mask_compute_kwargs['mask_prob'],
+                self.mask_compute_kwargs['mask_length'],
+                self.mask_compute_kwargs['mask_selection'],
+                self.mask_compute_kwargs['mask_other'],
+                min_masks=2,
+                no_overlap=self.mask_compute_kwargs['no_mask_overlap'],
+                min_space=self.mask_compute_kwargs['mask_min_space'],
+            )
+            mask_indices = torch.from_numpy(mask_indices)
+        if self.mask_compute_kwargs['mask_channel_prob'] > 0:
+            mask_channel_indices = compute_mask_indices(
+                (B, C),
+                None,
+                self.mask_compute_kwargs['mask_channel_prob'],
+                self.mask_compute_kwargs['mask_channel_length'],
+                self.mask_compute_kwargs['mask_channel_selection'],
+                self.mask_compute_kwargs['mask_channel_other'],
+                no_overlap=self.mask_compute_kwargs['no_mask_channel_overlap'],
+                min_space=self.mask_compute_kwargs['mask_channel_min_space'],
+            )
+            mask_channel_indices = (
+                torch.from_numpy(mask_channel_indices)
+                .unsqueeze(1)
+                .expand(-1, T, -1)
+            )
+
+        return mask_indices, mask_channel_indices
+
+    @staticmethod
+    def _bucket_tensor(tensor, num_pad, value):
+        return F.pad(tensor, (0, num_pad), value=value)
+
     def collater(self, samples):
         samples = [s for s in samples if s["source"] is not None]
         if len(samples) == 0:
@@ -103,9 +153,55 @@ def collater(self, samples):
                 collated_sources[i] = self.crop_to_max_size(source, target_size)
 
         input = {"source": collated_sources}
+        out = {"id": torch.LongTensor([s["id"] for s in samples])}
         if self.pad:
             input["padding_mask"] = padding_mask
-        return {"id": torch.LongTensor([s["id"] for s in samples]), "net_input": input}
+
+        if hasattr(self, 'num_buckets') and self.num_buckets > 0:
+            assert self.pad, "Cannot bucket without padding first."
+            bucket = max(self._bucketed_sizes[s['id']] for s in samples)
+            num_pad = bucket - collated_sources.size(-1)
+            if num_pad:
+                input['source'] = self._bucket_tensor(
+                    collated_sources, num_pad, 0
+                )
+                input['padding_mask'] = self._bucket_tensor(
+                    padding_mask, num_pad, True
+                )
+
+        if self.compute_mask_indices:
+            B = input['source'].size(0)
+            T = self._get_mask_indices_dims(input['source'].size(-1))
+            padding_mask_reshaped = input['padding_mask'].clone()
+            extra = padding_mask_reshaped.size(1) % T
+            if extra > 0:
+                padding_mask_reshaped = padding_mask_reshaped[:, :-extra]
+            padding_mask_reshaped = padding_mask_reshaped.view(
+                padding_mask_reshaped.size(0), T, -1
+            )
+            padding_mask_reshaped = padding_mask_reshaped.all(-1)
+            input['padding_count'] = (
+                padding_mask_reshaped.sum(-1).max().item()
+            )
+            mask_indices, mask_channel_indices = self._compute_mask_indices(
+                (B, T, self._C), padding_mask_reshaped,
+            )
+            input["mask_indices"] = mask_indices
+            input["mask_channel_indices"] = mask_channel_indices
+            out['sample_size'] = mask_indices.sum().item()
+
+        out["net_input"] = input
+        return out
+
+    def _get_mask_indices_dims(self, size, padding=0, dilation=1):
+        if size not in self._features_size_map:
+            L_in = size
+            for (_, kernel_size, stride) in self._conv_feature_layers:
+                L_out = L_in + 2*padding - dilation*(kernel_size-1) - 1
+                L_out = 1 + L_out // stride
+                L_in = L_out
+            self._features_size_map[size] = L_out
+        return self._features_size_map[size]
 
     def num_tokens(self, index):
         return self.size(index)
@@ -141,6 +237,9 @@ def __init__(
         min_length=0,
         pad=False,
         normalize=False,
+        num_buckets=0,
+        compute_mask_indices=False,
+        **mask_compute_kwargs,
     ):
         super().__init__(
             sample_rate=sample_rate,
@@ -150,6 +249,8 @@ def __init__(
             min_length=min_length,
             pad=pad,
             normalize=normalize,
+            compute_mask_indices=compute_mask_indices,
+            **mask_compute_kwargs,
         )
 
         self.fnames = []
@@ -168,8 +269,26 @@ def __init__(
                 self.fnames.append(items[0])
                 self.line_inds.add(i)
                 self.sizes.append(sz)
+        self.set_bucket_info(num_buckets)
         logger.info(f"loaded {len(self.fnames)}, skipped {skipped} samples")
 
+    def set_bucket_info(self, num_buckets):
+        self.num_buckets = num_buckets
+        if self.num_buckets > 0:
+            self._collated_sizes = np.minimum(
+                np.array(self.sizes), self.max_sample_size,
+            )
+            self.buckets = get_buckets(
+                self._collated_sizes, self.num_buckets,
+            )
+            self._bucketed_sizes = get_bucketed_sizes(
+                self._collated_sizes, self.buckets
+            )
+            logger.info(
+                f"{len(self.buckets)} bucket(s) for the audio dataset: "
+                f"{self.buckets}"
+            )
+
     def __getitem__(self, index):
         import soundfile as sf
 
 
@@ -6,6 +6,7 @@
 import numpy as np
 import torch.nn.functional as F
 from fairseq.data import BaseWrapperDataset
+from fairseq.data.data_utils import get_buckets, get_bucketed_sizes
 
 
 class BucketPadLengthDataset(BaseWrapperDataset):
@@ -29,42 +30,43 @@ def __init__(
         num_buckets,
         pad_idx,
         left_pad,
+        tensor_key=None,
     ):
         super().__init__(dataset)
         self.pad_idx = pad_idx
         self.left_pad = left_pad
 
         assert num_buckets > 0
-        self.buckets = np.unique(
-            np.percentile(
-                sizes,
-                np.linspace(0, 100, num_buckets + 1),
-                interpolation="lower",
-            )[1:]
-        )
+        self.buckets = get_buckets(sizes, num_buckets)
+        self._bucketed_sizes = get_bucketed_sizes(sizes, self.buckets)
+        self._tensor_key = tensor_key
 
-        def get_bucketed_sizes(orig_sizes, buckets):
-            sizes = np.copy(orig_sizes)
-            assert np.min(sizes) >= 0
-            start_val = -1
-            for end_val in buckets:
-                mask = (sizes > start_val) & (sizes <= end_val)
-                sizes[mask] = end_val
-                start_val = end_val
-            return sizes
+    def _set_tensor(self, item, val):
+        if self._tensor_key is None:
+            return val
+        item[self._tensor_key] = val
+        return item
 
-        self._bucketed_sizes = get_bucketed_sizes(sizes, self.buckets)
+    def _get_tensor(self, item):
+        if self._tensor_key is None:
+            return item
+        return item[self._tensor_key]
 
-    def __getitem__(self, index):
-        item = self.dataset[index]
-        bucket_size = self._bucketed_sizes[index]
-        num_pad = bucket_size - item.size(-1)
+    def _pad(self, tensor, bucket_size, dim=-1):
+        num_pad = bucket_size - tensor.size(dim)
         return F.pad(
-            item,
+            tensor,
             (num_pad if self.left_pad else 0, 0 if self.left_pad else num_pad),
             value=self.pad_idx,
         )
 
+    def __getitem__(self, index):
+        item = self.dataset[index]
+        bucket_size = self._bucketed_sizes[index]
+        tensor = self._get_tensor(item)
+        padded = self._pad(tensor, bucket_size)
+        return self._set_tensor(item, padded)
+
     @property
     def sizes(self):
         return self._bucketed_sizes