From b002d0096ea82c77be7027b205d0b29de82edde4 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Fri, 19 Jul 2019 06:30:30 -0700 Subject: [PATCH 001/213] v0.7.1 -> v0.7.2 (#891) Summary: No major API changes since the last release. Cutting a new release since we'll be merging significant (possibly breaking) changes to logging, data loading and the masked LM implementation soon. Pull Request resolved: https://github.com/pytorch/fairseq/pull/891 Differential Revision: D16377132 Pulled By: myleott fbshipit-source-id: f1cb88e671ccd510e53334d0f449fe18585268c7 --- docs/conf.py | 4 ++-- fairseq/__init__.py | 2 +- setup.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 7239bd3c19..7f108940d0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -60,9 +60,9 @@ # built documents. # # The short X.Y version. -version = '0.7.1' +version = '0.7.2' # The full version, including alpha/beta/rc tags. -release = '0.7.1' +release = '0.7.2' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/fairseq/__init__.py b/fairseq/__init__.py index c3f174e67c..90ddc77812 100644 --- a/fairseq/__init__.py +++ b/fairseq/__init__.py @@ -6,7 +6,7 @@ # can be found in the PATENTS file in the same directory. __all__ = ['pdb'] -__version__ = '0.7.1' +__version__ = '0.7.2' import fairseq.criterions import fairseq.models diff --git a/setup.py b/setup.py index 19cea38980..7c965010a7 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ setup( name='fairseq', - version='0.7.1', + version='0.7.2', description='Facebook AI Research Sequence-to-Sequence Toolkit', url='https://github.com/pytorch/fairseq', classifiers=[ From be5821b82bcb5f700693d805c31ba8d20c41cd01 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Fri, 19 Jul 2019 06:34:20 -0700 Subject: [PATCH 002/213] Switch to torch.nn.functional.gelu when available Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/735 Differential Revision: D16377046 Pulled By: myleott fbshipit-source-id: 9725d4a3ce6b2fc8cee0b1d1cb8921f9d59c551a --- fairseq/modules/gelu.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fairseq/modules/gelu.py b/fairseq/modules/gelu.py index 0b2fe833a8..998610943b 100644 --- a/fairseq/modules/gelu.py +++ b/fairseq/modules/gelu.py @@ -21,4 +21,7 @@ def gelu_accurate(x): def gelu(x: torch.Tensor) -> torch.Tensor: - return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + if hasattr(torch.nn.functional, 'gelu'): + return torch.nn.functional.gelu(x.float()).type_as(x) + else: + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) From 8af555426980b775b9804ad2172fd34e4e818c9c Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Fri, 19 Jul 2019 06:34:46 -0700 Subject: [PATCH 003/213] Improve interactive generation (support --tokenizer and --bpe) Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/734 Differential Revision: D16377044 Pulled By: myleott fbshipit-source-id: 37d5553d76aa7c653113fec089f59710281c31d7 --- docs/getting_started.rst | 18 +++++++----- examples/translation/README.md | 11 ++++---- examples/translation_moe/README.md | 9 +++--- fairseq/data/transforms/__init__.py | 2 +- fairseq/data/transforms/moses_tokenizer.py | 18 ++++++------ fairseq/data/transforms/space_tokenizer.py | 2 +- fairseq/data/transforms/subword_nmt_bpe.py | 2 ++ fairseq/sequence_generator.py | 6 ++-- interactive.py | 32 +++++++++++++--------- 9 files changed, 56 insertions(+), 44 deletions(-) diff 
--git a/docs/getting_started.rst b/docs/getting_started.rst index 13ef2fa9aa..a5fa17246c 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -19,23 +19,27 @@ flag to :ref:`fairseq-generate`. Prior to BPE, input text needs to be tokenized using ``tokenizer.perl`` from `mosesdecoder `__. -Let's use :ref:`fairseq-interactive` to generate translations -interactively. Here, we use a beam size of 5: +Let's use :ref:`fairseq-interactive` to generate translations interactively. +Here, we use a beam size of 5 and preprocess the input with the Moses +tokenizer and the given Byte-Pair Encoding vocabulary. It will automatically +remove the BPE continuation markers and detokenize the output. .. code-block:: console > MODEL_DIR=wmt14.en-fr.fconv-py > fairseq-interactive \ --path $MODEL_DIR/model.pt $MODEL_DIR \ - --beam 5 --source-lang en --target-lang fr + --beam 5 --source-lang en --target-lang fr \ + --tokenizer moses \ + --bpe subword_nmt --bpe-codes $MODEL_DIR/bpecodes | loading model(s) from wmt14.en-fr.fconv-py/model.pt | [en] dictionary: 44206 types | [fr] dictionary: 44463 types | Type the input sentence and press return: - > Why is it rare to discover new marine mam@@ mal species ? - O Why is it rare to discover new marine mam@@ mal species ? - H -0.1525060087442398 Pourquoi est @-@ il rare de découvrir de nouvelles espèces de mammifères marins ? - P -0.2221 -0.3122 -0.1289 -0.2673 -0.1711 -0.1930 -0.1101 -0.1660 -0.1003 -0.0740 -0.1101 -0.0814 -0.1238 -0.0985 -0.1288 + Why is it rare to discover new marine mammal species? + S-0 Why is it rare to discover new marine mam@@ mal species ? + H-0 -0.0643349438905716 Pourquoi est-il rare de découvrir de nouvelles espèces de mammifères marins? + P-0 -0.0763 -0.1849 -0.0956 -0.0946 -0.0735 -0.1150 -0.1301 -0.0042 -0.0321 -0.0171 -0.0052 -0.0062 -0.0015 This generation script produces three types of outputs: a line prefixed with *O* is a copy of the original source sentence; *H* is the diff --git a/examples/translation/README.md b/examples/translation/README.md index 7e46cae5b6..537da259f6 100644 --- a/examples/translation/README.md +++ b/examples/translation/README.md @@ -244,11 +244,12 @@ $ SRC=de $ sacrebleu --test-set iwslt17 --language-pair ${SRC}-en --echo src \ | python scripts/spm_encode.py --model examples/translation/iwslt17.de_fr.en.bpe16k/sentencepiece.bpe.model \ > iwslt17.test.${SRC}-en.${SRC}.bpe -$ cat iwslt17.test.${SRC}-en.${SRC}.bpe | fairseq-interactive data-bin/iwslt17.de_fr.en.bpe16k/ \ - --task multilingual_translation --source-lang ${SRC} --target-lang en \ - --path checkpoints/multilingual_transformer/checkpoint_best.pt \ - --buffer 2000 --batch-size 128 \ - --beam 5 --remove-bpe=sentencepiece \ +$ cat iwslt17.test.${SRC}-en.${SRC}.bpe \ + | fairseq-interactive data-bin/iwslt17.de_fr.en.bpe16k/ \ + --task multilingual_translation --source-lang ${SRC} --target-lang en \ + --path checkpoints/multilingual_transformer/checkpoint_best.pt \ + --buffer 2000 --batch-size 128 \ + --beam 5 --remove-bpe=sentencepiece \ > iwslt17.test.${SRC}-en.en.sys $ grep ^H iwslt17.test.${SRC}-en.en.sys | cut -f3 \ | sacrebleu --test-set iwslt17 --language-pair ${SRC}-en diff --git a/examples/translation_moe/README.md b/examples/translation_moe/README.md index 378471f2c3..6a4af48b6d 100644 --- a/examples/translation_moe/README.md +++ b/examples/translation_moe/README.md @@ -58,11 +58,12 @@ Next apply BPE on the fly and run generation for each expert: $ BPEROOT=examples/translation/subword-nmt/ $ 
BPE_CODE=examples/translation/wmt17_en_de/code $ for EXPERT in $(seq 0 2); do \ - cat wmt14-en-de.extra_refs.tok | grep ^S | cut -f 2 | \ - python $BPEROOT/apply_bpe.py -c $BPE_CODE | \ - fairseq-interactive data-bin/wmt17_en_de \ + cat wmt14-en-de.extra_refs.tok \ + | grep ^S | cut -f 2 \ + | fairseq-interactive data-bin/wmt17_en_de \ --path checkpoints/checkpoint_best.pt \ - --beam 1 --remove-bpe \ + --beam 1 \ + --bpe subword_nmt --bpe-codes $BPE_CODE \ --buffer-size 500 --max-tokens 6000 \ --task translation_moe \ --method hMoElp --mean-pool-gating-network \ diff --git a/fairseq/data/transforms/__init__.py b/fairseq/data/transforms/__init__.py index 81822d108b..a64954edd6 100644 --- a/fairseq/data/transforms/__init__.py +++ b/fairseq/data/transforms/__init__.py @@ -14,7 +14,7 @@ build_tokenizer, register_tokenizer, TOKENIZER_REGISTRY = registry.setup_registry( '--tokenizer', - default='space', + default=None, ) diff --git a/fairseq/data/transforms/moses_tokenizer.py b/fairseq/data/transforms/moses_tokenizer.py index 469298478b..dc6016d914 100644 --- a/fairseq/data/transforms/moses_tokenizer.py +++ b/fairseq/data/transforms/moses_tokenizer.py @@ -14,13 +14,13 @@ class MosesTokenizer(object): @staticmethod def add_args(parser): # fmt: off - parser.add_argument('-s', '--source-lang', default='en', metavar='SRC', + parser.add_argument('--moses-source-lang', default='en', metavar='SRC', help='source language') - parser.add_argument('-t', '--target-lang', default='en', metavar='TARGET', + parser.add_argument('--moses-target-lang', default='en', metavar='TARGET', help='target language') - parser.add_argument('--aggressive-dash-splits', action='store_true', default=False, - help='triggers dash split rules') - parser.add_argument('--no-escape', action='store_true', default=False, + parser.add_argument('--moses-no-dash-splits', action='store_true', default=False, + help='don\'t apply dash split rules') + parser.add_argument('--moses-no-escape', action='store_true', default=False, help='don\'t perform HTML escaping on apostrophy, quotes, etc.') # fmt: on @@ -28,17 +28,17 @@ def __init__(self, args): self.args = args try: from sacremoses import MosesTokenizer, MosesDetokenizer - self.tok = MosesTokenizer(args.source_lang) - self.detok = MosesDetokenizer(args.target_lang) + self.tok = MosesTokenizer(args.moses_source_lang) + self.detok = MosesDetokenizer(args.moses_target_lang) except ImportError: raise ImportError('Please install Moses tokenizer with: pip install sacremoses') def encode(self, x: str) -> str: return self.tok.tokenize( x, - aggressive_dash_splits=self.args.aggressive_dash_splits, + aggressive_dash_splits=(not self.args.moses_no_dash_splits), return_str=True, - escape=(not self.args.no_escape), + escape=(not self.args.moses_no_escape), ) def decode(self, x: str) -> str: diff --git a/fairseq/data/transforms/space_tokenizer.py b/fairseq/data/transforms/space_tokenizer.py index 4ac3ed510f..95d68a45d6 100644 --- a/fairseq/data/transforms/space_tokenizer.py +++ b/fairseq/data/transforms/space_tokenizer.py @@ -17,7 +17,7 @@ def __init__(self, source_lang=None, target_lang=None): self.space_tok = re.compile(r"\s+") def encode(self, x: str) -> str: - return self.space_tok.sub(" ", x).strip().split() + return self.space_tok.sub(' ', x) def decode(self, x: str) -> str: return x diff --git a/fairseq/data/transforms/subword_nmt_bpe.py b/fairseq/data/transforms/subword_nmt_bpe.py index 1d582ec003..c5f2722340 100644 --- a/fairseq/data/transforms/subword_nmt_bpe.py +++ 
b/fairseq/data/transforms/subword_nmt_bpe.py @@ -22,6 +22,8 @@ def add_args(parser): # fmt: on def __init__(self, args): + if args.bpe_codes is None: + raise ValueError('--bpe-codes is required for --bpe=subword_nmt') codes = file_utils.cached_path(args.bpe_codes) try: from subword_nmt import apply_bpe diff --git a/fairseq/sequence_generator.py b/fairseq/sequence_generator.py index 392f062fb1..93e4fe1b53 100644 --- a/fairseq/sequence_generator.py +++ b/fairseq/sequence_generator.py @@ -160,7 +160,7 @@ def generate( scores_buf = scores.clone() tokens = src_tokens.data.new(bsz * beam_size, max_len + 2).long().fill_(self.pad) tokens_buf = tokens.clone() - tokens[:, 0] = bos_token or self.eos + tokens[:, 0] = self.eos if bos_token is None else bos_token attn, attn_buf = None, None nonpad_idxs = None if prefix_tokens is not None: @@ -618,10 +618,8 @@ def _decode_one( decoder_out[0].div_(temperature) attn = decoder_out[1] if type(attn) is dict: - attn = attn['attn'] + attn = attn.get('attn', None) if attn is not None: - if type(attn) is dict: - attn = attn['attn'] attn = attn[:, -1, :] probs = model.get_normalized_probs(decoder_out, log_probs=log_probs) probs = probs[:, -1, :] diff --git a/interactive.py b/interactive.py index d7946aabf3..eea7fcbda4 100644 --- a/interactive.py +++ b/interactive.py @@ -15,6 +15,7 @@ import torch from fairseq import checkpoint_utils, options, tasks, utils +from fairseq.data import transforms Batch = namedtuple('Batch', 'ids src_tokens src_lengths') @@ -101,17 +102,23 @@ def main(args): # Initialize generator generator = task.build_generator(args) - # Hack to support GPT-2 BPE - if args.remove_bpe == 'gpt2': - from fairseq.gpt2_bpe.gpt2_encoding import get_encoder - decoder = get_encoder( - 'fairseq/gpt2_bpe/encoder.json', - 'fairseq/gpt2_bpe/vocab.bpe', - ) - encode_fn = lambda x: ' '.join(map(str, decoder.encode(x))) - else: - decoder = None - encode_fn = lambda x: x + # Handle tokenization and BPE + tokenizer = transforms.build_tokenizer(args) + bpe = transforms.build_bpe(args) + + def encode_fn(x): + if tokenizer is not None: + x = tokenizer.encode(x) + if bpe is not None: + x = bpe.encode(x) + return x + + def decode_fn(x): + if bpe is not None: + x = bpe.decode(x) + if tokenizer is not None: + x = tokenizer.decode(x) + return x # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) @@ -162,8 +169,7 @@ def main(args): tgt_dict=tgt_dict, remove_bpe=args.remove_bpe, ) - if decoder is not None: - hypo_str = decoder.decode(map(int, hypo_str.strip().split())) + hypo_str = decode_fn(hypo_str) print('H-{}\t{}\t{}'.format(id, hypo['score'], hypo_str)) print('P-{}\t{}'.format( id, From c811e0e02d06f8d5fd6a0b738546b0e200c706cd Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Fri, 19 Jul 2019 08:08:16 -0700 Subject: [PATCH 004/213] Store task in the criterion base class Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/737 Differential Revision: D16377805 Pulled By: myleott fbshipit-source-id: 1e090a02ff4fbba8695173f57d3cc5b88ae98bbf --- fairseq/criterions/fairseq_criterion.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fairseq/criterions/fairseq_criterion.py b/fairseq/criterions/fairseq_criterion.py index fbe5afc8e8..4c167ac511 100644 --- a/fairseq/criterions/fairseq_criterion.py +++ b/fairseq/criterions/fairseq_criterion.py @@ -13,6 +13,7 @@ class FairseqCriterion(_Loss): def __init__(self, args, task): super().__init__() self.args = args + self.task = task 
self.padding_idx = task.target_dictionary.pad() if task.target_dictionary is not None else -100 @staticmethod From ffe53d6fbc9b36668cf3c9bdf1cc786730fdee79 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Fri, 19 Jul 2019 08:08:22 -0700 Subject: [PATCH 005/213] Create standalone label_smoothed_nll_loss Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/739 Differential Revision: D16377798 Pulled By: myleott fbshipit-source-id: 20047c80de2e6f108269ace4ae3eec906a5920dd --- .../label_smoothed_cross_entropy.py | 34 +++++++++++++------ 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/fairseq/criterions/label_smoothed_cross_entropy.py b/fairseq/criterions/label_smoothed_cross_entropy.py index c5e974c129..71448bf10b 100644 --- a/fairseq/criterions/label_smoothed_cross_entropy.py +++ b/fairseq/criterions/label_smoothed_cross_entropy.py @@ -12,6 +12,26 @@ from . import FairseqCriterion, register_criterion +def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=None, reduce=True): + if target.dim() == lprobs.dim() - 1: + target = target.unsqueeze(-1) + nll_loss = -lprobs.gather(dim=-1, index=target) + smooth_loss = -lprobs.sum(dim=-1, keepdim=True) + if ignore_index is not None: + non_pad_mask = target.ne(ignore_index) + nll_loss = nll_loss[non_pad_mask] + smooth_loss = smooth_loss[non_pad_mask] + else: + nll_loss = nll_loss.squeeze(-1) + smooth_loss = smooth_loss.squeeze(-1) + if reduce: + nll_loss = nll_loss.sum() + smooth_loss = smooth_loss.sum() + eps_i = epsilon / lprobs.size(-1) + loss = (1. - epsilon) * nll_loss + eps_i * smooth_loss + return loss, nll_loss + + @register_criterion('label_smoothed_cross_entropy') class LabelSmoothedCrossEntropyCriterion(FairseqCriterion): @@ -51,17 +71,9 @@ def compute_loss(self, model, net_output, sample, reduce=True): lprobs = model.get_normalized_probs(net_output, log_probs=True) lprobs = lprobs.view(-1, lprobs.size(-1)) target = model.get_targets(sample, net_output).view(-1, 1) - non_pad_mask = target.ne(self.padding_idx) - if reduce: - nll_loss = -lprobs.gather(dim=-1, index=target).masked_fill_(1.0-non_pad_mask, 0.0) - nll_loss = nll_loss.sum() - smooth_loss = -lprobs.sum(dim=-1, keepdim=True).masked_fill_(1.0-non_pad_mask, 0.0) - smooth_loss = smooth_loss.sum() - else: - nll_loss = -lprobs.gather(dim=-1, index=target)[non_pad_mask] - smooth_loss = -lprobs.sum(dim=-1, keepdim=True)[non_pad_mask] - eps_i = self.eps / lprobs.size(-1) - loss = (1. 
- self.eps) * nll_loss + eps_i * smooth_loss + loss, nll_loss = label_smoothed_nll_loss( + lprobs, target, self.eps, ignore_index=self.padding_idx, reduce=reduce, + ) return loss, nll_loss @staticmethod From 7efde2261f78e9e8d20e637e252bdd9977ec9290 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Fri, 19 Jul 2019 08:10:22 -0700 Subject: [PATCH 006/213] Allow not specifying --warmup-init-lr Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/736 Differential Revision: D16378001 Pulled By: myleott fbshipit-source-id: 2907f63bcbf7068ceaa48b00096040fa2639e569 --- fairseq/optim/lr_scheduler/inverse_square_root_schedule.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py b/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py index a3a48f04fb..fb33ea0cdb 100644 --- a/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py +++ b/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py @@ -37,7 +37,7 @@ def __init__(self, args, optimizer): ) warmup_end_lr = args.lr[0] if args.warmup_init_lr < 0: - args.warmup_init_lr = warmup_end_lr + args.warmup_init_lr = 0 if args.warmup_updates > 0 else warmup_end_lr # linearly warmup for the first args.warmup_updates self.lr_step = (warmup_end_lr - args.warmup_init_lr) / args.warmup_updates From 69d0f7f826e58a7e07bc431afd74ce2c54c63eb6 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Fri, 19 Jul 2019 13:10:23 -0700 Subject: [PATCH 007/213] Rename _load_model_ensemble -> load_model_ensemble_and_task Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/738 Differential Revision: D16377803 Pulled By: myleott fbshipit-source-id: 6beb2f78e7464b70ff65a965d2b747cdca0ca951 --- fairseq/checkpoint_utils.py | 4 ++-- fairseq/models/fairseq_model.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py index 66e16ea16d..0284fa2aa1 100644 --- a/fairseq/checkpoint_utils.py +++ b/fairseq/checkpoint_utils.py @@ -153,11 +153,11 @@ def load_model_ensemble(filenames, arg_overrides=None, task=None): were used during model training task (fairseq.tasks.FairseqTask, optional): task to use for loading """ - ensemble, args, _task = _load_model_ensemble(filenames, arg_overrides, task) + ensemble, args, _task = load_model_ensemble_and_task(filenames, arg_overrides, task) return ensemble, args -def _load_model_ensemble(filenames, arg_overrides=None, task=None): +def load_model_ensemble_and_task(filenames, arg_overrides=None, task=None): from fairseq import tasks ensemble = [] diff --git a/fairseq/models/fairseq_model.py b/fairseq/models/fairseq_model.py index 1d534188f8..8f52adc7de 100644 --- a/fairseq/models/fairseq_model.py +++ b/fairseq/models/fairseq_model.py @@ -191,7 +191,7 @@ def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_na if os.path.exists(path): kwargs[arg] = path - models, args, task = checkpoint_utils._load_model_ensemble( + models, args, task = checkpoint_utils.load_model_ensemble_and_task( [os.path.join(model_path, cpt) for cpt in checkpoint_file.split(':')], arg_overrides=kwargs, ) From f812e5295610d8a6467fe7212d8a47ce16d8d081 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sun, 21 Jul 2019 03:48:34 -0700 Subject: [PATCH 008/213] Rename data.transforms -> data.encoders Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/747 Differential Revision: D16403464 Pulled By: myleott fbshipit-source-id: 
ee3b4184f129a02be833c7bdc00685978b4de883 --- fairseq/data/{transforms => encoders}/__init__.py | 4 ++-- fairseq/data/{transforms => encoders}/gpt2_bpe.py | 2 +- fairseq/data/{transforms => encoders}/moses_tokenizer.py | 2 +- fairseq/data/{transforms => encoders}/nltk_tokenizer.py | 2 +- fairseq/data/{transforms => encoders}/sentencepiece_bpe.py | 2 +- fairseq/data/{transforms => encoders}/space_tokenizer.py | 2 +- fairseq/data/{transforms => encoders}/subword_nmt_bpe.py | 2 +- fairseq/hub_utils.py | 6 +++--- interactive.py | 6 +++--- 9 files changed, 14 insertions(+), 14 deletions(-) rename fairseq/data/{transforms => encoders}/__init__.py (83%) rename fairseq/data/{transforms => encoders}/gpt2_bpe.py (99%) rename fairseq/data/{transforms => encoders}/moses_tokenizer.py (96%) rename fairseq/data/{transforms => encoders}/nltk_tokenizer.py (93%) rename fairseq/data/{transforms => encoders}/sentencepiece_bpe.py (95%) rename fairseq/data/{transforms => encoders}/space_tokenizer.py (91%) rename fairseq/data/{transforms => encoders}/subword_nmt_bpe.py (97%) diff --git a/fairseq/data/transforms/__init__.py b/fairseq/data/encoders/__init__.py similarity index 83% rename from fairseq/data/transforms/__init__.py rename to fairseq/data/encoders/__init__.py index a64954edd6..1e7e69fbea 100644 --- a/fairseq/data/transforms/__init__.py +++ b/fairseq/data/encoders/__init__.py @@ -24,8 +24,8 @@ ) -# automatically import any Python files in the transforms/ directory +# automatically import any Python files in the encoders/ directory for file in os.listdir(os.path.dirname(__file__)): if file.endswith('.py') and not file.startswith('_'): module = file[:file.find('.py')] - importlib.import_module('fairseq.data.transforms.' + module) + importlib.import_module('fairseq.data.encoders.' + module) diff --git a/fairseq/data/transforms/gpt2_bpe.py b/fairseq/data/encoders/gpt2_bpe.py similarity index 99% rename from fairseq/data/transforms/gpt2_bpe.py rename to fairseq/data/encoders/gpt2_bpe.py index 38361a49ac..b411a5aa39 100644 --- a/fairseq/data/transforms/gpt2_bpe.py +++ b/fairseq/data/encoders/gpt2_bpe.py @@ -6,7 +6,7 @@ # can be found in the PATENTS file in the same directory. from fairseq import file_utils -from fairseq.data.transforms import register_bpe +from fairseq.data.encoders import register_bpe @register_bpe('gpt2') diff --git a/fairseq/data/transforms/moses_tokenizer.py b/fairseq/data/encoders/moses_tokenizer.py similarity index 96% rename from fairseq/data/transforms/moses_tokenizer.py rename to fairseq/data/encoders/moses_tokenizer.py index dc6016d914..4964a822c2 100644 --- a/fairseq/data/transforms/moses_tokenizer.py +++ b/fairseq/data/encoders/moses_tokenizer.py @@ -5,7 +5,7 @@ # the root directory of this source tree. An additional grant of patent rights # can be found in the PATENTS file in the same directory. -from fairseq.data.transforms import register_tokenizer +from fairseq.data.encoders import register_tokenizer @register_tokenizer('moses') diff --git a/fairseq/data/transforms/nltk_tokenizer.py b/fairseq/data/encoders/nltk_tokenizer.py similarity index 93% rename from fairseq/data/transforms/nltk_tokenizer.py rename to fairseq/data/encoders/nltk_tokenizer.py index 206243c7fb..61325efc42 100644 --- a/fairseq/data/transforms/nltk_tokenizer.py +++ b/fairseq/data/encoders/nltk_tokenizer.py @@ -5,7 +5,7 @@ # the root directory of this source tree. An additional grant of patent rights # can be found in the PATENTS file in the same directory. 
-from fairseq.data.transforms import register_tokenizer +from fairseq.data.encoders import register_tokenizer @register_tokenizer('nltk') diff --git a/fairseq/data/transforms/sentencepiece_bpe.py b/fairseq/data/encoders/sentencepiece_bpe.py similarity index 95% rename from fairseq/data/transforms/sentencepiece_bpe.py rename to fairseq/data/encoders/sentencepiece_bpe.py index 11b8dfe20c..9b27460194 100644 --- a/fairseq/data/transforms/sentencepiece_bpe.py +++ b/fairseq/data/encoders/sentencepiece_bpe.py @@ -6,7 +6,7 @@ # can be found in the PATENTS file in the same directory. from fairseq import file_utils -from fairseq.data.transforms import register_bpe +from fairseq.data.encoders import register_bpe @register_bpe('sentencepiece') diff --git a/fairseq/data/transforms/space_tokenizer.py b/fairseq/data/encoders/space_tokenizer.py similarity index 91% rename from fairseq/data/transforms/space_tokenizer.py rename to fairseq/data/encoders/space_tokenizer.py index 95d68a45d6..b804b969d8 100644 --- a/fairseq/data/transforms/space_tokenizer.py +++ b/fairseq/data/encoders/space_tokenizer.py @@ -7,7 +7,7 @@ import re -from fairseq.data.transforms import register_tokenizer +from fairseq.data.encoders import register_tokenizer @register_tokenizer('space') diff --git a/fairseq/data/transforms/subword_nmt_bpe.py b/fairseq/data/encoders/subword_nmt_bpe.py similarity index 97% rename from fairseq/data/transforms/subword_nmt_bpe.py rename to fairseq/data/encoders/subword_nmt_bpe.py index c5f2722340..b2c1fa33b9 100644 --- a/fairseq/data/transforms/subword_nmt_bpe.py +++ b/fairseq/data/encoders/subword_nmt_bpe.py @@ -6,7 +6,7 @@ # can be found in the PATENTS file in the same directory. from fairseq import file_utils -from fairseq.data.transforms import register_bpe +from fairseq.data.encoders import register_bpe @register_bpe('subword_nmt') diff --git a/fairseq/hub_utils.py b/fairseq/hub_utils.py index 1218b9da7c..02c3291fda 100644 --- a/fairseq/hub_utils.py +++ b/fairseq/hub_utils.py @@ -9,7 +9,7 @@ import torch from fairseq import utils -from fairseq.data import transforms +from fairseq.data import encoders class Generator(object): @@ -44,8 +44,8 @@ def __init__(self, args, task, models): # (None if no unknown word replacement, empty if no path to align dictionary) self.align_dict = utils.load_align_dict(getattr(args, 'replace_unk', None)) - self.tokenizer = transforms.build_tokenizer(args) - self.bpe = transforms.build_bpe(args) + self.tokenizer = encoders.build_tokenizer(args) + self.bpe = encoders.build_bpe(args) def generate(self, src_str, verbose=False): diff --git a/interactive.py b/interactive.py index eea7fcbda4..632a16f3ed 100644 --- a/interactive.py +++ b/interactive.py @@ -15,7 +15,7 @@ import torch from fairseq import checkpoint_utils, options, tasks, utils -from fairseq.data import transforms +from fairseq.data import encoders Batch = namedtuple('Batch', 'ids src_tokens src_lengths') @@ -103,8 +103,8 @@ def main(args): generator = task.build_generator(args) # Handle tokenization and BPE - tokenizer = transforms.build_tokenizer(args) - bpe = transforms.build_bpe(args) + tokenizer = encoders.build_tokenizer(args) + bpe = encoders.build_bpe(args) def encode_fn(x): if tokenizer is not None: From 1f96d284bea7dac46f3781e95e32a54117238f3a Mon Sep 17 00:00:00 2001 From: Liang Wang Date: Sun, 21 Jul 2019 07:17:50 -0700 Subject: [PATCH 009/213] Fix topp sampling issues (#882) Summary: Two issues here: 1. 
`last_included` should be the last included index `cumsum_mask[:, :, -1:]` instead of `cumsum_mask[:, :, :1]` (which is either 0 or 1);
2. If `--no-repeat-ngram-size` is set, the sum of `probs` may be less than 1, so we need to re-normalize to make it a valid probability distribution.

The following code can reproduce this issue:
```
import torch
import numpy as np


def _sample_topp(probs):
    # ===== Code from fairseq/search.py _sample_topp ======
    # sort the last dimension (vocab dimension) in descending order
    sorted_probs, sorted_indices = probs.sort(descending=True)

    # compute a mask to indicate the words to be included in the top-P set.
    cumsum_probs = sorted_probs.cumsum(dim=2)
    mask = cumsum_probs.lt(sampling_topp)

    # note that mask was computed by 'lt'. One more word needs to be included
    # so that the cumulative probability mass can exceed p.
    cumsum_mask = mask.cumsum(dim=2)
    last_included = cumsum_mask[:, :, :1]
    mask = mask.scatter_(2, last_included, 1)

    # truncate unnecessary dims.
    max_dim = last_included.max()
    truncated_mask = mask[:, :, :max_dim + 1]
    truncated_probs = sorted_probs[:, :, :max_dim + 1]
    truncated_indices = sorted_indices[:, :, :max_dim + 1]

    # trim the words that are not in top-P by setting their probabilities
    # to 0, so that they would not be sampled later.
    trim_mask = 1 - truncated_mask
    trimed_probs = truncated_probs.masked_fill_(trim_mask, 0)
    return trimed_probs, truncated_indices
    # ========================================================


if __name__ == '__main__':
    np.random.seed(1234)
    torch.manual_seed(1234)

    sampling_topp = 0.9
    probs = torch.softmax(torch.randn(1, 1, 10), dim=-1)
    # probs = tensor([0.0545, 0.0779, 0.0189, 0.0647, 0.0282, 0.0862, 0.0656, 0.1041, 0.0399, 0.4600])
    print('probs =', probs[0][0])

    trimed_probs, truncated_indices = _sample_topp(probs)
    cum_probs = trimed_probs.cumsum(dim=-1)[0][0]
    # cumsum = tensor([0.4600, 0.5641])
    print('cumsum =', cum_probs)
    # Will throw AssertionError
    assert float(cum_probs[-1]) >= sampling_topp
```
Pull Request resolved: https://github.com/pytorch/fairseq/pull/882
Differential Revision: D16409269
Pulled By: xingz9
fbshipit-source-id: 94b1122eed50c656057b64e22af6f4a6ea7a68af
---
 fairseq/search.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fairseq/search.py b/fairseq/search.py
index 9f47f3328f..742453468a 100644
--- a/fairseq/search.py
+++ b/fairseq/search.py
@@ -202,7 +202,8 @@ def _sample_topp(self, lprobs):
         # note that mask was computed by 'lt'. One more word needs to be included
         # so that the cumulative probability mass can exceed p.
         cumsum_mask = mask.cumsum(dim=2)
-        last_included = cumsum_mask[:, :, :1]
+        last_included = cumsum_mask[:, :, -1:]
+        last_included.clamp_(0, mask.size()[2] - 1)
         mask = mask.scatter_(2, last_included, 1)

         # truncate unnecessary dims.
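[Editor's note: the block below is a minimal standalone sketch of the corrected top-p truncation, not part of the patch above. It mirrors the fixed `_sample_topp` logic but runs on a plain probability tensor; the function name `sample_topp` and the final check are illustrative, and it assumes a recent PyTorch where comparison ops return bool tensors.]
```
import torch


def sample_topp(probs: torch.Tensor, sampling_topp: float):
    """Keep the smallest set of words whose cumulative probability exceeds p.

    probs: probabilities of shape (bsz, beam, vocab).
    Returns (trimmed_probs, truncated_indices) over the truncated vocab dim.
    """
    # sort the vocab dimension in descending order
    sorted_probs, sorted_indices = probs.sort(descending=True)

    # mark the words whose cumulative probability is still below the threshold
    cumsum_probs = sorted_probs.cumsum(dim=2)
    mask = cumsum_probs.lt(sampling_topp)

    # the fix from the patch above: take the *last* included position
    # (cumsum_mask[:, :, -1:]) rather than the first, clamp it to a valid
    # index, and include that one extra word so the kept mass can exceed p
    cumsum_mask = mask.cumsum(dim=2)
    last_included = cumsum_mask[:, :, -1:]
    last_included.clamp_(0, mask.size(2) - 1)
    mask = mask.scatter_(2, last_included, 1)

    # truncate unnecessary dims and zero out everything outside the top-p set
    max_dim = last_included.max()
    truncated_mask = mask[:, :, :max_dim + 1]
    truncated_probs = sorted_probs[:, :, :max_dim + 1]
    truncated_indices = sorted_indices[:, :, :max_dim + 1]
    trimmed_probs = truncated_probs.masked_fill_(~truncated_mask, 0)
    return trimmed_probs, truncated_indices


if __name__ == '__main__':
    torch.manual_seed(1234)
    probs = torch.softmax(torch.randn(1, 1, 10), dim=-1)
    trimmed_probs, _ = sample_topp(probs, sampling_topp=0.9)
    # with the fix, the retained mass now reaches the threshold (the repro
    # script above fails this check); small tolerance for float rounding
    assert trimmed_probs.sum().item() >= 0.9 - 1e-6
```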
From 5f78106a2d0fd91f6a977d350d999f07b45d5396 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sun, 21 Jul 2019 12:42:50 -0700 Subject: [PATCH 010/213] Default to mmap and infer dataset implementations automatically Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/751 Differential Revision: D16410989 Pulled By: myleott fbshipit-source-id: ddbbee49756f9ff6c4487977a3f5d2259b7abafe --- fairseq/data/data_utils.py | 53 ++++++++++++++++++++++++++++-- fairseq/data/indexed_dataset.py | 28 +++++++++++++--- fairseq/options.py | 11 ++++--- fairseq/tasks/cross_lingual_lm.py | 6 ++-- fairseq/tasks/language_modeling.py | 50 +++++++++------------------- fairseq/tasks/translation.py | 10 +++--- scripts/read_binarized.py | 2 +- 7 files changed, 106 insertions(+), 54 deletions(-) diff --git a/fairseq/data/data_utils.py b/fairseq/data/data_utils.py index fe4557a533..f731ee730a 100644 --- a/fairseq/data/data_utils.py +++ b/fairseq/data/data_utils.py @@ -5,13 +5,15 @@ # the root directory of this source tree. An additional grant of patent rights # can be found in the PATENTS file in the same directory. -import contextlib -import os -import numpy as np try: from collections.abc import Iterable except ImportError: from collections import Iterable +import contextlib +import itertools +import os + +import numpy as np def infer_language_pair(path): @@ -43,6 +45,51 @@ def copy_tensor(src, dst): return res +def load_indexed_dataset(path, dictionary, dataset_impl=None, combine=False): + """A helper function for loading indexed datasets. + + Args: + path (str): path to indexed dataset (e.g., 'data-bin/train') + dictionary (~fairseq.data.Dictionary): data dictionary + dataset_impl (str, optional): which dataset implementation to use. If + not provided, it will be inferred automatically. For legacy indexed + data we use the 'cached' implementation by default. + combine (bool, optional): automatically load and combine multiple + datasets. For example, if *path* is 'data-bin/train', then we will + combine 'data-bin/train', 'data-bin/train1', ... and return a + single ConcatDataset instance. 
+ """ + from fairseq.data.concat_dataset import ConcatDataset + import fairseq.data.indexed_dataset as indexed_dataset + + datasets = [] + for k in itertools.count(): + path_k = path + (str(k) if k > 0 else '') + + dataset_impl_k = dataset_impl + if dataset_impl_k is None: + dataset_impl_k = indexed_dataset.infer_dataset_impl(path_k) + + dataset = indexed_dataset.make_dataset( + path_k, + impl=dataset_impl_k or 'cached', + fix_lua_indexing=True, + dictionary=dictionary, + ) + if dataset is None: + break + print('| loaded {} examples from: {}'.format(len(dataset), path_k)) + datasets.append(dataset) + if not combine: + break + if len(datasets) == 0: + return None + elif len(datasets) == 1: + return datasets[0] + else: + return ConcatDataset(datasets) + + @contextlib.contextmanager def numpy_seed(seed, *addl_seeds): """Context manager which seeds the NumPy PRNG with the specified seed and diff --git a/fairseq/data/indexed_dataset.py b/fairseq/data/indexed_dataset.py index 7fddf285e4..9a801ff3d0 100644 --- a/fairseq/data/indexed_dataset.py +++ b/fairseq/data/indexed_dataset.py @@ -22,6 +22,26 @@ def __best_fitting_dtype(vocab_size=None): return np.int32 +def get_available_dataset_impl(): + return ['raw', 'lazy', 'cached', 'mmap'] + + +def infer_dataset_impl(path): + if IndexedRawTextDataset.exists(path): + return 'raw' + elif IndexedDataset.exists(path): + with open(index_file_path(path), 'rb') as f: + magic = f.read(8) + if magic == IndexedDataset._HDR_MAGIC: + return 'cached' + elif magic == MMapIndexedDataset.Index._HDR_MAGIC[:8]: + return 'mmap' + else: + return None + else: + return None + + def make_builder(out_file, impl, vocab_size=None): if impl == 'mmap': return MMapIndexedDatasetBuilder(out_file, dtype=__best_fitting_dtype(vocab_size)) @@ -39,7 +59,6 @@ def make_dataset(path, impl, fix_lua_indexing=False, dictionary=None): return IndexedCachedDataset(path, fix_lua_indexing=fix_lua_indexing) elif impl == 'mmap' and MMapIndexedDataset.exists(path): return MMapIndexedDataset(path) - return None @@ -91,6 +110,7 @@ def data_file_path(prefix_path): class IndexedDataset(FairseqDataset): """Loader for TorchNet IndexedDataset""" + _HDR_MAGIC = b'TNTIDX\x00\x00' def __init__(self, path, fix_lua_indexing=False): super().__init__() @@ -102,7 +122,7 @@ def __init__(self, path, fix_lua_indexing=False): def read_index(self, path): with open(index_file_path(path), 'rb') as f: magic = f.read(8) - assert magic == b'TNTIDX\x00\x00', ( + assert magic == self._HDR_MAGIC, ( 'Index file doesn\'t match expected format. ' 'Make sure that --dataset-impl is configured properly.' 
) @@ -151,7 +171,7 @@ def size(self, index): @staticmethod def exists(path): return ( - os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)) + os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)) ) @property @@ -465,7 +485,7 @@ def supports_prefetch(self): @staticmethod def exists(path): return ( - os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)) + os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)) ) diff --git a/fairseq/options.py b/fairseq/options.py index d3522481a3..41d466d948 100644 --- a/fairseq/options.py +++ b/fairseq/options.py @@ -11,6 +11,7 @@ import sys from fairseq import utils +from fairseq.data.indexed_dataset import get_available_dataset_impl def get_preprocessing_parser(default_task='translation'): @@ -233,8 +234,9 @@ def add_preprocess_args(parser): help="number of source words to retain") group.add_argument("--alignfile", metavar="ALIGN", default=None, help="an alignment file (optional)") - parser.add_argument('--dataset-impl', metavar="FORMAT", help='output dataset implementation', - choices=['raw', 'lazy', 'cached', 'mmap'], default='cached') + parser.add_argument('--dataset-impl', metavar='FORMAT', default='mmap', + choices=get_available_dataset_impl(), + help='output dataset implementation') group.add_argument("--joined-dictionary", action="store_true", help="Generate joined dictionary") group.add_argument("--only-source", action="store_true", @@ -260,8 +262,9 @@ def add_dataset_args(parser, train=False, gen=False): help='maximum number of sentences in a batch') group.add_argument('--required-batch-size-multiple', default=8, type=int, metavar='N', help='batch size will be a multiplier of this value') - parser.add_argument('--dataset-impl', metavar="FORMAT", help='output dataset implementation', - choices=['raw', 'lazy', 'cached', 'mmap'], default='cached') + parser.add_argument('--dataset-impl', metavar='FORMAT', + choices=get_available_dataset_impl(), + help='output dataset implementation') if train: group.add_argument('--train-subset', default='train', metavar='SPLIT', choices=['train', 'valid', 'test'], diff --git a/fairseq/tasks/cross_lingual_lm.py b/fairseq/tasks/cross_lingual_lm.py index f0a377f7fd..456f6c2f29 100644 --- a/fairseq/tasks/cross_lingual_lm.py +++ b/fairseq/tasks/cross_lingual_lm.py @@ -17,6 +17,7 @@ from fairseq.data import ( ConcatDataset, + data_utils, indexed_dataset, TokenBlockDataset, ) @@ -114,10 +115,7 @@ def _load_single_lang_dataset(self, split, epoch): split_k = split + (str(k) if k > 0 else '') path = os.path.join(data_path, split_k) - ds = indexed_dataset.make_dataset( - path, impl=self.args.dataset_impl, fix_lua_indexing=True, - dictionary=self.dictionary, - ) + ds = data_utils.load_indexed_dataset(path, self.dictionary, self.args.dataset_impl) if ds is None: if k > 0: break diff --git a/fairseq/tasks/language_modeling.py b/fairseq/tasks/language_modeling.py index b52b7174b4..a5b3470982 100644 --- a/fairseq/tasks/language_modeling.py +++ b/fairseq/tasks/language_modeling.py @@ -14,6 +14,7 @@ from fairseq import utils from fairseq.data import ( ConcatDataset, + data_utils, Dictionary, MonolingualDataset, TokenBlockDataset, @@ -152,49 +153,30 @@ def load_dataset(self, split, epoch=0, combine=False, **kwargs): Args: split (str): name of the split (e.g., train, valid, test) """ - - loaded_datasets = [] - paths = self.args.data.split(':') assert len(paths) > 0 data_path = paths[epoch % len(paths)] + split_path = 
os.path.join(data_path, split) - for k in itertools.count(): - split_k = split + (str(k) if k > 0 else '') - path = os.path.join(data_path, split_k) - ds = indexed_dataset.make_dataset(path, impl=self.args.dataset_impl, - fix_lua_indexing=True, dictionary=self.dictionary) - - if ds is None: - if k > 0: - break - else: - raise FileNotFoundError('Dataset not found: {} ({})'.format(split, data_path)) - - loaded_datasets.append( - TokenBlockDataset( - ds, ds.sizes, self.args.tokens_per_sample, - pad=self.dictionary.pad(), eos=self.dictionary.eos(), - break_mode=self.args.sample_break_mode, include_targets=True, - ) - ) - - print('| {} {} {} examples'.format(data_path, split_k, len(loaded_datasets[-1]))) - - if not combine: - break + dataset = data_utils.load_indexed_dataset( + split_path, + self.dictionary, + self.args.dataset_impl, + combine=combine, + ) + if dataset is None: + raise FileNotFoundError('Dataset not found: {} ({})'.format(split, split_path)) - if len(loaded_datasets) == 1: - dataset = loaded_datasets[0] - sizes = dataset.sizes - else: - dataset = ConcatDataset(loaded_datasets) - sizes = np.concatenate([ds.sizes for ds in loaded_datasets]) + dataset = TokenBlockDataset( + dataset, dataset.sizes, self.args.tokens_per_sample, + pad=self.dictionary.pad(), eos=self.dictionary.eos(), + break_mode=self.args.sample_break_mode, include_targets=True, + ) add_eos_for_other_targets = self.args.sample_break_mode is not None and self.args.sample_break_mode != 'none' self.datasets[split] = MonolingualDataset( - dataset, sizes, self.dictionary, self.output_dictionary, + dataset, dataset.sizes, self.dictionary, self.output_dictionary, add_eos_for_other_targets=add_eos_for_other_targets, shuffle=True, targets=self.targets, add_bos_token=self.args.add_bos_token, ) diff --git a/fairseq/tasks/translation.py b/fairseq/tasks/translation.py index 9e5bf41c7c..80e1c2960a 100644 --- a/fairseq/tasks/translation.py +++ b/fairseq/tasks/translation.py @@ -47,10 +47,12 @@ def split_exists(split, src, tgt, lang, data_path): else: raise FileNotFoundError('Dataset not found: {} ({})'.format(split, data_path)) - src_datasets.append(indexed_dataset.make_dataset(prefix + src, impl=dataset_impl, - fix_lua_indexing=True, dictionary=src_dict)) - tgt_datasets.append(indexed_dataset.make_dataset(prefix + tgt, impl=dataset_impl, - fix_lua_indexing=True, dictionary=tgt_dict)) + src_datasets.append( + data_utils.load_indexed_dataset(prefix + src, src_dict, dataset_impl) + ) + tgt_datasets.append( + data_utils.load_indexed_dataset(prefix + tgt, tgt_dict, dataset_impl) + ) print('| {} {} {}-{} {} examples'.format(data_path, split_k, src, tgt, len(src_datasets[-1]))) diff --git a/scripts/read_binarized.py b/scripts/read_binarized.py index 79edd51234..f8242a89c2 100644 --- a/scripts/read_binarized.py +++ b/scripts/read_binarized.py @@ -17,7 +17,7 @@ def get_parser(): description='writes text from binarized file to stdout') # fmt: off parser.add_argument('--dataset-impl', help='dataset implementation', - choices=['raw', 'lazy', 'cached', 'mmap'], default='lazy') + choices=indexed_dataset.get_available_dataset_impl()) parser.add_argument('--dict', metavar='FP', help='dictionary containing known words', default=None) parser.add_argument('--input', metavar='FP', required=True, help='binarized file to read') # fmt: on From 62b5498bebfc7825b277e488ec6eca2558e92295 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sun, 21 Jul 2019 12:44:30 -0700 Subject: [PATCH 011/213] Update GPT-2 BPE Summary: Pull Request resolved: 
https://github.com/fairinternal/fairseq-py/pull/749 Differential Revision: D16410984 Pulled By: myleott fbshipit-source-id: 7698df46b8a179afccb287990f9705358690454a --- fairseq/data/encoders/gpt2_bpe.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fairseq/data/encoders/gpt2_bpe.py b/fairseq/data/encoders/gpt2_bpe.py index b411a5aa39..e1d01ee6b4 100644 --- a/fairseq/data/encoders/gpt2_bpe.py +++ b/fairseq/data/encoders/gpt2_bpe.py @@ -9,6 +9,10 @@ from fairseq.data.encoders import register_bpe +DEFAULT_ENCODER_JSON = 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json' +DEFAULT_VOCAB_BPE = 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe' + + @register_bpe('gpt2') class GPT2BPE(object): @@ -16,16 +20,20 @@ class GPT2BPE(object): def add_args(parser): # fmt: off parser.add_argument('--gpt2-encoder-json', type=str, - default='https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json', + default=DEFAULT_ENCODER_JSON, help='path to encoder.json') parser.add_argument('--gpt2-vocab-bpe', type=str, - default='https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe', + default=DEFAULT_VOCAB_BPE, help='path to vocab.bpe') # fmt: on def __init__(self, args): - encoder_json = file_utils.cached_path(args.gpt2_encoder_json) - vocab_bpe = file_utils.cached_path(args.gpt2_vocab_bpe) + encoder_json = file_utils.cached_path( + getattr(args, 'gpt2_encoder_json', DEFAULT_ENCODER_JSON) + ) + vocab_bpe = file_utils.cached_path( + getattr(args, 'gpt2_vocab_bpe', DEFAULT_VOCAB_BPE) + ) self.bpe = get_encoder(encoder_json, vocab_bpe) def encode(self, x: str) -> str: From 9c89e8821ee8df810c8b46e45002202a89106612 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sun, 21 Jul 2019 17:10:49 -0700 Subject: [PATCH 012/213] Misc improvements to torch hub interface Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/750 Differential Revision: D16410986 Pulled By: myleott fbshipit-source-id: 8ee6b4371d6ae5b041b00a54a6039a422345795e --- fairseq/hub_utils.py | 45 +++++++++++++++++++++++++++++++++ fairseq/models/fairseq_model.py | 44 ++++++++------------------------ fairseq/registry.py | 2 +- 3 files changed, 56 insertions(+), 35 deletions(-) diff --git a/fairseq/hub_utils.py b/fairseq/hub_utils.py index 02c3291fda..c532075b6b 100644 --- a/fairseq/hub_utils.py +++ b/fairseq/hub_utils.py @@ -6,12 +6,57 @@ # the root directory of this source tree. An additional grant of patent rights # can be found in the PATENTS file in the same directory. 
+import os + import torch from fairseq import utils from fairseq.data import encoders +def from_pretrained( + model_name_or_path, + checkpoint_file='model.pt', + data_name_or_path='.', + archive_map=None, + **kwargs, +): + from fairseq import checkpoint_utils, file_utils + + if archive_map is not None: + if model_name_or_path in archive_map: + model_name_or_path = archive_map[model_name_or_path] + if data_name_or_path is not None and data_name_or_path in archive_map: + data_name_or_path = archive_map[data_name_or_path] + + model_path = file_utils.load_archive_file(model_name_or_path) + + # convenience hack for loading data and BPE codes from model archive + if data_name_or_path.startswith('.'): + kwargs['data'] = os.path.abspath(os.path.join(model_path, data_name_or_path)) + else: + kwargs['data'] = file_utils.load_archive_file(data_name_or_path) + for file, arg in { + 'code': 'bpe_codes', + 'bpecodes': 'bpe_codes', + 'sentencepiece.bpe.model': 'sentencepiece_vocab', + }.items(): + path = os.path.join(model_path, file) + if os.path.exists(path): + kwargs[arg] = path + + models, args, task = checkpoint_utils.load_model_ensemble_and_task( + [os.path.join(model_path, cpt) for cpt in checkpoint_file.split(':')], + arg_overrides=kwargs, + ) + + return { + 'args': args, + 'task': task, + 'models': models, + } + + class Generator(object): """PyTorch Hub API for generating sequences from a pre-trained translation or language model.""" diff --git a/fairseq/models/fairseq_model.py b/fairseq/models/fairseq_model.py index 8f52adc7de..78b1308617 100644 --- a/fairseq/models/fairseq_model.py +++ b/fairseq/models/fairseq_model.py @@ -144,7 +144,7 @@ def apply_prepare_for_onnx_export_(module): self.apply(apply_prepare_for_onnx_export_) @classmethod - def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path=None, **kwargs): + def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='.', **kwargs): """ Load a :class:`~fairseq.models.FairseqModel` from a pre-trained model file. Downloads and caches the pre-trained model file if needed. @@ -165,40 +165,16 @@ def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_na at the given path/URL. Can start with '.' or './' to reuse the model archive path. 
""" - from fairseq import checkpoint_utils, file_utils, hub_utils - - if hasattr(cls, 'hub_models'): - archive_map = cls.hub_models() - if model_name_or_path in archive_map: - model_name_or_path = archive_map[model_name_or_path] - if data_name_or_path is not None and data_name_or_path in archive_map: - data_name_or_path = archive_map[data_name_or_path] - - model_path = file_utils.load_archive_file(model_name_or_path) - - # convenience hack for loading data and BPE codes from model archive - if data_name_or_path is not None: - if data_name_or_path.startswith('.'): - kwargs['data'] = os.path.abspath(os.path.join(model_path, data_name_or_path)) - else: - kwargs['data'] = file_utils.load_archive_file(data_name_or_path) - for file, arg in { - 'code': 'bpe_codes', - 'bpecodes': 'bpe_codes', - 'sentencepiece.bpe.model': 'sentencepiece_vocab', - }.items(): - path = os.path.join(model_path, file) - if os.path.exists(path): - kwargs[arg] = path - - models, args, task = checkpoint_utils.load_model_ensemble_and_task( - [os.path.join(model_path, cpt) for cpt in checkpoint_file.split(':')], - arg_overrides=kwargs, + from fairseq import hub_utils + x = hub_utils.from_pretrained( + model_name_or_path, + checkpoint_file, + data_name_or_path, + archive_map=cls.hub_models(), + **kwargs, ) - - print(args) - - return hub_utils.Generator(args, task, models) + print(x['args']) + return hub_utils.Generator(x['args'], x['task'], x['models']) @classmethod def hub_models(cls): diff --git a/fairseq/registry.py b/fairseq/registry.py index 8b26913873..25168e588d 100644 --- a/fairseq/registry.py +++ b/fairseq/registry.py @@ -22,7 +22,7 @@ def setup_registry( # maintain a registry of all registries if registry_name in REGISTRIES: - raise ValueError('Canot setup duplicate registry: {}'.format(registry_name)) + return # registry already exists REGISTRIES[registry_name] = { 'registry': REGISTRY, 'default': default, From 47fd985269e92735826c05d9160d68dc8e8a9807 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sun, 21 Jul 2019 19:30:03 -0700 Subject: [PATCH 013/213] Move Masked LM components to legacy/ -- new ones are coming Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/740 Differential Revision: D16377797 Pulled By: myleott fbshipit-source-id: f7d6c8b00a77e279ea94376b1f0fcd15087eaf5f --- examples/cross_lingual_language_model/README.md | 2 +- .../{masked_lm_loss.py => legacy_masked_lm.py} | 4 ++-- fairseq/data/__init__.py | 7 ------- fairseq/data/legacy/__init__.py | 17 +++++++++++++++++ fairseq/data/{ => legacy}/block_pair_dataset.py | 2 +- fairseq/data/{ => legacy}/masked_lm_dataset.py | 4 ++-- .../data/{ => legacy}/masked_lm_dictionary.py | 0 .../models/transformer_from_pretrained_xlm.py | 2 +- fairseq/tasks/cross_lingual_lm.py | 4 ++-- .../tasks/{masked_lm.py => legacy_masked_lm.py} | 10 +++++----- .../tasks/translation_from_pretrained_xlm.py | 2 +- tests/test_binaries.py | 14 ++++++++------ 12 files changed, 40 insertions(+), 28 deletions(-) rename fairseq/criterions/{masked_lm_loss.py => legacy_masked_lm.py} (98%) create mode 100644 fairseq/data/legacy/__init__.py rename fairseq/data/{ => legacy}/block_pair_dataset.py (99%) rename fairseq/data/{ => legacy}/masked_lm_dataset.py (99%) rename fairseq/data/{ => legacy}/masked_lm_dictionary.py (100%) rename fairseq/tasks/{masked_lm.py => legacy_masked_lm.py} (94%) diff --git a/examples/cross_lingual_language_model/README.md b/examples/cross_lingual_language_model/README.md index 74423f11a8..a78f86d8da 100644 --- 
a/examples/cross_lingual_language_model/README.md +++ b/examples/cross_lingual_language_model/README.md @@ -63,7 +63,7 @@ fairseq-train \ --optimizer adam --lr-scheduler reduce_lr_on_plateau \ --lr-shrink 0.5 --lr 0.0001 --min-lr 1e-09 \ --dropout 0.1 \ ---criterion masked_lm_loss \ +--criterion legacy_masked_lm_loss \ --max-tokens 2048 --tokens-per-sample 256 --attention-dropout 0.1 \ --dataset-impl lazy --seed 0 \ --masked-lm-only \ diff --git a/fairseq/criterions/masked_lm_loss.py b/fairseq/criterions/legacy_masked_lm.py similarity index 98% rename from fairseq/criterions/masked_lm_loss.py rename to fairseq/criterions/legacy_masked_lm.py index f1f86ce385..ac7fb9d445 100644 --- a/fairseq/criterions/masked_lm_loss.py +++ b/fairseq/criterions/legacy_masked_lm.py @@ -32,8 +32,8 @@ def compute_cross_entropy_loss(logits, targets, ignore_index=-100): return loss -@register_criterion('masked_lm_loss') -class MaskedLmLoss(FairseqCriterion): +@register_criterion('legacy_masked_lm_loss') +class LegacyMaskedLmLoss(FairseqCriterion): """ Implementation for the loss used in masked language model (MLM) training. This optionally also computes the next sentence prediction (NSP) loss and diff --git a/fairseq/data/__init__.py b/fairseq/data/__init__.py index ffee78429e..dcc0bd5474 100644 --- a/fairseq/data/__init__.py +++ b/fairseq/data/__init__.py @@ -6,18 +6,15 @@ # can be found in the PATENTS file in the same directory. from .dictionary import Dictionary, TruncatedDictionary -from .masked_lm_dictionary import BertDictionary, MaskedLMDictionary from .fairseq_dataset import FairseqDataset from .audio.raw_audio_dataset import RawAudioDataset from .backtranslation_dataset import BacktranslationDataset -from .block_pair_dataset import BlockPairDataset from .concat_dataset import ConcatDataset from .indexed_dataset import IndexedCachedDataset, IndexedDataset, IndexedRawTextDataset, MMapIndexedDataset from .language_pair_dataset import LanguagePairDataset from .lm_context_window_dataset import LMContextWindowDataset -from .masked_lm_dataset import MaskedLMDataset from .monolingual_dataset import MonolingualDataset from .noising import NoisingDataset from .round_robin_zip_datasets import RoundRobinZipDatasets @@ -34,8 +31,6 @@ __all__ = [ 'BacktranslationDataset', - 'BertDictionary', - 'BlockPairDataset', 'ConcatDataset', 'CountingIterator', 'Dictionary', @@ -47,8 +42,6 @@ 'IndexedRawTextDataset', 'LanguagePairDataset', 'LMContextWindowDataset', - 'MaskedLMDataset', - 'MaskedLMDictionary', 'MMapIndexedDataset', 'MonolingualDataset', 'NoisingDataset', diff --git a/fairseq/data/legacy/__init__.py b/fairseq/data/legacy/__init__.py new file mode 100644 index 0000000000..df912ec648 --- /dev/null +++ b/fairseq/data/legacy/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. 
+ +from .masked_lm_dictionary import BertDictionary, MaskedLMDictionary +from .block_pair_dataset import BlockPairDataset +from .masked_lm_dataset import MaskedLMDataset + +__all__ = [ + 'BertDictionary', + 'BlockPairDataset', + 'MaskedLMDataset', + 'MaskedLMDictionary', +] diff --git a/fairseq/data/block_pair_dataset.py b/fairseq/data/legacy/block_pair_dataset.py similarity index 99% rename from fairseq/data/block_pair_dataset.py rename to fairseq/data/legacy/block_pair_dataset.py index 73145e8aee..db13f61c97 100644 --- a/fairseq/data/block_pair_dataset.py +++ b/fairseq/data/legacy/block_pair_dataset.py @@ -10,7 +10,7 @@ import numpy as np import torch -from . import FairseqDataset +from fairseq.data import FairseqDataset class BlockPairDataset(FairseqDataset): diff --git a/fairseq/data/masked_lm_dataset.py b/fairseq/data/legacy/masked_lm_dataset.py similarity index 99% rename from fairseq/data/masked_lm_dataset.py rename to fairseq/data/legacy/masked_lm_dataset.py index 258ae82ccf..864c5dcb67 100644 --- a/fairseq/data/masked_lm_dataset.py +++ b/fairseq/data/legacy/masked_lm_dataset.py @@ -12,10 +12,10 @@ from typing import Dict, List, Tuple -from . import FairseqDataset, data_utils +from fairseq.data import FairseqDataset, data_utils from fairseq.data import Dictionary -from fairseq.data.block_pair_dataset import BlockPairDataset +from fairseq.data.legacy.block_pair_dataset import BlockPairDataset from fairseq.data.token_block_dataset import TokenBlockDataset from fairseq.data.concat_dataset import ConcatDataset diff --git a/fairseq/data/masked_lm_dictionary.py b/fairseq/data/legacy/masked_lm_dictionary.py similarity index 100% rename from fairseq/data/masked_lm_dictionary.py rename to fairseq/data/legacy/masked_lm_dictionary.py diff --git a/fairseq/models/transformer_from_pretrained_xlm.py b/fairseq/models/transformer_from_pretrained_xlm.py index 5002caefe7..06c4a2ca92 100644 --- a/fairseq/models/transformer_from_pretrained_xlm.py +++ b/fairseq/models/transformer_from_pretrained_xlm.py @@ -9,7 +9,7 @@ from typing import Any, Dict from fairseq import checkpoint_utils -from fairseq.data.masked_lm_dictionary import MaskedLMDictionary +from fairseq.data.legacy.masked_lm_dictionary import MaskedLMDictionary from fairseq.models import register_model, register_model_architecture from fairseq.models.transformer import ( TransformerDecoder, diff --git a/fairseq/tasks/cross_lingual_lm.py b/fairseq/tasks/cross_lingual_lm.py index 456f6c2f29..6ad58e79fa 100644 --- a/fairseq/tasks/cross_lingual_lm.py +++ b/fairseq/tasks/cross_lingual_lm.py @@ -13,7 +13,7 @@ import numpy as np from fairseq import tokenizer -from fairseq.data.masked_lm_dictionary import MaskedLMDictionary +from fairseq.data.legacy.masked_lm_dictionary import MaskedLMDictionary from fairseq.data import ( ConcatDataset, @@ -23,7 +23,7 @@ ) from fairseq.data import Dictionary -from fairseq.data.masked_lm_dataset import MaskedLMDataset +from fairseq.data.legacy.masked_lm_dataset import MaskedLMDataset from fairseq.data.multi_corpus_sampled_dataset import MultiCorpusSampledDataset from . 
import FairseqTask, register_task diff --git a/fairseq/tasks/masked_lm.py b/fairseq/tasks/legacy_masked_lm.py similarity index 94% rename from fairseq/tasks/masked_lm.py rename to fairseq/tasks/legacy_masked_lm.py index 0b9a6f4f47..b4f2c93ac2 100644 --- a/fairseq/tasks/masked_lm.py +++ b/fairseq/tasks/legacy_masked_lm.py @@ -17,15 +17,15 @@ ) from fairseq.data import Dictionary -from fairseq.data.block_pair_dataset import BlockPairDataset -from fairseq.data.masked_lm_dataset import MaskedLMDataset -from fairseq.data.masked_lm_dictionary import BertDictionary +from fairseq.data.legacy.block_pair_dataset import BlockPairDataset +from fairseq.data.legacy.masked_lm_dataset import MaskedLMDataset +from fairseq.data.legacy.masked_lm_dictionary import BertDictionary from . import FairseqTask, register_task -@register_task('masked_lm') -class MaskedLMTask(FairseqTask): +@register_task('legacy_masked_lm') +class LegacyMaskedLMTask(FairseqTask): """ Task for training Masked LM (BERT) model. Args: diff --git a/fairseq/tasks/translation_from_pretrained_xlm.py b/fairseq/tasks/translation_from_pretrained_xlm.py index 941d2bee97..941634cf86 100644 --- a/fairseq/tasks/translation_from_pretrained_xlm.py +++ b/fairseq/tasks/translation_from_pretrained_xlm.py @@ -5,7 +5,7 @@ # the root directory of this source tree. An additional grant of patent rights # can be found in the PATENTS file in the same directory. -from fairseq.data.masked_lm_dictionary import MaskedLMDictionary +from fairseq.data.legacy.masked_lm_dictionary import MaskedLMDictionary from fairseq.tasks.translation import TranslationTask from . import register_task diff --git a/tests/test_binaries.py b/tests/test_binaries.py index 30cdcb88be..79650df6ba 100644 --- a/tests/test_binaries.py +++ b/tests/test_binaries.py @@ -263,19 +263,20 @@ def test_transformer_lm(self): class TestMaskedLanguageModel(unittest.TestCase): - def test_masked_lm(self): + + def test_legacy_masked_lm(self): with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_mlm") as data_dir: + with tempfile.TemporaryDirectory("test_legacy_mlm") as data_dir: create_dummy_data(data_dir) preprocess_lm_data(data_dir) - train_masked_language_model(data_dir, "masked_lm") + train_legacy_masked_language_model(data_dir, "masked_lm") def _test_pretrained_masked_lm_for_translation(self, learned_pos_emb, encoder_only): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_mlm") as data_dir: create_dummy_data(data_dir) preprocess_lm_data(data_dir) - train_masked_language_model( + train_legacy_masked_language_model( data_dir, arch="masked_lm", extra_args=('--encoder-learned-pos',) if learned_pos_emb else () @@ -332,7 +333,8 @@ def test_pretrained_masked_lm_for_translation_sinusoidal_pos_emb(self): def test_pretrained_masked_lm_for_translation_encoder_only(self): self._test_pretrained_masked_lm_for_translation(True, True) -def train_masked_language_model(data_dir, arch, extra_args=()): + +def train_legacy_masked_language_model(data_dir, arch, extra_args=()): train_parser = options.get_training_parser() # TODO: langs should be in and out right? 
train_args = options.parse_args_and_arch( @@ -361,7 +363,7 @@ def train_masked_language_model(data_dir, arch, extra_args=()): "0.1", # MLM args "--criterion", - "masked_lm_loss", + "legacy_masked_lm_loss", "--masked-lm-only", "--monolingual-langs", "in,out", From bccfa7d0067c05dbdce6b54f915480c78ddb896d Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Mon, 22 Jul 2019 06:10:03 -0700 Subject: [PATCH 014/213] Add fallback for SLURM config Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/752 Differential Revision: D16417582 Pulled By: myleott fbshipit-source-id: 6b4289febcf9290452bb91f1f2181a02c09c82a7 --- fairseq/distributed_utils.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fairseq/distributed_utils.py b/fairseq/distributed_utils.py index 202d30d75b..b6aa0d0492 100644 --- a/fairseq/distributed_utils.py +++ b/fairseq/distributed_utils.py @@ -48,7 +48,14 @@ def infer_init_method(args): port=args.distributed_port, ) nnodes = int(os.environ.get('SLURM_NNODES')) - ntasks_per_node = int(os.environ.get('SLURM_NTASKS_PER_NODE')) + ntasks_per_node = os.environ.get('SLURM_NTASKS_PER_NODE') + if ntasks_per_node is not None: + ntasks_per_node = int(ntasks_per_node) + else: + ntasks = int(os.environ.get('SLURM_NTASKS')) + nnodes = int(os.environ.get('SLURM_NNODES')) + assert ntasks % nnodes == 0 + ntasks_per_node = int(ntasks / nnodes) if ntasks_per_node == 1: assert args.distributed_world_size % nnodes == 0 gpus_per_node = args.distributed_world_size // nnodes From 906411da12dfaf4c2b37d46b48540a89bca7fa31 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Mon, 22 Jul 2019 07:29:46 -0700 Subject: [PATCH 015/213] Fix --reset-meters Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/756 Differential Revision: D16418302 Pulled By: myleott fbshipit-source-id: 62495a0bff41d1741e2b09807a3b43ff2c66c8fb --- fairseq/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fairseq/trainer.py b/fairseq/trainer.py index 250312928a..d2f27e05d1 100644 --- a/fairseq/trainer.py +++ b/fairseq/trainer.py @@ -184,7 +184,7 @@ def load_checkpoint( self.lr_step(epoch) - if 'train_meters' in extra_state: + if 'train_meters' in extra_state and not reset_meters: self.meters.update(extra_state['train_meters']) del extra_state['train_meters'] From 51ba35217debfbf34cdfbae14e09cb9df9c3be5b Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Mon, 22 Jul 2019 08:11:58 -0700 Subject: [PATCH 016/213] Simplify hubconf Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/758 Differential Revision: D16418932 Pulled By: myleott fbshipit-source-id: 59f005164b61b9fa712922eeb23525f7eec38f38 --- hubconf.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/hubconf.py b/hubconf.py index 1eb25f870a..992c259fa3 100644 --- a/hubconf.py +++ b/hubconf.py @@ -5,6 +5,8 @@ # the root directory of this source tree. An additional grant of patent rights # can be found in the PATENTS file in the same directory. 
+import functools + from fairseq.models import MODEL_REGISTRY @@ -18,5 +20,11 @@ ] -for model, cls in MODEL_REGISTRY.items(): - globals()[model] = cls.from_pretrained +for model_type, _cls in MODEL_REGISTRY.items(): + for model_name in _cls.hub_models().keys(): + globals()[model_name] = functools.partial( + _cls.from_pretrained, + model_name_or_path=model_name, + ) + # to simplify the interface we only expose named models + #globals()[model_type] = _cls.from_pretrained From 654affc03dd78b356c8dc6941609ca76a342c881 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Mon, 22 Jul 2019 08:55:12 -0700 Subject: [PATCH 017/213] Add new Datasets Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/757 Differential Revision: D16418305 Pulled By: myleott fbshipit-source-id: 25f293a2792509f7a75c688e4bf8cff02e6bba2e --- fairseq/data/__init__.py | 23 +++ fairseq/data/base_wrapper_dataset.py | 53 +++++++ fairseq/data/id_dataset.py | 22 +++ fairseq/data/lru_cache_dataset.py | 24 +++ fairseq/data/mask_tokens_dataset.py | 174 ++++++++++++++++++++++ fairseq/data/nested_dictionary_dataset.py | 118 +++++++++++++++ fairseq/data/num_samples_dataset.py | 20 +++ fairseq/data/numel_dataset.py | 34 +++++ fairseq/data/pad_dataset.py | 33 ++++ fairseq/data/prepend_token_dataset.py | 44 ++++++ fairseq/data/sort_dataset.py | 24 +++ 11 files changed, 569 insertions(+) create mode 100644 fairseq/data/base_wrapper_dataset.py create mode 100644 fairseq/data/id_dataset.py create mode 100644 fairseq/data/lru_cache_dataset.py create mode 100644 fairseq/data/mask_tokens_dataset.py create mode 100644 fairseq/data/nested_dictionary_dataset.py create mode 100644 fairseq/data/num_samples_dataset.py create mode 100644 fairseq/data/numel_dataset.py create mode 100644 fairseq/data/pad_dataset.py create mode 100644 fairseq/data/prepend_token_dataset.py create mode 100644 fairseq/data/sort_dataset.py diff --git a/fairseq/data/__init__.py b/fairseq/data/__init__.py index dcc0bd5474..7ff95af472 100644 --- a/fairseq/data/__init__.py +++ b/fairseq/data/__init__.py @@ -9,15 +9,26 @@ from .fairseq_dataset import FairseqDataset +from .base_wrapper_dataset import BaseWrapperDataset + from .audio.raw_audio_dataset import RawAudioDataset from .backtranslation_dataset import BacktranslationDataset from .concat_dataset import ConcatDataset +from .id_dataset import IdDataset from .indexed_dataset import IndexedCachedDataset, IndexedDataset, IndexedRawTextDataset, MMapIndexedDataset from .language_pair_dataset import LanguagePairDataset from .lm_context_window_dataset import LMContextWindowDataset +from .lru_cache_dataset import LRUCacheDataset +from .mask_tokens_dataset import MaskTokensDataset from .monolingual_dataset import MonolingualDataset +from .nested_dictionary_dataset import NestedDictionaryDataset from .noising import NoisingDataset +from .numel_dataset import NumelDataset +from .num_samples_dataset import NumSamplesDataset +from .pad_dataset import LeftPadDataset, PadDataset, RightPadDataset +from .prepend_token_dataset import PrependTokenDataset from .round_robin_zip_datasets import RoundRobinZipDatasets +from .sort_dataset import SortDataset from .token_block_dataset import TokenBlockDataset from .transform_eos_dataset import TransformEosDataset from .transform_eos_lang_pair_dataset import TransformEosLangPairDataset @@ -31,23 +42,35 @@ __all__ = [ 'BacktranslationDataset', + 'BaseWrapperDataset', 'ConcatDataset', 'CountingIterator', 'Dictionary', 'EpochBatchIterator', 'FairseqDataset', 'GroupedIterator', + 
'IdDataset', 'IndexedCachedDataset', 'IndexedDataset', 'IndexedRawTextDataset', 'LanguagePairDataset', + 'LeftPadDataset', 'LMContextWindowDataset', + 'LRUCacheDataset', + 'MaskTokensDataset', 'MMapIndexedDataset', 'MonolingualDataset', + 'NestedDictionaryDataset', 'NoisingDataset', + 'NumelDataset', + 'NumSamplesDataset', + 'PadDataset', + 'PrependTokenDataset', 'RawAudioDataset', + 'RightPadDataset', 'RoundRobinZipDatasets', 'ShardedIterator', + 'SortDataset', 'TokenBlockDataset', 'TransformEosDataset', 'TransformEosLangPairDataset', diff --git a/fairseq/data/base_wrapper_dataset.py b/fairseq/data/base_wrapper_dataset.py new file mode 100644 index 0000000000..2a46a5215c --- /dev/null +++ b/fairseq/data/base_wrapper_dataset.py @@ -0,0 +1,53 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +from torch.utils.data._utils.collate import default_collate + +from . import FairseqDataset + + +class BaseWrapperDataset(FairseqDataset): + + def __init__(self, dataset): + super().__init__() + self.dataset = dataset + + def __getitem__(self, index): + return self.dataset[index] + + def __len__(self): + return len(self.dataset) + + def collater(self, samples): + if hasattr(self.dataset, 'collater'): + return self.dataset.collater(samples) + else: + return default_collate(samples) + + @property + def sizes(self): + return self.dataset.sizes + + def num_tokens(self, index): + return self.dataset.num_tokens(index) + + def size(self, index): + return self.dataset.size(index) + + def ordered_indices(self): + return self.dataset.ordered_indices() + + @property + def supports_prefetch(self): + return getattr(self.dataset, 'supports_prefetch', False) + + def prefetch(self, indices): + self.dataset.prefetch(indices) + + def set_epoch(self, epoch): + super().set_epoch(epoch) + self.dataset.set_epoch(epoch) diff --git a/fairseq/data/id_dataset.py b/fairseq/data/id_dataset.py new file mode 100644 index 0000000000..a10423e1af --- /dev/null +++ b/fairseq/data/id_dataset.py @@ -0,0 +1,22 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import torch + +from . import FairseqDataset + + +class IdDataset(FairseqDataset): + + def __getitem__(self, index): + return index + + def __len__(self): + return 0 + + def collater(self, samples): + return torch.tensor(samples) diff --git a/fairseq/data/lru_cache_dataset.py b/fairseq/data/lru_cache_dataset.py new file mode 100644 index 0000000000..cea71731cc --- /dev/null +++ b/fairseq/data/lru_cache_dataset.py @@ -0,0 +1,24 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +from functools import lru_cache + +from . 
import BaseWrapperDataset + + +class LRUCacheDataset(BaseWrapperDataset): + + def __init__(self, dataset, token=None): + super().__init__(dataset) + + @lru_cache(maxsize=8) + def __getitem__(self, index): + return self.dataset[index] + + @lru_cache(maxsize=8) + def collater(self, samples): + return self.dataset.collater(samples) diff --git a/fairseq/data/mask_tokens_dataset.py b/fairseq/data/mask_tokens_dataset.py new file mode 100644 index 0000000000..6c64f66d25 --- /dev/null +++ b/fairseq/data/mask_tokens_dataset.py @@ -0,0 +1,174 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +from functools import lru_cache + +import numpy as np +import torch + +from fairseq.data import data_utils, Dictionary + +from . import BaseWrapperDataset, LRUCacheDataset + + +class MaskTokensDataset(BaseWrapperDataset): + """ + A wrapper Dataset for masked language modeling. + + Input items are masked according to the specified masking probability. + + Args: + dataset: Dataset to wrap. + sizes: Sentence lengths + vocab: Dictionary with the vocabulary and special tokens. + pad_idx: Id of pad token in vocab + mask_idx: Id of mask token in vocab + return_masked_tokens: controls whether to return the non-masked tokens + (the default) or to return a tensor with the original masked token + IDs (and *pad_idx* elsewhere). The latter is useful as targets for + masked LM training. + seed: Seed for random number generator for reproducibility. + mask_prob: probability of replacing a token with *mask_idx*. + leave_unmasked_prob: probability that a masked token is unmasked. + random_token_prob: probability of replacing a masked token with a + random token from the vocabulary. + freq_weighted_replacement: sample random replacement words based on + word frequencies in the vocab. + mask_whole_words: only mask whole words. This should be a byte mask + over vocab indices, indicating whether it is the beginning of a + word. We will extend any mask to encompass the whole word. + bpe: BPE to use for whole-word masking. 
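A minimal usage sketch of the wrapper described above (illustrative only — `token_dataset` and `vocab` are placeholder names, not defined in this patch; the call pattern mirrors how the masked LM task later in this series uses it): `apply_mask` wraps a single token dataset twice and returns the masked inputs together with the matching masked-LM targets.

```python
# Sketch, not part of the patch: vocab is a fairseq Dictionary and
# token_dataset any dataset of 1-D token tensors.
mask_idx = vocab.add_symbol('<mask>')
src_dataset, tgt_dataset = MaskTokensDataset.apply_mask(
    token_dataset,
    vocab,
    pad_idx=vocab.pad(),
    mask_idx=mask_idx,
    seed=1,
    mask_prob=0.15,
)
# src_dataset[i]: the sentence with ~15% of positions replaced
#                 (mostly by <mask>, occasionally left intact or randomized)
# tgt_dataset[i]: the original tokens at the masked positions, pad elsewhere
```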
+ """ + + @classmethod + def apply_mask(cls, dataset: torch.utils.data.Dataset, *args, **kwargs): + """Return the source and target datasets for masked LM training.""" + dataset = LRUCacheDataset(dataset) + return ( + LRUCacheDataset(cls(dataset, *args, **kwargs, return_masked_tokens=False)), + LRUCacheDataset(cls(dataset, *args, **kwargs, return_masked_tokens=True)), + ) + + def __init__( + self, + dataset: torch.utils.data.Dataset, + vocab: Dictionary, + pad_idx: int, + mask_idx: int, + return_masked_tokens: bool = False, + seed: int = 1, + mask_prob: float = 0.15, + leave_unmasked_prob: float = 0.1, + random_token_prob: float = 0.1, + freq_weighted_replacement: bool = False, + mask_whole_words: torch.Tensor = None, + ): + assert 0.0 < mask_prob < 1.0 + assert 0.0 <= random_token_prob <= 1.0 + assert 0.0 <= leave_unmasked_prob <= 1.0 + assert random_token_prob + leave_unmasked_prob <= 1.0 + + self.dataset = dataset + self.vocab = vocab + self.pad_idx = pad_idx + self.mask_idx = mask_idx + self.return_masked_tokens = return_masked_tokens + self.seed = seed + self.mask_prob = mask_prob + self.leave_unmasked_prob = leave_unmasked_prob + self.random_token_prob = random_token_prob + self.mask_whole_words = mask_whole_words + + if random_token_prob > 0.0: + if freq_weighted_replacement: + weights = np.array(self.vocab.count) + else: + weights = np.ones(len(self.vocab)) + weights[:self.vocab.nspecial] = 0 + self.weights = weights / weights.sum() + + self.epoch = 0 + + def set_epoch(self, epoch, **unused): + self.epoch = epoch + + @lru_cache(maxsize=8) + def __getitem__(self, index: int): + with data_utils.numpy_seed(self.seed, self.epoch, index): + item = self.dataset[index] + sz = len(item) + + assert self.mask_idx not in item, \ + 'Dataset contains mask_idx (={}), this is not expected!'.format( + self.mask_idx, + ) + + if self.mask_whole_words is not None: + word_begins_mask = self.mask_whole_words.gather(0, item) + word_begins_idx = word_begins_mask.nonzero().view(-1) + sz = len(word_begins_idx) + words = np.split(word_begins_mask, word_begins_idx)[1:] + assert len(words) == sz + word_lens = list(map(len, words)) + + # decide elements to mask + mask = np.full(sz, False) + num_mask = int( + # add a random number for probabilistic rounding + self.mask_prob * sz + np.random.rand() + ) + mask[np.random.choice(sz, num_mask, replace=False)] = True + + if self.return_masked_tokens: + # exit early if we're just returning the masked tokens + # (i.e., the targets for masked LM training) + if self.mask_whole_words is not None: + mask = np.repeat(mask, word_lens) + new_item = np.full(len(mask), self.pad_idx) + new_item[mask] = item[torch.from_numpy(mask)] + return torch.from_numpy(new_item) + + # decide unmasking and random replacement + rand_or_unmask_prob = self.random_token_prob + self.leave_unmasked_prob + if rand_or_unmask_prob > 0.0: + rand_or_unmask = mask & (np.random.rand(sz) < rand_or_unmask_prob) + if self.random_token_prob == 0.0: + unmask = rand_or_unmask + rand_mask = None + elif self.leave_unmasked_prob == 0.0: + unmask = None + rand_mask = rand_or_unmask + else: + unmask_prob = self.leave_unmasked_prob / rand_or_unmask_prob + decision = np.random.rand(sz) < unmask_prob + unmask = rand_or_unmask & decision + rand_mask = rand_or_unmask & (~decision) + else: + unmask = rand_mask = None + + if unmask is not None: + mask = mask ^ unmask + + if self.mask_whole_words is not None: + mask = np.repeat(mask, word_lens) + + new_item = np.copy(item) + new_item[mask] = self.mask_idx + if rand_mask is 
not None: + num_rand = rand_mask.sum() + if num_rand > 0: + if self.mask_whole_words is not None: + rand_mask = np.repeat(rand_mask, word_lens) + num_rand = rand_mask.sum() + + new_item[rand_mask] = np.random.choice( + len(self.vocab), + num_rand, + p=self.weights, + ) + + return torch.from_numpy(new_item) diff --git a/fairseq/data/nested_dictionary_dataset.py b/fairseq/data/nested_dictionary_dataset.py new file mode 100644 index 0000000000..385bf2324b --- /dev/null +++ b/fairseq/data/nested_dictionary_dataset.py @@ -0,0 +1,118 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +from collections import OrderedDict + +import torch +from torch.utils.data.dataloader import default_collate + +from . import FairseqDataset + + +def _flatten(dico, prefix=None): + """Flatten a nested dictionary.""" + new_dico = OrderedDict() + if isinstance(dico, dict): + prefix = prefix + '.' if prefix is not None else '' + for k, v in dico.items(): + if v is None: + continue + new_dico.update(_flatten(v, prefix + k)) + elif isinstance(dico, list): + for i, v in enumerate(dico): + new_dico.update(_flatten(v, prefix + '.[' + str(i) + ']')) + else: + new_dico = OrderedDict({prefix: dico}) + return new_dico + + +def _unflatten(dico): + """Unflatten a flattened dictionary into a nested dictionary.""" + new_dico = OrderedDict() + for full_k, v in dico.items(): + full_k = full_k.split('.') + node = new_dico + for k in full_k[:-1]: + if k.startswith('[') and k.endswith(']'): + k = int(k[1:-1]) + if k not in node: + node[k] = OrderedDict() + node = node[k] + node[full_k[-1]] = v + return new_dico + + +class NestedDictionaryDataset(FairseqDataset): + + def __init__(self, defn, sizes=None): + super().__init__() + self.defn = _flatten(defn) + self.sizes = [sizes] if not isinstance(sizes, (list, tuple)) else sizes + + first = None + for v in self.defn.values(): + if not isinstance(v, (FairseqDataset, torch.utils.data.Dataset, )): + raise ValueError('Expected Dataset but found: {}'.format(v.__class__)) + first = first or v + if len(v) > 0: + assert len(v) == len(first), 'dataset lengths must match' + + self._len = len(first) + + def __getitem__(self, index): + return OrderedDict((k, ds[index]) for k, ds in self.defn.items()) + + def __len__(self): + return self._len + + def collater(self, samples): + """Merge a list of samples to form a mini-batch. + + Args: + samples (List[dict]): samples to collate + + Returns: + dict: a mini-batch suitable for forwarding with a Model + """ + if len(samples) == 0: + return {} + sample = OrderedDict() + for k, ds in self.defn.items(): + try: + sample[k] = ds.collater([s[k] for s in samples]) + except NotImplementedError: + sample[k] = default_collate([s[k] for s in samples]) + return _unflatten(sample) + + def num_tokens(self, index): + """Return the number of tokens in a sample. This value is used to + enforce ``--max-tokens`` during batching.""" + return max(s[index] for s in self.sizes) + + def size(self, index): + """Return an example's size as a float or tuple. 
This value is used when + filtering a dataset with ``--max-positions``.""" + if len(self.sizes) == 1: + return self.sizes[0][index] + else: + return (s[index] for s in self.sizes) + + @property + def supports_prefetch(self): + """Whether this dataset supports prefetching.""" + return any(ds.supports_prefetch for ds in self.defn.values()) + + def prefetch(self, indices): + """Prefetch the data required for this epoch.""" + for ds in self.defn.values(): + if getattr(ds, 'supports_prefetch', False): + ds.prefetch(indices) + + def set_epoch(self, epoch): + super().set_epoch(epoch) + for ds in self.defn.values(): + ds.set_epoch(epoch) diff --git a/fairseq/data/num_samples_dataset.py b/fairseq/data/num_samples_dataset.py new file mode 100644 index 0000000000..1ad2ce8290 --- /dev/null +++ b/fairseq/data/num_samples_dataset.py @@ -0,0 +1,20 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +from . import FairseqDataset + + +class NumSamplesDataset(FairseqDataset): + + def __getitem__(self, index): + return 1 + + def __len__(self): + return 0 + + def collater(self, samples): + return sum(samples) diff --git a/fairseq/data/numel_dataset.py b/fairseq/data/numel_dataset.py new file mode 100644 index 0000000000..efcd8f152c --- /dev/null +++ b/fairseq/data/numel_dataset.py @@ -0,0 +1,34 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import numpy as np +import torch + +from . import BaseWrapperDataset + + +class NumelDataset(BaseWrapperDataset): + + def __init__(self, dataset, reduce=False): + super().__init__(dataset) + self.reduce = reduce + + def __getitem__(self, index): + item = self.dataset[index] + if torch.is_tensor(item): + return torch.numel(item) + else: + return np.size(item) + + def __len__(self): + return len(self.dataset) + + def collater(self, samples): + if self.reduce: + return sum(samples) + else: + return torch.tensor(samples) diff --git a/fairseq/data/pad_dataset.py b/fairseq/data/pad_dataset.py new file mode 100644 index 0000000000..28c372c134 --- /dev/null +++ b/fairseq/data/pad_dataset.py @@ -0,0 +1,33 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +from fairseq.data import data_utils + +from . 
import BaseWrapperDataset + + +class PadDataset(BaseWrapperDataset): + + def __init__(self, dataset, pad_idx, left_pad): + super().__init__(dataset) + self.pad_idx = pad_idx + self.left_pad = left_pad + + def collater(self, samples): + return data_utils.collate_tokens(samples, self.pad_idx, left_pad=self.left_pad) + + +class LeftPadDataset(PadDataset): + + def __init__(self, dataset, pad_idx): + super().__init__(dataset, pad_idx, left_pad=True) + + +class RightPadDataset(PadDataset): + + def __init__(self, dataset, pad_idx): + super().__init__(dataset, pad_idx, left_pad=False) diff --git a/fairseq/data/prepend_token_dataset.py b/fairseq/data/prepend_token_dataset.py new file mode 100644 index 0000000000..3daf50f389 --- /dev/null +++ b/fairseq/data/prepend_token_dataset.py @@ -0,0 +1,44 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import numpy as np +import torch + +from . import BaseWrapperDataset + + +class PrependTokenDataset(BaseWrapperDataset): + + def __init__(self, dataset, token=None): + super().__init__(dataset) + self.token = token + if token is not None: + self._sizes = np.array(dataset.sizes) + 1 + else: + self._sizes = dataset.sizes + + def __getitem__(self, idx): + item = self.dataset[idx] + if self.token is not None: + item = torch.cat([item.new([self.token]), item]) + return item + + @property + def sizes(self): + return self._sizes + + def num_tokens(self, index): + n = self.dataset.num_tokens(index) + if self.token is not None: + n += 1 + return n + + def size(self, index): + n = self.dataset.size(index) + if self.token is not None: + n += 1 + return n diff --git a/fairseq/data/sort_dataset.py b/fairseq/data/sort_dataset.py new file mode 100644 index 0000000000..3755cd326c --- /dev/null +++ b/fairseq/data/sort_dataset.py @@ -0,0 +1,24 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import numpy as np + +from . 
import BaseWrapperDataset + + +class SortDataset(BaseWrapperDataset): + + def __init__(self, dataset, sort_order): + super().__init__(dataset) + if not isinstance(sort_order, (list, tuple)): + sort_order = [sort_order] + self.sort_order = sort_order + + assert all(len(so) == len(dataset) for so in sort_order) + + def ordered_indices(self): + return np.lexsort(self.sort_order) From e8d609a80ddb6524baf978a15ceb75cdcfe5ac60 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Mon, 22 Jul 2019 11:43:48 -0700 Subject: [PATCH 018/213] Add new Masked LM task + criterion Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/761 Differential Revision: D16421335 Pulled By: myleott fbshipit-source-id: 257d92c2b90361147642e2baa38486b4d18f6297 --- fairseq/criterions/masked_lm.py | 72 ++++++++++ fairseq/tasks/masked_lm.py | 227 ++++++++++++++++++++++++++++++++ 2 files changed, 299 insertions(+) create mode 100644 fairseq/criterions/masked_lm.py create mode 100644 fairseq/tasks/masked_lm.py diff --git a/fairseq/criterions/masked_lm.py b/fairseq/criterions/masked_lm.py new file mode 100644 index 0000000000..b899b87605 --- /dev/null +++ b/fairseq/criterions/masked_lm.py @@ -0,0 +1,72 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import math + +import torch +import torch.nn.functional as F + +from fairseq import utils + +from . import FairseqCriterion, register_criterion + + +@register_criterion('masked_lm') +class MaskedLmLoss(FairseqCriterion): + """ + Implementation for the loss used in masked language model (MLM) training. + """ + + def __init__(self, args, task): + super().__init__(args, task) + + def forward(self, model, sample, reduce=True): + """Compute the loss for the given sample. 
+ Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + # compute MLM loss + logits = model(**sample['net_input'], last_state_only=True)[0] + targets = model.get_targets(sample, [logits]) + loss = F.nll_loss( + F.log_softmax( + logits.view(-1, logits.size(-1)), + dim=-1, + dtype=torch.float32, + ), + targets.view(-1), + reduction='sum', + ignore_index=self.padding_idx, + ) + + sample_size = targets.ne(self.padding_idx).int().sum().item() + + logging_output = { + 'loss': utils.item(loss.data) if reduce else loss.data, + 'ntokens': sample['ntokens'], + 'nsentences': sample['nsentences'], + 'sample_size': sample_size, + } + return loss, sample_size, logging_output + + @staticmethod + def aggregate_logging_outputs(logging_outputs): + """Aggregate logging outputs from data parallel training.""" + loss = sum(log.get('loss', 0) for log in logging_outputs) + ntokens = sum(log.get('ntokens', 0) for log in logging_outputs) + nsentences = sum(log.get('nsentences', 0) for log in logging_outputs) + sample_size = sum(log.get('sample_size', 0) for log in logging_outputs) + + agg_output = { + 'loss': loss / sample_size / math.log(2), + 'ntokens': ntokens, + 'nsentences': nsentences, + 'sample_size': sample_size, + } + return agg_output diff --git a/fairseq/tasks/masked_lm.py b/fairseq/tasks/masked_lm.py new file mode 100644 index 0000000000..36b13eb435 --- /dev/null +++ b/fairseq/tasks/masked_lm.py @@ -0,0 +1,227 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import itertools +import os + +import numpy as np +import torch +import torch.nn.functional as F + +from fairseq.data import ( + ConcatDataset, + data_utils, + Dictionary, + encoders, + IdDataset, + indexed_dataset, + MaskTokensDataset, + NestedDictionaryDataset, + NumelDataset, + NumSamplesDataset, + PadDataset, + PrependTokenDataset, + SortDataset, + TokenBlockDataset, +) +from fairseq.tasks import FairseqTask, register_task + + +@register_task('masked_lm') +class MaskedLMTask(FairseqTask): + """Task for training masked language models (e.g., BERT, RoBERTa).""" + + @staticmethod + def add_args(parser): + """Add task-specific arguments to the parser.""" + parser.add_argument('data', help='colon separated path to data directories list, \ + will be iterated upon during epochs in round-robin manner') + parser.add_argument('--sample-break-mode', default='complete', + choices=['none', 'complete', 'complete_doc', 'eos'], + help='If omitted or "none", fills each sample with tokens-per-sample ' + 'tokens. If set to "complete", splits samples only at the end ' + 'of sentence, but may include multiple sentences per sample. ' + '"complete_doc" is similar but respects doc boundaries. 
'
+                                 'If set to "eos", includes only one sentence per sample.')
+        parser.add_argument('--tokens-per-sample', default=512, type=int,
+                            help='max number of total tokens over all segments '
+                                 'per sample for BERT dataset')
+        parser.add_argument('--mask-prob', default=0.15, type=float,
+                            help='probability of replacing a token with mask')
+        parser.add_argument('--leave-unmasked-prob', default=0.1, type=float,
+                            help='probability that a masked token is unmasked')
+        parser.add_argument('--random-token-prob', default=0.1, type=float,
+                            help='probability of replacing a token with a random token')
+        parser.add_argument('--freq-weighted-replacement', action='store_true',
+                            help='sample random replacement words based on word frequencies')
+        parser.add_argument('--mask-whole-words', default=False, action='store_true',
+                            help='mask whole words; you may also want to set --bpe')
+
+    def __init__(self, args, dictionary):
+        super().__init__(args)
+        self.dictionary = dictionary
+        self.seed = args.seed
+
+        # add mask token
+        self.mask_idx = dictionary.add_symbol('<mask>')
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        paths = args.data.split(':')
+        assert len(paths) > 0
+        dictionary = Dictionary.load(os.path.join(paths[0], 'dict.txt'))
+        print('| dictionary: {} types'.format(len(dictionary)))
+        return cls(args, dictionary)
+
+    def load_dataset(self, split, epoch=0, combine=False):
+        """Load a given dataset split.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+        """
+        paths = self.args.data.split(':')
+        assert len(paths) > 0
+        data_path = paths[epoch % len(paths)]
+        split_path = os.path.join(data_path, split)
+
+        dataset = data_utils.load_indexed_dataset(
+            split_path,
+            self.source_dictionary,
+            self.args.dataset_impl,
+            combine=combine,
+        )
+        if dataset is None:
+            raise FileNotFoundError('Dataset not found: {} ({})'.format(split, split_path))
+
+        # create continuous blocks of tokens
+        dataset = TokenBlockDataset(
+            dataset,
+            dataset.sizes,
+            self.args.tokens_per_sample - 1,  # one less for <s>
+            pad=self.source_dictionary.pad(),
+            eos=self.source_dictionary.eos(),
+            break_mode=self.args.sample_break_mode,
+        )
+
+        # prepend beginning-of-sentence token (<s>, equiv. 
to [CLS] in BERT) + dataset = PrependTokenDataset(dataset, self.source_dictionary.bos()) + + # create masked input and targets + if self.args.mask_whole_words: + bpe = encoders.build_bpe(self.args) + if bpe is not None: + + def is_beginning_of_word(i): + if i < self.source_dictionary.nspecial: + # special elements are always considered beginnings + return True + tok = self.source_dictionary[i] + if tok.startswith('madeupword'): + return True + try: + return bpe.is_beginning_of_word(tok) + except ValueError: + return True + + mask_whole_words = torch.ByteTensor(list( + map(is_beginning_of_word, range(len(self.source_dictionary))) + )) + else: + mask_whole_words = None + + src_dataset, tgt_dataset = MaskTokensDataset.apply_mask( + dataset, + self.source_dictionary, + pad_idx=self.source_dictionary.pad(), + mask_idx=self.mask_idx, + seed=self.args.seed, + mask_prob=self.args.mask_prob, + leave_unmasked_prob=self.args.leave_unmasked_prob, + random_token_prob=self.args.random_token_prob, + freq_weighted_replacement=self.args.freq_weighted_replacement, + mask_whole_words=mask_whole_words, + ) + + with data_utils.numpy_seed(self.args.seed + epoch): + shuffle = np.random.permutation(len(src_dataset)) + + self.datasets[split] = SortDataset( + NestedDictionaryDataset( + { + 'id': IdDataset(), + 'net_input': { + 'src_tokens': PadDataset( + src_dataset, + pad_idx=self.source_dictionary.pad(), + left_pad=False, + ), + 'src_lengths': NumelDataset(src_dataset, reduce=False), + }, + 'target': PadDataset( + tgt_dataset, + pad_idx=self.source_dictionary.pad(), + left_pad=False, + ), + 'nsentences': NumSamplesDataset(), + 'ntokens': NumelDataset(src_dataset, reduce=True), + }, + sizes=[src_dataset.sizes], + ), + sort_order=[ + shuffle, + src_dataset.sizes, + ], + ) + + def build_dataset_for_inference(self, src_tokens, src_lengths, sort=True): + if self.args.also_lowercase_words: + raise NotImplementedError + src_dataset = PadDataset( + TokenBlockDataset( + src_tokens, + src_lengths, + self.args.tokens_per_sample - 1, # one less for + pad=self.source_dictionary.pad(), + eos=self.source_dictionary.eos(), + break_mode='eos', + ), + pad_idx=self.source_dictionary.pad(), + left_pad=False, + ) + src_dataset = PrependTokenDataset(src_dataset, self.source_dictionary.bos()) + src_dataset = NestedDictionaryDataset( + { + 'id': IdDataset(), + 'net_input': { + 'src_tokens': src_dataset, + 'src_lengths': NumelDataset(src_dataset, reduce=False), + }, + }, + sizes=src_lengths, + ) + if sort: + src_dataset = SortDataset(src_dataset, sort_order=[src_lengths]) + return src_dataset + + @property + def source_dictionary(self): + return self.dictionary + + @property + def target_dictionary(self): + return self.dictionary + + def get_average_masked_score(self, model, src_tokens, mask, **net_input): + """Mask a set of tokens and return their average score.""" + masked_tokens = src_tokens.clone() + masked_tokens[mask.byte()] = self.mask_idx + net_output = model(src_tokens=masked_tokens, **net_input, last_state_only=True) + lprobs = F.log_softmax(net_output[0], dim=-1, dtype=torch.float32) + lprobs = lprobs.gather(-1, src_tokens.unsqueeze(-1)).squeeze(-1) + mask = mask.type_as(lprobs) + score = (lprobs * mask).sum(dim=-1) / mask.sum(dim=-1) + return score From a03fe6faf3b0fc9415d14b1cecf5598d4672b85d Mon Sep 17 00:00:00 2001 From: Sara Hanson Date: Mon, 22 Jul 2019 16:36:08 -0700 Subject: [PATCH 019/213] Implement sparse transformer fixed attention pattern (#804) Summary: Pull Request resolved: 
https://github.com/facebookresearch/pytext/pull/804 Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/746 Pull Request resolved: https://github.com/pytorch/fairseq/pull/894 Adding an implementation of the sparse transformer to multi-head attention using the fixed attention pattern specified https://arxiv.org/pdf/1904.10509.pdf. The sparse_mask masks out words using -inf; after softmax, -inf becomes 0. Thus, a mask does not need to be re-calculated and re-applied when multiplying attn_weights and values. Four inputs are added to the config: sparse, is_bidirectional, stride, expressivity. If we are using the sparse transformer, is_bidirectional, stride, and expressivity must be specified (there are defaults). If is_bidirectional is False, the mask values using the fixed attention pattern described in the paper. If is_bidirectional is True, subset one includes all values in the current stride window and a summary from every stride window--all other values are masked. Stride (L in the paper) controls the window size and expressivity (c in the paper) controls the size of the summary. Reviewed By: borguz Differential Revision: D16042988 fbshipit-source-id: c59166dc7cfe89187a256e4076000c2458842fd5 --- fairseq/modules/multihead_attention.py | 7 +- fairseq/modules/sparse_multihead_attention.py | 106 ++++++++++++++++++ .../sparse_transformer_sentence_encoder.py | 85 ++++++++++++++ ...arse_transformer_sentence_encoder_layer.py | 50 +++++++++ tests/test_sparse_multihead_attention.py | 50 +++++++++ 5 files changed, 296 insertions(+), 2 deletions(-) create mode 100644 fairseq/modules/sparse_multihead_attention.py create mode 100644 fairseq/modules/sparse_transformer_sentence_encoder.py create mode 100644 fairseq/modules/sparse_transformer_sentence_encoder_layer.py create mode 100644 tests/test_sparse_multihead_attention.py diff --git a/fairseq/modules/multihead_attention.py b/fairseq/modules/multihead_attention.py index 60852af2b8..490b93f576 100644 --- a/fairseq/modules/multihead_attention.py +++ b/fairseq/modules/multihead_attention.py @@ -40,7 +40,6 @@ def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias= assert not self.self_attention or self.qkv_same_dim, 'Self-attention requires query, key and ' \ 'value to be of the same size' - if self.qkv_same_dim: self.in_proj_weight = Parameter(torch.Tensor(3 * embed_dim, embed_dim)) else: @@ -102,7 +101,6 @@ def forward(self, query, key, value, key_padding_mask=None, incremental_state=No the key by passing a binary ByteTensor (`key_padding_mask`) with shape: batch x src_len, where padding elements are indicated by 1s. 
""" - tgt_len, bsz, embed_dim = query.size() assert embed_dim == self.embed_dim assert list(query.size()) == [tgt_len, bsz, embed_dim] @@ -217,6 +215,8 @@ def forward(self, query, key, value, key_padding_mask=None, incremental_state=No [key_padding_mask, torch.zeros(key_padding_mask.size(0), 1).type_as(key_padding_mask)], dim=1) attn_weights = torch.bmm(q, k.transpose(1, 2)) + attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz) + assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len] if attn_mask is not None: @@ -327,3 +327,6 @@ def _set_input_buffer(self, incremental_state, buffer): 'attn_state', buffer, ) + + def apply_sparse_mask(self, attn_weights, tgt_len, src_len, bsz): + return attn_weights diff --git a/fairseq/modules/sparse_multihead_attention.py b/fairseq/modules/sparse_multihead_attention.py new file mode 100644 index 0000000000..a4e8848c43 --- /dev/null +++ b/fairseq/modules/sparse_multihead_attention.py @@ -0,0 +1,106 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import math +import torch +from .multihead_attention import MultiheadAttention + + +class SparseMultiheadAttention(MultiheadAttention): + """ Sparse Multi-Headed Attention. + + "Generating Long Sequences with Sparse Transformers". Implements + fixed factorized self attention, where l=stride and c=expressivity. + A(1) includes all words in the stride window and A(2) takes a summary of c + words from the end of each stride window. + If is_bidirectional=False, we do not include any words past the current word, + as in the paper. 
+ """ + + def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias=True, + add_bias_kv=False, add_zero_attn=False, self_attention=False, + encoder_decoder_attention=False, stride=32, expressivity=8, is_bidirectional=True): + + super().__init__( + embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, + add_zero_attn, self_attention, encoder_decoder_attention + ) + + self.is_bidirectional = is_bidirectional + self.stride = stride + self.expressivity = expressivity + assert(self.stride > 0 and self.stride >= self.expressivity) + + # Used for Ai(2) calculations - beginning of [l-c, l] range + def compute_checkpoint(self, word_index): + if word_index % self.stride == 0 and word_index is not 0: + checkpoint_index = word_index - self.expressivity + else: + checkpoint_index = ( + math.floor(word_index / self.stride) * self.stride + + self.stride - self.expressivity + ) + return checkpoint_index + + # Computes Ai(2) + def compute_subset_summaries(self, absolute_max): + checkpoint_index = self.compute_checkpoint(0) + subset_two = set() + while checkpoint_index <= absolute_max-1: + summary = set(range(checkpoint_index, min( + checkpoint_index+self.expressivity+1, absolute_max) + )) + subset_two = subset_two.union(summary) + checkpoint_index = self.compute_checkpoint(checkpoint_index+self.stride) + return subset_two + + # Sparse Transformer Fixed Attention Pattern: https://arxiv.org/pdf/1904.10509.pdf + def compute_fixed_attention_subset(self, word_index, tgt_len): + # +1s account for range function; [min, max) -> [min, max] + if not self.is_bidirectional: + absolute_max = word_index + 1 + else: + absolute_max = tgt_len + + # Subset 1 - whole window + rounded_index = math.floor((word_index + self.stride) / self.stride) * self.stride + if word_index % self.stride == 0 and word_index is not 0: + subset_one = set(range(word_index-self.stride, min(absolute_max, word_index+1))) + else: + subset_one = set(range(max(0, rounded_index - self.stride), min( + absolute_max, rounded_index+1)) + ) + + # Subset 2 - summary per window + # If bidirectional, subset 2 is the same for every index + subset_two = set() + if not self.is_bidirectional: + subset_two = self.compute_subset_summaries(absolute_max) + + return subset_one.union(subset_two) + + # Compute sparse mask - if bidirectional, can pre-compute and store + def buffered_sparse_mask(self, tensor, tgt_len, src_len): + assert(tgt_len > self.stride) + sparse_mask = torch.empty((tgt_len, src_len)).float().fill_(float('-inf')) + + # If bidirectional, subset 2 is the same for every index + subset_summaries = set() + if self.is_bidirectional: + subset_summaries = self.compute_subset_summaries(tgt_len) + + for i in range(tgt_len): + fixed_attention_subset = self.compute_fixed_attention_subset(i, tgt_len) + fixed_attention_subset = fixed_attention_subset.union(subset_summaries) + included_word_indices = torch.LongTensor(list(fixed_attention_subset)) + sparse_mask[i].index_fill_(0, included_word_indices, 0) + return sparse_mask.type_as(tensor) + + def apply_sparse_mask(self, attn_weights, tgt_len, src_len, bsz): + sparse_mask = self.buffered_sparse_mask(attn_weights, tgt_len, src_len) + sparse_mask = sparse_mask.unsqueeze(0).expand(bsz * self.num_heads, tgt_len, src_len) + attn_weights += sparse_mask diff --git a/fairseq/modules/sparse_transformer_sentence_encoder.py b/fairseq/modules/sparse_transformer_sentence_encoder.py new file mode 100644 index 0000000000..9df5db5484 --- /dev/null +++ 
b/fairseq/modules/sparse_transformer_sentence_encoder.py @@ -0,0 +1,85 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import torch.nn as nn +from fairseq.modules import TransformerSentenceEncoder +from fairseq.modules.sparse_transformer_sentence_encoder_layer import SparseTransformerSentenceEncoderLayer + + +class SparseTransformerSentenceEncoder(TransformerSentenceEncoder): + """ + Sparse implementation of the TransformerSentenceEncoder + - see SparseMultiheadAttention + """ + + def __init__( + self, + padding_idx: int, + vocab_size: int, + num_encoder_layers: int = 6, + embedding_dim: int = 768, + ffn_embedding_dim: int = 3072, + num_attention_heads: int = 8, + dropout: float = 0.1, + attention_dropout: float = 0.1, + activation_dropout: float = 0.1, + max_seq_len: int = 256, + num_segments: int = 2, + use_position_embeddings: bool = True, + offset_positions_by_padding: bool = True, + encoder_normalize_before: bool = False, + apply_bert_init: bool = False, + activation_fn: str = "relu", + learned_pos_embedding: bool = True, + add_bias_kv: bool = False, + add_zero_attn: bool = False, + embed_scale: float = None, + freeze_embeddings: bool = False, + n_trans_layers_to_freeze: int = 0, + export: bool = False, + is_bidirectional: bool = True, + stride: int = 32, + expressivity: int = 8, + ) -> None: + + super().__init__( + padding_idx, vocab_size, num_encoder_layers, embedding_dim, + ffn_embedding_dim, num_attention_heads, dropout, attention_dropout, + activation_dropout, max_seq_len, num_segments, use_position_embeddings, + offset_positions_by_padding, encoder_normalize_before, apply_bert_init, + activation_fn, learned_pos_embedding, add_bias_kv, add_zero_attn, + embed_scale, freeze_embeddings, n_trans_layers_to_freeze, export + ) + + self.layers = nn.ModuleList( + [ + SparseTransformerSentenceEncoderLayer( + embedding_dim=self.embedding_dim, + ffn_embedding_dim=ffn_embedding_dim, + num_attention_heads=num_attention_heads, + dropout=self.dropout, + attention_dropout=attention_dropout, + activation_dropout=activation_dropout, + activation_fn=activation_fn, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + export=export, + is_bidirectional=is_bidirectional, + stride=stride, + expressivity=expressivity, + ) + for _ in range(num_encoder_layers) + ] + ) + + def freeze_module_params(m): + if m is not None: + for p in m.parameters(): + p.requires_grad = False + + for layer in range(n_trans_layers_to_freeze): + freeze_module_params(self.layers[layer]) diff --git a/fairseq/modules/sparse_transformer_sentence_encoder_layer.py b/fairseq/modules/sparse_transformer_sentence_encoder_layer.py new file mode 100644 index 0000000000..9a8f3296c2 --- /dev/null +++ b/fairseq/modules/sparse_transformer_sentence_encoder_layer.py @@ -0,0 +1,50 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. 
+ +from fairseq.modules import TransformerSentenceEncoderLayer +from fairseq.modules.sparse_multihead_attention import SparseMultiheadAttention + + +class SparseTransformerSentenceEncoderLayer(TransformerSentenceEncoderLayer): + """ + Implements a Sprase Transformer Encoder Layer (see SparseMultiheadAttention) + """ + + def __init__( + self, + embedding_dim: float = 768, + ffn_embedding_dim: float = 3072, + num_attention_heads: float = 8, + dropout: float = 0.1, + attention_dropout: float = 0.1, + activation_dropout: float = 0.1, + activation_fn: str = 'relu', + add_bias_kv: bool = False, + add_zero_attn: bool = False, + export: bool = False, + is_bidirectional: bool = True, + stride: int = 32, + expressivity: int = 8, + ) -> None: + + super().__init__( + embedding_dim, ffn_embedding_dim, num_attention_heads, dropout, + attention_dropout, activation_dropout, activation_fn, add_bias_kv, + add_zero_attn, export + ) + + self.self_attn = SparseMultiheadAttention( + self.embedding_dim, + num_attention_heads, + dropout=attention_dropout, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + self_attention=True, + is_bidirectional=is_bidirectional, + stride=stride, + expressivity=expressivity, + ) diff --git a/tests/test_sparse_multihead_attention.py b/tests/test_sparse_multihead_attention.py new file mode 100644 index 0000000000..d6e6ebdb4c --- /dev/null +++ b/tests/test_sparse_multihead_attention.py @@ -0,0 +1,50 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import torch +import unittest +from fairseq.modules.sparse_multihead_attention import SparseMultiheadAttention + + +class TestSparseMultiheadAttention(unittest.TestCase): + def test_sparse_multihead_attention(self): + attn_weights = torch.randn(1, 8, 8) + bidirectional_sparse_mask = torch.tensor([ + [0, 0, 0, 0, 0, float('-inf'), float('-inf'), 0], + [0, 0, 0, 0, 0, float('-inf'), float('-inf'), 0], + [0, 0, 0, 0, 0, float('-inf'), float('-inf'), 0], + [0, 0, 0, 0, 0, float('-inf'), float('-inf'), 0], + [float('-inf'), float('-inf'), float('-inf'), 0, 0, 0, 0, 0], + [float('-inf'), float('-inf'), float('-inf'), 0, 0, 0, 0, 0], + [float('-inf'), float('-inf'), float('-inf'), 0, 0, 0, 0, 0], + [float('-inf'), float('-inf'), float('-inf'), 0, 0, 0, 0, 0] + ]) + + bidirectional_attention = SparseMultiheadAttention(16, 1, stride=4, expressivity=1, is_bidirectional=True) + bidirectional_attention_sparse_mask = bidirectional_attention.buffered_sparse_mask(attn_weights, 8, 8) + torch.all(torch.eq(bidirectional_attention_sparse_mask, bidirectional_sparse_mask)) + + sparse_mask = torch.tensor([ + [0, float('-inf'), float('-inf'), float('-inf'), float('-inf'), float('-inf'), + float('-inf'), float('-inf')], + [0, 0, float('-inf'), float('-inf'), float('-inf'), float('-inf'), float('-inf'), float('-inf')], + [0, 0, 0, float('-inf'), float('-inf'), float('-inf'), float('-inf'), float('-inf')], + [0, 0, 0, 0, float('-inf'), float('-inf'), float('-inf'), float('-inf')], + [0, 0, 0, 0, 0, float('-inf'), float('-inf'), float('-inf')], + [float('-inf'), float('-inf'), float('-inf'), 0, 0, 0, float('-inf'), float('-inf')], + [float('-inf'), float('-inf'), float('-inf'), 0, 0, 0, 0, float('-inf')], + [float('-inf'), float('-inf'), float('-inf'), 0, 0, 0, 0, 0], + ]) + + attention = 
SparseMultiheadAttention(16, 1, stride=4, expressivity=1, is_bidirectional=False) + attention_sparse_mask = attention.buffered_sparse_mask(attn_weights, 8, 8) + + torch.all(torch.eq(attention_sparse_mask, sparse_mask)) + + +if __name__ == '__main__': + unittest.main() From 30123e2caac819f5ccdd04c00f36d6b9303af111 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Mon, 22 Jul 2019 19:38:23 -0700 Subject: [PATCH 020/213] Fix read_binarized.py script Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/762 Differential Revision: D16427266 Pulled By: myleott fbshipit-source-id: 9bd9b8c6b4994ae98a62a37b34d03265bd365453 --- fairseq/data/data_utils.py | 4 ++-- scripts/read_binarized.py | 11 +++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/fairseq/data/data_utils.py b/fairseq/data/data_utils.py index f731ee730a..71b450aabc 100644 --- a/fairseq/data/data_utils.py +++ b/fairseq/data/data_utils.py @@ -45,7 +45,7 @@ def copy_tensor(src, dst): return res -def load_indexed_dataset(path, dictionary, dataset_impl=None, combine=False): +def load_indexed_dataset(path, dictionary, dataset_impl=None, combine=False, default='cached'): """A helper function for loading indexed datasets. Args: @@ -72,7 +72,7 @@ def load_indexed_dataset(path, dictionary, dataset_impl=None, combine=False): dataset = indexed_dataset.make_dataset( path_k, - impl=dataset_impl_k or 'cached', + impl=dataset_impl_k or default, fix_lua_indexing=True, dictionary=dictionary, ) diff --git a/scripts/read_binarized.py b/scripts/read_binarized.py index f8242a89c2..4b041cd8e6 100644 --- a/scripts/read_binarized.py +++ b/scripts/read_binarized.py @@ -8,8 +8,7 @@ import argparse -from fairseq.data import Dictionary -from fairseq.data import indexed_dataset +from fairseq.data import data_utils, Dictionary, indexed_dataset def get_parser(): @@ -30,8 +29,12 @@ def main(): args = parser.parse_args() dictionary = Dictionary.load(args.dict) if args.dict is not None else None - dataset = indexed_dataset.make_dataset(args.input, impl=args.dataset_impl, - fix_lua_indexing=True, dictionary=dictionary) + dataset = data_utils.load_indexed_dataset( + args.input, + dictionary, + dataset_impl=args.dataset_impl, + default='lazy', + ) for tensor_line in dataset: if dictionary is None: From af6b361c92178458e1fe938bf8247080f38e44b4 Mon Sep 17 00:00:00 2001 From: Taylan Bilal Date: Tue, 23 Jul 2019 14:51:20 -0700 Subject: [PATCH 021/213] Initializing mask as a tensor of ints (not long) (#875) Summary: Since mask really is a tensor of ints, this change should be mathematically equivalent to the base. On the other hand, this has performance implications for xla, hence the pull request. Pull Request resolved: https://github.com/pytorch/fairseq/pull/875 Differential Revision: D16232877 Pulled By: myleott fbshipit-source-id: e63175ee0016dcf0dfe10e2fd22570b8bbfbde84 --- fairseq/utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fairseq/utils.py b/fairseq/utils.py index 5c25677f4b..c5062274ac 100644 --- a/fairseq/utils.py +++ b/fairseq/utils.py @@ -172,8 +172,14 @@ def make_positions(tensor, padding_idx, onnx_trace=False): Position numbers begin at padding_idx+1. Padding symbols are ignored. """ - mask = tensor.ne(padding_idx).long() - return torch.cumsum(mask, dim=1) * mask + padding_idx + # The series of casts and type-conversions here are carefully + # balanced to both work with ONNX export and XLA. 
In particular XLA + # prefers ints, cumsum defaults to output longs, and ONNX doesn't know + # how to handle the dtype kwarg in cumsum. + mask = tensor.ne(padding_idx).int() + return ( + torch.cumsum(mask, dim=1).type_as(mask) * mask + ).long() + padding_idx def strip_pad(tensor, pad): From 208295dfc76492748500f97a4f9a808d8053a184 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Tue, 23 Jul 2019 15:39:44 -0700 Subject: [PATCH 022/213] Update README.md Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/899 Differential Revision: D16448602 Pulled By: myleott fbshipit-source-id: afd1a1b713274b6328150cd85d7f8a81833597aa --- examples/translation_moe/README.md | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/examples/translation_moe/README.md b/examples/translation_moe/README.md index 6a4af48b6d..4fc027e9c7 100644 --- a/examples/translation_moe/README.md +++ b/examples/translation_moe/README.md @@ -15,23 +15,20 @@ The model is trained with online responsibility assignment and shared parameteri The following command will train a `hMoElp` model with `3` experts: ``` -$ CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/wmt17_en_de \ +$ fairseq-train --ddp-backend='no_c10d' \ + data-bin/wmt17_en_de \ --max-update 100000 \ --task translation_moe \ --method hMoElp --mean-pool-gating-network \ --num-experts 3 \ - --arch transformer_vaswani_wmt_en_de --share-all-embeddings \ + --arch transformer_wmt_en_de --share-all-embeddings \ --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 \ --lr 0.0007 --min-lr 1e-09 \ --dropout 0.1 --weight-decay 0.0 --criterion cross_entropy \ - --max-tokens 3584 \ - --update-freq 8 + --max-tokens 3584 ``` -**Note**: the above command assumes 1 GPU, but accumulates gradients from 8 fwd/bwd passes to simulate training on 8 GPUs. -You can accelerate training on up to 8 GPUs by adjusting the `CUDA_VISIBLE_DEVICES` and `--update-freq` options accordingly. - ## Translate Once a model is trained, we can generate translations from different experts using the `--gen-expert` option. From b49ea81c556fef9375a94e18b8f95fb97459becc Mon Sep 17 00:00:00 2001 From: Spencer Poff Date: Wed, 24 Jul 2019 16:54:51 -0700 Subject: [PATCH 023/213] check save_dir before beginning training Summary: I sadly discovery that my checkpoint directory wasn't globally readable after 8 hours of training. Adding this check at the beginning of train loop to keep that from happening again! 
Reviewed By: myleott Differential Revision: D16455394 fbshipit-source-id: 35959aa058150b2afb63710c468d01ebc8a12b0c --- fairseq/checkpoint_utils.py | 12 ++++++++++++ train.py | 3 +++ 2 files changed, 15 insertions(+) diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py index 0284fa2aa1..4696875498 100644 --- a/fairseq/checkpoint_utils.py +++ b/fairseq/checkpoint_utils.py @@ -358,3 +358,15 @@ def load_pretrained_component_from_model( component_state_dict[component_subkey] = state["model"][key] component.load_state_dict(component_state_dict, strict=True) return component + + +def verify_checkpoint_directory(save_dir: str) -> None: + temp_file_path = os.path.join(save_dir, 'dummy') + try: + with open(temp_file_path, 'w'): + pass + except OSError as e: + print('| Unable to access checkpoint save directory: {}'.format(save_dir)) + raise e + else: + os.remove(temp_file_path) diff --git a/train.py b/train.py index 314ff219ac..9531829625 100644 --- a/train.py +++ b/train.py @@ -35,6 +35,9 @@ def main(args, init_distributed=False): if init_distributed: args.distributed_rank = distributed_utils.distributed_init(args) + if distributed_utils.is_master(args): + checkpoint_utils.verify_checkpoint_directory(args.save_dir) + # Print args print(args) From 3d764a3dc6f0d1ae3968870645fe800debb12ad6 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Thu, 25 Jul 2019 06:25:45 -0700 Subject: [PATCH 024/213] Update torch.hub usage Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/770 Differential Revision: D16491911 Pulled By: myleott fbshipit-source-id: 8dd2b76f8fa24183640ae9d1129ea47ded77d43d --- examples/backtranslation/README.md | 6 +++--- examples/language_model/README.md | 6 +++--- examples/translation/README.md | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/backtranslation/README.md b/examples/backtranslation/README.md index 0d63d8abab..cb010855cb 100644 --- a/examples/backtranslation/README.md +++ b/examples/backtranslation/README.md @@ -13,14 +13,14 @@ Transformer
([Edunov et al., 2018](https://arxiv.org/abs/1808.09381); WMT'1 Interactive generation from the full ensemble via PyTorch Hub: ``` >>> import torch +>>> torch.hub.list('pytorch/fairseq') +[..., 'transformer.wmt14.en-fr', 'transformer.wmt16.en-de', 'transformer.wmt18.en-de', ... ] >>> en2de_ensemble = torch.hub.load( ... 'pytorch/fairseq', -... 'transformer', -... model_name_or_path='transformer.wmt18.en-de', +... 'transformer.wmt18.en-de', ... checkpoint_file='wmt18.model1.pt:wmt18.model2.pt:wmt18.model3.pt:wmt18.model4.pt:wmt18.model5.pt', ... data_name_or_path='.', ... tokenizer='moses', -... aggressive_dash_splits=True, ... bpe='subword_nmt', ... ) >>> len(en2de_ensemble.models) diff --git a/examples/language_model/README.md b/examples/language_model/README.md index be64138fea..d598cd0e20 100644 --- a/examples/language_model/README.md +++ b/examples/language_model/README.md @@ -13,13 +13,13 @@ Adaptive Inputs
([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853) Interactive generation via PyTorch Hub: ``` >>> import torch +>>> torch.hub.list('pytorch/fairseq') +[..., 'transformer_lm.gbw.adaptive_huge', 'transformer_lm.wiki103.adaptive', ...] >>> lm = torch.hub.load( ... 'pytorch/fairseq', -... 'transformer_lm', -... model_name_or_path='transformer_lm.wiki103.adaptive', +... 'transformer_lm.wiki103.adaptive', ... data_name_or_path='./data-bin', ... tokenizer='moses', -... aggressive_dash_splits=True, ... no_escape=True, ... beam=1, ... sampling=True, diff --git a/examples/translation/README.md b/examples/translation/README.md index 537da259f6..72f8b16178 100644 --- a/examples/translation/README.md +++ b/examples/translation/README.md @@ -16,13 +16,13 @@ Transformer
([Edunov et al., 2018](https://arxiv.org/abs/1808.09381); WMT'1 Interactive generation via PyTorch Hub: ``` >>> import torch +>>> torch.hub.list('pytorch/fairseq') +[..., 'transformer.wmt14.en-fr', 'transformer.wmt16.en-de', 'transformer.wmt18.en-de', ... ] >>> en2de = torch.hub.load( ... 'pytorch/fairseq', -... 'transformer', -... model_name_or_path='transformer.wmt16.en-de', +... 'transformer.wmt16.en-de', ... data_name_or_path='.', ... tokenizer='moses', -... aggressive_dash_splits=True, ... bpe='subword_nmt', ... ) >>> print(en2de.models[0].__class__) From 8835d93cf08e429b073d7adabd077099a2cb7602 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Thu, 25 Jul 2019 07:19:44 -0700 Subject: [PATCH 025/213] =?UTF-8?q?Standardize=20on=20'teacher=20forcing'?= =?UTF-8?q?=20rather=20than=20'input=20feeding'=20which=20is=E2=80=A6=20(#?= =?UTF-8?q?769)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Input feeding generally refers to a slightly different concept Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/769 Differential Revision: D16491898 Pulled By: myleott fbshipit-source-id: 68573584e820f11f199db4e7e37e9ee7a69a3287 --- docs/tutorial_classifying_names.rst | 2 +- docs/tutorial_simple_lstm.rst | 12 ++++++------ fairseq/data/language_pair_dataset.py | 11 +++++------ fairseq/models/fairseq_decoder.py | 2 +- fairseq/models/fairseq_incremental_decoder.py | 4 ++-- fairseq/models/fairseq_model.py | 6 +++--- fairseq/models/lightconv.py | 2 +- fairseq/models/transformer.py | 2 +- 8 files changed, 20 insertions(+), 21 deletions(-) diff --git a/docs/tutorial_classifying_names.rst b/docs/tutorial_classifying_names.rst index c20bf487db..b420d850bc 100644 --- a/docs/tutorial_classifying_names.rst +++ b/docs/tutorial_classifying_names.rst @@ -285,7 +285,7 @@ following contents:: max_source_positions=self.args.max_positions, max_target_positions=1, # Since our target is a single class label, there's no need for - # input feeding. If we set this to ``True`` then our Model's + # teacher forcing. If we set this to ``True`` then our Model's # ``forward()`` method would receive an additional argument called # *prev_output_tokens* that would contain a shifted version of the # target sequence. diff --git a/docs/tutorial_simple_lstm.rst b/docs/tutorial_simple_lstm.rst index 57a254962b..30bdc7213e 100644 --- a/docs/tutorial_simple_lstm.rst +++ b/docs/tutorial_simple_lstm.rst @@ -125,9 +125,9 @@ Decoder Our Decoder will predict the next word, conditioned on the Encoder's final hidden state and an embedded representation of the previous target word -- which -is sometimes called *input feeding* or *teacher forcing*. More specifically, -we'll use a :class:`torch.nn.LSTM` to produce a sequence of hidden states that -we'll project to the size of the output vocabulary to predict each target word. +is sometimes called *teacher forcing*. More specifically, we'll use a +:class:`torch.nn.LSTM` to produce a sequence of hidden states that we'll project +to the size of the output vocabulary to predict each target word. :: @@ -171,7 +171,7 @@ we'll project to the size of the output vocabulary to predict each target word. """ Args: prev_output_tokens (LongTensor): previous decoder outputs of shape - `(batch, tgt_len)`, for input feeding/teacher forcing + `(batch, tgt_len)`, for teacher forcing encoder_out (Tensor, optional): output from the encoder, used for encoder-side attention @@ -387,8 +387,8 @@ previous hidden states. 
In fairseq this is called :ref:`Incremental decoding`. Incremental decoding is a special mode at inference time where the Model only receives a single timestep -of input corresponding to the immediately previous output token (for input -feeding) and must produce the next output incrementally. Thus the model must +of input corresponding to the immediately previous output token (for teacher +forcing) and must produce the next output incrementally. Thus the model must cache any long-term state that is needed about the sequence, e.g., hidden states, convolutional states, etc. diff --git a/fairseq/data/language_pair_dataset.py b/fairseq/data/language_pair_dataset.py index 351ba1e8f2..64a5e4c7ee 100644 --- a/fairseq/data/language_pair_dataset.py +++ b/fairseq/data/language_pair_dataset.py @@ -88,8 +88,7 @@ class LanguagePairDataset(FairseqDataset): shuffle (bool, optional): shuffle dataset elements before batching (default: True). input_feeding (bool, optional): create a shifted version of the targets - to be passed into the model for input feeding/teacher forcing - (default: True). + to be passed into the model for teacher forcing (default: True). remove_eos_from_source (bool, optional): if set, removes eos from end of source if it's present (default: False). append_eos_to_target (bool, optional): if set, appends eos to end of @@ -167,10 +166,10 @@ def collater(self, samples): - `src_lengths` (LongTensor): 1D Tensor of the unpadded lengths of each source sentence of shape `(bsz)` - `prev_output_tokens` (LongTensor): a padded 2D Tensor of - tokens in the target sentence, shifted right by one position - for input feeding/teacher forcing, of shape `(bsz, - tgt_len)`. This key will not be present if *input_feeding* - is ``False``. Padding will appear on the left if + tokens in the target sentence, shifted right by one + position for teacher forcing, of shape `(bsz, tgt_len)`. + This key will not be present if *input_feeding* is + ``False``. Padding will appear on the left if *left_pad_target* is ``True``. - `target` (LongTensor): a padded 2D Tensor of tokens in the diff --git a/fairseq/models/fairseq_decoder.py b/fairseq/models/fairseq_decoder.py index 732e66a091..2e5398e364 100644 --- a/fairseq/models/fairseq_decoder.py +++ b/fairseq/models/fairseq_decoder.py @@ -22,7 +22,7 @@ def forward(self, prev_output_tokens, encoder_out=None, **kwargs): """ Args: prev_output_tokens (LongTensor): shifted output tokens of shape - `(batch, tgt_len)`, for input feeding/teacher forcing + `(batch, tgt_len)`, for teacher forcing encoder_out (dict, optional): output from the encoder, used for encoder-side attention diff --git a/fairseq/models/fairseq_incremental_decoder.py b/fairseq/models/fairseq_incremental_decoder.py index ede1b51738..1c41215571 100644 --- a/fairseq/models/fairseq_incremental_decoder.py +++ b/fairseq/models/fairseq_incremental_decoder.py @@ -13,7 +13,7 @@ class FairseqIncrementalDecoder(FairseqDecoder): Incremental decoding is a special mode at inference time where the Model only receives a single timestep of input corresponding to the previous - output token (for input feeding) and must produce the next output + output token (for teacher forcing) and must produce the next output *incrementally*. Thus the model must cache any long-term state that is needed about the sequence, e.g., hidden states, convolutional states, etc. 
@@ -37,7 +37,7 @@ def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None, """ Args: prev_output_tokens (LongTensor): shifted output tokens of shape - `(batch, tgt_len)`, for input feeding/teacher forcing + `(batch, tgt_len)`, for teacher forcing encoder_out (dict, optional): output from the encoder, used for encoder-side attention incremental_state (dict, optional): dictionary used for storing diff --git a/fairseq/models/fairseq_model.py b/fairseq/models/fairseq_model.py index 78b1308617..f8bd5ba609 100644 --- a/fairseq/models/fairseq_model.py +++ b/fairseq/models/fairseq_model.py @@ -202,8 +202,8 @@ def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs): Run the forward pass for an encoder-decoder model. First feed a batch of source tokens through the encoder. Then, feed the - encoder output and previous decoder outputs (i.e., input feeding/teacher - forcing) to the decoder to produce the next outputs:: + encoder output and previous decoder outputs (i.e., teacher forcing) to + the decoder to produce the next outputs:: encoder_out = self.encoder(src_tokens, src_lengths) return self.decoder(prev_output_tokens, encoder_out) @@ -213,7 +213,7 @@ def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs): `(batch, src_len)` src_lengths (LongTensor): source sentence lengths of shape `(batch)` prev_output_tokens (LongTensor): previous decoder outputs of shape - `(batch, tgt_len)`, for input feeding/teacher forcing + `(batch, tgt_len)`, for teacher forcing Returns: tuple: diff --git a/fairseq/models/lightconv.py b/fairseq/models/lightconv.py index 5b38ac5e7d..0dc71a1f70 100644 --- a/fairseq/models/lightconv.py +++ b/fairseq/models/lightconv.py @@ -345,7 +345,7 @@ def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None): """ Args: prev_output_tokens (LongTensor): previous decoder outputs of shape - `(batch, tgt_len)`, for input feeding/teacher forcing + `(batch, tgt_len)`, for teacher forcing encoder_out (Tensor, optional): output from the encoder, used for encoder-side attention incremental_state (dict): dictionary used for storing state during diff --git a/fairseq/models/transformer.py b/fairseq/models/transformer.py index 59e14a4e73..591a486066 100644 --- a/fairseq/models/transformer.py +++ b/fairseq/models/transformer.py @@ -370,7 +370,7 @@ def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None, """ Args: prev_output_tokens (LongTensor): previous decoder outputs of shape - `(batch, tgt_len)`, for input feeding/teacher forcing + `(batch, tgt_len)`, for teacher forcing encoder_out (Tensor, optional): output from the encoder, used for encoder-side attention incremental_state (dict): dictionary used for storing state during From 17fcc72a641e6994bea0b14356a611a0dd6cd1a1 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Fri, 26 Jul 2019 17:58:27 -0700 Subject: [PATCH 026/213] Add RoBERTa README Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/778 Differential Revision: D16525447 Pulled By: myleott fbshipit-source-id: e721e3a10e243a2408a04f89f06b5adbbe2fdff2 --- examples/roberta/README.md | 141 ++++++++++++++ fairseq/models/roberta.py | 368 +++++++++++++++++++++++++++++++++++++ 2 files changed, 509 insertions(+) create mode 100644 examples/roberta/README.md create mode 100644 fairseq/models/roberta.py diff --git a/examples/roberta/README.md b/examples/roberta/README.md new file mode 100644 index 0000000000..1bc55ed7b4 --- /dev/null +++ b/examples/roberta/README.md @@ -0,0 
+1,141 @@ +# RoBERTa: A Robustly Optimized BERT Pretraining Approach + +*Pre-print coming 7/28* + +## Introduction + +**RoBERTa** iterates on BERT's pretraining procedure, including training the model longer, with bigger batches over more data; removing the next sentence prediction objective; training on longer sequences; and dynamically changing the masking pattern applied to the training data. See the associated paper for more details. + +## Pre-trained models + +Model | Description | # params | Download +---|---|---|--- +`roberta.base` | RoBERTa using the BERT-base architecture | 125M | [roberta.base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz) +`roberta.large` | RoBERTa using the BERT-large architecture | 355M | [roberta.large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz) +`roberta.large.mnli` | `roberta.large` finetuned on MNLI | 355M | [roberta.large.mnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.mnli.tar.gz) + +## Example usage (torch.hub) + +##### Load RoBERTa: +``` +>>> import torch +>>> roberta = torch.hub.load('pytorch/fairseq', 'roberta.large') +``` + +##### Apply Byte-Pair Encoding (BPE) to input text: +``` +>>> tokens = roberta.encode('Hello world!') +>>> tokens +tensor([ 0, 31414, 232, 328, 2]) +``` + +##### Extract features from RoBERTa: +``` +>>> features = roberta.extract_features(tokens) +>>> features.size() +torch.Size([1, 5, 1024]) +``` + +##### Use RoBERTa for sentence-pair classification tasks: +``` +>>> roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli') # already finetuned +>>> roberta.eval() # disable dropout for evaluation + +>>> tokens = roberta.encode( +... 'Roberta is a heavily optimized version of BERT.', +... 'Roberta is not very optimized.' +... ) + +>>> roberta.predict('mnli', tokens).argmax() +tensor(0) # contradiction + +>>> tokens = roberta.encode( +... 'Roberta is a heavily optimized version of BERT.', +... 'Roberta is based on BERT.' +... ) + +>>> roberta.predict('mnli', tokens).argmax() +tensor(2) # entailment +``` + +##### Register a new (randomly initialized) classification head: +``` +>>> roberta.register_classification_head('new_task', num_classes=3) +>>> roberta.predict('new_task', tokens) +tensor([[-1.1050, -1.0672, -1.1245]], grad_fn=) +``` + +##### Using the GPU: +``` +>>> roberta.cuda() +>>> roberta.predict('new_task', tokens) +tensor([[-1.1050, -1.0672, -1.1245]], device='cuda:0', grad_fn=) +``` + +## Results + +##### Results on GLUE tasks (dev set, single model, single-task finetuning) + +Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B +---|---|---|---|---|---|---|---|--- +`roberta.base` | 87.6 | 92.8 | 91.9 | 78.7 | 94.8 | 90.2 | 63.6 | 91.2 +`roberta.large` | 90.2 | 94.7 | 92.2 | 86.6 | 96.4 | 90.9 | 68.0 | 92.4 +`roberta.large.mnli` | 90.2 | - | - | - | - | - | - | - + +##### Results on SQuAD (dev set) + +Model | SQuAD 1.1 EM/F1 | SQuAD 2.0 EM/F1 +---|---|--- +`roberta.large` | 88.9/94.6 | 86.5/89.4 + +##### Results on Reading Comprehension (RACE, test set) + +Model | Accuracy | Middle | High +---|---|---|--- +`roberta.large` | 83.2 | 86.5 | 81.3 + +## Evaluating the `roberta.large.mnli` model + +Example python code snippet to evaluate accuracy on the MNLI dev_matched set. 
+``` +label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'} +ncorrect, nsamples = 0, 0 +roberta.cuda() +roberta.eval() +with open('glue_data/MNLI/dev_matched.tsv') as fin: + fin.readline() + for index, line in enumerate(fin): + tokens = line.strip().split('\t') + sent1, sent2, target = tokens[8], tokens[9], tokens[-1] + tokens = roberta.encode(sent1, sent2) + prediction = roberta.predict('mnli', tokens).argmax().item() + prediction_label = label_map[prediction] + ncorrect += int(prediction_label == target) + nsamples += 1 +print('| Accuracy: ', float(ncorrect)/float(nsamples)) +# Expected output: 0.9060 +``` + +## Finetuning on GLUE tasks + +A more detailed tutorial is coming soon. + +## Pretraining using your own data + +You can use the [`masked_lm` task](/fairseq/tasks/masked_lm.py) to pretrain RoBERTa from scratch, or to continue pretraining RoBERTa starting from one of the released checkpoints. + +Data should be preprocessed following the [language modeling example](/examples/language_model). + +A more detailed tutorial is coming soon. + +## Citation + +```bibtex +@article{liu2019roberta, + title = {RoBERTa: A Robustly Optimized BERT Pretraining Approach}, + author = {Yinhan Liu and Myle Ott and Naman Goyal and Jingfei Du and + Mandar Joshi and Danqi Chen and Omer Levy and Mike Lewis and + Luke Zettlemoyer and Veselin Stoyanov}, + year = {2019}, +} +``` diff --git a/fairseq/models/roberta.py b/fairseq/models/roberta.py new file mode 100644 index 0000000000..42adfad303 --- /dev/null +++ b/fairseq/models/roberta.py @@ -0,0 +1,368 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. +""" +RoBERTa: A Robustly Optimized BERT Pretraining Approach. +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from fairseq import utils +from fairseq.data import encoders +from fairseq.models import ( + FairseqDecoder, + FairseqLanguageModel, + register_model, + register_model_architecture, +) +from fairseq.modules import ( + LayerNorm, + TransformerSentenceEncoder, +) +from fairseq.modules.transformer_sentence_encoder import init_bert_params + + +class RobertaHubInterface(nn.Module): + """A simple PyTorch Hub interface to RoBERTa. + + Load RoBERTa:: + + >>> roberta = torch.hub.load('pytorch/fairseq', 'roberta.large') + + Apply Byte-Pair Encoding (BPE) to input text:: + + >>> tokens = roberta.encode('Hello world!') + >>> tokens + tensor([ 0, 31414, 232, 328, 2]) + + Extract features from RoBERTa:: + + >>> features = roberta.extract_features(tokens) + >>> features.size() + torch.Size([1, 5, 1024]) + + Use RoBERTa for sentence-pair classification tasks:: + + >>> roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli') # already finetuned + >>> roberta.eval() # disable dropout for evaluation + + >>> tokens = roberta.encode( + ... 'Roberta is a heavily optimized version of BERT.', + ... 'Roberta is not very optimized.' + ... ) + >>> roberta.predict('mnli', tokens).argmax() + tensor(0) # contradiction + + >>> tokens = roberta.encode( + ... 'Roberta is a heavily optimized version of BERT.', + ... 'Roberta is based on BERT.' + ... 
) + >>> roberta.predict('mnli', tokens).argmax() + tensor(2) # entailment + + Register a new (randomly initialized) classification head:: + + >>> roberta.register_classification_head('new_task', num_classes=3) + >>> roberta.predict('new_task', tokens) + tensor([[-1.1050, -1.0672, -1.1245]], grad_fn=) + + Using the GPU:: + + >>> roberta.cuda() + >>> roberta.predict('new_task', tokens) + tensor([[-1.1050, -1.0672, -1.1245]], device='cuda:0', grad_fn=) + """ + + def __init__(self, args, task, model): + super().__init__() + self.args = args + self.task = task + self.model = model + + self.bpe = encoders.build_bpe(args) + + # this is useful for determining the device + self.register_buffer('_float_tensor', torch.tensor([0], dtype=torch.float)) + + @property + def device(self): + return self._float_tensor.device + + def encode(self, sentence: str, *addl_sentences) -> torch.LongTensor: + bpe_sentence = ' ' + self.bpe.encode(sentence) + ' ' + for s in addl_sentences: + bpe_sentence += '
' + self.bpe.encode(s) + tokens = self.task.source_dictionary.encode_line(bpe_sentence, append_eos=True) + return tokens.long() + + def extract_features(self, tokens: torch.LongTensor) -> torch.Tensor: + if tokens.dim() == 1: + tokens = tokens.unsqueeze(0) + features, _ = self.model(tokens.to(device=self.device), features_only=True) + return features + + def register_classification_head( + self, name: str, num_classes: int = None, embedding_size: int = None, **kwargs + ): + self.model.register_classification_head( + name, num_classes=num_classes, embedding_size=embedding_size, **kwargs + ) + + def predict(self, head: str, tokens: torch.LongTensor): + features = self.extract_features(tokens) + logits = self.model.classification_heads[head](features) + return F.log_softmax(logits, dim=-1) + + +@register_model('roberta') +class RobertaModel(FairseqLanguageModel): + + @classmethod + def hub_models(cls): + return { + 'roberta.base': 'http://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz', + 'roberta.large': 'http://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz', + 'roberta.large.mnli': 'http://dl.fbaipublicfiles.com/fairseq/models/roberta.large.mnli.tar.gz', + } + + def __init__(self, args, encoder): + super().__init__(encoder) + self.args = args + + # We follow BERT's random weight initialization + self.apply(init_bert_params) + + self.classification_heads = nn.ModuleDict() + + @staticmethod + def add_args(parser): + """Add model-specific arguments to the parser.""" + parser.add_argument('--encoder-layers', type=int, metavar='L', + help='num encoder layers') + parser.add_argument('--encoder-embed-dim', type=int, metavar='H', + help='encoder embedding dimension') + parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='F', + help='encoder embedding dimension for FFN') + parser.add_argument('--encoder-attention-heads', type=int, metavar='A', + help='num encoder attention heads') + parser.add_argument('--activation-fn', + choices=utils.get_available_activation_fns(), + help='activation function to use') + parser.add_argument('--pooler-activation-fn', + choices=utils.get_available_activation_fns(), + help='activation function to use for pooler layer') + parser.add_argument('--encoder-normalize-before', action='store_true', + help='apply layernorm before each encoder block') + parser.add_argument('--dropout', type=float, metavar='D', + help='dropout probability') + parser.add_argument('--attention-dropout', type=float, metavar='D', + help='dropout probability for attention weights') + parser.add_argument('--activation-dropout', type=float, metavar='D', + help='dropout probability after activation in FFN') + parser.add_argument('--pooler-dropout', type=float, metavar='D', + help='dropout probability in the masked_lm pooler layers') + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + + # make sure all arguments are present + base_architecture(args) + + if not hasattr(args, 'max_positions'): + args.max_positions = args.tokens_per_sample + + encoder = RobertaEncoder(args, task.source_dictionary) + return cls(args, encoder) + + def register_classification_head(self, name, num_classes=None, inner_dim=None, **kwargs): + """Register a classification head.""" + self.classification_heads[name] = RobertaClassificationHead( + self.args.encoder_embed_dim, + inner_dim or self.args.encoder_embed_dim, + num_classes, + self.args.pooler_activation_fn, + self.args.pooler_dropout, + ) + + @property + def supported_targets(self): + return 
{'self'} + + @classmethod + def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='.', **kwargs): + from fairseq import hub_utils + x = hub_utils.from_pretrained( + model_name_or_path, + checkpoint_file, + data_name_or_path, + archive_map=cls.hub_models(), + bpe='gpt2', + **kwargs, + ) + return RobertaHubInterface(x['args'], x['task'], x['models'][0]) + + def upgrade_state_dict_named(self, state_dict, name): + prefix = name + '.' if name != '' else '' + + # recreate any classification heads present in the state dict + for k in state_dict.keys(): + if not k.startswith(prefix + 'classification_heads.'): + continue + head_name = k[len(prefix + 'classification_heads.'):].split('.')[0] + num_classes = state_dict[ + prefix + 'classification_heads.' + head_name + '.out_proj.weight' + ].size(0) + inner_dim = state_dict[ + prefix + 'classification_heads.' + head_name + '.dense.weight' + ].size(0) + self.register_classification_head(head_name, num_classes, inner_dim) + + +class RobertaLMHead(nn.Module): + """Head for masked language modeling.""" + + def __init__(self, embed_dim, output_dim, activation_fn, weight=None): + super().__init__() + self.dense = nn.Linear(embed_dim, embed_dim) + self.activation_fn = utils.get_activation_fn(activation_fn) + self.layer_norm = LayerNorm(embed_dim) + + if weight is None: + weight = nn.Linear(embed_dim, output_dim, bias=False).weight + self.weight = weight + self.bias = nn.Parameter(torch.zeros(output_dim)) + + def forward(self, features, **kwargs): + x = self.dense(features) + x = self.activation_fn(x) + x = self.layer_norm(x) + + # project back to size of vocabulary with bias + x = F.linear(x, self.weight) + self.bias + + return x + + +class RobertaClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, input_dim, inner_dim, num_classes, activation_fn, pooler_dropout): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.activation_fn = utils.get_activation_fn(activation_fn) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, features, **kwargs): + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = self.activation_fn(x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +class RobertaEncoder(FairseqDecoder): + """RoBERTa encoder. + + Implements the :class:`~fairseq.models.FairseqDecoder` interface required + by :class:`~fairseq.models.FairseqLanguageModel`. 
+ """ + + def __init__(self, args, dictionary): + super().__init__(dictionary) + self.args = args + self.sentence_encoder = TransformerSentenceEncoder( + padding_idx=dictionary.pad(), + vocab_size=len(dictionary), + num_encoder_layers=args.encoder_layers, + embedding_dim=args.encoder_embed_dim, + ffn_embedding_dim=args.encoder_ffn_embed_dim, + num_attention_heads=args.encoder_attention_heads, + dropout=args.dropout, + attention_dropout=args.attention_dropout, + activation_dropout=args.activation_dropout, + max_seq_len=args.max_positions, + num_segments=0, + encoder_normalize_before=True, + apply_bert_init=True, + activation_fn=args.activation_fn, + ) + self.lm_head = RobertaLMHead( + embed_dim=args.encoder_embed_dim, + output_dim=len(dictionary), + activation_fn=args.activation_fn, + weight=self.sentence_encoder.embed_tokens.weight, + ) + + def forward(self, src_tokens, features_only=False, return_all_hiddens=False, **unused): + """ + Args: + src_tokens (LongTensor): input tokens of shape `(batch, src_len)` + features_only (bool, optional): skip LM head and just return + features. If True, the output will be of shape + `(batch, src_len, embed_dim)`. + return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). + + Returns: + tuple: + - the LM output of shape `(batch, src_len, vocab)` + - a dictionary of additional data, where 'inner_states' + is a list of hidden states. + """ + x, extra = self.extract_features(src_tokens, return_all_hiddens) + if not features_only: + x = self.output_layer(x) + return x, extra + + def extract_features(self, src_tokens, return_all_hiddens=False, **unused): + inner_states, _ = self.sentence_encoder( + src_tokens, last_state_only=not return_all_hiddens, + ) + features = inner_states[-1] + return features, {'inner_states': inner_states if return_all_hiddens else None} + + def output_layer(self, features, **unused): + return self.lm_head(features) + + def max_positions(self): + """Maximum output length supported by the encoder.""" + return self.args.max_positions + + +@register_model_architecture('roberta', 'roberta') +def base_architecture(args): + args.encoder_layers = getattr(args, 'encoder_layers', 12) + args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 768) + args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 3072) + args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 12) + + args.activation_fn = getattr(args, 'activation_fn', 'gelu') + args.pooler_activation_fn = getattr(args, 'pooler_activation_fn', 'tanh') + + args.dropout = getattr(args, 'dropout', 0.1) + args.attention_dropout = getattr(args, 'attention_dropout', 0.1) + args.activation_dropout = getattr(args, 'activation_dropout', 0.0) + args.pooler_dropout = getattr(args, 'pooler_dropout', 0.0) + + +@register_model_architecture('roberta', 'roberta_base') +def roberta_base_architecture(args): + base_architecture(args) + + +@register_model_architecture('roberta', 'roberta_large') +def roberta_large_architecture(args): + args.encoder_layers = getattr(args, 'encoder_layers', 24) + args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 1024) + args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 4096) + args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 16) + base_architecture(args) From 40f16872c749d5b5cbf26218b3bd33c6a6788582 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sat, 27 Jul 2019 07:19:05 -0700 Subject: [PATCH 027/213] Add return_all_hiddens flag to hub interface 
Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/909 Differential Revision: D16532919 Pulled By: myleott fbshipit-source-id: 16ce884cf3d84579026e4406a75ba3c01a128dbd --- examples/roberta/README.md | 12 ++++++++++-- fairseq/models/roberta.py | 26 +++++++++++++++++++++----- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/examples/roberta/README.md b/examples/roberta/README.md index 1bc55ed7b4..52550157f3 100644 --- a/examples/roberta/README.md +++ b/examples/roberta/README.md @@ -20,6 +20,7 @@ Model | Description | # params | Download ``` >>> import torch >>> roberta = torch.hub.load('pytorch/fairseq', 'roberta.large') +>>> roberta.eval() # disable dropout (or leave in train mode to finetune) ``` ##### Apply Byte-Pair Encoding (BPE) to input text: @@ -31,9 +32,16 @@ tensor([ 0, 31414, 232, 328, 2]) ##### Extract features from RoBERTa: ``` ->>> features = roberta.extract_features(tokens) ->>> features.size() +>>> last_layer_features = roberta.extract_features(tokens) +>>> last_layer_features.size() torch.Size([1, 5, 1024]) + +>>> all_layers = roberta.extract_features(tokens, return_all_hiddens=True) +>>> len(all_layers) +25 + +>>> torch.all(all_layers[-1] == last_layer_features) +tensor(1, dtype=torch.uint8) ``` ##### Use RoBERTa for sentence-pair classification tasks: diff --git a/fairseq/models/roberta.py b/fairseq/models/roberta.py index 42adfad303..d8001e6d7c 100644 --- a/fairseq/models/roberta.py +++ b/fairseq/models/roberta.py @@ -33,6 +33,7 @@ class RobertaHubInterface(nn.Module): Load RoBERTa:: >>> roberta = torch.hub.load('pytorch/fairseq', 'roberta.large') + >>> roberta.eval() # disable dropout (or leave in train mode to finetune) Apply Byte-Pair Encoding (BPE) to input text:: @@ -42,10 +43,16 @@ class RobertaHubInterface(nn.Module): Extract features from RoBERTa:: - >>> features = roberta.extract_features(tokens) - >>> features.size() + >>> last_layer_features = roberta.extract_features(tokens) + >>> last_layer_features.size() torch.Size([1, 5, 1024]) + >>> all_layers = roberta.extract_features(tokens, return_all_hiddens=True) + >>> len(all_layers) + 25 + >>> torch.all(all_layers[-1] == last_layer_features) + tensor(1, dtype=torch.uint8) + Use RoBERTa for sentence-pair classification tasks:: >>> roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli') # already finetuned @@ -100,11 +107,20 @@ def encode(self, sentence: str, *addl_sentences) -> torch.LongTensor: tokens = self.task.source_dictionary.encode_line(bpe_sentence, append_eos=True) return tokens.long() - def extract_features(self, tokens: torch.LongTensor) -> torch.Tensor: + def extract_features(self, tokens: torch.LongTensor, return_all_hiddens=False) -> torch.Tensor: if tokens.dim() == 1: tokens = tokens.unsqueeze(0) - features, _ = self.model(tokens.to(device=self.device), features_only=True) - return features + features, extra = self.model( + tokens.to(device=self.device), + features_only=True, + return_all_hiddens=return_all_hiddens, + ) + if return_all_hiddens: + # convert from T x B x C -> B x T x C + inner_states = extra['inner_states'] + return [inner_state.transpose(0, 1) for inner_state in inner_states] + else: + return features # just the last layer's features def register_classification_head( self, name: str, num_classes: int = None, embedding_size: int = None, **kwargs From 5218a7c970308a7b807548c05d0f20cb1ab37bdd Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sun, 28 Jul 2019 06:46:44 -0700 Subject: [PATCH 028/213] Fix compatibility with PyTorch 1.0.x (Fixes #906) 
Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/910 Differential Revision: D16536532 Pulled By: myleott fbshipit-source-id: 56bb5570e70b5670ad87c64d9dd20c64c1fa9f5c --- fairseq/data/base_wrapper_dataset.py | 2 +- fairseq/data/mask_tokens_dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fairseq/data/base_wrapper_dataset.py b/fairseq/data/base_wrapper_dataset.py index 2a46a5215c..88609915c4 100644 --- a/fairseq/data/base_wrapper_dataset.py +++ b/fairseq/data/base_wrapper_dataset.py @@ -5,7 +5,7 @@ # the root directory of this source tree. An additional grant of patent rights # can be found in the PATENTS file in the same directory. -from torch.utils.data._utils.collate import default_collate +from torch.utils.data.dataloader import default_collate from . import FairseqDataset diff --git a/fairseq/data/mask_tokens_dataset.py b/fairseq/data/mask_tokens_dataset.py index 6c64f66d25..ecbf29d294 100644 --- a/fairseq/data/mask_tokens_dataset.py +++ b/fairseq/data/mask_tokens_dataset.py @@ -129,7 +129,7 @@ def __getitem__(self, index: int): if self.mask_whole_words is not None: mask = np.repeat(mask, word_lens) new_item = np.full(len(mask), self.pad_idx) - new_item[mask] = item[torch.from_numpy(mask)] + new_item[mask] = item[torch.from_numpy(mask.astype(np.uint8))] return torch.from_numpy(new_item) # decide unmasking and random replacement From abc13e28711ba228bb0c2065071887417ea254d0 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sun, 28 Jul 2019 06:51:16 -0700 Subject: [PATCH 029/213] Make hub_utils.generator inherit from nn.Module Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/913 Differential Revision: D16536562 Pulled By: myleott fbshipit-source-id: ce28642da6868ec884e3e416388a652977a062df --- fairseq/hub_utils.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fairseq/hub_utils.py b/fairseq/hub_utils.py index c532075b6b..60f5143878 100644 --- a/fairseq/hub_utils.py +++ b/fairseq/hub_utils.py @@ -12,6 +12,7 @@ from fairseq import utils from fairseq.data import encoders +from fairseq.models import BaseFairseqModel def from_pretrained( @@ -57,7 +58,7 @@ def from_pretrained( } -class Generator(object): +class Generator(BaseFairseqModel): """PyTorch Hub API for generating sequences from a pre-trained translation or language model.""" @@ -69,6 +70,11 @@ def __init__(self, args, task, models): self.tgt_dict = task.target_dictionary self.use_cuda = torch.cuda.is_available() and not getattr(args, 'cpu', False) + if self.use_cuda: + if getattr(args, 'fp16', False): + self.half() + self.cuda() + # optimize model for generation for model in self.models: model.make_generation_fast_( @@ -78,10 +84,6 @@ def __init__(self, args, task, models): ), need_attn=getattr(args, 'print_alignment', False), ) - if self.use_cuda: - if getattr(args, 'fp16', False): - model.half() - model.cuda() self.generator = self.task.build_generator(args) From 8207f26347f523b8fb655e4f248ade28ed9231db Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sun, 28 Jul 2019 06:53:19 -0700 Subject: [PATCH 030/213] Misc dataset improvements Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/911 Differential Revision: D16536559 Pulled By: myleott fbshipit-source-id: 7fe495054ce5b7658b1d3a43eca38c5858360236 --- fairseq/data/concat_dataset.py | 6 +++++- fairseq/data/dictionary.py | 3 +++ fairseq/data/fairseq_dataset.py | 3 ++- fairseq/data/indexed_dataset.py | 6 ++++++ 4 files changed, 16 insertions(+), 2 deletions(-) diff 
--git a/fairseq/data/concat_dataset.py b/fairseq/data/concat_dataset.py index 220e495e59..1a930b9334 100644 --- a/fairseq/data/concat_dataset.py +++ b/fairseq/data/concat_dataset.py @@ -8,6 +8,7 @@ import bisect import numpy as np +from torch.utils.data.dataloader import default_collate from . import FairseqDataset @@ -50,7 +51,10 @@ def _get_dataset_and_sample_index(self, idx: int): def collater(self, samples): # For now only supports datasets with same underlying collater implementations - return self.datasets[0].collater(samples) + if hasattr(self.datasets[0], 'collater'): + return self.datasets[0].collater(samples) + else: + return default_collate(samples) def size(self, idx: int): """ diff --git a/fairseq/data/dictionary.py b/fairseq/data/dictionary.py index 5acb73d097..4e4cbc0346 100644 --- a/fairseq/data/dictionary.py +++ b/fairseq/data/dictionary.py @@ -52,6 +52,9 @@ def __len__(self): """Returns the number of symbols in the dictionary""" return len(self.symbols) + def __contains__(self, sym): + return sym in self.indices + def index(self, sym): """Returns the index of the specified symbol""" assert isinstance(sym, str) diff --git a/fairseq/data/fairseq_dataset.py b/fairseq/data/fairseq_dataset.py index 20bda95be2..55ffec30d0 100644 --- a/fairseq/data/fairseq_dataset.py +++ b/fairseq/data/fairseq_dataset.py @@ -5,6 +5,7 @@ # the root directory of this source tree. An additional grant of patent rights # can be found in the PATENTS file in the same directory. +import numpy as np import torch.utils.data @@ -41,7 +42,7 @@ def size(self, index): def ordered_indices(self): """Return an ordered list of indices. Batches will be constructed based on this order.""" - raise NotImplementedError + return np.arange(len(self)) @property def supports_prefetch(self): diff --git a/fairseq/data/indexed_dataset.py b/fairseq/data/indexed_dataset.py index 9a801ff3d0..7939a5a62d 100644 --- a/fairseq/data/indexed_dataset.py +++ b/fairseq/data/indexed_dataset.py @@ -5,6 +5,7 @@ # the root directory of this source tree. An additional grant of patent rights # can be found in the PATENTS file in the same directory. 
+from functools import lru_cache import os import shutil import struct @@ -146,6 +147,7 @@ def __del__(self): if self.data_file: self.data_file.close() + @lru_cache(maxsize=8) def __getitem__(self, i): if not self.data_file: self.read_data(self.path) @@ -214,6 +216,7 @@ def prefetch(self, indices): self.data_file.close() self.data_file = None + @lru_cache(maxsize=8) def __getitem__(self, i): self.check_index(i) tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]] @@ -255,6 +258,7 @@ def check_index(self, i): if i < 0 or i >= self.size: raise IndexError('index out of range') + @lru_cache(maxsize=8) def __getitem__(self, i): self.check_index(i) return self.tokens_list[i] @@ -429,6 +433,7 @@ def dtype(self): def sizes(self): return self._sizes + @lru_cache(maxsize=8) def __getitem__(self, i): return self._pointers[i], self._sizes[i] @@ -466,6 +471,7 @@ def __del__(self): def __len__(self): return len(self._index) + @lru_cache(maxsize=8) def __getitem__(self, i): ptr, size = self._index[i] np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype, count=size, offset=ptr) From 1362b21bc3e1debe442b4162adc14eb30df5fd23 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sun, 28 Jul 2019 06:53:20 -0700 Subject: [PATCH 031/213] Correctly zero padding index in TransformerSentenceEncoder Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/912 Differential Revision: D16536561 Pulled By: myleott fbshipit-source-id: 54c5c20a826a14f4e690770e027bcb282acdf911 --- fairseq/modules/transformer_sentence_encoder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fairseq/modules/transformer_sentence_encoder.py b/fairseq/modules/transformer_sentence_encoder.py index 41c3867f34..08c2de91a2 100644 --- a/fairseq/modules/transformer_sentence_encoder.py +++ b/fairseq/modules/transformer_sentence_encoder.py @@ -38,6 +38,7 @@ def init_bert_params(module): module.bias.data.zero_() if isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=0.02) + module.weight.data[module.padding_idx].zero_() if isinstance(module, MultiheadAttention): module.in_proj_weight.data.normal_(mean=0.0, std=0.02) From c446c44b1f2023808d48609dbeb48c58fdba1cf3 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sun, 28 Jul 2019 08:22:28 -0700 Subject: [PATCH 032/213] Add Adamax optimizer Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/914 Differential Revision: D16536670 Pulled By: myleott fbshipit-source-id: 8a41c98f0fb87af6c384cdade756e3eae2978a88 --- fairseq/optim/adamax.py | 153 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 fairseq/optim/adamax.py diff --git a/fairseq/optim/adamax.py b/fairseq/optim/adamax.py new file mode 100644 index 0000000000..fd6874f1dd --- /dev/null +++ b/fairseq/optim/adamax.py @@ -0,0 +1,153 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import torch +import torch.optim + +from . 
import FairseqOptimizer, register_optimizer + + +@register_optimizer('adamax') +class FairseqAdamax(FairseqOptimizer): + def __init__(self, args, params): + super().__init__(args, params) + self._optimizer = Adamax(params, **self.optimizer_config) + + @staticmethod + def add_args(parser): + """Add optimizer-specific arguments to the parser.""" + # fmt: off + parser.add_argument('--adamax-betas', default='(0.9, 0.999)', metavar='B', + help='betas for Adam optimizer') + parser.add_argument('--adamax-eps', type=float, default=1e-8, metavar='D', + help='epsilon for Adam optimizer') + parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD', + help='weight decay') + parser.add_argument('--no-bias-correction', default=False, action='store_true', + help='disable bias correction') + # fmt: on + + @property + def optimizer_config(self): + """ + Return a kwarg dictionary that will be used to override optimizer + args stored in checkpoints. This allows us to load a checkpoint and + resume training using a different set of optimizer args, e.g., with a + different learning rate. + """ + return { + 'lr': self.args.lr[0], + 'betas': eval(self.args.adamax_betas), + 'eps': self.args.adamax_eps, + 'weight_decay': self.args.weight_decay, + 'bias_correction': not self.args.no_bias_correction, + } + + +class Adamax(torch.optim.Optimizer): + """Implements Adamax algorithm (a variant of Adam based on infinity norm). + + It has been proposed in `Adam: A Method for Stochastic Optimization`__. + + Compared to the version in PyTorch, this version implements a fix for weight decay. + + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 2e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + bias_correction (bool, optional): enable bias correction (default: True) + + __ https://arxiv.org/abs/1412.6980 + """ + + def __init__(self, params, lr=2e-3, betas=(0.9, 0.999), eps=1e-8, + weight_decay=0, bias_correction=True): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= weight_decay: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, + bias_correction=bias_correction) + super(Adamax, self).__init__(params, defaults) + + @property + def supports_memory_efficient_fp16(self): + return True + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
+ """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise RuntimeError('Adamax does not support sparse gradients') + + p_data_fp32 = p.data.float() + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_data_fp32) + state['exp_inf'] = torch.zeros_like(p_data_fp32) + else: + state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) + state['exp_inf'] = state['exp_inf'].type_as(p_data_fp32) + + exp_avg, exp_inf = state['exp_avg'], state['exp_inf'] + beta1, beta2 = group['betas'] + eps = group['eps'] + + state['step'] += 1 + + # Update biased first moment estimate. + exp_avg.mul_(beta1).add_(1 - beta1, grad) + + # Update the exponentially weighted infinity norm. + torch.max( + exp_inf.mul_(beta2), + grad.abs_(), + out=exp_inf, + ) + + step_size = group['lr'] + if group['bias_correction']: + bias_correction = 1 - beta1 ** state['step'] + step_size /= bias_correction + + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) + + p_data_fp32.addcdiv_(-step_size, exp_avg, exp_inf.add(eps)) + + p.data.copy_(p_data_fp32) + + return loss From 76ff39f56655e764488f757fe92e5c564fbc7c91 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sun, 28 Jul 2019 16:01:16 -0700 Subject: [PATCH 033/213] Change default --num-workers to 1 Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/779 Differential Revision: D16536673 Pulled By: myleott fbshipit-source-id: bf56e9a81d3086f3d95a3273391dc5e04ed2dbc4 --- fairseq/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fairseq/options.py b/fairseq/options.py index 41d466d948..a991cbef1f 100644 --- a/fairseq/options.py +++ b/fairseq/options.py @@ -252,7 +252,7 @@ def add_preprocess_args(parser): def add_dataset_args(parser, train=False, gen=False): group = parser.add_argument_group('Dataset and data loading') # fmt: off - group.add_argument('--num-workers', default=0, type=int, metavar='N', + group.add_argument('--num-workers', default=1, type=int, metavar='N', help='how many subprocesses to use for data loading') group.add_argument('--skip-invalid-size-inputs-valid-test', action='store_true', help='ignore too long or too short lines in valid and test set') From a80cade964704a7e100fcd1943219432ce0e6009 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sun, 28 Jul 2019 17:22:44 -0700 Subject: [PATCH 034/213] Update BPE library code Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/780 Differential Revision: D16537567 Pulled By: myleott fbshipit-source-id: 4e18c529959935e82ea122c3a2ee477308ffcbe3 --- fairseq/data/encoders/gpt2_bpe.py | 128 +----------------------- fairseq/data/encoders/gpt2_bpe_utils.py | 128 ++++++++++++++++++++++++ 2 files changed, 132 insertions(+), 124 deletions(-) create mode 100644 fairseq/data/encoders/gpt2_bpe_utils.py diff --git a/fairseq/data/encoders/gpt2_bpe.py b/fairseq/data/encoders/gpt2_bpe.py index e1d01ee6b4..283e6c4501 100644 --- a/fairseq/data/encoders/gpt2_bpe.py +++ b/fairseq/data/encoders/gpt2_bpe.py @@ -8,6 +8,8 @@ from fairseq import file_utils from fairseq.data.encoders import register_bpe +from .gpt2_bpe_utils import get_encoder + DEFAULT_ENCODER_JSON = 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json' DEFAULT_VOCAB_BPE = 
'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe' @@ -42,127 +44,5 @@ def encode(self, x: str) -> str: def decode(self, x: str) -> str: return self.bpe.decode(map(int, x.split())) - -"""Byte pair encoding utilities from GPT-2""" - -from functools import lru_cache -import json -import os - - -@lru_cache() -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a signficant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. - """ - bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8+n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - -def get_pairs(word): - """Return set of symbol pairs in a word. - Word is represented as tuple of symbols (symbols being variable-length strings). - """ - pairs = set() - prev_char = word[0] - for char in word[1:]: - pairs.add((prev_char, char)) - prev_char = char - return pairs - -class Encoder: - - def __init__(self, encoder, bpe_merges, errors='replace'): - self.encoder = encoder - self.decoder = {v:k for k,v in self.encoder.items()} - self.errors = errors # how to handle errors in decoding - self.byte_encoder = bytes_to_unicode() - self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} - self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) - self.cache = {} - - try: - import regex as re - self.re = re - except ImportError: - raise ImportError('Please install regex with: pip install regex') - - # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions - self.pat = self.re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") - - def bpe(self, token): - if token in self.cache: - return self.cache[token] - word = tuple(token) - pairs = get_pairs(word) - - if not pairs: - return token - - while True: - bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) - if bigram not in self.bpe_ranks: - break - first, second = bigram - new_word = [] - i = 0 - while i < len(word): - try: - j = word.index(first, i) - new_word.extend(word[i:j]) - i = j - except: - new_word.extend(word[i:]) - break - - if word[i] == first and i < len(word)-1 and word[i+1] == second: - new_word.append(first+second) - i += 2 - else: - new_word.append(word[i]) - i += 1 - new_word = tuple(new_word) - word = new_word - if len(word) == 1: - break - else: - pairs = get_pairs(word) - word = ' '.join(word) - self.cache[token] = word - return word - - def encode(self, text): - bpe_tokens = [] - for token in self.re.findall(self.pat, text): - token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) - bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) - return bpe_tokens - - def decode(self, tokens): - text = ''.join([self.decoder[token] for token in tokens]) - text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) - return text - -def get_encoder(encoder_json_path, 
vocab_bpe_path): - with open(encoder_json_path, 'r') as f: - encoder = json.load(f) - with open(vocab_bpe_path, 'r', encoding="utf-8") as f: - bpe_data = f.read() - bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]] - return Encoder( - encoder=encoder, - bpe_merges=bpe_merges, - ) + def is_beginning_of_word(self, x: str) -> bool: + return self.decode(x).startswith(' ') diff --git a/fairseq/data/encoders/gpt2_bpe_utils.py b/fairseq/data/encoders/gpt2_bpe_utils.py new file mode 100644 index 0000000000..ae98dbc708 --- /dev/null +++ b/fairseq/data/encoders/gpt2_bpe_utils.py @@ -0,0 +1,128 @@ +""" +Byte pair encoding utilities from GPT-2. + +Original source: https://github.com/openai/gpt-2/blob/master/src/encoder.py +Original license: MIT +""" + +from functools import lru_cache +import json +import os + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + +def get_pairs(word): + """Return set of symbol pairs in a word. + Word is represented as tuple of symbols (symbols being variable-length strings). 
+ """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + +class Encoder: + + def __init__(self, encoder, bpe_merges, errors='replace'): + self.encoder = encoder + self.decoder = {v:k for k,v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + + try: + import regex as re + self.re = re + except ImportError: + raise ImportError('Please install regex with: pip install regex') + + # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = self.re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word)-1 and word[i+1] == second: + new_word.append(first+second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def encode(self, text): + bpe_tokens = [] + for token in self.re.findall(self.pat, text): + token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) + bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) + return text + +def get_encoder(encoder_json_path, vocab_bpe_path): + with open(encoder_json_path, 'r') as f: + encoder = json.load(f) + with open(vocab_bpe_path, 'r', encoding="utf-8") as f: + bpe_data = f.read() + bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]] + return Encoder( + encoder=encoder, + bpe_merges=bpe_merges, + ) From 8d036c2fe01be5158c3ae5265d32c619131d8783 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sun, 28 Jul 2019 18:40:05 -0700 Subject: [PATCH 035/213] Add RoBERTa Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/916 Differential Revision: D16537774 Pulled By: myleott fbshipit-source-id: 86bb7b1913a428ee4a21674cc3fc7b39264067ec --- README.md | 30 +++-- examples/roberta/README.md | 3 +- fairseq/models/__init__.py | 4 +- .../models/{roberta.py => roberta/model.py} | 109 +----------------- 4 files changed, 25 insertions(+), 121 deletions(-) rename fairseq/models/{roberta.py => roberta/model.py} (73%) diff --git a/README.md b/README.md index 42253940d3..755e5c8fd0 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,18 @@ -# Introduction +# Introduction Fairseq(-py) is a sequence modeling toolkit that allows researchers and developers to train custom models for translation, summarization, language -modeling and other text generation tasks. 
It provides reference implementations -of various sequence-to-sequence models, including: +modeling and other text generation tasks. + +### What's New: + +- July 2019: [RoBERTa models and code release](examples/roberta/README.md) +- June 2019: [wav2vec models and code release](examples/wav2vec/README.md) +- April 2019: [fairseq demo paper @ NAACL 2019](https://arxiv.org/abs/1904.01038) + +### Features: + +Fairseq provides reference implementations of various sequence-to-sequence models, including: - **Convolutional Neural Networks (CNN)** - [Dauphin et al. (2017): Language Modeling with Gated Convolutional Networks](examples/language_model/conv_lm/README.md) - [Gehring et al. (2017): Convolutional Sequence to Sequence Learning](examples/conv_seq2seq/README.md) @@ -11,18 +20,18 @@ of various sequence-to-sequence models, including: - [Fan et al. (2018): Hierarchical Neural Story Generation](examples/stories/README.md) - **_New_** [wav2vec: Unsupervised Pre-training for Speech Recognition (Schneider et al., 2019)](examples/wav2vec/README.md) - **LightConv and DynamicConv models** - - **_New_** [Wu et al. (2019): Pay Less Attention with Lightweight and Dynamic Convolutions](examples/pay_less_attention_paper/README.md) + - [Wu et al. (2019): Pay Less Attention with Lightweight and Dynamic Convolutions](examples/pay_less_attention_paper/README.md) - **Long Short-Term Memory (LSTM) networks** - - [Luong et al. (2015): Effective Approaches to Attention-based Neural Machine Translation](https://arxiv.org/abs/1508.04025) - - [Wiseman and Rush (2016): Sequence-to-Sequence Learning as Beam-Search Optimization](https://arxiv.org/abs/1606.02960) + - Luong et al. (2015): Effective Approaches to Attention-based Neural Machine Translation - **Transformer (self-attention) networks** - - [Vaswani et al. (2017): Attention Is All You Need](https://arxiv.org/abs/1706.03762) + - Vaswani et al. (2017): Attention Is All You Need - [Ott et al. (2018): Scaling Neural Machine Translation](examples/scaling_nmt/README.md) - [Edunov et al. (2018): Understanding Back-Translation at Scale](examples/backtranslation/README.md) - - **_New_** [Baevski and Auli (2018): Adaptive Input Representations for Neural Language Modeling](examples/language_model/transformer_lm/README.md) - - **_New_** [Shen et al. (2019): Mixture Models for Diverse Machine Translation: Tricks of the Trade](examples/translation_moe/README.md) + - [Baevski and Auli (2018): Adaptive Input Representations for Neural Language Modeling](examples/language_model/transformer_lm/README.md) + - [Shen et al. (2019): Mixture Models for Diverse Machine Translation: Tricks of the Trade](examples/translation_moe/README.md) + - **_New_** [Liu et al. (2019): RoBERTa: A Robustly Optimized BERT Pretraining Approach](examples/roberta/README.md) -Fairseq features: +**Additionally:** - multi-GPU (distributed) training on one machine or across multiple machines - fast generation on both CPU and GPU with multiple search algorithms implemented: - beam search @@ -83,6 +92,7 @@ as well as example training and evaluation commands. - [Language Modeling](examples/language_model/README.md): convolutional models are available We also have more detailed READMEs to reproduce results from specific papers: +- [Liu et al. (2019): RoBERTa: A Robustly Optimized BERT Pretraining Approach](examples/roberta/README.md) - [Schneider et al. (2019): wav2vec: Unsupervised Pre-training for Speech Recognition](examples/wav2vec/README.md) - [Shen et al. 
(2019) Mixture Models for Diverse Machine Translation: Tricks of the Trade](examples/translation_moe/README.md) - [Wu et al. (2019): Pay Less Attention with Lightweight and Dynamic Convolutions](examples/pay_less_attention_paper/README.md) diff --git a/examples/roberta/README.md b/examples/roberta/README.md index 52550157f3..3e757e9289 100644 --- a/examples/roberta/README.md +++ b/examples/roberta/README.md @@ -1,6 +1,6 @@ # RoBERTa: A Robustly Optimized BERT Pretraining Approach -*Pre-print coming 7/28* +https://arxiv.org/abs/1907.11692 ## Introduction @@ -144,6 +144,7 @@ A more detailed tutorial is coming soon. author = {Yinhan Liu and Myle Ott and Naman Goyal and Jingfei Du and Mandar Joshi and Danqi Chen and Omer Levy and Mike Lewis and Luke Zettlemoyer and Veselin Stoyanov}, + journal={arXiv preprint arXiv:1907.11692}, year = {2019}, } ``` diff --git a/fairseq/models/__init__.py b/fairseq/models/__init__.py index 21bc9a450e..ae1d5fffa1 100644 --- a/fairseq/models/__init__.py +++ b/fairseq/models/__init__.py @@ -123,8 +123,8 @@ def register_model_arch_fn(fn): # automatically import any Python files in the models/ directory for file in os.listdir(os.path.dirname(__file__)): - if file.endswith('.py') and not file.startswith('_'): - model_name = file[:file.find('.py')] + if not file.startswith('_'): + model_name = file[:file.find('.py')] if file.endswith('.py') else file module = importlib.import_module('fairseq.models.' + model_name) # extra `model_parser` for sphinx diff --git a/fairseq/models/roberta.py b/fairseq/models/roberta/model.py similarity index 73% rename from fairseq/models/roberta.py rename to fairseq/models/roberta/model.py index d8001e6d7c..946c19c899 100644 --- a/fairseq/models/roberta.py +++ b/fairseq/models/roberta/model.py @@ -13,7 +13,6 @@ import torch.nn.functional as F from fairseq import utils -from fairseq.data import encoders from fairseq.models import ( FairseqDecoder, FairseqLanguageModel, @@ -26,113 +25,7 @@ ) from fairseq.modules.transformer_sentence_encoder import init_bert_params - -class RobertaHubInterface(nn.Module): - """A simple PyTorch Hub interface to RoBERTa. - - Load RoBERTa:: - - >>> roberta = torch.hub.load('pytorch/fairseq', 'roberta.large') - >>> roberta.eval() # disable dropout (or leave in train mode to finetune) - - Apply Byte-Pair Encoding (BPE) to input text:: - - >>> tokens = roberta.encode('Hello world!') - >>> tokens - tensor([ 0, 31414, 232, 328, 2]) - - Extract features from RoBERTa:: - - >>> last_layer_features = roberta.extract_features(tokens) - >>> last_layer_features.size() - torch.Size([1, 5, 1024]) - - >>> all_layers = roberta.extract_features(tokens, return_all_hiddens=True) - >>> len(all_layers) - 25 - >>> torch.all(all_layers[-1] == last_layer_features) - tensor(1, dtype=torch.uint8) - - Use RoBERTa for sentence-pair classification tasks:: - - >>> roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli') # already finetuned - >>> roberta.eval() # disable dropout for evaluation - - >>> tokens = roberta.encode( - ... 'Roberta is a heavily optimized version of BERT.', - ... 'Roberta is not very optimized.' - ... ) - >>> roberta.predict('mnli', tokens).argmax() - tensor(0) # contradiction - - >>> tokens = roberta.encode( - ... 'Roberta is a heavily optimized version of BERT.', - ... 'Roberta is based on BERT.' - ... 
) - >>> roberta.predict('mnli', tokens).argmax() - tensor(2) # entailment - - Register a new (randomly initialized) classification head:: - - >>> roberta.register_classification_head('new_task', num_classes=3) - >>> roberta.predict('new_task', tokens) - tensor([[-1.1050, -1.0672, -1.1245]], grad_fn=) - - Using the GPU:: - - >>> roberta.cuda() - >>> roberta.predict('new_task', tokens) - tensor([[-1.1050, -1.0672, -1.1245]], device='cuda:0', grad_fn=) - """ - - def __init__(self, args, task, model): - super().__init__() - self.args = args - self.task = task - self.model = model - - self.bpe = encoders.build_bpe(args) - - # this is useful for determining the device - self.register_buffer('_float_tensor', torch.tensor([0], dtype=torch.float)) - - @property - def device(self): - return self._float_tensor.device - - def encode(self, sentence: str, *addl_sentences) -> torch.LongTensor: - bpe_sentence = ' ' + self.bpe.encode(sentence) + ' ' - for s in addl_sentences: - bpe_sentence += ' ' + self.bpe.encode(s) - tokens = self.task.source_dictionary.encode_line(bpe_sentence, append_eos=True) - return tokens.long() - - def extract_features(self, tokens: torch.LongTensor, return_all_hiddens=False) -> torch.Tensor: - if tokens.dim() == 1: - tokens = tokens.unsqueeze(0) - features, extra = self.model( - tokens.to(device=self.device), - features_only=True, - return_all_hiddens=return_all_hiddens, - ) - if return_all_hiddens: - # convert from T x B x C -> B x T x C - inner_states = extra['inner_states'] - return [inner_state.transpose(0, 1) for inner_state in inner_states] - else: - return features # just the last layer's features - - def register_classification_head( - self, name: str, num_classes: int = None, embedding_size: int = None, **kwargs - ): - self.model.register_classification_head( - name, num_classes=num_classes, embedding_size=embedding_size, **kwargs - ) - - def predict(self, head: str, tokens: torch.LongTensor): - features = self.extract_features(tokens) - logits = self.model.classification_heads[head](features) - return F.log_softmax(logits, dim=-1) +from .hub_interface import RobertaHubInterface @register_model('roberta') From ce7f044bb100aeec6b3c524a654ce8c177403c0b Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Mon, 29 Jul 2019 06:02:19 -0700 Subject: [PATCH 036/213] Add instructions to load RoBERTa models on PyTorch 1.0 Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/921 Differential Revision: D16541025 Pulled By: myleott fbshipit-source-id: bb78d30fe285da2adfc7c4e5897ee01fa413b2e4 --- examples/roberta/README.md | 60 ++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/examples/roberta/README.md b/examples/roberta/README.md index 3e757e9289..b7661d3784 100644 --- a/examples/roberta/README.md +++ b/examples/roberta/README.md @@ -14,15 +14,47 @@ Model | Description | # params | Download `roberta.large` | RoBERTa using the BERT-large architecture | 355M | [roberta.large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz) `roberta.large.mnli` | `roberta.large` finetuned on MNLI | 355M | [roberta.large.mnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.mnli.tar.gz) -## Example usage (torch.hub) +## Results + +##### Results on GLUE tasks (dev set, single model, single-task finetuning) -##### Load RoBERTa: +Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B +---|---|---|---|---|---|---|---|--- +`roberta.base` | 87.6 | 92.8 | 91.9 | 78.7 | 94.8 | 90.2 | 63.6 | 91.2 
+`roberta.large` | 90.2 | 94.7 | 92.2 | 86.6 | 96.4 | 90.9 | 68.0 | 92.4 +`roberta.large.mnli` | 90.2 | - | - | - | - | - | - | - + +##### Results on SQuAD (dev set) + +Model | SQuAD 1.1 EM/F1 | SQuAD 2.0 EM/F1 +---|---|--- +`roberta.large` | 88.9/94.6 | 86.5/89.4 + +##### Results on Reading Comprehension (RACE, test set) + +Model | Accuracy | Middle | High +---|---|---|--- +`roberta.large` | 83.2 | 86.5 | 81.3 + +## Example usage + +##### Load RoBERTa from torch.hub (PyTorch >= 1.1): ``` >>> import torch >>> roberta = torch.hub.load('pytorch/fairseq', 'roberta.large') >>> roberta.eval() # disable dropout (or leave in train mode to finetune) ``` +##### Load RoBERTa (for PyTorch 1.0): +``` +$ wget https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz +$ tar -xzvf roberta.large.tar.gz + +>>> from fairseq.models.roberta import RobertaModel +>>> roberta = RobertaModel.from_pretrained('/path/to/roberta.large') +>>> roberta.eval() # disable dropout (or leave in train mode to finetune) +``` + ##### Apply Byte-Pair Encoding (BPE) to input text: ``` >>> tokens = roberta.encode('Hello world!') @@ -80,29 +112,7 @@ tensor([[-1.1050, -1.0672, -1.1245]], grad_fn=) tensor([[-1.1050, -1.0672, -1.1245]], device='cuda:0', grad_fn=) ``` -## Results - -##### Results on GLUE tasks (dev set, single model, single-task finetuning) - -Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B ----|---|---|---|---|---|---|---|--- -`roberta.base` | 87.6 | 92.8 | 91.9 | 78.7 | 94.8 | 90.2 | 63.6 | 91.2 -`roberta.large` | 90.2 | 94.7 | 92.2 | 86.6 | 96.4 | 90.9 | 68.0 | 92.4 -`roberta.large.mnli` | 90.2 | - | - | - | - | - | - | - - -##### Results on SQuAD (dev set) - -Model | SQuAD 1.1 EM/F1 | SQuAD 2.0 EM/F1 ----|---|--- -`roberta.large` | 88.9/94.6 | 86.5/89.4 - -##### Results on Reading Comprehension (RACE, test set) - -Model | Accuracy | Middle | High ----|---|---|--- -`roberta.large` | 83.2 | 86.5 | 81.3 - -## Evaluating the `roberta.large.mnli` model +##### Evaluating the `roberta.large.mnli` model Example python code snippet to evaluate accuracy on the MNLI dev_matched set. ``` From 36df0dadb9bc22b1f3432a7586e1be11dfd0270e Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Mon, 29 Jul 2019 06:20:49 -0700 Subject: [PATCH 037/213] Fix RoBERTa model import (fixes #918) Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/920 Differential Revision: D16540932 Pulled By: myleott fbshipit-source-id: b64438ad8651ecc8fe8904c5f69fa6111b4bed64 --- fairseq/models/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fairseq/models/__init__.py b/fairseq/models/__init__.py index ae1d5fffa1..0e3d146a6e 100644 --- a/fairseq/models/__init__.py +++ b/fairseq/models/__init__.py @@ -122,8 +122,10 @@ def register_model_arch_fn(fn): # automatically import any Python files in the models/ directory -for file in os.listdir(os.path.dirname(__file__)): - if not file.startswith('_'): +models_dir = os.path.dirname(__file__) +for file in os.listdir(models_dir): + path = os.path.join(models_dir, file) + if not file.startswith('_') and (file.endswith('.py') or os.path.isdir(path)): model_name = file[:file.find('.py')] if file.endswith('.py') else file module = importlib.import_module('fairseq.models.' 
+ model_name) From 2f6d8b352a142424bef1858ef62e24180e8fbd0b Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Mon, 29 Jul 2019 07:46:42 -0700 Subject: [PATCH 038/213] Add missing files for RoBERTa hub interface Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/923 Differential Revision: D16541289 Pulled By: myleott fbshipit-source-id: b3563a9d61507d4864ac6ecf0648672eaa40b5f3 --- fairseq/models/roberta/__init__.py | 9 ++++ fairseq/models/roberta/hub_interface.py | 68 +++++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 fairseq/models/roberta/__init__.py create mode 100644 fairseq/models/roberta/hub_interface.py diff --git a/fairseq/models/roberta/__init__.py b/fairseq/models/roberta/__init__.py new file mode 100644 index 0000000000..bf4bf8fad9 --- /dev/null +++ b/fairseq/models/roberta/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +from .hub_interface import * # noqa +from .model import * # noqa diff --git a/fairseq/models/roberta/hub_interface.py b/fairseq/models/roberta/hub_interface.py new file mode 100644 index 0000000000..15c86a3f73 --- /dev/null +++ b/fairseq/models/roberta/hub_interface.py @@ -0,0 +1,68 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from fairseq.data import encoders + + +class RobertaHubInterface(nn.Module): + """A simple PyTorch Hub interface to RoBERTa. + + Usage: https://github.com/pytorch/fairseq/tree/master/examples/roberta + """ + + def __init__(self, args, task, model): + super().__init__() + self.args = args + self.task = task + self.model = model + + self.bpe = encoders.build_bpe(args) + + # this is useful for determining the device + self.register_buffer('_float_tensor', torch.tensor([0], dtype=torch.float)) + + @property + def device(self): + return self._float_tensor.device + + def encode(self, sentence: str, *addl_sentences) -> torch.LongTensor: + bpe_sentence = ' ' + self.bpe.encode(sentence) + ' ' + for s in addl_sentences: + bpe_sentence += '
' + self.bpe.encode(s) + tokens = self.task.source_dictionary.encode_line(bpe_sentence, append_eos=True) + return tokens.long() + + def extract_features(self, tokens: torch.LongTensor, return_all_hiddens=False) -> torch.Tensor: + if tokens.dim() == 1: + tokens = tokens.unsqueeze(0) + features, extra = self.model( + tokens.to(device=self.device), + features_only=True, + return_all_hiddens=return_all_hiddens, + ) + if return_all_hiddens: + # convert from T x B x C -> B x T x C + inner_states = extra['inner_states'] + return [inner_state.transpose(0, 1) for inner_state in inner_states] + else: + return features # just the last layer's features + + def register_classification_head( + self, name: str, num_classes: int = None, embedding_size: int = None, **kwargs + ): + self.model.register_classification_head( + name, num_classes=num_classes, embedding_size=embedding_size, **kwargs + ) + + def predict(self, head: str, tokens: torch.LongTensor): + features = self.extract_features(tokens) + logits = self.model.classification_heads[head](features) + return F.log_softmax(logits, dim=-1) From 2fe45f09a1e775cfc1093103054aa3a554eb53e0 Mon Sep 17 00:00:00 2001 From: Xing Zhou <51722896+xingz9@users.noreply.github.com> Date: Mon, 29 Jul 2019 10:34:35 -0700 Subject: [PATCH 039/213] Update README.md to add top-p sampling (#783) Summary: Update README.md to include the recently implemented top-p/nucleus sampling. Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/783 Differential Revision: D16543974 Pulled By: myleott fbshipit-source-id: 27c502af10ee390d29607038118a99ff0067aec4 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 755e5c8fd0..0b5f9ea947 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ Fairseq provides reference implementations of various sequence-to-sequence model - fast generation on both CPU and GPU with multiple search algorithms implemented: - beam search - Diverse Beam Search ([Vijayakumar et al., 2016](https://arxiv.org/abs/1610.02424)) - - sampling (unconstrained and top-k) + - sampling (unconstrained, top-k and top-p/nucleus) - large mini-batch training even on a single GPU via delayed updates - mixed precision training (trains faster with less GPU memory on [NVIDIA tensor cores](https://developer.nvidia.com/tensor-cores)) - extensible: easily register new models, criterions, tasks, optimizers and learning rate schedulers From 33597e5a65db7ebb3c3d9caf3851a4dc8e239bc5 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Mon, 29 Jul 2019 14:47:43 -0700 Subject: [PATCH 040/213] Support different --max-positions and --tokens-per-sample Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/924 Differential Revision: D16548165 Pulled By: myleott fbshipit-source-id: 49569ece3e54fad7b4f0dfb201ac99123bfdd4f2 --- fairseq/models/roberta/hub_interface.py | 4 ++++ fairseq/models/roberta/model.py | 2 ++ fairseq/tasks/masked_lm.py | 2 -- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fairseq/models/roberta/hub_interface.py b/fairseq/models/roberta/hub_interface.py index 15c86a3f73..f7eb6277b1 100644 --- a/fairseq/models/roberta/hub_interface.py +++ b/fairseq/models/roberta/hub_interface.py @@ -43,6 +43,10 @@ def encode(self, sentence: str, *addl_sentences) -> torch.LongTensor: def extract_features(self, tokens: torch.LongTensor, return_all_hiddens=False) -> torch.Tensor: if tokens.dim() == 1: tokens = tokens.unsqueeze(0) + if tokens.size(-1) > self.model.max_positions(): + raise 
ValueError('tokens exceeds maximum length: {} > {}'.format( + tokens.size(-1), self.model.max_positions() + )) features, extra = self.model( tokens.to(device=self.device), features_only=True, diff --git a/fairseq/models/roberta/model.py b/fairseq/models/roberta/model.py index 946c19c899..e7c6d5b7fc 100644 --- a/fairseq/models/roberta/model.py +++ b/fairseq/models/roberta/model.py @@ -75,6 +75,8 @@ def add_args(parser): help='dropout probability after activation in FFN') parser.add_argument('--pooler-dropout', type=float, metavar='D', help='dropout probability in the masked_lm pooler layers') + parser.add_argument('--max-positions', type=int, + help='number of positional embeddings to learn') @classmethod def build_model(cls, args, task): diff --git a/fairseq/tasks/masked_lm.py b/fairseq/tasks/masked_lm.py index 36b13eb435..2b89b1b8e0 100644 --- a/fairseq/tasks/masked_lm.py +++ b/fairseq/tasks/masked_lm.py @@ -178,8 +178,6 @@ def is_beginning_of_word(i): ) def build_dataset_for_inference(self, src_tokens, src_lengths, sort=True): - if self.args.also_lowercase_words: - raise NotImplementedError src_dataset = PadDataset( TokenBlockDataset( src_tokens, From 138dc8e4fd02f81074842731d1bdd9401aa59489 Mon Sep 17 00:00:00 2001 From: Naman Goyal Date: Mon, 29 Jul 2019 16:03:11 -0700 Subject: [PATCH 041/213] adding glue data preprocessing scripts (#771) Summary: 1) Added glue data pre-processing script. 2) updated README with usage. TODO: 1) releasing fairseq dictionary and remove hardcoded path. 2) remove hard-coded path for bpe-encoding, myleott what do you recommend for above TODOs? Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/771 Reviewed By: myleott Differential Revision: D16547679 Pulled By: myleott fbshipit-source-id: 6a6562d9b6215523d048fdf3daee63ffac21e231 --- examples/roberta/README.md | 70 +++++- .../roberta/multiprocessing_bpe_encoder.py | 126 +++++++++++ examples/roberta/preprocess_GLUE_tasks.sh | 187 +++++++++++++++ fairseq/checkpoint_utils.py | 10 +- fairseq/criterions/sentence_prediction.py | 101 +++++++++ fairseq/data/__init__.py | 10 + fairseq/data/concat_sentences_dataset.py | 52 +++++ fairseq/data/offset_tokens_dataset.py | 18 ++ fairseq/data/raw_label_dataset.py | 26 +++ fairseq/data/strip_token_dataset.py | 19 ++ fairseq/data/truncate_dataset.py | 32 +++ fairseq/models/roberta/model.py | 9 + fairseq/tasks/sentence_prediction.py | 212 ++++++++++++++++++ train.py | 31 ++- 14 files changed, 892 insertions(+), 11 deletions(-) create mode 100644 examples/roberta/multiprocessing_bpe_encoder.py create mode 100755 examples/roberta/preprocess_GLUE_tasks.sh create mode 100644 fairseq/criterions/sentence_prediction.py create mode 100644 fairseq/data/concat_sentences_dataset.py create mode 100644 fairseq/data/offset_tokens_dataset.py create mode 100644 fairseq/data/raw_label_dataset.py create mode 100644 fairseq/data/strip_token_dataset.py create mode 100644 fairseq/data/truncate_dataset.py create mode 100644 fairseq/tasks/sentence_prediction.py diff --git a/examples/roberta/README.md b/examples/roberta/README.md index b7661d3784..c01595bfb8 100644 --- a/examples/roberta/README.md +++ b/examples/roberta/README.md @@ -134,9 +134,77 @@ print('| Accuracy: ', float(ncorrect)/float(nsamples)) # Expected output: 0.9060 ``` + ## Finetuning on GLUE tasks -A more detailed tutorial is coming soon. 
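One practical consequence of the length guard added to `extract_features` above: inputs longer than the model's `--max-positions` now fail fast with a `ValueError` instead of erroring deep inside the positional embeddings, so callers are expected to truncate beforehand. A rough sketch (assuming a loaded `roberta` hub model and a long input string `very_long_document`; re-attaching the trailing `</s>` is a choice of this example, not something the API enforces):

```
import torch

tokens = roberta.encode(very_long_document)      # 1-D LongTensor ending in </s>
max_pos = roberta.model.max_positions()
if tokens.size(-1) > max_pos:
    # keep the leading <s> ... prefix and re-attach the trailing </s>
    tokens = torch.cat([tokens[:max_pos - 1], tokens[-1:]])
features = roberta.extract_features(tokens)
```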
+##### 1) Download the data from GLUE website (https://gluebenchmark.com/tasks) using following commands: +``` +$ wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py +$ python download_glue_data.py --data_dir glue_data --tasks all +``` + +##### 2) Preprocess GLUE task data: +``` +$ ./examples/roberta/preprocess_GLUE_tasks.sh glue_data +``` +`glue_task_name` is one of the following: +`{ALL, QQP, MNLI, QNLI, MRPC, RTE, STS-B, SST-2, CoLA}` +Use `ALL` for preprocessing all the glue tasks. + +##### 3) Fine-tuning on GLUE task : +Example fine-tuning cmd for `RTE` task +``` +TOTAL_NUM_UPDATES=2036 # 10 epochs through RTE for bsz 16 +WARMUP_UPDATES=122 # 6 percent of the number of updates +LR=2e-05 # Peak LR for polynomial LR scheduler. +NUM_CLASSES=2 +MAX_SENTENCES=16 # Batch size. + +CUDA_VISIBLE_DEVICES=0 python train.py RTE-bin/ \ +--restore-file \ +--max-positions 512 \ +--max-sentences $MAX_SENTENCES \ +--max-tokens 4400 \ +--task sentence_prediction \ +--reset-optimizer --reset-dataloader --reset-meters \ +--required-batch-size-multiple 1 \ +--init-token 0 --separator-token 2 \ +--arch roberta_large \ +--criterion sentence_prediction \ +--num-classes $NUM_CLASSES \ +--dropout 0.1 --attention-dropout 0.1 \ +--weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ +--clip-norm 0.0 \ +--lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \ +--fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ +--max-epoch 10 \ +--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric; +``` + +For each of the GLUE task, you will need to use following cmd-line arguments: + +Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B +---|---|---|---|---|---|---|---|--- +`--num-classes` | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 1 +`--lr` | 1e-5 | 1e-5 | 1e-5 | 2e-5 | 1e-5 | 1e-5 | 1e-5 | 2e-5 +`--max-sentences` | 32 | 32 | 32 | 16 | 32 | 16 | 16 | 16 +`--total-num-update` | 123873 | 33112 | 113272 | 2036 | 20935 | 2296 | 5336 | 3598 +`--warmup-updates` | 7432 | 1986 | 28318 | 122 | 1256 | 137 | 320 | 214 + +For `STS-B` additionally use following cmd-line argument: +``` +--regression-target +--best-checkpoint-metric loss +``` +and remove `--maximize-best-checkpoint-metric`. + +**Note:** + +a) `--total-num-updates` is used by `--polynomial_decay` scheduler and is calculated for `--max-epoch=10` and `--max-sentences=16/32` depending on the task. + +b) Above cmd-args and hyperparams are tested on one Nvidia `V100` GPU with `32gb` of memory for each task. Depending on the GPU memory resources available to you, you can use increase `--update-freq` and reduce `--max-sentences`. + +c) All the settings in above table are suggested settings based on our hyperparam search within a fixed search space (for careful comparison across models). You might be able to find better metrics with wider hyperparam search. ## Pretraining using your own data diff --git a/examples/roberta/multiprocessing_bpe_encoder.py b/examples/roberta/multiprocessing_bpe_encoder.py new file mode 100644 index 0000000000..48d9cb367e --- /dev/null +++ b/examples/roberta/multiprocessing_bpe_encoder.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
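The `--warmup-updates` values in the table follow the 6%-of-total-updates rule noted in the `RTE` example above, and note (a) ties `--total-num-update` to 10 epochs at the listed batch size. A small helper sketching that arithmetic for a new dataset (the function name and exact rounding are assumptions; use the table values for the GLUE tasks themselves):

```
import math

def polynomial_decay_schedule(num_train_examples, batch_size, epochs=10, warmup_frac=0.06):
    """Derive --total-num-update and --warmup-updates from the conventions above."""
    updates_per_epoch = math.ceil(num_train_examples / batch_size)
    total_updates = updates_per_epoch * epochs
    warmup_updates = int(round(warmup_frac * total_updates))
    return total_updates, warmup_updates

# e.g. a hypothetical task with 20,000 training examples and --max-sentences=32
print(polynomial_decay_schedule(20000, 32))   # -> (6250, 375)
```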
+ +import argparse +import contextlib +import sys + +from collections import Counter +from multiprocessing import Pool + +from fairseq.data.encoders.gpt2_bpe import get_encoder + + +def main(): + """ + Helper script to encode raw text + with the GPT-2 BPE using multiple processes. + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--encoder-json", + help='path to encoder.json', + ) + parser.add_argument( + "--vocab-bpe", + type=str, + help='path to vocab.bpe', + ) + parser.add_argument( + "--inputs", + nargs="+", + default=['-'], + help="input files to filter/encode", + ) + parser.add_argument( + "--outputs", + nargs="+", + default=['-'], + help="path to save encoded outputs", + ) + parser.add_argument( + "--keep-empty", + action="store_true", + help="keep empty lines", + ) + parser.add_argument("--workers", type=int, default=20) + args = parser.parse_args() + + assert len(args.inputs) == len(args.outputs), \ + "number of input and output paths should match" + + with contextlib.ExitStack() as stack: + inputs = [ + stack.enter_context(open(input, "r", encoding="utf-8")) + if input != "-" else sys.stdin + for input in args.inputs + ] + outputs = [ + stack.enter_context(open(output, "w", encoding="utf-8")) + if output != "-" else sys.stdout + for output in args.outputs + ] + + encoder = MultiprocessingEncoder(args) + pool = Pool(args.workers, initializer=encoder.initializer) + encoded_lines = pool.imap(encoder.encode_lines, zip(*inputs), 100) + + stats = Counter() + for i, (filt, enc_lines) in enumerate(encoded_lines, start=1): + if filt == "PASS": + for enc_line, output_h in zip(enc_lines, outputs): + print(enc_line, file=output_h) + else: + stats["num_filtered_" + filt] += 1 + if i % 10000 == 0: + print("processed {} lines".format(i), file=sys.stderr) + + for k, v in stats.most_common(): + print("[{}] filtered {} lines".format(k, v), file=sys.stderr) + + +class MultiprocessingEncoder(object): + + def __init__(self, args): + self.args = args + + def initializer(self): + global bpe + bpe = get_encoder(self.args.encoder_json, self.args.vocab_bpe) + + def encode(self, line): + global bpe + ids = bpe.encode(line) + return list(map(str, ids)) + + def decode(self, tokens): + global bpe + return bpe.decode(tokens) + + def encode_lines(self, lines): + """ + Encode a set of lines. All lines will be encoded together. + """ + enc_lines = [] + for line in lines: + line = line.strip() + if len(line) == 0 and not self.args.keep_empty: + return ["EMPTY", None] + tokens = self.encode(line) + enc_lines.append(" ".join(tokens)) + return ["PASS", enc_lines] + + def decode_lines(self, lines): + dec_lines = [] + for line in lines: + tokens = map(int, line.strip().split()) + dec_lines.append(self.decode(tokens)) + return ["PASS", dec_lines] + + +if __name__ == "__main__": + main() diff --git a/examples/roberta/preprocess_GLUE_tasks.sh b/examples/roberta/preprocess_GLUE_tasks.sh new file mode 100755 index 0000000000..33fcd8f4f5 --- /dev/null +++ b/examples/roberta/preprocess_GLUE_tasks.sh @@ -0,0 +1,187 @@ +#!/bin/bash +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. 
+ + +# raw glue data as downloaded by glue download script (https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) +if [[ $# -ne 2 ]]; then + echo "Run as following:" + echo "./examples/roberta/preprocess_GLUE_tasks.sh " + exit 1 +fi + +GLUE_DATA_FOLDER=$1 + +# download bpe encoder.json, vocabulary and fairseq dictionary +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json' +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe' +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt' + +TASKS=$2 # QQP + +if [ "$TASKS" = "ALL" ] +then + TASKS="QQP MNLI QNLI MRPC RTE STS-B SST-2 CoLA" +fi + +for TASK in $TASKS +do + echo "Preprocessing $TASK" + + TASK_DATA_FOLDER="$GLUE_DATA_FOLDER/$TASK" + echo "Raw data as downloaded from glue website: $TASK_DATA_FOLDER" + + SPLITS="train dev test" + INPUT_COUNT=2 + if [ "$TASK" = "QQP" ] + then + INPUT_COLUMNS=( 4 5 ) + TEST_INPUT_COLUMNS=( 2 3 ) + LABEL_COLUMN=6 + elif [ "$TASK" = "MNLI" ] + then + SPLITS="train dev_matched dev_mismatched test_matched test_mismatched" + INPUT_COLUMNS=( 9 10 ) + TEST_INPUT_COLUMNS=( 9 10 ) + DEV_LABEL_COLUMN=16 + LABEL_COLUMN=12 + elif [ "$TASK" = "QNLI" ] + then + INPUT_COLUMNS=( 2 3 ) + TEST_INPUT_COLUMNS=( 2 3 ) + LABEL_COLUMN=4 + elif [ "$TASK" = "MRPC" ] + then + INPUT_COLUMNS=( 4 5 ) + TEST_INPUT_COLUMNS=( 4 5 ) + LABEL_COLUMN=1 + elif [ "$TASK" = "RTE" ] + then + INPUT_COLUMNS=( 2 3 ) + TEST_INPUT_COLUMNS=( 2 3 ) + LABEL_COLUMN=4 + elif [ "$TASK" = "STS-B" ] + then + INPUT_COLUMNS=( 8 9 ) + TEST_INPUT_COLUMNS=( 8 9 ) + LABEL_COLUMN=10 + # Following are single sentence tasks. + elif [ "$TASK" = "SST-2" ] + then + INPUT_COLUMNS=( 1 ) + TEST_INPUT_COLUMNS=( 2 ) + LABEL_COLUMN=2 + INPUT_COUNT=1 + elif [ "$TASK" = "CoLA" ] + then + INPUT_COLUMNS=( 4 ) + TEST_INPUT_COLUMNS=( 2 ) + LABEL_COLUMN=2 + INPUT_COUNT=1 + fi + + # Strip out header and filter lines that don't have expected number of fields. + rm -rf "$TASK_DATA_FOLDER/processed" + mkdir "$TASK_DATA_FOLDER/processed" + for SPLIT in $SPLITS + do + # CoLA train and dev doesn't have header. + if [[ ( "$TASK" = "CoLA") && ( "$SPLIT" != "test" ) ]] + then + cp "$TASK_DATA_FOLDER/$SPLIT.tsv" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp"; + else + tail -n +2 "$TASK_DATA_FOLDER/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp"; + fi + + # Remove unformatted lines from train and dev files for QQP dataset. 
+ if [[ ( "$TASK" = "QQP") && ( "$SPLIT" != "test" ) ]] + then + awk -F '\t' -v NUM_FIELDS=6 'NF==NUM_FIELDS{print}{}' "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp" > "$TASK_DATA_FOLDER/processed/$SPLIT.tsv"; + else + cp "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv"; + fi + rm "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp"; + done + + # Split into input0, input1 and label + for SPLIT in $SPLITS + do + for INPUT_TYPE in $(seq 0 $((INPUT_COUNT-1))) + do + if [[ "$SPLIT" != test* ]] + then + COLUMN_NUMBER=${INPUT_COLUMNS[$INPUT_TYPE]} + else + COLUMN_NUMBER=${TEST_INPUT_COLUMNS[$INPUT_TYPE]} + fi + cut -f"$COLUMN_NUMBER" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.raw.input$INPUT_TYPE"; + done + + if [[ "$SPLIT" != test* ]] + then + if [ "$TASK" = "MNLI" ] && [ "$SPLIT" != "train" ] + then + cut -f"$DEV_LABEL_COLUMN" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.label"; + else + cut -f"$LABEL_COLUMN" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.label"; + fi + fi + + # BPE encode. + for INPUT_TYPE in $(seq 0 $((INPUT_COUNT-1))) + do + LANG="input$INPUT_TYPE" + echo "BPE encoding $SPLIT/$LANG" + python -m examples.roberta.multiprocessing_bpe_encoder \ + --encoder-json encoder.json \ + --vocab-bpe vocab.bpe \ + --inputs "$TASK_DATA_FOLDER/processed/$SPLIT.raw.$LANG" \ + --outputs "$TASK_DATA_FOLDER/processed/$SPLIT.$LANG" \ + --workers 60 \ + --keep-empty; + done + done + + # Remove output directory. + rm -rf "$TASK-bin" + + DEVPREF="$TASK_DATA_FOLDER/processed/dev.LANG" + TESTPREF="$TASK_DATA_FOLDER/processed/test.LANG" + if [ "$TASK" = "MNLI" ] + then + DEVPREF="$TASK_DATA_FOLDER/processed/dev_matched.LANG,$TASK_DATA_FOLDER/processed/dev_mismatched.LANG" + TESTPREF="$TASK_DATA_FOLDER/processed/test_matched.LANG,$TASK_DATA_FOLDER/processed/test_mismatched.LANG" + fi + + # Run fairseq preprocessing: + for INPUT_TYPE in $(seq 0 $((INPUT_COUNT-1))) + do + LANG="input$INPUT_TYPE" + python preprocess.py \ + --only-source \ + --trainpref "$TASK_DATA_FOLDER/processed/train.$LANG" \ + --validpref "${DEVPREF//LANG/$LANG}" \ + --testpref "${TESTPREF//LANG/$LANG}" \ + --destdir "$TASK-bin/$LANG" \ + --workers 60 \ + --srcdict dict.txt; + done + if [[ "$TASK" != "STS-B" ]] + then + python preprocess.py \ + --only-source \ + --trainpref "$TASK_DATA_FOLDER/processed/train.label" \ + --validpref "${DEVPREF//LANG/'label'}" \ + --destdir "$TASK-bin/label" \ + --workers 60; + else + # For STS-B output range is converted to be between: [0.0, 1.0] + mkdir "$TASK-bin/label" + awk '{print $1 / 5.0 }' "$TASK_DATA_FOLDER/processed/train.label" > "$TASK-bin/label/train.label" + awk '{print $1 / 5.0 }' "$TASK_DATA_FOLDER/processed/dev.label" > "$TASK-bin/label/valid.label" + fi +done diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py index 4696875498..3e2fcbda7c 100644 --- a/fairseq/checkpoint_utils.py +++ b/fairseq/checkpoint_utils.py @@ -24,11 +24,16 @@ def save_checkpoint(args, trainer, epoch_itr, val_loss): from fairseq import distributed_utils, meters + prev_best = getattr(save_checkpoint, 'best', val_loss) + if val_loss is not None: + best_function = max if args.maximize_best_checkpoint_metric else min + save_checkpoint.best = best_function(val_loss, prev_best) + if args.no_save or not distributed_utils.is_master(args): return def is_better(a, b): - return a > b if args.maximize_best_checkpoint_metric else a < b + return a >= b if 
args.maximize_best_checkpoint_metric else a <= b write_timer = meters.StopwatchMeter() write_timer.start() @@ -52,9 +57,6 @@ def is_better(a, b): ) checkpoint_conds['checkpoint_last.pt'] = not args.no_last_checkpoints - prev_best = getattr(save_checkpoint, 'best', val_loss) - if val_loss is not None: - save_checkpoint.best = val_loss if is_better(val_loss, prev_best) else prev_best extra_state = { 'train_iterator': epoch_itr.state_dict(), 'val_loss': val_loss, diff --git a/fairseq/criterions/sentence_prediction.py b/fairseq/criterions/sentence_prediction.py new file mode 100644 index 0000000000..9b4a2d1815 --- /dev/null +++ b/fairseq/criterions/sentence_prediction.py @@ -0,0 +1,101 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import math + +import torch +import torch.nn.functional as F + +from fairseq import utils + +from . import FairseqCriterion, register_criterion + + +@register_criterion('sentence_prediction') +class SentencePredictionCriterion(FairseqCriterion): + + @staticmethod + def add_args(parser): + # fmt: off + parser.add_argument('--save-predictions', metavar='FILE', + help='file to save predictions to') + # fmt: on + + def forward(self, model, sample, reduce=True): + """Compute the loss for the given sample. + + Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + features, extra = model(**sample['net_input'], features_only=True) + padding_mask = sample['net_input']['src_tokens'].eq(self.padding_idx) + + assert hasattr(model, 'classification_heads') and \ + 'sentence_classification_head' in model.classification_heads, \ + "model must provide sentence classification head for --criterion=sentence_prediction" + + logits = model.classification_heads['sentence_classification_head']( + features, + padding_mask=padding_mask, + ) + + targets = model.get_targets(sample, [logits]).view(-1) + sample_size = targets.numel() + + if not self.args.regression_target: + loss = F.nll_loss( + F.log_softmax(logits, dim=-1, dtype=torch.float32), + targets, + reduction='sum', + ) + else: + logits = logits.squeeze().float() + targets = targets.float() + loss = F.mse_loss( + logits, + targets, + reduction='sum', + ) + + logging_output = { + 'loss': utils.item(loss.data) if reduce else loss.data, + 'ntokens': sample['ntokens'], + 'nsentences': sample_size, + 'sample_size': sample_size, + } + + if not self.args.regression_target: + preds = logits.max(dim=1)[1] + logging_output.update( + ncorrect=(preds == targets).sum().item() + ) + return loss, sample_size, logging_output + + @staticmethod + def aggregate_logging_outputs(logging_outputs): + """Aggregate logging outputs from data parallel training.""" + loss_sum = sum(log.get('loss', 0) for log in logging_outputs) + ntokens = sum(log.get('ntokens', 0) for log in logging_outputs) + nsentences = sum(log.get('nsentences', 0) for log in logging_outputs) + sample_size = sum(log.get('sample_size', 0) for log in logging_outputs) + + agg_output = { + 'loss': loss_sum / sample_size / math.log(2), + 'ntokens': ntokens, + 'nsentences': nsentences, + 'sample_size': sample_size, + } + + if len(logging_outputs) > 0 and 'ncorrect' in logging_outputs[0]: + ncorrect = sum(log.get('ncorrect', 0) for 
log in logging_outputs) + agg_output.update(accuracy=ncorrect/nsentences) + + if sample_size != ntokens: + agg_output['nll_loss'] = loss_sum / ntokens / math.log(2) + return agg_output diff --git a/fairseq/data/__init__.py b/fairseq/data/__init__.py index 7ff95af472..14e9770ae9 100644 --- a/fairseq/data/__init__.py +++ b/fairseq/data/__init__.py @@ -14,6 +14,7 @@ from .audio.raw_audio_dataset import RawAudioDataset from .backtranslation_dataset import BacktranslationDataset from .concat_dataset import ConcatDataset +from .concat_sentences_dataset import ConcatSentencesDataset from .id_dataset import IdDataset from .indexed_dataset import IndexedCachedDataset, IndexedDataset, IndexedRawTextDataset, MMapIndexedDataset from .language_pair_dataset import LanguagePairDataset @@ -25,13 +26,17 @@ from .noising import NoisingDataset from .numel_dataset import NumelDataset from .num_samples_dataset import NumSamplesDataset +from .offset_tokens_dataset import OffsetTokensDataset from .pad_dataset import LeftPadDataset, PadDataset, RightPadDataset from .prepend_token_dataset import PrependTokenDataset +from .raw_label_dataset import RawLabelDataset from .round_robin_zip_datasets import RoundRobinZipDatasets from .sort_dataset import SortDataset +from .strip_token_dataset import StripTokenDataset from .token_block_dataset import TokenBlockDataset from .transform_eos_dataset import TransformEosDataset from .transform_eos_lang_pair_dataset import TransformEosLangPairDataset +from .truncate_dataset import TruncateDataset from .iterators import ( CountingIterator, @@ -44,6 +49,7 @@ 'BacktranslationDataset', 'BaseWrapperDataset', 'ConcatDataset', + 'ConcatSentencesDataset', 'CountingIterator', 'Dictionary', 'EpochBatchIterator', @@ -64,15 +70,19 @@ 'NoisingDataset', 'NumelDataset', 'NumSamplesDataset', + "OffsetTokensDataset", 'PadDataset', 'PrependTokenDataset', 'RawAudioDataset', + "RawLabelDataset", 'RightPadDataset', 'RoundRobinZipDatasets', 'ShardedIterator', 'SortDataset', + "StripTokenDataset", 'TokenBlockDataset', 'TransformEosDataset', 'TransformEosLangPairDataset', + "TruncateDataset", 'TruncatedDictionary', ] diff --git a/fairseq/data/concat_sentences_dataset.py b/fairseq/data/concat_sentences_dataset.py new file mode 100644 index 0000000000..342018f096 --- /dev/null +++ b/fairseq/data/concat_sentences_dataset.py @@ -0,0 +1,52 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import torch + +from . 
import FairseqDataset + + +class ConcatSentencesDataset(FairseqDataset): + + def __init__(self, *datasets): + super().__init__() + self.datasets = datasets + assert all(len(ds) == len(datasets[0]) for ds in datasets), \ + 'datasets must have the same length' + + def __getitem__(self, index): + return torch.cat([ds[index] for ds in self.datasets]) + + def __len__(self): + return len(self.datasets[0]) + + def collater(self, samples): + return self.datasets[0].collater(samples) + + @property + def sizes(self): + return sum(ds.sizes for ds in self.datasets) + + def num_tokens(self, index): + return sum(ds.num_tokens(index) for ds in self.datasets) + + def size(self, index): + return sum(ds.size(index) for ds in self.datasets) + + def ordered_indices(self): + return self.datasets[0].ordered_indices() + + @property + def supports_prefetch(self): + return any( + getattr(ds, 'supports_prefetch', False) for ds in self.datasets + ) + + def prefetch(self, indices): + for ds in self.datasets: + if getattr(ds, 'supports_prefetch', False): + ds.prefetch(indices) diff --git a/fairseq/data/offset_tokens_dataset.py b/fairseq/data/offset_tokens_dataset.py new file mode 100644 index 0000000000..7a947f66ed --- /dev/null +++ b/fairseq/data/offset_tokens_dataset.py @@ -0,0 +1,18 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +from . import BaseWrapperDataset + + +class OffsetTokensDataset(BaseWrapperDataset): + + def __init__(self, dataset, offset): + super().__init__(dataset) + self.offset = offset + + def __getitem__(self, idx): + return self.dataset[idx] + self.offset diff --git a/fairseq/data/raw_label_dataset.py b/fairseq/data/raw_label_dataset.py new file mode 100644 index 0000000000..5f7cc0e43c --- /dev/null +++ b/fairseq/data/raw_label_dataset.py @@ -0,0 +1,26 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import torch + +from . import FairseqDataset + + +class RawLabelDataset(FairseqDataset): + + def __init__(self, labels): + super().__init__() + self.labels = labels + + def __getitem__(self, index): + return self.labels[index] + + def __len__(self): + return len(self.labels) + + def collater(self, samples): + return torch.tensor(samples) diff --git a/fairseq/data/strip_token_dataset.py b/fairseq/data/strip_token_dataset.py new file mode 100644 index 0000000000..eeb48ae600 --- /dev/null +++ b/fairseq/data/strip_token_dataset.py @@ -0,0 +1,19 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +from . 
import BaseWrapperDataset + + +class StripTokenDataset(BaseWrapperDataset): + + def __init__(self, dataset, id_to_strip): + super().__init__(dataset) + self.id_to_strip = id_to_strip + + def __getitem__(self, index): + item = self.dataset[index] + return item[item.ne(self.id_to_strip)] diff --git a/fairseq/data/truncate_dataset.py b/fairseq/data/truncate_dataset.py new file mode 100644 index 0000000000..0e350e407f --- /dev/null +++ b/fairseq/data/truncate_dataset.py @@ -0,0 +1,32 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import numpy as np + +from . import BaseWrapperDataset + + +class TruncateDataset(BaseWrapperDataset): + + def __init__(self, dataset, truncation_length): + super().__init__(dataset) + self.truncation_length = truncation_length + self.dataset = dataset + + def __getitem__(self, index): + item = self.dataset[index] + item_len = item.size(0) + if item_len > self.truncation_length: + item = item[:self.truncation_length] + return item + + @property + def sizes(self): + return np.minimum(self.dataset.sizes, self.truncation_length) + + def __len__(self): + return len(self.dataset) diff --git a/fairseq/models/roberta/model.py b/fairseq/models/roberta/model.py index e7c6d5b7fc..c8794b2607 100644 --- a/fairseq/models/roberta/model.py +++ b/fairseq/models/roberta/model.py @@ -134,6 +134,15 @@ def upgrade_state_dict_named(self, state_dict, name): ].size(0) self.register_classification_head(head_name, num_classes, inner_dim) + # Copy any newly-added classification heads into the state dict + # with their current weights. + if hasattr(self, 'classification_heads'): + cur_state = self.classification_heads.state_dict() + for k, v in cur_state.items(): + if prefix + 'classification_heads.' + k not in state_dict: + print('Overwriting', prefix + 'classification_heads.' + k) + state_dict[prefix + 'classification_heads.' + k] = v + class RobertaLMHead(nn.Module): """Head for masked language modeling.""" diff --git a/fairseq/tasks/sentence_prediction.py b/fairseq/tasks/sentence_prediction.py new file mode 100644 index 0000000000..0f54ef81f1 --- /dev/null +++ b/fairseq/tasks/sentence_prediction.py @@ -0,0 +1,212 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import os + +import numpy as np + +from fairseq.data import ( + ConcatSentencesDataset, + data_utils, + Dictionary, + IdDataset, + NestedDictionaryDataset, + NumSamplesDataset, + NumelDataset, + OffsetTokensDataset, + PrependTokenDataset, + RawLabelDataset, + RightPadDataset, + SortDataset, + StripTokenDataset, + TruncateDataset, +) + +from . import FairseqTask, register_task + + +@register_task('sentence_prediction') +class SentencePredictionTask(FairseqTask): + """ + Sentence (or sentence pair) prediction (classification or regression) task. 
+ + Args: + dictionary (Dictionary): the dictionary for the input of the task + """ + + @staticmethod + def add_args(parser): + """Add task-specific arguments to the parser.""" + parser.add_argument('data', metavar='FILE', + help='file prefix for data') + parser.add_argument('--max-positions', type=int, default=512, + help='max input length') + parser.add_argument('--num-classes', type=int, default=-1, + help='number of classes') + parser.add_argument('--init-token', type=int, default=None, + help='add token at the beginning of each batch item') + parser.add_argument('--separator-token', type=int, default=None, + help='add separator token between inputs') + parser.add_argument('--regression-target', action='store_true', default=False) + parser.add_argument('--no-shuffle', action='store_true', default=False) + parser.add_argument('--truncate-sequence', action='store_true', default=False, + help='Truncate sequence to max_sequence_length') + + def __init__(self, args, data_dictionary, label_dictionary): + super().__init__(args) + self.dictionary = data_dictionary + self.label_dictionary = label_dictionary + + @classmethod + def load_dictionary(cls, args, filename, source=True): + """Load the dictionary from the filename + + Args: + filename (str): the filename + """ + dictionary = Dictionary.load(filename) + dictionary.add_symbol('') + return dictionary + + @classmethod + def setup_task(cls, args, **kwargs): + assert args.num_classes > 0, 'Must set --num-classes' + + args.tokens_per_sample = args.max_positions + + # load data dictionary + data_dict = cls.load_dictionary( + args, + os.path.join(args.data, 'input0', 'dict.txt'), + source=True, + ) + print('| [input] dictionary: {} types'.format(len(data_dict))) + + label_dict = None + if not args.regression_target: + # load label dictionary + label_dict = cls.load_dictionary( + args, + os.path.join(args.data, 'label', 'dict.txt'), + source=False, + ) + print('| [label] dictionary: {} types'.format(len(label_dict))) + else: + label_dict = data_dict + return SentencePredictionTask(args, data_dict, label_dict) + + def load_dataset(self, split, combine=False, **kwargs): + """Load a given dataset split (e.g., train, valid, test).""" + def get_path(type, split): + return os.path.join(self.args.data, type, split) + + def make_dataset(type, dictionary): + split_path = get_path(type, split) + + dataset = data_utils.load_indexed_dataset( + split_path, + self.source_dictionary, + self.args.dataset_impl, + combine=combine, + ) + return dataset + + input0 = make_dataset('input0', self.source_dictionary) + assert input0 is not None, 'could not find dataset: {}'.format(get_path(type, split)) + input1 = make_dataset('input1', self.source_dictionary) + + if self.args.init_token is not None: + input0 = PrependTokenDataset(input0, self.args.init_token) + + if input1 is None: + src_tokens = input0 + else: + if self.args.separator_token is not None: + input1 = PrependTokenDataset(input1, self.args.separator_token) + + src_tokens = ConcatSentencesDataset(input0, input1) + + with data_utils.numpy_seed(self.args.seed): + shuffle = np.random.permutation(len(src_tokens)) + + if self.args.truncate_sequence: + src_tokens = TruncateDataset(src_tokens, self.args.max_positions) + + dataset = { + 'id': IdDataset(), + 'net_input': { + 'src_tokens': RightPadDataset( + src_tokens, + pad_idx=self.source_dictionary.pad(), + ), + 'src_lengths': NumelDataset(src_tokens, reduce=False), + }, + 'nsentences': NumSamplesDataset(), + 'ntokens': NumelDataset(src_tokens, reduce=True), + } + 
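The label branch that follows relies on the `StripTokenDataset` and `OffsetTokensDataset` wrappers introduced earlier in this patch: binarized label files encode each label symbol plus a trailing EOS, so stripping the EOS and subtracting `Dictionary.nspecial` yields 0-based class indices. A toy illustration of that arithmetic (the label symbols and their order here are made up for the example; real indices depend on the binarized label dictionary):

```
from fairseq.data import Dictionary

label_dict = Dictionary()                      # special symbols occupy the first ids
for sym in ['contradiction', 'entailment', 'neutral']:
    label_dict.add_symbol(sym)

toks = label_dict.encode_line('entailment', append_eos=True)
toks = toks[toks.ne(label_dict.eos())]         # what StripTokenDataset does
target = toks - label_dict.nspecial            # what OffsetTokensDataset(offset=-nspecial) does
print(int(target))                             # -> 1, a 0-based class index
```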
+ if not self.args.regression_target: + label_dataset = make_dataset('label', self.target_dictionary) + if label_dataset is not None: + dataset.update( + target=OffsetTokensDataset( + StripTokenDataset( + label_dataset, + id_to_strip=self.target_dictionary.eos(), + ), + offset=-self.target_dictionary.nspecial, + ) + ) + else: + label_path = f"{get_path('label', split)}.label" + if os.path.exists(label_path): + dataset.update( + target=RawLabelDataset([ + float(x.strip()) for x in open(label_path).readlines() + ]) + ) + + nested_dataset = NestedDictionaryDataset( + dataset, + sizes=[src_tokens.sizes], + ) + + if self.args.no_shuffle: + dataset = nested_dataset + else: + dataset = SortDataset( + nested_dataset, + # shuffle + sort_order=[shuffle], + ) + + print(f"| Loaded {split} with #samples: {len(dataset)}") + + self.datasets[split] = dataset + return self.datasets[split] + + def build_model(self, args): + from fairseq import models + model = models.build_model(args, self) + + model.register_classification_head( + 'sentence_classification_head', + num_classes=self.args.num_classes, + ) + + return model + + def max_positions(self): + return self.args.max_positions + + @property + def source_dictionary(self): + return self.dictionary + + @property + def target_dictionary(self): + return self.label_dictionary diff --git a/train.py b/train.py index 9531829625..c4a95023c3 100644 --- a/train.py +++ b/train.py @@ -130,7 +130,7 @@ def train(args, trainer, task, epoch_itr): for k, v in log_output.items(): if k in ['loss', 'nll_loss', 'ntokens', 'nsentences', 'sample_size']: continue # these are already logged above - if 'loss' in k: + if 'loss' in k or k == 'accuracy': extra_meters[k].update(v, log_output['sample_size']) else: extra_meters[k].update(v) @@ -236,16 +236,20 @@ def validate(args, trainer, task, epoch_itr, subsets): extra_meters[k].update(v) # log validation stats - stats = get_valid_stats(trainer) + stats = get_valid_stats(trainer, args, extra_meters) for k, meter in extra_meters.items(): stats[k] = meter.avg progress.print(stats, tag=subset, step=trainer.get_num_updates()) - valid_losses.append(stats[args.best_checkpoint_metric].avg) + valid_losses.append( + stats[args.best_checkpoint_metric].avg + if args.best_checkpoint_metric == 'loss' + else stats[args.best_checkpoint_metric] + ) return valid_losses -def get_valid_stats(trainer): +def get_valid_stats(trainer, args, extra_meters=None): stats = collections.OrderedDict() stats['loss'] = trainer.get_meter('valid_loss') if trainer.get_meter('valid_nll_loss').count > 0: @@ -256,8 +260,23 @@ def get_valid_stats(trainer): stats['ppl'] = utils.get_perplexity(nll_loss.avg) stats['num_updates'] = trainer.get_num_updates() if hasattr(checkpoint_utils.save_checkpoint, 'best'): - stats['best_loss'] = min( - checkpoint_utils.save_checkpoint.best, stats['loss'].avg) + key = f'best_{args.best_checkpoint_metric}' + best_function = max if args.maximize_best_checkpoint_metric else min + + current_metric = None + if args.best_checkpoint_metric == 'loss': + current_metric = stats['loss'].avg + elif args.best_checkpoint_metric in extra_meters: + current_metric = extra_meters[args.best_checkpoint_metric].avg + elif args.best_checkpoint_metric in stats: + current_metric = stats[args.best_checkpoint_metric] + else: + raise ValueError("best_checkpoint_metric not found in logs") + + stats[key] = best_function( + checkpoint_utils.save_checkpoint.best, + current_metric, + ) return stats From c132b9b931893257ecb0ac6351969ab4142d6782 Mon Sep 17 00:00:00 2001 
From: Myle Ott Date: Tue, 30 Jul 2019 05:30:22 -0700 Subject: [PATCH 042/213] Fix tokenization (fixes #926) (#929) Summary: Fixes https://github.com/pytorch/fairseq/issues/926 Pull Request resolved: https://github.com/pytorch/fairseq/pull/929 Differential Revision: D16560281 Pulled By: myleott fbshipit-source-id: 751051bcdbf25207315bb05f5bee0235d21be627 --- fairseq/models/roberta/hub_interface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fairseq/models/roberta/hub_interface.py b/fairseq/models/roberta/hub_interface.py index f7eb6277b1..a2654febb7 100644 --- a/fairseq/models/roberta/hub_interface.py +++ b/fairseq/models/roberta/hub_interface.py @@ -36,8 +36,8 @@ def device(self): def encode(self, sentence: str, *addl_sentences) -> torch.LongTensor: bpe_sentence = ' ' + self.bpe.encode(sentence) + ' ' for s in addl_sentences: - bpe_sentence += ' ' + self.bpe.encode(s) - tokens = self.task.source_dictionary.encode_line(bpe_sentence, append_eos=True) + bpe_sentence += ' ' + self.bpe.encode(s) + ' ' + tokens = self.task.source_dictionary.encode_line(bpe_sentence, append_eos=False) return tokens.long() def extract_features(self, tokens: torch.LongTensor, return_all_hiddens=False) -> torch.Tensor: From e75cff5f2c1d62f12dc911e0bf420025eb1a4e33 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Tue, 30 Jul 2019 07:45:13 -0700 Subject: [PATCH 043/213] Relicense fairseq under MIT license (#786) Summary: The previous BSD+PATENTS license was controversial. We have been approved to relicense fairseq under the MIT license. Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/786 Differential Revision: D16560654 Pulled By: myleott fbshipit-source-id: f78b1beb4f2895dd7b9bfc79f5f952a2bfb94034 --- CONTRIBUTING.md | 10 ++--- LICENSE | 43 ++++++++----------- PATENTS | 33 -------------- README.md | 4 +- eval_lm.py | 8 ++-- examples/roberta/preprocess_GLUE_tasks.sh | 8 ++-- examples/translation_moe/score.py | 8 ++-- fairseq/__init__.py | 8 ++-- fairseq/binarizer.py | 8 ++-- fairseq/bleu.py | 8 ++-- fairseq/checkpoint_utils.py | 8 ++-- fairseq/criterions/__init__.py | 8 ++-- fairseq/criterions/adaptive_loss.py | 8 ++-- fairseq/criterions/binary_cross_entropy.py | 8 ++-- fairseq/criterions/composite_loss.py | 8 ++-- fairseq/criterions/cross_entropy.py | 8 ++-- fairseq/criterions/fairseq_criterion.py | 8 ++-- .../label_smoothed_cross_entropy.py | 8 ++-- fairseq/criterions/legacy_masked_lm.py | 8 ++-- fairseq/criterions/masked_lm.py | 8 ++-- fairseq/criterions/sentence_prediction.py | 8 ++-- fairseq/data/__init__.py | 8 ++-- fairseq/data/audio/raw_audio_dataset.py | 8 ++-- fairseq/data/backtranslation_dataset.py | 8 ++-- fairseq/data/base_wrapper_dataset.py | 8 ++-- fairseq/data/concat_dataset.py | 8 ++-- fairseq/data/concat_sentences_dataset.py | 8 ++-- fairseq/data/data_utils.py | 8 ++-- fairseq/data/dictionary.py | 8 ++-- fairseq/data/encoders/__init__.py | 8 ++-- fairseq/data/encoders/gpt2_bpe.py | 8 ++-- fairseq/data/encoders/moses_tokenizer.py | 8 ++-- fairseq/data/encoders/nltk_tokenizer.py | 8 ++-- fairseq/data/encoders/sentencepiece_bpe.py | 8 ++-- fairseq/data/encoders/space_tokenizer.py | 8 ++-- fairseq/data/encoders/subword_nmt_bpe.py | 8 ++-- fairseq/data/fairseq_dataset.py | 8 ++-- fairseq/data/id_dataset.py | 8 ++-- fairseq/data/indexed_dataset.py | 8 ++-- fairseq/data/iterators.py | 8 ++-- fairseq/data/language_pair_dataset.py | 8 ++-- fairseq/data/legacy/__init__.py | 8 ++-- fairseq/data/legacy/block_pair_dataset.py | 8 ++-- 
fairseq/data/legacy/masked_lm_dataset.py | 8 ++-- fairseq/data/legacy/masked_lm_dictionary.py | 8 ++-- fairseq/data/lm_context_window_dataset.py | 8 ++-- fairseq/data/lru_cache_dataset.py | 8 ++-- fairseq/data/mask_tokens_dataset.py | 8 ++-- fairseq/data/monolingual_dataset.py | 8 ++-- fairseq/data/multi_corpus_sampled_dataset.py | 8 ++-- fairseq/data/nested_dictionary_dataset.py | 8 ++-- fairseq/data/noising.py | 8 ++-- fairseq/data/num_samples_dataset.py | 8 ++-- fairseq/data/numel_dataset.py | 8 ++-- fairseq/data/offset_tokens_dataset.py | 8 ++-- fairseq/data/pad_dataset.py | 8 ++-- fairseq/data/prepend_token_dataset.py | 8 ++-- fairseq/data/raw_label_dataset.py | 8 ++-- fairseq/data/round_robin_zip_datasets.py | 8 ++-- fairseq/data/sort_dataset.py | 8 ++-- fairseq/data/strip_token_dataset.py | 8 ++-- fairseq/data/token_block_dataset.py | 8 ++-- fairseq/data/transform_eos_dataset.py | 8 ++-- .../data/transform_eos_lang_pair_dataset.py | 8 ++-- fairseq/data/truncate_dataset.py | 8 ++-- fairseq/distributed_utils.py | 8 ++-- fairseq/file_utils.py | 8 ++-- fairseq/hub_utils.py | 8 ++-- fairseq/legacy_distributed_data_parallel.py | 8 ++-- fairseq/meters.py | 8 ++-- fairseq/models/__init__.py | 8 ++-- fairseq/models/composite_encoder.py | 8 ++-- fairseq/models/distributed_fairseq_model.py | 8 ++-- fairseq/models/fairseq_decoder.py | 8 ++-- fairseq/models/fairseq_encoder.py | 8 ++-- fairseq/models/fairseq_incremental_decoder.py | 8 ++-- fairseq/models/fairseq_model.py | 8 ++-- fairseq/models/fconv.py | 8 ++-- fairseq/models/fconv_lm.py | 8 ++-- fairseq/models/fconv_self_att.py | 8 ++-- fairseq/models/lightconv.py | 8 ++-- fairseq/models/lightconv_lm.py | 8 ++-- fairseq/models/lstm.py | 8 ++-- fairseq/models/masked_lm.py | 8 ++-- fairseq/models/multilingual_transformer.py | 8 ++-- fairseq/models/roberta/__init__.py | 8 ++-- fairseq/models/roberta/hub_interface.py | 8 ++-- fairseq/models/roberta/model.py | 8 ++-- fairseq/models/transformer.py | 8 ++-- .../models/transformer_from_pretrained_xlm.py | 8 ++-- fairseq/models/transformer_lm.py | 8 ++-- fairseq/models/wav2vec.py | 8 ++-- fairseq/modules/__init__.py | 8 ++-- fairseq/modules/adaptive_input.py | 8 ++-- fairseq/modules/adaptive_softmax.py | 8 ++-- fairseq/modules/beamable_mm.py | 8 ++-- fairseq/modules/character_token_embedder.py | 8 ++-- fairseq/modules/conv_tbc.py | 8 ++-- .../downsampled_multihead_attention.py | 8 ++-- fairseq/modules/dynamic_convolution.py | 8 ++-- fairseq/modules/gelu.py | 8 ++-- fairseq/modules/grad_multiply.py | 8 ++-- fairseq/modules/highway.py | 8 ++-- fairseq/modules/layer_norm.py | 8 ++-- .../modules/learned_positional_embedding.py | 8 ++-- fairseq/modules/lightweight_convolution.py | 8 ++-- fairseq/modules/linearized_convolution.py | 8 ++-- fairseq/modules/logsumexp_moe.py | 8 ++-- fairseq/modules/mean_pool_gating_network.py | 8 ++-- fairseq/modules/multihead_attention.py | 8 ++-- fairseq/modules/positional_embedding.py | 8 ++-- fairseq/modules/scalar_bias.py | 8 ++-- .../sinusoidal_positional_embedding.py | 8 ++-- fairseq/modules/sparse_multihead_attention.py | 8 ++-- .../sparse_transformer_sentence_encoder.py | 8 ++-- ...arse_transformer_sentence_encoder_layer.py | 8 ++-- .../modules/transformer_sentence_encoder.py | 8 ++-- .../transformer_sentence_encoder_layer.py | 8 ++-- fairseq/modules/unfold.py | 8 ++-- fairseq/optim/__init__.py | 8 ++-- fairseq/optim/adadelta.py | 8 ++-- fairseq/optim/adafactor.py | 8 ++-- fairseq/optim/adagrad.py | 8 ++-- fairseq/optim/adam.py | 8 ++-- fairseq/optim/adamax.py | 8 ++-- 
fairseq/optim/bmuf.py | 8 ++-- fairseq/optim/fairseq_optimizer.py | 8 ++-- fairseq/optim/fp16_optimizer.py | 8 ++-- fairseq/optim/lamb.py | 8 ++-- fairseq/optim/lr_scheduler/__init__.py | 8 ++-- .../optim/lr_scheduler/cosine_lr_scheduler.py | 8 ++-- .../lr_scheduler/fairseq_lr_scheduler.py | 8 ++-- fairseq/optim/lr_scheduler/fixed_schedule.py | 8 ++-- .../inverse_square_root_schedule.py | 8 ++-- .../lr_scheduler/polynomial_decay_schedule.py | 8 ++-- .../lr_scheduler/reduce_lr_on_plateau.py | 8 ++-- .../lr_scheduler/triangular_lr_scheduler.py | 8 ++-- fairseq/optim/nag.py | 8 ++-- fairseq/optim/sgd.py | 8 ++-- fairseq/options.py | 8 ++-- fairseq/pdb.py | 8 ++-- fairseq/progress_bar.py | 8 ++-- fairseq/registry.py | 8 ++-- fairseq/search.py | 8 ++-- fairseq/sequence_generator.py | 8 ++-- fairseq/sequence_scorer.py | 8 ++-- fairseq/tasks/__init__.py | 8 ++-- fairseq/tasks/audio_pretraining.py | 8 ++-- fairseq/tasks/cross_lingual_lm.py | 8 ++-- fairseq/tasks/fairseq_task.py | 8 ++-- fairseq/tasks/language_modeling.py | 8 ++-- fairseq/tasks/legacy_masked_lm.py | 8 ++-- fairseq/tasks/masked_lm.py | 8 ++-- fairseq/tasks/multilingual_translation.py | 8 ++-- fairseq/tasks/semisupervised_translation.py | 8 ++-- fairseq/tasks/sentence_prediction.py | 8 ++-- fairseq/tasks/translation.py | 8 ++-- .../tasks/translation_from_pretrained_xlm.py | 8 ++-- fairseq/tasks/translation_moe.py | 7 ++- fairseq/tokenizer.py | 8 ++-- fairseq/trainer.py | 8 ++-- fairseq/utils.py | 8 ++-- generate.py | 8 ++-- hubconf.py | 8 ++-- interactive.py | 8 ++-- preprocess.py | 8 ++-- score.py | 8 ++-- scripts/average_checkpoints.py | 8 ++-- scripts/build_sym_alignment.py | 8 ++-- scripts/convert_dictionary.lua | 8 ++-- scripts/convert_model.lua | 8 ++-- scripts/count_docs.py | 8 ++-- scripts/read_binarized.py | 8 ++-- scripts/rm_pt.py | 8 ++-- scripts/shard_docs.py | 8 ++-- scripts/split_train_valid_docs.py | 8 ++-- scripts/wav2vec_manifest.py | 8 ++-- setup.py | 10 ++--- tests/test_average_checkpoints.py | 8 ++-- tests/test_backtranslation_dataset.py | 8 ++-- tests/test_binaries.py | 8 ++-- tests/test_character_token_embedder.py | 8 ++-- tests/test_concat_dataset.py | 8 ++-- tests/test_convtbc.py | 8 ++-- tests/test_dictionary.py | 8 ++-- tests/test_iterators.py | 8 ++-- tests/test_label_smoothing.py | 8 ++-- tests/test_memory_efficient_fp16.py | 8 ++-- tests/test_multi_corpus_sampled_dataset.py | 8 ++-- tests/test_noising.py | 8 ++-- tests/test_reproducibility.py | 8 ++-- tests/test_sequence_generator.py | 8 ++-- tests/test_sequence_scorer.py | 8 ++-- tests/test_sparse_multihead_attention.py | 8 ++-- tests/test_token_block_dataset.py | 8 ++-- tests/test_train.py | 8 ++-- tests/test_utils.py | 8 ++-- tests/utils.py | 8 ++-- train.py | 8 ++-- 199 files changed, 609 insertions(+), 1042 deletions(-) delete mode 100644 PATENTS diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5592b2bdef..4d7ca6a98e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,4 @@ -# Contributing to FAIR Sequence-to-Sequence Toolkit (PyTorch) +# Contributing to Facebook AI Research Sequence-to-Sequence Toolkit (fairseq) We want to make contributing to this project as easy and transparent as possible. @@ -22,9 +22,7 @@ Complete your CLA here: We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue. -## Coding Style -We try to follow the PEP style guidelines and encourage you to as well. 
- ## License -By contributing to FAIR Sequence-to-Sequence Toolkit, you agree that your contributions will be licensed -under the LICENSE file in the root directory of this source tree. \ No newline at end of file +By contributing to Facebook AI Research Sequence-to-Sequence Toolkit (fairseq), +you agree that your contributions will be licensed under the LICENSE file in +the root directory of this source tree. diff --git a/LICENSE b/LICENSE index 8144372d12..b96dcb0480 100644 --- a/LICENSE +++ b/LICENSE @@ -1,30 +1,21 @@ -BSD License +MIT License -For fairseq software +Copyright (c) Facebook, Inc. and its affiliates. -Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the name Facebook nor the names of its contributors may be used to - endorse or promote products derived from this software without specific - prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/PATENTS b/PATENTS deleted file mode 100644 index 18b09892ca..0000000000 --- a/PATENTS +++ /dev/null @@ -1,33 +0,0 @@ -Additional Grant of Patent Rights Version 2 - -"Software" means the fairseq software distributed by Facebook, Inc. - -Facebook, Inc. 
("Facebook") hereby grants to each recipient of the Software -("you") a perpetual, worldwide, royalty-free, non-exclusive, irrevocable -(subject to the termination provision below) license under any Necessary -Claims, to make, have made, use, sell, offer to sell, import, and otherwise -transfer the Software. For avoidance of doubt, no license is granted under -Facebook’s rights in any patent claims that are infringed by (i) modifications -to the Software made by you or any third party or (ii) the Software in -combination with any software or other technology. - -The license granted hereunder will terminate, automatically and without notice, -if you (or any of your subsidiaries, corporate affiliates or agents) initiate -directly or indirectly, or take a direct financial interest in, any Patent -Assertion: (i) against Facebook or any of its subsidiaries or corporate -affiliates, (ii) against any party if such Patent Assertion arises in whole or -in part from any software, technology, product or service of Facebook or any of -its subsidiaries or corporate affiliates, or (iii) against any party relating -to the Software. Notwithstanding the foregoing, if Facebook or any of its -subsidiaries or corporate affiliates files a lawsuit alleging patent -infringement against you in the first instance, and you respond by filing a -patent infringement counterclaim in that lawsuit against that party that is -unrelated to the Software, the license granted hereunder will not terminate -under section (i) of this paragraph due to such counterclaim. - -A "Necessary Claim" is a claim of a patent owned by Facebook that is -necessarily infringed by the Software standing alone. - -A "Patent Assertion" is any lawsuit or other action alleging direct, indirect, -or contributory infringement or inducement to infringe any patent, including a -cross-claim or counterclaim. diff --git a/README.md b/README.md index 0b5f9ea947..a748599895 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ modeling and other text generation tasks. ### What's New: +- July 2019: fairseq relicensed under MIT license - July 2019: [RoBERTa models and code release](examples/roberta/README.md) - June 2019: [wav2vec models and code release](examples/wav2vec/README.md) - April 2019: [fairseq demo paper @ NAACL 2019](https://arxiv.org/abs/1904.01038) @@ -109,9 +110,8 @@ We also have more detailed READMEs to reproduce results from specific papers: * Google group: https://groups.google.com/forum/#!forum/fairseq-users # License -fairseq(-py) is BSD-licensed. +fairseq(-py) is MIT-licensed. The license applies to the pre-trained models as well. -We also provide an additional patent grant. # Citation diff --git a/eval_lm.py b/eval_lm.py index 34c9a5485d..e2da64fc1d 100644 --- a/eval_lm.py +++ b/eval_lm.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 -u -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. """ Evaluate the perplexity of a trained language model. 
diff --git a/examples/roberta/preprocess_GLUE_tasks.sh b/examples/roberta/preprocess_GLUE_tasks.sh index 33fcd8f4f5..56addbc292 100755 --- a/examples/roberta/preprocess_GLUE_tasks.sh +++ b/examples/roberta/preprocess_GLUE_tasks.sh @@ -1,10 +1,8 @@ #!/bin/bash -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. # raw glue data as downloaded by glue download script (https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) diff --git a/examples/translation_moe/score.py b/examples/translation_moe/score.py index f59fac0846..8e207093db 100644 --- a/examples/translation_moe/score.py +++ b/examples/translation_moe/score.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. """ Scoring script for computing pairwise BLEU and multi-ref BLEU over a set of candidate hypotheses. diff --git a/fairseq/__init__.py b/fairseq/__init__.py index 90ddc77812..1699f2bfca 100644 --- a/fairseq/__init__.py +++ b/fairseq/__init__.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. __all__ = ['pdb'] __version__ = '0.7.2' diff --git a/fairseq/binarizer.py b/fairseq/binarizer.py index 5130c4e12a..44dcb256c4 100644 --- a/fairseq/binarizer.py +++ b/fairseq/binarizer.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from collections import Counter import os diff --git a/fairseq/bleu.py b/fairseq/bleu.py index d46abd340f..36b15286fc 100644 --- a/fairseq/bleu.py +++ b/fairseq/bleu.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. 
+# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import ctypes import math diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py index 3e2fcbda7c..859f65351a 100644 --- a/fairseq/checkpoint_utils.py +++ b/fairseq/checkpoint_utils.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import argparse from collections import OrderedDict diff --git a/fairseq/criterions/__init__.py b/fairseq/criterions/__init__.py index 343ba0dc2d..618723aeff 100644 --- a/fairseq/criterions/__init__.py +++ b/fairseq/criterions/__init__.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import importlib import os diff --git a/fairseq/criterions/adaptive_loss.py b/fairseq/criterions/adaptive_loss.py index 8982bd2b7d..6f282001c0 100644 --- a/fairseq/criterions/adaptive_loss.py +++ b/fairseq/criterions/adaptive_loss.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import math diff --git a/fairseq/criterions/binary_cross_entropy.py b/fairseq/criterions/binary_cross_entropy.py index 06f269692c..d1f758f511 100644 --- a/fairseq/criterions/binary_cross_entropy.py +++ b/fairseq/criterions/binary_cross_entropy.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import math import torch diff --git a/fairseq/criterions/composite_loss.py b/fairseq/criterions/composite_loss.py index 8b36434684..a638d268c7 100644 --- a/fairseq/criterions/composite_loss.py +++ b/fairseq/criterions/composite_loss.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. 
An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from torch import nn diff --git a/fairseq/criterions/cross_entropy.py b/fairseq/criterions/cross_entropy.py index fda539df6b..d6e8ff545f 100644 --- a/fairseq/criterions/cross_entropy.py +++ b/fairseq/criterions/cross_entropy.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import math import torch.nn.functional as F diff --git a/fairseq/criterions/fairseq_criterion.py b/fairseq/criterions/fairseq_criterion.py index 4c167ac511..2df0819183 100644 --- a/fairseq/criterions/fairseq_criterion.py +++ b/fairseq/criterions/fairseq_criterion.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from torch.nn.modules.loss import _Loss diff --git a/fairseq/criterions/label_smoothed_cross_entropy.py b/fairseq/criterions/label_smoothed_cross_entropy.py index 71448bf10b..6687718725 100644 --- a/fairseq/criterions/label_smoothed_cross_entropy.py +++ b/fairseq/criterions/label_smoothed_cross_entropy.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import math diff --git a/fairseq/criterions/legacy_masked_lm.py b/fairseq/criterions/legacy_masked_lm.py index ac7fb9d445..fe3c7bf2a6 100644 --- a/fairseq/criterions/legacy_masked_lm.py +++ b/fairseq/criterions/legacy_masked_lm.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import math diff --git a/fairseq/criterions/masked_lm.py b/fairseq/criterions/masked_lm.py index b899b87605..842fad0fa5 100644 --- a/fairseq/criterions/masked_lm.py +++ b/fairseq/criterions/masked_lm.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. 
# -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import math diff --git a/fairseq/criterions/sentence_prediction.py b/fairseq/criterions/sentence_prediction.py index 9b4a2d1815..f116288b12 100644 --- a/fairseq/criterions/sentence_prediction.py +++ b/fairseq/criterions/sentence_prediction.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import math diff --git a/fairseq/data/__init__.py b/fairseq/data/__init__.py index 14e9770ae9..d400a9b034 100644 --- a/fairseq/data/__init__.py +++ b/fairseq/data/__init__.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from .dictionary import Dictionary, TruncatedDictionary diff --git a/fairseq/data/audio/raw_audio_dataset.py b/fairseq/data/audio/raw_audio_dataset.py index 7fb25cc6c3..59bee89066 100644 --- a/fairseq/data/audio/raw_audio_dataset.py +++ b/fairseq/data/audio/raw_audio_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import os diff --git a/fairseq/data/backtranslation_dataset.py b/fairseq/data/backtranslation_dataset.py index 9e0689e5e9..0007a01506 100644 --- a/fairseq/data/backtranslation_dataset.py +++ b/fairseq/data/backtranslation_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch diff --git a/fairseq/data/base_wrapper_dataset.py b/fairseq/data/base_wrapper_dataset.py index 88609915c4..17b39133dc 100644 --- a/fairseq/data/base_wrapper_dataset.py +++ b/fairseq/data/base_wrapper_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. 
and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from torch.utils.data.dataloader import default_collate diff --git a/fairseq/data/concat_dataset.py b/fairseq/data/concat_dataset.py index 1a930b9334..659af9ae75 100644 --- a/fairseq/data/concat_dataset.py +++ b/fairseq/data/concat_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import bisect diff --git a/fairseq/data/concat_sentences_dataset.py b/fairseq/data/concat_sentences_dataset.py index 342018f096..8a9336d360 100644 --- a/fairseq/data/concat_sentences_dataset.py +++ b/fairseq/data/concat_sentences_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch diff --git a/fairseq/data/data_utils.py b/fairseq/data/data_utils.py index 71b450aabc..bd2c5d35c9 100644 --- a/fairseq/data/data_utils.py +++ b/fairseq/data/data_utils.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. try: from collections.abc import Iterable diff --git a/fairseq/data/dictionary.py b/fairseq/data/dictionary.py index 4e4cbc0346..417105e50b 100644 --- a/fairseq/data/dictionary.py +++ b/fairseq/data/dictionary.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from collections import Counter from multiprocessing import Pool diff --git a/fairseq/data/encoders/__init__.py b/fairseq/data/encoders/__init__.py index 1e7e69fbea..c0909b6697 100644 --- a/fairseq/data/encoders/__init__.py +++ b/fairseq/data/encoders/__init__.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. 
+# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import importlib diff --git a/fairseq/data/encoders/gpt2_bpe.py b/fairseq/data/encoders/gpt2_bpe.py index 283e6c4501..04a485add2 100644 --- a/fairseq/data/encoders/gpt2_bpe.py +++ b/fairseq/data/encoders/gpt2_bpe.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from fairseq import file_utils from fairseq.data.encoders import register_bpe diff --git a/fairseq/data/encoders/moses_tokenizer.py b/fairseq/data/encoders/moses_tokenizer.py index 4964a822c2..deed30d880 100644 --- a/fairseq/data/encoders/moses_tokenizer.py +++ b/fairseq/data/encoders/moses_tokenizer.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from fairseq.data.encoders import register_tokenizer diff --git a/fairseq/data/encoders/nltk_tokenizer.py b/fairseq/data/encoders/nltk_tokenizer.py index 61325efc42..3db8ee5652 100644 --- a/fairseq/data/encoders/nltk_tokenizer.py +++ b/fairseq/data/encoders/nltk_tokenizer.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from fairseq.data.encoders import register_tokenizer diff --git a/fairseq/data/encoders/sentencepiece_bpe.py b/fairseq/data/encoders/sentencepiece_bpe.py index 9b27460194..ecfe865c56 100644 --- a/fairseq/data/encoders/sentencepiece_bpe.py +++ b/fairseq/data/encoders/sentencepiece_bpe.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
from fairseq import file_utils from fairseq.data.encoders import register_bpe diff --git a/fairseq/data/encoders/space_tokenizer.py b/fairseq/data/encoders/space_tokenizer.py index b804b969d8..670001a8e8 100644 --- a/fairseq/data/encoders/space_tokenizer.py +++ b/fairseq/data/encoders/space_tokenizer.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import re diff --git a/fairseq/data/encoders/subword_nmt_bpe.py b/fairseq/data/encoders/subword_nmt_bpe.py index b2c1fa33b9..78f19b43ea 100644 --- a/fairseq/data/encoders/subword_nmt_bpe.py +++ b/fairseq/data/encoders/subword_nmt_bpe.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from fairseq import file_utils from fairseq.data.encoders import register_bpe diff --git a/fairseq/data/fairseq_dataset.py b/fairseq/data/fairseq_dataset.py index 55ffec30d0..f710b3d9f7 100644 --- a/fairseq/data/fairseq_dataset.py +++ b/fairseq/data/fairseq_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import numpy as np import torch.utils.data diff --git a/fairseq/data/id_dataset.py b/fairseq/data/id_dataset.py index a10423e1af..6a73ba1ff7 100644 --- a/fairseq/data/id_dataset.py +++ b/fairseq/data/id_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch diff --git a/fairseq/data/indexed_dataset.py b/fairseq/data/indexed_dataset.py index 7939a5a62d..12497989bb 100644 --- a/fairseq/data/indexed_dataset.py +++ b/fairseq/data/indexed_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. 
+# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from functools import lru_cache import os diff --git a/fairseq/data/iterators.py b/fairseq/data/iterators.py index 313d546b83..451a8c5e1d 100644 --- a/fairseq/data/iterators.py +++ b/fairseq/data/iterators.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import itertools import math diff --git a/fairseq/data/language_pair_dataset.py b/fairseq/data/language_pair_dataset.py index 64a5e4c7ee..5fc1371aae 100644 --- a/fairseq/data/language_pair_dataset.py +++ b/fairseq/data/language_pair_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import numpy as np import torch diff --git a/fairseq/data/legacy/__init__.py b/fairseq/data/legacy/__init__.py index df912ec648..1acaafeb09 100644 --- a/fairseq/data/legacy/__init__.py +++ b/fairseq/data/legacy/__init__.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from .masked_lm_dictionary import BertDictionary, MaskedLMDictionary from .block_pair_dataset import BlockPairDataset diff --git a/fairseq/data/legacy/block_pair_dataset.py b/fairseq/data/legacy/block_pair_dataset.py index db13f61c97..b9fc814147 100644 --- a/fairseq/data/legacy/block_pair_dataset.py +++ b/fairseq/data/legacy/block_pair_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import math diff --git a/fairseq/data/legacy/masked_lm_dataset.py b/fairseq/data/legacy/masked_lm_dataset.py index 864c5dcb67..953aa85dd4 100644 --- a/fairseq/data/legacy/masked_lm_dataset.py +++ b/fairseq/data/legacy/masked_lm_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. 
# -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import math diff --git a/fairseq/data/legacy/masked_lm_dictionary.py b/fairseq/data/legacy/masked_lm_dictionary.py index ef31373be3..254b0eb740 100644 --- a/fairseq/data/legacy/masked_lm_dictionary.py +++ b/fairseq/data/legacy/masked_lm_dictionary.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from fairseq.data import Dictionary diff --git a/fairseq/data/lm_context_window_dataset.py b/fairseq/data/lm_context_window_dataset.py index c3ff5a0deb..17ba08bc7f 100644 --- a/fairseq/data/lm_context_window_dataset.py +++ b/fairseq/data/lm_context_window_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import numpy as np import torch diff --git a/fairseq/data/lru_cache_dataset.py b/fairseq/data/lru_cache_dataset.py index cea71731cc..833a2c75cb 100644 --- a/fairseq/data/lru_cache_dataset.py +++ b/fairseq/data/lru_cache_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from functools import lru_cache diff --git a/fairseq/data/mask_tokens_dataset.py b/fairseq/data/mask_tokens_dataset.py index ecbf29d294..b73da6b0ca 100644 --- a/fairseq/data/mask_tokens_dataset.py +++ b/fairseq/data/mask_tokens_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
from functools import lru_cache diff --git a/fairseq/data/monolingual_dataset.py b/fairseq/data/monolingual_dataset.py index 4d41171aed..76c3772374 100644 --- a/fairseq/data/monolingual_dataset.py +++ b/fairseq/data/monolingual_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import numpy as np import torch diff --git a/fairseq/data/multi_corpus_sampled_dataset.py b/fairseq/data/multi_corpus_sampled_dataset.py index ef7e1b794c..4d32d00fda 100644 --- a/fairseq/data/multi_corpus_sampled_dataset.py +++ b/fairseq/data/multi_corpus_sampled_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from collections import OrderedDict from typing import Callable, Dict, List diff --git a/fairseq/data/nested_dictionary_dataset.py b/fairseq/data/nested_dictionary_dataset.py index 385bf2324b..2795f895dd 100644 --- a/fairseq/data/nested_dictionary_dataset.py +++ b/fairseq/data/nested_dictionary_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from collections import OrderedDict diff --git a/fairseq/data/noising.py b/fairseq/data/noising.py index 0fe6597a82..bd67e7336c 100644 --- a/fairseq/data/noising.py +++ b/fairseq/data/noising.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch import numpy as np diff --git a/fairseq/data/num_samples_dataset.py b/fairseq/data/num_samples_dataset.py index 1ad2ce8290..9d7ea44019 100644 --- a/fairseq/data/num_samples_dataset.py +++ b/fairseq/data/num_samples_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. 
+# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from . import FairseqDataset diff --git a/fairseq/data/numel_dataset.py b/fairseq/data/numel_dataset.py index efcd8f152c..50087e5857 100644 --- a/fairseq/data/numel_dataset.py +++ b/fairseq/data/numel_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import numpy as np import torch diff --git a/fairseq/data/offset_tokens_dataset.py b/fairseq/data/offset_tokens_dataset.py index 7a947f66ed..a6fd559a30 100644 --- a/fairseq/data/offset_tokens_dataset.py +++ b/fairseq/data/offset_tokens_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from . import BaseWrapperDataset diff --git a/fairseq/data/pad_dataset.py b/fairseq/data/pad_dataset.py index 28c372c134..4c13b549aa 100644 --- a/fairseq/data/pad_dataset.py +++ b/fairseq/data/pad_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from fairseq.data import data_utils diff --git a/fairseq/data/prepend_token_dataset.py b/fairseq/data/prepend_token_dataset.py index 3daf50f389..9dac71badf 100644 --- a/fairseq/data/prepend_token_dataset.py +++ b/fairseq/data/prepend_token_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import numpy as np import torch diff --git a/fairseq/data/raw_label_dataset.py b/fairseq/data/raw_label_dataset.py index 5f7cc0e43c..e67170f1a5 100644 --- a/fairseq/data/raw_label_dataset.py +++ b/fairseq/data/raw_label_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. 
An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch diff --git a/fairseq/data/round_robin_zip_datasets.py b/fairseq/data/round_robin_zip_datasets.py index d8ac04651b..5bfc966ce8 100644 --- a/fairseq/data/round_robin_zip_datasets.py +++ b/fairseq/data/round_robin_zip_datasets.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from collections import OrderedDict diff --git a/fairseq/data/sort_dataset.py b/fairseq/data/sort_dataset.py index 3755cd326c..9b510b93a0 100644 --- a/fairseq/data/sort_dataset.py +++ b/fairseq/data/sort_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import numpy as np diff --git a/fairseq/data/strip_token_dataset.py b/fairseq/data/strip_token_dataset.py index eeb48ae600..69e7ecf9cb 100644 --- a/fairseq/data/strip_token_dataset.py +++ b/fairseq/data/strip_token_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from . import BaseWrapperDataset diff --git a/fairseq/data/token_block_dataset.py b/fairseq/data/token_block_dataset.py index 0e054c5e0d..4633167318 100644 --- a/fairseq/data/token_block_dataset.py +++ b/fairseq/data/token_block_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import math diff --git a/fairseq/data/transform_eos_dataset.py b/fairseq/data/transform_eos_dataset.py index 503e4516e2..84350c21cd 100644 --- a/fairseq/data/transform_eos_dataset.py +++ b/fairseq/data/transform_eos_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. 
# -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch diff --git a/fairseq/data/transform_eos_lang_pair_dataset.py b/fairseq/data/transform_eos_lang_pair_dataset.py index 8da2597999..9e19da328b 100644 --- a/fairseq/data/transform_eos_lang_pair_dataset.py +++ b/fairseq/data/transform_eos_lang_pair_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from . import FairseqDataset diff --git a/fairseq/data/truncate_dataset.py b/fairseq/data/truncate_dataset.py index 0e350e407f..36d3745658 100644 --- a/fairseq/data/truncate_dataset.py +++ b/fairseq/data/truncate_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import numpy as np diff --git a/fairseq/distributed_utils.py b/fairseq/distributed_utils.py index b6aa0d0492..e854b85195 100644 --- a/fairseq/distributed_utils.py +++ b/fairseq/distributed_utils.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from collections import namedtuple import os diff --git a/fairseq/file_utils.py b/fairseq/file_utils.py index cc9e7b02c1..d3f14dde54 100644 --- a/fairseq/file_utils.py +++ b/fairseq/file_utils.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. """ Utilities for working with the local dataset cache. diff --git a/fairseq/hub_utils.py b/fairseq/hub_utils.py index 60f5143878..bdd025bfca 100644 --- a/fairseq/hub_utils.py +++ b/fairseq/hub_utils.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 -u -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. 
# -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import os diff --git a/fairseq/legacy_distributed_data_parallel.py b/fairseq/legacy_distributed_data_parallel.py index 76824739f8..2e9c82539b 100644 --- a/fairseq/legacy_distributed_data_parallel.py +++ b/fairseq/legacy_distributed_data_parallel.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. """ A modified version of the legacy DistributedDataParallel module that uses c10d diff --git a/fairseq/meters.py b/fairseq/meters.py index 30973b3f3d..bfa9a24fb4 100644 --- a/fairseq/meters.py +++ b/fairseq/meters.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import time diff --git a/fairseq/models/__init__.py b/fairseq/models/__init__.py index 0e3d146a6e..9cc884cc3a 100644 --- a/fairseq/models/__init__.py +++ b/fairseq/models/__init__.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. MODEL_REGISTRY = {} ARCH_MODEL_REGISTRY = {} diff --git a/fairseq/models/composite_encoder.py b/fairseq/models/composite_encoder.py index d6859c7cb0..bae48e344c 100644 --- a/fairseq/models/composite_encoder.py +++ b/fairseq/models/composite_encoder.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from fairseq.models import FairseqEncoder diff --git a/fairseq/models/distributed_fairseq_model.py b/fairseq/models/distributed_fairseq_model.py index 5f5a474807..e858717d97 100644 --- a/fairseq/models/distributed_fairseq_model.py +++ b/fairseq/models/distributed_fairseq_model.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. 
+# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import inspect diff --git a/fairseq/models/fairseq_decoder.py b/fairseq/models/fairseq_decoder.py index 2e5398e364..f301b96bbd 100644 --- a/fairseq/models/fairseq_decoder.py +++ b/fairseq/models/fairseq_decoder.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch.nn as nn diff --git a/fairseq/models/fairseq_encoder.py b/fairseq/models/fairseq_encoder.py index 52fd4ba4ca..934d6dc018 100644 --- a/fairseq/models/fairseq_encoder.py +++ b/fairseq/models/fairseq_encoder.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch.nn as nn diff --git a/fairseq/models/fairseq_incremental_decoder.py b/fairseq/models/fairseq_incremental_decoder.py index 1c41215571..9eccaf3d50 100644 --- a/fairseq/models/fairseq_incremental_decoder.py +++ b/fairseq/models/fairseq_incremental_decoder.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from fairseq.models import FairseqDecoder diff --git a/fairseq/models/fairseq_model.py b/fairseq/models/fairseq_model.py index f8bd5ba609..0cf7cf2684 100644 --- a/fairseq/models/fairseq_model.py +++ b/fairseq/models/fairseq_model.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. """ Base classes for various fairseq models. """ diff --git a/fairseq/models/fconv.py b/fairseq/models/fconv.py index 8ca216644a..c0295a9172 100644 --- a/fairseq/models/fconv.py +++ b/fairseq/models/fconv.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. 
-# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import math import torch diff --git a/fairseq/models/fconv_lm.py b/fairseq/models/fconv_lm.py index ef53bf8bc6..f2320b1700 100644 --- a/fairseq/models/fconv_lm.py +++ b/fairseq/models/fconv_lm.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from fairseq import options from fairseq.models import ( diff --git a/fairseq/models/fconv_self_att.py b/fairseq/models/fconv_self_att.py index 09b083303c..1740082572 100644 --- a/fairseq/models/fconv_self_att.py +++ b/fairseq/models/fconv_self_att.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import math diff --git a/fairseq/models/lightconv.py b/fairseq/models/lightconv.py index 0dc71a1f70..20ff4f0e6a 100644 --- a/fairseq/models/lightconv.py +++ b/fairseq/models/lightconv.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import math diff --git a/fairseq/models/lightconv_lm.py b/fairseq/models/lightconv_lm.py index 8017304983..a0aa13fbd1 100644 --- a/fairseq/models/lightconv_lm.py +++ b/fairseq/models/lightconv_lm.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from fairseq import options from fairseq.models import ( diff --git a/fairseq/models/lstm.py b/fairseq/models/lstm.py index 40d9ddf1ed..6b51350f96 100644 --- a/fairseq/models/lstm.py +++ b/fairseq/models/lstm.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. 
# -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch import torch.nn as nn diff --git a/fairseq/models/masked_lm.py b/fairseq/models/masked_lm.py index 155173fcda..1ff9f90a3d 100644 --- a/fairseq/models/masked_lm.py +++ b/fairseq/models/masked_lm.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch import torch.nn as nn diff --git a/fairseq/models/multilingual_transformer.py b/fairseq/models/multilingual_transformer.py index 4db2c59eb3..9d17cd6470 100644 --- a/fairseq/models/multilingual_transformer.py +++ b/fairseq/models/multilingual_transformer.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from collections import OrderedDict diff --git a/fairseq/models/roberta/__init__.py b/fairseq/models/roberta/__init__.py index bf4bf8fad9..a701923f7e 100644 --- a/fairseq/models/roberta/__init__.py +++ b/fairseq/models/roberta/__init__.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from .hub_interface import * # noqa from .model import * # noqa diff --git a/fairseq/models/roberta/hub_interface.py b/fairseq/models/roberta/hub_interface.py index a2654febb7..4fe2627bec 100644 --- a/fairseq/models/roberta/hub_interface.py +++ b/fairseq/models/roberta/hub_interface.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
import torch import torch.nn as nn diff --git a/fairseq/models/roberta/model.py b/fairseq/models/roberta/model.py index c8794b2607..93555c7d66 100644 --- a/fairseq/models/roberta/model.py +++ b/fairseq/models/roberta/model.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. """ RoBERTa: A Robustly Optimized BERT Pretraining Approach. """ diff --git a/fairseq/models/transformer.py b/fairseq/models/transformer.py index 591a486066..53e8f26555 100644 --- a/fairseq/models/transformer.py +++ b/fairseq/models/transformer.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import math diff --git a/fairseq/models/transformer_from_pretrained_xlm.py b/fairseq/models/transformer_from_pretrained_xlm.py index 06c4a2ca92..bd03c8450f 100644 --- a/fairseq/models/transformer_from_pretrained_xlm.py +++ b/fairseq/models/transformer_from_pretrained_xlm.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import os from typing import Any, Dict diff --git a/fairseq/models/transformer_lm.py b/fairseq/models/transformer_lm.py index 4a26a75e95..4d5e68e947 100644 --- a/fairseq/models/transformer_lm.py +++ b/fairseq/models/transformer_lm.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from fairseq import options, utils from fairseq.models import ( diff --git a/fairseq/models/wav2vec.py b/fairseq/models/wav2vec.py index e89ede158b..050d4216ae 100644 --- a/fairseq/models/wav2vec.py +++ b/fairseq/models/wav2vec.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. 
+# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import sys diff --git a/fairseq/modules/__init__.py b/fairseq/modules/__init__.py index 28abe64dca..8cffb0d792 100644 --- a/fairseq/modules/__init__.py +++ b/fairseq/modules/__init__.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from .adaptive_input import AdaptiveInput from .adaptive_softmax import AdaptiveSoftmax diff --git a/fairseq/modules/adaptive_input.py b/fairseq/modules/adaptive_input.py index 3ad8603d21..1234b77237 100644 --- a/fairseq/modules/adaptive_input.py +++ b/fairseq/modules/adaptive_input.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch diff --git a/fairseq/modules/adaptive_softmax.py b/fairseq/modules/adaptive_softmax.py index 90987950a0..2ab7282813 100644 --- a/fairseq/modules/adaptive_softmax.py +++ b/fairseq/modules/adaptive_softmax.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import operator import functools diff --git a/fairseq/modules/beamable_mm.py b/fairseq/modules/beamable_mm.py index b0ece04c3e..df77105a94 100644 --- a/fairseq/modules/beamable_mm.py +++ b/fairseq/modules/beamable_mm.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch import torch.nn as nn diff --git a/fairseq/modules/character_token_embedder.py b/fairseq/modules/character_token_embedder.py index 8a9e022812..47aeed1f15 100644 --- a/fairseq/modules/character_token_embedder.py +++ b/fairseq/modules/character_token_embedder.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. 
An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch import torch.nn.functional as F diff --git a/fairseq/modules/conv_tbc.py b/fairseq/modules/conv_tbc.py index 1a033f294f..1aa3eff9dc 100644 --- a/fairseq/modules/conv_tbc.py +++ b/fairseq/modules/conv_tbc.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch from torch.nn.modules.utils import _single diff --git a/fairseq/modules/downsampled_multihead_attention.py b/fairseq/modules/downsampled_multihead_attention.py index d9de0730f3..5c401e4f8e 100644 --- a/fairseq/modules/downsampled_multihead_attention.py +++ b/fairseq/modules/downsampled_multihead_attention.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. # import math diff --git a/fairseq/modules/dynamic_convolution.py b/fairseq/modules/dynamic_convolution.py index 990ff80cf2..a8fa47225d 100644 --- a/fairseq/modules/dynamic_convolution.py +++ b/fairseq/modules/dynamic_convolution.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch import torch.nn as nn diff --git a/fairseq/modules/gelu.py b/fairseq/modules/gelu.py index 998610943b..09fefd850b 100644 --- a/fairseq/modules/gelu.py +++ b/fairseq/modules/gelu.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. """ See "Gaussian Error Linear Units (GELUs)" by Dan Hendrycks and Kevin Gimpel with the corresponding GitHub repo: https://github.com/hendrycks/GELUs diff --git a/fairseq/modules/grad_multiply.py b/fairseq/modules/grad_multiply.py index dc52498132..08d15f55df 100644 --- a/fairseq/modules/grad_multiply.py +++ b/fairseq/modules/grad_multiply.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. 
-# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch diff --git a/fairseq/modules/highway.py b/fairseq/modules/highway.py index 2fd3f6ace9..85212ff44f 100644 --- a/fairseq/modules/highway.py +++ b/fairseq/modules/highway.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch diff --git a/fairseq/modules/layer_norm.py b/fairseq/modules/layer_norm.py index 9a9a4fd3c7..c4872da92f 100644 --- a/fairseq/modules/layer_norm.py +++ b/fairseq/modules/layer_norm.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch diff --git a/fairseq/modules/learned_positional_embedding.py b/fairseq/modules/learned_positional_embedding.py index 5636278075..e52b8d4715 100644 --- a/fairseq/modules/learned_positional_embedding.py +++ b/fairseq/modules/learned_positional_embedding.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch.nn as nn diff --git a/fairseq/modules/lightweight_convolution.py b/fairseq/modules/lightweight_convolution.py index 810788f206..6191d49501 100644 --- a/fairseq/modules/lightweight_convolution.py +++ b/fairseq/modules/lightweight_convolution.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
import torch import torch.nn as nn diff --git a/fairseq/modules/linearized_convolution.py b/fairseq/modules/linearized_convolution.py index 762332296c..83cff25879 100644 --- a/fairseq/modules/linearized_convolution.py +++ b/fairseq/modules/linearized_convolution.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch import torch.nn.functional as F diff --git a/fairseq/modules/logsumexp_moe.py b/fairseq/modules/logsumexp_moe.py index e1533fe45f..0379f226b0 100644 --- a/fairseq/modules/logsumexp_moe.py +++ b/fairseq/modules/logsumexp_moe.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch diff --git a/fairseq/modules/mean_pool_gating_network.py b/fairseq/modules/mean_pool_gating_network.py index 4a2eb75bcd..7acc664488 100644 --- a/fairseq/modules/mean_pool_gating_network.py +++ b/fairseq/modules/mean_pool_gating_network.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch import torch.nn.functional as F diff --git a/fairseq/modules/multihead_attention.py b/fairseq/modules/multihead_attention.py index 490b93f576..4da628655e 100644 --- a/fairseq/modules/multihead_attention.py +++ b/fairseq/modules/multihead_attention.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch from torch import nn diff --git a/fairseq/modules/positional_embedding.py b/fairseq/modules/positional_embedding.py index 1e8b7373f0..2ba50113d7 100644 --- a/fairseq/modules/positional_embedding.py +++ b/fairseq/modules/positional_embedding.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. 
An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch.nn as nn diff --git a/fairseq/modules/scalar_bias.py b/fairseq/modules/scalar_bias.py index 969f3ac327..c96247c759 100644 --- a/fairseq/modules/scalar_bias.py +++ b/fairseq/modules/scalar_bias.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. # import torch diff --git a/fairseq/modules/sinusoidal_positional_embedding.py b/fairseq/modules/sinusoidal_positional_embedding.py index d43a0c9c9c..93429e1c0b 100644 --- a/fairseq/modules/sinusoidal_positional_embedding.py +++ b/fairseq/modules/sinusoidal_positional_embedding.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import math diff --git a/fairseq/modules/sparse_multihead_attention.py b/fairseq/modules/sparse_multihead_attention.py index a4e8848c43..7e83cc9529 100644 --- a/fairseq/modules/sparse_multihead_attention.py +++ b/fairseq/modules/sparse_multihead_attention.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import math import torch diff --git a/fairseq/modules/sparse_transformer_sentence_encoder.py b/fairseq/modules/sparse_transformer_sentence_encoder.py index 9df5db5484..771a41e687 100644 --- a/fairseq/modules/sparse_transformer_sentence_encoder.py +++ b/fairseq/modules/sparse_transformer_sentence_encoder.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
import torch.nn as nn from fairseq.modules import TransformerSentenceEncoder diff --git a/fairseq/modules/sparse_transformer_sentence_encoder_layer.py b/fairseq/modules/sparse_transformer_sentence_encoder_layer.py index 9a8f3296c2..f14540074a 100644 --- a/fairseq/modules/sparse_transformer_sentence_encoder_layer.py +++ b/fairseq/modules/sparse_transformer_sentence_encoder_layer.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from fairseq.modules import TransformerSentenceEncoderLayer from fairseq.modules.sparse_multihead_attention import SparseMultiheadAttention diff --git a/fairseq/modules/transformer_sentence_encoder.py b/fairseq/modules/transformer_sentence_encoder.py index 08c2de91a2..1699291253 100644 --- a/fairseq/modules/transformer_sentence_encoder.py +++ b/fairseq/modules/transformer_sentence_encoder.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from typing import Optional, Tuple diff --git a/fairseq/modules/transformer_sentence_encoder_layer.py b/fairseq/modules/transformer_sentence_encoder_layer.py index 4be97d973a..8982ff0ea9 100644 --- a/fairseq/modules/transformer_sentence_encoder_layer.py +++ b/fairseq/modules/transformer_sentence_encoder_layer.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch import torch.nn as nn diff --git a/fairseq/modules/unfold.py b/fairseq/modules/unfold.py index fde3b4f032..3a142db698 100644 --- a/fairseq/modules/unfold.py +++ b/fairseq/modules/unfold.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch.nn.functional as F diff --git a/fairseq/optim/__init__.py b/fairseq/optim/__init__.py index 6d5a791232..d8306c4ef6 100644 --- a/fairseq/optim/__init__.py +++ b/fairseq/optim/__init__.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. 
and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import importlib import os diff --git a/fairseq/optim/adadelta.py b/fairseq/optim/adadelta.py index c175d51d30..27079a402b 100644 --- a/fairseq/optim/adadelta.py +++ b/fairseq/optim/adadelta.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch.optim diff --git a/fairseq/optim/adafactor.py b/fairseq/optim/adafactor.py index 55d61d4923..1c026244b5 100644 --- a/fairseq/optim/adafactor.py +++ b/fairseq/optim/adafactor.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import math import torch diff --git a/fairseq/optim/adagrad.py b/fairseq/optim/adagrad.py index deafd3df92..15b3a1c25a 100644 --- a/fairseq/optim/adagrad.py +++ b/fairseq/optim/adagrad.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch.optim diff --git a/fairseq/optim/adam.py b/fairseq/optim/adam.py index 5f97173033..e60f2fdf6a 100644 --- a/fairseq/optim/adam.py +++ b/fairseq/optim/adam.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import math import types diff --git a/fairseq/optim/adamax.py b/fairseq/optim/adamax.py index fd6874f1dd..2a2e7698ad 100644 --- a/fairseq/optim/adamax.py +++ b/fairseq/optim/adamax.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. 
An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch import torch.optim diff --git a/fairseq/optim/bmuf.py b/fairseq/optim/bmuf.py index c1ceccf827..deec08ea74 100644 --- a/fairseq/optim/bmuf.py +++ b/fairseq/optim/bmuf.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import sys diff --git a/fairseq/optim/fairseq_optimizer.py b/fairseq/optim/fairseq_optimizer.py index 6a85c33004..58bc7fc2d7 100644 --- a/fairseq/optim/fairseq_optimizer.py +++ b/fairseq/optim/fairseq_optimizer.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import math diff --git a/fairseq/optim/fp16_optimizer.py b/fairseq/optim/fp16_optimizer.py index 7e4f00434b..b3ae1ef49c 100644 --- a/fairseq/optim/fp16_optimizer.py +++ b/fairseq/optim/fp16_optimizer.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from itertools import chain diff --git a/fairseq/optim/lamb.py b/fairseq/optim/lamb.py index b7c7f78b82..e49a96f101 100644 --- a/fairseq/optim/lamb.py +++ b/fairseq/optim/lamb.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. """ LAMB optimizer from github.com/cybertronai/pytorch-lamb. """ diff --git a/fairseq/optim/lr_scheduler/__init__.py b/fairseq/optim/lr_scheduler/__init__.py index d02feb5534..edd0a6a13e 100644 --- a/fairseq/optim/lr_scheduler/__init__.py +++ b/fairseq/optim/lr_scheduler/__init__.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. 
An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import importlib import os diff --git a/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py b/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py index cc34a3f719..206b79a009 100644 --- a/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py +++ b/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import math diff --git a/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py b/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py index b7db369cc5..8b7884829a 100644 --- a/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py +++ b/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from .. import FairseqOptimizer diff --git a/fairseq/optim/lr_scheduler/fixed_schedule.py b/fairseq/optim/lr_scheduler/fixed_schedule.py index dc65d942ce..1b9ca3a639 100644 --- a/fairseq/optim/lr_scheduler/fixed_schedule.py +++ b/fairseq/optim/lr_scheduler/fixed_schedule.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from . import FairseqLRScheduler, register_lr_scheduler diff --git a/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py b/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py index fb33ea0cdb..f98a7c3b99 100644 --- a/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py +++ b/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from . 
import FairseqLRScheduler, register_lr_scheduler diff --git a/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py b/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py index 1cb223b771..aff57f9b93 100644 --- a/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py +++ b/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from . import FairseqLRScheduler, register_lr_scheduler diff --git a/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py b/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py index 365a5e8971..715c714b65 100644 --- a/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py +++ b/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch.optim.lr_scheduler diff --git a/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py b/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py index a9ef35a24a..fed0cf7ef1 100644 --- a/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py +++ b/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import math diff --git a/fairseq/optim/nag.py b/fairseq/optim/nag.py index 8bdc1de606..c916b6fadb 100644 --- a/fairseq/optim/nag.py +++ b/fairseq/optim/nag.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch from torch.optim.optimizer import Optimizer, required diff --git a/fairseq/optim/sgd.py b/fairseq/optim/sgd.py index ac54352075..c34b9590dd 100644 --- a/fairseq/optim/sgd.py +++ b/fairseq/optim/sgd.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. 
An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch.optim diff --git a/fairseq/options.py b/fairseq/options.py index a991cbef1f..e29187ee5d 100644 --- a/fairseq/options.py +++ b/fairseq/options.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import argparse diff --git a/fairseq/pdb.py b/fairseq/pdb.py index 2ba9982a80..f1ce3c46bc 100644 --- a/fairseq/pdb.py +++ b/fairseq/pdb.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import multiprocessing import os diff --git a/fairseq/progress_bar.py b/fairseq/progress_bar.py index 870ac1dafe..a9bb5f97b4 100644 --- a/fairseq/progress_bar.py +++ b/fairseq/progress_bar.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. """ Wrapper around various loggers and progress bars (e.g., tqdm). diff --git a/fairseq/registry.py b/fairseq/registry.py index 25168e588d..cb0c984ade 100644 --- a/fairseq/registry.py +++ b/fairseq/registry.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. REGISTRIES = {} diff --git a/fairseq/search.py b/fairseq/search.py index 742453468a..02dcf628ca 100644 --- a/fairseq/search.py +++ b/fairseq/search.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
import math diff --git a/fairseq/sequence_generator.py b/fairseq/sequence_generator.py index 93e4fe1b53..87016e9281 100644 --- a/fairseq/sequence_generator.py +++ b/fairseq/sequence_generator.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import math diff --git a/fairseq/sequence_scorer.py b/fairseq/sequence_scorer.py index 0ee9582a7f..d125422340 100644 --- a/fairseq/sequence_scorer.py +++ b/fairseq/sequence_scorer.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch import sys diff --git a/fairseq/tasks/__init__.py b/fairseq/tasks/__init__.py index 5690fbee95..0c147568e6 100644 --- a/fairseq/tasks/__init__.py +++ b/fairseq/tasks/__init__.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import argparse import importlib diff --git a/fairseq/tasks/audio_pretraining.py b/fairseq/tasks/audio_pretraining.py index ac97c38ada..e4bf0d79f6 100644 --- a/fairseq/tasks/audio_pretraining.py +++ b/fairseq/tasks/audio_pretraining.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import os diff --git a/fairseq/tasks/cross_lingual_lm.py b/fairseq/tasks/cross_lingual_lm.py index 6ad58e79fa..c173f0ad16 100644 --- a/fairseq/tasks/cross_lingual_lm.py +++ b/fairseq/tasks/cross_lingual_lm.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
import itertools import os diff --git a/fairseq/tasks/fairseq_task.py b/fairseq/tasks/fairseq_task.py index 2993865dab..89a1c866b8 100644 --- a/fairseq/tasks/fairseq_task.py +++ b/fairseq/tasks/fairseq_task.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch diff --git a/fairseq/tasks/language_modeling.py b/fairseq/tasks/language_modeling.py index a5b3470982..066dd89544 100644 --- a/fairseq/tasks/language_modeling.py +++ b/fairseq/tasks/language_modeling.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import itertools import os diff --git a/fairseq/tasks/legacy_masked_lm.py b/fairseq/tasks/legacy_masked_lm.py index b4f2c93ac2..12453d3f32 100644 --- a/fairseq/tasks/legacy_masked_lm.py +++ b/fairseq/tasks/legacy_masked_lm.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import itertools import numpy as np diff --git a/fairseq/tasks/masked_lm.py b/fairseq/tasks/masked_lm.py index 2b89b1b8e0..f1686258fb 100644 --- a/fairseq/tasks/masked_lm.py +++ b/fairseq/tasks/masked_lm.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import itertools import os diff --git a/fairseq/tasks/multilingual_translation.py b/fairseq/tasks/multilingual_translation.py index e3225eb5e2..87deadf4f0 100644 --- a/fairseq/tasks/multilingual_translation.py +++ b/fairseq/tasks/multilingual_translation.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
from collections import OrderedDict import copy diff --git a/fairseq/tasks/semisupervised_translation.py b/fairseq/tasks/semisupervised_translation.py index 92053fa9aa..612ea48c46 100644 --- a/fairseq/tasks/semisupervised_translation.py +++ b/fairseq/tasks/semisupervised_translation.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from collections import OrderedDict import os diff --git a/fairseq/tasks/sentence_prediction.py b/fairseq/tasks/sentence_prediction.py index 0f54ef81f1..483e2469d4 100644 --- a/fairseq/tasks/sentence_prediction.py +++ b/fairseq/tasks/sentence_prediction.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import os diff --git a/fairseq/tasks/translation.py b/fairseq/tasks/translation.py index 80e1c2960a..d3f51cb35c 100644 --- a/fairseq/tasks/translation.py +++ b/fairseq/tasks/translation.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import itertools import os diff --git a/fairseq/tasks/translation_from_pretrained_xlm.py b/fairseq/tasks/translation_from_pretrained_xlm.py index 941634cf86..347a6eccb7 100644 --- a/fairseq/tasks/translation_from_pretrained_xlm.py +++ b/fairseq/tasks/translation_from_pretrained_xlm.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from fairseq.data.legacy.masked_lm_dictionary import MaskedLMDictionary from fairseq.tasks.translation import TranslationTask diff --git a/fairseq/tasks/translation_moe.py b/fairseq/tasks/translation_moe.py index 932b017836..35d44e47cb 100644 --- a/fairseq/tasks/translation_moe.py +++ b/fairseq/tasks/translation_moe.py @@ -1,8 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. # All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. 
An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import contextlib diff --git a/fairseq/tokenizer.py b/fairseq/tokenizer.py index ca368db891..8c4d694aa0 100644 --- a/fairseq/tokenizer.py +++ b/fairseq/tokenizer.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import re diff --git a/fairseq/trainer.py b/fairseq/trainer.py index d2f27e05d1..507687e266 100644 --- a/fairseq/trainer.py +++ b/fairseq/trainer.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. """ Train a network across multiple GPUs. diff --git a/fairseq/utils.py b/fairseq/utils.py index c5062274ac..76473837aa 100644 --- a/fairseq/utils.py +++ b/fairseq/utils.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from collections import defaultdict import copy diff --git a/generate.py b/generate.py index 2498c39e83..c23cc79868 100644 --- a/generate.py +++ b/generate.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 -u -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. """ Translate pre-processed data with a trained model. """ diff --git a/hubconf.py b/hubconf.py index 992c259fa3..90acec6775 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
import functools diff --git a/interactive.py b/interactive.py index 632a16f3ed..d9d547a974 100644 --- a/interactive.py +++ b/interactive.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 -u -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. """ Translate raw text with a trained model. Batches data on-the-fly. """ diff --git a/preprocess.py b/preprocess.py index c4ac37cf43..a157feeb68 100644 --- a/preprocess.py +++ b/preprocess.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. """ Data pre-processing: build vocabularies and binarize training data. """ diff --git a/score.py b/score.py index 4aaa0f6bcc..6d9ac89a12 100644 --- a/score.py +++ b/score.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. """ BLEU scoring of generated translations against reference translations. """ diff --git a/scripts/average_checkpoints.py b/scripts/average_checkpoints.py index 24c7d9890f..e5e9bce156 100644 --- a/scripts/average_checkpoints.py +++ b/scripts/average_checkpoints.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import argparse import collections diff --git a/scripts/build_sym_alignment.py b/scripts/build_sym_alignment.py index bb4809ea58..bb0cac09dd 100644 --- a/scripts/build_sym_alignment.py +++ b/scripts/build_sym_alignment.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
""" Use this script in order to build symmetric alignments for your translation dataset. diff --git a/scripts/convert_dictionary.lua b/scripts/convert_dictionary.lua index f0bdf45369..14ee8c997f 100644 --- a/scripts/convert_dictionary.lua +++ b/scripts/convert_dictionary.lua @@ -1,9 +1,7 @@ --- Copyright (c) 2017-present, Facebook, Inc. --- All rights reserved. +-- Copyright (c) Facebook, Inc. and its affiliates. -- --- This source code is licensed under the license found in the LICENSE file in --- the root directory of this source tree. An additional grant of patent rights --- can be found in the PATENTS file in the same directory. +-- This source code is licensed under the MIT license found in the +-- LICENSE file in the root directory of this source tree. -- -- Usage: convert_dictionary.lua require 'fairseq' diff --git a/scripts/convert_model.lua b/scripts/convert_model.lua index 33a0efb32d..61b9213929 100644 --- a/scripts/convert_model.lua +++ b/scripts/convert_model.lua @@ -1,9 +1,7 @@ --- Copyright (c) 2017-present, Facebook, Inc. --- All rights reserved. +-- Copyright (c) Facebook, Inc. and its affiliates. -- --- This source code is licensed under the license found in the LICENSE file in --- the root directory of this source tree. An additional grant of patent rights --- can be found in the PATENTS file in the same directory. +-- This source code is licensed under the MIT license found in the +-- LICENSE file in the root directory of this source tree. -- -- Usage: convert_model.lua require 'torch' diff --git a/scripts/count_docs.py b/scripts/count_docs.py index eb0de66977..13640f4b6f 100644 --- a/scripts/count_docs.py +++ b/scripts/count_docs.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. """ Count the number of documents and average number of lines and tokens per document in a large file. Documents should be separated by a single empty line. diff --git a/scripts/read_binarized.py b/scripts/read_binarized.py index 4b041cd8e6..f48409beb4 100644 --- a/scripts/read_binarized.py +++ b/scripts/read_binarized.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import argparse diff --git a/scripts/rm_pt.py b/scripts/rm_pt.py index de29edf67b..21976cee4f 100644 --- a/scripts/rm_pt.py +++ b/scripts/rm_pt.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. 
+# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import argparse import os diff --git a/scripts/shard_docs.py b/scripts/shard_docs.py index 3a906cfd45..f1adac72aa 100644 --- a/scripts/shard_docs.py +++ b/scripts/shard_docs.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. """ Split a large file into shards while respecting document boundaries. Documents should be separated by a single empty line. diff --git a/scripts/split_train_valid_docs.py b/scripts/split_train_valid_docs.py index 01624da54c..41fb979ad1 100644 --- a/scripts/split_train_valid_docs.py +++ b/scripts/split_train_valid_docs.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. """ Split a large file into a train and valid set while respecting document boundaries. Documents should be separated by a single empty line. diff --git a/scripts/wav2vec_manifest.py b/scripts/wav2vec_manifest.py index 949edd58dc..c80f9883df 100644 --- a/scripts/wav2vec_manifest.py +++ b/scripts/wav2vec_manifest.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. """ Data pre-processing: build vocabularies and binarize training data. """ diff --git a/setup.py b/setup.py index 7c965010a7..1fd3f6dd34 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
from setuptools import setup, find_packages, Extension import sys @@ -37,7 +35,7 @@ url='https://github.com/pytorch/fairseq', classifiers=[ 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: BSD License', + 'License :: OSI Approved :: MIT License', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Topic :: Scientific/Engineering :: Artificial Intelligence', diff --git a/tests/test_average_checkpoints.py b/tests/test_average_checkpoints.py index e3e3c3c5fa..21f12cb421 100644 --- a/tests/test_average_checkpoints.py +++ b/tests/test_average_checkpoints.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import collections import os diff --git a/tests/test_backtranslation_dataset.py b/tests/test_backtranslation_dataset.py index 4d25839f90..b0db3e78c2 100644 --- a/tests/test_backtranslation_dataset.py +++ b/tests/test_backtranslation_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import unittest diff --git a/tests/test_binaries.py b/tests/test_binaries.py index 79650df6ba..d7f80f86ba 100644 --- a/tests/test_binaries.py +++ b/tests/test_binaries.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import contextlib from io import StringIO diff --git a/tests/test_character_token_embedder.py b/tests/test_character_token_embedder.py index c4bf6f6778..81042c2a3f 100644 --- a/tests/test_character_token_embedder.py +++ b/tests/test_character_token_embedder.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch import unittest diff --git a/tests/test_concat_dataset.py b/tests/test_concat_dataset.py index 5ced554845..dbdb2ac518 100644 --- a/tests/test_concat_dataset.py +++ b/tests/test_concat_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. 
and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import unittest diff --git a/tests/test_convtbc.py b/tests/test_convtbc.py index d2720dd0ac..fc2ac0b5dc 100644 --- a/tests/test_convtbc.py +++ b/tests/test_convtbc.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch import unittest diff --git a/tests/test_dictionary.py b/tests/test_dictionary.py index 4df94b0ce9..b41838b54f 100644 --- a/tests/test_dictionary.py +++ b/tests/test_dictionary.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import tempfile import unittest diff --git a/tests/test_iterators.py b/tests/test_iterators.py index 65c5aa70f8..e2751d2b17 100644 --- a/tests/test_iterators.py +++ b/tests/test_iterators.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import unittest diff --git a/tests/test_label_smoothing.py b/tests/test_label_smoothing.py index 5408e5c889..38a627c76c 100644 --- a/tests/test_label_smoothing.py +++ b/tests/test_label_smoothing.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import argparse import copy diff --git a/tests/test_memory_efficient_fp16.py b/tests/test_memory_efficient_fp16.py index 296b162212..78e0ac02e8 100644 --- a/tests/test_memory_efficient_fp16.py +++ b/tests/test_memory_efficient_fp16.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. 
An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import argparse import unittest diff --git a/tests/test_multi_corpus_sampled_dataset.py b/tests/test_multi_corpus_sampled_dataset.py index 5d4f3d1964..05b20328c5 100644 --- a/tests/test_multi_corpus_sampled_dataset.py +++ b/tests/test_multi_corpus_sampled_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import unittest from collections import OrderedDict diff --git a/tests/test_noising.py b/tests/test_noising.py index 060b476703..da792a1826 100644 --- a/tests/test_noising.py +++ b/tests/test_noising.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import unittest from typing import Dict, List diff --git a/tests/test_reproducibility.py b/tests/test_reproducibility.py index d1891db3da..dc67379c22 100644 --- a/tests/test_reproducibility.py +++ b/tests/test_reproducibility.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import contextlib from io import StringIO diff --git a/tests/test_sequence_generator.py b/tests/test_sequence_generator.py index b65b5e8c79..ce02400de4 100644 --- a/tests/test_sequence_generator.py +++ b/tests/test_sequence_generator.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import argparse import unittest diff --git a/tests/test_sequence_scorer.py b/tests/test_sequence_scorer.py index d33ecfcb5e..a7c2a53a90 100644 --- a/tests/test_sequence_scorer.py +++ b/tests/test_sequence_scorer.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. 
# -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import argparse import unittest diff --git a/tests/test_sparse_multihead_attention.py b/tests/test_sparse_multihead_attention.py index d6e6ebdb4c..eaf9742cdf 100644 --- a/tests/test_sparse_multihead_attention.py +++ b/tests/test_sparse_multihead_attention.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch import unittest diff --git a/tests/test_token_block_dataset.py b/tests/test_token_block_dataset.py index 89a19f5dce..41abb194da 100644 --- a/tests/test_token_block_dataset.py +++ b/tests/test_token_block_dataset.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import unittest diff --git a/tests/test_train.py b/tests/test_train.py index 20f6942bbf..7f2eb40204 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import contextlib from io import StringIO diff --git a/tests/test_utils.py b/tests/test_utils.py index 0f8dae4090..906289f547 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import unittest diff --git a/tests/utils.py b/tests/utils.py index 7eb0d297b7..9c0cea15a6 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. 
An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import argparse import torch diff --git a/train.py b/train.py index c4a95023c3..96855970fc 100644 --- a/train.py +++ b/train.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 -u -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. """ Train a new model on one or across multiple GPUs. """ From 3b2cecda72bbdd9a59f85e44c92becaeb123715f Mon Sep 17 00:00:00 2001 From: Naman Goyal Date: Tue, 30 Jul 2019 09:20:15 -0700 Subject: [PATCH 044/213] 1) replaced fstring 2) fixed error from max-positions arg Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/787 Differential Revision: D16562052 fbshipit-source-id: 640e30b2378ec917d60092558d3088a77f9741cb --- fairseq/tasks/sentence_prediction.py | 6 ++---- train.py | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/fairseq/tasks/sentence_prediction.py b/fairseq/tasks/sentence_prediction.py index 483e2469d4..1454978eb7 100644 --- a/fairseq/tasks/sentence_prediction.py +++ b/fairseq/tasks/sentence_prediction.py @@ -41,8 +41,6 @@ def add_args(parser): """Add task-specific arguments to the parser.""" parser.add_argument('data', metavar='FILE', help='file prefix for data') - parser.add_argument('--max-positions', type=int, default=512, - help='max input length') parser.add_argument('--num-classes', type=int, default=-1, help='number of classes') parser.add_argument('--init-token', type=int, default=None, @@ -160,7 +158,7 @@ def make_dataset(type, dictionary): ) ) else: - label_path = f"{get_path('label', split)}.label" + label_path = "{0}.label".format(get_path('label', split)) if os.path.exists(label_path): dataset.update( target=RawLabelDataset([ @@ -182,7 +180,7 @@ def make_dataset(type, dictionary): sort_order=[shuffle], ) - print(f"| Loaded {split} with #samples: {len(dataset)}") + print("| Loaded {0} with #samples: {1}".format(split, len(dataset))) self.datasets[split] = dataset return self.datasets[split] diff --git a/train.py b/train.py index 96855970fc..b73e362d5d 100644 --- a/train.py +++ b/train.py @@ -9,7 +9,6 @@ import collections import math -import os import random import torch @@ -258,7 +257,7 @@ def get_valid_stats(trainer, args, extra_meters=None): stats['ppl'] = utils.get_perplexity(nll_loss.avg) stats['num_updates'] = trainer.get_num_updates() if hasattr(checkpoint_utils.save_checkpoint, 'best'): - key = f'best_{args.best_checkpoint_metric}' + key = 'best_{0}'.format(args.best_checkpoint_metric) best_function = max if args.maximize_best_checkpoint_metric else min current_metric = None From d82517e9553ab690b8a264f7ca5be46a41eee700 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Tue, 30 Jul 2019 11:32:50 -0700 Subject: [PATCH 045/213] Add roberta.decode to hub interface to decode BPE (#931) Summary: Fixes https://github.com/pytorch/fairseq/issues/930. 
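A minimal sketch of the multi-sentence case the new `decode` handles (the single-sentence round trip is shown in the README diff below); `roberta` is assumed to be a hub model loaded as in the RoBERTa README, and the example strings are placeholders:

```python
# encode() joins extra segments with </s> </s>; decode() splits on that boundary,
# so multi-segment inputs are expected to come back as a list of strings.
tokens = roberta.encode('First sentence.', 'Second sentence.')
roberta.decode(tokens)  # expected: ['First sentence.', 'Second sentence.']
```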
Pull Request resolved: https://github.com/pytorch/fairseq/pull/931 Differential Revision: D16562511 Pulled By: myleott fbshipit-source-id: c4c07e2f067326b79daa547dcb3db84aeddbd555 --- examples/roberta/README.md | 2 ++ fairseq/models/roberta/hub_interface.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/examples/roberta/README.md b/examples/roberta/README.md index c01595bfb8..f8c3974bd5 100644 --- a/examples/roberta/README.md +++ b/examples/roberta/README.md @@ -60,6 +60,8 @@ $ tar -xzvf roberta.large.tar.gz >>> tokens = roberta.encode('Hello world!') >>> tokens tensor([ 0, 31414, 232, 328, 2]) +>>> roberta.decode(tokens) +'Hello world!' ``` ##### Extract features from RoBERTa: diff --git a/fairseq/models/roberta/hub_interface.py b/fairseq/models/roberta/hub_interface.py index 4fe2627bec..bb08d48f6f 100644 --- a/fairseq/models/roberta/hub_interface.py +++ b/fairseq/models/roberta/hub_interface.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F @@ -38,6 +39,19 @@ def encode(self, sentence: str, *addl_sentences) -> torch.LongTensor: tokens = self.task.source_dictionary.encode_line(bpe_sentence, append_eos=False) return tokens.long() + def decode(self, tokens: torch.LongTensor): + assert tokens.dim() == 1 + tokens = tokens.numpy() + if tokens[0] == self.task.source_dictionary.bos(): + tokens = tokens[1:] # remove + eos_mask = (tokens == self.task.source_dictionary.eos()) + doc_mask = eos_mask[1:] & eos_mask[:-1] + sentences = np.split(tokens, doc_mask.nonzero()[0] + 1) + sentences = [self.bpe.decode(self.task.source_dictionary.string(s)) for s in sentences] + if len(sentences) == 1: + return sentences[0] + return sentences + def extract_features(self, tokens: torch.LongTensor, return_all_hiddens=False) -> torch.Tensor: if tokens.dim() == 1: tokens = tokens.unsqueeze(0) From b651b000033fd8ff51d1c3bea76f4fd1897bdf9c Mon Sep 17 00:00:00 2001 From: Nathan Ng Date: Wed, 31 Jul 2019 03:41:41 -0700 Subject: [PATCH 046/213] Wmt19 models (#767) Summary: Release of the WMT 19 pretrained models Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/767 Reviewed By: edunov Differential Revision: D16472717 Pulled By: nng555 fbshipit-source-id: acf0fa3548c33f2bf2b5f71e551c782ad8c31a42 --- examples/wmt19/README.md | 97 ++++++++++++++++++++++++++++++++ fairseq/data/encoders/fastbpe.py | 34 +++++++++++ fairseq/models/transformer.py | 4 ++ fairseq/models/transformer_lm.py | 3 + 4 files changed, 138 insertions(+) create mode 100644 examples/wmt19/README.md create mode 100644 fairseq/data/encoders/fastbpe.py diff --git a/examples/wmt19/README.md b/examples/wmt19/README.md new file mode 100644 index 0000000000..3f378aa13a --- /dev/null +++ b/examples/wmt19/README.md @@ -0,0 +1,97 @@ +# WMT 19 + +This page provides pointers to the models of Facebook-FAIR's WMT'19 news translation task submission [(Ng et al., 2019)](https://arxiv.org/abs/1907.06616). 
+
+## Pre-trained models
+
+Description | Model
+---|---
+En->De Ensemble | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.bz2)
+De->En Ensemble | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.bz2)
+En->Ru Ensemble | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.bz2)
+Ru->En Ensemble | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.bz2)
+En LM | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.en.tar.bz2)
+De LM | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.de.tar.bz2)
+Ru LM | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.bz2)
+
+## Example usage (torch.hub)
+
+```
+>>> import torch
+>>> en2de = torch.hub.load(
+... 'pytorch/fairseq',
+... 'transformer.wmt19.en-de',
+... checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt',
+... tokenizer='moses',
+... bpe='fastbpe',
+... )
+>>> en2de.generate("Machine learning is great!")
+'Maschinelles Lernen ist großartig!'
+
+>>> de2en = torch.hub.load(
+... 'pytorch/fairseq',
+... 'transformer.wmt19.de-en',
+... checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt',
+... tokenizer='moses',
+... bpe='fastbpe',
+... )
+>>> de2en.generate("Maschinelles Lernen ist großartig!")
+'Machine learning is great!'
+
+>>> en2ru = torch.hub.load(
+... 'pytorch/fairseq',
+... 'transformer.wmt19.en-ru',
+... checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt',
+... tokenizer='moses',
+... bpe='fastbpe',
+... )
+>>> en2ru.generate("Machine learning is great!")
+'Машинное обучение - это здорово!'
+
+>>> ru2en = torch.hub.load(
+... 'pytorch/fairseq',
+... 'transformer.wmt19.ru-en',
+... checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt',
+... tokenizer='moses',
+... bpe='fastbpe',
+... )
+>>> ru2en.generate("Машинное обучение - это здорово!")
+'Machine learning is great!'
+
+>>> en_lm = torch.hub.load(
+... 'pytorch/fairseq',
+... 'transformer_lm.wmt19.en',
+... tokenizer='moses',
+... bpe='fastbpe',
+... )
+>>> en_lm.generate("Machine learning is")
+'Machine learning is the future of computing, says Microsoft boss Satya Nadella ...'
+
+>>> de_lm = torch.hub.load(
+... 'pytorch/fairseq',
+... 'transformer_lm.wmt19.de',
+... tokenizer='moses',
+... bpe='fastbpe',
+... )
+>>> de_lm.generate("Maschinelles lernen ist")
+'Maschinelles lernen ist das A und O (neues-deutschland.de) Die Arbeitsbedingungen für Lehrerinnen und Lehrer sind seit Jahren verbesserungswürdig ...'
+
+>>> ru_lm = torch.hub.load(
+... 'pytorch/fairseq',
+... 'transformer_lm.wmt19.ru',
+... tokenizer='moses',
+... bpe='fastbpe',
+... )
+>>> ru_lm.generate("машинное обучение это")
+'машинное обучение это то, что мы называем "искусственным интеллектом".'
+```
+
+## Citation
+```bibtex
+@inproceedings{ng2019facebook},
+  title = {Facebook FAIR's WMT19 News Translation Task Submission},
+  author = {Ng, Nathan and Yee, Kyra and Baevski, Alexei and Ott, Myle and Auli, Michael and Edunov, Sergey},,
+  booktitle = {Conference of the Association for Computational Linguistics (ACL)},
+  year = 2019,
+}
+```
diff --git a/fairseq/data/encoders/fastbpe.py b/fairseq/data/encoders/fastbpe.py
new file mode 100644
index 0000000000..61a8f726ec
--- /dev/null
+++ b/fairseq/data/encoders/fastbpe.py
@@ -0,0 +1,34 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from fairseq import file_utils +from fairseq.data.encoders import register_bpe + +@register_bpe('fastbpe') +class fastBPE(object): + + @staticmethod + def add_args(parser): + # fmt: off + parser.add_argument('--bpe-codes', type=str, + help='path to fastBPE BPE') + # fmt: on + + def __init__(self, args): + if args.bpe_codes is None: + raise ValueError('--bpe-codes is required for --bpe=subword_nmt') + codes = file_utils.cached_path(args.bpe_codes) + try: + import fastBPE + self.bpe = fastBPE.fastBPE(codes) + self.bpe_symbol = "@@ " + except ImportError: + raise ImportError('Please install fastbpe at https://github.com/glample/fastBPE') + + def encode(self, x: str) -> str: + return self.bpe.apply([x])[0] + + def decode(self, x: str) -> str: + return (x + ' ').replace(self.bpe_symbol, '').rstrip() diff --git a/fairseq/models/transformer.py b/fairseq/models/transformer.py index 53e8f26555..4afbe93a63 100644 --- a/fairseq/models/transformer.py +++ b/fairseq/models/transformer.py @@ -53,6 +53,10 @@ def hub_models(cls): 'transformer.wmt14.en-fr': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-fr.joined-dict.transformer.tar.bz2', 'transformer.wmt16.en-de': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt16.en-de.joined-dict.transformer.tar.bz2', 'transformer.wmt18.en-de': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt18.en-de.ensemble.tar.gz', + 'transformer.wmt19.en-de': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.bz2', + 'transformer.wmt19.en-ru': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.bz2', + 'transformer.wmt19.de-en': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.bz2', + 'transformer.wmt19.ru-en': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.bz2', } def __init__(self, encoder, decoder): diff --git a/fairseq/models/transformer_lm.py b/fairseq/models/transformer_lm.py index 4d5e68e947..febdc9adc9 100644 --- a/fairseq/models/transformer_lm.py +++ b/fairseq/models/transformer_lm.py @@ -29,6 +29,9 @@ def hub_models(cls): return { 'transformer_lm.gbw.adaptive_huge': 'https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2', 'transformer_lm.wiki103.adaptive': 'https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.tar.bz2', + 'transformer_lm.wmt19.en': 'https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.en.tar.bz2', + 'transformer_lm.wmt19.de': 'https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.de.tar.bz2', + 'transformer_lm.wmt19.ru': 'https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.bz2', } def __init__(self, decoder): From 37eb9f2b3c36d29ac557524fb4b9dbb644ef4f8b Mon Sep 17 00:00:00 2001 From: Johannes Villmow Date: Wed, 31 Jul 2019 05:54:52 -0700 Subject: [PATCH 047/213] Use commandline interface in preprocess_GLUE_tasks.sh (#937) Summary: Just a small fix for issue https://github.com/pytorch/fairseq/issues/936 . 
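As an aside on the `fastbpe` encoder registered in the previous commit: a minimal sketch of building it programmatically, outside of torch.hub. The codes path is a placeholder and the external `fastBPE` Python package is assumed to be installed.

```python
from argparse import Namespace

from fairseq.data import encoders

# build_bpe() dispatches on args.bpe through the @register_bpe registry.
bpe = encoders.build_bpe(Namespace(bpe='fastbpe', bpe_codes='/path/to/bpecodes'))
bpe.decode(bpe.encode('Machine learning is great!'))  # round-trips through fastBPE
```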
Pull Request resolved: https://github.com/pytorch/fairseq/pull/937 Differential Revision: D16580263 Pulled By: myleott fbshipit-source-id: 1777e782491c63697726e95bd555892da3fed4ec --- examples/roberta/preprocess_GLUE_tasks.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/roberta/preprocess_GLUE_tasks.sh b/examples/roberta/preprocess_GLUE_tasks.sh index 56addbc292..52a5ffa1e9 100755 --- a/examples/roberta/preprocess_GLUE_tasks.sh +++ b/examples/roberta/preprocess_GLUE_tasks.sh @@ -159,7 +159,7 @@ do for INPUT_TYPE in $(seq 0 $((INPUT_COUNT-1))) do LANG="input$INPUT_TYPE" - python preprocess.py \ + fairseq-preprocess \ --only-source \ --trainpref "$TASK_DATA_FOLDER/processed/train.$LANG" \ --validpref "${DEVPREF//LANG/$LANG}" \ @@ -170,7 +170,7 @@ do done if [[ "$TASK" != "STS-B" ]] then - python preprocess.py \ + fairseq-preprocess \ --only-source \ --trainpref "$TASK_DATA_FOLDER/processed/train.label" \ --validpref "${DEVPREF//LANG/'label'}" \ From c5650bfc0a41ebcbb8940dc0430bb6c095ca09c7 Mon Sep 17 00:00:00 2001 From: Dongjin Na Date: Wed, 31 Jul 2019 08:30:37 -0700 Subject: [PATCH 048/213] Update language_model README.md (#941) Summary: Adding a backslash in the convolutional language model training usage. Pull Request resolved: https://github.com/pytorch/fairseq/pull/941 Differential Revision: D16581388 Pulled By: myleott fbshipit-source-id: 7e2e05ecf13e86cb844dc5200d49f560c63b12ff --- examples/language_model/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/language_model/README.md b/examples/language_model/README.md index d598cd0e20..4b041146e3 100644 --- a/examples/language_model/README.md +++ b/examples/language_model/README.md @@ -82,7 +82,7 @@ $ fairseq-train --task language_modeling data-bin/wikitext-103 \ --max-epoch 35 --arch fconv_lm_dauphin_wikitext103 --optimizer nag \ --lr 1.0 --lr-scheduler reduce_lr_on_plateau --lr-shrink 0.5 \ --clip-norm 0.1 --dropout 0.2 --weight-decay 5e-06 --criterion adaptive_loss \ - --adaptive-softmax-cutoff 10000,20000,200000 --max-tokens 1024 --tokens-per-sample 1024 + --adaptive-softmax-cutoff 10000,20000,200000 --max-tokens 1024 --tokens-per-sample 1024 \ --ddp-backend=no_c10d # Evaluate: From fe8a163986672bbbec1a922231be229cc79dafe6 Mon Sep 17 00:00:00 2001 From: ngoyal2707 Date: Wed, 31 Jul 2019 14:57:55 -0700 Subject: [PATCH 049/213] Roberta add classification finetuning example readme (#790) Summary: Added readme for IMDB classification as tutorial for custm finetuning of roberta Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/790 Reviewed By: myleott Differential Revision: D16587877 Pulled By: myleott fbshipit-source-id: ed265b7254e6fa2fc8a899ba04c0d2bb45a7f5c4 --- .../README.finetune_custom_classification.md | 120 ++++++++++++++++++ examples/roberta/README.md | 3 + 2 files changed, 123 insertions(+) create mode 100644 examples/roberta/README.finetune_custom_classification.md diff --git a/examples/roberta/README.finetune_custom_classification.md b/examples/roberta/README.finetune_custom_classification.md new file mode 100644 index 0000000000..de3a4cc37a --- /dev/null +++ b/examples/roberta/README.finetune_custom_classification.md @@ -0,0 +1,120 @@ +# RoBERTa fine-tuning on custom classification task (example IMDB) + +## 1) Get the data +``` +wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz +tar zxvf aclImdb_v1.tar.gz +``` + +## 2) Format data +`IMDB` data has one data-sample in each file, below python code-snippet converts it one 
file for train and valid each for ease of processing. +``` +import argparse +import os +import random +from glob import glob + +random.seed(0) + +def main(args): + for split in ['train', 'test']: + samples = [] + for class_label in ['pos', 'neg']: + fnames = glob(os.path.join(args.datadir, split, class_label) + '/*.txt') + for fname in fnames: + with open(fname) as fin: + line = fin.readline() + samples.append((line, 1 if class_label == 'pos' else 0)) + random.shuffle(samples) + out_fname = 'train' if split == 'train' else 'dev' + f1 = open(os.path.join(args.datadir, out_fname + '.input0'), 'w') + f2 = open(os.path.join(args.datadir, out_fname + '.label'), 'w') + for sample in samples: + f1.write(sample[0] + '\n') + f2.write(str(sample[1]) + '\n') + f1.close() + f2.close() + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--datadir', default='aclImdb') + args = parser.parse_args() + main(args) +``` + +## 3) BPE Encode +Run `multiprocessing_bpe_encoder`, you can also do this in previous step for each sample but that might be slower. +``` +# Download encoder.json and vocab.bpe +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json' +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe' + +for SPLIT in train dev; +do + python -m examples.roberta.multiprocessing_bpe_encoder \ + --encoder-json encoder.json \ + --vocab-bpe vocab.bpe \ + --inputs "aclImdb/$SPLIT.input0" \ + --outputs "aclImdb/$SPLIT.input0.bpe" \ + --workers 60 \ + --keep-empty; +done +``` + + +## 4) Preprocess data + +``` +# Download fairseq dictionary. +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt' + +fairseq-preprocess \ + --only-source \ + --trainpref "aclImdb/train.input0.bpe" \ + --validpref "aclImdb/dev.input0.bpe" \ + --destdir "IMDB-bin/input0" \ + --workers 60 \ + --srcdict dict.txt; + +fairseq-preprocess \ + --only-source \ + --trainpref "aclImdb/train.label" \ + --validpref "aclImdb/dev.label" \ + --destdir "IMDB-bin/label" \ + --workers 60; + +``` + +## 5) Run Training + +``` +TOTAL_NUM_UPDATES=7812 # 10 epochs through IMDB for bsz 32 +WARMUP_UPDATES=469 # 6 percent of the number of updates +LR=1e-05 # Peak LR for polynomial LR scheduler. +NUM_CLASSES=2 +MAX_SENTENCES=8 # Batch size. + +CUDA_VISIBLE_DEVICES=0 python train.py IMDB-bin/ \ +--restore-file \ +--max-positions 512 \ +--max-sentences $MAX_SENTENCES \ +--max-tokens 4400 \ +--task sentence_prediction \ +--reset-optimizer --reset-dataloader --reset-meters \ +--required-batch-size-multiple 1 \ +--init-token 0 --separator-token 2 \ +--arch roberta_large \ +--criterion sentence_prediction \ +--num-classes $NUM_CLASSES \ +--dropout 0.1 --attention-dropout 0.1 \ +--weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ +--clip-norm 0.0 \ +--lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \ +--fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ +--max-epoch 10 \ +--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ +--truncate-sequence \ +--update-freq 4; +``` +Above will train with effective batch-size of `32`, tested on one `Nvidia V100 32gb`. +Expected `best-validation-accuracy` after `10` epochs is `~96.5%`. 
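A rough sketch of using the resulting checkpoint for prediction (not part of the recipe above; the paths and the `sentence_classification_head` name are assumptions based on the `sentence_prediction` defaults):

```python
from fairseq.models.roberta import RobertaModel

# Load the fine-tuned checkpoint together with the binarized IMDB data directory.
roberta = RobertaModel.from_pretrained(
    'checkpoints', checkpoint_file='checkpoint_best.pt', data_name_or_path='IMDB-bin'
)
roberta.eval()
tokens = roberta.encode('This movie was a complete waste of time.')
# argmax gives an index into the label dictionary built in step 4.
pred = roberta.predict('sentence_classification_head', tokens).argmax().item()
```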
diff --git a/examples/roberta/README.md b/examples/roberta/README.md index f8c3974bd5..989c9d750e 100644 --- a/examples/roberta/README.md +++ b/examples/roberta/README.md @@ -208,6 +208,9 @@ b) Above cmd-args and hyperparams are tested on one Nvidia `V100` GPU with `32gb c) All the settings in above table are suggested settings based on our hyperparam search within a fixed search space (for careful comparison across models). You might be able to find better metrics with wider hyperparam search. +## Fine-tuning on custom classification tasks +[Example of fine-tuning Roberta on simple custom classification task](README.finetune_custom_classification.md) + ## Pretraining using your own data You can use the [`masked_lm` task](/fairseq/tasks/masked_lm.py) to pretrain RoBERTa from scratch, or to continue pretraining RoBERTa starting from one of the released checkpoints. From 94722a9fb87f4a36d56d9e1888fb54ea010c7a91 Mon Sep 17 00:00:00 2001 From: Nathan Ng Date: Wed, 31 Jul 2019 15:29:51 -0700 Subject: [PATCH 050/213] Fix citation errors (#791) Summary: Fixing booktitle in wmt19 citation Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/791 Reviewed By: myleott Differential Revision: D16589372 Pulled By: nng555 fbshipit-source-id: 28402784bb6ef0615e46b8d8383bfa52d79e46de --- examples/wmt19/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/wmt19/README.md b/examples/wmt19/README.md index 3f378aa13a..a7c83172e4 100644 --- a/examples/wmt19/README.md +++ b/examples/wmt19/README.md @@ -90,8 +90,8 @@ Ru LM | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/w ```bibtex @inproceedings{ng2019facebook}, title = {Facebook FAIR's WMT19 News Translation Task Submission}, - author = {Ng, Nathan and Yee, Kyra and Baevski, Alexei and Ott, Myle and Auli, Michael and Edunov, Sergey},, - booktitle = {Conference of the Association for Computational Linguistics (ACL)}, + author = {Ng, Nathan and Yee, Kyra and Baevski, Alexei and Ott, Myle and Auli, Michael and Edunov, Sergey}, + booktitle = {Proc. 
of WMT}, year = 2019, } ``` From 3e0e5becff16580c4b6758ceb0a2ce0e837da422 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Wed, 31 Jul 2019 18:17:10 -0700 Subject: [PATCH 051/213] Fix small syntax error in hub_utils.py (fixes #942) Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/944 Differential Revision: D16593568 Pulled By: myleott fbshipit-source-id: 611bccae2ad0b8dc704c47a8a3343161010c2356 --- fairseq/hub_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fairseq/hub_utils.py b/fairseq/hub_utils.py index bdd025bfca..0eaafc0097 100644 --- a/fairseq/hub_utils.py +++ b/fairseq/hub_utils.py @@ -18,7 +18,7 @@ def from_pretrained( checkpoint_file='model.pt', data_name_or_path='.', archive_map=None, - **kwargs, + **kwargs ): from fairseq import checkpoint_utils, file_utils From 5b2be870f4008f54ccd145e10d4de24d2db9c1de Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Thu, 1 Aug 2019 05:51:19 -0700 Subject: [PATCH 052/213] Update PyTorch Hub interface Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/782 Differential Revision: D16542256 Pulled By: myleott fbshipit-source-id: ea3279e7a1ce4687a5914f32b76787c419be1ffa --- fairseq/checkpoint_utils.py | 22 +---- fairseq/hub_utils.py | 161 ++++++++++++++++++++++---------- fairseq/models/fairseq_model.py | 13 +-- fairseq/registry.py | 21 +++++ hubconf.py | 6 +- 5 files changed, 149 insertions(+), 74 deletions(-) diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py index 859f65351a..70cb948270 100644 --- a/fairseq/checkpoint_utils.py +++ b/fairseq/checkpoint_utils.py @@ -301,30 +301,14 @@ def _upgrade_state_dict(state): if not hasattr(state['args'], 'task'): state['args'].task = 'translation' - def set_defaults(cls): - if not hasattr(cls, 'add_args'): - return - parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, allow_abbrev=False) - cls.add_args(parser) - # copied from argparse.py: - defaults = argparse.Namespace() - for action in parser._actions: - if action.dest is not argparse.SUPPRESS: - if not hasattr(defaults, action.dest): - if action.default is not argparse.SUPPRESS: - setattr(defaults, action.dest, action.default) - for key, default_value in vars(defaults).items(): - if not hasattr(state['args'], key): - setattr(state['args'], key, default_value) - # set any missing default values in the task, model or other registries - set_defaults(tasks.TASK_REGISTRY[state['args'].task]) - set_defaults(models.ARCH_MODEL_REGISTRY[state['args'].arch]) + registry.set_defaults(state['args'], tasks.TASK_REGISTRY[state['args'].task]) + registry.set_defaults(state['args'], models.ARCH_MODEL_REGISTRY[state['args'].arch]) for registry_name, REGISTRY in registry.REGISTRIES.items(): choice = getattr(state['args'], registry_name, None) if choice is not None: cls = REGISTRY['registry'][choice] - set_defaults(cls) + registry.set_defaults(state['args'], cls) return state diff --git a/fairseq/hub_utils.py b/fairseq/hub_utils.py index 0eaafc0097..06a2c55723 100644 --- a/fairseq/hub_utils.py +++ b/fairseq/hub_utils.py @@ -4,13 +4,16 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+import argparse +import copy import os +from typing import List import torch +from torch import nn from fairseq import utils from fairseq.data import encoders -from fairseq.models import BaseFairseqModel def from_pretrained( @@ -56,22 +59,19 @@ def from_pretrained( } -class Generator(BaseFairseqModel): - """PyTorch Hub API for generating sequences from a pre-trained translation - or language model.""" +class GeneratorHubInterface(nn.Module): + """ + PyTorch Hub interface for generating sequences from a pre-trained + translation or language model. + """ def __init__(self, args, task, models): + super().__init__() self.args = args self.task = task - self.models = models + self.models = nn.ModuleList(models) self.src_dict = task.source_dictionary self.tgt_dict = task.target_dictionary - self.use_cuda = torch.cuda.is_available() and not getattr(args, 'cpu', False) - - if self.use_cuda: - if getattr(args, 'fp16', False): - self.half() - self.cuda() # optimize model for generation for model in self.models: @@ -83,8 +83,6 @@ def __init__(self, args, task, models): need_attn=getattr(args, 'print_alignment', False), ) - self.generator = self.task.build_generator(args) - # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) self.align_dict = utils.load_align_dict(getattr(args, 'replace_unk', None)) @@ -92,53 +90,122 @@ def __init__(self, args, task, models): self.tokenizer = encoders.build_tokenizer(args) self.bpe = encoders.build_bpe(args) - def generate(self, src_str, verbose=False): + # this is useful for determining the device + self.register_buffer('_float_tensor', torch.tensor([0], dtype=torch.float)) - def preprocess(s): - if self.tokenizer is not None: - s = self.tokenizer.encode(s) - if self.bpe is not None: - s = self.bpe.encode(s) - return s + @property + def device(self): + return self._float_tensor.device - def postprocess(s): - if self.bpe is not None: - s = self.bpe.decode(s) - if self.tokenizer is not None: - s = self.tokenizer.decode(s) - return s + def translate(self, sentence: str, verbose: bool = False, **kwargs) -> str: + input = self.encode(sentence) + hypo = self.generate(input, verbose, **kwargs) + return self.decode(hypo) - src_str = preprocess(src_str) - tokens = self.src_dict.encode_line(src_str, add_if_not_exist=False).long() - if verbose: - src_str_with_unk = self.src_dict.string(tokens) - print('S\t{}'.format(src_str_with_unk)) + def generate(self, tokens: torch.LongTensor, verbose: bool = False, **kwargs) -> torch.LongTensor: + sample = self._build_sample(tokens) - dataset = self.task.build_dataset_for_inference([tokens], [tokens.numel()]) - sample = dataset.collater([dataset[0]]) - if self.use_cuda: - sample = utils.move_to_cuda(sample) + # build generator using current args as well as any kwargs + gen_args = copy.copy(self.args) + for k, v in kwargs.items(): + setattr(gen_args, k, v) + generator = self.task.build_generator(gen_args) + + translations = self.task.inference_step(generator, self.models, sample) - translations = self.task.inference_step(self.generator, self.models, sample) + if verbose: + src_str_with_unk = self.string(tokens) + print('S\t{}'.format(src_str_with_unk)) # Process top predictions for hypo in translations[0][:min(len(translations), getattr(self.args, 'nbest', 1))]: - hypo_tokens, hypo_str, alignment = utils.post_process_prediction( - hypo_tokens=hypo['tokens'].int().cpu(), - src_str=src_str, - alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not 
None else None, - align_dict=self.align_dict, - tgt_dict=self.tgt_dict, - ) - hypo_str = postprocess(hypo_str) + hypo_str = self.decode(hypo['tokens']) if verbose: print('H\t{}\t{}'.format(hypo['score'], hypo_str)) print('P\t{}'.format( ' '.join(map(lambda x: '{:.4f}'.format(x), hypo['positional_scores'].tolist())) )) - if getattr(self.args, 'print_alignment', False): + if hypo['alignment'] is not None and getattr(self.args, 'print_alignment', False): print('A\t{}'.format( - ' '.join(map(lambda x: str(utils.item(x)), alignment)) + ' '.join(map(lambda x: str(utils.item(x)), hypo['alignment'].int().cpu())) )) - return hypo_str + return hypo['tokens'] + + def encode(self, sentence: str) -> torch.LongTensor: + sentence = self.tokenize(sentence) + sentence = self.apply_bpe(sentence) + return self.binarize(sentence) + + def decode(self, tokens: torch.LongTensor) -> str: + sentence = self.string(tokens) + sentence = self.remove_bpe(sentence) + return self.detokenize(sentence) + + def tokenize(self, sentence: str) -> str: + if self.tokenizer is not None: + sentence = self.tokenizer.encode(sentence) + return sentence + + def detokenize(self, sentence: str) -> str: + if self.tokenizer is not None: + sentence = self.tokenizer.decode(sentence) + return sentence + + def apply_bpe(self, sentence: str) -> str: + if self.bpe is not None: + sentence = self.bpe.encode(sentence) + return sentence + + def remove_bpe(self, sentence: str) -> str: + if self.bpe is not None: + sentence = self.bpe.decode(sentence) + return sentence + + def binarize(self, sentence: str) -> torch.LongTensor: + return self.src_dict.encode_line(sentence, add_if_not_exist=False).long() + + def string(self, tokens: torch.LongTensor) -> str: + return self.tgt_dict.string(tokens) + + def _build_sample(self, src_tokens: torch.LongTensor): + assert torch.is_tensor(src_tokens) + dataset = self.task.build_dataset_for_inference([src_tokens], [src_tokens.numel()]) + sample = dataset.collater([dataset[0]]) + sample = utils.apply_to_sample( + lambda tensor: tensor.to(self.device), + sample + ) + return sample + + +class BPEHubInterface(object): + """PyTorch Hub interface for Byte-Pair Encoding (BPE).""" + + def __init__(self, bpe, **kwargs): + super().__init__() + args = argparse.Namespace(bpe=bpe, **kwargs) + self.bpe = encoders.build_bpe(args) + assert self.bpe is not None + + def encode(self, sentence: str) -> str: + return self.bpe.encode(sentence) + + def decode(self, sentence: str) -> str: + return self.bpe.decode(sentence) + + +class TokenizerHubInterface(object): + """PyTorch Hub interface for tokenization.""" + + def __init__(self, tokenizer, **kwargs): + super().__init__() + args = argparse.Namespace(tokenizer=tokenizer, **kwargs) + self.tokenizer = encoders.build_tokenizer(args) + assert self.tokenizer is not None + + def encode(self, sentence: str) -> str: + return self.tokenizer.encode(sentence) + + def decode(self, sentence: str) -> str: + return self.tokenizer.decode(sentence) diff --git a/fairseq/models/fairseq_model.py b/fairseq/models/fairseq_model.py index 0cf7cf2684..37f1aa456f 100644 --- a/fairseq/models/fairseq_model.py +++ b/fairseq/models/fairseq_model.py @@ -147,12 +147,13 @@ def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_na Load a :class:`~fairseq.models.FairseqModel` from a pre-trained model file. Downloads and caches the pre-trained model file if needed. 
- The base implementation returns a :class:`fairseq.hub_utils.Generator`, - which can be used to generate translations or sample from language - models. The underlying :class:`~fairseq.models.FairseqModel` can be - accessed via the *generator.models* attribute. + The base implementation returns a + :class:`~fairseq.hub_utils.GeneratorHubInterface`, which can be used to + generate translations or sample from language models. The underlying + :class:`~fairseq.models.FairseqModel` can be accessed via the + *generator.models* attribute. - Other models may override this to implement custom PyTorch Hub APIs. + Other models may override this to implement custom hub interfaces. Args: model_name_or_path (str): either the name of a pre-trained model to @@ -172,7 +173,7 @@ def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_na **kwargs, ) print(x['args']) - return hub_utils.Generator(x['args'], x['task'], x['models']) + return hub_utils.GeneratorHubInterface(x['args'], x['task'], x['models']) @classmethod def hub_models(cls): diff --git a/fairseq/registry.py b/fairseq/registry.py index cb0c984ade..ed24258c57 100644 --- a/fairseq/registry.py +++ b/fairseq/registry.py @@ -3,6 +3,8 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +import argparse + REGISTRIES = {} @@ -35,6 +37,7 @@ def build_x(args, *extra_args, **extra_kwargs): builder = getattr(cls, 'build_' + registry_name) else: builder = cls + set_defaults(args, cls) return builder(args, *extra_args, **extra_kwargs) def register_x(name): @@ -57,3 +60,21 @@ def register_x_cls(cls): return register_x_cls return build_x, register_x, REGISTRY + + +def set_defaults(args, cls): + """Helper to set default arguments based on *add_args*.""" + if not hasattr(cls, 'add_args'): + return + parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, allow_abbrev=False) + cls.add_args(parser) + # copied from argparse.py: + defaults = argparse.Namespace() + for action in parser._actions: + if action.dest is not argparse.SUPPRESS: + if not hasattr(defaults, action.dest): + if action.default is not argparse.SUPPRESS: + setattr(defaults, action.dest, action.default) + for key, default_value in vars(defaults).items(): + if not hasattr(args, key): + setattr(args, key, default_value) diff --git a/hubconf.py b/hubconf.py index 90acec6775..7e1574a684 100644 --- a/hubconf.py +++ b/hubconf.py @@ -5,6 +5,8 @@ import functools +from fairseq.hub_utils import BPEHubInterface as bpe # noqa +from fairseq.hub_utils import TokenizerHubInterface as tokenizer # noqa from fairseq.models import MODEL_REGISTRY @@ -18,11 +20,11 @@ ] -for model_type, _cls in MODEL_REGISTRY.items(): +for _model_type, _cls in MODEL_REGISTRY.items(): for model_name in _cls.hub_models().keys(): globals()[model_name] = functools.partial( _cls.from_pretrained, model_name_or_path=model_name, ) # to simplify the interface we only expose named models - #globals()[model_type] = _cls.from_pretrained + # globals()[_model_type] = _cls.from_pretrained From 4abadbdf775e8e1a0088da68677842a2330d36d9 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Thu, 1 Aug 2019 07:28:55 -0700 Subject: [PATCH 053/213] Fix sampling with beam>1 Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/792 Differential Revision: D16591987 Pulled By: myleott fbshipit-source-id: d27c490ae75f80ded19226b8384f4776485dd694 --- fairseq/options.py | 4 +- fairseq/search.py | 2 +- fairseq/sequence_generator.py | 214 
++++++++++++++----------------- fairseq/tasks/fairseq_task.py | 1 - tests/test_sequence_generator.py | 17 --- 5 files changed, 99 insertions(+), 139 deletions(-) diff --git a/fairseq/options.py b/fairseq/options.py index e29187ee5d..006f9b6c05 100644 --- a/fairseq/options.py +++ b/fairseq/options.py @@ -448,9 +448,7 @@ def add_generation_args(parser): group.add_argument('--match-source-len', default=False, action='store_true', help=('generations should match the source length')) group.add_argument('--no-early-stop', action='store_true', - help=('continue searching even after finalizing k=beam ' - 'hypotheses; this is more correct, but increases ' - 'generation time by 50%%')) + help='deprecated') group.add_argument('--unnormalized', action='store_true', help='compare unnormalized hypothesis scores') group.add_argument('--no-beamable-mm', action='store_true', diff --git a/fairseq/search.py b/fairseq/search.py index 02dcf628ca..61d0df8d48 100644 --- a/fairseq/search.py +++ b/fairseq/search.py @@ -25,7 +25,7 @@ def _init_buffers(self, t): self.indices_buf = torch.LongTensor().to(device=t.device) self.beams_buf = torch.LongTensor().to(device=t.device) - def step(self, step, lprobs, scores, beam_size): + def step(self, step, lprobs, scores): """Take a single search step. Args: diff --git a/fairseq/sequence_generator.py b/fairseq/sequence_generator.py index 87016e9281..a6504dab6f 100644 --- a/fairseq/sequence_generator.py +++ b/fairseq/sequence_generator.py @@ -19,7 +19,6 @@ def __init__( max_len_a=0, max_len_b=200, min_len=1, - stop_early=True, normalize_scores=True, len_penalty=1., unk_penalty=0., @@ -42,9 +41,6 @@ def __init__( ax + b, where x is the source length min_len (int, optional): the minimum length of the generated output (not including end-of-sentence) - stop_early (bool, optional): stop generation immediately after we - finalize beam_size hypotheses, even though longer hypotheses - might have better normalized scores (default: True) normalize_scores (bool, optional): normalize scores by the length of the output (default: True) len_penalty (float, optional): length penalty, where <1.0 favors @@ -78,7 +74,6 @@ def __init__( self.max_len_a = max_len_a self.max_len_b = max_len_b self.min_len = min_len - self.stop_early = stop_early self.normalize_scores = normalize_scores self.len_penalty = len_penalty self.unk_penalty = unk_penalty @@ -156,7 +151,7 @@ def generate( # initialize buffers scores = src_tokens.new(bsz * beam_size, max_len + 1).float().fill_(0) scores_buf = scores.clone() - tokens = src_tokens.data.new(bsz * beam_size, max_len + 2).long().fill_(self.pad) + tokens = src_tokens.new(bsz * beam_size, max_len + 2).long().fill_(self.pad) tokens_buf = tokens.clone() tokens[:, 0] = self.eos if bos_token is None else bos_token attn, attn_buf = None, None @@ -164,10 +159,15 @@ def generate( if prefix_tokens is not None: partial_prefix_mask_buf = torch.zeros_like(src_lengths).byte() + # The blacklist indicates candidates that should be ignored. + # For example, suppose we're sampling and have already finalized 2/5 + # samples. Then the blacklist would mark 2 positions as being ignored, + # so that we only finalize the remaining 3 samples. 
+ blacklist = src_tokens.new(bsz, beam_size).byte().fill_(0) + # list of completed sentences finalized = [[] for i in range(bsz)] finished = [False for i in range(bsz)] - worst_finalized = [{'idx': None, 'score': -math.inf} for i in range(bsz)] num_remaining_sent = bsz # number of candidate hypos per step @@ -185,7 +185,7 @@ def buffer(name, type_of=tokens): # noqa buffers[name] = type_of.new() return buffers[name] - def is_finished(sent, step, unfin_idx, unfinalized_scores=None): + def is_finished(sent, step, unfin_idx): """ Check whether we've finished generation for a given sentence, by comparing the worst score among finalized hypotheses to the best @@ -193,18 +193,10 @@ def is_finished(sent, step, unfin_idx, unfinalized_scores=None): """ assert len(finalized[sent]) <= beam_size if len(finalized[sent]) == beam_size: - if self.stop_early or step == max_len or unfinalized_scores is None: - return True - # stop if the best unfinalized score is worse than the worst - # finalized one - best_unfinalized_score = unfinalized_scores[unfin_idx].max() - if self.normalize_scores: - best_unfinalized_score /= max_len ** self.len_penalty - if worst_finalized[sent]['score'] >= best_unfinalized_score: - return True + return True return False - def finalize_hypos(step, bbsz_idx, eos_scores, unfinalized_scores=None): + def finalize_hypos(step, bbsz_idx, eos_scores): """ Finalize the given hypotheses at this step, while keeping the total number of finalized hypotheses per sentence <= beam_size. @@ -219,14 +211,13 @@ def finalize_hypos(step, bbsz_idx, eos_scores, unfinalized_scores=None): indicating which hypotheses to finalize eos_scores: A vector of the same size as bbsz_idx containing scores for each hypothesis - unfinalized_scores: A vector containing scores for all - unfinalized hypotheses """ assert bbsz_idx.numel() == eos_scores.numel() # clone relevant token and attention tensors tokens_clone = tokens.index_select(0, bbsz_idx) tokens_clone = tokens_clone[:, 1:step + 2] # skip the first index, which is EOS + assert not tokens_clone.eq(self.eos).any() tokens_clone[:, step] = self.eos attn_clone = attn.index_select(0, bbsz_idx)[:, :, 1:step+2] if attn is not None else None @@ -278,23 +269,11 @@ def get_hypo(): if len(finalized[sent]) < beam_size: finalized[sent].append(get_hypo()) - elif not self.stop_early and score > worst_finalized[sent]['score']: - # replace worst hypo for this sentence with new/better one - worst_idx = worst_finalized[sent]['idx'] - if worst_idx is not None: - finalized[sent][worst_idx] = get_hypo() - - # find new worst finalized hypo for this sentence - idx, s = min(enumerate(finalized[sent]), key=lambda r: r[1]['score']) - worst_finalized[sent] = { - 'score': s['score'], - 'idx': idx, - } newly_finished = [] for sent, unfin_idx in sents_seen: # check termination conditions for this sentence - if not finished[sent] and is_finished(sent, step, unfin_idx, unfinalized_scores): + if not finished[sent] and is_finished(sent, step, unfin_idx): finished[sent] = True newly_finished.append(unfin_idx) return newly_finished @@ -318,6 +297,13 @@ def get_hypo(): lprobs[:, self.pad] = -math.inf # never select pad lprobs[:, self.unk] -= self.unk_penalty # apply unk penalty + # handle min and max length constraints + if step >= max_len: + lprobs[:, :self.eos] = -math.inf + lprobs[:, self.eos + 1:] = -math.inf + elif step < self.min_len: + lprobs[:, self.eos] = -math.inf + if self.no_repeat_ngram_size > 0: # for each beam and batch sentence, generate a list of previous ngrams gen_ngrams = [{} for 
bbsz_idx in range(bsz * beam_size)] @@ -339,105 +325,92 @@ def get_hypo(): scores_buf = scores_buf.type_as(lprobs) eos_bbsz_idx = buffer('eos_bbsz_idx') eos_scores = buffer('eos_scores', type_of=scores) - if step < max_len: - self.search.set_src_lengths(src_lengths) - - if self.no_repeat_ngram_size > 0: - def calculate_banned_tokens(bbsz_idx): - # before decoding the next token, prevent decoding of ngrams that have already appeared - ngram_index = tuple(tokens[bbsz_idx, step + 2 - self.no_repeat_ngram_size:step + 1].tolist()) - return gen_ngrams[bbsz_idx].get(ngram_index, []) - - if step + 2 - self.no_repeat_ngram_size >= 0: - # no banned tokens if we haven't generated no_repeat_ngram_size tokens yet - banned_tokens = [calculate_banned_tokens(bbsz_idx) for bbsz_idx in range(bsz * beam_size)] - else: - banned_tokens = [[] for bbsz_idx in range(bsz * beam_size)] - - for bbsz_idx in range(bsz * beam_size): - lprobs[bbsz_idx, banned_tokens[bbsz_idx]] = -math.inf - - if prefix_tokens is not None and step < prefix_tokens.size(1): - assert isinstance(self.search, search.BeamSearch) or bsz == 1, \ - "currently only BeamSearch supports decoding with prefix_tokens" - probs_slice = lprobs.view(bsz, -1, lprobs.size(-1))[:, 0, :] - cand_scores = torch.gather( - probs_slice, dim=1, - index=prefix_tokens[:, step].view(-1, 1) - ).view(-1, 1).repeat(1, cand_size) - if step > 0: - # save cumulative scores for each hypothesis - cand_scores.add_(scores[:, step - 1].view(bsz, beam_size).repeat(1, 2)) - cand_indices = prefix_tokens[:, step].view(-1, 1).repeat(1, cand_size) - cand_beams = torch.zeros_like(cand_indices) - - # handle prefixes of different lengths - # when step == prefix_tokens.size(1), we'll have new free-decoding batches - if prefix_tokens is not None and step <= prefix_tokens.size(1): - if step < prefix_tokens.size(1): - partial_prefix_mask = prefix_tokens[:, step].eq(self.pad) - else: # all prefixes finished force-decoding - partial_prefix_mask = torch.ones(bsz).to(prefix_tokens).byte() - if partial_prefix_mask.any(): - # track new free-decoding batches, at whose very first step - # only use the first beam to eliminate repeats - prefix_step0_mask = partial_prefix_mask ^ partial_prefix_mask_buf - lprobs.view(bsz, beam_size, -1)[prefix_step0_mask, 1:] = -math.inf - partial_scores, partial_indices, partial_beams = self.search.step( - step, - lprobs.view(bsz, -1, self.vocab_size), - scores.view(bsz, beam_size, -1)[:, :, :step], - ) - cand_scores[partial_prefix_mask] = partial_scores[partial_prefix_mask] - cand_indices[partial_prefix_mask] = partial_indices[partial_prefix_mask] - cand_beams[partial_prefix_mask] = partial_beams[partial_prefix_mask] - partial_prefix_mask_buf = partial_prefix_mask + self.search.set_src_lengths(src_lengths) + + if self.no_repeat_ngram_size > 0: + def calculate_banned_tokens(bbsz_idx): + # before decoding the next token, prevent decoding of ngrams that have already appeared + ngram_index = tuple(tokens[bbsz_idx, step + 2 - self.no_repeat_ngram_size:step + 1].tolist()) + return gen_ngrams[bbsz_idx].get(ngram_index, []) + + if step + 2 - self.no_repeat_ngram_size >= 0: + # no banned tokens if we haven't generated no_repeat_ngram_size tokens yet + banned_tokens = [calculate_banned_tokens(bbsz_idx) for bbsz_idx in range(bsz * beam_size)] else: - cand_scores, cand_indices, cand_beams = self.search.step( + banned_tokens = [[] for bbsz_idx in range(bsz * beam_size)] + + for bbsz_idx in range(bsz * beam_size): + lprobs[bbsz_idx, banned_tokens[bbsz_idx]] = -math.inf + + if 
prefix_tokens is not None and step < prefix_tokens.size(1): + assert isinstance(self.search, search.BeamSearch) or bsz == 1, \ + "currently only BeamSearch supports decoding with prefix_tokens" + probs_slice = lprobs.view(bsz, -1, lprobs.size(-1))[:, 0, :] + cand_scores = torch.gather( + probs_slice, dim=1, + index=prefix_tokens[:, step].view(-1, 1) + ).view(-1, 1).repeat(1, cand_size) + if step > 0: + # save cumulative scores for each hypothesis + cand_scores.add_(scores[:, step - 1].view(bsz, beam_size).repeat(1, 2)) + cand_indices = prefix_tokens[:, step].view(-1, 1).repeat(1, cand_size) + cand_beams = torch.zeros_like(cand_indices) + + # handle prefixes of different lengths + # when step == prefix_tokens.size(1), we'll have new free-decoding batches + if prefix_tokens is not None and step <= prefix_tokens.size(1): + if step < prefix_tokens.size(1): + partial_prefix_mask = prefix_tokens[:, step].eq(self.pad) + else: # all prefixes finished force-decoding + partial_prefix_mask = torch.ones(bsz).to(prefix_tokens).byte() + if partial_prefix_mask.any(): + # track new free-decoding batches, at whose very first step + # only use the first beam to eliminate repeats + prefix_step0_mask = partial_prefix_mask ^ partial_prefix_mask_buf + lprobs.view(bsz, beam_size, -1)[prefix_step0_mask, 1:] = -math.inf + partial_scores, partial_indices, partial_beams = self.search.step( step, lprobs.view(bsz, -1, self.vocab_size), scores.view(bsz, beam_size, -1)[:, :, :step], ) + cand_scores[partial_prefix_mask] = partial_scores[partial_prefix_mask] + cand_indices[partial_prefix_mask] = partial_indices[partial_prefix_mask] + cand_beams[partial_prefix_mask] = partial_beams[partial_prefix_mask] + partial_prefix_mask_buf = partial_prefix_mask + else: - # make probs contain cumulative scores for each hypothesis - lprobs.add_(scores[:, step - 1].unsqueeze(-1)) - - # finalize all active hypotheses once we hit max_len - # pick the hypothesis with the highest prob of EOS right now - torch.sort( - lprobs[:, self.eos], - descending=True, - out=(eos_scores, eos_bbsz_idx), + cand_scores, cand_indices, cand_beams = self.search.step( + step, + lprobs.view(bsz, -1, self.vocab_size), + scores.view(bsz, beam_size, -1)[:, :, :step], ) - num_remaining_sent -= len(finalize_hypos(step, eos_bbsz_idx, eos_scores)) - assert num_remaining_sent == 0 - break # cand_bbsz_idx contains beam indices for the top candidate # hypotheses, with a range of values: [0, bsz*beam_size), # and dimensions: [bsz, cand_size] cand_bbsz_idx = cand_beams.add(bbsz_offsets) - # finalize hypotheses that end in eos + # finalize hypotheses that end in eos (except for blacklisted ones) eos_mask = cand_indices.eq(self.eos) + eos_mask[:, :beam_size][blacklist] = 0 + + # only consider eos when it's among the top beam_size indices + torch.masked_select( + cand_bbsz_idx[:, :beam_size], + mask=eos_mask[:, :beam_size], + out=eos_bbsz_idx, + ) finalized_sents = set() - if step >= self.min_len: - # only consider eos when it's among the top beam_size indices + if eos_bbsz_idx.numel() > 0: torch.masked_select( - cand_bbsz_idx[:, :beam_size], + cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size], - out=eos_bbsz_idx, + out=eos_scores, ) - if eos_bbsz_idx.numel() > 0: - torch.masked_select( - cand_scores[:, :beam_size], - mask=eos_mask[:, :beam_size], - out=eos_scores, - ) - finalized_sents = finalize_hypos(step, eos_bbsz_idx, eos_scores, cand_scores) - num_remaining_sent -= len(finalized_sents) + finalized_sents = finalize_hypos(step, eos_bbsz_idx, eos_scores) + 
num_remaining_sent -= len(finalized_sents) assert num_remaining_sent >= 0 if num_remaining_sent == 0: @@ -462,6 +435,7 @@ def calculate_banned_tokens(bbsz_idx): prefix_tokens = prefix_tokens[batch_idxs] partial_prefix_mask_buf = partial_prefix_mask_buf[batch_idxs] src_lengths = src_lengths[batch_idxs] + blacklist = blacklist[batch_idxs] scores = scores.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) scores_buf.resize_as_(scores) @@ -474,10 +448,12 @@ def calculate_banned_tokens(bbsz_idx): else: batch_idxs = None - # set active_mask so that values > cand_size indicate eos hypos - # and values < cand_size indicate candidate active hypos. - # After, the min values per row are the top candidate active hypos + # Set active_mask so that values > cand_size indicate eos or + # blacklisted hypos and values < cand_size indicate candidate + # active hypos. After this, the min values per row are the top + # candidate active hypos. active_mask = buffer('active_mask') + eos_mask[:, :beam_size] |= blacklist torch.add( eos_mask.type_as(cand_offsets) * cand_size, cand_offsets[:eos_mask.size(1)], @@ -486,12 +462,16 @@ def calculate_banned_tokens(bbsz_idx): # get the top beam_size active hypotheses, which are just the hypos # with the smallest values in active_mask - active_hypos, _ignore = buffer('active_hypos'), buffer('_ignore') + active_hypos, new_blacklist = buffer('active_hypos'), buffer('new_blacklist') torch.topk( active_mask, k=beam_size, dim=1, largest=False, - out=(_ignore, active_hypos) + out=(new_blacklist, active_hypos) ) + # update blacklist to ignore any finalized hypos + blacklist = new_blacklist.ge(cand_size)[:, :beam_size] + assert (~blacklist).any(dim=1).all() + active_bbsz_idx = buffer('active_bbsz_idx') torch.gather( cand_bbsz_idx, dim=1, index=active_hypos, diff --git a/fairseq/tasks/fairseq_task.py b/fairseq/tasks/fairseq_task.py index 89a1c866b8..3dea071629 100644 --- a/fairseq/tasks/fairseq_task.py +++ b/fairseq/tasks/fairseq_task.py @@ -193,7 +193,6 @@ def build_generator(self, args): max_len_a=getattr(args, 'max_len_a', 0), max_len_b=getattr(args, 'max_len_b', 200), min_len=getattr(args, 'min_len', 1), - stop_early=(not getattr(args, 'no_early_stop', False)), normalize_scores=(not getattr(args, 'unnormalized', False)), len_penalty=getattr(args, 'lenpen', 1), unk_penalty=getattr(args, 'unkpen', 0), diff --git a/tests/test_sequence_generator.py b/tests/test_sequence_generator.py index ce02400de4..1dace6e9c3 100644 --- a/tests/test_sequence_generator.py +++ b/tests/test_sequence_generator.py @@ -137,23 +137,6 @@ def test_maxlen(self): self.assertHypoTokens(hypos[1][1], [w2, w2, eos]) self.assertHypoScore(hypos[1][1], [0.3, 0.9, 0.01]) - def test_no_stop_early(self): - generator = SequenceGenerator(self.tgt_dict, stop_early=False, beam_size=2) - hypos = generator.generate([self.model], self.sample) - eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2 - # sentence 1, beam 1 - self.assertHypoTokens(hypos[0][0], [w1, eos]) - self.assertHypoScore(hypos[0][0], [0.9, 1.0]) - # sentence 1, beam 2 - self.assertHypoTokens(hypos[0][1], [w2, w1, w2, eos]) - self.assertHypoScore(hypos[0][1], [0.1, 0.9, 0.9, 1.0]) - # sentence 2, beam 1 - self.assertHypoTokens(hypos[1][0], [w2, w2, w2, w2, eos]) - self.assertHypoScore(hypos[1][0], [0.3, 0.9, 0.99, 0.4, 1.0]) - # sentence 2, beam 2 - self.assertHypoTokens(hypos[1][1], [w1, w2, w1, eos]) - self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.4, 1.0]) - class TestDiverseBeamSearch(TestSequenceGeneratorBase): From 
430905d7fae0bdfcc9d969934df35a354e71c4ec Mon Sep 17 00:00:00 2001 From: Iurii Zdebskyi Date: Thu, 1 Aug 2019 07:47:44 -0700 Subject: [PATCH 054/213] Changed tensor comparison return type from uint8 to bool (#21113) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/21113 ghimport-source-id: 9c4ba63457a72bfc41894387e0b01be3fd9a9baf Test Plan: Imported from OSS Differential Revision: D15552204 Pulled By: izdeby fbshipit-source-id: a608213668649d058e22b510d7755cb99e7d0037 --- fairseq/modules/mean_pool_gating_network.py | 2 +- fairseq/search.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fairseq/modules/mean_pool_gating_network.py b/fairseq/modules/mean_pool_gating_network.py index 7acc664488..a22d9bd6e4 100644 --- a/fairseq/modules/mean_pool_gating_network.py +++ b/fairseq/modules/mean_pool_gating_network.py @@ -39,7 +39,7 @@ def forward(self, encoder_out): if encoder_padding_mask is not None: encoder_out = encoder_out.clone() # required because of transpose above encoder_out[encoder_padding_mask] = 0 - ntokens = torch.sum(1 - encoder_padding_mask, dim=1, keepdim=True) + ntokens = torch.sum(~encoder_padding_mask, dim=1, keepdim=True) x = torch.sum(encoder_out, dim=1) / ntokens.type_as(encoder_out) else: x = torch.mean(encoder_out, dim=1) diff --git a/fairseq/search.py b/fairseq/search.py index 61d0df8d48..d20fbd3c08 100644 --- a/fairseq/search.py +++ b/fairseq/search.py @@ -212,7 +212,7 @@ def _sample_topp(self, lprobs): # trim the words that are not in top-P by setting their probabilities # to 0, so that they would not be sampled later. - trim_mask = 1 - truncated_mask + trim_mask = truncated_mask.bitwise_not() trimed_probs = truncated_probs.masked_fill_(trim_mask, 0) return trimed_probs, truncated_indices From 45f23f66bec3063cb20ffae8c6a9084e45669aba Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Thu, 1 Aug 2019 11:56:12 -0700 Subject: [PATCH 055/213] Add more details for bulk BPE encoding Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/793 Differential Revision: D16603930 Pulled By: myleott fbshipit-source-id: b302db3743db4f36c14fb0dc7f3456fe8a0079dd --- examples/roberta/multiprocessing_bpe_encoder.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/roberta/multiprocessing_bpe_encoder.py b/examples/roberta/multiprocessing_bpe_encoder.py index 48d9cb367e..f0240c210f 100644 --- a/examples/roberta/multiprocessing_bpe_encoder.py +++ b/examples/roberta/multiprocessing_bpe_encoder.py @@ -17,8 +17,11 @@ def main(): """ - Helper script to encode raw text - with the GPT-2 BPE using multiple processes. + Helper script to encode raw text with the GPT-2 BPE using multiple processes. + + The encoder.json and vocab.bpe files can be obtained here: + - https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json + - https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe """ parser = argparse.ArgumentParser() parser.add_argument( From ea6cc1da11db821e24ca461b570ae01784e19358 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Thu, 1 Aug 2019 15:20:53 -0700 Subject: [PATCH 056/213] Use ==/!= to compare str, bytes, and int literals (#948) Summary: Identity is not the same thing as equality in Python. 
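As a generic illustration of that point (not taken from the patch below): `is` compares object identity while `==` compares values, and CPython's caching of small integers is the only reason checks like `x is not 0` ever appear to work.

```python
# Identity vs. equality in CPython; the behavior noted in comments is typical, not guaranteed.
a = int("1001")
b = int("1001")
print(a == b)   # True: value equality, the comparison the patch switches to
print(a is b)   # typically False: two distinct objects that merely compare equal

# Small integers are cached, which is why identity checks against 0 can seem to work:
print(int("7") is int("7"))   # usually True, but an implementation detail
```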
Pull Request resolved: https://github.com/pytorch/fairseq/pull/948 Differential Revision: D16608269 Pulled By: myleott fbshipit-source-id: be203d62e7824c96c59400d1b342196adb89a839 --- fairseq/modules/sparse_multihead_attention.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fairseq/modules/sparse_multihead_attention.py b/fairseq/modules/sparse_multihead_attention.py index 7e83cc9529..61430195c2 100644 --- a/fairseq/modules/sparse_multihead_attention.py +++ b/fairseq/modules/sparse_multihead_attention.py @@ -35,7 +35,7 @@ def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias= # Used for Ai(2) calculations - beginning of [l-c, l] range def compute_checkpoint(self, word_index): - if word_index % self.stride == 0 and word_index is not 0: + if word_index % self.stride == 0 and word_index != 0: checkpoint_index = word_index - self.expressivity else: checkpoint_index = ( @@ -66,7 +66,7 @@ def compute_fixed_attention_subset(self, word_index, tgt_len): # Subset 1 - whole window rounded_index = math.floor((word_index + self.stride) / self.stride) * self.stride - if word_index % self.stride == 0 and word_index is not 0: + if word_index % self.stride == 0 and word_index != 0: subset_one = set(range(word_index-self.stride, min(absolute_max, word_index+1))) else: subset_one = set(range(max(0, rounded_index - self.stride), min( From ccb5dea58dc556549df00453a144d382ffb727b6 Mon Sep 17 00:00:00 2001 From: Nathan Ng Date: Thu, 1 Aug 2019 16:43:09 -0700 Subject: [PATCH 057/213] Fix wmt19 links (#796) Summary: fix links to .tar.gz vs .tar.bz2 Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/796 Reviewed By: myleott Differential Revision: D16611740 Pulled By: nng555 fbshipit-source-id: 76210484225ed917ff14ef626845680d918948f5 --- examples/wmt19/README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/wmt19/README.md b/examples/wmt19/README.md index a7c83172e4..fff13fa6ac 100644 --- a/examples/wmt19/README.md +++ b/examples/wmt19/README.md @@ -6,13 +6,13 @@ This page provides pointers to the models of Facebook-FAIR's WMT'19 news transla Description | Model ---|--- -En->De Ensemble | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.bz2) -De->En Ensemble | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.bz2) -En->Ru Ensemble | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.bz2) -Ru->En Ensemble | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.bz2) -En LM | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.en.tar.bz2) -De LM | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.de.tar.bz2) -Ru LM | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.bz2) +En->De Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz) +De->En Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz) +En->Ru Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz) +Ru->En Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz) +En LM | [download 
(.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.en.tar.gz) +De LM | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.de.tar.gz) +Ru LM | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.gz) ## Example usage (torch.hub) From 5f34252767261dca6056dc27249e241f06cc9700 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Fri, 2 Aug 2019 05:19:26 -0700 Subject: [PATCH 058/213] Update beam search code to support torch.bool change Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/797 Differential Revision: D16617067 Pulled By: myleott fbshipit-source-id: 52e3aeb98d6e3b55ff9154b784028bf13eabfe38 --- fairseq/search.py | 2 +- fairseq/sequence_generator.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fairseq/search.py b/fairseq/search.py index d20fbd3c08..afcc388d5d 100644 --- a/fairseq/search.py +++ b/fairseq/search.py @@ -212,7 +212,7 @@ def _sample_topp(self, lprobs): # trim the words that are not in top-P by setting their probabilities # to 0, so that they would not be sampled later. - trim_mask = truncated_mask.bitwise_not() + trim_mask = (~truncated_mask) trimed_probs = truncated_probs.masked_fill_(trim_mask, 0) return trimed_probs, truncated_indices diff --git a/fairseq/sequence_generator.py b/fairseq/sequence_generator.py index a6504dab6f..6a122271bd 100644 --- a/fairseq/sequence_generator.py +++ b/fairseq/sequence_generator.py @@ -163,7 +163,7 @@ def generate( # For example, suppose we're sampling and have already finalized 2/5 # samples. Then the blacklist would mark 2 positions as being ignored, # so that we only finalize the remaining 3 samples. - blacklist = src_tokens.new(bsz, beam_size).byte().fill_(0) + blacklist = src_tokens.new_zeros(bsz, beam_size).eq(-1) # forward and backward-compatible False mask # list of completed sentences finalized = [[] for i in range(bsz)] From abb7ed4c91b55b1b714021d8163f0a8c73f82f46 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Fri, 2 Aug 2019 06:20:46 -0700 Subject: [PATCH 059/213] Update READMEs for torch.hub Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/795 Differential Revision: D16620488 Pulled By: myleott fbshipit-source-id: 1998a9ccd8816fc7f590861fb4898f910a36bc1e --- examples/backtranslation/README.md | 39 +-- examples/language_model/README.md | 98 +++--- .../README.finetune_custom_classification.md | 105 +++---- examples/roberta/README.finetune_glue.md | 66 ++++ examples/roberta/README.md | 188 ++++------- examples/scaling_nmt/README.md | 48 +-- examples/stories/README.md | 38 +-- examples/translation/README.md | 294 +++++++++--------- examples/translation_moe/README.md | 66 ++-- examples/wmt19/README.md | 108 +++---- fairseq/data/encoders/moses_tokenizer.py | 10 +- fairseq/hub_utils.py | 23 +- hubconf.py | 1 + setup.py | 2 + 14 files changed, 530 insertions(+), 556 deletions(-) create mode 100644 examples/roberta/README.finetune_glue.md diff --git a/examples/backtranslation/README.md b/examples/backtranslation/README.md index cb010855cb..a834214adf 100644 --- a/examples/backtranslation/README.md +++ b/examples/backtranslation/README.md @@ -4,29 +4,32 @@ This page includes pre-trained models from the paper [Understanding Back-Transla ## Pre-trained models -Description | Dataset | Model | Test set(s) +Model | Description | Dataset | Download ---|---|---|--- -Transformer
([Edunov et al., 2018](https://arxiv.org/abs/1808.09381); WMT'18 winner) | [WMT'18 English-German](http://www.statmt.org/wmt18/translation-task.html) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt18.en-de.ensemble.tar.gz) | See NOTE in the archive +`transformer.wmt18.en-de` | Transformer
([Edunov et al., 2018](https://arxiv.org/abs/1808.09381))
WMT'18 winner | [WMT'18 English-German](http://www.statmt.org/wmt18/translation-task.html) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt18.en-de.ensemble.tar.gz)
See NOTE in the archive ## Example usage Interactive generation from the full ensemble via PyTorch Hub: -``` ->>> import torch ->>> torch.hub.list('pytorch/fairseq') -[..., 'transformer.wmt14.en-fr', 'transformer.wmt16.en-de', 'transformer.wmt18.en-de', ... ] ->>> en2de_ensemble = torch.hub.load( -... 'pytorch/fairseq', -... 'transformer.wmt18.en-de', -... checkpoint_file='wmt18.model1.pt:wmt18.model2.pt:wmt18.model3.pt:wmt18.model4.pt:wmt18.model5.pt', -... data_name_or_path='.', -... tokenizer='moses', -... bpe='subword_nmt', -... ) ->>> len(en2de_ensemble.models) -5 ->>> print(en2de_ensemble.generate('Hello world!')) -Hallo Welt! +```python +import torch + +# List available models +torch.hub.list('pytorch/fairseq') # [..., 'transformer.wmt18.en-de', ... ] + +# Load the WMT'18 En-De ensemble +en2de_ensemble = torch.hub.load( + 'pytorch/fairseq', 'transformer.wmt18.en-de', + checkpoint_file='wmt18.model1.pt:wmt18.model2.pt:wmt18.model3.pt:wmt18.model4.pt:wmt18.model5.pt', + tokenizer='moses', bpe='subword_nmt') + +# The ensemble contains 5 models +len(en2de_ensemble.models) +# 5 + +# Translate +en2de_ensemble.translate('Hello world!') +# 'Hallo Welt!' ``` ## Citation diff --git a/examples/language_model/README.md b/examples/language_model/README.md index 4b041146e3..180714de49 100644 --- a/examples/language_model/README.md +++ b/examples/language_model/README.md @@ -2,36 +2,30 @@ ## Pre-trained models -Description | Parameters | Dataset | Model and Test set(s) ----|---:|---|--- -Adaptive Inputs
([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) | 1026M | [Google Billion Words](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2) -Adaptive Inputs
([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) | 247M | [WikiText-103](https://einstein.ai/research/the-wikitext-long-term-dependency-language-modeling-dataset) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.tar.bz2) - +Model | Description | Dataset | Download +---|---|---|--- +`transformer_lm.gbw.adaptive_huge` | Adaptive Inputs
([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853))
1026M params | [Google Billion Words](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2) +`transformer_lm.wiki103.adaptive` | Adaptive Inputs
([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853))
247M params | [WikiText-103](https://einstein.ai/research/the-wikitext-long-term-dependency-language-modeling-dataset) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.tar.bz2) +`transformer_lm.wmt19.en` | English LM
([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.en.tar.gz) +`transformer_lm.wmt19.de` | German LM
([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.de.tar.gz) +`transformer_lm.wmt19.ru` | Russian LM
([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.gz) ## Example usage -Interactive generation via PyTorch Hub: -``` ->>> import torch ->>> torch.hub.list('pytorch/fairseq') -[..., 'transformer_lm.gbw.adaptive_huge', 'transformer_lm.wiki103.adaptive', ...] ->>> lm = torch.hub.load( -... 'pytorch/fairseq', -... 'transformer_lm.wiki103.adaptive', -... data_name_or_path='./data-bin', -... tokenizer='moses', -... no_escape=True, -... beam=1, -... sampling=True, -... sampling_topk=10, -... temperature=0.8, -... ) ->>> lm.generate('Barack Obama', verbose=True) -``` +Sampling from a language model using PyTorch Hub: +```python +import torch -Available models are listed in the ``hub_models()`` method in each model file, for example: -[transformer_lm.py](https://github.com/pytorch/fairseq/blob/master/fairseq/models/transformer_lm.py). +# List available models +torch.hub.list('pytorch/fairseq') # [..., 'transformer_lm.wmt19.en', ...] +# Load an English LM trained on WMT'19 News Crawl data +en_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt19.en', tokenizer='moses', bpe='fastbpe') + +# Sample from the language model +en_lm.sample('Barack Obama', beam=1, sampling=True, sampling_topk=10, temperature=0.8) +# "Barack Obama is coming to Sydney and New Zealand (...)" +``` ## Training a new model with the CLI tools @@ -44,47 +38,47 @@ Provides an example of pre-processing for [WikiText-103 language modeling task]( Example usage: Prepare data: -``` -$ cd examples/language_model/ -$ bash prepare-wikitext-103.sh -$ cd ../.. +```bash +cd examples/language_model/ +bash prepare-wikitext-103.sh +cd ../.. # Binarize the dataset: -$ TEXT=examples/language_model/wikitext-103 +TEXT=examples/language_model/wikitext-103 -$ fairseq-preprocess --only-source \ - --trainpref $TEXT/wiki.train.tokens --validpref $TEXT/wiki.valid.tokens --testpref $TEXT/wiki.test.tokens \ - --destdir data-bin/wikitext-103 +fairseq-preprocess --only-source \ + --trainpref $TEXT/wiki.train.tokens --validpref $TEXT/wiki.valid.tokens --testpref $TEXT/wiki.test.tokens \ + --destdir data-bin/wikitext-103 ``` Train a transformer language model with adaptive inputs ([Baevski and Auli (2018): Adaptive Input Representations for Neural Language Modeling](transformer_lm/README.md)): -``` +```bash # If it runs out of memory, try to reduce max-tokens and tokens-per-sample -$ mkdir -p checkpoints/transformer_wikitext-103 -$ fairseq-train --task language_modeling data-bin/wikitext-103 \ - --save-dir checkpoints/transformer_wikitext-103 --arch transformer_lm_wiki103 \ - --max-update 286000 --max-lr 1.0 --t-mult 2 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 \ - --warmup-updates 16000 --warmup-init-lr 1e-07 --min-lr 1e-09 --optimizer nag --lr 0.0001 --clip-norm 0.1 \ - --criterion adaptive_loss --max-tokens 3072 --update-freq 3 --tokens-per-sample 3072 --seed 1 \ - --sample-break-mode none --skip-invalid-size-inputs-valid-test --ddp-backend=no_c10d +mkdir -p checkpoints/transformer_wikitext-103 +fairseq-train --task language_modeling data-bin/wikitext-103 \ + --save-dir checkpoints/transformer_wikitext-103 --arch transformer_lm_wiki103 \ + --max-update 286000 --max-lr 1.0 --t-mult 2 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 \ + --warmup-updates 16000 --warmup-init-lr 1e-07 --min-lr 1e-09 --optimizer nag --lr 0.0001 --clip-norm 0.1 \ + --criterion adaptive_loss 
--max-tokens 3072 --update-freq 3 --tokens-per-sample 3072 --seed 1 \ + --sample-break-mode none --skip-invalid-size-inputs-valid-test --ddp-backend=no_c10d # Evaluate: -$ fairseq-eval-lm data-bin/wikitext-103 --path 'checkpoints/transformer_wiki103/checkpoint_best.pt' \ - --sample-break-mode complete --max-tokens 3072 --context-window 2560 --softmax-batch 1024 +fairseq-eval-lm data-bin/wikitext-103 --path 'checkpoints/transformer_wiki103/checkpoint_best.pt' \ + --sample-break-mode complete --max-tokens 3072 --context-window 2560 --softmax-batch 1024 ``` Train a convolutional language model ([Dauphin et al. (2017): Language Modeling with Gated Convolutional Networks](conv_lm/README.md)): ``` # If it runs out of memory, try to reduce max-tokens and tokens-per-sample -$ mkdir -p checkpoints/fconv_wikitext-103 -$ fairseq-train --task language_modeling data-bin/wikitext-103 \ - --save-dir checkpoints/fconv_wikitext-103 \ - --max-epoch 35 --arch fconv_lm_dauphin_wikitext103 --optimizer nag \ - --lr 1.0 --lr-scheduler reduce_lr_on_plateau --lr-shrink 0.5 \ - --clip-norm 0.1 --dropout 0.2 --weight-decay 5e-06 --criterion adaptive_loss \ - --adaptive-softmax-cutoff 10000,20000,200000 --max-tokens 1024 --tokens-per-sample 1024 \ - --ddp-backend=no_c10d +mkdir -p checkpoints/fconv_wikitext-103 +fairseq-train --task language_modeling data-bin/wikitext-103 \ + --save-dir checkpoints/fconv_wikitext-103 \ + --max-epoch 35 --arch fconv_lm_dauphin_wikitext103 --optimizer nag \ + --lr 1.0 --lr-scheduler reduce_lr_on_plateau --lr-shrink 0.5 \ + --clip-norm 0.1 --dropout 0.2 --weight-decay 5e-06 --criterion adaptive_loss \ + --adaptive-softmax-cutoff 10000,20000,200000 --max-tokens 1024 --tokens-per-sample 1024 \ + --ddp-backend=no_c10d # Evaluate: -$ fairseq-eval-lm data-bin/wikitext-103 --path 'checkpoints/fconv_wiki103/checkpoint_best.pt' +fairseq-eval-lm data-bin/wikitext-103 --path 'checkpoints/fconv_wiki103/checkpoint_best.pt' ``` diff --git a/examples/roberta/README.finetune_custom_classification.md b/examples/roberta/README.finetune_custom_classification.md index de3a4cc37a..cd49348f56 100644 --- a/examples/roberta/README.finetune_custom_classification.md +++ b/examples/roberta/README.finetune_custom_classification.md @@ -1,14 +1,16 @@ -# RoBERTa fine-tuning on custom classification task (example IMDB) +# Finetuning RoBERTa on a custom classification task -## 1) Get the data -``` +This example shows how to finetune RoBERTa on the IMDB dataset, but should illustrate the process for most classification tasks. + +### 1) Get the data +```bash wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz tar zxvf aclImdb_v1.tar.gz ``` -## 2) Format data +### 2) Format data `IMDB` data has one data-sample in each file, below python code-snippet converts it one file for train and valid each for ease of processing. -``` +```python import argparse import os import random @@ -42,79 +44,78 @@ if __name__ == '__main__': main(args) ``` -## 3) BPE Encode +### 3) BPE Encode Run `multiprocessing_bpe_encoder`, you can also do this in previous step for each sample but that might be slower. 
-``` +```bash # Download encoder.json and vocab.bpe wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json' wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe' -for SPLIT in train dev; -do - python -m examples.roberta.multiprocessing_bpe_encoder \ - --encoder-json encoder.json \ - --vocab-bpe vocab.bpe \ - --inputs "aclImdb/$SPLIT.input0" \ - --outputs "aclImdb/$SPLIT.input0.bpe" \ - --workers 60 \ - --keep-empty; +for SPLIT in train dev; do + python -m examples.roberta.multiprocessing_bpe_encoder \ + --encoder-json encoder.json \ + --vocab-bpe vocab.bpe \ + --inputs "aclImdb/$SPLIT.input0" \ + --outputs "aclImdb/$SPLIT.input0.bpe" \ + --workers 60 \ + --keep-empty done ``` +### 4) Preprocess data -## 4) Preprocess data - -``` +```bash # Download fairseq dictionary. wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt' fairseq-preprocess \ - --only-source \ - --trainpref "aclImdb/train.input0.bpe" \ - --validpref "aclImdb/dev.input0.bpe" \ - --destdir "IMDB-bin/input0" \ - --workers 60 \ - --srcdict dict.txt; + --only-source \ + --trainpref "aclImdb/train.input0.bpe" \ + --validpref "aclImdb/dev.input0.bpe" \ + --destdir "IMDB-bin/input0" \ + --workers 60 \ + --srcdict dict.txt fairseq-preprocess \ - --only-source \ - --trainpref "aclImdb/train.label" \ - --validpref "aclImdb/dev.label" \ - --destdir "IMDB-bin/label" \ - --workers 60; + --only-source \ + --trainpref "aclImdb/train.label" \ + --validpref "aclImdb/dev.label" \ + --destdir "IMDB-bin/label" \ + --workers 60 ``` -## 5) Run Training +### 5) Run Training -``` +```bash TOTAL_NUM_UPDATES=7812 # 10 epochs through IMDB for bsz 32 WARMUP_UPDATES=469 # 6 percent of the number of updates LR=1e-05 # Peak LR for polynomial LR scheduler. NUM_CLASSES=2 MAX_SENTENCES=8 # Batch size. 
+ROBERTA_PATH=/path/to/roberta/model.pt CUDA_VISIBLE_DEVICES=0 python train.py IMDB-bin/ \ ---restore-file \ ---max-positions 512 \ ---max-sentences $MAX_SENTENCES \ ---max-tokens 4400 \ ---task sentence_prediction \ ---reset-optimizer --reset-dataloader --reset-meters \ ---required-batch-size-multiple 1 \ ---init-token 0 --separator-token 2 \ ---arch roberta_large \ ---criterion sentence_prediction \ ---num-classes $NUM_CLASSES \ ---dropout 0.1 --attention-dropout 0.1 \ ---weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ ---clip-norm 0.0 \ ---lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \ ---fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ ---max-epoch 10 \ ---best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ ---truncate-sequence \ ---update-freq 4; + --restore-file $ROBERTA_PATH \ + --max-positions 512 \ + --max-sentences $MAX_SENTENCES \ + --max-tokens 4400 \ + --task sentence_prediction \ + --reset-optimizer --reset-dataloader --reset-meters \ + --required-batch-size-multiple 1 \ + --init-token 0 --separator-token 2 \ + --arch roberta_large \ + --criterion sentence_prediction \ + --num-classes $NUM_CLASSES \ + --dropout 0.1 --attention-dropout 0.1 \ + --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ + --clip-norm 0.0 \ + --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \ + --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ + --max-epoch 10 \ + --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ + --truncate-sequence \ + --update-freq 4 ``` Above will train with effective batch-size of `32`, tested on one `Nvidia V100 32gb`. Expected `best-validation-accuracy` after `10` epochs is `~96.5%`. diff --git a/examples/roberta/README.finetune_glue.md b/examples/roberta/README.finetune_glue.md new file mode 100644 index 0000000000..c905cab7c0 --- /dev/null +++ b/examples/roberta/README.finetune_glue.md @@ -0,0 +1,66 @@ +# Finetuning RoBERTa on GLUE tasks + +### 1) Download the data from GLUE website (https://gluebenchmark.com/tasks) using following commands: +```bash +wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py +python download_glue_data.py --data_dir glue_data --tasks all +``` + +### 2) Preprocess GLUE task data: +```bash +./examples/roberta/preprocess_GLUE_tasks.sh glue_data +``` +`glue_task_name` is one of the following: +`{ALL, QQP, MNLI, QNLI, MRPC, RTE, STS-B, SST-2, CoLA}` +Use `ALL` for preprocessing all the glue tasks. + +### 3) Fine-tuning on GLUE task: +Example fine-tuning cmd for `RTE` task +```bash +TOTAL_NUM_UPDATES=2036 # 10 epochs through RTE for bsz 16 +WARMUP_UPDATES=122 # 6 percent of the number of updates +LR=2e-05 # Peak LR for polynomial LR scheduler. +NUM_CLASSES=2 +MAX_SENTENCES=16 # Batch size. 
+ROBERTA_PATH=/path/to/roberta/model.pt + +CUDA_VISIBLE_DEVICES=0 python train.py RTE-bin/ \ + --restore-file $ROBERTA_PATH \ + --max-positions 512 \ + --max-sentences $MAX_SENTENCES \ + --max-tokens 4400 \ + --task sentence_prediction \ + --reset-optimizer --reset-dataloader --reset-meters \ + --required-batch-size-multiple 1 \ + --init-token 0 --separator-token 2 \ + --arch roberta_large \ + --criterion sentence_prediction \ + --num-classes $NUM_CLASSES \ + --dropout 0.1 --attention-dropout 0.1 \ + --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ + --clip-norm 0.0 \ + --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \ + --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ + --max-epoch 10 \ + --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric; +``` + +For each of the GLUE task, you will need to use following cmd-line arguments: + +Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B +---|---|---|---|---|---|---|---|--- +`--num-classes` | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 1 +`--lr` | 1e-5 | 1e-5 | 1e-5 | 2e-5 | 1e-5 | 1e-5 | 1e-5 | 2e-5 +`--max-sentences` | 32 | 32 | 32 | 16 | 32 | 16 | 16 | 16 +`--total-num-update` | 123873 | 33112 | 113272 | 2036 | 20935 | 2296 | 5336 | 3598 +`--warmup-updates` | 7432 | 1986 | 28318 | 122 | 1256 | 137 | 320 | 214 + +For `STS-B` additionally add `--regression-target --best-checkpoint-metric loss` and remove `--maximize-best-checkpoint-metric`. + +**Note:** + +a) `--total-num-updates` is used by `--polynomial_decay` scheduler and is calculated for `--max-epoch=10` and `--max-sentences=16/32` depending on the task. + +b) Above cmd-args and hyperparams are tested on one Nvidia `V100` GPU with `32gb` of memory for each task. Depending on the GPU memory resources available to you, you can use increase `--update-freq` and reduce `--max-sentences`. + +c) All the settings in above table are suggested settings based on our hyperparam search within a fixed search space (for careful comparison across models). You might be able to find better metrics with wider hyperparam search. 
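As with the custom classification example earlier, a finetuned GLUE checkpoint can be loaded back through the hub interface; for sentence-pair tasks such as RTE the two sentences are simply passed together to `encode`. The snippet below is only a sketch: the checkpoint location and the `sentence_classification_head` name are assumptions about the default settings, and the predicted index maps through the label dictionary produced by `preprocess_GLUE_tasks.sh`, not directly to the GLUE label strings.

```python
# Illustrative sketch for a sentence-pair task (RTE); paths and head name are assumed.
from fairseq.models.roberta import RobertaModel

roberta = RobertaModel.from_pretrained(
    'checkpoints',                         # assumed --save-dir (fairseq default)
    checkpoint_file='checkpoint_best.pt',
    data_name_or_path='RTE-bin'
)
roberta.eval()

tokens = roberta.encode(
    'A cat is sleeping on the mat.',       # premise
    'There is a cat on the mat.'           # hypothesis
)
pred = roberta.predict('sentence_classification_head', tokens).argmax().item()
print(pred)  # index into the label dictionary under RTE-bin/label
```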
diff --git a/examples/roberta/README.md b/examples/roberta/README.md index 989c9d750e..e975789f01 100644 --- a/examples/roberta/README.md +++ b/examples/roberta/README.md @@ -39,85 +39,83 @@ Model | Accuracy | Middle | High ## Example usage ##### Load RoBERTa from torch.hub (PyTorch >= 1.1): -``` ->>> import torch ->>> roberta = torch.hub.load('pytorch/fairseq', 'roberta.large') ->>> roberta.eval() # disable dropout (or leave in train mode to finetune) +```python +import torch +roberta = torch.hub.load('pytorch/fairseq', 'roberta.large') +roberta.eval() # disable dropout (or leave in train mode to finetune) ``` ##### Load RoBERTa (for PyTorch 1.0): -``` -$ wget https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz -$ tar -xzvf roberta.large.tar.gz +```python +# Download roberta.large model +wget https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz +tar -xzvf roberta.large.tar.gz ->>> from fairseq.models.roberta import RobertaModel ->>> roberta = RobertaModel.from_pretrained('/path/to/roberta.large') ->>> roberta.eval() # disable dropout (or leave in train mode to finetune) +# Load the model in fairseq +from fairseq.models.roberta import RobertaModel +roberta = RobertaModel.from_pretrained('/path/to/roberta.large') +roberta.eval() # disable dropout (or leave in train mode to finetune) ``` ##### Apply Byte-Pair Encoding (BPE) to input text: -``` ->>> tokens = roberta.encode('Hello world!') ->>> tokens -tensor([ 0, 31414, 232, 328, 2]) ->>> roberta.decode(tokens) -'Hello world!' +```python +tokens = roberta.encode('Hello world!') +assert tokens.tolist() == [0, 31414, 232, 328, 2] +roberta.decode(tokens) # 'Hello world!' ``` ##### Extract features from RoBERTa: -``` ->>> last_layer_features = roberta.extract_features(tokens) ->>> last_layer_features.size() -torch.Size([1, 5, 1024]) +```python +# Extract the last layer's features +last_layer_features = roberta.extract_features(tokens) +assert last_layer_features.size() == torch.Size([1, 5, 1024]) ->>> all_layers = roberta.extract_features(tokens, return_all_hiddens=True) ->>> len(all_layers) -25 - ->>> torch.all(all_layers[-1] == last_layer_features) -tensor(1, dtype=torch.uint8) +# Extract all layer's features (layer 0 is the embedding layer) +all_layers = roberta.extract_features(tokens, return_all_hiddens=True) +assert len(all_layers) == 25 +assert torch.all(all_layers[-1] == last_layer_features) ``` ##### Use RoBERTa for sentence-pair classification tasks: -``` ->>> roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli') # already finetuned ->>> roberta.eval() # disable dropout for evaluation - ->>> tokens = roberta.encode( -... 'Roberta is a heavily optimized version of BERT.', -... 'Roberta is not very optimized.' -... ) - ->>> roberta.predict('mnli', tokens).argmax() -tensor(0) # contradiction +```python +# Download RoBERTa already finetuned for MNLI +roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli') +roberta.eval() # disable dropout for evaluation ->>> tokens = roberta.encode( -... 'Roberta is a heavily optimized version of BERT.', -... 'Roberta is based on BERT.' -... 
) +# Encode a pair of sentences and make a prediction +tokens = roberta.encode('Roberta is a heavily optimized version of BERT.', 'Roberta is not very optimized.') +roberta.predict('mnli', tokens).argmax() # 0: contradiction ->>> roberta.predict('mnli', tokens).argmax() -tensor(2) # entailment +# Encode another pair of sentences +tokens = roberta.encode('Roberta is a heavily optimized version of BERT.', 'Roberta is based on BERT.') +roberta.predict('mnli', tokens).argmax() # 2: entailment ``` ##### Register a new (randomly initialized) classification head: +```python +roberta.register_classification_head('new_task', num_classes=3) +logprobs = roberta.predict('new_task', tokens) # tensor([[-1.1050, -1.0672, -1.1245]], grad_fn=) ``` ->>> roberta.register_classification_head('new_task', num_classes=3) ->>> roberta.predict('new_task', tokens) -tensor([[-1.1050, -1.0672, -1.1245]], grad_fn=) + +##### Batched prediction: +```python +from fairseq.data.data_utils import collate_tokens +sentences = ['Hello world.', 'Another unrelated sentence.'] +batch = collate_tokens([roberta.encode(sent) for sent in sentences], pad_idx=1) +logprobs = roberta.predict('new_task', batch) +assert logprobs.size() == torch.Size([2, 3]) ``` ##### Using the GPU: -``` ->>> roberta.cuda() ->>> roberta.predict('new_task', tokens) -tensor([[-1.1050, -1.0672, -1.1245]], device='cuda:0', grad_fn=) +```python +roberta.cuda() +roberta.predict('new_task', tokens) # tensor([[-1.1050, -1.0672, -1.1245]], device='cuda:0', grad_fn=) ``` ##### Evaluating the `roberta.large.mnli` model Example python code snippet to evaluate accuracy on the MNLI dev_matched set. -``` +```python label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'} ncorrect, nsamples = 0, 0 roberta.cuda() @@ -137,79 +135,11 @@ print('| Accuracy: ', float(ncorrect)/float(nsamples)) ``` -## Finetuning on GLUE tasks - -##### 1) Download the data from GLUE website (https://gluebenchmark.com/tasks) using following commands: -``` -$ wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py -$ python download_glue_data.py --data_dir glue_data --tasks all -``` - -##### 2) Preprocess GLUE task data: -``` -$ ./examples/roberta/preprocess_GLUE_tasks.sh glue_data -``` -`glue_task_name` is one of the following: -`{ALL, QQP, MNLI, QNLI, MRPC, RTE, STS-B, SST-2, CoLA}` -Use `ALL` for preprocessing all the glue tasks. - -##### 3) Fine-tuning on GLUE task : -Example fine-tuning cmd for `RTE` task -``` -TOTAL_NUM_UPDATES=2036 # 10 epochs through RTE for bsz 16 -WARMUP_UPDATES=122 # 6 percent of the number of updates -LR=2e-05 # Peak LR for polynomial LR scheduler. -NUM_CLASSES=2 -MAX_SENTENCES=16 # Batch size. 
- -CUDA_VISIBLE_DEVICES=0 python train.py RTE-bin/ \ ---restore-file \ ---max-positions 512 \ ---max-sentences $MAX_SENTENCES \ ---max-tokens 4400 \ ---task sentence_prediction \ ---reset-optimizer --reset-dataloader --reset-meters \ ---required-batch-size-multiple 1 \ ---init-token 0 --separator-token 2 \ ---arch roberta_large \ ---criterion sentence_prediction \ ---num-classes $NUM_CLASSES \ ---dropout 0.1 --attention-dropout 0.1 \ ---weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ ---clip-norm 0.0 \ ---lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \ ---fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ ---max-epoch 10 \ ---best-checkpoint-metric accuracy --maximize-best-checkpoint-metric; -``` - -For each of the GLUE task, you will need to use following cmd-line arguments: - -Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B ----|---|---|---|---|---|---|---|--- -`--num-classes` | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 1 -`--lr` | 1e-5 | 1e-5 | 1e-5 | 2e-5 | 1e-5 | 1e-5 | 1e-5 | 2e-5 -`--max-sentences` | 32 | 32 | 32 | 16 | 32 | 16 | 16 | 16 -`--total-num-update` | 123873 | 33112 | 113272 | 2036 | 20935 | 2296 | 5336 | 3598 -`--warmup-updates` | 7432 | 1986 | 28318 | 122 | 1256 | 137 | 320 | 214 - -For `STS-B` additionally use following cmd-line argument: -``` ---regression-target ---best-checkpoint-metric loss -``` -and remove `--maximize-best-checkpoint-metric`. - -**Note:** - -a) `--total-num-updates` is used by `--polynomial_decay` scheduler and is calculated for `--max-epoch=10` and `--max-sentences=16/32` depending on the task. - -b) Above cmd-args and hyperparams are tested on one Nvidia `V100` GPU with `32gb` of memory for each task. Depending on the GPU memory resources available to you, you can use increase `--update-freq` and reduce `--max-sentences`. - -c) All the settings in above table are suggested settings based on our hyperparam search within a fixed search space (for careful comparison across models). You might be able to find better metrics with wider hyperparam search. +## Finetuning -## Fine-tuning on custom classification tasks -[Example of fine-tuning Roberta on simple custom classification task](README.finetune_custom_classification.md) +- [Finetuning on GLUE](README.finetune_glue.md) +- [Finetuning on custom classification tasks (e.g., IMDB)](README.finetune_custom_classification.md) +- Finetuning on SQuAD: coming soon ## Pretraining using your own data @@ -223,11 +153,11 @@ A more detailed tutorial is coming soon. 
```bibtex @article{liu2019roberta, - title = {RoBERTa: A Robustly Optimized BERT Pretraining Approach}, - author = {Yinhan Liu and Myle Ott and Naman Goyal and Jingfei Du and - Mandar Joshi and Danqi Chen and Omer Levy and Mike Lewis and - Luke Zettlemoyer and Veselin Stoyanov}, - journal={arXiv preprint arXiv:1907.11692}, - year = {2019}, + title = {RoBERTa: A Robustly Optimized BERT Pretraining Approach}, + author = {Yinhan Liu and Myle Ott and Naman Goyal and Jingfei Du and + Mandar Joshi and Danqi Chen and Omer Levy and Mike Lewis and + Luke Zettlemoyer and Veselin Stoyanov}, + journal={arXiv preprint arXiv:1907.11692}, + year = {2019}, } ``` diff --git a/examples/scaling_nmt/README.md b/examples/scaling_nmt/README.md index d31aa3ae9e..d814436a46 100644 --- a/examples/scaling_nmt/README.md +++ b/examples/scaling_nmt/README.md @@ -4,10 +4,10 @@ This page includes instructions for reproducing results from the paper [Scaling ## Pre-trained models -Description | Dataset | Model | Test set(s) +Model | Description | Dataset | Download ---|---|---|--- -Transformer
([Ott et al., 2018](https://arxiv.org/abs/1806.00187)) | [WMT14 English-French](http://statmt.org/wmt14/translation-task.html#Download) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-fr.joined-dict.transformer.tar.bz2) | newstest2014 (shared vocab):
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.en-fr.joined-dict.newstest2014.tar.bz2) -Transformer
([Ott et al., 2018](https://arxiv.org/abs/1806.00187)) | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt16.en-de.joined-dict.transformer.tar.bz2) | newstest2014 (shared vocab):
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2) +`transformer.wmt14.en-fr` | Transformer
([Ott et al., 2018](https://arxiv.org/abs/1806.00187)) | [WMT14 English-French](http://statmt.org/wmt14/translation-task.html#Download) | model:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-fr.joined-dict.transformer.tar.bz2)
newstest2014:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.en-fr.joined-dict.newstest2014.tar.bz2) +`transformer.wmt16.en-de` | Transformer
([Ott et al., 2018](https://arxiv.org/abs/1806.00187)) | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | model:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt16.en-de.joined-dict.transformer.tar.bz2)
newstest2014:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2) ## Training a new model on WMT'16 En-De @@ -15,33 +15,33 @@ Please first download the [preprocessed WMT'16 En-De data provided by Google](ht Then: 1. Extract the WMT'16 En-De data: -``` -$ TEXT=wmt16_en_de_bpe32k -$ mkdir $TEXT -$ tar -xzvf wmt16_en_de.tar.gz -C $TEXT +```bash +TEXT=wmt16_en_de_bpe32k +mkdir $TEXT +tar -xzvf wmt16_en_de.tar.gz -C $TEXT ``` 2. Preprocess the dataset with a joined dictionary: -``` -$ fairseq-preprocess --source-lang en --target-lang de \ - --trainpref $TEXT/train.tok.clean.bpe.32000 \ - --validpref $TEXT/newstest2013.tok.bpe.32000 \ - --testpref $TEXT/newstest2014.tok.bpe.32000 \ - --destdir data-bin/wmt16_en_de_bpe32k \ - --nwordssrc 32768 --nwordstgt 32768 \ - --joined-dictionary +```bash +fairseq-preprocess --source-lang en --target-lang de \ + --trainpref $TEXT/train.tok.clean.bpe.32000 \ + --validpref $TEXT/newstest2013.tok.bpe.32000 \ + --testpref $TEXT/newstest2014.tok.bpe.32000 \ + --destdir data-bin/wmt16_en_de_bpe32k \ + --nwordssrc 32768 --nwordstgt 32768 \ + --joined-dictionary ``` 3. Train a model: -``` -$ fairseq-train data-bin/wmt16_en_de_bpe32k \ - --arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \ - --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ - --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 \ - --lr 0.0005 --min-lr 1e-09 \ - --dropout 0.3 --weight-decay 0.0 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ - --max-tokens 3584 \ - --fp16 +```bash +fairseq-train data-bin/wmt16_en_de_bpe32k \ + --arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \ + --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ + --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 \ + --lr 0.0005 --min-lr 1e-09 \ + --dropout 0.3 --weight-decay 0.0 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ + --max-tokens 3584 \ + --fp16 ``` Note that the `--fp16` flag requires you have CUDA 9.1 or greater and a Volta GPU. diff --git a/examples/stories/README.md b/examples/stories/README.md index 29653054f8..625439e81a 100644 --- a/examples/stories/README.md +++ b/examples/stories/README.md @@ -14,7 +14,7 @@ We provide sample stories generated by the [convolutional seq2seq model](https:/ The dataset can be downloaded like this: -``` +```bash cd examples/stories curl https://dl.fbaipublicfiles.com/fairseq/data/writingPrompts.tar.gz | tar xvzf - ``` @@ -23,28 +23,28 @@ and contains a train, test, and valid split. The dataset is described here: http ## Example usage +First we will preprocess the dataset. Note that the dataset release is the full data, but the paper models the first 1000 words of each story. 
Here is example code that trims the dataset to the first 1000 words of each story: +```python +data = ["train", "test", "valid"] +for name in data: + with open(name + ".wp_target") as f: + stories = f.readlines() + stories = [" ".join(i.split()[0:1000]) for i in stories] + with open(name + ".wp_target", "w") as o: + for line in stories: + o.write(line.strip() + "\n") ``` -# Preprocess the dataset: -# Note that the dataset release is the full data, but the paper models the first 1000 words of each story -# Here is some example code that can trim the dataset to the first 1000 words of each story -$ python -$ data = ["train", "test", "valid"] -$ for name in data: -$ with open(name + ".wp_target") as f: -$ stories = f.readlines() -$ stories = [" ".join(i.split()[0:1000]) for i in stories] -$ with open(name + ".wp_target", "w") as o: -$ for line in stories: -$ o.write(line.strip() + "\n") +Once we've trimmed the data we can binarize it and train our model: +```bash # Binarize the dataset: -$ export TEXT=examples/stories/writingPrompts -$ fairseq-preprocess --source-lang wp_source --target-lang wp_target \ - --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \ - --destdir data-bin/writingPrompts --padding-factor 1 --thresholdtgt 10 --thresholdsrc 10 +export TEXT=examples/stories/writingPrompts +fairseq-preprocess --source-lang wp_source --target-lang wp_target \ + --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \ + --destdir data-bin/writingPrompts --padding-factor 1 --thresholdtgt 10 --thresholdsrc 10 # Train the model: -$ fairseq-train data-bin/writingPrompts -a fconv_self_att_wp --lr 0.25 --clip-norm 0.1 --max-tokens 1500 --lr-scheduler reduce_lr_on_plateau --decoder-attention True --encoder-attention False --criterion label_smoothed_cross_entropy --weight-decay .0000001 --label-smoothing 0 --source-lang wp_source --target-lang wp_target --gated-attention True --self-attention True --project-input True --pretrained False +fairseq-train data-bin/writingPrompts -a fconv_self_att_wp --lr 0.25 --clip-norm 0.1 --max-tokens 1500 --lr-scheduler reduce_lr_on_plateau --decoder-attention True --encoder-attention False --criterion label_smoothed_cross_entropy --weight-decay .0000001 --label-smoothing 0 --source-lang wp_source --target-lang wp_target --gated-attention True --self-attention True --project-input True --pretrained False # Train a fusion model: # add the arguments: --pretrained True --pretrained-checkpoint path/to/checkpoint @@ -52,7 +52,7 @@ $ fairseq-train data-bin/writingPrompts -a fconv_self_att_wp --lr 0.25 --clip-no # Generate: # Note: to load the pretrained model at generation time, you need to pass in a model-override argument to communicate to the fusion model at generation time where you have placed the pretrained checkpoint. By default, it will load the exact path of the fusion model's pretrained model from training time. You should use model-override if you have moved the pretrained model (or are using our provided models). If you are generating from a non-fusion model, the model-override argument is not necessary. 
-$ fairseq-generate data-bin/writingPrompts --path /path/to/trained/model/checkpoint_best.pt --batch-size 32 --beam 1 --sampling --sampling-topk 10 --sampling-temperature 0.8 --nbest 1 --model-overrides "{'pretrained_checkpoint':'/path/to/pretrained/model/checkpoint'}" +fairseq-generate data-bin/writingPrompts --path /path/to/trained/model/checkpoint_best.pt --batch-size 32 --beam 1 --sampling --sampling-topk 10 --sampling-temperature 0.8 --nbest 1 --model-overrides "{'pretrained_checkpoint':'/path/to/pretrained/model/checkpoint'}" ``` ## Citation diff --git a/examples/translation/README.md b/examples/translation/README.md index 72f8b16178..a43f0af1ad 100644 --- a/examples/translation/README.md +++ b/examples/translation/README.md @@ -2,57 +2,58 @@ ## Pre-trained models -Description | Dataset | Model | Test set(s) +Model | Description | Dataset | Download ---|---|---|--- -Convolutional
([Gehring et al., 2017](https://arxiv.org/abs/1705.03122)) | [WMT14 English-French](http://statmt.org/wmt14/translation-task.html#Download) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt14.v2.en-fr.fconv-py.tar.bz2) | newstest2014:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.v2.en-fr.newstest2014.tar.bz2)
newstest2012/2013:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.v2.en-fr.ntst1213.tar.bz2) -Convolutional
([Gehring et al., 2017](https://arxiv.org/abs/1705.03122)) | [WMT14 English-German](http://statmt.org/wmt14/translation-task.html#Download) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-de.fconv-py.tar.bz2) | newstest2014:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.en-de.newstest2014.tar.bz2) -Convolutional
([Gehring et al., 2017](https://arxiv.org/abs/1705.03122)) | [WMT17 English-German](http://statmt.org/wmt17/translation-task.html#Download) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt17.v2.en-de.fconv-py.tar.bz2) | newstest2014:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt17.v2.en-de.newstest2014.tar.bz2) -Transformer
([Ott et al., 2018](https://arxiv.org/abs/1806.00187)) | [WMT14 English-French](http://statmt.org/wmt14/translation-task.html#Download) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-fr.joined-dict.transformer.tar.bz2) | newstest2014 (shared vocab):
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.en-fr.joined-dict.newstest2014.tar.bz2) -Transformer
([Ott et al., 2018](https://arxiv.org/abs/1806.00187)) | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt16.en-de.joined-dict.transformer.tar.bz2) | newstest2014 (shared vocab):
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2) -Transformer
([Edunov et al., 2018](https://arxiv.org/abs/1808.09381); WMT'18 winner) | [WMT'18 English-German](http://www.statmt.org/wmt18/translation-task.html) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt18.en-de.ensemble.tar.gz) | See NOTE in the archive +`conv.wmt14.en-fr` | Convolutional
([Gehring et al., 2017](https://arxiv.org/abs/1705.03122)) | [WMT14 English-French](http://statmt.org/wmt14/translation-task.html#Download) | model:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt14.v2.en-fr.fconv-py.tar.bz2)
newstest2014:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.v2.en-fr.newstest2014.tar.bz2)
newstest2012/2013:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.v2.en-fr.ntst1213.tar.bz2) +`conv.wmt14.en-de` | Convolutional
([Gehring et al., 2017](https://arxiv.org/abs/1705.03122)) | [WMT14 English-German](http://statmt.org/wmt14/translation-task.html#Download) | model:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-de.fconv-py.tar.bz2)
newstest2014:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.en-de.newstest2014.tar.bz2) +`conv.wmt17.en-de` | Convolutional
([Gehring et al., 2017](https://arxiv.org/abs/1705.03122)) | [WMT17 English-German](http://statmt.org/wmt17/translation-task.html#Download) | model:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt17.v2.en-de.fconv-py.tar.bz2)
newstest2014:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt17.v2.en-de.newstest2014.tar.bz2) +`transformer.wmt14.en-fr` | Transformer
([Ott et al., 2018](https://arxiv.org/abs/1806.00187)) | [WMT14 English-French](http://statmt.org/wmt14/translation-task.html#Download) | model:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-fr.joined-dict.transformer.tar.bz2)
newstest2014:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.en-fr.joined-dict.newstest2014.tar.bz2) +`transformer.wmt16.en-de` | Transformer
([Ott et al., 2018](https://arxiv.org/abs/1806.00187)) | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | model:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt16.en-de.joined-dict.transformer.tar.bz2)
newstest2014:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2) +`transformer.wmt18.en-de` | Transformer
([Edunov et al., 2018](https://arxiv.org/abs/1808.09381))
WMT'18 winner | [WMT'18 English-German](http://www.statmt.org/wmt18/translation-task.html) | model:
[download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt18.en-de.ensemble.tar.gz)
See NOTE in the archive +`transformer.wmt19.en-de` | Transformer
([Ng et al., 2019](https://arxiv.org/abs/1907.06616))
WMT'19 winner | [WMT'19 English-German](http://www.statmt.org/wmt19/translation-task.html) | model:
[download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz) +`transformer.wmt19.de-en` | Transformer
([Ng et al., 2019](https://arxiv.org/abs/1907.06616))
WMT'19 winner | [WMT'19 German-English](http://www.statmt.org/wmt19/translation-task.html) | model:
[download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz) +`transformer.wmt19.en-ru` | Transformer
([Ng et al., 2019](https://arxiv.org/abs/1907.06616))
WMT'19 winner | [WMT'19 English-Russian](http://www.statmt.org/wmt19/translation-task.html) | model:
[download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz) +`transformer.wmt19.ru-en` | Transformer
([Ng et al., 2019](https://arxiv.org/abs/1907.06616))
WMT'19 winner | [WMT'19 Russian-English](http://www.statmt.org/wmt19/translation-task.html) | model:
[download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz) ## Example usage (torch.hub) -Interactive generation via PyTorch Hub: -``` ->>> import torch ->>> torch.hub.list('pytorch/fairseq') -[..., 'transformer.wmt14.en-fr', 'transformer.wmt16.en-de', 'transformer.wmt18.en-de', ... ] ->>> en2de = torch.hub.load( -... 'pytorch/fairseq', -... 'transformer.wmt16.en-de', -... data_name_or_path='.', -... tokenizer='moses', -... bpe='subword_nmt', -... ) ->>> print(en2de.models[0].__class__) - ->>> print(en2de.generate('Hello world!')) -Hallo Welt! -``` +Interactive translation via PyTorch Hub: +```python +import torch + +# List available models +torch.hub.list('pytorch/fairseq') # [..., 'transformer.wmt16.en-de', ... ] + +# Load a transformer trained on WMT'16 En-De +en2de = torch.hub.load('pytorch/fairseq', 'transformer.wmt16.en-de', tokenizer='moses', bpe='subword_nmt') -Available models are listed in the ``hub_models()`` method in each model file, for example: -[transformer.py](https://github.com/pytorch/fairseq/blob/master/fairseq/models/transformer.py). +# The underlying model is available under the *models* attribute +assert isinstance(en2de.models[0], fairseq.models.transformer.TransformerModel) + +# Translate a sentence +en2de.translate('Hello world!') +# 'Hallo Welt!' +``` ## Example usage (CLI tools) Generation with the binarized test sets can be run in batch mode as follows, e.g. for WMT 2014 English-French on a GTX-1080ti: -``` -$ mkdir -p data-bin -$ curl https://dl.fbaipublicfiles.com/fairseq/models/wmt14.v2.en-fr.fconv-py.tar.bz2 | tar xvjf - -C data-bin -$ curl https://dl.fbaipublicfiles.com/fairseq/data/wmt14.v2.en-fr.newstest2014.tar.bz2 | tar xvjf - -C data-bin -$ fairseq-generate data-bin/wmt14.en-fr.newstest2014 \ - --path data-bin/wmt14.en-fr.fconv-py/model.pt \ - --beam 5 --batch-size 128 --remove-bpe | tee /tmp/gen.out -... -| Translated 3003 sentences (96311 tokens) in 166.0s (580.04 tokens/s) -| Generate test with beam=5: BLEU4 = 40.83, 67.5/46.9/34.4/25.5 (BP=1.000, ratio=1.006, syslen=83262, reflen=82787) +```bash +mkdir -p data-bin +curl https://dl.fbaipublicfiles.com/fairseq/models/wmt14.v2.en-fr.fconv-py.tar.bz2 | tar xvjf - -C data-bin +curl https://dl.fbaipublicfiles.com/fairseq/data/wmt14.v2.en-fr.newstest2014.tar.bz2 | tar xvjf - -C data-bin +fairseq-generate data-bin/wmt14.en-fr.newstest2014 \ + --path data-bin/wmt14.en-fr.fconv-py/model.pt \ + --beam 5 --batch-size 128 --remove-bpe | tee /tmp/gen.out +# ... +# | Translated 3003 sentences (96311 tokens) in 166.0s (580.04 tokens/s) +# | Generate test with beam=5: BLEU4 = 40.83, 67.5/46.9/34.4/25.5 (BP=1.000, ratio=1.006, syslen=83262, reflen=82787) # Compute BLEU score -$ grep ^H /tmp/gen.out | cut -f3- > /tmp/gen.out.sys -$ grep ^T /tmp/gen.out | cut -f2- > /tmp/gen.out.ref -$ fairseq-score --sys /tmp/gen.out.sys --ref /tmp/gen.out.ref -BLEU4 = 40.83, 67.5/46.9/34.4/25.5 (BP=1.000, ratio=1.006, syslen=83262, reflen=82787) +grep ^H /tmp/gen.out | cut -f3- > /tmp/gen.out.sys +grep ^T /tmp/gen.out | cut -f2- > /tmp/gen.out.ref +fairseq-score --sys /tmp/gen.out.sys --ref /tmp/gen.out.ref +# BLEU4 = 40.83, 67.5/46.9/34.4/25.5 (BP=1.000, ratio=1.006, syslen=83262, reflen=82787) ``` ## Preprocessing @@ -64,55 +65,54 @@ These scripts provide an example of pre-processing data for the NMT task. 
Provides an example of pre-processing for IWSLT'14 German to English translation task: ["Report on the 11th IWSLT evaluation campaign" by Cettolo et al.](http://workshop2014.iwslt.org/downloads/proceeding.pdf) Example usage: -``` -$ cd examples/translation/ -$ bash prepare-iwslt14.sh -$ cd ../.. +```bash +cd examples/translation/ +bash prepare-iwslt14.sh +cd ../.. # Binarize the dataset: -$ TEXT=examples/translation/iwslt14.tokenized.de-en -$ fairseq-preprocess --source-lang de --target-lang en \ - --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \ - --destdir data-bin/iwslt14.tokenized.de-en +TEXT=examples/translation/iwslt14.tokenized.de-en +fairseq-preprocess --source-lang de --target-lang en \ + --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \ + --destdir data-bin/iwslt14.tokenized.de-en # Train the model (better for a single GPU setup): -$ mkdir -p checkpoints/fconv -$ CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/iwslt14.tokenized.de-en \ - --lr 0.25 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 \ - --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ - --lr-scheduler fixed --force-anneal 200 \ - --arch fconv_iwslt_de_en --save-dir checkpoints/fconv +mkdir -p checkpoints/fconv +CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/iwslt14.tokenized.de-en \ + --lr 0.25 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 \ + --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ + --lr-scheduler fixed --force-anneal 200 \ + --arch fconv_iwslt_de_en --save-dir checkpoints/fconv # Generate: -$ fairseq-generate data-bin/iwslt14.tokenized.de-en \ - --path checkpoints/fconv/checkpoint_best.pt \ - --batch-size 128 --beam 5 --remove-bpe +fairseq-generate data-bin/iwslt14.tokenized.de-en \ + --path checkpoints/fconv/checkpoint_best.pt \ + --batch-size 128 --beam 5 --remove-bpe ``` To train transformer model on IWSLT'14 German to English: -``` +```bash # Preparation steps are the same as for fconv model. 
# Train the model (better for a single GPU setup): -$ mkdir -p checkpoints/transformer -$ CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/iwslt14.tokenized.de-en \ - -a transformer_iwslt_de_en --optimizer adam --lr 0.0005 -s de -t en \ - --label-smoothing 0.1 --dropout 0.3 --max-tokens 4000 \ - --min-lr '1e-09' --lr-scheduler inverse_sqrt --weight-decay 0.0001 \ - --criterion label_smoothed_cross_entropy --max-update 50000 \ - --warmup-updates 4000 --warmup-init-lr '1e-07' \ - --adam-betas '(0.9, 0.98)' --save-dir checkpoints/transformer +mkdir -p checkpoints/transformer +CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/iwslt14.tokenized.de-en \ + -a transformer_iwslt_de_en --optimizer adam --lr 0.0005 -s de -t en \ + --label-smoothing 0.1 --dropout 0.3 --max-tokens 4000 \ + --min-lr '1e-09' --lr-scheduler inverse_sqrt --weight-decay 0.0001 \ + --criterion label_smoothed_cross_entropy --max-update 50000 \ + --warmup-updates 4000 --warmup-init-lr '1e-07' \ + --adam-betas '(0.9, 0.98)' --save-dir checkpoints/transformer # Average 10 latest checkpoints: -$ python scripts/average_checkpoints.py --inputs checkpoints/transformer \ - --num-epoch-checkpoints 10 --output checkpoints/transformer/model.pt +python scripts/average_checkpoints.py --inputs checkpoints/transformer \ + --num-epoch-checkpoints 10 --output checkpoints/transformer/model.pt # Generate: -$ fairseq-generate data-bin/iwslt14.tokenized.de-en \ - --path checkpoints/transformer/model.pt \ - --batch-size 128 --beam 5 --remove-bpe - +fairseq-generate data-bin/iwslt14.tokenized.de-en \ + --path checkpoints/transformer/model.pt \ + --batch-size 128 --beam 5 --remove-bpe ``` ### prepare-wmt14en2de.sh @@ -122,36 +122,35 @@ By default it will produce a dataset that was modeled after ["Attention Is All Y To use only data available in WMT'14 or to replicate results obtained in the original ["Convolutional Sequence to Sequence Learning" (Gehring et al., 2017)](https://arxiv.org/abs/1705.03122) paper, please use the `--icml17` option. -``` -$ bash prepare-wmt14en2de.sh --icml17 +```bash +bash prepare-wmt14en2de.sh --icml17 ``` Example usage: -``` -$ cd examples/translation/ -$ bash prepare-wmt14en2de.sh -$ cd ../.. +```bash +cd examples/translation/ +bash prepare-wmt14en2de.sh +cd ../.. 
# Binarize the dataset: -$ TEXT=examples/translation/wmt17_en_de -$ fairseq-preprocess --source-lang en --target-lang de \ - --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \ - --destdir data-bin/wmt17_en_de --thresholdtgt 0 --thresholdsrc 0 +TEXT=examples/translation/wmt17_en_de +fairseq-preprocess --source-lang en --target-lang de \ + --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \ + --destdir data-bin/wmt17_en_de --thresholdtgt 0 --thresholdsrc 0 # Train the model: # If it runs out of memory, try to set --max-tokens 1500 instead -$ mkdir -p checkpoints/fconv_wmt_en_de -$ fairseq-train data-bin/wmt17_en_de \ - --lr 0.5 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 \ - --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ - --lr-scheduler fixed --force-anneal 50 \ - --arch fconv_wmt_en_de --save-dir checkpoints/fconv_wmt_en_de +mkdir -p checkpoints/fconv_wmt_en_de +fairseq-train data-bin/wmt17_en_de \ + --lr 0.5 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 \ + --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ + --lr-scheduler fixed --force-anneal 50 \ + --arch fconv_wmt_en_de --save-dir checkpoints/fconv_wmt_en_de # Generate: -$ fairseq-generate data-bin/wmt17_en_de \ - --path checkpoints/fconv_wmt_en_de/checkpoint_best.pt --beam 5 --remove-bpe - +fairseq-generate data-bin/wmt17_en_de \ + --path checkpoints/fconv_wmt_en_de/checkpoint_best.pt --beam 5 --remove-bpe ``` ### prepare-wmt14en2fr.sh @@ -160,30 +159,29 @@ Provides an example of pre-processing for the WMT'14 English to French translati Example usage: -``` -$ cd examples/translation/ -$ bash prepare-wmt14en2fr.sh -$ cd ../.. +```bash +cd examples/translation/ +bash prepare-wmt14en2fr.sh +cd ../.. # Binarize the dataset: -$ TEXT=examples/translation/wmt14_en_fr -$ fairseq-preprocess --source-lang en --target-lang fr \ - --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \ - --destdir data-bin/wmt14_en_fr --thresholdtgt 0 --thresholdsrc 0 +TEXT=examples/translation/wmt14_en_fr +fairseq-preprocess --source-lang en --target-lang fr \ + --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \ + --destdir data-bin/wmt14_en_fr --thresholdtgt 0 --thresholdsrc 0 # Train the model: # If it runs out of memory, try to set --max-tokens 1000 instead -$ mkdir -p checkpoints/fconv_wmt_en_fr -$ fairseq-train data-bin/wmt14_en_fr \ - --lr 0.5 --clip-norm 0.1 --dropout 0.1 --max-tokens 3000 \ - --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ - --lr-scheduler fixed --force-anneal 50 \ - --arch fconv_wmt_en_fr --save-dir checkpoints/fconv_wmt_en_fr +mkdir -p checkpoints/fconv_wmt_en_fr +fairseq-train data-bin/wmt14_en_fr \ + --lr 0.5 --clip-norm 0.1 --dropout 0.1 --max-tokens 3000 \ + --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ + --lr-scheduler fixed --force-anneal 50 \ + --arch fconv_wmt_en_fr --save-dir checkpoints/fconv_wmt_en_fr # Generate: -$ fairseq-generate data-bin/fconv_wmt_en_fr \ - --path checkpoints/fconv_wmt_en_fr/checkpoint_best.pt --beam 5 --remove-bpe - +fairseq-generate data-bin/fconv_wmt_en_fr \ + --path checkpoints/fconv_wmt_en_fr/checkpoint_best.pt --beam 5 --remove-bpe ``` ## Multilingual Translation @@ -195,64 +193,64 @@ Note that we use slightly different preprocessing here than for the IWSLT'14 En-De data above. In particular we learn a joint BPE code for all three languages and use interactive.py and sacrebleu for scoring the test set. 
-``` +```bash # First install sacrebleu and sentencepiece -$ pip install sacrebleu sentencepiece +pip install sacrebleu sentencepiece # Then download and preprocess the data -$ cd examples/translation/ -$ bash prepare-iwslt17-multilingual.sh -$ cd ../.. +cd examples/translation/ +bash prepare-iwslt17-multilingual.sh +cd ../.. # Binarize the de-en dataset -$ TEXT=examples/translation/iwslt17.de_fr.en.bpe16k -$ fairseq-preprocess --source-lang de --target-lang en \ - --trainpref $TEXT/train.bpe.de-en --validpref $TEXT/valid.bpe.de-en \ - --joined-dictionary \ - --destdir data-bin/iwslt17.de_fr.en.bpe16k \ - --workers 10 +TEXT=examples/translation/iwslt17.de_fr.en.bpe16k +fairseq-preprocess --source-lang de --target-lang en \ + --trainpref $TEXT/train.bpe.de-en --validpref $TEXT/valid.bpe.de-en \ + --joined-dictionary \ + --destdir data-bin/iwslt17.de_fr.en.bpe16k \ + --workers 10 # Binarize the fr-en dataset # NOTE: it's important to reuse the en dictionary from the previous step -$ fairseq-preprocess --source-lang fr --target-lang en \ - --trainpref $TEXT/train.bpe.fr-en --validpref $TEXT/valid.bpe.fr-en \ - --joined-dictionary --tgtdict data-bin/iwslt17.de_fr.en.bpe16k/dict.en.txt \ - --destdir data-bin/iwslt17.de_fr.en.bpe16k \ - --workers 10 +fairseq-preprocess --source-lang fr --target-lang en \ + --trainpref $TEXT/train.bpe.fr-en --validpref $TEXT/valid.bpe.fr-en \ + --joined-dictionary --tgtdict data-bin/iwslt17.de_fr.en.bpe16k/dict.en.txt \ + --destdir data-bin/iwslt17.de_fr.en.bpe16k \ + --workers 10 # Train a multilingual transformer model # NOTE: the command below assumes 1 GPU, but accumulates gradients from # 8 fwd/bwd passes to simulate training on 8 GPUs -$ mkdir -p checkpoints/multilingual_transformer -$ CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/iwslt17.de_fr.en.bpe16k/ \ - --max-epoch 50 \ - --ddp-backend=no_c10d \ - --task multilingual_translation --lang-pairs de-en,fr-en \ - --arch multilingual_transformer_iwslt_de_en \ - --share-decoders --share-decoder-input-output-embed \ - --optimizer adam --adam-betas '(0.9, 0.98)' \ - --lr 0.0005 --lr-scheduler inverse_sqrt --min-lr '1e-09' \ - --warmup-updates 4000 --warmup-init-lr '1e-07' \ - --label-smoothing 0.1 --criterion label_smoothed_cross_entropy \ - --dropout 0.3 --weight-decay 0.0001 \ - --save-dir checkpoints/multilingual_transformer \ - --max-tokens 4000 \ - --update-freq 8 +mkdir -p checkpoints/multilingual_transformer +CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/iwslt17.de_fr.en.bpe16k/ \ + --max-epoch 50 \ + --ddp-backend=no_c10d \ + --task multilingual_translation --lang-pairs de-en,fr-en \ + --arch multilingual_transformer_iwslt_de_en \ + --share-decoders --share-decoder-input-output-embed \ + --optimizer adam --adam-betas '(0.9, 0.98)' \ + --lr 0.0005 --lr-scheduler inverse_sqrt --min-lr '1e-09' \ + --warmup-updates 4000 --warmup-init-lr '1e-07' \ + --label-smoothing 0.1 --criterion label_smoothed_cross_entropy \ + --dropout 0.3 --weight-decay 0.0001 \ + --save-dir checkpoints/multilingual_transformer \ + --max-tokens 4000 \ + --update-freq 8 # Generate and score the test set with sacrebleu -$ SRC=de -$ sacrebleu --test-set iwslt17 --language-pair ${SRC}-en --echo src \ - | python scripts/spm_encode.py --model examples/translation/iwslt17.de_fr.en.bpe16k/sentencepiece.bpe.model \ - > iwslt17.test.${SRC}-en.${SRC}.bpe -$ cat iwslt17.test.${SRC}-en.${SRC}.bpe \ - | fairseq-interactive data-bin/iwslt17.de_fr.en.bpe16k/ \ +SRC=de +sacrebleu --test-set iwslt17 --language-pair ${SRC}-en --echo src \ + | python 
scripts/spm_encode.py --model examples/translation/iwslt17.de_fr.en.bpe16k/sentencepiece.bpe.model \ + > iwslt17.test.${SRC}-en.${SRC}.bpe +cat iwslt17.test.${SRC}-en.${SRC}.bpe \ + | fairseq-interactive data-bin/iwslt17.de_fr.en.bpe16k/ \ --task multilingual_translation --source-lang ${SRC} --target-lang en \ --path checkpoints/multilingual_transformer/checkpoint_best.pt \ --buffer 2000 --batch-size 128 \ --beam 5 --remove-bpe=sentencepiece \ - > iwslt17.test.${SRC}-en.en.sys -$ grep ^H iwslt17.test.${SRC}-en.en.sys | cut -f3 \ - | sacrebleu --test-set iwslt17 --language-pair ${SRC}-en + > iwslt17.test.${SRC}-en.en.sys +grep ^H iwslt17.test.${SRC}-en.en.sys | cut -f3 \ + | sacrebleu --test-set iwslt17 --language-pair ${SRC}-en ``` ### Argument format during inference diff --git a/examples/translation_moe/README.md b/examples/translation_moe/README.md index 4fc027e9c7..842be56bea 100644 --- a/examples/translation_moe/README.md +++ b/examples/translation_moe/README.md @@ -14,47 +14,47 @@ Use the `--method` flag to choose the MoE variant; we support hard mixtures with The model is trained with online responsibility assignment and shared parameterization. The following command will train a `hMoElp` model with `3` experts: -``` -$ fairseq-train --ddp-backend='no_c10d' \ - data-bin/wmt17_en_de \ - --max-update 100000 \ - --task translation_moe \ - --method hMoElp --mean-pool-gating-network \ - --num-experts 3 \ - --arch transformer_wmt_en_de --share-all-embeddings \ - --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ - --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 \ - --lr 0.0007 --min-lr 1e-09 \ - --dropout 0.1 --weight-decay 0.0 --criterion cross_entropy \ - --max-tokens 3584 +```bash +fairseq-train --ddp-backend='no_c10d' \ + data-bin/wmt17_en_de \ + --max-update 100000 \ + --task translation_moe \ + --method hMoElp --mean-pool-gating-network \ + --num-experts 3 \ + --arch transformer_wmt_en_de --share-all-embeddings \ + --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ + --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 \ + --lr 0.0007 --min-lr 1e-09 \ + --dropout 0.1 --weight-decay 0.0 --criterion cross_entropy \ + --max-tokens 3584 ``` ## Translate Once a model is trained, we can generate translations from different experts using the `--gen-expert` option. 
For example, to generate from expert 0: -``` -$ fairseq-generate data-bin/wmt17_en_de \ - --path checkpoints/checkpoint_best.pt \ - --beam 1 --remove-bpe \ - --task translation_moe \ - --method hMoElp --mean-pool-gating-network \ - --num-experts 3 \ - --gen-expert 0 +```bash +fairseq-generate data-bin/wmt17_en_de \ + --path checkpoints/checkpoint_best.pt \ + --beam 1 --remove-bpe \ + --task translation_moe \ + --method hMoElp --mean-pool-gating-network \ + --num-experts 3 \ + --gen-expert 0 ``` ## Evaluate First download a tokenized version of the WMT'14 En-De test set with multiple references: -``` -$ wget dl.fbaipublicfiles.com/fairseq/data/wmt14-en-de.extra_refs.tok +```bash +wget dl.fbaipublicfiles.com/fairseq/data/wmt14-en-de.extra_refs.tok ``` Next apply BPE on the fly and run generation for each expert: -``` -$ BPEROOT=examples/translation/subword-nmt/ -$ BPE_CODE=examples/translation/wmt17_en_de/code -$ for EXPERT in $(seq 0 2); do \ +```bash +BPEROOT=examples/translation/subword-nmt/ +BPE_CODE=examples/translation/wmt17_en_de/code +for EXPERT in $(seq 0 2); do \ cat wmt14-en-de.extra_refs.tok \ | grep ^S | cut -f 2 \ | fairseq-interactive data-bin/wmt17_en_de \ @@ -66,15 +66,15 @@ $ for EXPERT in $(seq 0 2); do \ --method hMoElp --mean-pool-gating-network \ --num-experts 3 \ --gen-expert $EXPERT ; \ - done > wmt14-en-de.extra_refs.tok.gen.3experts +done > wmt14-en-de.extra_refs.tok.gen.3experts ``` Finally use `score_moe.py` to compute pairwise BLUE and average oracle BLEU: -``` -$ python examples/translation_moe/score.py --sys wmt14-en-de.extra_refs.tok.gen.3experts --ref wmt14-en-de.extra_refs.tok -pairwise BLEU: 48.26 -#refs covered: 2.11 -multi-reference BLEU (leave-one-out): 59.46 +```bash +python examples/translation_moe/score.py --sys wmt14-en-de.extra_refs.tok.gen.3experts --ref wmt14-en-de.extra_refs.tok +# pairwise BLEU: 48.26 +# #refs covered: 2.11 +# multi-reference BLEU (leave-one-out): 59.46 ``` This matches row 3 from Table 7 in the paper. 
diff --git a/examples/wmt19/README.md b/examples/wmt19/README.md index fff13fa6ac..6eb7818925 100644 --- a/examples/wmt19/README.md +++ b/examples/wmt19/README.md @@ -4,86 +4,52 @@ This page provides pointers to the models of Facebook-FAIR's WMT'19 news transla ## Pre-trained models -Description | Model ----|--- -En->De Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz) -De->En Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz) -En->Ru Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz) -Ru->En Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz) -En LM | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.en.tar.gz) -De LM | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.de.tar.gz) -Ru LM | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.gz) +Model | Description | Download +---|---|--- +`transformer.wmt19.en-de` | En->De Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz) +`transformer.wmt19.de-en` | De->En Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz) +`transformer.wmt19.en-ru` | En->Ru Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz) +`transformer.wmt19.ru-en` | Ru->En Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz) +`transformer_lm.wmt19.en` | En Language Model | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.en.tar.gz) +`transformer_lm.wmt19.de` | De Language Model | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.de.tar.gz) +`transformer_lm.wmt19.ru` | Ru Language Model | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.gz) ## Example usage (torch.hub) -``` ->>> import torch ->>> en2de = torch.hub.load( -... 'pytorch/fairseq', -... 'transformer.wmt19.en-de', -... checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt' -... tokenizer='moses', -... bpe='fastbpe', -... ) ->>> en2de.generate("Machine learning is great!") -'Maschinelles Lernen ist großartig!' +```python +import torch + +# English to German translation +en2de = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de', checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt', + tokenizer='moses', bpe='fastbpe') +en2de.translate("Machine learning is great!") # 'Maschinelles Lernen ist großartig!' ->>> de2en = torch.hub.load( -... 'pytorch/fairseq', -... 'transformer.wmt19.de-en', -... checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt' -... tokenizer='moses', -... bpe='fastbpe', -... ) ->>> de2en.generate("Maschinelles Lernen ist großartig!") -'Machine learning is great!' +# German to English translation +de2en = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.de-en', checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt', + tokenizer='moses', bpe='fastbpe') +de2en.translate("Maschinelles Lernen ist großartig!") # 'Machine learning is great!' ->>> en2ru = torch.hub.load( -... 'pytorch/fairseq', -... 'transformer.wmt19.en-ru', -... checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt' -... tokenizer='moses', -... bpe='fastbpe', -... 
) ->>> en2ru.generate("Machine learning is great!") -'Машинное обучение - это здорово!' +# English to Russian translation +en2ru = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-ru', checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt', + tokenizer='moses', bpe='fastbpe') +en2ru.translate("Machine learning is great!") # 'Машинное обучение - это здорово!' ->>> ru2en = torch.hub.load( -... 'pytorch/fairseq', -... 'transformer.wmt19.ru-en', -... checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt' -... tokenizer='moses', -... bpe='fastbpe', -... ) ->>> ru2en.generate("Машинное обучение - это здорово!") -'Machine learning is great!' +# Russian to English translation +ru2en = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.ru-en', checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt', + tokenizer='moses', bpe='fastbpe') +ru2en.translate("Машинное обучение - это здорово!") # 'Machine learning is great!' ->>> en_lm = torch.hub.load( -... 'pytorch.fairseq', -... 'transformer_lm.wmt19.en' -... tokenizer='moses', -... bpe='fastbpe', -... ) ->>> en_lm.generate("Machine learning is") -'Machine learning is the future of computing, says Microsoft boss Satya Nadella ...' +# Sample from the English LM +en_lm = torch.hub.load('pytorch.fairseq', 'transformer_lm.wmt19.en', tokenizer='moses', bpe='fastbpe') +en_lm.sample("Machine learning is") # 'Machine learning is the future of computing, says Microsoft boss Satya Nadella ...' ->>> de_lm = torch.hub.load( -... 'pytorch.fairseq', -... 'transformer_lm.wmt19.de' -... tokenizer='moses', -... bpe='fastbpe', -... ) ->>> de_lm.generate("Maschinelles lernen ist") -''Maschinelles lernen ist das A und O (neues-deutschland.de) Die Arbeitsbedingungen für Lehrerinnen und Lehrer sind seit Jahren verbesserungswürdig ...' +# Sample from the German LM +de_lm = torch.hub.load('pytorch.fairseq', 'transformer_lm.wmt19.de', tokenizer='moses', bpe='fastbpe') +de_lm.sample("Maschinelles lernen ist") # 'Maschinelles lernen ist das A und O (neues-deutschland.de) Die Arbeitsbedingungen für Lehrerinnen und Lehrer sind seit Jahren verbesserungswürdig ...' ->>> ru_lm = torch.hub.load( -... 'pytorch.fairseq', -... 'transformer_lm.wmt19.ru' -... tokenizer='moses', -... bpe='fastbpe', -... ) ->>> ru_lm.generate("машинное обучение это") -'машинное обучение это то, что мы называем "искусственным интеллектом".' +# Sample from the Russian LM +ru_lm = torch.hub.load('pytorch.fairseq', 'transformer_lm.wmt19.ru', tokenizer='moses', bpe='fastbpe') +ru_lm.sample("машинное обучение это") # 'машинное обучение это то, что мы называем "искусственным интеллектом".' 
``` ## Citation diff --git a/fairseq/data/encoders/moses_tokenizer.py b/fairseq/data/encoders/moses_tokenizer.py index deed30d880..b1e7478b9d 100644 --- a/fairseq/data/encoders/moses_tokenizer.py +++ b/fairseq/data/encoders/moses_tokenizer.py @@ -12,9 +12,9 @@ class MosesTokenizer(object): @staticmethod def add_args(parser): # fmt: off - parser.add_argument('--moses-source-lang', default='en', metavar='SRC', + parser.add_argument('--moses-source-lang', metavar='SRC', help='source language') - parser.add_argument('--moses-target-lang', default='en', metavar='TARGET', + parser.add_argument('--moses-target-lang', metavar='TARGET', help='target language') parser.add_argument('--moses-no-dash-splits', action='store_true', default=False, help='don\'t apply dash split rules') @@ -24,6 +24,12 @@ def add_args(parser): def __init__(self, args): self.args = args + + if getattr(args, 'moses_source_lang', None) is None: + args.moses_source_lang = getattr(args, 'source_lang', 'en') + if getattr(args, 'moses_target_lang', None) is None: + args.moses_target_lang = getattr(args, 'target_lang', 'en') + try: from sacremoses import MosesTokenizer, MosesDetokenizer self.tok = MosesTokenizer(args.moses_source_lang) diff --git a/fairseq/hub_utils.py b/fairseq/hub_utils.py index 06a2c55723..73fdd94dc9 100644 --- a/fairseq/hub_utils.py +++ b/fairseq/hub_utils.py @@ -97,12 +97,15 @@ def __init__(self, args, task, models): def device(self): return self._float_tensor.device - def translate(self, sentence: str, verbose: bool = False, **kwargs) -> str: + def translate(self, sentence: str, beam: int = 5, verbose: bool = False, **kwargs) -> str: + return self.sample(sentence, beam, verbose, **kwargs) + + def sample(self, sentence: str, beam: int = 1, verbose: bool = False, **kwargs) -> str: input = self.encode(sentence) - hypo = self.generate(input, verbose, **kwargs) + hypo = self.generate(input, beam, verbose, **kwargs)[0]['tokens'] return self.decode(hypo) - def generate(self, tokens: torch.LongTensor, verbose: bool = False, **kwargs) -> torch.LongTensor: + def generate(self, tokens: torch.LongTensor, beam: int = 5, verbose: bool = False, **kwargs) -> torch.LongTensor: sample = self._build_sample(tokens) # build generator using current args as well as any kwargs @@ -117,20 +120,24 @@ def generate(self, tokens: torch.LongTensor, verbose: bool = False, **kwargs) -> src_str_with_unk = self.string(tokens) print('S\t{}'.format(src_str_with_unk)) + def getarg(name, default): + return getattr(gen_args, name, getattr(self.args, name, default)) + # Process top predictions - for hypo in translations[0][:min(len(translations), getattr(self.args, 'nbest', 1))]: - hypo_str = self.decode(hypo['tokens']) - if verbose: + hypos = translations[0] + if verbose: + for hypo in hypos: + hypo_str = self.decode(hypo['tokens']) print('H\t{}\t{}'.format(hypo['score'], hypo_str)) print('P\t{}'.format( ' '.join(map(lambda x: '{:.4f}'.format(x), hypo['positional_scores'].tolist())) )) - if hypo['alignment'] is not None and getattr(self.args, 'print_alignment', False): + if hypo['alignment'] is not None and getarg('print_alignment', False): print('A\t{}'.format( ' '.join(map(lambda x: str(utils.item(x)), hypo['alignment'].int().cpu())) )) - return hypo['tokens'] + return hypos def encode(self, sentence: str) -> torch.LongTensor: sentence = self.tokenize(sentence) diff --git a/hubconf.py b/hubconf.py index 7e1574a684..ec27226da4 100644 --- a/hubconf.py +++ b/hubconf.py @@ -11,6 +11,7 @@ dependencies = [ + 'fastBPE', 'regex', 'requests', 
'sacremoses', diff --git a/setup.py b/setup.py index 1fd3f6dd34..83b3a7ee54 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,9 @@ long_description_content_type='text/markdown', install_requires=[ 'cffi', + 'fastBPE', 'numpy', + 'regex', 'sacrebleu', 'torch', 'tqdm', From f02f70cce2dc1126c4250170eabfd3a95d8bb378 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Fri, 2 Aug 2019 08:40:59 -0700 Subject: [PATCH 060/213] Add single-models for WMT'19 for hub tutorial Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/800 Differential Revision: D16621509 Pulled By: myleott fbshipit-source-id: d3e8e97d30bcafbc35c3f67cd8bbc657b6fa5fe7 --- fairseq/models/transformer.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fairseq/models/transformer.py b/fairseq/models/transformer.py index 4afbe93a63..2e23dcf784 100644 --- a/fairseq/models/transformer.py +++ b/fairseq/models/transformer.py @@ -53,10 +53,14 @@ def hub_models(cls): 'transformer.wmt14.en-fr': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-fr.joined-dict.transformer.tar.bz2', 'transformer.wmt16.en-de': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt16.en-de.joined-dict.transformer.tar.bz2', 'transformer.wmt18.en-de': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt18.en-de.ensemble.tar.gz', - 'transformer.wmt19.en-de': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.bz2', - 'transformer.wmt19.en-ru': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.bz2', - 'transformer.wmt19.de-en': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.bz2', - 'transformer.wmt19.ru-en': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.bz2', + 'transformer.wmt19.en-de': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz', + 'transformer.wmt19.en-ru': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz', + 'transformer.wmt19.de-en': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz', + 'transformer.wmt19.ru-en': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz', + 'transformer.wmt19.en-de.single_model': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.single_model.tar.gz', + 'transformer.wmt19.en-ru.single_model': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.single_model.tar.gz', + 'transformer.wmt19.de-en.single_model': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.single_model.tar.gz', + 'transformer.wmt19.ru-en.single_model': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.single_model.tar.gz', } def __init__(self, encoder, decoder): From 3903f46904f8b2ca0fa229cf6131675858997a30 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Fri, 2 Aug 2019 10:54:16 -0700 Subject: [PATCH 061/213] Fewer torch.hub requirements (#959) Summary: We will raise exceptions if these are needed and aren't available. 
Only keep minimum set of reqs Pull Request resolved: https://github.com/pytorch/fairseq/pull/959 Differential Revision: D16623304 Pulled By: myleott fbshipit-source-id: 8e65253742e393b527e8396a9433e64ebec9bb55 --- hubconf.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/hubconf.py b/hubconf.py index ec27226da4..d8f252ad7b 100644 --- a/hubconf.py +++ b/hubconf.py @@ -11,12 +11,8 @@ dependencies = [ - 'fastBPE', 'regex', 'requests', - 'sacremoses', - 'sentencepiece', - 'subword_nmt', 'torch', ] From 9012e87d36a5f88cc3894508688d4328bbfd3abe Mon Sep 17 00:00:00 2001 From: Ning Dong Date: Fri, 2 Aug 2019 13:17:26 -0700 Subject: [PATCH 062/213] Avoid cast in PositionalEmbeddings to fix BLEU drop in pytorch native export Summary: Tracing mode doesn't generalize correctly in positional embedding calculation, which caused -5 BLEU at transformer export when using pytorch native. Details: The original issue was that in ensemble_export, _to_tensor(x) in scripting mode turns integer x into 1-d tensor torch.tensor([x]), not 0-d tensor (scalar x) which is expected in the embedding. So the return value in embedding forward() is actually of wrong shape. When self.weights is of size [x,y], the return value should be (bsz, y, 1) but it was (bsz, 1, y), which caused problem in downstream computation. Tracing only becomes an issue when I used pos = timestep.view(-1)[0] to fix the shape. Then casting the scalar to primary int, to be used as index is not generalizable by tracing mode. Thus I need to convert everything to tensor and replace the advanced indexing with index_select operator. In summary, less understood features in both scripting&tracing sides caused the bleu drop. :) Reviewed By: myleott Differential Revision: D16623025 fbshipit-source-id: 0c7a2c3eafbd774760a5c880c6034009ee084abb --- fairseq/modules/sinusoidal_positional_embedding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fairseq/modules/sinusoidal_positional_embedding.py b/fairseq/modules/sinusoidal_positional_embedding.py index 93429e1c0b..cf18cf704c 100644 --- a/fairseq/modules/sinusoidal_positional_embedding.py +++ b/fairseq/modules/sinusoidal_positional_embedding.py @@ -67,9 +67,9 @@ def forward(self, input, incremental_state=None, timestep=None, **kwargs): if incremental_state is not None: # positions is the same for every token when decoding a single step - pos = (timestep.int() + 1).long() if timestep is not None else seq_len + pos = timestep.view(-1)[0] + 1 if timestep is not None else seq_len if self.onnx_trace: - return self.weights[self.padding_idx + pos, :].unsqueeze(1).repeat(bsz, 1, 1) + return self.weights.index_select(index=self.padding_idx + pos, dim=0).unsqueeze(1).repeat(bsz, 1, 1) return self.weights[self.padding_idx + pos, :].expand(bsz, 1, -1) positions = utils.make_positions(input, self.padding_idx, onnx_trace=self.onnx_trace) From 12258e5798a7b89d46443c1c80dc6f281637807e Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sat, 3 Aug 2019 04:39:04 -0700 Subject: [PATCH 063/213] Fix generating with a fixed prefix Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/801 Differential Revision: D16628318 Pulled By: myleott fbshipit-source-id: 50e93bb9108afd2ba90f1edd4f34306a7c9964a4 --- fairseq/hub_utils.py | 1 + fairseq/sequence_generator.py | 79 +++++++++++++++-------------------- 2 files changed, 35 insertions(+), 45 deletions(-) diff --git a/fairseq/hub_utils.py b/fairseq/hub_utils.py index 73fdd94dc9..297338d02e 100644 --- a/fairseq/hub_utils.py +++ 
b/fairseq/hub_utils.py @@ -110,6 +110,7 @@ def generate(self, tokens: torch.LongTensor, beam: int = 5, verbose: bool = Fals # build generator using current args as well as any kwargs gen_args = copy.copy(self.args) + gen_args.beam = beam for k, v in kwargs.items(): setattr(gen_args, k, v) generator = self.task.build_generator(gen_args) diff --git a/fairseq/sequence_generator.py b/fairseq/sequence_generator.py index 6a122271bd..0a8ffa1843 100644 --- a/fairseq/sequence_generator.py +++ b/fairseq/sequence_generator.py @@ -156,8 +156,6 @@ def generate( tokens[:, 0] = self.eos if bos_token is None else bos_token attn, attn_buf = None, None nonpad_idxs = None - if prefix_tokens is not None: - partial_prefix_mask_buf = torch.zeros_like(src_lengths).byte() # The blacklist indicates candidates that should be ignored. # For example, suppose we're sampling and have already finalized 2/5 @@ -304,6 +302,35 @@ def get_hypo(): elif step < self.min_len: lprobs[:, self.eos] = -math.inf + # handle prefix tokens (possibly with different lengths) + if prefix_tokens is not None and step < prefix_tokens.size(1): + prefix_toks = prefix_tokens[:, step].unsqueeze(-1).repeat(1, beam_size).view(-1) + prefix_lprobs = lprobs.gather(-1, prefix_toks.unsqueeze(-1)) + prefix_mask = prefix_toks.ne(self.pad) + lprobs[prefix_mask] = -math.inf + lprobs[prefix_mask] = lprobs[prefix_mask].scatter_( + -1, prefix_toks[prefix_mask].unsqueeze(-1), prefix_lprobs + ) + # if prefix includes eos, then we should make sure tokens and + # scores are the same across all beams + eos_mask = prefix_toks.eq(self.eos) + if eos_mask.any(): + # validate that the first beam matches the prefix + first_beam = tokens[eos_mask].view(-1, beam_size, tokens.size(-1))[:, 0, 1:step + 1] + eos_mask_batch_dim = eos_mask.view(-1, beam_size)[:, 0] + target_prefix = prefix_tokens[eos_mask_batch_dim][:, :step] + assert (first_beam == target_prefix).all() + + def replicate_first_beam(tensor, mask): + tensor = tensor.view(-1, beam_size, tensor.size(-1)) + tensor[mask] = tensor[mask][:, :1, :] + return tensor.view(-1, tensor.size(-1)) + + # copy tokens, scores and lprobs from the first beam to all beams + tokens = replicate_first_beam(tokens, eos_mask_batch_dim) + scores = replicate_first_beam(scores, eos_mask_batch_dim) + lprobs = replicate_first_beam(lprobs, eos_mask_batch_dim) + if self.no_repeat_ngram_size > 0: # for each beam and batch sentence, generate a list of previous ngrams gen_ngrams = [{} for bbsz_idx in range(bsz * beam_size)] @@ -343,48 +370,11 @@ def calculate_banned_tokens(bbsz_idx): for bbsz_idx in range(bsz * beam_size): lprobs[bbsz_idx, banned_tokens[bbsz_idx]] = -math.inf - if prefix_tokens is not None and step < prefix_tokens.size(1): - assert isinstance(self.search, search.BeamSearch) or bsz == 1, \ - "currently only BeamSearch supports decoding with prefix_tokens" - probs_slice = lprobs.view(bsz, -1, lprobs.size(-1))[:, 0, :] - cand_scores = torch.gather( - probs_slice, dim=1, - index=prefix_tokens[:, step].view(-1, 1) - ).view(-1, 1).repeat(1, cand_size) - if step > 0: - # save cumulative scores for each hypothesis - cand_scores.add_(scores[:, step - 1].view(bsz, beam_size).repeat(1, 2)) - cand_indices = prefix_tokens[:, step].view(-1, 1).repeat(1, cand_size) - cand_beams = torch.zeros_like(cand_indices) - - # handle prefixes of different lengths - # when step == prefix_tokens.size(1), we'll have new free-decoding batches - if prefix_tokens is not None and step <= prefix_tokens.size(1): - if step < prefix_tokens.size(1): - 
partial_prefix_mask = prefix_tokens[:, step].eq(self.pad) - else: # all prefixes finished force-decoding - partial_prefix_mask = torch.ones(bsz).to(prefix_tokens).byte() - if partial_prefix_mask.any(): - # track new free-decoding batches, at whose very first step - # only use the first beam to eliminate repeats - prefix_step0_mask = partial_prefix_mask ^ partial_prefix_mask_buf - lprobs.view(bsz, beam_size, -1)[prefix_step0_mask, 1:] = -math.inf - partial_scores, partial_indices, partial_beams = self.search.step( - step, - lprobs.view(bsz, -1, self.vocab_size), - scores.view(bsz, beam_size, -1)[:, :, :step], - ) - cand_scores[partial_prefix_mask] = partial_scores[partial_prefix_mask] - cand_indices[partial_prefix_mask] = partial_indices[partial_prefix_mask] - cand_beams[partial_prefix_mask] = partial_beams[partial_prefix_mask] - partial_prefix_mask_buf = partial_prefix_mask - - else: - cand_scores, cand_indices, cand_beams = self.search.step( - step, - lprobs.view(bsz, -1, self.vocab_size), - scores.view(bsz, beam_size, -1)[:, :, :step], - ) + cand_scores, cand_indices, cand_beams = self.search.step( + step, + lprobs.view(bsz, -1, self.vocab_size), + scores.view(bsz, beam_size, -1)[:, :, :step], + ) # cand_bbsz_idx contains beam indices for the top candidate # hypotheses, with a range of values: [0, bsz*beam_size), @@ -433,7 +423,6 @@ def calculate_banned_tokens(bbsz_idx): cand_indices = cand_indices[batch_idxs] if prefix_tokens is not None: prefix_tokens = prefix_tokens[batch_idxs] - partial_prefix_mask_buf = partial_prefix_mask_buf[batch_idxs] src_lengths = src_lengths[batch_idxs] blacklist = blacklist[batch_idxs] From c728b864247ff85e968d3138a4412385722c4b7b Mon Sep 17 00:00:00 2001 From: alexeib Date: Sat, 3 Aug 2019 08:29:09 -0700 Subject: [PATCH 064/213] remove default params from args so architecture works properly Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/798 Reviewed By: myleott Differential Revision: D16619502 Pulled By: alexeib fbshipit-source-id: af20c90c4522458850d8f42cab001259ef4293cc --- fairseq/models/transformer_lm.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fairseq/models/transformer_lm.py b/fairseq/models/transformer_lm.py index febdc9adc9..54d5f3f4c3 100644 --- a/fairseq/models/transformer_lm.py +++ b/fairseq/models/transformer_lm.py @@ -44,9 +44,9 @@ def add_args(parser): parser.add_argument('--activation-fn', choices=utils.get_available_activation_fns(), help='activation function to use') - parser.add_argument('--dropout', default=0.1, type=float, metavar='D', + parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability') - parser.add_argument('--attention-dropout', default=0., type=float, metavar='D', + parser.add_argument('--attention-dropout', type=float, metavar='D', help='dropout probability for attention weights') parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D', help='dropout probability after activation in FFN.') @@ -62,9 +62,9 @@ def add_args(parser): help='num decoder layers') parser.add_argument('--decoder-attention-heads', type=int, metavar='N', help='num decoder attention heads') - parser.add_argument('--decoder-normalize-before', default=False, action='store_true', + parser.add_argument('--decoder-normalize-before', action='store_true', help='apply layernorm before each decoder block') - parser.add_argument('--no-decoder-final-norm', default=False, action='store_true', + parser.add_argument('--no-decoder-final-norm', 
action='store_true', help='don\'t add an extra layernorm after the last decoder block') parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR', help='comma separated list of adaptive softmax cutoff points. ' @@ -73,11 +73,11 @@ def add_args(parser): help='sets adaptive softmax dropout for the tail projections') parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N', help='adaptive input factor') - parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true', + parser.add_argument('--no-token-positional-embeddings', action='store_true', help='if set, disables positional embeddings (outside self attention)') parser.add_argument('--share-decoder-input-output-embed', action='store_true', help='share decoder input and output embeddings') - parser.add_argument('--character-embeddings', default=False, action='store_true', + parser.add_argument('--character-embeddings', action='store_true', help='if set, uses character embedding convolutions to produce token embeddings') parser.add_argument('--character-filters', type=str, metavar='LIST', default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]', From 1684e166e3da03f5b600dbb7855cb98ddfcd0805 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sun, 4 Aug 2019 06:22:44 -0700 Subject: [PATCH 065/213] Add doc string for Roberta.encode function Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/969 Differential Revision: D16642388 Pulled By: myleott fbshipit-source-id: c5b1655dbddb697822feefa433f33f6bb08253ab --- fairseq/models/roberta/hub_interface.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fairseq/models/roberta/hub_interface.py b/fairseq/models/roberta/hub_interface.py index bb08d48f6f..2cc519746c 100644 --- a/fairseq/models/roberta/hub_interface.py +++ b/fairseq/models/roberta/hub_interface.py @@ -33,6 +33,26 @@ def device(self): return self._float_tensor.device def encode(self, sentence: str, *addl_sentences) -> torch.LongTensor: + """ + BPE-encode a sentence (or multiple sentences). + + Every sequence begins with a beginning-of-sentence (``) symbol. + Every sentence ends with an end-of-sentence (``) and we use an + extra end-of-sentence (`
`) as a separator. + + Example (single sentence): ` a b c ` + Example (sentence pair): ` d e f 1 2 3 ` + + The BPE encoding follows GPT-2. One subtle detail is that the GPT-2 BPE + requires leading spaces. For example:: + + >>> roberta.encode('Hello world').tolist() + [0, 31414, 232, 2] + >>> roberta.encode(' world').tolist() + [0, 232, 2] + >>> roberta.encode('world').tolist() + [0, 8331, 2] + """ bpe_sentence = ' ' + self.bpe.encode(sentence) + ' ' for s in addl_sentences: bpe_sentence += ' ' + self.bpe.encode(s) + ' ' From 5d543f9b19e76772386903d4eeebdceaeb3d1b69 Mon Sep 17 00:00:00 2001 From: Naman Goyal Date: Mon, 5 Aug 2019 12:39:42 -0700 Subject: [PATCH 066/213] fixed roberta finetuning with --find-unused-parameters on multiGPU Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/806 Differential Revision: D16649933 fbshipit-source-id: 6eeda6e2caf8019228e3efc0c27ddfcc3c4d8674 --- .../roberta/README.finetune_custom_classification.md | 1 + examples/roberta/README.finetune_glue.md | 1 + fairseq/criterions/masked_lm.py | 2 +- fairseq/criterions/sentence_prediction.py | 11 ++++------- fairseq/models/roberta/model.py | 10 ++++++++++ 5 files changed, 17 insertions(+), 8 deletions(-) diff --git a/examples/roberta/README.finetune_custom_classification.md b/examples/roberta/README.finetune_custom_classification.md index cd49348f56..eaac58b3f6 100644 --- a/examples/roberta/README.finetune_custom_classification.md +++ b/examples/roberta/README.finetune_custom_classification.md @@ -115,6 +115,7 @@ CUDA_VISIBLE_DEVICES=0 python train.py IMDB-bin/ \ --max-epoch 10 \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ --truncate-sequence \ + --find-unused-parameters \ --update-freq 4 ``` Above will train with effective batch-size of `32`, tested on one `Nvidia V100 32gb`. 
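For context on the `--find-unused-parameters` flag added above: during classification finetuning the pretrained LM head receives no gradient, and DistributedDataParallel has to be told to expect parameters that go unused in the backward pass. A self-contained toy sketch (single-process gloo group and a toy module, assumed for illustration; this is not fairseq's training loop):

```python
import os
import torch
import torch.distributed as dist
import torch.nn as nn

# Single-process process group so DistributedDataParallel can be constructed.
os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
os.environ.setdefault('MASTER_PORT', '29500')
dist.init_process_group('gloo', rank=0, world_size=1)

class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(8, 8)
        self.cls_head = nn.Linear(8, 3)    # used for classification finetuning
        self.lm_head = nn.Linear(8, 100)   # pretrained head, unused in this pass

    def forward(self, x):
        return self.cls_head(self.encoder(x))

# Without find_unused_parameters=True, DDP expects a gradient for every
# parameter (including lm_head) and training can error out.
model = nn.parallel.DistributedDataParallel(ToyModel(), find_unused_parameters=True)
loss = model(torch.randn(4, 8)).sum()
loss.backward()
dist.destroy_process_group()
```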
diff --git a/examples/roberta/README.finetune_glue.md b/examples/roberta/README.finetune_glue.md index c905cab7c0..d44a5aee53 100644 --- a/examples/roberta/README.finetune_glue.md +++ b/examples/roberta/README.finetune_glue.md @@ -42,6 +42,7 @@ CUDA_VISIBLE_DEVICES=0 python train.py RTE-bin/ \ --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \ --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ --max-epoch 10 \ + --find-unused-parameters \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric; ``` diff --git a/fairseq/criterions/masked_lm.py b/fairseq/criterions/masked_lm.py index 842fad0fa5..a133b5fa41 100644 --- a/fairseq/criterions/masked_lm.py +++ b/fairseq/criterions/masked_lm.py @@ -30,7 +30,7 @@ def forward(self, model, sample, reduce=True): 3) logging outputs to display while training """ # compute MLM loss - logits = model(**sample['net_input'], last_state_only=True)[0] + logits = model(**sample['net_input'], return_all_hiddens=False)[0] targets = model.get_targets(sample, [logits]) loss = F.nll_loss( F.log_softmax( diff --git a/fairseq/criterions/sentence_prediction.py b/fairseq/criterions/sentence_prediction.py index f116288b12..6fb03baf41 100644 --- a/fairseq/criterions/sentence_prediction.py +++ b/fairseq/criterions/sentence_prediction.py @@ -31,18 +31,15 @@ def forward(self, model, sample, reduce=True): 2) the sample size, which is used as the denominator for the gradient 3) logging outputs to display while training """ - features, extra = model(**sample['net_input'], features_only=True) - padding_mask = sample['net_input']['src_tokens'].eq(self.padding_idx) - assert hasattr(model, 'classification_heads') and \ 'sentence_classification_head' in model.classification_heads, \ "model must provide sentence classification head for --criterion=sentence_prediction" - logits = model.classification_heads['sentence_classification_head']( - features, - padding_mask=padding_mask, + logits, _ = model( + **sample['net_input'], + features_only=True, + classification_head_name='sentence_classification_head', ) - targets = model.get_targets(sample, [logits]).view(-1) sample_size = targets.numel() diff --git a/fairseq/models/roberta/model.py b/fairseq/models/roberta/model.py index 93555c7d66..69916164ff 100644 --- a/fairseq/models/roberta/model.py +++ b/fairseq/models/roberta/model.py @@ -89,6 +89,16 @@ def build_model(cls, args, task): encoder = RobertaEncoder(args, task.source_dictionary) return cls(args, encoder) + def forward(self, src_tokens, features_only=False, return_all_hiddens=False, classification_head_name=None, **kwargs): + assert classification_head_name is None or features_only, \ + "If passing classification_head_name argument, features_only must be set to True" + + x, extra = self.decoder(src_tokens, features_only, return_all_hiddens, **kwargs) + + if classification_head_name is not None: + x = self.classification_heads[classification_head_name](x) + return x, extra + def register_classification_head(self, name, num_classes=None, inner_dim=None, **kwargs): """Register a classification head.""" self.classification_heads[name] = RobertaClassificationHead( From e40e4b21cb620408e99ab8c6a45e314584ef3508 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Tue, 6 Aug 2019 09:11:54 -0700 Subject: [PATCH 067/213] Add back set_epoch functionality lost in RoBERTa merge Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/982 Differential Revision: D16668353 Pulled By: 
myleott fbshipit-source-id: 699243d6c028c47cd0e3f801d89051b3f919b17e --- fairseq/data/fairseq_dataset.py | 3 +++ fairseq/data/iterators.py | 1 + 2 files changed, 4 insertions(+) diff --git a/fairseq/data/fairseq_dataset.py b/fairseq/data/fairseq_dataset.py index f710b3d9f7..6144beca4e 100644 --- a/fairseq/data/fairseq_dataset.py +++ b/fairseq/data/fairseq_dataset.py @@ -50,3 +50,6 @@ def supports_prefetch(self): def prefetch(self, indices): """Prefetch the data required for this epoch.""" raise NotImplementedError + + def set_epoch(self, epoch): + pass diff --git a/fairseq/data/iterators.py b/fairseq/data/iterators.py index 451a8c5e1d..0f1ec7e404 100644 --- a/fairseq/data/iterators.py +++ b/fairseq/data/iterators.py @@ -179,6 +179,7 @@ def next_epoch_itr(self, shuffle=True, fix_batches_to_gpus=False): self._cur_epoch_itr = self._get_iterator_for_epoch( self.epoch, shuffle, fix_batches_to_gpus=fix_batches_to_gpus, ) + self.dataset.set_epoch(self.epoch) return self._cur_epoch_itr def end_of_epoch(self) -> bool: From 2b7843daf85bad39b634b7963604771d3528e671 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Wed, 7 Aug 2019 06:31:34 -0700 Subject: [PATCH 068/213] Add code to realign RoBERTa features to word-level tokenizers Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/805 Differential Revision: D16670825 Pulled By: myleott fbshipit-source-id: 872a1a0274681a34d54bda00bfcfcda2e94144c6 --- examples/roberta/README.md | 22 +++++ fairseq/data/encoders/fastbpe.py | 2 +- fairseq/models/roberta/alignment_utils.py | 115 ++++++++++++++++++++++ fairseq/models/roberta/hub_interface.py | 33 ++++++- 4 files changed, 170 insertions(+), 2 deletions(-) create mode 100644 fairseq/models/roberta/alignment_utils.py diff --git a/examples/roberta/README.md b/examples/roberta/README.md index e975789f01..21b04c845a 100644 --- a/examples/roberta/README.md +++ b/examples/roberta/README.md @@ -76,6 +76,28 @@ assert len(all_layers) == 25 assert torch.all(all_layers[-1] == last_layer_features) ``` +By default RoBERTa outputs one feature vector per BPE token. You can instead +realign the features to match [spaCy's word-level tokenization](https://spacy.io/usage/linguistic-features#tokenization) +with the `extract_features_aligned_to_words` method. This will compute a +weighted average of the BPE-level features for each word and expose them in +spaCy's `Token.vector` attribute: +```python +doc = roberta.extract_features_aligned_to_words('I said, "hello RoBERTa."') +assert len(doc) == 10 +for tok in doc: + print('{:10}{} (...)'.format(str(tok), tok.vector[:5])) +# tensor([-0.1316, -0.0386, -0.0832, -0.0477, 0.1943], grad_fn=) (...) +# I tensor([ 0.0559, 0.1541, -0.4832, 0.0880, 0.0120], grad_fn=) (...) +# said tensor([-0.1565, -0.0069, -0.8915, 0.0501, -0.0647], grad_fn=) (...) +# , tensor([-0.1318, -0.0387, -0.0834, -0.0477, 0.1944], grad_fn=) (...) +# " tensor([-0.0486, 0.1818, -0.3946, -0.0553, 0.0981], grad_fn=) (...) +# hello tensor([ 0.0079, 0.1799, -0.6204, -0.0777, -0.0923], grad_fn=) (...) +# RoBERTa tensor([-0.2339, -0.1184, -0.7343, -0.0492, 0.5829], grad_fn=) (...) +# . tensor([-0.1341, -0.1203, -0.1012, -0.0621, 0.1892], grad_fn=) (...) +# " tensor([-0.1341, -0.1203, -0.1012, -0.0621, 0.1892], grad_fn=) (...) +# tensor([-0.0930, -0.0392, -0.0821, 0.0158, 0.0649], grad_fn=) (...) 
+``` + ##### Use RoBERTa for sentence-pair classification tasks: ```python # Download RoBERTa already finetuned for MNLI diff --git a/fairseq/data/encoders/fastbpe.py b/fairseq/data/encoders/fastbpe.py index 61a8f726ec..ed39b1bca9 100644 --- a/fairseq/data/encoders/fastbpe.py +++ b/fairseq/data/encoders/fastbpe.py @@ -25,7 +25,7 @@ def __init__(self, args): self.bpe = fastBPE.fastBPE(codes) self.bpe_symbol = "@@ " except ImportError: - raise ImportError('Please install fastbpe at https://github.com/glample/fastBPE') + raise ImportError('Please install fastBPE with: pip install fastBPE') def encode(self, x: str) -> str: return self.bpe.apply([x])[0] diff --git a/fairseq/models/roberta/alignment_utils.py b/fairseq/models/roberta/alignment_utils.py new file mode 100644 index 0000000000..85da2c4c01 --- /dev/null +++ b/fairseq/models/roberta/alignment_utils.py @@ -0,0 +1,115 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from collections import Counter +from typing import List + +import torch + + +def align_bpe_to_words(roberta, bpe_tokens: torch.LongTensor, other_tokens: List[str]): + """ + Helper to align GPT-2 BPE to other tokenization formats (e.g., spaCy). + + Args: + roberta (RobertaHubInterface): RoBERTa instance + bpe_tokens (torch.LongTensor): GPT-2 BPE tokens of shape `(T_bpe)` + other_tokens (List[str]): other tokens of shape `(T_words)` + + Returns: + List[str]: mapping from *other_tokens* to corresponding *bpe_tokens*. + """ + assert bpe_tokens.dim() == 1 + + def clean(text): + return text.strip() + + # remove whitespaces to simplify alignment + bpe_tokens = [roberta.task.source_dictionary.string([x]) for x in bpe_tokens] + bpe_tokens = [clean(roberta.bpe.decode(x) if x not in {'', ''} else x) for x in bpe_tokens] + other_tokens = [clean(str(o)) for o in other_tokens] + + # strip leading + assert bpe_tokens[0] == '' + bpe_tokens = bpe_tokens[1:] + assert ''.join(bpe_tokens) == ''.join(other_tokens) + + # create alignment from every word to a list of BPE tokens + alignment = [] + bpe_toks = filter(lambda item: item[1] != '', enumerate(bpe_tokens, start=1)) + j, bpe_tok = next(bpe_toks) + for other_tok in other_tokens: + bpe_indices = [] + while True: + if other_tok.startswith(bpe_tok): + bpe_indices.append(j) + other_tok = other_tok[len(bpe_tok):] + try: + j, bpe_tok = next(bpe_toks) + except StopIteration: + j, bpe_tok = None, None + elif bpe_tok.startswith(other_tok): + # other_tok spans multiple BPE tokens + bpe_indices.append(j) + bpe_tok = bpe_tok[len(other_tok):] + other_tok = '' + else: + raise Exception('Cannot align "{}" and "{}"'.format(other_tok, bpe_tok)) + if other_tok == '': + break + assert len(bpe_indices) > 0 + alignment.append(bpe_indices) + assert len(alignment) == len(other_tokens) + + return alignment + + +def align_features_to_words(roberta, features, alignment): + """ + Align given features to words. + + Args: + roberta (RobertaHubInterface): RoBERTa instance + features (torch.Tensor): features to align of shape `(T_bpe x C)` + alignment: alignment between BPE tokens and words returned by + func:`align_bpe_to_words`. 
+ """ + assert features.dim() == 2 + + bpe_counts = Counter(j for bpe_indices in alignment for j in bpe_indices) + assert bpe_counts[0] == 0 # shouldn't be aligned + denom = features.new([bpe_counts.get(j, 1) for j in range(len(features))]) + weighted_features = features / denom.unsqueeze(-1) + + output = [weighted_features[0]] + largest_j = -1 + for bpe_indices in alignment: + output.append(weighted_features[bpe_indices].sum(dim=0)) + largest_j = max(largest_j, *bpe_indices) + for j in range(largest_j + 1, len(features)): + output.append(weighted_features[j]) + output = torch.stack(output) + assert torch.all(torch.abs(output.sum(dim=0) - features.sum(dim=0)) < 1e-4) + return output + + +def spacy_nlp(): + if getattr(spacy_nlp, '_nlp', None) is None: + try: + from spacy.lang.en import English + spacy_nlp._nlp = English() + except ImportError: + raise ImportError('Please install spacy with: pip install spacy') + return spacy_nlp._nlp + + +def spacy_tokenizer(): + if getattr(spacy_tokenizer, '_tokenizer', None) is None: + try: + nlp = spacy_nlp() + spacy_tokenizer._tokenizer = nlp.Defaults.create_tokenizer(nlp) + except ImportError: + raise ImportError('Please install spacy with: pip install spacy') + return spacy_tokenizer._tokenizer diff --git a/fairseq/models/roberta/hub_interface.py b/fairseq/models/roberta/hub_interface.py index 2cc519746c..f7ba231e3b 100644 --- a/fairseq/models/roberta/hub_interface.py +++ b/fairseq/models/roberta/hub_interface.py @@ -3,6 +3,8 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +from typing import List + import numpy as np import torch import torch.nn as nn @@ -72,7 +74,7 @@ def decode(self, tokens: torch.LongTensor): return sentences[0] return sentences - def extract_features(self, tokens: torch.LongTensor, return_all_hiddens=False) -> torch.Tensor: + def extract_features(self, tokens: torch.LongTensor, return_all_hiddens: bool = False) -> torch.Tensor: if tokens.dim() == 1: tokens = tokens.unsqueeze(0) if tokens.size(-1) > self.model.max_positions(): @@ -102,3 +104,32 @@ def predict(self, head: str, tokens: torch.LongTensor): features = self.extract_features(tokens) logits = self.model.classification_heads[head](features) return F.log_softmax(logits, dim=-1) + + def extract_features_aligned_to_words(self, sentence: str, return_all_hiddens: bool = False) -> torch.Tensor: + """Extract RoBERTa features, aligned to spaCy's word-level tokenizer.""" + from fairseq.models.roberta import alignment_utils + from spacy.tokens import Doc + + nlp = alignment_utils.spacy_nlp() + tokenizer = alignment_utils.spacy_tokenizer() + + # tokenize both with GPT-2 BPE and spaCy + bpe_toks = self.encode(sentence) + spacy_toks = tokenizer(sentence) + spacy_toks_ws = [t.text_with_ws for t in tokenizer(sentence)] + alignment = alignment_utils.align_bpe_to_words(self, bpe_toks, spacy_toks_ws) + + # extract features and align them + features = self.extract_features(bpe_toks, return_all_hiddens=return_all_hiddens) + features = features.squeeze(0) + aligned_feats = alignment_utils.align_features_to_words(self, features, alignment) + + # wrap in spaCy Doc + doc = Doc( + nlp.vocab, + words=[''] + [x.text for x in spacy_toks] + [''], + spaces=[True] + [x.endswith(' ') for x in spacy_toks_ws[:-1]] + [True, False], + ) + assert len(doc) == aligned_feats.size(0) + doc.user_token_hooks['vector'] = lambda token: aligned_feats[token.i] + return doc From 1e55bbdb385cccc72c7ac0d305ffd120ded7e1b6 Mon Sep 17 00:00:00 
2001 From: Myle Ott Date: Wed, 7 Aug 2019 07:25:45 -0700 Subject: [PATCH 069/213] Fix tests and GLUE finetuning (fixes #989) Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/991 Differential Revision: D16687970 Pulled By: myleott fbshipit-source-id: d877fc16891a8ab97aec47a8d440baa56c2b5f46 --- fairseq/data/base_wrapper_dataset.py | 3 ++- fairseq/models/transformer_lm.py | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/fairseq/data/base_wrapper_dataset.py b/fairseq/data/base_wrapper_dataset.py index 17b39133dc..d14c3c76f3 100644 --- a/fairseq/data/base_wrapper_dataset.py +++ b/fairseq/data/base_wrapper_dataset.py @@ -48,4 +48,5 @@ def prefetch(self, indices): def set_epoch(self, epoch): super().set_epoch(epoch) - self.dataset.set_epoch(epoch) + if hasattr(self.dataset, 'set_epoch'): + self.dataset.set_epoch(epoch) diff --git a/fairseq/models/transformer_lm.py b/fairseq/models/transformer_lm.py index 54d5f3f4c3..87c7719209 100644 --- a/fairseq/models/transformer_lm.py +++ b/fairseq/models/transformer_lm.py @@ -150,6 +150,9 @@ def base_lm_architecture(args): if hasattr(args, 'decoder_final_norm'): args.no_decoder_final_norm = not args.decoder_final_norm + args.dropout = getattr(args, 'dropout', 0.1) + args.attention_dropout = getattr(args, 'attention_dropout', 0.0) + args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512) args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 2048) args.decoder_layers = getattr(args, 'decoder_layers', 6) @@ -161,6 +164,7 @@ def base_lm_architecture(args): args.activation_fn = getattr(args, 'activation_fn', 'relu') args.add_bos_token = getattr(args, 'add_bos_token', False) + args.no_token_positional_embeddings = getattr(args, 'no_token_positional_embeddings', False) args.share_decoder_input_output_embed = getattr(args, 'share_decoder_input_output_embed', False) args.character_embeddings = getattr(args, 'character_embeddings', False) From a9eda736ec6295118dfa2b46f614519e0c191cbb Mon Sep 17 00:00:00 2001 From: Naman Goyal Date: Wed, 7 Aug 2019 09:25:17 -0700 Subject: [PATCH 070/213] Added mask_fill api and some examples in README (#807) Summary: 1) This currently works only for single `` token as multi mask, we might have to look more into order of factorization. 2) This is currently only for single BPE token Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/807 Differential Revision: D16674509 fbshipit-source-id: 0a020030ee5df6a5115e5f85d5a9ef52b1ad9e1c --- examples/roberta/README.md | 13 +++++++ fairseq/models/roberta/hub_interface.py | 45 +++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/examples/roberta/README.md b/examples/roberta/README.md index 21b04c845a..537c55f3fa 100644 --- a/examples/roberta/README.md +++ b/examples/roberta/README.md @@ -134,6 +134,19 @@ roberta.cuda() roberta.predict('new_task', tokens) # tensor([[-1.1050, -1.0672, -1.1245]], device='cuda:0', grad_fn=) ``` +##### Filling mask: +Some examples from the [Natural Questions dataset](https://ai.google.com/research/NaturalQuestions/). 
+```python +>>> roberta.fill_mask("The first Star wars movie came out in ", topk=3) +[('The first Star wars movie came out in 1977', 0.9504712224006653), ('The first Star wars movie came out in 1978', 0.009986752644181252), ('The first Star wars movie came out in 1979', 0.00957468245178461)] + +>>> roberta.fill_mask("Vikram samvat calender is official in ", topk=3) +[('Vikram samvat calender is official in India', 0.21878768503665924), ('Vikram samvat calender is official in Delhi', 0.08547217398881912), ('Vikram samvat calender is official in Gujarat', 0.07556255906820297)] + +>>> roberta.fill_mask(" is the common currency of the European Union", topk=3) +[('Euro is the common currency of the European Union', 0.945650577545166), ('euro is the common currency of the European Union', 0.025747718289494514), ('€ is the common currency of the European Union', 0.011183015070855618)] +``` + ##### Evaluating the `roberta.large.mnli` model Example python code snippet to evaluate accuracy on the MNLI dev_matched set. diff --git a/fairseq/models/roberta/hub_interface.py b/fairseq/models/roberta/hub_interface.py index f7ba231e3b..22ce96e89f 100644 --- a/fairseq/models/roberta/hub_interface.py +++ b/fairseq/models/roberta/hub_interface.py @@ -133,3 +133,48 @@ def extract_features_aligned_to_words(self, sentence: str, return_all_hiddens: b assert len(doc) == aligned_feats.size(0) doc.user_token_hooks['vector'] = lambda token: aligned_feats[token.i] return doc + + def fill_mask(self, masked_input: str, topk: int = 5): + masked_token = '' + assert masked_token in masked_input and masked_input.count(masked_token) == 1, \ + "Please add one {0} token for the input, eg: 'He is a {0} guy'".format(masked_token) + + text_spans = masked_input.split(masked_token) + text_spans_bpe = (' {0} '.format(masked_token)).join( + [self.bpe.encode(text_span.rstrip()) for text_span in text_spans] + ).strip() + tokens = self.task.source_dictionary.encode_line( + ' ' + text_spans_bpe, + append_eos=True, + ) + + masked_index = (tokens == self.task.mask_idx).nonzero() + if tokens.dim() == 1: + tokens = tokens.unsqueeze(0) + + features, extra = self.model( + tokens.long().to(device=self.device), + features_only=False, + return_all_hiddens=False, + ) + logits = features[0, masked_index, :].squeeze() + prob = logits.softmax(dim=0) + values, index = prob.topk(k=topk, dim=0) + topk_predicted_token_bpe = self.task.source_dictionary.string(index) + + topk_filled_outputs = [] + for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(' ')): + predicted_token = self.bpe.decode(predicted_token_bpe) + if " {0}".format(masked_token) in masked_input: + topk_filled_outputs.append(( + masked_input.replace( + ' {0}'.format(masked_token), predicted_token + ), + values[index].item(), + )) + else: + topk_filled_outputs.append(( + masked_input.replace(masked_token, predicted_token), + values[index].item(), + )) + return topk_filled_outputs From 9a1038f68a92444a8f9cd2f0ca42a362b90fed20 Mon Sep 17 00:00:00 2001 From: Naman Goyal Date: Wed, 7 Aug 2019 10:44:13 -0700 Subject: [PATCH 071/213] fixed reloading from checkpoint (#811) Summary: Tested by starting training from (a) `roberta.large`, (b) `roberta.large.mnli`, (c) `checkpoints/checkpoint_last.pt` Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/811 Reviewed By: myleott Differential Revision: D16689528 Pulled By: myleott fbshipit-source-id: 849d72ede9d526c34b4753c1bffd689554d1f837 --- fairseq/models/roberta/model.py | 20 ++++++++++++++++++-- 1 file changed, 
18 insertions(+), 2 deletions(-) diff --git a/fairseq/models/roberta/model.py b/fairseq/models/roberta/model.py index 69916164ff..eb7e03f764 100644 --- a/fairseq/models/roberta/model.py +++ b/fairseq/models/roberta/model.py @@ -128,11 +128,15 @@ def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_na def upgrade_state_dict_named(self, state_dict, name): prefix = name + '.' if name != '' else '' + current_head_names = [] if not hasattr(self, 'classification_heads') else \ + self.classification_heads.keys() - # recreate any classification heads present in the state dict + keys_to_delete = [] + # Delete any heads present in state_dict, that are not in current constructed model. for k in state_dict.keys(): if not k.startswith(prefix + 'classification_heads.'): continue + head_name = k[len(prefix + 'classification_heads.'):].split('.')[0] num_classes = state_dict[ prefix + 'classification_heads.' + head_name + '.out_proj.weight' @@ -140,7 +144,19 @@ def upgrade_state_dict_named(self, state_dict, name): inner_dim = state_dict[ prefix + 'classification_heads.' + head_name + '.dense.weight' ].size(0) - self.register_classification_head(head_name, num_classes, inner_dim) + + if head_name not in current_head_names: + print("WARNING: deleting classification head ({}) from checkpoint not present in current model: {}".format(head_name, k)) + keys_to_delete.append(k) + elif ( + num_classes != self.classification_heads[head_name].out_proj.out_features + or inner_dim != self.classification_heads[head_name].dense.out_features + ): + print("WARNING: deleting classification head ({}) from checkpoint with different dimensions than current model: {}".format(head_name, k)) + keys_to_delete.append(k) + + for k in keys_to_delete: + del state_dict[k] # Copy any newly-added classification heads into the state dict # with their current weights. From 72f9364cc6aa28380c5453476e1cc25e22f4f869 Mon Sep 17 00:00:00 2001 From: Dmytro Okhonko Date: Thu, 8 Aug 2019 02:42:38 -0700 Subject: [PATCH 072/213] Asr initial push (#810) Summary: Initial code for speech recognition task. 
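(The head-pruning logic in `upgrade_state_dict_named` above follows a common checkpoint-surgery pattern: drop keys that are missing from, or shaped differently than, the current model, then load non-strictly. A standalone sketch, assuming the checkpoint is a plain parameter state dict rather than fairseq's checkpoint format:)

```python
import torch

def load_compatible_state(model, checkpoint_path):
    """Drop incompatible checkpoint keys, then load the rest non-strictly."""
    state = torch.load(checkpoint_path, map_location='cpu')  # assumed: raw state dict
    model_state = model.state_dict()
    for k in list(state.keys()):
        if k not in model_state or state[k].shape != model_state[k].shape:
            print('WARNING: skipping incompatible checkpoint key: {}'.format(k))
            del state[k]
    model.load_state_dict(state, strict=False)
    return model
```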
Right now only one ASR model added - https://arxiv.org/abs/1904.11660 unit test testing: python -m unittest discover tests also run model training with this code and obtained 5.0 test_clean | 13.4 test_other on librispeech with pytorch/audio features Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/810 Reviewed By: cpuhrsch Differential Revision: D16706659 Pulled By: okhonko fbshipit-source-id: 89a5f9883e50bc0e548234287aa0ea73f7402514 --- examples/speech_recognition/README.md | 32 + examples/speech_recognition/__init__.py | 1 + .../speech_recognition/criterions/__init__.py | 7 + .../criterions/cross_entropy_acc.py | 129 +++ examples/speech_recognition/data/__init__.py | 10 + .../speech_recognition/data/asr_dataset.py | 110 +++ examples/speech_recognition/data/collaters.py | 129 +++ .../speech_recognition/data/data_utils.py | 60 ++ .../datasets/asr_prep_json.py | 96 ++ .../datasets/prepare-librispeech.sh | 88 ++ examples/speech_recognition/infer.py | 243 +++++ .../speech_recognition/models/__init__.py | 7 + .../models/vggtransformer.py | 838 ++++++++++++++++++ examples/speech_recognition/tasks/__init__.py | 7 + .../tasks/speech_recognition.py | 116 +++ fairseq/models/transformer.py | 249 +----- fairseq/modules/__init__.py | 5 + fairseq/modules/transformer_layer.py | 279 ++++++ fairseq/modules/vggblock.py | 115 +++ tests/speech_recognition/__init__.py | 0 tests/speech_recognition/asr_test_base.py | 549 ++++++++++++ tests/speech_recognition/test_collaters.py | 60 ++ .../speech_recognition/test_cross_entropy.py | 36 + .../speech_recognition/test_vggtransformer.py | 135 +++ 24 files changed, 3054 insertions(+), 247 deletions(-) create mode 100644 examples/speech_recognition/README.md create mode 100644 examples/speech_recognition/__init__.py create mode 100644 examples/speech_recognition/criterions/__init__.py create mode 100644 examples/speech_recognition/criterions/cross_entropy_acc.py create mode 100644 examples/speech_recognition/data/__init__.py create mode 100644 examples/speech_recognition/data/asr_dataset.py create mode 100644 examples/speech_recognition/data/collaters.py create mode 100644 examples/speech_recognition/data/data_utils.py create mode 100644 examples/speech_recognition/datasets/asr_prep_json.py create mode 100755 examples/speech_recognition/datasets/prepare-librispeech.sh create mode 100644 examples/speech_recognition/infer.py create mode 100644 examples/speech_recognition/models/__init__.py create mode 100644 examples/speech_recognition/models/vggtransformer.py create mode 100644 examples/speech_recognition/tasks/__init__.py create mode 100644 examples/speech_recognition/tasks/speech_recognition.py create mode 100644 fairseq/modules/transformer_layer.py create mode 100644 fairseq/modules/vggblock.py create mode 100644 tests/speech_recognition/__init__.py create mode 100644 tests/speech_recognition/asr_test_base.py create mode 100644 tests/speech_recognition/test_collaters.py create mode 100644 tests/speech_recognition/test_cross_entropy.py create mode 100644 tests/speech_recognition/test_vggtransformer.py diff --git a/examples/speech_recognition/README.md b/examples/speech_recognition/README.md new file mode 100644 index 0000000000..36363b0376 --- /dev/null +++ b/examples/speech_recognition/README.md @@ -0,0 +1,32 @@ +# Speech Recognition +`examples/speech_recognition` is implementing ASR task in Fairseq, along with needed features, datasets, models and loss functions to train and infer model described in [Transformers with convolutional context for 
ASR (Abdelrahman Mohamed et al., 2019)](https://arxiv.org/abs/1904.11660). + + +## Additional dependencies +On top of main fairseq dependencies there are couple more additional requirements. + +1) Please follow the instructions to install [torchaudio](https://github.com/pytorch/audio). This is required to compute audio fbank features. +2) [Sclite](http://www1.icsi.berkeley.edu/Speech/docs/sctk-1.2/sclite.htm#sclite_name_0) is used to measure WER. Sclite can be downloaded and installed from source from sctk package [here](http://www.openslr.org/4/). Training and inference doesn't require Sclite dependency. + +## Preparing librispeech data +``` +./examples/speech_recognition/datasets/prepare-librispeech.sh $DIR_TO_SAVE_RAW_DATA $DIR_FOR_PREPROCESSED_DATA +``` + +## Training librispeech data +``` +python train.py $DIR_FOR_PREPROCESSED_DATA --save-dir $MODEL_PATH --max-epoch 80 --task speech_recognition --arch vggtransformer_2 --optimizer adadelta --lr 1.0 --adadelta-eps 1e-8 --adadelta-rho 0.95 --clip-norm 10.0 --max-tokens 5000 --log-format json --log-interval 1 --criterion cross_entropy_acc --user-dir examples/speech_recognition/ +``` + +## Inference for librispeech +`$SET` can be `test_clean` or `test_other` +Any checkpoint in `$MODEL_PATH` can be selected. In this example we are working with `checkpoint_last.pt` +``` +python examples/speech_recognition/infer.py $DIR_FOR_PREPROCESSED_DATA --task speech_recognition --max-tokens 25000 --nbest 1 --path $MODEL_PATH/checkpoint_last.pt --beam 20 --results-path $RES_DIR --batch-size 40 --gen-subset $SET --user-dir examples/speech_recognition/ +``` + +## Inference for librispeech +``` +sclite -r ${RES_DIR}/ref.word-checkpoint_last.pt-${SET}.txt -h ${RES_DIR}/hypo.word-checkpoint_last.pt-${SET}.txt -i rm -o all stdout > $RES_REPORT +``` +`Sum/Avg` row from first table of the report has WER diff --git a/examples/speech_recognition/__init__.py b/examples/speech_recognition/__init__.py new file mode 100644 index 0000000000..cd780902e3 --- /dev/null +++ b/examples/speech_recognition/__init__.py @@ -0,0 +1 @@ +from . import tasks, criterions, models # noqa diff --git a/examples/speech_recognition/criterions/__init__.py b/examples/speech_recognition/criterions/__init__.py new file mode 100644 index 0000000000..5ba9fc1601 --- /dev/null +++ b/examples/speech_recognition/criterions/__init__.py @@ -0,0 +1,7 @@ +import importlib +import os + +for file in os.listdir(os.path.dirname(__file__)): + if file.endswith('.py') and not file.startswith('_'): + criterion_name = file[:file.find('.py')] + importlib.import_module('examples.speech_recognition.criterions.' + criterion_name) diff --git a/examples/speech_recognition/criterions/cross_entropy_acc.py b/examples/speech_recognition/criterions/cross_entropy_acc.py new file mode 100644 index 0000000000..f7b46a0aa9 --- /dev/null +++ b/examples/speech_recognition/criterions/cross_entropy_acc.py @@ -0,0 +1,129 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +from __future__ import absolute_import, division, print_function, unicode_literals + +import logging +import math + +import torch +import torch.nn.functional as F +from fairseq import utils +from fairseq.criterions import FairseqCriterion, register_criterion + + +@register_criterion("cross_entropy_acc") +class CrossEntropyWithAccCriterion(FairseqCriterion): + def __init__(self, args, task): + super().__init__(args, task) + + def compute_loss(self, model, net_output, target, reduction, log_probs): + # N, T -> N * T + target = target.view(-1) + lprobs = model.get_normalized_probs(net_output, log_probs=log_probs) + if not hasattr(lprobs, "batch_first"): + logging.warning( + "ERROR: we need to know whether " + "batch first for the net output; " + "you need to set batch_first attribute for the return value of " + "model.get_normalized_probs. Now, we assume this is true, but " + "in the future, we will raise exception instead. " + ) + batch_first = getattr(lprobs, "batch_first", True) + if not batch_first: + lprobs = lprobs.transpose(0, 1) + + # N, T, D -> N * T, D + lprobs = lprobs.view(-1, lprobs.size(-1)) + loss = F.nll_loss( + lprobs, target, ignore_index=self.padding_idx, reduction=reduction + ) + return lprobs, loss + + def get_logging_output(self, sample, target, lprobs, loss): + target = target.view(-1) + mask = target != self.padding_idx + correct = torch.sum( + lprobs.argmax(1).masked_select(mask) == target.masked_select(mask) + ) + total = torch.sum(mask) + sample_size = ( + sample["target"].size(0) if self.args.sentence_avg else sample["ntokens"] + ) + + logging_output = { + "loss": utils.item(loss.data), # * sample['ntokens'], + "ntokens": sample["ntokens"], + "nsentences": sample["target"].size(0), + "sample_size": sample_size, + "correct": utils.item(correct.data), + "total": utils.item(total.data), + "nframes": torch.sum(sample["net_input"]["src_lengths"]).item(), + } + + return sample_size, logging_output + + def forward(self, model, sample, reduction="sum", log_probs=True): + """Computes the cross entropy with accuracy metric for the given sample. + + This is similar to CrossEntropyCriterion in fairseq, but also + computes accuracy metrics as part of logging + + Args: + logprobs (Torch.tensor) of shape N, T, D i.e. + batchsize, timesteps, dimensions + targets (Torch.tensor) of shape N, T i.e batchsize, timesteps + + Returns: + tuple: With three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + + TODO: + * Currently this Criterion will only work with LSTMEncoderModels or + FairseqModels which have decoder, or Models which return TorchTensor + as net_output. + We need to make a change to support all FairseqEncoder models. 
+ """ + net_output = model(**sample["net_input"]) + target = model.get_targets(sample, net_output) + lprobs, loss = self.compute_loss( + model, net_output, target, reduction, log_probs + ) + sample_size, logging_output = self.get_logging_output( + sample, target, lprobs, loss + ) + return loss, sample_size, logging_output + + @staticmethod + def aggregate_logging_outputs(logging_outputs): + """Aggregate logging outputs from data parallel training.""" + correct_sum = sum(log.get("correct", 0) for log in logging_outputs) + total_sum = sum(log.get("total", 0) for log in logging_outputs) + loss_sum = sum(log.get("loss", 0) for log in logging_outputs) + ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) + nsentences = sum(log.get("nsentences", 0) for log in logging_outputs) + sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) + nframes = sum(log.get("nframes", 0) for log in logging_outputs) + agg_output = { + "loss": loss_sum / sample_size / math.log(2) if sample_size > 0 else 0.0, + # if args.sentence_avg, then sample_size is nsentences, then loss + # is per-sentence loss; else sample_size is ntokens, the loss + # becomes per-output token loss + "ntokens": ntokens, + "nsentences": nsentences, + "nframes": nframes, + "sample_size": sample_size, + "acc": correct_sum * 100.0 / total_sum if total_sum > 0 else 0.0, + "correct": correct_sum, + "total": total_sum, + # total is the number of validate tokens + } + if sample_size != ntokens: + agg_output["nll_loss"] = loss_sum / ntokens / math.log(2) + # loss: per output token loss + # nll_loss: per sentence loss + return agg_output diff --git a/examples/speech_recognition/data/__init__.py b/examples/speech_recognition/data/__init__.py new file mode 100644 index 0000000000..737a22ec3a --- /dev/null +++ b/examples/speech_recognition/data/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from .asr_dataset import AsrDataset + +__all__ = [ + 'AsrDataset', +] diff --git a/examples/speech_recognition/data/asr_dataset.py b/examples/speech_recognition/data/asr_dataset.py new file mode 100644 index 0000000000..a848370607 --- /dev/null +++ b/examples/speech_recognition/data/asr_dataset.py @@ -0,0 +1,110 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os +import numpy as np +from fairseq.data import FairseqDataset + +from . import data_utils +from .collaters import Seq2SeqCollater + + +class AsrDataset(FairseqDataset): + """ + A dataset representing speech and corresponding transcription. + + Args: + aud_paths: (List[str]): A list of str with paths to audio files. + aud_durations_ms (List[int]): A list of int containing the durations of + audio files. + tgt (List[torch.LongTensor]): A list of LongTensors containing the indices + of target transcriptions. + tgt_dict (~fairseq.data.Dictionary): target vocabulary. + ids (List[str]): A list of utterance IDs. + speakers (List[str]): A list of speakers corresponding to utterances. 
+ num_mel_bins (int): Number of triangular mel-frequency bins (default: 80) + frame_length (float): Frame length in milliseconds (default: 25.0) + frame_shift (float): Frame shift in milliseconds (default: 10.0) + """ + + def __init__( + self, aud_paths, aud_durations_ms, tgt, + tgt_dict, ids, speakers, + num_mel_bins=80, frame_length=25.0, frame_shift=10.0 + ): + assert frame_length > 0 + assert frame_shift > 0 + assert all(x > frame_length for x in aud_durations_ms) + self.frame_sizes = [ + int(1 + (d - frame_length) / frame_shift) + for d in aud_durations_ms + ] + + assert len(aud_paths) > 0 + assert len(aud_paths) == len(aud_durations_ms) + assert len(aud_paths) == len(tgt) + assert len(aud_paths) == len(ids) + assert len(aud_paths) == len(speakers) + self.aud_paths = aud_paths + self.tgt_dict = tgt_dict + self.tgt = tgt + self.ids = ids + self.speakers = speakers + self.num_mel_bins = num_mel_bins + self.frame_length = frame_length + self.frame_shift = frame_shift + + def __getitem__(self, index): + import torchaudio + import torchaudio.compliance.kaldi as kaldi + tgt_item = self.tgt[index] if self.tgt is not None else None + + path = self.aud_paths[index] + if not os.path.exists(path): + raise FileNotFoundError("Audio file not found: {}".format(path)) + sound, sample_rate = torchaudio.load_wav(path) + output = kaldi.fbank( + sound, + num_mel_bins=self.num_mel_bins, + frame_length=self.frame_length, + frame_shift=self.frame_shift + ) + output_cmvn = data_utils.apply_mv_norm(output) + self.collater = Seq2SeqCollater( + 0, 1, pad_index=self.tgt_dict.pad(), + eos_index=self.tgt_dict.eos(), move_eos_to_beginning=True + ) + + return {"id": index, "data": [output_cmvn.detach(), tgt_item]} + + def __len__(self): + return len(self.aud_paths) + + def collater(self, samples): + """Merge a list of samples to form a mini-batch. + + Args: + samples (List[int]): sample indices to collate + + Returns: + dict: a mini-batch suitable for forwarding with a Model + """ + return self.collater.collate(samples) + + def num_tokens(self, index): + return self.frame_sizes[index] + + def size(self, index): + """Return an example's size as a float or tuple. This value is used when + filtering a dataset with ``--max-positions``.""" + return ( + self.frame_sizes[index], + len(self.tgt[index]) if self.tgt is not None else 0, + ) + + def ordered_indices(self): + """Return an ordered list of indices. Batches will be constructed based + on this order.""" + return np.arange(len(self)) diff --git a/examples/speech_recognition/data/collaters.py b/examples/speech_recognition/data/collaters.py new file mode 100644 index 0000000000..16166e55b2 --- /dev/null +++ b/examples/speech_recognition/data/collaters.py @@ -0,0 +1,129 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +""" + This module contains collection of classes which implement + collate functionalities for various tasks. + + Collaters should know what data to expect for each sample + and they should pack / collate them into batches +""" + + +from __future__ import absolute_import, division, print_function, unicode_literals +import numpy as np + +import torch +from fairseq.data import data_utils as fairseq_data_utils + + +class Seq2SeqCollater(object): + """ + Implements collate function mainly for seq2seq tasks + This expects each sample to contain feature (src_tokens) and + targets. 
+ This collator is also used for aligned training task. + """ + + def __init__( + self, + feature_index=0, + label_index=1, + pad_index=1, + eos_index=2, + move_eos_to_beginning=True, + ): + self.feature_index = feature_index + self.label_index = label_index + self.pad_index = pad_index + self.eos_index = eos_index + self.move_eos_to_beginning = move_eos_to_beginning + + def _collate_frames(self, frames): + """Convert a list of 2d frames into a padded 3d tensor + Args: + frames (list): list of 2d frames of size L[i]*f_dim. Where L[i] is + length of i-th frame and f_dim is static dimension of features + Returns: + 3d tensor of size len(frames)*len_max*f_dim where len_max is max of L[i] + """ + len_max = max(frame.size(0) for frame in frames) + f_dim = frames[0].size(1) + res = frames[0].new(len(frames), len_max, f_dim).fill_(0.0) + + for i, v in enumerate(frames): + res[i, : v.size(0)] = v + + return res + + def collate(self, samples): + """ + utility function to collate samples into batch for speech recognition. + """ + if len(samples) == 0: + return {} + + # parse samples into torch tensors + parsed_samples = [] + for s in samples: + # skip invalid samples + if s["data"][self.feature_index] is None: + continue + source = s["data"][self.feature_index] + if isinstance(source, (np.ndarray, np.generic)): + source = torch.from_numpy(source) + target = s["data"][self.label_index] + if isinstance(target, (np.ndarray, np.generic)): + target = torch.from_numpy(target).long() + + parsed_sample = {"id": s["id"], "source": source, "target": target} + parsed_samples.append(parsed_sample) + samples = parsed_samples + + id = torch.LongTensor([s["id"] for s in samples]) + frames = self._collate_frames([s["source"] for s in samples]) + # sort samples by descending number of frames + frames_lengths = torch.LongTensor([s["source"].size(0) for s in samples]) + frames_lengths, sort_order = frames_lengths.sort(descending=True) + id = id.index_select(0, sort_order) + frames = frames.index_select(0, sort_order) + + target = None + target_lengths = None + prev_output_tokens = None + if samples[0].get("target", None) is not None: + ntokens = sum(len(s["target"]) for s in samples) + target = fairseq_data_utils.collate_tokens( + [s["target"] for s in samples], + self.pad_index, + self.eos_index, + left_pad=False, + move_eos_to_beginning=False, + ) + target = target.index_select(0, sort_order) + target_lengths = torch.LongTensor( + [s["target"].size(0) for s in samples] + ).index_select(0, sort_order) + prev_output_tokens = fairseq_data_utils.collate_tokens( + [s["target"] for s in samples], + self.pad_index, + self.eos_index, + left_pad=False, + move_eos_to_beginning=self.move_eos_to_beginning, + ) + prev_output_tokens = prev_output_tokens.index_select(0, sort_order) + else: + ntokens = sum(len(s["source"]) for s in samples) + + batch = { + "id": id, + "ntokens": ntokens, + "net_input": {"src_tokens": frames, "src_lengths": frames_lengths}, + "target": target, + "target_lengths": target_lengths, + "nsentences": len(samples), + } + if prev_output_tokens is not None: + batch["net_input"]["prev_output_tokens"] = prev_output_tokens + return batch diff --git a/examples/speech_recognition/data/data_utils.py b/examples/speech_recognition/data/data_utils.py new file mode 100644 index 0000000000..5380461651 --- /dev/null +++ b/examples/speech_recognition/data/data_utils.py @@ -0,0 +1,60 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch + + +def calc_mean_invstddev(feature): + if len(feature.size()) != 2: + raise ValueError("We expect the input feature to be 2-D tensor") + mean = feature.mean(0) + var = feature.var(0) + # avoid division by ~zero + eps = 1e-8 + if (var < eps).any(): + return mean, 1.0 / (torch.sqrt(var) + eps) + return mean, 1.0 / torch.sqrt(var) + + +def apply_mv_norm(features): + mean, invstddev = calc_mean_invstddev(features) + res = (features - mean) * invstddev + return res + + +def lengths_to_encoder_padding_mask(lengths, batch_first=False): + """ + convert lengths (a 1-D Long/Int tensor) to 2-D binary tensor + + Args: + lengths: a (B, )-shaped tensor + + Return: + max_length: maximum length of B sequences + encoder_padding_mask: a (max_length, B) binary mask, where + [t, b] = 0 for t < lengths[b] and 1 otherwise + + TODO: + kernelize this function if benchmarking shows this function is slow + """ + max_lengths = torch.max(lengths).item() + bsz = lengths.size(0) + encoder_padding_mask = torch.arange( + max_lengths + ).to( # a (T, ) tensor with [0, ..., T-1] + lengths.device + ).view( # move to the right device + 1, max_lengths + ).expand( # reshape to (1, T)-shaped tensor + bsz, -1 + ) >= lengths.view( # expand to (B, T)-shaped tensor + bsz, 1 + ).expand( + -1, max_lengths + ) + if not batch_first: + return encoder_padding_mask.t(), max_lengths + else: + return encoder_padding_mask, max_lengths diff --git a/examples/speech_recognition/datasets/asr_prep_json.py b/examples/speech_recognition/datasets/asr_prep_json.py new file mode 100644 index 0000000000..e4b5d8f52f --- /dev/null +++ b/examples/speech_recognition/datasets/asr_prep_json.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +from __future__ import absolute_import, division, print_function, unicode_literals + +from collections import namedtuple +import concurrent.futures +from itertools import chain +import argparse +import os +import json +import sentencepiece as spm +import multiprocessing +import torchaudio + +from fairseq.data import Dictionary + +MILLISECONDS_TO_SECONDS = 0.001 + + +def process_sample(aud_path, lable, utt_id, sp, tgt_dict): + input = {} + output = {} + si, ei = torchaudio.info(aud_path) + input["length_ms"] = int(si.length / si.channels / si.rate / MILLISECONDS_TO_SECONDS) + input["path"] = aud_path + + token = " ".join(sp.EncodeAsPieces(lable)) + ids = tgt_dict.encode_line(token, append_eos=False) + output["text"] = lable + output["token"] = token + output["tokenid"] = ', '.join(map(str, [t.tolist() for t in ids])) + return {utt_id: {"input": input, "output": output}} + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--audio-dirs", nargs="+", default=['-'], required=True, + help="input directories with audio files") + parser.add_argument("--labels", required=True, + help="aggregated input labels with format per line", + type=argparse.FileType('r', encoding='UTF-8')) + parser.add_argument("--spm-model", required=True, + help="sentencepiece model to use for encoding", + type=argparse.FileType('r', encoding='UTF-8')) + parser.add_argument("--dictionary", required=True, + help="file to load fairseq dictionary from", + type=argparse.FileType('r', encoding='UTF-8')) + parser.add_argument("--audio-format", choices=["flac", "wav"], default="wav") + parser.add_argument("--output", required=True, type=argparse.FileType('w'), + help="path to save json output") + args = parser.parse_args() + + sp = spm.SentencePieceProcessor() + sp.Load(args.spm_model.name) + + tgt_dict = Dictionary.load(args.dictionary) + + labels = {} + for line in args.labels: + (utt_id, label) = line.split(" ", 1) + labels[utt_id] = label + if len(labels) == 0: + raise Exception('No labels found in ', args.labels_path) + + Sample = namedtuple('Sample', 'aud_path utt_id') + samples = [] + for path, _, files in chain.from_iterable(os.walk(path) for path in args.audio_dirs): + for f in files: + if f.endswith(args.audio_format): + if len(os.path.splitext(f)) != 2: + raise Exception('Expect file name. Got: ', f) + utt_id = os.path.splitext(f)[0] + if utt_id not in labels: + continue + samples.append(Sample(os.path.join(path, f), utt_id)) + + utts = {} + num_cpu = multiprocessing.cpu_count() + with concurrent.futures.ThreadPoolExecutor(max_workers=num_cpu) as executor: + future_to_sample = {executor.submit(process_sample, s.aud_path, labels[s.utt_id], s.utt_id, sp, tgt_dict): s for s in samples} + for future in concurrent.futures.as_completed(future_to_sample): + try: + data = future.result() + except Exception as exc: + print('generated an exception: ', exc) + else: + utts.update(data) + json.dump({"utts": utts}, args.output, indent=4) + + +if __name__ == "__main__": + main() diff --git a/examples/speech_recognition/datasets/prepare-librispeech.sh b/examples/speech_recognition/datasets/prepare-librispeech.sh new file mode 100755 index 0000000000..9e9297f089 --- /dev/null +++ b/examples/speech_recognition/datasets/prepare-librispeech.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +# Prepare librispeech dataset + +base_url=www.openslr.org/resources/12 +train_dir=train_960 + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + echo "e.g.: $0 /tmp/librispeech_raw/ ~/data/librispeech_final" + exit 1 +fi + +download_dir=${1%/} +out_dir=${2%/} + +fairseq_root=~/fairseq-py/ +mkdir -p ${out_dir} +cd ${out_dir} || exit + +nbpe=5000 +bpemode=unigram + +if [ ! -d "$fairseq_root" ]; then + echo "$0: Please set correct fairseq_root" + exit 1 +fi + +echo "Data Download" +for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do + url=$base_url/$part.tar.gz + if ! wget -P $download_dir $url; then + echo "$0: wget failed for $url" + exit 1 + fi + if ! tar -C $download_dir -xvzf $download_dir/$part.tar.gz; then + echo "$0: error un-tarring archive $download_dir/$part.tar.gz" + exit 1 + fi +done + +echo "Merge all train packs into one" +mkdir -p ${download_dir}/LibriSpeech/${train_dir}/ +for part in train-clean-100 train-clean-360 train-other-500; do + mv ${download_dir}/LibriSpeech/${part}/* $download_dir/LibriSpeech/${train_dir}/ +done +echo "Merge train text" +find ${download_dir}/LibriSpeech/${train_dir}/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/${train_dir}/text + +# Use combined dev-clean and dev-other as validation set +find ${download_dir}/LibriSpeech/dev-clean/ ${download_dir}/LibriSpeech/dev-other/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/valid_text +find ${download_dir}/LibriSpeech/test-clean/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/test-clean/text +find ${download_dir}/LibriSpeech/test-other/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/test-other/text + + +dict=data/lang_char/${train_dir}_${bpemode}${nbpe}_units.txt +encoded=data/lang_char/${train_dir}_${bpemode}${nbpe}_encoded.txt +fairseq_dict=data/lang_char/${train_dir}_${bpemode}${nbpe}_fairseq_dict.txt +bpemodel=data/lang_char/${train_dir}_${bpemode}${nbpe} +echo "dictionary: ${dict}" +echo "Dictionary preparation" +mkdir -p data/lang_char/ +echo " 3" > ${dict} +echo " 2" >> ${dict} +echo " 1" >> ${dict} +cut -f 2- -d" " ${download_dir}/LibriSpeech/${train_dir}/text > data/lang_char/input.txt +spm_train --input=data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 --unk_id=3 --eos_id=2 --pad_id=1 --bos_id=-1 --character_coverage=1 +spm_encode --model=${bpemodel}.model --output_format=piece < data/lang_char/input.txt > ${encoded} +cat ${encoded} | tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+3}' >> ${dict} +cat ${encoded} | tr ' ' '\n' | sort | uniq -c | awk '{print $2 " " $1}' > ${fairseq_dict} +wc -l ${dict} + +echo "Prepare train and test jsons" +for part in train_960 test-other test-clean; do + python ${fairseq_root}/examples/speech_recognition/datasets/asr_prep_json.py --audio-dirs ${download_dir}/LibriSpeech/${part} --labels ${download_dir}/LibriSpeech/${part}/text --spm-model ${bpemodel}.model --audio-format flac --dictionary ${fairseq_dict} --output ${part}.json +done +# fairseq expects to find train.json and valid.json during training +mv train_960.json train.json + +echo "Prepare valid json" +python ${fairseq_root}/examples/speech_recognition/datasets/asr_prep_json.py --audio-dirs ${download_dir}/LibriSpeech/dev-clean ${download_dir}/LibriSpeech/dev-other --labels ${download_dir}/LibriSpeech/valid_text --spm-model ${bpemodel}.model --audio-format flac --dictionary ${fairseq_dict} --output valid.json + +cp 
${fairseq_dict} ./dict.txt +cp ${bpemodel}.model ./spm.model diff --git a/examples/speech_recognition/infer.py b/examples/speech_recognition/infer.py new file mode 100644 index 0000000000..ce5f4f7654 --- /dev/null +++ b/examples/speech_recognition/infer.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +""" +Run inference for pre-processed data with a trained model. +""" + +import logging +import os + +import sentencepiece as spm +import torch +from fairseq import options, progress_bar, utils, tasks +from fairseq.meters import StopwatchMeter, TimeMeter +from fairseq.utils import import_user_module + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def add_asr_eval_argument(parser): + parser.add_argument("--ctc", action="store_true", help="decode a ctc model") + parser.add_argument("--rnnt", default=False, help="decode a rnnt model") + parser.add_argument("--kspmodel", default=None, help="sentence piece model") + parser.add_argument( + "--wfstlm", default=None, help="wfstlm on dictonary output units" + ) + parser.add_argument( + "--rnnt_decoding_type", + default="greedy", + help="wfstlm on dictonary\ +output units", + ) + parser.add_argument( + "--lm_weight", + default=0.2, + help="weight for wfstlm while interpolating\ +with neural score", + ) + parser.add_argument( + "--rnnt_len_penalty", default=-0.5, help="rnnt length penalty on word level" + ) + return parser + + +def check_args(args): + assert args.path is not None, "--path required for generation!" + assert args.results_path is not None, "--results_path required for generation!" + assert ( + not args.sampling or args.nbest == args.beam + ), "--sampling requires --nbest to be equal to --beam" + assert ( + args.replace_unk is None or args.raw_text + ), "--replace-unk requires a raw text dataset (--raw-text)" + + +def get_dataset_itr(args, task): + return task.get_batch_iterator( + dataset=task.dataset(args.gen_subset), + max_tokens=args.max_tokens, + max_sentences=args.max_sentences, + max_positions=(1000000.0, 1000000.0), + ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, + required_batch_size_multiple=args.required_batch_size_multiple, + num_shards=args.num_shards, + shard_id=args.shard_id, + num_workers=args.num_workers, + ).next_epoch_itr(shuffle=False) + + +def process_predictions(args, hypos, sp, tgt_dict, target_tokens, res_files, speaker, id): + for hypo in hypos[: min(len(hypos), args.nbest)]: + hyp_pieces = tgt_dict.string(hypo["tokens"].int().cpu()) + hyp_words = sp.DecodePieces(hyp_pieces.split()) + print( + "{} ({}-{})".format(hyp_pieces, speaker, id), + file=res_files["hypo.units"], + ) + print( + "{} ({}-{})".format(hyp_words, speaker, id), + file=res_files["hypo.words"], + ) + + tgt_pieces = tgt_dict.string(target_tokens) + tgt_words = sp.DecodePieces(tgt_pieces.split()) + print( + "{} ({}-{})".format(tgt_pieces, speaker, id), + file=res_files["ref.units"], + ) + print( + "{} ({}-{})".format(tgt_words, speaker, id), + file=res_files["ref.words"], + ) + # only score top hypothesis + if not args.quiet: + logger.debug("HYPO:" + hyp_words) + logger.debug("TARGET:" + tgt_words) + logger.debug("___________________") + + +def prepare_result_files(args): + def get_res_file(file_prefix): + path = os.path.join( + args.results_path, + "{}-{}-{}.txt".format( + file_prefix, os.path.basename(args.path), 
args.gen_subset + ), + ) + return open(path, "w", buffering=1) + + return { + "hypo.words": get_res_file("hypo.word"), + "hypo.units": get_res_file("hypo.units"), + "ref.words": get_res_file("ref.word"), + "ref.units": get_res_file("ref.units"), + } + + +def optimize_models(args, use_cuda, models): + """Optimize ensemble for generation + """ + for model in models: + model.make_generation_fast_( + beamable_mm_beam_size=None if args.no_beamable_mm else args.beam, + need_attn=args.print_alignment, + ) + if args.fp16: + model.half() + if use_cuda: + model.cuda() + + +def main(args): + check_args(args) + import_user_module(args) + + if args.max_tokens is None and args.max_sentences is None: + args.max_tokens = 30000 + logger.info(args) + + use_cuda = torch.cuda.is_available() and not args.cpu + + # Load dataset splits + task = tasks.setup_task(args) + task.load_dataset(args.gen_subset) + logger.info( + "| {} {} {} examples".format( + args.data, args.gen_subset, len(task.dataset(args.gen_subset)) + ) + ) + + # Set dictionary + tgt_dict = task.target_dictionary + + if args.ctc or args.rnnt: + tgt_dict.add_symbol("") + if args.ctc: + logger.info("| decoding a ctc model") + if args.rnnt: + logger.info("| decoding a rnnt model") + + # Load ensemble + logger.info("| loading model(s) from {}".format(args.path)) + models, _model_args = utils.load_ensemble_for_inference( + args.path.split(":"), + task, + model_arg_overrides=eval(args.model_overrides), # noqa + ) + optimize_models(args, use_cuda, models) + + # Load dataset (possibly sharded) + itr = get_dataset_itr(args, task) + + # Initialize generator + gen_timer = StopwatchMeter() + generator = task.build_generator(args) + + num_sentences = 0 + + if not os.path.exists(args.results_path): + os.makedirs(args.results_path) + + sp = spm.SentencePieceProcessor() + sp.Load(os.path.join(args.data, 'spm.model')) + + res_files = prepare_result_files(args) + with progress_bar.build_progress_bar(args, itr) as t: + wps_meter = TimeMeter() + for sample in t: + sample = utils.move_to_cuda(sample) if use_cuda else sample + if "net_input" not in sample: + continue + + prefix_tokens = None + if args.prefix_size > 0: + prefix_tokens = sample["target"][:, : args.prefix_size] + + gen_timer.start() + hypos = task.inference_step(generator, models, sample, prefix_tokens) + num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos) + gen_timer.stop(num_generated_tokens) + + for i, sample_id in enumerate(sample['id'].tolist()): + speaker = task.dataset(args.gen_subset).speakers[int(sample_id)] + id = task.dataset(args.gen_subset).ids[int(sample_id)] + target_tokens = ( + utils.strip_pad(sample["target"][i, :], tgt_dict.pad()).int().cpu() + ) + # Process top predictions + process_predictions( + args, hypos[i], sp, tgt_dict, target_tokens, res_files, speaker, id + ) + + wps_meter.update(num_generated_tokens) + t.log({"wps": round(wps_meter.avg)}) + num_sentences += sample["nsentences"] + + logger.info( + "| Processed {} sentences ({} tokens) in {:.1f}s ({:.2f}" + "sentences/s, {:.2f} tokens/s)".format( + num_sentences, + gen_timer.n, + gen_timer.sum, + num_sentences / gen_timer.sum, + 1.0 / gen_timer.avg, + ) + ) + logger.info("| Generate {} with beam={}".format(args.gen_subset, args.beam)) + + +def cli_main(): + parser = options.get_generation_parser() + parser = add_asr_eval_argument(parser) + args = options.parse_args_and_arch(parser) + main(args) + + +if __name__ == "__main__": + cli_main() diff --git a/examples/speech_recognition/models/__init__.py 
b/examples/speech_recognition/models/__init__.py new file mode 100644 index 0000000000..66ad2b0a1f --- /dev/null +++ b/examples/speech_recognition/models/__init__.py @@ -0,0 +1,7 @@ +import importlib +import os + +for file in os.listdir(os.path.dirname(__file__)): + if file.endswith('.py') and not file.startswith('_'): + model_name = file[:file.find('.py')] + importlib.import_module('examples.speech_recognition.models.' + model_name) diff --git a/examples/speech_recognition/models/vggtransformer.py b/examples/speech_recognition/models/vggtransformer.py new file mode 100644 index 0000000000..7b208a3b91 --- /dev/null +++ b/examples/speech_recognition/models/vggtransformer.py @@ -0,0 +1,838 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import math +from collections.abc import Iterable + +import torch +import torch.nn as nn +from fairseq import utils +from fairseq.models import ( + FairseqEncoder, + FairseqIncrementalDecoder, + FairseqModel, + register_model, + register_model_architecture, +) +from fairseq.modules import LinearizedConvolution +from examples.speech_recognition.data.data_utils import lengths_to_encoder_padding_mask +from fairseq.modules import TransformerDecoderLayer, TransformerEncoderLayer, VGGBlock + + +@register_model("asr_vggtransformer") +class VGGTransformerModel(FairseqModel): + """ + Transformers with convolutional context for ASR + https://arxiv.org/abs/1904.11660 + """ + def __init__(self, encoder, decoder): + super().__init__(encoder, decoder) + + @staticmethod + def add_args(parser): + """Add model-specific arguments to the parser.""" + parser.add_argument( + "--input-feat-per-channel", + type=int, + metavar="N", + help="encoder input dimension per input channel", + ) + parser.add_argument( + "--vggblock-enc-config", + type=str, + metavar="EXPR", + help=""" + an array of tuples each containing the configuration of one vggblock: + [(out_channels, + conv_kernel_size, + pooling_kernel_size, + num_conv_layers, + use_layer_norm), ...]) + """, + ) + parser.add_argument( + "--transformer-enc-config", + type=str, + metavar="EXPR", + help="""" + a tuple containing the configuration of the encoder transformer layers + configurations: + [(input_dim, + num_heads, + ffn_dim, + normalize_before, + dropout, + attention_dropout, + relu_dropout), ...]') + """, + ) + parser.add_argument( + "--enc-output-dim", + type=int, + metavar="N", + help=""" + encoder output dimension, can be None. If specified, projecting the + transformer output to the specified dimension""", + ) + parser.add_argument( + "--in-channels", + type=int, + metavar="N", + help="number of encoder input channels", + ) + parser.add_argument( + "--tgt-embed-dim", + type=int, + metavar="N", + help="embedding dimension of the decoder target tokens", + ) + parser.add_argument( + "--transformer-dec-config", + type=str, + metavar="EXPR", + help=""" + a tuple containing the configuration of the decoder transformer layers + configurations: + [(input_dim, + num_heads, + ffn_dim, + normalize_before, + dropout, + attention_dropout, + relu_dropout), ...] 
+ """, + ) + parser.add_argument( + "--conv-dec-config", + type=str, + metavar="EXPR", + help=""" + an array of tuples for the decoder 1-D convolution config + [(out_channels, conv_kernel_size, use_layer_norm), ...]""", + ) + + @classmethod + def build_encoder(cls, args, task): + return VGGTransformerEncoder( + input_feat_per_channel=args.input_feat_per_channel, + vggblock_config=eval(args.vggblock_enc_config), + transformer_config=eval(args.transformer_enc_config), + encoder_output_dim=args.enc_output_dim, + in_channels=args.in_channels, + ) + + @classmethod + def build_decoder(cls, args, task): + return TransformerDecoder( + dictionary=task.target_dictionary, + embed_dim=args.tgt_embed_dim, + transformer_config=eval(args.transformer_dec_config), + conv_config=eval(args.conv_dec_config), + encoder_output_dim=args.enc_output_dim, + ) + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + # make sure that all args are properly defaulted + # (in case there are any new ones) + base_architecture(args) + + encoder = cls.build_encoder(args, task) + decoder = cls.build_decoder(args, task) + return cls(encoder, decoder) + + def get_normalized_probs(self, net_output, log_probs, sample=None): + # net_output['encoder_out'] is a (B, T, D) tensor + lprobs = super().get_normalized_probs(net_output, log_probs, sample) + lprobs.batch_first = True + return lprobs + + +DEFAULT_ENC_VGGBLOCK_CONFIG = ((32, 3, 2, 2, False),) * 2 +DEFAULT_ENC_TRANSFORMER_CONFIG = ((256, 4, 1024, True, 0.2, 0.2, 0.2),) * 2 +# 256: embedding dimension +# 4: number of heads +# 1024: FFN +# True: apply layerNorm before (dropout + resiaul) instead of after +# 0.2 (dropout): dropout after MultiheadAttention and second FC +# 0.2 (attention_dropout): dropout in MultiheadAttention +# 0.2 (relu_dropout): dropout after ReLu +DEFAULT_DEC_TRANSFORMER_CONFIG = ((256, 2, 1024, True, 0.2, 0.2, 0.2),) * 2 +DEFAULT_DEC_CONV_CONFIG = ((256, 3, True),) * 2 + + +# TODO: repace transformer encoder config from one liner +# to explicit args to get rid of this transformation +def prepare_transformer_encoder_params( + input_dim, + num_heads, + ffn_dim, + normalize_before, + dropout, + attention_dropout, + relu_dropout, +): + args = argparse.Namespace() + args.encoder_embed_dim = input_dim + args.encoder_attention_heads = num_heads + args.attention_dropout = attention_dropout + args.dropout = dropout + args.activation_dropout = relu_dropout + args.encoder_normalize_before = normalize_before + args.encoder_ffn_embed_dim = ffn_dim + return args + + +def prepare_transformer_decoder_params( + input_dim, + num_heads, + ffn_dim, + normalize_before, + dropout, + attention_dropout, + relu_dropout, +): + args = argparse.Namespace() + args.decoder_embed_dim = input_dim + args.decoder_attention_heads = num_heads + args.attention_dropout = attention_dropout + args.dropout = dropout + args.activation_dropout = relu_dropout + args.decoder_normalize_before = normalize_before + args.decoder_ffn_embed_dim = ffn_dim + return args + + +class VGGTransformerEncoder(FairseqEncoder): + """VGG + Transformer encoder""" + + def __init__( + self, + input_feat_per_channel, + vggblock_config=DEFAULT_ENC_VGGBLOCK_CONFIG, + transformer_config=DEFAULT_ENC_TRANSFORMER_CONFIG, + encoder_output_dim=512, + in_channels=1, + transformer_context=None, + transformer_sampling=None, + ): + """constructor for VGGTransformerEncoder + + Args: + - input_feat_per_channel: feature dim (not including stacked, + just base feature) + - in_channel: # input channels 
(e.g., if stack 8 feature vector + together, this is 8) + - vggblock_config: configuration of vggblock, see comments on + DEFAULT_ENC_VGGBLOCK_CONFIG + - transformer_config: configuration of transformer layer, see comments + on DEFAULT_ENC_TRANSFORMER_CONFIG + - encoder_output_dim: final transformer output embedding dimension + - transformer_context: (left, right) if set, self-attention will be focused + on (t-left, t+right) + - transformer_sampling: an iterable of int, must match with + len(transformer_config), transformer_sampling[i] indicates sampling + factor for i-th transformer layer, after multihead att and feedfoward + part + """ + super().__init__(None) + + self.num_vggblocks = 0 + if vggblock_config is not None: + if not isinstance(vggblock_config, Iterable): + raise ValueError("vggblock_config is not iterable") + self.num_vggblocks = len(vggblock_config) + + self.conv_layers = nn.ModuleList() + self.in_channels = in_channels + self.input_dim = input_feat_per_channel + + if vggblock_config is not None: + for _, config in enumerate(vggblock_config): + ( + out_channels, + conv_kernel_size, + pooling_kernel_size, + num_conv_layers, + layer_norm, + ) = config + self.conv_layers.append( + VGGBlock( + in_channels, + out_channels, + conv_kernel_size, + pooling_kernel_size, + num_conv_layers, + input_dim=input_feat_per_channel, + layer_norm=layer_norm, + ) + ) + in_channels = out_channels + input_feat_per_channel = self.conv_layers[-1].output_dim + + transformer_input_dim = self.infer_conv_output_dim( + self.in_channels, self.input_dim + ) + # transformer_input_dim is the output dimension of VGG part + + self.validate_transformer_config(transformer_config) + self.transformer_context = self.parse_transformer_context(transformer_context) + self.transformer_sampling = self.parse_transformer_sampling( + transformer_sampling, len(transformer_config) + ) + + self.transformer_layers = nn.ModuleList() + + if transformer_input_dim != transformer_config[0][0]: + self.transformer_layers.append( + Linear(transformer_input_dim, transformer_config[0][0]) + ) + self.transformer_layers.append( + TransformerEncoderLayer( + prepare_transformer_encoder_params(*transformer_config[0]) + ) + ) + + for i in range(1, len(transformer_config)): + if transformer_config[i - 1][0] != transformer_config[i][0]: + self.transformer_layers.append( + Linear(transformer_config[i - 1][0], transformer_config[i][0]) + ) + self.transformer_layers.append( + TransformerEncoderLayer( + prepare_transformer_encoder_params(*transformer_config[i]) + ) + ) + + self.encoder_output_dim = encoder_output_dim + self.transformer_layers.extend( + [ + Linear(transformer_config[-1][0], encoder_output_dim), + LayerNorm(encoder_output_dim), + ] + ) + + def forward(self, src_tokens, src_lengths, **kwargs): + """ + src_tokens: padded tensor (B, T, C * feat) + src_lengths: tensor of original lengths of input utterances (B,) + """ + bsz, max_seq_len, _ = src_tokens.size() + x = src_tokens.view(bsz, max_seq_len, self.in_channels, self.input_dim) + x = x.transpose(1, 2).contiguous() + # (B, C, T, feat) + + for layer_idx in range(len(self.conv_layers)): + x = self.conv_layers[layer_idx](x) + + bsz, _, output_seq_len, _ = x.size() + + # (B, C, T, feat) -> (B, T, C, feat) -> (T, B, C, feat) -> (T, B, C * feat) + x = x.transpose(1, 2).transpose(0, 1) + x = x.contiguous().view(output_seq_len, bsz, -1) + + subsampling_factor = int(max_seq_len * 1.0 / output_seq_len + 0.5) + # TODO: shouldn't subsampling_factor determined in advance ? 
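+        # Example: with the default two VGG blocks (pooling_kernel_size=2 each),
+        # output_seq_len is roughly max_seq_len / 4, so subsampling_factor
+        # resolves to 4 and the original utterance lengths are rescaled below
+        # to the subsampled time axis before the padding mask is built.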
+ input_lengths = (src_lengths.float() / subsampling_factor).ceil().long() + + encoder_padding_mask, _ = lengths_to_encoder_padding_mask( + input_lengths, batch_first=True + ) + if not encoder_padding_mask.any(): + encoder_padding_mask = None + + attn_mask = self.lengths_to_attn_mask(input_lengths, subsampling_factor) + + transformer_layer_idx = 0 + + for layer_idx in range(len(self.transformer_layers)): + + if isinstance(self.transformer_layers[layer_idx], TransformerEncoderLayer): + x = self.transformer_layers[layer_idx]( + x, encoder_padding_mask, attn_mask + ) + + if self.transformer_sampling[transformer_layer_idx] != 1: + sampling_factor = self.transformer_sampling[transformer_layer_idx] + x, encoder_padding_mask, attn_mask = self.slice( + x, encoder_padding_mask, attn_mask, sampling_factor + ) + + transformer_layer_idx += 1 + + else: + x = self.transformer_layers[layer_idx](x) + + # encoder_padding_maks is a (T x B) tensor, its [t, b] elements indicate + # whether encoder_output[t, b] is valid or not (valid=0, invalid=1) + + return { + "encoder_out": x, # (T, B, C) + "encoder_padding_mask": encoder_padding_mask.t() + if encoder_padding_mask is not None + else None, + # (B, T) --> (T, B) + } + + def infer_conv_output_dim(self, in_channels, input_dim): + sample_seq_len = 200 + sample_bsz = 10 + x = torch.randn(sample_bsz, in_channels, sample_seq_len, input_dim) + for i, _ in enumerate(self.conv_layers): + x = self.conv_layers[i](x) + x = x.transpose(1, 2) + mb, seq = x.size()[:2] + return x.contiguous().view(mb, seq, -1).size(-1) + + def validate_transformer_config(self, transformer_config): + for config in transformer_config: + input_dim, num_heads = config[:2] + if input_dim % num_heads != 0: + msg = ( + "ERROR in transformer config {}:".format(config) + + "input dimension {} ".format(input_dim) + + "not dividable by number of heads".format(num_heads) + ) + raise ValueError(msg) + + def parse_transformer_context(self, transformer_context): + """ + transformer_context can be the following: + - None; indicates no context is used, i.e., + transformer can access full context + - a tuple/list of two int; indicates left and right context, + any number <0 indicates infinite context + * e.g., (5, 6) indicates that for query at x_t, transformer can + access [t-5, t+6] (inclusive) + * e.g., (-1, 6) indicates that for query at x_t, transformer can + access [0, t+6] (inclusive) + """ + if transformer_context is None: + return None + + if not isinstance(transformer_context, Iterable): + raise ValueError("transformer context must be Iterable if it is not None") + + if len(transformer_context) != 2: + raise ValueError("transformer context must have length 2") + + left_context = transformer_context[0] + if left_context < 0: + left_context = None + + right_context = transformer_context[1] + if right_context < 0: + right_context = None + + if left_context is None and right_context is None: + return None + + return (left_context, right_context) + + def parse_transformer_sampling(self, transformer_sampling, num_layers): + """ + parsing transformer sampling configuration + + Args: + - transformer_sampling, accepted input: + * None, indicating no sampling + * an Iterable with int (>0) as element + - num_layers, expected number of transformer layers, must match with + the length of transformer_sampling if it is not None + + Returns: + - A tuple with length num_layers + """ + if transformer_sampling is None: + return (1,) * num_layers + + if not isinstance(transformer_sampling, Iterable): + raise 
ValueError( + "transformer_sampling must be an iterable if it is not None" + ) + + if len(transformer_sampling) != num_layers: + raise ValueError( + "transformer_sampling {} does not match with the number " + + "of layers {}".format(transformer_sampling, num_layers) + ) + + for layer, value in enumerate(transformer_sampling): + if not isinstance(value, int): + raise ValueError("Invalid value in transformer_sampling: ") + if value < 1: + raise ValueError( + "{} layer's subsampling is {}.".format(layer, value) + + " This is not allowed! " + ) + return transformer_sampling + + def slice(self, embedding, padding_mask, attn_mask, sampling_factor): + """ + embedding is a (T, B, D) tensor + padding_mask is a (B, T) tensor or None + attn_mask is a (T, T) tensor or None + """ + embedding = embedding[::sampling_factor, :, :] + if padding_mask is not None: + padding_mask = padding_mask[:, ::sampling_factor] + if attn_mask is not None: + attn_mask = attn_mask[::sampling_factor, ::sampling_factor] + + return embedding, padding_mask, attn_mask + + def lengths_to_attn_mask(self, input_lengths, subsampling_factor=1): + """ + create attention mask according to sequence lengths and transformer + context + + Args: + - input_lengths: (B, )-shape Int/Long tensor; input_lengths[b] is + the length of b-th sequence + - subsampling_factor: int + * Note that the left_context and right_context is specified in + the input frame-level while input to transformer may already + go through subsampling (e.g., the use of striding in vggblock) + we use subsampling_factor to scale the left/right context + + Return: + - a (T, T) binary tensor or None, where T is max(input_lengths) + * if self.transformer_context is None, None + * if left_context is None, + * attn_mask[t, t + right_context + 1:] = 1 + * others = 0 + * if right_context is None, + * attn_mask[t, 0:t - left_context] = 1 + * others = 0 + * elsif + * attn_mask[t, t - left_context: t + right_context + 1] = 0 + * others = 1 + """ + if self.transformer_context is None: + return None + + maxT = torch.max(input_lengths).item() + attn_mask = torch.zeros(maxT, maxT) + + left_context = self.transformer_context[0] + right_context = self.transformer_context[1] + if left_context is not None: + left_context = math.ceil(self.transformer_context[0] / subsampling_factor) + if right_context is not None: + right_context = math.ceil(self.transformer_context[1] / subsampling_factor) + + for t in range(maxT): + if left_context is not None: + st = 0 + en = max(st, t - left_context) + attn_mask[t, st:en] = 1 + if right_context is not None: + st = t + right_context + 1 + st = min(st, maxT - 1) + attn_mask[t, st:] = 1 + + return attn_mask.to(input_lengths.device) + + def reorder_encoder_out(self, encoder_out, new_order): + encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select( + 1, new_order + ) + if encoder_out["encoder_padding_mask"] is not None: + encoder_out["encoder_padding_mask"] = encoder_out[ + "encoder_padding_mask" + ].index_select(1, new_order) + return encoder_out + + +class TransformerDecoder(FairseqIncrementalDecoder): + """ + Transformer decoder consisting of *args.decoder_layers* layers. Each layer + is a :class:`TransformerDecoderLayer`. + Args: + args (argparse.Namespace): parsed command-line arguments + dictionary (~fairseq.data.Dictionary): decoding dictionary + embed_tokens (torch.nn.Embedding): output embedding + no_encoder_attn (bool, optional): whether to attend to encoder outputs. 
+ Default: ``False`` + left_pad (bool, optional): whether the input is left-padded. Default: + ``False`` + """ + + def __init__( + self, + dictionary, + embed_dim=512, + transformer_config=DEFAULT_ENC_TRANSFORMER_CONFIG, + conv_config=DEFAULT_DEC_CONV_CONFIG, + encoder_output_dim=512, + ): + + super().__init__(dictionary) + vocab_size = len(dictionary) + self.padding_idx = dictionary.pad() + self.embed_tokens = Embedding(vocab_size, embed_dim, self.padding_idx) + + self.conv_layers = nn.ModuleList() + for i in range(len(conv_config)): + out_channels, kernel_size, layer_norm = conv_config[i] + if i == 0: + conv_layer = LinearizedConv1d( + embed_dim, out_channels, kernel_size, padding=kernel_size - 1 + ) + else: + conv_layer = LinearizedConv1d( + conv_config[i - 1][0], + out_channels, + kernel_size, + padding=kernel_size - 1, + ) + self.conv_layers.append(conv_layer) + if layer_norm: + self.conv_layers.append(nn.LayerNorm(out_channels)) + self.conv_layers.append(nn.ReLU()) + + self.layers = nn.ModuleList() + if conv_config[-1][0] != transformer_config[0][0]: + self.layers.append(Linear(conv_config[-1][0], transformer_config[0][0])) + self.layers.append(TransformerDecoderLayer( + prepare_transformer_decoder_params(*transformer_config[0]) + )) + + for i in range(1, len(transformer_config)): + if transformer_config[i - 1][0] != transformer_config[i][0]: + self.layers.append( + Linear(transformer_config[i - 1][0], transformer_config[i][0]) + ) + self.layers.append(TransformerDecoderLayer( + prepare_transformer_decoder_params(*transformer_config[i]) + )) + self.fc_out = Linear(transformer_config[-1][0], vocab_size) + + def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None): + """ + Args: + prev_output_tokens (LongTensor): previous decoder outputs of shape + `(batch, tgt_len)`, for input feeding/teacher forcing + encoder_out (Tensor, optional): output from the encoder, used for + encoder-side attention + incremental_state (dict): dictionary used for storing state during + :ref:`Incremental decoding` + Returns: + tuple: + - the last decoder layer's output of shape `(batch, tgt_len, + vocab)` + - the last decoder layer's attention weights of shape `(batch, + tgt_len, src_len)` + """ + target_padding_mask = ( + (prev_output_tokens == self.padding_idx).to(prev_output_tokens.device) + if incremental_state is None + else None + ) + + if incremental_state is not None: + prev_output_tokens = prev_output_tokens[:, -1:] + + # embed tokens + x = self.embed_tokens(prev_output_tokens) + + # B x T x C -> T x B x C + x = self._transpose_if_training(x, incremental_state) + + for layer in self.conv_layers: + if isinstance(layer, LinearizedConvolution): + x = layer(x, incremental_state) + else: + x = layer(x) + + # B x T x C -> T x B x C + x = self._transpose_if_inference(x, incremental_state) + + # decoder layers + for layer in self.layers: + if isinstance(layer, TransformerDecoderLayer): + x, _ = layer( + x, + (encoder_out["encoder_out"] if encoder_out is not None else None), + ( + encoder_out["encoder_padding_mask"].t() + if encoder_out["encoder_padding_mask"] is not None + else None + ), + incremental_state, + self_attn_mask=( + self.buffered_future_mask(x) + if incremental_state is None + else None + ), + self_attn_padding_mask=( + target_padding_mask if incremental_state is None else None + ), + ) + else: + x = layer(x) + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + x = self.fc_out(x) + + return x, None + + def buffered_future_mask(self, tensor): + dim = tensor.size(0) + if 
( + not hasattr(self, "_future_mask") + or self._future_mask is None + or self._future_mask.device != tensor.device + ): + self._future_mask = torch.triu( + utils.fill_with_neg_inf(tensor.new(dim, dim)), 1 + ) + if self._future_mask.size(0) < dim: + self._future_mask = torch.triu( + utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1 + ) + return self._future_mask[:dim, :dim] + + def _transpose_if_training(self, x, incremental_state): + if incremental_state is None: + x = x.transpose(0, 1) + return x + + def _transpose_if_inference(self, x, incremental_state): + if incremental_state: + x = x.transpose(0, 1) + return x + + +def Embedding(num_embeddings, embedding_dim, padding_idx): + m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) + # nn.init.uniform_(m.weight, -0.1, 0.1) + # nn.init.constant_(m.weight[padding_idx], 0) + return m + + +def Linear(in_features, out_features, bias=True, dropout=0): + """Linear layer (input: N x T x C)""" + m = nn.Linear(in_features, out_features, bias=bias) + # m.weight.data.uniform_(-0.1, 0.1) + # if bias: + # m.bias.data.uniform_(-0.1, 0.1) + return m + + +def LinearizedConv1d(in_channels, out_channels, kernel_size, dropout=0, **kwargs): + """Weight-normalized Conv1d layer optimized for decoding""" + m = LinearizedConvolution(in_channels, out_channels, kernel_size, **kwargs) + std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels)) + nn.init.normal_(m.weight, mean=0, std=std) + nn.init.constant_(m.bias, 0) + return nn.utils.weight_norm(m, dim=2) + + +def LayerNorm(embedding_dim): + m = nn.LayerNorm(embedding_dim) + return m + + +# seq2seq models +def base_architecture(args): + args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 40) + args.vggblock_enc_config = getattr( + args, "vggblock_enc_config", DEFAULT_ENC_VGGBLOCK_CONFIG + ) + args.transformer_enc_config = getattr( + args, "transformer_enc_config", DEFAULT_ENC_TRANSFORMER_CONFIG + ) + args.enc_output_dim = getattr(args, "enc_output_dim", 512) + args.in_channels = getattr(args, "in_channels", 1) + args.tgt_embed_dim = getattr(args, "tgt_embed_dim", 128) + args.transformer_dec_config = getattr( + args, "transformer_dec_config", DEFAULT_ENC_TRANSFORMER_CONFIG + ) + args.conv_dec_config = getattr(args, "conv_dec_config", DEFAULT_DEC_CONV_CONFIG) + args.transformer_context = getattr(args, "transformer_context", "None") + + +@register_model_architecture("asr_vggtransformer", "vggtransformer_1") +def vggtransformer_1(args): + args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 80) + args.vggblock_enc_config = getattr( + args, "vggblock_enc_config", "[(64, 3, 2, 2, True), (128, 3, 2, 2, True)]" + ) + args.transformer_enc_config = getattr( + args, + "transformer_enc_config", + "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 14", + ) + args.enc_output_dim = getattr(args, "enc_output_dim", 1024) + args.tgt_embed_dim = getattr(args, "tgt_embed_dim", 128) + args.conv_dec_config = getattr(args, "conv_dec_config", "((256, 3, True),) * 4") + args.transformer_dec_config = getattr( + args, + "transformer_dec_config", + "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 4", + ) + + +@register_model_architecture("asr_vggtransformer", "vggtransformer_2") +def vggtransformer_2(args): + args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 80) + args.vggblock_enc_config = getattr( + args, "vggblock_enc_config", "[(64, 3, 2, 2, True), (128, 3, 2, 2, True)]" + ) + args.transformer_enc_config = getattr( + args, + 
"transformer_enc_config", + "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 16", + ) + args.enc_output_dim = getattr(args, "enc_output_dim", 1024) + args.tgt_embed_dim = getattr(args, "tgt_embed_dim", 512) + args.conv_dec_config = getattr(args, "conv_dec_config", "((256, 3, True),) * 4") + args.transformer_dec_config = getattr( + args, + "transformer_dec_config", + "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 6", + ) + + +@register_model_architecture("asr_vggtransformer", "vggtransformer_base") +def vggtransformer_base(args): + args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 80) + args.vggblock_enc_config = getattr( + args, "vggblock_enc_config", "[(64, 3, 2, 2, True), (128, 3, 2, 2, True)]" + ) + args.transformer_enc_config = getattr( + args, "transformer_enc_config", "((512, 8, 2048, True, 0.15, 0.15, 0.15),) * 12" + ) + + args.enc_output_dim = getattr(args, "enc_output_dim", 512) + args.tgt_embed_dim = getattr(args, "tgt_embed_dim", 512) + args.conv_dec_config = getattr(args, "conv_dec_config", "((256, 3, True),) * 4") + args.transformer_dec_config = getattr( + args, "transformer_dec_config", "((512, 8, 2048, True, 0.15, 0.15, 0.15),) * 6" + ) + # Size estimations: + # Encoder: + # - vggblock param: 64*1*3*3 + 64*64*3*3 + 128*64*3*3 + 128*128*3 = 258K + # Transformer: + # - input dimension adapter: 2560 x 512 -> 1.31M + # - transformer_layers (x12) --> 37.74M + # * MultiheadAttention: 512*512*3 (in_proj) + 512*512 (out_proj) = 1.048M + # * FFN weight: 512*2048*2 = 2.097M + # - output dimension adapter: 512 x 512 -> 0.26 M + # Decoder: + # - LinearizedConv1d: 512 * 256 * 3 + 256 * 256 * 3 * 3 + # - transformer_layer: (x6) --> 25.16M + # * MultiheadAttention (self-attention): 512*512*3 + 512*512 = 1.048M + # * MultiheadAttention (encoder-attention): 512*512*3 + 512*512 = 1.048M + # * FFN: 512*2048*2 = 2.097M + # Final FC: + # - FC: 512*5000 = 256K (assuming vocab size 5K) + # In total: + # ~65 M diff --git a/examples/speech_recognition/tasks/__init__.py b/examples/speech_recognition/tasks/__init__.py new file mode 100644 index 0000000000..fb9e98372d --- /dev/null +++ b/examples/speech_recognition/tasks/__init__.py @@ -0,0 +1,7 @@ +import importlib +import os + +for file in os.listdir(os.path.dirname(__file__)): + if file.endswith('.py') and not file.startswith('_'): + task_name = file[:file.find('.py')] + importlib.import_module('examples.speech_recognition.tasks.' + task_name) diff --git a/examples/speech_recognition/tasks/speech_recognition.py b/examples/speech_recognition/tasks/speech_recognition.py new file mode 100644 index 0000000000..8c974aa720 --- /dev/null +++ b/examples/speech_recognition/tasks/speech_recognition.py @@ -0,0 +1,116 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import json +import os +import re + +import torch +from fairseq.data import Dictionary +from fairseq.tasks import FairseqTask, register_task +from examples.speech_recognition.data import AsrDataset + + +def get_asr_dataset_from_json(data_json_path, tgt_dict): + """ + Parse data json and create dataset. + See scripts/asr_prep_json.py which pack json from raw files + + Json example: + { + "utts": { + "4771-29403-0025": { + "input": { + "length_ms": 170, + "path": "/tmp/file1.flac" + }, + "output": { + "text": "HELLO \n", + "token": "HE LLO", + "tokenid": "4815, 861" + } + }, + "1564-142299-0096": { + ... 
+ } + } + """ + if not os.path.isfile(data_json_path): + raise FileNotFoundError("Dataset not found: {}".format(data_json_path)) + with open(data_json_path, "rb") as f: + data_samples = json.load(f)["utts"] + assert len(data_samples) != 0 + sorted_samples = sorted( + data_samples.items(), + key=lambda sample: int(sample[1]["input"]["length_ms"]), + reverse=True, + ) + aud_paths = [s[1]["input"]["path"] for s in sorted_samples] + ids = [s[0] for s in sorted_samples] + speakers = [] + for s in sorted_samples: + m = re.search("(.+?)-(.+?)-(.+?)", s[0]) + speakers.append(m.group(1) + "_" + m.group(2)) + frame_sizes = [s[1]["input"]["length_ms"] for s in sorted_samples] + tgt = [ + torch.LongTensor( + [int(i) for i in s[1]["output"]["tokenid"].split(", ")] + ) + for s in sorted_samples + ] + # append eos + tgt = [torch.cat([t, torch.LongTensor([tgt_dict.eos()])]) for t in tgt] + return AsrDataset( + aud_paths, frame_sizes, tgt, tgt_dict, ids, speakers + ) + + +@register_task("speech_recognition") +class SpeechRecognitionTask(FairseqTask): + """ + Task for training speech recognition model. + """ + + @staticmethod + def add_args(parser): + """Add task-specific arguments to the parser.""" + parser.add_argument("data", help="path to data directory") + + def __init__(self, args, tgt_dict): + super().__init__(args) + self.tgt_dict = tgt_dict + + @classmethod + def setup_task(cls, args, **kwargs): + """Setup the task (e.g., load dictionaries).""" + dict_path = os.path.join(args.data, "dict.txt") + if not os.path.isfile(dict_path): + raise FileNotFoundError("Dict not found: {}".format(dict_path)) + tgt_dict = Dictionary.load(dict_path) + + print("| dictionary: {} types".format(len(tgt_dict))) + return cls(args, tgt_dict) + + def load_dataset(self, split, combine=False, **kwargs): + """Load a given dataset split. + + Args: + split (str): name of the split (e.g., train, valid, test) + """ + data_json_path = os.path.join(self.args.data, "{}.json".format(split)) + self.datasets[split] = get_asr_dataset_from_json( + data_json_path, self.tgt_dict) + + @property + def target_dictionary(self): + """Return the :class:`~fairseq.data.Dictionary` for the language + model.""" + return self.tgt_dict + + @property + def source_dictionary(self): + """Return the source :class:`~fairseq.data.Dictionary` (if applicable + for this task).""" + return None diff --git a/fairseq/models/transformer.py b/fairseq/models/transformer.py index 2e23dcf784..c5edba6461 100644 --- a/fairseq/models/transformer.py +++ b/fairseq/models/transformer.py @@ -23,6 +23,8 @@ MultiheadAttention, PositionalEmbedding, SinusoidalPositionalEmbedding, + TransformerDecoderLayer, + TransformerEncoderLayer, ) DEFAULT_MAX_SOURCE_POSITIONS = 1024 @@ -504,253 +506,6 @@ def upgrade_state_dict_named(self, state_dict, name): return state_dict -class TransformerEncoderLayer(nn.Module): - """Encoder layer block. - - In the original paper each operation (multi-head attention or FFN) is - postprocessed with: `dropout -> add residual -> layernorm`. In the - tensor2tensor code they suggest that learning is more robust when - preprocessing each layer with layernorm and postprocessing with: - `dropout -> add residual`. We default to the approach in the paper, but the - tensor2tensor approach can be enabled by setting - *args.encoder_normalize_before* to ``True``. 
- - Args: - args (argparse.Namespace): parsed command-line arguments - """ - - def __init__(self, args): - super().__init__() - self.embed_dim = args.encoder_embed_dim - self.self_attn = MultiheadAttention( - self.embed_dim, args.encoder_attention_heads, - dropout=args.attention_dropout, self_attention=True - ) - self.self_attn_layer_norm = LayerNorm(self.embed_dim) - self.dropout = args.dropout - self.activation_fn = utils.get_activation_fn( - activation=getattr(args, 'activation_fn', 'relu') - ) - self.activation_dropout = getattr(args, 'activation_dropout', 0) - if self.activation_dropout == 0: - # for backwards compatibility with models that use args.relu_dropout - self.activation_dropout = getattr(args, 'relu_dropout', 0) - self.normalize_before = args.encoder_normalize_before - self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim) - self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim) - self.final_layer_norm = LayerNorm(self.embed_dim) - - def upgrade_state_dict_named(self, state_dict, name): - """ - Rename layer norm states from `...layer_norms.0.weight` to - `...self_attn_layer_norm.weight` and `...layer_norms.1.weight` to - `...final_layer_norm.weight` - """ - layer_norm_map = { - '0': 'self_attn_layer_norm', - '1': 'final_layer_norm' - } - for old, new in layer_norm_map.items(): - for m in ('weight', 'bias'): - k = '{}.layer_norms.{}.{}'.format(name, old, m) - if k in state_dict: - state_dict[ - '{}.{}.{}'.format(name, new, m) - ] = state_dict[k] - del state_dict[k] - - def forward(self, x, encoder_padding_mask): - """ - Args: - x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` - encoder_padding_mask (ByteTensor): binary ByteTensor of shape - `(batch, src_len)` where padding elements are indicated by ``1``. - - Returns: - encoded output of shape `(seq_len, batch, embed_dim)` - """ - residual = x - x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True) - x, _ = self.self_attn(query=x, key=x, value=x, key_padding_mask=encoder_padding_mask) - x = F.dropout(x, p=self.dropout, training=self.training) - x = residual + x - x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True) - - residual = x - x = self.maybe_layer_norm(self.final_layer_norm, x, before=True) - x = self.activation_fn(self.fc1(x)) - x = F.dropout(x, p=self.activation_dropout, training=self.training) - x = self.fc2(x) - x = F.dropout(x, p=self.dropout, training=self.training) - x = residual + x - x = self.maybe_layer_norm(self.final_layer_norm, x, after=True) - return x - - def maybe_layer_norm(self, layer_norm, x, before=False, after=False): - assert before ^ after - if after ^ self.normalize_before: - return layer_norm(x) - else: - return x - - -class TransformerDecoderLayer(nn.Module): - """Decoder layer block. - - In the original paper each operation (multi-head attention, encoder - attention or FFN) is postprocessed with: `dropout -> add residual -> - layernorm`. In the tensor2tensor code they suggest that learning is more - robust when preprocessing each layer with layernorm and postprocessing with: - `dropout -> add residual`. We default to the approach in the paper, but the - tensor2tensor approach can be enabled by setting - *args.decoder_normalize_before* to ``True``. - - Args: - args (argparse.Namespace): parsed command-line arguments - no_encoder_attn (bool, optional): whether to attend to encoder outputs - (default: False). 
- """ - - def __init__(self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False): - super().__init__() - self.embed_dim = args.decoder_embed_dim - self.self_attn = MultiheadAttention( - embed_dim=self.embed_dim, - num_heads=args.decoder_attention_heads, - dropout=args.attention_dropout, - add_bias_kv=add_bias_kv, - add_zero_attn=add_zero_attn, - self_attention=True - ) - self.dropout = args.dropout - self.activation_fn = utils.get_activation_fn( - activation=getattr(args, 'activation_fn', 'relu') - ) - self.activation_dropout = getattr(args, 'activation_dropout', 0) - if self.activation_dropout == 0: - # for backwards compatibility with models that use args.relu_dropout - self.activation_dropout = getattr(args, 'relu_dropout', 0) - self.normalize_before = args.decoder_normalize_before - - # use layerNorm rather than FusedLayerNorm for exporting. - # char_inputs can be used to determint this. - # TODO remove this once we update apex with the fix - export = getattr(args, 'char_inputs', False) - self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export) - - if no_encoder_attn: - self.encoder_attn = None - self.encoder_attn_layer_norm = None - else: - self.encoder_attn = MultiheadAttention( - self.embed_dim, - args.decoder_attention_heads, - kdim=getattr(args, 'encoder_embed_dim', None), - vdim=getattr(args, 'encoder_embed_dim', None), - dropout=args.attention_dropout, - encoder_decoder_attention=True, - ) - self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export) - - self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim) - self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim) - - self.final_layer_norm = LayerNorm(self.embed_dim, export=export) - self.need_attn = True - - self.onnx_trace = False - - def prepare_for_onnx_export_(self): - self.onnx_trace = True - - def forward( - self, - x, - encoder_out=None, - encoder_padding_mask=None, - incremental_state=None, - prev_self_attn_state=None, - prev_attn_state=None, - self_attn_mask=None, - self_attn_padding_mask=None, - ): - """ - Args: - x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` - encoder_padding_mask (ByteTensor): binary ByteTensor of shape - `(batch, src_len)` where padding elements are indicated by ``1``. 
- - Returns: - encoded output of shape `(seq_len, batch, embed_dim)` - """ - residual = x - x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True) - if prev_self_attn_state is not None: - if incremental_state is None: - incremental_state = {} - prev_key, prev_value = prev_self_attn_state - saved_state = {"prev_key": prev_key, "prev_value": prev_value} - self.self_attn._set_input_buffer(incremental_state, saved_state) - x, attn = self.self_attn( - query=x, - key=x, - value=x, - key_padding_mask=self_attn_padding_mask, - incremental_state=incremental_state, - need_weights=False, - attn_mask=self_attn_mask, - ) - x = F.dropout(x, p=self.dropout, training=self.training) - x = residual + x - x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True) - - if self.encoder_attn is not None: - residual = x - x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, before=True) - if prev_attn_state is not None: - if incremental_state is None: - incremental_state = {} - prev_key, prev_value = prev_attn_state - saved_state = {"prev_key": prev_key, "prev_value": prev_value} - self.encoder_attn._set_input_buffer(incremental_state, saved_state) - x, attn = self.encoder_attn( - query=x, - key=encoder_out, - value=encoder_out, - key_padding_mask=encoder_padding_mask, - incremental_state=incremental_state, - static_kv=True, - need_weights=(not self.training and self.need_attn), - ) - x = F.dropout(x, p=self.dropout, training=self.training) - x = residual + x - x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, after=True) - - residual = x - x = self.maybe_layer_norm(self.final_layer_norm, x, before=True) - x = self.activation_fn(self.fc1(x)) - x = F.dropout(x, p=self.activation_dropout, training=self.training) - x = self.fc2(x) - x = F.dropout(x, p=self.dropout, training=self.training) - x = residual + x - x = self.maybe_layer_norm(self.final_layer_norm, x, after=True) - if self.onnx_trace and incremental_state is not None: - saved_state = self.self_attn._get_input_buffer(incremental_state) - self_attn_state = saved_state["prev_key"], saved_state["prev_value"] - return x, attn, self_attn_state - return x, attn - - def maybe_layer_norm(self, layer_norm, x, before=False, after=False): - assert before ^ after - if after ^ self.normalize_before: - return layer_norm(x) - else: - return x - - def make_generation_fast_(self, need_attn=False, **kwargs): - self.need_attn = need_attn - - def Embedding(num_embeddings, embedding_dim, padding_idx): m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5) diff --git a/fairseq/modules/__init__.py b/fairseq/modules/__init__.py index 8cffb0d792..6458f7d02f 100644 --- a/fairseq/modules/__init__.py +++ b/fairseq/modules/__init__.py @@ -26,6 +26,8 @@ from .transformer_sentence_encoder_layer import TransformerSentenceEncoderLayer from .transformer_sentence_encoder import TransformerSentenceEncoder from .unfold import unfold1d +from .transformer_layer import TransformerDecoderLayer, TransformerEncoderLayer +from .vggblock import VGGBlock __all__ = [ 'AdaptiveInput', @@ -51,5 +53,8 @@ 'SinusoidalPositionalEmbedding', 'TransformerSentenceEncoderLayer', 'TransformerSentenceEncoder', + 'TransformerDecoderLayer', + 'TransformerEncoderLayer', + 'VGGBlock', 'unfold1d', ] diff --git a/fairseq/modules/transformer_layer.py b/fairseq/modules/transformer_layer.py new file mode 100644 index 0000000000..5da4909ca2 --- /dev/null +++ b/fairseq/modules/transformer_layer.py @@ -0,0 +1,279 
@@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch.nn as nn +import torch.nn.functional as F +from fairseq import utils +from fairseq.modules import LayerNorm, MultiheadAttention + + +class TransformerEncoderLayer(nn.Module): + """Encoder layer block. + + In the original paper each operation (multi-head attention or FFN) is + postprocessed with: `dropout -> add residual -> layernorm`. In the + tensor2tensor code they suggest that learning is more robust when + preprocessing each layer with layernorm and postprocessing with: + `dropout -> add residual`. We default to the approach in the paper, but the + tensor2tensor approach can be enabled by setting + *args.encoder_normalize_before* to ``True``. + + Args: + args (argparse.Namespace): parsed command-line arguments + """ + + def __init__(self, args): + super().__init__() + self.embed_dim = args.encoder_embed_dim + self.self_attn = MultiheadAttention( + self.embed_dim, args.encoder_attention_heads, + dropout=args.attention_dropout, self_attention=True + ) + self.self_attn_layer_norm = LayerNorm(self.embed_dim) + self.dropout = args.dropout + self.activation_fn = utils.get_activation_fn( + activation=getattr(args, 'activation_fn', 'relu') + ) + self.activation_dropout = getattr(args, 'activation_dropout', 0) + if self.activation_dropout == 0: + # for backwards compatibility with models that use args.relu_dropout + self.activation_dropout = getattr(args, 'relu_dropout', 0) + self.normalize_before = args.encoder_normalize_before + self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim) + self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim) + self.final_layer_norm = LayerNorm(self.embed_dim) + + def upgrade_state_dict_named(self, state_dict, name): + """ + Rename layer norm states from `...layer_norms.0.weight` to + `...self_attn_layer_norm.weight` and `...layer_norms.1.weight` to + `...final_layer_norm.weight` + """ + layer_norm_map = { + '0': 'self_attn_layer_norm', + '1': 'final_layer_norm' + } + for old, new in layer_norm_map.items(): + for m in ('weight', 'bias'): + k = '{}.layer_norms.{}.{}'.format(name, old, m) + if k in state_dict: + state_dict[ + '{}.{}.{}'.format(name, new, m) + ] = state_dict[k] + del state_dict[k] + + def forward(self, x, encoder_padding_mask, attn_mask=None): + """ + Args: + x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_padding_mask (ByteTensor): binary ByteTensor of shape + `(batch, src_len)` where padding elements are indicated by ``1``. 
+ attn_mask (ByteTensor): binary tensor of shape (T_tgt, T_src), where + T_tgt is the length of query, while T_src is the length of key, + though here both query and key is x here, + attn_mask[t_tgt, t_src] = 1 means when calculating embedding + for t_tgt, t_src is excluded (or masked out), =0 means it is + included in attention + + Returns: + encoded output of shape `(seq_len, batch, embed_dim)` + """ + residual = x + x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True) + if attn_mask is not None: + attn_mask = attn_mask.masked_fill(attn_mask.byte(), -1e8) + # anything in original attn_mask = 1, becomes -1e8 + # anything in original attn_mask = 0, becomes 0 + # Note that we cannot use -inf here, because at some edge cases, + # the attention weight (before softmax) for some padded element in query + # will become -inf, which results in NaN in model parameters + # TODO: to formally solve this problem, we need to change fairseq's + # MultiheadAttention. We will do this later on. + x, _ = self.self_attn(query=x, key=x, value=x, key_padding_mask=encoder_padding_mask) + x = F.dropout(x, p=self.dropout, training=self.training) + x = residual + x + x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True) + + residual = x + x = self.maybe_layer_norm(self.final_layer_norm, x, before=True) + x = self.activation_fn(self.fc1(x)) + x = F.dropout(x, p=self.activation_dropout, training=self.training) + x = self.fc2(x) + x = F.dropout(x, p=self.dropout, training=self.training) + x = residual + x + x = self.maybe_layer_norm(self.final_layer_norm, x, after=True) + return x + + def maybe_layer_norm(self, layer_norm, x, before=False, after=False): + assert before ^ after + if after ^ self.normalize_before: + return layer_norm(x) + else: + return x + + +class TransformerDecoderLayer(nn.Module): + """Decoder layer block. + + In the original paper each operation (multi-head attention, encoder + attention or FFN) is postprocessed with: `dropout -> add residual -> + layernorm`. In the tensor2tensor code they suggest that learning is more + robust when preprocessing each layer with layernorm and postprocessing with: + `dropout -> add residual`. We default to the approach in the paper, but the + tensor2tensor approach can be enabled by setting + *args.decoder_normalize_before* to ``True``. + + Args: + args (argparse.Namespace): parsed command-line arguments + no_encoder_attn (bool, optional): whether to attend to encoder outputs + (default: False). + """ + + def __init__(self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False): + super().__init__() + self.embed_dim = args.decoder_embed_dim + self.self_attn = MultiheadAttention( + embed_dim=self.embed_dim, + num_heads=args.decoder_attention_heads, + dropout=args.attention_dropout, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + self_attention=True + ) + self.dropout = args.dropout + self.activation_fn = utils.get_activation_fn( + activation=getattr(args, 'activation_fn', 'relu') + ) + self.activation_dropout = getattr(args, 'activation_dropout', 0) + if self.activation_dropout == 0: + # for backwards compatibility with models that use args.relu_dropout + self.activation_dropout = getattr(args, 'relu_dropout', 0) + self.normalize_before = args.decoder_normalize_before + + # use layerNorm rather than FusedLayerNorm for exporting. + # char_inputs can be used to determint this. 
+ # TODO remove this once we update apex with the fix + export = getattr(args, 'char_inputs', False) + self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export) + + if no_encoder_attn: + self.encoder_attn = None + self.encoder_attn_layer_norm = None + else: + self.encoder_attn = MultiheadAttention( + self.embed_dim, + args.decoder_attention_heads, + kdim=getattr(args, 'encoder_embed_dim', None), + vdim=getattr(args, 'encoder_embed_dim', None), + dropout=args.attention_dropout, + encoder_decoder_attention=True, + ) + self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export) + + self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim) + self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim) + + self.final_layer_norm = LayerNorm(self.embed_dim, export=export) + self.need_attn = True + + self.onnx_trace = False + + def prepare_for_onnx_export_(self): + self.onnx_trace = True + + def forward( + self, + x, + encoder_out=None, + encoder_padding_mask=None, + incremental_state=None, + prev_self_attn_state=None, + prev_attn_state=None, + self_attn_mask=None, + self_attn_padding_mask=None, + ): + """ + Args: + x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_padding_mask (ByteTensor): binary ByteTensor of shape + `(batch, src_len)` where padding elements are indicated by ``1``. + + Returns: + encoded output of shape `(seq_len, batch, embed_dim)` + """ + residual = x + x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True) + if prev_self_attn_state is not None: + if incremental_state is None: + incremental_state = {} + prev_key, prev_value = prev_self_attn_state + saved_state = {"prev_key": prev_key, "prev_value": prev_value} + self.self_attn._set_input_buffer(incremental_state, saved_state) + x, attn = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=self_attn_padding_mask, + incremental_state=incremental_state, + need_weights=False, + attn_mask=self_attn_mask, + ) + x = F.dropout(x, p=self.dropout, training=self.training) + x = residual + x + x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True) + + if self.encoder_attn is not None: + residual = x + x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, before=True) + if prev_attn_state is not None: + if incremental_state is None: + incremental_state = {} + prev_key, prev_value = prev_attn_state + saved_state = {"prev_key": prev_key, "prev_value": prev_value} + self.encoder_attn._set_input_buffer(incremental_state, saved_state) + x, attn = self.encoder_attn( + query=x, + key=encoder_out, + value=encoder_out, + key_padding_mask=encoder_padding_mask, + incremental_state=incremental_state, + static_kv=True, + need_weights=(not self.training and self.need_attn), + ) + x = F.dropout(x, p=self.dropout, training=self.training) + x = residual + x + x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, after=True) + + residual = x + x = self.maybe_layer_norm(self.final_layer_norm, x, before=True) + x = self.activation_fn(self.fc1(x)) + x = F.dropout(x, p=self.activation_dropout, training=self.training) + x = self.fc2(x) + x = F.dropout(x, p=self.dropout, training=self.training) + x = residual + x + x = self.maybe_layer_norm(self.final_layer_norm, x, after=True) + if self.onnx_trace and incremental_state is not None: + saved_state = self.self_attn._get_input_buffer(incremental_state) + self_attn_state = saved_state["prev_key"], saved_state["prev_value"] + return x, attn, self_attn_state + return x, attn + + def maybe_layer_norm(self, 
layer_norm, x, before=False, after=False): + assert before ^ after + if after ^ self.normalize_before: + return layer_norm(x) + else: + return x + + def make_generation_fast_(self, need_attn=False, **kwargs): + self.need_attn = need_attn + + +def Linear(in_features, out_features, bias=True): + m = nn.Linear(in_features, out_features, bias) + nn.init.xavier_uniform_(m.weight) + if bias: + nn.init.constant_(m.bias, 0.) + return m diff --git a/fairseq/modules/vggblock.py b/fairseq/modules/vggblock.py new file mode 100644 index 0000000000..193026b16f --- /dev/null +++ b/fairseq/modules/vggblock.py @@ -0,0 +1,115 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from __future__ import absolute_import, division, print_function, unicode_literals + +from collections.abc import Iterable +from itertools import repeat + +import torch +import torch.nn as nn + + +def _pair(v): + if isinstance(v, Iterable): + assert len(v) == 2, "len(v) != 2" + return v + return tuple(repeat(v, 2)) + + +def infer_conv_output_dim(conv_op, input_dim, sample_inchannel): + sample_seq_len = 200 + sample_bsz = 10 + x = torch.randn(sample_bsz, sample_inchannel, sample_seq_len, input_dim) + # N x C x H x W + # N: sample_bsz, C: sample_inchannel, H: sample_seq_len, W: input_dim + x = conv_op(x) + # N x C x H x W + x = x.transpose(1, 2) + # N x H x C x W + bsz, seq = x.size()[:2] + per_channel_dim = x.size()[3] + # bsz: N, seq: H, CxW the rest + return x.contiguous().view(bsz, seq, -1).size(-1), per_channel_dim + + +class VGGBlock(torch.nn.Module): + """ + VGG motibated cnn module https://arxiv.org/pdf/1409.1556.pdf + + Args: + in_channels: (int) number of input channels (typically 1) + out_channels: (int) number of output channels + conv_kernel_size: convolution channels + pooling_kernel_size: the size of the pooling window to take a max over + num_conv_layers: (int) number of convolution layers + input_dim: (int) input dimension + conv_stride: the stride of the convolving kernel. + Can be a single number or a tuple (sH, sW) Default: 1 + padding: implicit paddings on both sides of the input. + Can be a single number or a tuple (padH, padW). Default: None + layer_norm: (bool) if layer norm is going to be applied. Default: False + + Shape: + Input: BxCxTxfeat, i.e. (batch_size, input_size, timesteps, features) + Output: BxCxTxfeat, i.e. 
(batch_size, input_size, timesteps, features) + """ + + def __init__( + self, + in_channels, + out_channels, + conv_kernel_size, + pooling_kernel_size, + num_conv_layers, + input_dim, + conv_stride=1, + padding=None, + layer_norm=False, + ): + assert ( + input_dim is not None + ), "Need input_dim for LayerNorm and infer_conv_output_dim" + super(VGGBlock, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.conv_kernel_size = _pair(conv_kernel_size) + self.pooling_kernel_size = _pair(pooling_kernel_size) + self.num_conv_layers = num_conv_layers + self.padding = ( + tuple(e // 2 for e in self.conv_kernel_size) + if padding is None + else _pair(padding) + ) + self.conv_stride = _pair(conv_stride) + + self.layers = nn.ModuleList() + for layer in range(num_conv_layers): + conv_op = nn.Conv2d( + in_channels if layer == 0 else out_channels, + out_channels, + self.conv_kernel_size, + stride=self.conv_stride, + padding=self.padding, + ) + self.layers.append(conv_op) + if layer_norm: + conv_output_dim, per_channel_dim = infer_conv_output_dim( + conv_op, input_dim, in_channels if layer == 0 else out_channels + ) + self.layers.append(nn.LayerNorm(per_channel_dim)) + input_dim = per_channel_dim + self.layers.append(nn.ReLU()) + + pool_op = nn.MaxPool2d(kernel_size=pooling_kernel_size, ceil_mode=True) + self.layers.append(pool_op) + self.total_output_dim, self.output_dim = infer_conv_output_dim( + pool_op, input_dim, out_channels + ) + + def forward(self, x): + for i, _ in enumerate(self.layers): + x = self.layers[i](x) + return x diff --git a/tests/speech_recognition/__init__.py b/tests/speech_recognition/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/speech_recognition/asr_test_base.py b/tests/speech_recognition/asr_test_base.py new file mode 100644 index 0000000000..4fc7f78cac --- /dev/null +++ b/tests/speech_recognition/asr_test_base.py @@ -0,0 +1,549 @@ +#!/usr/bin/env python3 + +import argparse +import os +import unittest +from inspect import currentframe, getframeinfo + +import numpy as np +import torch +from fairseq.data import data_utils as fairseq_data_utils +from fairseq.data.dictionary import Dictionary +from fairseq.models import ( + BaseFairseqModel, + FairseqDecoder, + FairseqEncoder, + FairseqEncoderDecoderModel, + FairseqEncoderModel, + FairseqModel, +) +from fairseq.tasks.fairseq_task import FairseqTask +from examples.speech_recognition.data.data_utils import lengths_to_encoder_padding_mask + + +DEFAULT_TEST_VOCAB_SIZE = 100 + + +# /////////////////////////////////////////////////////////////////////////// +# utility function to setup dummy dict/task/input +# /////////////////////////////////////////////////////////////////////////// + + +def get_dummy_dictionary(vocab_size=DEFAULT_TEST_VOCAB_SIZE): + dummy_dict = Dictionary() + # add dummy symbol to satisfy vocab size + for id, _ in enumerate(range(vocab_size)): + dummy_dict.add_symbol("{}".format(id), 1000) + return dummy_dict + + +class DummyTask(FairseqTask): + def __init__(self, args): + super().__init__(args) + self.dictionary = get_dummy_dictionary() + if getattr(self.args, "ctc", False): + self.dictionary.add_symbol("") + self.tgt_dict = self.dictionary + + @property + def target_dictionary(self): + return self.dictionary + + +def get_dummy_task_and_parser(): + """ + to build a fariseq model, we need some dummy parse and task. 
This function + is used to create dummy task and parser to faciliate model/criterion test + + Note: we use FbSpeechRecognitionTask as the dummy task. You may want + to use other task by providing another function + """ + parser = argparse.ArgumentParser( + description="test_dummy_s2s_task", argument_default=argparse.SUPPRESS + ) + DummyTask.add_args(parser) + args = parser.parse_args([]) + task = DummyTask.setup_task(args) + return task, parser + + +def get_dummy_input(T=100, D=80, B=5, K=100): + forward_input = {} + # T max sequence length + # D feature vector dimension + # B batch size + # K target dimension size + feature = torch.randn(B, T, D) + # this (B, T, D) layout is just a convention, you can override it by + # write your own _prepare_forward_input function + src_lengths = torch.from_numpy( + np.random.randint(low=1, high=T, size=B).astype(np.int64) + ) + src_lengths[0] = T # make sure the maximum length matches + prev_output_tokens = [] + for b in range(B): + token_length = np.random.randint(low=1, high=src_lengths[b].item() + 1) + tokens = np.random.randint(low=0, high=K, size=token_length) + prev_output_tokens.append(torch.from_numpy(tokens)) + + prev_output_tokens = fairseq_data_utils.collate_tokens( + prev_output_tokens, + pad_idx=1, + eos_idx=2, + left_pad=False, + move_eos_to_beginning=False, + ) + src_lengths, sorted_order = src_lengths.sort(descending=True) + forward_input["src_tokens"] = feature.index_select(0, sorted_order) + forward_input["src_lengths"] = src_lengths + forward_input["prev_output_tokens"] = prev_output_tokens + + return forward_input + + +def get_dummy_encoder_output(encoder_out_shape=(100, 80, 5)): + """ + This only provides an example to generate dummy encoder output + """ + (T, B, D) = encoder_out_shape + encoder_out = {} + + encoder_out["encoder_out"] = torch.from_numpy( + np.random.randn(*encoder_out_shape).astype(np.float32) + ) + seq_lengths = torch.from_numpy(np.random.randint(low=1, high=T, size=B)) + # some dummy mask + encoder_out["encoder_padding_mask"] = torch.arange(T).view(1, T).expand( + B, -1 + ) >= seq_lengths.view(B, 1).expand(-1, T) + encoder_out["encoder_padding_mask"].t_() + + # encoer_padding_mask is (T, B) tensor, with (t, b)-th element indicate + # whether encoder_out[t, b] is valid (=0) or not (=1) + return encoder_out + + +def _current_postion_info(): + cf = currentframe() + frameinfo = " (at {}:{})".format( + os.path.basename(getframeinfo(cf).filename), cf.f_back.f_lineno + ) + return frameinfo + + +def check_encoder_output(encoder_output, batch_size=None): + """we expect encoder_output to be a dict with the following + key/value pairs: + - encoder_out: a Torch.Tensor + - encoder_padding_mask: a binary Torch.Tensor + """ + if not isinstance(encoder_output, dict): + msg = ( + "FairseqEncoderModel.forward(...) must be a dict" + _current_postion_info() + ) + return False, msg + + if "encoder_out" not in encoder_output: + msg = ( + "FairseqEncoderModel.forward(...) must contain encoder_out" + + _current_postion_info() + ) + return False, msg + + if "encoder_padding_mask" not in encoder_output: + msg = ( + "FairseqEncoderModel.forward(...) 
must contain encoder_padding_mask" + + _current_postion_info() + ) + return False, msg + + if not isinstance(encoder_output["encoder_out"], torch.Tensor): + msg = "encoder_out must be a torch.Tensor" + _current_postion_info() + return False, msg + + if encoder_output["encoder_out"].dtype != torch.float32: + msg = "encoder_out must have float32 dtype" + _current_postion_info() + return False, msg + + mask = encoder_output["encoder_padding_mask"] + if mask is not None: + if not isinstance(mask, torch.Tensor): + msg = ( + "encoder_padding_mask must be a torch.Tensor" + _current_postion_info() + ) + return False, msg + if mask.dtype != torch.uint8: + msg = ( + "encoder_padding_mask must have dtype of uint8" + + _current_postion_info() + ) + return False, msg + + if mask.dim() != 2: + msg = ( + "we expect encoder_padding_mask to be a 2-d tensor, in shape (T, B)" + + _current_postion_info() + ) + return False, msg + + if batch_size is not None and mask.size(1) != batch_size: + msg = ( + "we expect encoder_padding_mask to be a 2-d tensor, with size(1)" + + " being the batch size" + + _current_postion_info() + ) + return False, msg + return True, None + + +def check_decoder_output(decoder_output): + """we expect output from a decoder is a tuple with the following constraint: + - the first element is a torch.Tensor + - the second element can be anything (reserved for future use) + """ + if not isinstance(decoder_output, tuple): + msg = "FariseqDecoder output must be a tuple" + _current_postion_info() + return False, msg + + if len(decoder_output) != 2: + msg = "FairseqDecoder output must be 2-elem tuple" + _current_postion_info() + return False, msg + + if not isinstance(decoder_output[0], torch.Tensor): + msg = ( + "FariseqDecoder output[0] must be a torch.Tensor" + _current_postion_info() + ) + return False, msg + + return True, None + + +# /////////////////////////////////////////////////////////////////////////// +# Base Test class +# /////////////////////////////////////////////////////////////////////////// + + +class TestBaseFairseqModelBase(unittest.TestCase): + """ + This class is used to facilitate writing unittest for any class derived from + `BaseFairseqModel`. 
+ """ + + @classmethod + def setUpClass(cls): + if cls is TestBaseFairseqModelBase: + raise unittest.SkipTest("Skipping test case in base") + super().setUpClass() + + def setUpModel(self, model): + self.assertTrue(isinstance(model, BaseFairseqModel)) + self.model = model + + def setupInput(self): + pass + + def setUp(self): + self.model = None + self.forward_input = None + pass + + +class TestFairseqEncoderDecoderModelBase(TestBaseFairseqModelBase): + """ + base code to test FairseqEncoderDecoderModel (formally known as + `FairseqModel`) must be derived from this base class + """ + + @classmethod + def setUpClass(cls): + if cls is TestFairseqEncoderDecoderModelBase: + raise unittest.SkipTest("Skipping test case in base") + super().setUpClass() + + def setUpModel(self, model_cls, extra_args_setters=None): + self.assertTrue( + issubclass(model_cls, (FairseqEncoderDecoderModel, FairseqModel)), + msg="This class only tests for FairseqModel subclasses", + ) + + task, parser = get_dummy_task_and_parser() + model_cls.add_args(parser) + + args = parser.parse_args([]) + if extra_args_setters is not None: + for args_setter in extra_args_setters: + args_setter(args) + model = model_cls.build_model(args, task) + self.model = model + + def setUpInput(self, input=None): + self.forward_input = get_dummy_input() if input is None else input + + def setUp(self): + super().setUp() + + def test_forward(self): + if self.model and self.forward_input: + forward_output = self.model.forward(**self.forward_input) + # for FairseqEncoderDecoderModel, forward returns a tuple of two + # elements, the first one is a Torch.Tensor + succ, msg = check_decoder_output(forward_output) + if not succ: + self.assertTrue(succ, msg=msg) + self.forward_output = forward_output + + def test_get_normalized_probs(self): + if self.model and self.forward_input: + forward_output = self.model.forward(**self.forward_input) + logprob = self.model.get_normalized_probs(forward_output, log_probs=True) + prob = self.model.get_normalized_probs(forward_output, log_probs=False) + + # in order for different models/criterion to play with each other + # we need to know whether the logprob or prob output is batch_first + # or not. We assume an additional attribute will be attached to logprob + # or prob. 
If you find your code failed here, simply override + # FairseqModel.get_normalized_probs, see example at + # https://fburl.com/batch_first_example + self.assertTrue(hasattr(logprob, "batch_first")) + self.assertTrue(hasattr(prob, "batch_first")) + + self.assertTrue(torch.is_tensor(logprob)) + self.assertTrue(torch.is_tensor(prob)) + + +class TestFairseqEncoderModelBase(TestBaseFairseqModelBase): + """ + base class to test FairseqEncoderModel + """ + + @classmethod + def setUpClass(cls): + if cls is TestFairseqEncoderModelBase: + raise unittest.SkipTest("Skipping test case in base") + super().setUpClass() + + def setUpModel(self, model_cls, extra_args_setters=None): + self.assertTrue( + issubclass(model_cls, FairseqEncoderModel), + msg="This class is only used for testing FairseqEncoderModel", + ) + task, parser = get_dummy_task_and_parser() + model_cls.add_args(parser) + args = parser.parse_args([]) + if extra_args_setters is not None: + for args_setter in extra_args_setters: + args_setter(args) + + model = model_cls.build_model(args, task) + self.model = model + + def setUpInput(self, input=None): + self.forward_input = get_dummy_input() if input is None else input + # get_dummy_input() is originally for s2s, here we delete extra dict + # items, so it can be used for EncoderModel / Encoder as well + self.forward_input.pop("prev_output_tokens", None) + + def setUp(self): + super().setUp() + + def test_forward(self): + if self.forward_input and self.model: + bsz = self.forward_input["src_tokens"].size(0) + forward_output = self.model.forward(**self.forward_input) + + # we expect forward_output to be a dict with the following + # key/value pairs: + # - encoder_out: a Torch.Tensor + # - encoder_padding_mask: a binary Torch.Tensor + succ, msg = check_encoder_output(forward_output, batch_size=bsz) + if not succ: + self.assertTrue(succ, msg=msg) + self.forward_output = forward_output + + def test_get_normalized_probs(self): + if self.model and self.forward_input: + forward_output = self.model.forward(**self.forward_input) + logprob = self.model.get_normalized_probs(forward_output, log_probs=True) + prob = self.model.get_normalized_probs(forward_output, log_probs=False) + + # in order for different models/criterion to play with each other + # we need to know whether the logprob or prob output is batch_first + # or not. We assume an additional attribute will be attached to logprob + # or prob. 
If you find your code failed here, simply override + # FairseqModel.get_normalized_probs, see example at + # https://fburl.com/batch_first_example + self.assertTrue(hasattr(logprob, "batch_first")) + self.assertTrue(hasattr(prob, "batch_first")) + + self.assertTrue(torch.is_tensor(logprob)) + self.assertTrue(torch.is_tensor(prob)) + + +class TestFairseqEncoderBase(unittest.TestCase): + """ + base class to test FairseqEncoder + """ + + @classmethod + def setUpClass(cls): + if cls is TestFairseqEncoderBase: + raise unittest.SkipTest("Skipping test case in base") + super().setUpClass() + + def setUpEncoder(self, encoder): + self.assertTrue( + isinstance(encoder, FairseqEncoder), + msg="This class is only used for test FairseqEncoder", + ) + self.encoder = encoder + + def setUpInput(self, input=None): + self.forward_input = get_dummy_input() if input is None else input + # get_dummy_input() is originally for s2s, here we delete extra dict + # items, so it can be used for EncoderModel / Encoder as well + self.forward_input.pop("prev_output_tokens", None) + + def setUp(self): + self.encoder = None + self.forward_input = None + + def test_forward(self): + if self.encoder and self.forward_input: + bsz = self.forward_input["src_tokens"].size(0) + + forward_output = self.encoder.forward(**self.forward_input) + succ, msg = check_encoder_output(forward_output, batch_size=bsz) + if not succ: + self.assertTrue(succ, msg=msg) + self.forward_output = forward_output + + +class TestFairseqDecoderBase(unittest.TestCase): + """ + base class to test FairseqDecoder + """ + + @classmethod + def setUpClass(cls): + if cls is TestFairseqDecoderBase: + raise unittest.SkipTest("Skipping test case in base") + super().setUpClass() + + def setUpDecoder(self, decoder): + self.assertTrue( + isinstance(decoder, FairseqDecoder), + msg="This class is only used for test FairseqDecoder", + ) + self.decoder = decoder + + def setUpInput(self, input=None): + self.forward_input = get_dummy_encoder_output() if input is None else input + + def setUpPrevOutputTokens(self, tokens=None): + if tokens is None: + self.encoder_input = get_dummy_input() + self.prev_output_tokens = self.encoder_input["prev_output_tokens"] + else: + self.prev_output_tokens = tokens + + def setUp(self): + self.decoder = None + self.forward_input = None + self.prev_output_tokens = None + + def test_forward(self): + if ( + self.decoder is not None + and self.forward_input is not None + and self.prev_output_tokens is not None + ): + forward_output = self.decoder.forward( + prev_output_tokens=self.prev_output_tokens, + encoder_out=self.forward_input, + ) + succ, msg = check_decoder_output(forward_output) + if not succ: + self.assertTrue(succ, msg=msg) + self.forward_input = forward_output + + +class DummyEncoderModel(FairseqEncoderModel): + def __init__(self, encoder): + super().__init__(encoder) + + @classmethod + def build_model(cls, args, task): + return cls(DummyEncoder()) + + def get_logits(self, net_output): + # Inverse of sigmoid to use with BinaryCrossEntropyWithLogitsCriterion as + # F.binary_cross_entropy_with_logits combines sigmoid and CE + return torch.log( + torch.div(net_output["encoder_out"], 1 - net_output["encoder_out"]) + ) + + +class DummyEncoder(FairseqEncoder): + def __init__(self): + super().__init__(None) + + def forward(self, src_tokens, src_lengths): + mask, max_len = lengths_to_encoder_padding_mask(src_lengths) + return {"encoder_out": src_tokens, "encoder_padding_mask": mask} + + +class CrossEntropyCriterionTestBase(unittest.TestCase): 
+ @classmethod + def setUpClass(cls): + if cls is CrossEntropyCriterionTestBase: + raise unittest.SkipTest("Skipping base class test case") + super().setUpClass() + + def setUpArgs(self): + args = argparse.Namespace() + args.sentence_avg = False + args.threshold = 0.1 # to use with BinaryCrossEntropyWithLogitsCriterion + return args + + def setUp(self): + args = self.setUpArgs() + self.model = DummyEncoderModel(encoder=DummyEncoder()) + self.criterion = self.criterion_cls(args=args, task=DummyTask(args)) + + def get_src_tokens(self, correct_prediction, aggregate): + """ + correct_prediction: True if the net_output (src_tokens) should + predict the correct target + aggregate: True if the criterion expects net_output (src_tokens) + aggregated across time axis + """ + predicted_idx = 0 if correct_prediction else 1 + if aggregate: + src_tokens = torch.zeros((2, 2), dtype=torch.float) + for b in range(2): + src_tokens[b][predicted_idx] = 1.0 + else: + src_tokens = torch.zeros((2, 10, 2), dtype=torch.float) + for b in range(2): + for t in range(10): + src_tokens[b][t][predicted_idx] = 1.0 + return src_tokens + + def get_target(self, soft_target): + if soft_target: + target = torch.zeros((2, 2), dtype=torch.float) + for b in range(2): + target[b][0] = 1.0 + else: + target = torch.zeros((2, 10), dtype=torch.long) + return target + + def get_test_sample(self, correct, soft_target, aggregate): + src_tokens = self.get_src_tokens(correct, aggregate) + target = self.get_target(soft_target) + L = src_tokens.size(1) + return { + "net_input": {"src_tokens": src_tokens, "src_lengths": torch.tensor([L])}, + "target": target, + "ntokens": src_tokens.size(0) * src_tokens.size(1), + } diff --git a/tests/speech_recognition/test_collaters.py b/tests/speech_recognition/test_collaters.py new file mode 100644 index 0000000000..efb0e58792 --- /dev/null +++ b/tests/speech_recognition/test_collaters.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. 
+ +import unittest + +import numpy as np +import torch +from examples.speech_recognition.data.collaters import Seq2SeqCollater + + +class TestSeq2SeqCollator(unittest.TestCase): + def test_collate(self): + + eos_idx = 1 + pad_idx = 0 + collater = Seq2SeqCollater( + feature_index=0, label_index=1, pad_index=pad_idx, eos_index=eos_idx + ) + + # 2 frames in the first sample and 3 frames in the second one + frames1 = np.array([[7, 8], [9, 10]]) + frames2 = np.array([[1, 2], [3, 4], [5, 6]]) + target1 = np.array([4, 2, 3, eos_idx]) + target2 = np.array([3, 2, eos_idx]) + sample1 = {"id": 0, "data": [frames1, target1]} + sample2 = {"id": 1, "data": [frames2, target2]} + batch = collater.collate([sample1, sample2]) + + # collate sort inputs by frame's length before creating the batch + self.assertTensorEqual(batch["id"], torch.tensor([1, 0])) + self.assertEqual(batch["ntokens"], 7) + self.assertTensorEqual( + batch["net_input"]["src_tokens"], + torch.tensor( + [[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [pad_idx, pad_idx]]] + ), + ) + self.assertTensorEqual( + batch["net_input"]["prev_output_tokens"], + torch.tensor([[eos_idx, 3, 2, pad_idx], [eos_idx, 4, 2, 3]]), + ) + self.assertTensorEqual(batch["net_input"]["src_lengths"], torch.tensor([3, 2])) + self.assertTensorEqual( + batch["target"], + torch.tensor([[3, 2, eos_idx, pad_idx], [4, 2, 3, eos_idx]]), + ) + self.assertEqual(batch["nsentences"], 2) + + def assertTensorEqual(self, t1, t2): + self.assertEqual(t1.size(), t2.size(), "size mismatch") + self.assertEqual(t1.ne(t2).long().sum(), 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/speech_recognition/test_cross_entropy.py b/tests/speech_recognition/test_cross_entropy.py new file mode 100644 index 0000000000..11daf4166f --- /dev/null +++ b/tests/speech_recognition/test_cross_entropy.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. 
+ +from examples.speech_recognition.criterions.cross_entropy_acc import CrossEntropyWithAccCriterion +from .asr_test_base import CrossEntropyCriterionTestBase + + +class CrossEntropyWithAccCriterionTest(CrossEntropyCriterionTestBase): + def setUp(self): + self.criterion_cls = CrossEntropyWithAccCriterion + super().setUp() + + def test_cross_entropy_all_correct(self): + sample = self.get_test_sample(correct=True, soft_target=False, aggregate=False) + loss, sample_size, logging_output = self.criterion( + self.model, sample, "sum", log_probs=True + ) + assert logging_output["correct"] == 20 + assert logging_output["total"] == 20 + assert logging_output["sample_size"] == 20 + assert logging_output["ntokens"] == 20 + + def test_cross_entropy_all_wrong(self): + sample = self.get_test_sample(correct=False, soft_target=False, aggregate=False) + loss, sample_size, logging_output = self.criterion( + self.model, sample, "sum", log_probs=True + ) + assert logging_output["correct"] == 0 + assert logging_output["total"] == 20 + assert logging_output["sample_size"] == 20 + assert logging_output["ntokens"] == 20 diff --git a/tests/speech_recognition/test_vggtransformer.py b/tests/speech_recognition/test_vggtransformer.py new file mode 100644 index 0000000000..4dc73b8c73 --- /dev/null +++ b/tests/speech_recognition/test_vggtransformer.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 + +# import models/encoder/decoder to be tested +from examples.speech_recognition.models.vggtransformer import ( + TransformerDecoder, + VGGTransformerEncoder, + VGGTransformerModel, + vggtransformer_1, + vggtransformer_2, + vggtransformer_base, +) + +# import base test class +from .asr_test_base import ( + DEFAULT_TEST_VOCAB_SIZE, + TestFairseqDecoderBase, + TestFairseqEncoderBase, + TestFairseqEncoderDecoderModelBase, + get_dummy_dictionary, + get_dummy_encoder_output, + get_dummy_input, +) + + +class VGGTransformerModelTest_mid(TestFairseqEncoderDecoderModelBase): + def setUp(self): + def override_config(args): + """ + vggtrasformer_1 use 14 layers of transformer, + for testing purpose, it is too expensive. For fast turn-around + test, reduce the number of layers to 3. + """ + args.transformer_enc_config = ( + "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 3" + ) + + super().setUp() + extra_args_setter = [vggtransformer_1, override_config] + + self.setUpModel(VGGTransformerModel, extra_args_setter) + self.setUpInput(get_dummy_input(T=50, D=80, B=5, K=DEFAULT_TEST_VOCAB_SIZE)) + + +class VGGTransformerModelTest_big(TestFairseqEncoderDecoderModelBase): + def setUp(self): + def override_config(args): + """ + vggtrasformer_2 use 16 layers of transformer, + for testing purpose, it is too expensive. For fast turn-around + test, reduce the number of layers to 3. + """ + args.transformer_enc_config = ( + "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 3" + ) + + super().setUp() + extra_args_setter = [vggtransformer_2, override_config] + + self.setUpModel(VGGTransformerModel, extra_args_setter) + self.setUpInput(get_dummy_input(T=50, D=80, B=5, K=DEFAULT_TEST_VOCAB_SIZE)) + + +class VGGTransformerModelTest_base(TestFairseqEncoderDecoderModelBase): + def setUp(self): + def override_config(args): + """ + vggtrasformer_base use 12 layers of transformer, + for testing purpose, it is too expensive. For fast turn-around + test, reduce the number of layers to 3. 
+ """ + args.transformer_enc_config = ( + "((512, 8, 2048, True, 0.15, 0.15, 0.15),) * 3" + ) + + super().setUp() + extra_args_setter = [vggtransformer_base, override_config] + + self.setUpModel(VGGTransformerModel, extra_args_setter) + self.setUpInput(get_dummy_input(T=50, D=80, B=5, K=DEFAULT_TEST_VOCAB_SIZE)) + + +class VGGTransformerEncoderTest(TestFairseqEncoderBase): + def setUp(self): + super().setUp() + + self.setUpInput(get_dummy_input(T=50, D=80, B=5)) + + def test_forward(self): + print("1. test standard vggtransformer") + self.setUpEncoder(VGGTransformerEncoder(input_feat_per_channel=80)) + super().test_forward() + print("2. test vggtransformer with limited right context") + self.setUpEncoder( + VGGTransformerEncoder( + input_feat_per_channel=80, transformer_context=(-1, 5) + ) + ) + super().test_forward() + print("3. test vggtransformer with limited left context") + self.setUpEncoder( + VGGTransformerEncoder( + input_feat_per_channel=80, transformer_context=(5, -1) + ) + ) + super().test_forward() + print("4. test vggtransformer with limited right context and sampling") + self.setUpEncoder( + VGGTransformerEncoder( + input_feat_per_channel=80, + transformer_context=(-1, 12), + transformer_sampling=(2, 2), + ) + ) + super().test_forward() + print("5. test vggtransformer with windowed context and sampling") + self.setUpEncoder( + VGGTransformerEncoder( + input_feat_per_channel=80, + transformer_context=(12, 12), + transformer_sampling=(2, 2), + ) + ) + + +class TransformerDecoderTest(TestFairseqDecoderBase): + def setUp(self): + super().setUp() + + dict = get_dummy_dictionary(vocab_size=DEFAULT_TEST_VOCAB_SIZE) + decoder = TransformerDecoder(dict) + dummy_encoder_output = get_dummy_encoder_output(encoder_out_shape=(50, 5, 256)) + + self.setUpDecoder(decoder) + self.setUpInput(dummy_encoder_output) + self.setUpPrevOutputTokens() From 439ead5a7738bc5080d1d4643ae4bf6dfc78b8ca Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Thu, 8 Aug 2019 08:23:22 -0700 Subject: [PATCH 073/213] Integrate with Apache Arrow/Plasma in-memory store for large datasets (#995) Summary: Datasets with many examples can generate very large indexes in TokenBlockDataset (and possibly elsewhere). When using `--num-workers>0` these indexes are pickled and transferred via a multiprocessing pipe, which is slow and can fail if the index grows beyond 4GB (~0.5B examples). Apache Arrow has an in-memory store called Plasma that will offload these arrays to shared memory, which both reduces duplication of the data and avoids needing to pickle. Pull Request resolved: https://github.com/pytorch/fairseq/pull/995 Differential Revision: D16697219 Pulled By: myleott fbshipit-source-id: 1b679ee5b3d2726af54ff418f6159a3671173fb8 --- fairseq/data/plasma_utils.py | 86 +++++++++++++++++++++++++++++ fairseq/data/token_block_dataset.py | 48 ++++++++++------ 2 files changed, 118 insertions(+), 16 deletions(-) create mode 100644 fairseq/data/plasma_utils.py diff --git a/fairseq/data/plasma_utils.py b/fairseq/data/plasma_utils.py new file mode 100644 index 0000000000..33f250eea9 --- /dev/null +++ b/fairseq/data/plasma_utils.py @@ -0,0 +1,86 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import subprocess +import tempfile + + +class PlasmaArray(object): + """ + Wrapper around numpy arrays that automatically moves the data to shared + memory upon serialization. 
This is particularly helpful when passing numpy + arrays through multiprocessing, so that data is not unnecessarily + duplicated or pickled. + """ + + def __init__(self, array): + super().__init__() + self.array = array + self.disable = array.nbytes < 134217728 # disable for arrays <128MB + self.object_id = None + self.path = None + + # variables with underscores shouldn't be pickled + self._client = None + self._server = None + self._server_tmp = None + self._plasma = None + + @property + def plasma(self): + if self._plasma is None and not self.disable: + try: + import pyarrow.plasma as plasma + self._plasma = plasma + except ImportError: + self._plasma = None + return self._plasma + + def start_server(self): + if self.plasma is None or self._server is not None: + return + assert self.object_id is None + assert self.path is None + self._server_tmp = tempfile.NamedTemporaryFile() + self.path = self._server_tmp.name + self._server = subprocess.Popen([ + 'plasma_store', + '-m', str(int(1.05 * self.array.nbytes)), + '-s', self.path, + ]) + + @property + def client(self): + if self._client is None: + assert self.path is not None + self._client = self.plasma.connect(self.path) + return self._client + + def __getstate__(self): + if self.plasma is None: + return self.__dict__ + if self.object_id is None: + self.start_server() + self.object_id = self.client.put(self.array) + state = self.__dict__.copy() + del state['array'] + state['_client'] = None + state['_server'] = None + state['_server_tmp'] = None + state['_plasma'] = None + return state + + def __setstate__(self, state): + self.__dict__.update(state) + if self.plasma is None: + return + self.array = self.client.get(self.object_id) + + def __del__(self): + if self._server is not None: + self._server.kill() + self._server = None + self._server_tmp.close() + self._server_tmp = None diff --git a/fairseq/data/token_block_dataset.py b/fairseq/data/token_block_dataset.py index 4633167318..3d69cfcda4 100644 --- a/fairseq/data/token_block_dataset.py +++ b/fairseq/data/token_block_dataset.py @@ -8,7 +8,7 @@ import numpy as np import torch -from . 
import FairseqDataset +from fairseq.data import FairseqDataset, plasma_utils class TokenBlockDataset(FairseqDataset): @@ -43,7 +43,7 @@ def __init__( self.pad = pad self.eos = eos self.include_targets = include_targets - self.slice_indices = [] + slice_indices = [] assert len(dataset) == len(sizes) assert len(dataset) > 0 @@ -57,7 +57,7 @@ def block_at(i): end = min(start + block_size, total_size) return (start, end) - self.slice_indices = [block_at(i) for i in range(length)] + slice_indices = [block_at(i) for i in range(length)] elif break_mode == 'complete': tok_idx = 0 sz_idx = 0 @@ -67,11 +67,11 @@ def block_at(i): curr_size += sizes[sz_idx] sz_idx += 1 else: - self.slice_indices.append((tok_idx, tok_idx + curr_size)) + slice_indices.append((tok_idx, tok_idx + curr_size)) tok_idx += curr_size curr_size = 0 if curr_size > 0: - self.slice_indices.append((tok_idx, tok_idx + curr_size)) + slice_indices.append((tok_idx, tok_idx + curr_size)) elif break_mode == 'complete_doc': tok_idx = 0 sz_idx = 0 @@ -85,32 +85,32 @@ def block_at(i): curr_size += sizes[sz_idx] sz_idx += 1 else: - self.slice_indices.append((tok_idx, tok_idx + curr_size)) + slice_indices.append((tok_idx, tok_idx + curr_size)) tok_idx += curr_size curr_size = 0 if sizes[sz_idx] == document_sep_len: tok_idx += sizes[sz_idx] sz_idx += 1 if curr_size > 0: - self.slice_indices.append((tok_idx, tok_idx + curr_size)) + slice_indices.append((tok_idx, tok_idx + curr_size)) elif break_mode == 'eos': - self.slice_indices = np.empty((len(sizes), 2), dtype=int) + slice_indices = np.empty((len(sizes), 2), dtype=int) if not torch.is_tensor(sizes): sizes = torch.tensor(sizes) cumsum = torch.cumsum(sizes, dim=0) - self.slice_indices[0] = [0, sizes[0]] + slice_indices[0] = [0, sizes[0]] if len(cumsum) > 1: - self.slice_indices[1:] = cumsum.unfold(0, 2, 1) + slice_indices[1:] = cumsum.unfold(0, 2, 1) else: raise ValueError('Invalid break_mode: ' + break_mode) - self.slice_indices = np.array(self.slice_indices, dtype=int) - self.sizes = self.slice_indices[:, 1] - self.slice_indices[:, 0] + slice_indices = np.array(slice_indices, dtype=int) + self._sizes = slice_indices[:, 1] - slice_indices[:, 0] # build index mapping block indices to the underlying dataset indices if break_mode == 'eos': # much faster version for eos break mode - self.block_to_dataset_index = np.stack( + block_to_dataset_index = np.stack( [ np.arange(len(sizes)), # starting index in dataset np.zeros(len(sizes), dtype=np.long), # starting offset within starting index @@ -120,8 +120,8 @@ def block_at(i): ) else: ds = DatasetSearcher(sizes) - self.block_to_dataset_index = np.empty((len(self.slice_indices), 3), dtype=int) - for i, (s, e) in enumerate(self.slice_indices): + block_to_dataset_index = np.empty((len(slice_indices), 3), dtype=int) + for i, (s, e) in enumerate(slice_indices): ds.seek(s) start_ds_idx = ds.current_index start_offset = ds.current_offset @@ -129,12 +129,28 @@ def block_at(i): continue ds.seek(e - 1) end_ds_idx = ds.current_index - self.block_to_dataset_index[i] = ( + block_to_dataset_index[i] = ( start_ds_idx, # starting index in dataset start_offset, # starting offset within starting index end_ds_idx, # ending index in dataset ) + self._slice_indices = plasma_utils.PlasmaArray(slice_indices) + self._sizes = plasma_utils.PlasmaArray(self._sizes) + self._block_to_dataset_index = plasma_utils.PlasmaArray(block_to_dataset_index) + + @property + def slice_indices(self): + return self._slice_indices.array + + @property + def sizes(self): + return 
self._sizes.array + + @property + def block_to_dataset_index(self): + return self._block_to_dataset_index.array + def __getitem__(self, index): start_ds_idx, start_offset, end_ds_idx = self.block_to_dataset_index[index] buffer = torch.cat([ From 6398aa9e5be57c89785e059462d73b108513238d Mon Sep 17 00:00:00 2001 From: Hafiz Shafruddin Date: Thu, 8 Aug 2019 10:04:57 -0700 Subject: [PATCH 074/213] replace 'mkdir' with 'mkdir -p' (#997) Summary: Allow shell script to create sub directories with -p flag. Amends readme file too. Pull Request resolved: https://github.com/pytorch/fairseq/pull/997 Differential Revision: D16710813 Pulled By: myleott fbshipit-source-id: 89abefa27e8fac99d212fc9b7b0dbc3690043ba0 --- examples/roberta/preprocess_GLUE_tasks.sh | 4 ++-- examples/scaling_nmt/README.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/roberta/preprocess_GLUE_tasks.sh b/examples/roberta/preprocess_GLUE_tasks.sh index 52a5ffa1e9..7623566444 100755 --- a/examples/roberta/preprocess_GLUE_tasks.sh +++ b/examples/roberta/preprocess_GLUE_tasks.sh @@ -84,7 +84,7 @@ do # Strip out header and filter lines that don't have expected number of fields. rm -rf "$TASK_DATA_FOLDER/processed" - mkdir "$TASK_DATA_FOLDER/processed" + mkdir -p "$TASK_DATA_FOLDER/processed" for SPLIT in $SPLITS do # CoLA train and dev doesn't have header. @@ -178,7 +178,7 @@ do --workers 60; else # For STS-B output range is converted to be between: [0.0, 1.0] - mkdir "$TASK-bin/label" + mkdir -p "$TASK-bin/label" awk '{print $1 / 5.0 }' "$TASK_DATA_FOLDER/processed/train.label" > "$TASK-bin/label/train.label" awk '{print $1 / 5.0 }' "$TASK_DATA_FOLDER/processed/dev.label" > "$TASK-bin/label/valid.label" fi diff --git a/examples/scaling_nmt/README.md b/examples/scaling_nmt/README.md index d814436a46..1e47917baf 100644 --- a/examples/scaling_nmt/README.md +++ b/examples/scaling_nmt/README.md @@ -17,7 +17,7 @@ Then: 1. Extract the WMT'16 En-De data: ```bash TEXT=wmt16_en_de_bpe32k -mkdir $TEXT +mkdir -p $TEXT tar -xzvf wmt16_en_de.tar.gz -C $TEXT ``` From 3563e59abe6366981ae5eb7c3333efcd64225d04 Mon Sep 17 00:00:00 2001 From: Naman Goyal Date: Fri, 9 Aug 2019 09:43:16 -0700 Subject: [PATCH 075/213] added superglue dev set results to readme Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/815 Differential Revision: D16733633 fbshipit-source-id: 0a5029e41b6dbb9fb28e9703ad057d939d489d90 --- examples/roberta/README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/roberta/README.md b/examples/roberta/README.md index 537c55f3fa..5f3be7941d 100644 --- a/examples/roberta/README.md +++ b/examples/roberta/README.md @@ -24,6 +24,13 @@ Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B `roberta.large` | 90.2 | 94.7 | 92.2 | 86.6 | 96.4 | 90.9 | 68.0 | 92.4 `roberta.large.mnli` | 90.2 | - | - | - | - | - | - | - + +##### Results on SuperGLUE tasks (dev set, single model, single-task finetuning) + +Model | BoolQ | CB | COPA | MultiRC | RTE | WiC | WSC +---|---|---|---|---|---|---|--- +`roberta.large` | 86.9 | 98.2 | 94.0 | 85.7 | 89.5 | 75.6 | 91.3 + ##### Results on SQuAD (dev set) Model | SQuAD 1.1 EM/F1 | SQuAD 2.0 EM/F1 From 838e108a917ba2dda5ed552d91ec308d7be49a8c Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Fri, 9 Aug 2019 10:03:53 -0700 Subject: [PATCH 076/213] MacOS requires c++ flag (#1000) Summary: To install on MacOS, `-stdlib=libc++` needs to be specified. 
Pull Request resolved: https://github.com/pytorch/fairseq/pull/1000 Differential Revision: D16733819 Pulled By: myleott fbshipit-source-id: 7a1ed11e2b4e1071e61c64c379c84f72e02ad2b5 --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a748599895..2b1510c276 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,10 @@ After PyTorch is installed, you can install fairseq with `pip`: ``` pip install fairseq ``` - +On MacOS, +``` +CFLAGS="-stdlib=libc++" pip install fairseq +``` **Installing from source** To install fairseq from source and develop locally: From b6c55b62d72c0df3a80c2b326679bea6fa26a0cf Mon Sep 17 00:00:00 2001 From: Jingfei Du Date: Fri, 9 Aug 2019 11:03:51 -0700 Subject: [PATCH 077/213] added sentence ranking task and loss (#809) Summary: This task and loss are used for sentence ranking and multiple choice tasks such as RACE Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/809 Reviewed By: myleott Differential Revision: D16715745 Pulled By: jingfeidu fbshipit-source-id: cb4d1c7b26ebb3e2382449ba51af5745ef56f30f --- examples/roberta/README.finetune_race.md | 47 ++++++ examples/roberta/preprocess_RACE.py | 92 +++++++++++ examples/roberta/preprocess_RACE.sh | 60 +++++++ fairseq/criterions/sentence_ranking.py | 78 +++++++++ fairseq/tasks/sentence_ranking.py | 194 +++++++++++++++++++++++ 5 files changed, 471 insertions(+) create mode 100644 examples/roberta/README.finetune_race.md create mode 100644 examples/roberta/preprocess_RACE.py create mode 100755 examples/roberta/preprocess_RACE.sh create mode 100644 fairseq/criterions/sentence_ranking.py create mode 100644 fairseq/tasks/sentence_ranking.py diff --git a/examples/roberta/README.finetune_race.md b/examples/roberta/README.finetune_race.md new file mode 100644 index 0000000000..320e101487 --- /dev/null +++ b/examples/roberta/README.finetune_race.md @@ -0,0 +1,47 @@ +# Finetuning RoBERTa on RACE tasks + +### 1) Download the data from RACE website (http://www.cs.cmu.edu/~glai1/data/race/) + +### 2) Preprocess RACE data: +```bash +python ./examples/roberta/preprocess_RACE.py +./examples/roberta/preprocess_RACE.sh +``` + +### 3) Fine-tuning on RACE: + +```bash +MAX_EPOCHS=5 # epoch number +LR=1e-05 # Peak LR for fixed LR scheduler. +NUM_CLASSES=4 +MAX_SENTENCES=2 # batch size +ROBERTA_PATH=/path/to/roberta/model.pt + +CUDA_VISIBLE_DEVICES=0 python train.py / \ + --restore-file $ROBERTA_PATH \ + --max-positions 512 \ + --max-sentences $MAX_SENTENCES \ + --task sentence_ranking \ + --reset-optimizer --reset-dataloader --reset-meters \ + --required-batch-size-multiple 1 \ + --init-token 0 --separator-token 2 \ + --arch roberta_large \ + --criterion sentence_ranking \ + --num-classes $NUM_CLASSES \ + --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ + --clip-norm 0.0 \ + --lr-scheduler fixed --lr $LR \ + --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ + --max-epoch 10 \ + --update-freq 8 \ + --find-unused-parameters \ + --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric; +``` + +**Note:** + +a) As contexts in RACE are relatively long, we are using smaller batch size per GPU while increasing update-freq to achieve larger effective batch size. + +b) Above cmd-args and hyperparams are tested on one Nvidia `V100` GPU with `32gb` of memory for each task. Depending on the GPU memory resources available to you, you can use increase `--update-freq` and reduce `--max-sentences`. 
+ +c) The setting in above command is based on our hyperparam search within a fixed search space (for careful comparison across models). You might be able to find better metrics with wider hyperparam search. diff --git a/examples/roberta/preprocess_RACE.py b/examples/roberta/preprocess_RACE.py new file mode 100644 index 0000000000..4c9bba707b --- /dev/null +++ b/examples/roberta/preprocess_RACE.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +import argparse +import json +import os + + +class InputExample: + def __init__(self, paragraph, qa_list, label): + self.paragraph = paragraph + self.qa_list = qa_list + self.label = label + + +def get_examples(data_dir, set_type): + """ + Extract paragraph and question-answer list from each json file + """ + examples = [] + + levels = ["middle", "high"] + set_type_c = set_type.split('-') + if len(set_type_c) == 2: + levels = [set_type_c[1]] + set_type = set_type_c[0] + for level in levels: + cur_dir = os.path.join(data_dir, set_type, level) + for filename in os.listdir(cur_dir): + cur_path = os.path.join(cur_dir, filename) + with open(cur_path, 'r') as f: + cur_data = json.load(f) + answers = cur_data["answers"] + options = cur_data["options"] + questions = cur_data["questions"] + context = cur_data["article"].replace("\n", " ") + for i in range(len(answers)): + label = ord(answers[i]) - ord("A") + qa_list = [] + question = questions[i] + for j in range(4): + option = options[i][j] + if "_" in question: + qa_cat = question.replace("_", option) + else: + qa_cat = " ".join([question, option]) + qa_list.append(qa_cat) + examples.append(InputExample(context, qa_list, label)) + + return examples + + +def main(): + """ + Helper script to extract paragraphs questions and answers from RACE datasets. + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--input-dir", + help='input directory for downloaded RACE dataset', + ) + parser.add_argument( + "--output-dir", + help='output directory for extracted data', + ) + args = parser.parse_args() + + for set_type in ["train", "dev", "test-middle", "test-high"]: + examples = get_examples(args.input_dir, set_type) + qa_file_paths = [args.output_dir + set_type + ".input" + str(i + 1) for i in range(4)] + qa_files = [open(qa_file_path, 'w') for qa_file_path in qa_file_paths] + outf_context_path = args.output_dir + set_type + ".input0" + outf_label_path = args.output_dir + set_type + ".label" + outf_context = open(outf_context_path, 'w') + outf_label = open(outf_label_path, 'w') + for example in examples: + outf_context.write(example.paragraph + '\n') + for i in range(4): + qa_files[i].write(example.qa_list[i] + '\n') + outf_label.write(str(example.label) + '\n') + + for f in qa_files: + f.close() + outf_label.close() + outf_context.close() + + +if __name__ == '__main__': + main() diff --git a/examples/roberta/preprocess_RACE.sh b/examples/roberta/preprocess_RACE.sh new file mode 100755 index 0000000000..0957549169 --- /dev/null +++ b/examples/roberta/preprocess_RACE.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ + +# data should be downloaded and processed with reprocess_RACE.py +if [[ $# -ne 2 ]]; then + echo "Run as following:" + echo "./examples/roberta/preprocess_RACE.sh " + exit 1 +fi + +RACE_DATA_FOLDER=$1 +OUT_DATA_FOLDER=$2 + +# download bpe encoder.json, vocabulary and fairseq dictionary +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json' +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe' +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt' + +SPLITS="train dev test-middle test-high" +INPUT_TYPES="input0 input1 input2 input3 input4" +for INPUT_TYPE in $INPUT_TYPES +do + for SPLIT in $SPLITS + do + echo "BPE encoding $SPLIT/$INPUT_TYPE" + python -m examples.roberta.multiprocessing_bpe_encoder \ + --encoder-json encoder.json \ + --vocab-bpe vocab.bpe \ + --inputs "$RACE_DATA_FOLDER/$SPLIT.$INPUT_TYPE" \ + --outputs "$RACE_DATA_FOLDER/$SPLIT.$INPUT_TYPE.bpe" \ + --workers 10 \ + --keep-empty; + + done +done + +for INPUT_TYPE in $INPUT_TYPES + do + LANG="input$INPUT_TYPE" + fairseq-preprocess \ + --dataset-impl cached \ + --only-source \ + --trainpref "$RACE_DATA_FOLDER/train.$INPUT_TYPE.bpe" \ + --validpref "$RACE_DATA_FOLDER/dev.$INPUT_TYPE.bpe" \ + --testpref "$RACE_DATA_FOLDER/test-middle.$INPUT_TYPE.bpe,$RACE_DATA_FOLDER/test-high.$INPUT_TYPE.bpe" \ + --destdir "$OUT_DATA_FOLDER/$INPUT_TYPE" \ + --workers 10 \ + --srcdict dict.txt; +done + +rm -rf "$OUT_DATA_FOLDER/label" +mkdir -p "$OUT_DATA_FOLDER/label" +cp "$RACE_DATA_FOLDER/train.label" "$OUT_DATA_FOLDER/label/" +cp "$RACE_DATA_FOLDER/dev.label" "$OUT_DATA_FOLDER/label/valid.label" +cp "$RACE_DATA_FOLDER/test-middle.label" "$OUT_DATA_FOLDER/label/test.label" +cp "$RACE_DATA_FOLDER/test-high.label" "$OUT_DATA_FOLDER/label/test1.label" \ No newline at end of file diff --git a/fairseq/criterions/sentence_ranking.py b/fairseq/criterions/sentence_ranking.py new file mode 100644 index 0000000000..fef4c93bba --- /dev/null +++ b/fairseq/criterions/sentence_ranking.py @@ -0,0 +1,78 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math + +import torch +import torch.nn.functional as F + +from fairseq import utils + +from . import FairseqCriterion, register_criterion + + +@register_criterion('sentence_ranking') +class SentenceRankingCriterion(FairseqCriterion): + + def forward(self, model, sample, reduce=True): + """Compute ranking loss for the given sample. 
+ + Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + scores = [] + for idx in range(self.args.num_classes): + score, _ = model( + **sample['net_input{idx}'.format(idx=idx+1)], + features_only=True, + classification_head_name='sentence_classification_head', + ) + scores.append(score) + + logits = torch.cat(scores, dim=1) + targets = model.get_targets(sample, [logits]).view(-1) + sample_size = targets.numel() + + loss = F.nll_loss( + F.log_softmax(logits, dim=-1, dtype=torch.float32), + targets, + reduction='sum', + ) + + logging_output = { + 'loss': utils.item(loss.data) if reduce else loss.data, + 'ntokens': sample['ntokens'], + 'nsentences': sample_size, + 'sample_size': sample_size, + } + logging_output.update( + ncorrect=(logits.max(dim=1)[1] == targets).sum().item() + ) + return loss, sample_size, logging_output + + @staticmethod + def aggregate_logging_outputs(logging_outputs): + """Aggregate logging outputs from data parallel training.""" + loss_sum = sum(log.get('loss', 0) for log in logging_outputs) + ntokens = sum(log.get('ntokens', 0) for log in logging_outputs) + nsentences = sum(log.get('nsentences', 0) for log in logging_outputs) + sample_size = sum(log.get('sample_size', 0) for log in logging_outputs) + + agg_output = { + 'loss': loss_sum / sample_size / math.log(2), + 'ntokens': ntokens, + 'nsentences': nsentences, + 'sample_size': sample_size, + } + + if len(logging_outputs) > 0 and 'ncorrect' in logging_outputs[0]: + ncorrect = sum(log.get('ncorrect', 0) for log in logging_outputs) + agg_output.update(accuracy=ncorrect/nsentences) + + if sample_size != ntokens: + agg_output['nll_loss'] = loss_sum / ntokens / math.log(2) + return agg_output diff --git a/fairseq/tasks/sentence_ranking.py b/fairseq/tasks/sentence_ranking.py new file mode 100644 index 0000000000..ea79d3606a --- /dev/null +++ b/fairseq/tasks/sentence_ranking.py @@ -0,0 +1,194 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os + +import numpy as np + +from fairseq.data import ( + ConcatSentencesDataset, + data_utils, + Dictionary, + IdDataset, + NestedDictionaryDataset, + NumSamplesDataset, + NumelDataset, + OffsetTokensDataset, + PrependTokenDataset, + RawLabelDataset, + RightPadDataset, + SortDataset, + TruncateDataset, +) + +from . import FairseqTask, register_task + + +@register_task('sentence_ranking') +class SentenceRankingTask(FairseqTask): + """ + Ranking task on multiple sentences. 
+ + Args: + dictionary (Dictionary): the dictionary for the input of the task + """ + + @staticmethod + def add_args(parser): + """Add task-specific arguments to the parser.""" + parser.add_argument('data', metavar='FILE', + help='file prefix for data') + parser.add_argument('--num-classes', type=int, default=2, + help='number of sentences to be ranked') + parser.add_argument('--init-token', type=int, default=None, + help='add token at the beginning of each batch item') + parser.add_argument('--separator-token', type=int, default=None, + help='add separator token between inputs') + parser.add_argument('--no-shuffle', action='store_true', default=False) + parser.add_argument('--truncate-sequence', action='store_true', default=False, + help='Truncate sequence to max_sequence_length') + parser.add_argument('--max-option-length', type=int, default=None, + help='max length for each option') + + def __init__(self, args, dictionary): + super().__init__(args) + self.dictionary = dictionary + + @classmethod + def load_dictionary(cls, args, filename, source=True): + """Load the dictionary from the filename + + Args: + filename (str): the filename + """ + dictionary = Dictionary.load(filename) + dictionary.add_symbol('') + return dictionary + + @classmethod + def setup_task(cls, args, **kwargs): + assert args.criterion == 'sentence_ranking', \ + 'Must set --criterion=sentence_ranking' + + args.tokens_per_sample = args.max_positions + + # load data dictionary + data_dict = cls.load_dictionary( + args, + os.path.join(args.data, 'input0', 'dict.txt'), + source=True, + ) + print('| [input] dictionary: {} types'.format(len(data_dict))) + return SentenceRankingTask(args, data_dict) + + def load_dataset(self, split, combine=False, **kwargs): + """Load a given dataset split (e.g., train, valid, test).""" + + def get_path(type, split): + return os.path.join(self.args.data, type, split) + + def make_dataset(type, dictionary): + split_path = get_path(type, split) + + dataset = data_utils.load_indexed_dataset( + split_path, + self.source_dictionary, + self.args.dataset_impl, + combine=combine, + ) + return dataset + + input0 = make_dataset('input0', self.source_dictionary) + input_options = [ + make_dataset( + 'input{idx}'.format(idx=idx + 1), + self.source_dictionary + ) + for idx in range(self.args.num_classes) + ] + + if self.args.separator_token is not None: + input0 = PrependTokenDataset(input0, self.args.separator_token) + + src_tokens = [] + for input_option in input_options: + if self.args.init_token is not None: + input_option = PrependTokenDataset(input_option, self.args.init_token) + input_option = TruncateDataset(input_option, self.args.max_option_length) + src_token = ConcatSentencesDataset(input_option, input0) + if self.args.truncate_sequence: + src_token = TruncateDataset(src_token, self.args.max_positions) + src_tokens.append(src_token) + + with data_utils.numpy_seed(self.args.seed): + shuffle = np.random.permutation(len(src_tokens[0])) + + dataset = { + 'id': IdDataset(), + 'nsentences': NumSamplesDataset(), + 'ntokens': NumelDataset(src_tokens[0], reduce=True), + } + + for src_token_idx in range(len(src_tokens)): + dataset.update( + { + 'net_input{idx}'.format(idx=src_token_idx+1): { + 'src_tokens': RightPadDataset( + src_tokens[src_token_idx], + pad_idx=self.source_dictionary.pad(), + ), + 'src_lengths': NumelDataset(src_tokens[src_token_idx], reduce=False), + } + } + ) + + label_path = f"{get_path('label', split)}.label" + if os.path.exists(label_path): + dataset.update( + 
target=RawLabelDataset([ + int(x.strip()) for x in open(label_path).readlines() + ]) + ) + + nested_dataset = NestedDictionaryDataset( + dataset, + sizes=[np.maximum.reduce([src_token.sizes for src_token in src_tokens])], + ) + + if self.args.no_shuffle: + dataset = nested_dataset + else: + dataset = SortDataset( + nested_dataset, + # shuffle + sort_order=[shuffle], + ) + + print("| Loaded {0} with #samples: {1}".format(split, len(dataset))) + + self.datasets[split] = dataset + return self.datasets[split] + + def build_model(self, args): + from fairseq import models + model = models.build_model(args, self) + + model.register_classification_head( + 'sentence_classification_head', + num_classes=1, + ) + + return model + + def max_positions(self): + return self.args.max_positions + + @property + def source_dictionary(self): + return self.dictionary + + @property + def target_dictionary(self): + return self.dictionary From a00ce132445703291626918bd1d6b7c05c9f7144 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Fri, 9 Aug 2019 22:35:41 -0700 Subject: [PATCH 078/213] Fix Python 3.5 compat Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1005 Differential Revision: D16751489 Pulled By: myleott fbshipit-source-id: 6e372ac23643e32a3791044c13f4466bdc28f049 --- fairseq/tasks/sentence_ranking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fairseq/tasks/sentence_ranking.py b/fairseq/tasks/sentence_ranking.py index ea79d3606a..259423318f 100644 --- a/fairseq/tasks/sentence_ranking.py +++ b/fairseq/tasks/sentence_ranking.py @@ -144,7 +144,7 @@ def make_dataset(type, dictionary): } ) - label_path = f"{get_path('label', split)}.label" + label_path = '{}.label'.format(get_path('label', split)) if os.path.exists(label_path): dataset.update( target=RawLabelDataset([ From 832491962b30fb2164bed696e1489685a885402f Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sat, 10 Aug 2019 08:13:26 -0700 Subject: [PATCH 079/213] Add WSC task and criterion Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1004 Differential Revision: D16751443 Pulled By: myleott fbshipit-source-id: f70acd6c7be6d69da45b5b32fe4c4eff021539ab --- examples/roberta/README.md | 107 ++++++---- examples/roberta/README.wsc.md | 83 ++++++++ examples/roberta/wsc/__init__.py | 7 + examples/roberta/wsc/wsc_criterion.py | 131 ++++++++++++ examples/roberta/wsc/wsc_task.py | 260 ++++++++++++++++++++++++ examples/roberta/wsc/wsc_utils.py | 219 ++++++++++++++++++++ fairseq/checkpoint_utils.py | 2 + fairseq/data/__init__.py | 2 + fairseq/data/list_dataset.py | 29 +++ fairseq/hub_utils.py | 3 + fairseq/models/roberta/hub_interface.py | 27 ++- fairseq/models/roberta/model.py | 1 + fairseq/progress_bar.py | 3 +- fairseq/tasks/masked_lm.py | 12 +- fairseq/tasks/translation_moe.py | 10 +- fairseq/utils.py | 13 ++ hubconf.py | 2 +- 17 files changed, 848 insertions(+), 63 deletions(-) create mode 100644 examples/roberta/README.wsc.md create mode 100644 examples/roberta/wsc/__init__.py create mode 100644 examples/roberta/wsc/wsc_criterion.py create mode 100644 examples/roberta/wsc/wsc_task.py create mode 100644 examples/roberta/wsc/wsc_utils.py create mode 100644 fairseq/data/list_dataset.py diff --git a/examples/roberta/README.md b/examples/roberta/README.md index 5f3be7941d..5b80fe94cf 100644 --- a/examples/roberta/README.md +++ b/examples/roberta/README.md @@ -12,7 +12,8 @@ Model | Description | # params | Download ---|---|---|--- `roberta.base` | RoBERTa using the BERT-base architecture | 125M | 
[roberta.base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz) `roberta.large` | RoBERTa using the BERT-large architecture | 355M | [roberta.large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz) -`roberta.large.mnli` | `roberta.large` finetuned on MNLI | 355M | [roberta.large.mnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.mnli.tar.gz) +`roberta.large.mnli` | `roberta.large` finetuned on [MNLI](http://www.nyu.edu/projects/bowman/multinli) | 355M | [roberta.large.mnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.mnli.tar.gz) +`roberta.large.wsc` | `roberta.large` finetuned on [WSC](https://cs.nyu.edu/faculty/davise/papers/WinogradSchemas/WS.html) | 355M | [roberta.large.wsc.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.wsc.tar.gz) ## Results @@ -24,12 +25,12 @@ Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B `roberta.large` | 90.2 | 94.7 | 92.2 | 86.6 | 96.4 | 90.9 | 68.0 | 92.4 `roberta.large.mnli` | 90.2 | - | - | - | - | - | - | - - ##### Results on SuperGLUE tasks (dev set, single model, single-task finetuning) Model | BoolQ | CB | COPA | MultiRC | RTE | WiC | WSC ---|---|---|---|---|---|---|--- -`roberta.large` | 86.9 | 98.2 | 94.0 | 85.7 | 89.5 | 75.6 | 91.3 +`roberta.large` | 86.9 | 98.2 | 94.0 | 85.7 | 89.5 | 75.6 | - +`roberta.large.wsc` | - | - | - | - | - | - | 91.3 ##### Results on SQuAD (dev set) @@ -83,28 +84,6 @@ assert len(all_layers) == 25 assert torch.all(all_layers[-1] == last_layer_features) ``` -By default RoBERTa outputs one feature vector per BPE token. You can instead -realign the features to match [spaCy's word-level tokenization](https://spacy.io/usage/linguistic-features#tokenization) -with the `extract_features_aligned_to_words` method. This will compute a -weighted average of the BPE-level features for each word and expose them in -spaCy's `Token.vector` attribute: -```python -doc = roberta.extract_features_aligned_to_words('I said, "hello RoBERTa."') -assert len(doc) == 10 -for tok in doc: - print('{:10}{} (...)'.format(str(tok), tok.vector[:5])) -# tensor([-0.1316, -0.0386, -0.0832, -0.0477, 0.1943], grad_fn=) (...) -# I tensor([ 0.0559, 0.1541, -0.4832, 0.0880, 0.0120], grad_fn=) (...) -# said tensor([-0.1565, -0.0069, -0.8915, 0.0501, -0.0647], grad_fn=) (...) -# , tensor([-0.1318, -0.0387, -0.0834, -0.0477, 0.1944], grad_fn=) (...) -# " tensor([-0.0486, 0.1818, -0.3946, -0.0553, 0.0981], grad_fn=) (...) -# hello tensor([ 0.0079, 0.1799, -0.6204, -0.0777, -0.0923], grad_fn=) (...) -# RoBERTa tensor([-0.2339, -0.1184, -0.7343, -0.0492, 0.5829], grad_fn=) (...) -# . tensor([-0.1341, -0.1203, -0.1012, -0.0621, 0.1892], grad_fn=) (...) -# " tensor([-0.1341, -0.1203, -0.1012, -0.0621, 0.1892], grad_fn=) (...) -# tensor([-0.0930, -0.0392, -0.0821, 0.0158, 0.0649], grad_fn=) (...) -``` - ##### Use RoBERTa for sentence-pair classification tasks: ```python # Download RoBERTa already finetuned for MNLI @@ -141,22 +120,79 @@ roberta.cuda() roberta.predict('new_task', tokens) # tensor([[-1.1050, -1.0672, -1.1245]], device='cuda:0', grad_fn=) ``` -##### Filling mask: -Some examples from the [Natural Questions dataset](https://ai.google.com/research/NaturalQuestions/). +## Advanced usage + +#### Filling masks: + +RoBERTa can be used to fill `` tokens in the input. 
Some examples from the +[Natural Questions dataset](https://ai.google.com/research/NaturalQuestions/): ```python ->>> roberta.fill_mask("The first Star wars movie came out in ", topk=3) -[('The first Star wars movie came out in 1977', 0.9504712224006653), ('The first Star wars movie came out in 1978', 0.009986752644181252), ('The first Star wars movie came out in 1979', 0.00957468245178461)] +roberta.fill_mask('The first Star wars movie came out in ', topk=3) +# [('The first Star wars movie came out in 1977', 0.9504712224006653), ('The first Star wars movie came out in 1978', 0.009986752644181252), ('The first Star wars movie came out in 1979', 0.00957468245178461)] + +roberta.fill_mask('Vikram samvat calender is official in ', topk=3) +# [('Vikram samvat calender is official in India', 0.21878768503665924), ('Vikram samvat calender is official in Delhi', 0.08547217398881912), ('Vikram samvat calender is official in Gujarat', 0.07556255906820297)] + +roberta.fill_mask(' is the common currency of the European Union', topk=3) +# [('Euro is the common currency of the European Union', 0.945650577545166), ('euro is the common currency of the European Union', 0.025747718289494514), ('€ is the common currency of the European Union', 0.011183015070855618)] +``` ->>> roberta.fill_mask("Vikram samvat calender is official in ", topk=3) -[('Vikram samvat calender is official in India', 0.21878768503665924), ('Vikram samvat calender is official in Delhi', 0.08547217398881912), ('Vikram samvat calender is official in Gujarat', 0.07556255906820297)] +#### Pronoun disambiguation (Winograd Schema Challenge): ->>> roberta.fill_mask(" is the common currency of the European Union", topk=3) -[('Euro is the common currency of the European Union', 0.945650577545166), ('euro is the common currency of the European Union', 0.025747718289494514), ('€ is the common currency of the European Union', 0.011183015070855618)] +RoBERTa can be used to disambiguate pronouns. First install spaCy and download the English-language model: +```bash +pip install spacy +python -m spacy download en_core_web_lg +``` + +Next load the `roberta.large.wsc` model and call the `disambiguate_pronoun` +function. The pronoun should be surrounded by square brackets (`[]`) and the +query referent surrounded by underscores (`_`), or left blank to return the +predicted candidate text directly: +```python +roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.wsc', user_dir='examples/roberta/wsc') +roberta.cuda() # use the GPU (optional) + +roberta.disambiguate_pronoun('The _trophy_ would not fit in the brown suitcase because [it] was too big.') +# True +roberta.disambiguate_pronoun('The trophy would not fit in the brown _suitcase_ because [it] was too big.') +# False + +roberta.disambiguate_pronoun('The city councilmen refused the demonstrators a permit because [they] feared violence.') +# 'The city councilmen' +roberta.disambiguate_pronoun('The city councilmen refused the demonstrators a permit because [they] advocated violence.') +# 'demonstrators' +``` + +See the [RoBERTA Winograd Schema Challenge (WSC) README](README.wsc.md) for more details on how to train this model. + +#### Extract features aligned to words: + +By default RoBERTa outputs one feature vector per BPE token. You can instead +realign the features to match [spaCy's word-level tokenization](https://spacy.io/usage/linguistic-features#tokenization) +with the `extract_features_aligned_to_words` method. 
This will compute a +weighted average of the BPE-level features for each word and expose them in +spaCy's `Token.vector` attribute: +```python +doc = roberta.extract_features_aligned_to_words('I said, "hello RoBERTa."') +assert len(doc) == 10 +for tok in doc: + print('{:10}{} (...)'.format(str(tok), tok.vector[:5])) +# tensor([-0.1316, -0.0386, -0.0832, -0.0477, 0.1943], grad_fn=) (...) +# I tensor([ 0.0559, 0.1541, -0.4832, 0.0880, 0.0120], grad_fn=) (...) +# said tensor([-0.1565, -0.0069, -0.8915, 0.0501, -0.0647], grad_fn=) (...) +# , tensor([-0.1318, -0.0387, -0.0834, -0.0477, 0.1944], grad_fn=) (...) +# " tensor([-0.0486, 0.1818, -0.3946, -0.0553, 0.0981], grad_fn=) (...) +# hello tensor([ 0.0079, 0.1799, -0.6204, -0.0777, -0.0923], grad_fn=) (...) +# RoBERTa tensor([-0.2339, -0.1184, -0.7343, -0.0492, 0.5829], grad_fn=) (...) +# . tensor([-0.1341, -0.1203, -0.1012, -0.0621, 0.1892], grad_fn=) (...) +# " tensor([-0.1341, -0.1203, -0.1012, -0.0621, 0.1892], grad_fn=) (...) +# tensor([-0.0930, -0.0392, -0.0821, 0.0158, 0.0649], grad_fn=) (...) ``` -##### Evaluating the `roberta.large.mnli` model +#### Evaluating the `roberta.large.mnli` model: -Example python code snippet to evaluate accuracy on the MNLI dev_matched set. +Example python code snippet to evaluate accuracy on the MNLI `dev_matched` set. ```python label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'} ncorrect, nsamples = 0, 0 @@ -181,6 +217,7 @@ print('| Accuracy: ', float(ncorrect)/float(nsamples)) - [Finetuning on GLUE](README.finetune_glue.md) - [Finetuning on custom classification tasks (e.g., IMDB)](README.finetune_custom_classification.md) +- [Finetuning on Winograd Schema Challenge (WSC)](README.wsc.md) - Finetuning on SQuAD: coming soon ## Pretraining using your own data diff --git a/examples/roberta/README.wsc.md b/examples/roberta/README.wsc.md new file mode 100644 index 0000000000..b1437d1de7 --- /dev/null +++ b/examples/roberta/README.wsc.md @@ -0,0 +1,83 @@ +# Finetuning RoBERTa on Winograd Schema Challenge (WSC) data + +The following instructions can be used to finetune RoBERTa on the WSC training +data provided by [SuperGLUE](https://super.gluebenchmark.com/). + +Note that there is high variance in the results. For our GLUE/SuperGLUE +submission we swept over the learning rate, batch size and total number of +updates, as well as the random seed. Out of ~100 runs we chose the best 7 models +and ensembled them. + +**Note:** The instructions below use a slightly different loss function than +what's described in the original RoBERTa arXiv paper. In particular, +[Kocijan et al. (2019)](https://arxiv.org/abs/1905.06290) introduce a margin +ranking loss between `(query, candidate)` pairs with tunable hyperparameters +alpha and beta. This is supported in our code as well with the `--wsc-alpha` and +`--wsc-beta` arguments. However, we achieved slightly better (and more robust) +results on the development set by instead using a single cross entropy loss term +over the log-probabilities for the query and all candidates. This reduces the +number of hyperparameters and our best model achieved 92.3% development set +accuracy, compared to ~90% accuracy for the margin loss. Later versions of the +RoBERTa arXiv paper will describe this updated formulation. 
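+
+To make the two formulations concrete, below is a minimal sketch of how the
+per-example loss is computed from the span log-probabilities. It mirrors, but
+simplifies, the `wsc` criterion added in `examples/roberta/wsc/wsc_criterion.py`
+(see below): `query_lprobs` is the average token log-probability of the correct
+(query) span, `cand_lprobs` holds the same quantity for the other candidate
+spans, and only positive examples contribute to the loss. The `wsc_loss` helper
+is illustrative only.
+
+```python
+import torch
+import torch.nn.functional as F
+
+
+def wsc_loss(query_lprobs, cand_lprobs, use_cross_entropy=True, alpha=1.0, beta=0.0):
+    # query_lprobs: shape (1,) -- log-prob of the correct (query) span
+    # cand_lprobs: shape (num_candidates,) -- log-probs of the distractor spans
+    if use_cross_entropy:
+        # a single cross entropy term over the query and all candidates,
+        # where the query (index 0) is the target class
+        return F.cross_entropy(
+            torch.cat([query_lprobs, cand_lprobs]).unsqueeze(0),
+            query_lprobs.new([0]).long(),
+        )
+    # margin ranking loss from Kocijan et al. (2019); the criterion exposes
+    # alpha and beta as --wsc-margin-alpha and --wsc-margin-beta
+    return (
+        - query_lprobs
+        + alpha * (cand_lprobs - query_lprobs + beta).clamp(min=0)
+    ).sum()
+```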
+ +### 1) Download the WSC data from the SuperGLUE website: +```bash +wget https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WSC.zip +unzip WSC.zip + +# we also need to copy the RoBERTa dictionary into the same directory +wget -O WSC/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt +``` + +### 2) Finetune over the provided training data: +```bash +TOTAL_NUM_UPDATES=2000 # Total number of training steps. +WARMUP_UPDATES=250 # Linearly increase LR over this many steps. +LR=2e-05 # Peak LR for polynomial LR scheduler. +MAX_SENTENCES=16 # Batch size per GPU. +SEED=1 # Random seed. +ROBERTA_PATH=/path/to/roberta/model.pt + +# we use the --user-dir option to load the task and criterion +# from the examples/roberta/wsc directory: +FAIRSEQ_PATH=/path/to/fairseq +FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/wsc + +cd fairseq +CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train WSC/ \ + --restore-file $ROBERTA_PATH \ + --reset-optimizer --reset-dataloader --reset-meters \ + --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \ + --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ + --valid-subset val \ + --fp16 --ddp-backend no_c10d \ + --user-dir $FAIRSEQ_USER_DIR \ + --task wsc --criterion wsc --wsc-cross-entropy \ + --arch roberta_large --bpe gpt2 --max-positions 512 \ + --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \ + --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 \ + --lr-scheduler polynomial_decay --lr $LR \ + --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_NUM_UPDATES \ + --max-sentences $MAX_SENTENCES \ + --max-update $TOTAL_NUM_UPDATES \ + --log-format simple --log-interval 100 +``` + +The above command assumes training on 4 GPUs, but you can achieve the same +results on a single GPU by adding `--update-freq=4`. + +### 3) Evaluate +```python +from fairseq.models.roberta import RobertaModel +from examples.roberta.wsc import wsc_utils # also loads WSC task and criterion +roberta = RobertaModel.from_pretrained('checkpoints', 'checkpoint_best.pt', 'WSC/') +roberta.cuda() +nsamples, ncorrect = 0, 0 +for sentence, label in wsc_utils.jsonl_iterator('WSC/val.jsonl', eval=True): + pred = roberta.disambiguate_pronoun(sentence) + nsamples += 1 + if pred == label: + ncorrect += 1 +print('Accuracy: ' + str(ncorrect / float(nsamples))) +# Accuracy: 0.9230769230769231 +``` diff --git a/examples/roberta/wsc/__init__.py b/examples/roberta/wsc/__init__.py new file mode 100644 index 0000000000..78afa4728e --- /dev/null +++ b/examples/roberta/wsc/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from . import wsc_criterion # noqa +from . import wsc_task # noqa diff --git a/examples/roberta/wsc/wsc_criterion.py b/examples/roberta/wsc/wsc_criterion.py new file mode 100644 index 0000000000..c5b6507f9a --- /dev/null +++ b/examples/roberta/wsc/wsc_criterion.py @@ -0,0 +1,131 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import math + +import torch +import torch.nn.functional as F + +from fairseq import utils +from fairseq.data import encoders +from fairseq.criterions import FairseqCriterion, register_criterion + + +@register_criterion('wsc') +class WSCCriterion(FairseqCriterion): + + def __init__(self, args, task): + super().__init__(args, task) + if self.args.save_predictions is not None: + self.prediction_h = open(self.args.save_predictions, 'w') + else: + self.prediction_h = None + self.bpe = encoders.build_bpe(args) + self.tokenizer = encoders.build_tokenizer(args) + + def __del__(self): + if self.prediction_h is not None: + self.prediction_h.close() + + @staticmethod + def add_args(parser): + """Add criterion-specific arguments to the parser.""" + parser.add_argument('--wsc-margin-alpha', type=float, metavar='A', default=1.0) + parser.add_argument('--wsc-margin-beta', type=float, metavar='B', default=0.0) + parser.add_argument('--wsc-cross-entropy', action='store_true', + help='use cross entropy formulation instead of margin loss') + parser.add_argument('--save-predictions', metavar='FILE', + help='file to save predictions to') + + def forward(self, model, sample, reduce=True): + + def get_masked_input(tokens, mask): + masked_tokens = tokens.clone() + masked_tokens[mask] = self.task.mask + return masked_tokens + + def get_lprobs(tokens, mask): + logits, _ = model(src_tokens=get_masked_input(tokens, mask)) + lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float) + scores = lprobs.gather(2, tokens.unsqueeze(-1)).squeeze(-1) + mask = mask.type_as(scores) + scores = (scores * mask).sum(dim=-1) / mask.sum(dim=-1) + return scores + + # compute loss and accuracy + loss, nloss = 0., 0 + ncorrect, nqueries = 0, 0 + for i, label in enumerate(sample['labels']): + query_lprobs = get_lprobs( + sample['query_tokens'][i].unsqueeze(0), + sample['query_masks'][i].unsqueeze(0), + ) + cand_lprobs = get_lprobs( + sample['candidate_tokens'][i], + sample['candidate_masks'][i], + ) + + pred = (query_lprobs >= cand_lprobs).all().item() + + if label is not None: + label = 1 if label else 0 + ncorrect += 1 if pred == label else 0 + nqueries += 1 + + if label: + # only compute a loss for positive instances + nloss += 1 + if self.args.wsc_cross_entropy: + loss += F.cross_entropy( + torch.cat([query_lprobs, cand_lprobs]).unsqueeze(0), + query_lprobs.new([0]).long(), + ) + else: + loss += ( + - query_lprobs + + self.args.wsc_margin_alpha * ( + cand_lprobs - query_lprobs + self.args.wsc_margin_beta + ).clamp(min=0) + ).sum() + + id = sample['id'][i].item() + if self.prediction_h is not None: + print('{}\t{}\t{}'.format(id, pred, label), file=self.prediction_h) + + if nloss == 0: + loss = torch.tensor(0.0, requires_grad=True) + + sample_size = nqueries if nqueries > 0 else 1 + logging_output = { + 'loss': utils.item(loss.data) if reduce else loss.data, + 'ntokens': sample['ntokens'], + 'nsentences': sample['nsentences'], + 'sample_size': sample_size, + 'ncorrect': ncorrect, + 'nqueries': nqueries, + } + return loss, sample_size, logging_output + + @staticmethod + def aggregate_logging_outputs(logging_outputs): + """Aggregate logging outputs from data parallel training.""" + loss_sum = sum(log.get('loss', 0) for log in logging_outputs) + ntokens = sum(log.get('ntokens', 0) for log in logging_outputs) + nsentences = sum(log.get('nsentences', 0) for log in logging_outputs) + sample_size = sum(log.get('sample_size', 0) for log in logging_outputs) + + agg_output = { + 'loss': loss_sum / sample_size / math.log(2), + 'ntokens': 
ntokens, + 'nsentences': nsentences, + 'sample_size': sample_size, + } + + ncorrect = sum(log.get('ncorrect', 0) for log in logging_outputs) + nqueries = sum(log.get('nqueries', 0) for log in logging_outputs) + if nqueries > 0: + agg_output['accuracy'] = ncorrect / float(nqueries) + + return agg_output diff --git a/examples/roberta/wsc/wsc_task.py b/examples/roberta/wsc/wsc_task.py new file mode 100644 index 0000000000..7fd09fc77c --- /dev/null +++ b/examples/roberta/wsc/wsc_task.py @@ -0,0 +1,260 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import json +import os +import tempfile + +import numpy as np +import torch +import torch.nn.functional as F + +from fairseq import utils +from fairseq.data import ( + data_utils, + Dictionary, + encoders, + IdDataset, + ListDataset, + NestedDictionaryDataset, + NumSamplesDataset, + NumelDataset, + SortDataset, +) +from fairseq.tasks import FairseqTask, register_task + +from . import wsc_utils + + +@register_task('wsc') +class WSCTask(FairseqTask): + """Task to finetune RoBERTa for Winograd Schemas.""" + + @staticmethod + def add_args(parser): + """Add task-specific arguments to the parser.""" + parser.add_argument('data', metavar='DIR', + help='path to data directory; we load .jsonl') + parser.add_argument('--init-token', type=int, default=None, + help='add token at the beginning of each batch item') + + def __init__(self, args, vocab): + super().__init__(args) + self.vocab = vocab + self.mask = vocab.add_symbol('') + + self.bpe = encoders.build_bpe(args) + self.tokenizer = encoders.build_tokenizer(args) + + # hack to handle GPT-2 BPE, which includes leading spaces + if args.bpe == 'gpt2': + self.leading_space = True + self.trailing_space = False + else: + self.leading_space = False + self.trailing_space = True + + @classmethod + def load_dictionary(cls, filename): + """Load the dictionary from the filename + + Args: + filename (str): the filename + """ + dictionary = Dictionary.load(filename) + dictionary.add_symbol('') + return dictionary + + @classmethod + def setup_task(cls, args, **kwargs): + assert args.criterion == 'wsc', 'Must set --criterion=wsc' + + # load data and label dictionaries + vocab = cls.load_dictionary(os.path.join(args.data, 'dict.txt')) + print('| dictionary: {} types'.format(len(vocab))) + + return cls(args, vocab) + + def load_dataset(self, split, epoch=0, combine=False, data_path=None, return_only=False, **kwargs): + """Load a given dataset split. 
+ + Args: + split (str): name of the split (e.g., train, valid, test) + """ + + def binarize(s: str, append_eos: bool = False): + if self.tokenizer is not None: + s = self.tokenizer.encode(s) + if self.bpe is not None: + s = self.bpe.encode(s) + tokens = self.vocab.encode_line( + s, append_eos=append_eos, add_if_not_exist=False, + ).long() + if self.args.init_token is not None: + tokens = torch.cat([tokens.new([self.args.init_token]), tokens]) + return tokens + + if data_path is None: + data_path = os.path.join(self.args.data, split + '.jsonl') + if not os.path.exists(data_path): + raise FileNotFoundError('Cannot find data: {}'.format(data_path)) + + query_tokens = [] + query_masks = [] + query_lengths = [] + candidate_tokens = [] + candidate_masks = [] + candidate_lengths = [] + labels = [] + + for sentence, pronoun_span, query, label in wsc_utils.jsonl_iterator(data_path): + prefix = sentence[:pronoun_span.start].text + suffix = sentence[pronoun_span.end:].text_with_ws + + # spaCy spans include trailing spaces, but we need to know about + # leading spaces for the GPT-2 BPE + leading_space = ' ' if sentence[:pronoun_span.start].text_with_ws.endswith(' ') else '' + trailing_space = ' ' if pronoun_span.text_with_ws.endswith(' ') else '' + + # get noun phrases, excluding pronouns and anything overlapping with the query + cand_spans = wsc_utils.filter_noun_chunks( + wsc_utils.extended_noun_chunks(sentence), + exclude_pronouns=True, + exclude_query=query, + exact_match=False, + ) + + def binarize_with_mask(txt): + toks = binarize( + prefix + leading_space + txt + trailing_space + suffix, + append_eos=True, + ) + mask = torch.zeros_like(toks, dtype=torch.uint8) + mask_start = len(binarize(prefix)) + mask_size = len(binarize(leading_space + txt)) + mask[mask_start:mask_start + mask_size] = 1 + return toks, mask + + if query is not None: + query_toks, query_mask = binarize_with_mask(query) + query_len = len(query_toks) + else: + query_toks, query_mask, query_len = None, None, 0 + + query_tokens.append(query_toks) + query_masks.append(query_mask) + query_lengths.append(query_len) + + cand_toks, cand_masks = [], [] + for cand_span in cand_spans: + toks, mask = binarize_with_mask(cand_span.text) + cand_toks.append(toks) + cand_masks.append(mask) + + # collate candidates + cand_toks = data_utils.collate_tokens(cand_toks, pad_idx=self.vocab.pad()) + cand_masks = data_utils.collate_tokens(cand_masks, pad_idx=0) + assert cand_toks.size() == cand_masks.size() + + candidate_tokens.append(cand_toks) + candidate_masks.append(cand_masks) + candidate_lengths.append(cand_toks.size(1)) + + labels.append(label) + + query_lengths = np.array(query_lengths) + query_tokens = ListDataset(query_tokens, query_lengths) + query_masks = ListDataset(query_masks, query_lengths) + + candidate_lengths = np.array(candidate_lengths) + candidate_tokens = ListDataset(candidate_tokens, candidate_lengths) + candidate_masks = ListDataset(candidate_masks, candidate_lengths) + + labels = ListDataset(labels, [1]*len(labels)) + + dataset = { + 'id': IdDataset(), + 'query_tokens': query_tokens, + 'query_masks': query_masks, + 'candidate_tokens': candidate_tokens, + 'candidate_masks': candidate_masks, + 'labels': labels, + 'nsentences': NumSamplesDataset(), + 'ntokens': NumelDataset(query_tokens, reduce=True), + } + + nested_dataset = NestedDictionaryDataset( + dataset, + sizes=[query_lengths], + ) + + with data_utils.numpy_seed(self.args.seed): + shuffle = np.random.permutation(len(query_tokens)) + dataset = SortDataset( + 
nested_dataset, + # shuffle + sort_order=[shuffle], + ) + + if return_only: + return dataset + + self.datasets[split] = dataset + return self.datasets[split] + + def build_dataset_for_inference(self, sample_json): + with tempfile.NamedTemporaryFile(buffering=0) as h: + h.write((json.dumps(sample_json) + '\n').encode('utf-8')) + dataset = self.load_dataset( + 'disambiguate_pronoun', + data_path=h.name, + return_only=True, + ) + return dataset + + def disambiguate_pronoun(self, model, sentence, use_cuda=False): + sample_json = wsc_utils.convert_sentence_to_json(sentence) + dataset = self.build_dataset_for_inference(sample_json) + sample = dataset.collater([dataset[0]]) + if use_cuda: + sample = utils.move_to_cuda(sample) + + def get_masked_input(tokens, mask): + masked_tokens = tokens.clone() + masked_tokens[mask] = self.mask + return masked_tokens + + def get_lprobs(tokens, mask): + logits, _ = model(src_tokens=get_masked_input(tokens, mask)) + lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float) + scores = lprobs.gather(2, tokens.unsqueeze(-1)).squeeze(-1) + mask = mask.type_as(scores) + scores = (scores * mask).sum(dim=-1) / mask.sum(dim=-1) + return scores + + cand_lprobs = get_lprobs( + sample['candidate_tokens'][0], + sample['candidate_masks'][0], + ) + if sample['query_tokens'][0] is not None: + query_lprobs = get_lprobs( + sample['query_tokens'][0].unsqueeze(0), + sample['query_masks'][0].unsqueeze(0), + ) + return (query_lprobs >= cand_lprobs).all().item() == 1 + else: + best_idx = cand_lprobs.argmax().item() + full_cand = sample['candidate_tokens'][0][best_idx] + mask = sample['candidate_masks'][0][best_idx] + toks = full_cand[mask] + return self.bpe.decode(self.source_dictionary.string(toks)).strip() + + @property + def source_dictionary(self): + return self.vocab + + @property + def target_dictionary(self): + return self.vocab diff --git a/examples/roberta/wsc/wsc_utils.py b/examples/roberta/wsc/wsc_utils.py new file mode 100644 index 0000000000..ef388665fd --- /dev/null +++ b/examples/roberta/wsc/wsc_utils.py @@ -0,0 +1,219 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +from functools import lru_cache +import json + + +def convert_sentence_to_json(sentence): + if '_' in sentence: + prefix, rest = sentence.split('_', 1) + query, rest = rest.split('_', 1) + query_index = len(prefix.rstrip().split(' ')) + else: + query, query_index = None, None + + prefix, rest = sentence.split('[', 1) + pronoun, rest = rest.split(']', 1) + pronoun_index = len(prefix.rstrip().split(' ')) + + sentence = sentence.replace('_', '').replace('[', '').replace(']', '') + + return { + 'idx': 0, + 'text': sentence, + 'target': { + 'span1_index': query_index, + 'span1_text': query, + 'span2_index': pronoun_index, + 'span2_text': pronoun, + }, + } + + +def extended_noun_chunks(sentence): + noun_chunks = {(np.start, np.end) for np in sentence.noun_chunks} + np_start, cur_np = 0, 'NONE' + for i, token in enumerate(sentence): + np_type = token.pos_ if token.pos_ in {'NOUN', 'PROPN'} else 'NONE' + if np_type != cur_np: + if cur_np != 'NONE': + noun_chunks.add((np_start, i)) + if np_type != 'NONE': + np_start = i + cur_np = np_type + if cur_np != 'NONE': + noun_chunks.add((np_start, len(sentence))) + return [sentence[s:e] for (s, e) in sorted(noun_chunks)] + + +def find_token(sentence, start_pos): + found_tok = None + for tok in sentence: + if tok.idx == start_pos: + found_tok = tok + break + return found_tok + + +def find_span(sentence, search_text, start=0): + search_text = search_text.lower() + for tok in sentence[start:]: + remainder = sentence[tok.i:].text.lower() + if remainder.startswith(search_text): + len_to_consume = len(search_text) + start_idx = tok.idx + for next_tok in sentence[tok.i:]: + end_idx = next_tok.idx + len(next_tok.text) + if end_idx - start_idx == len_to_consume: + span = sentence[tok.i:next_tok.i + 1] + return span + return None + + +@lru_cache(maxsize=1) +def get_detokenizer(): + from sacremoses import MosesDetokenizer + detok = MosesDetokenizer(lang='en') + return detok + + +@lru_cache(maxsize=1) +def get_spacy_nlp(): + import en_core_web_lg + nlp = en_core_web_lg.load() + return nlp + + +def jsonl_iterator(input_fname, positive_only=False, ngram_order=3, eval=False): + detok = get_detokenizer() + nlp = get_spacy_nlp() + + with open(input_fname) as fin: + for line in fin: + sample = json.loads(line.strip()) + + if positive_only and 'label' in sample and not sample['label']: + # only consider examples where the query is correct + continue + + target = sample['target'] + + # clean up the query + query = target['span1_text'] + if query is not None: + if '\n' in query: + continue + if query.endswith('.') or query.endswith(','): + query = query[:-1] + + # split tokens + tokens = sample['text'].split(' ') + + def strip_pronoun(x): + return x.rstrip('.,"') + + # find the pronoun + pronoun_idx = target['span2_index'] + pronoun = strip_pronoun(target['span2_text']) + if strip_pronoun(tokens[pronoun_idx]) != pronoun: + # hack: sometimes the index is misaligned + if strip_pronoun(tokens[pronoun_idx + 1]) == pronoun: + pronoun_idx += 1 + else: + raise Exception('Misaligned pronoun!') + assert strip_pronoun(tokens[pronoun_idx]) == pronoun + + # split tokens before and after the pronoun + before = tokens[:pronoun_idx] + after = tokens[pronoun_idx + 1:] + + # the GPT BPE attaches leading spaces to tokens, so we keep track + # of whether we need spaces before or after the pronoun + leading_space = ' ' if pronoun_idx > 0 else '' + trailing_space = ' ' if len(after) > 0 else '' + + # detokenize + before = detok.detokenize(before, return_str=True) + pronoun = 
detok.detokenize([pronoun], return_str=True) + after = detok.detokenize(after, return_str=True) + + # hack: when the pronoun ends in a period (or comma), move the + # punctuation to the "after" part + if pronoun.endswith('.') or pronoun.endswith(','): + after = pronoun[-1] + trailing_space + after + pronoun = pronoun[:-1] + + # hack: when the "after" part begins with a comma or period, remove + # the trailing space + if after.startswith('.') or after.startswith(','): + trailing_space = '' + + # parse sentence with spacy + sentence = nlp(before + leading_space + pronoun + trailing_space + after) + + # find pronoun span + start = len(before + leading_space) + first_pronoun_tok = find_token(sentence, start_pos=start) + pronoun_span = find_span(sentence, pronoun, start=first_pronoun_tok.i) + assert pronoun_span.text == pronoun + + if eval: + # convert to format where pronoun is surrounded by "[]" and + # query is surrounded by "_" + query_span = find_span(sentence, query) + query_with_ws = '_{}_{}'.format( + query_span.text, + (' ' if query_span.text_with_ws.endswith(' ') else '') + ) + pronoun_with_ws = '[{}]{}'.format( + pronoun_span.text, + (' ' if pronoun_span.text_with_ws.endswith(' ') else '') + ) + if query_span.start < pronoun_span.start: + first = (query_span, query_with_ws) + second = (pronoun_span, pronoun_with_ws) + else: + first = (pronoun_span, pronoun_with_ws) + second = (query_span, query_with_ws) + sentence = ( + sentence[:first[0].start].text_with_ws + + first[1] + + sentence[first[0].end:second[0].start].text_with_ws + + second[1] + + sentence[second[0].end:].text + ) + yield sentence, sample.get('label', None) + else: + yield sentence, pronoun_span, query, sample.get('label', None) + + +def filter_noun_chunks(chunks, exclude_pronouns=False, exclude_query=None, exact_match=False): + if exclude_pronouns: + chunks = [ + np for np in chunks if ( + np.lemma_ != '-PRON-' + and not all(tok.pos_ == 'PRON' for tok in np) + ) + ] + + if exclude_query is not None: + excl_txt = [exclude_query.lower()] + filtered_chunks = [] + for chunk in chunks: + lower_chunk = chunk.text.lower() + found = False + for excl in excl_txt: + if ( + (not exact_match and (lower_chunk in excl or excl in lower_chunk)) + or lower_chunk == excl + ): + found = True + break + if not found: + filtered_chunks.append(chunk) + chunks = filtered_chunks + + return chunks diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py index 70cb948270..0e080c9ae3 100644 --- a/fairseq/checkpoint_utils.py +++ b/fairseq/checkpoint_utils.py @@ -345,6 +345,8 @@ def load_pretrained_component_from_model( def verify_checkpoint_directory(save_dir: str) -> None: + if not os.path.exists(save_dir): + os.makedirs(save_dir, exist_ok=True) temp_file_path = os.path.join(save_dir, 'dummy') try: with open(temp_file_path, 'w'): diff --git a/fairseq/data/__init__.py b/fairseq/data/__init__.py index d400a9b034..f97eaa9fab 100644 --- a/fairseq/data/__init__.py +++ b/fairseq/data/__init__.py @@ -16,6 +16,7 @@ from .id_dataset import IdDataset from .indexed_dataset import IndexedCachedDataset, IndexedDataset, IndexedRawTextDataset, MMapIndexedDataset from .language_pair_dataset import LanguagePairDataset +from .list_dataset import ListDataset from .lm_context_window_dataset import LMContextWindowDataset from .lru_cache_dataset import LRUCacheDataset from .mask_tokens_dataset import MaskTokensDataset @@ -59,6 +60,7 @@ 'IndexedRawTextDataset', 'LanguagePairDataset', 'LeftPadDataset', + 'ListDataset', 'LMContextWindowDataset', 
'LRUCacheDataset', 'MaskTokensDataset', diff --git a/fairseq/data/list_dataset.py b/fairseq/data/list_dataset.py new file mode 100644 index 0000000000..f753727abf --- /dev/null +++ b/fairseq/data/list_dataset.py @@ -0,0 +1,29 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from . import BaseWrapperDataset + + +class ListDataset(BaseWrapperDataset): + + def __init__(self, dataset, sizes): + super().__init__(dataset) + self._sizes = sizes + + def collater(self, samples): + return samples + + @property + def sizes(self): + return self._sizes + + def num_tokens(self, index): + return self.sizes[index] + + def size(self, index): + return self.sizes[index] + + def set_epoch(self, epoch): + pass diff --git a/fairseq/hub_utils.py b/fairseq/hub_utils.py index 297338d02e..a1d37cd25b 100644 --- a/fairseq/hub_utils.py +++ b/fairseq/hub_utils.py @@ -47,6 +47,9 @@ def from_pretrained( if os.path.exists(path): kwargs[arg] = path + if 'user_dir' in kwargs: + utils.import_user_module(argparse.Namespace(user_dir=kwargs['user_dir'])) + models, args, task = checkpoint_utils.load_model_ensemble_and_task( [os.path.join(model_path, cpt) for cpt in checkpoint_file.split(':')], arg_overrides=kwargs, diff --git a/fairseq/models/roberta/hub_interface.py b/fairseq/models/roberta/hub_interface.py index 22ce96e89f..fc53384062 100644 --- a/fairseq/models/roberta/hub_interface.py +++ b/fairseq/models/roberta/hub_interface.py @@ -10,6 +10,7 @@ import torch.nn as nn import torch.nn.functional as F +from fairseq import utils from fairseq.data import encoders @@ -152,11 +153,12 @@ def fill_mask(self, masked_input: str, topk: int = 5): if tokens.dim() == 1: tokens = tokens.unsqueeze(0) - features, extra = self.model( - tokens.long().to(device=self.device), - features_only=False, - return_all_hiddens=False, - ) + with utils.eval(self.model): + features, extra = self.model( + tokens.long().to(device=self.device), + features_only=False, + return_all_hiddens=False, + ) logits = features[0, masked_index, :].squeeze() prob = logits.softmax(dim=0) values, index = prob.topk(k=topk, dim=0) @@ -178,3 +180,18 @@ def fill_mask(self, masked_input: str, topk: int = 5): values[index].item(), )) return topk_filled_outputs + + def disambiguate_pronoun(self, sentence: str) -> bool: + """ + Usage:: + + >>> disambiguate_pronoun('The _trophy_ would not fit in the brown suitcase because [it] was too big.') + True + + >>> disambiguate_pronoun('The trophy would not fit in the brown suitcase because [it] was too big.') + 'The trophy' + """ + assert hasattr(self.task, 'disambiguate_pronoun'), \ + 'roberta.disambiguate_pronoun() requires a model trained with the WSC task.' 
+ with utils.eval(self.model): + return self.task.disambiguate_pronoun(self.model, sentence, use_cuda=self.device.type == 'cuda') diff --git a/fairseq/models/roberta/model.py b/fairseq/models/roberta/model.py index eb7e03f764..8ae3f51f37 100644 --- a/fairseq/models/roberta/model.py +++ b/fairseq/models/roberta/model.py @@ -35,6 +35,7 @@ def hub_models(cls): 'roberta.base': 'http://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz', 'roberta.large': 'http://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz', 'roberta.large.mnli': 'http://dl.fbaipublicfiles.com/fairseq/models/roberta.large.mnli.tar.gz', + 'roberta.large.wsc': 'http://dl.fbaipublicfiles.com/fairseq/models/roberta.large.wsc.tar.gz', } def __init__(self, args, encoder): diff --git a/fairseq/progress_bar.py b/fairseq/progress_bar.py index a9bb5f97b4..59715f548d 100644 --- a/fairseq/progress_bar.py +++ b/fairseq/progress_bar.py @@ -14,8 +14,6 @@ import re import sys -from tqdm import tqdm - from fairseq import distributed_utils from fairseq.meters import AverageMeter, StopwatchMeter, TimeMeter @@ -208,6 +206,7 @@ class tqdm_progress_bar(progress_bar): def __init__(self, iterable, epoch=None, prefix=None): super().__init__(iterable, epoch, prefix) + from tqdm import tqdm self.tqdm = tqdm(iterable, self.prefix, leave=False) def __iter__(self): diff --git a/fairseq/tasks/masked_lm.py b/fairseq/tasks/masked_lm.py index f1686258fb..240ec0a3b5 100644 --- a/fairseq/tasks/masked_lm.py +++ b/fairseq/tasks/masked_lm.py @@ -104,6 +104,7 @@ def load_dataset(self, split, epoch=0, combine=False): eos=self.source_dictionary.eos(), break_mode=self.args.sample_break_mode, ) + print('| loaded {} batches from: {}'.format(len(dataset), split_path)) # prepend beginning-of-sentence token (, equiv. to [CLS] in BERT) dataset = PrependTokenDataset(dataset, self.source_dictionary.bos()) @@ -210,14 +211,3 @@ def source_dictionary(self): @property def target_dictionary(self): return self.dictionary - - def get_average_masked_score(self, model, src_tokens, mask, **net_input): - """Mask a set of tokens and return their average score.""" - masked_tokens = src_tokens.clone() - masked_tokens[mask.byte()] = self.mask_idx - net_output = model(src_tokens=masked_tokens, **net_input, last_state_only=True) - lprobs = F.log_softmax(net_output[0], dim=-1, dtype=torch.float32) - lprobs = lprobs.gather(-1, src_tokens.unsqueeze(-1)).squeeze(-1) - mask = mask.type_as(lprobs) - score = (lprobs * mask).sum(dim=-1) / mask.sum(dim=-1) - return score diff --git a/fairseq/tasks/translation_moe.py b/fairseq/tasks/translation_moe.py index 35d44e47cb..cd8b985bb1 100644 --- a/fairseq/tasks/translation_moe.py +++ b/fairseq/tasks/translation_moe.py @@ -12,14 +12,6 @@ from fairseq.tasks.translation import TranslationTask -@contextlib.contextmanager -def eval(model): - is_training = model.training - model.eval() - yield - model.train(is_training) - - @register_task('translation_moe') class TranslationMoETask(TranslationTask): """ @@ -163,7 +155,7 @@ def get_lprob_yz(winners=None): return lprob_yz # compute responsibilities without dropout - with eval(model): # disable dropout + with utils.eval(model): # disable dropout with torch.no_grad(): # disable autograd lprob_yz = get_lprob_yz() # B x K prob_z_xy = torch.nn.functional.softmax(lprob_yz, dim=1) diff --git a/fairseq/utils.py b/fairseq/utils.py index 76473837aa..4d9da12d62 100644 --- a/fairseq/utils.py +++ b/fairseq/utils.py @@ -4,6 +4,7 @@ # LICENSE file in the root directory of this source tree. 
from collections import defaultdict +import contextlib import copy import importlib.util import math @@ -277,6 +278,10 @@ def import_user_module(args): module_path = getattr(args, 'user_dir', None) if module_path is not None: module_path = os.path.abspath(args.user_dir) + if not os.path.exists(module_path): + fairseq_rel_path = os.path.join(os.path.dirname(__file__), '..', args.user_dir) + if os.path.exists(fairseq_rel_path): + module_path = fairseq_rel_path module_parent, module_name = os.path.split(module_path) if module_name not in sys.modules: @@ -339,3 +344,11 @@ def get_available_activation_fns() -> List: 'tanh', 'linear', ] + + +@contextlib.contextmanager +def eval(model): + is_training = model.training + model.eval() + yield + model.train(is_training) diff --git a/hubconf.py b/hubconf.py index d8f252ad7b..34179c9dba 100644 --- a/hubconf.py +++ b/hubconf.py @@ -21,7 +21,7 @@ for model_name in _cls.hub_models().keys(): globals()[model_name] = functools.partial( _cls.from_pretrained, - model_name_or_path=model_name, + model_name, ) # to simplify the interface we only expose named models # globals()[_model_type] = _cls.from_pretrained From c0a5d29e59e9b2d999298e06d0e46f96487eb024 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sat, 10 Aug 2019 10:43:10 -0700 Subject: [PATCH 080/213] Fix torch.hub for MNLI Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1006 Differential Revision: D16753078 Pulled By: myleott fbshipit-source-id: 970055632edffcce4e75931ed93b42a249120a4a --- fairseq/models/roberta/model.py | 56 ++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/fairseq/models/roberta/model.py b/fairseq/models/roberta/model.py index 8ae3f51f37..2bbe919bdb 100644 --- a/fairseq/models/roberta/model.py +++ b/fairseq/models/roberta/model.py @@ -76,6 +76,8 @@ def add_args(parser): help='dropout probability in the masked_lm pooler layers') parser.add_argument('--max-positions', type=int, help='number of positional embeddings to learn') + parser.add_argument('--load-checkpoint-heads', action='store_true', + help='(re-)register and load heads when loading checkpoints') @classmethod def build_model(cls, args, task): @@ -92,7 +94,7 @@ def build_model(cls, args, task): def forward(self, src_tokens, features_only=False, return_all_hiddens=False, classification_head_name=None, **kwargs): assert classification_head_name is None or features_only, \ - "If passing classification_head_name argument, features_only must be set to True" + 'If passing classification_head_name argument, features_only must be set to True' x, extra = self.decoder(src_tokens, features_only, return_all_hiddens, **kwargs) @@ -102,6 +104,16 @@ def forward(self, src_tokens, features_only=False, return_all_hiddens=False, cla def register_classification_head(self, name, num_classes=None, inner_dim=None, **kwargs): """Register a classification head.""" + if name in self.classification_heads: + prev_num_classes = self.classification_heads[name].out_proj.out_features + prev_inner_dim = self.classification_heads[name].dense.out_features + if num_classes != prev_num_classes or inner_dim != prev_inner_dim: + print( + 'WARNING: re-registering head "{}" with num_classes {} (prev: {}) ' + 'and inner_dim {} (prev: {})'.format( + name, num_classes, prev_num_classes, inner_dim, prev_inner_dim + ) + ) self.classification_heads[name] = RobertaClassificationHead( self.args.encoder_embed_dim, inner_dim or self.args.encoder_embed_dim, @@ -123,6 +135,7 @@ def from_pretrained(cls, 
model_name_or_path, checkpoint_file='model.pt', data_na data_name_or_path, archive_map=cls.hub_models(), bpe='gpt2', + load_checkpoint_heads=True, **kwargs, ) return RobertaHubInterface(x['args'], x['task'], x['models'][0]) @@ -132,30 +145,35 @@ def upgrade_state_dict_named(self, state_dict, name): current_head_names = [] if not hasattr(self, 'classification_heads') else \ self.classification_heads.keys() + # Handle new classification heads present in the state dict. keys_to_delete = [] - # Delete any heads present in state_dict, that are not in current constructed model. for k in state_dict.keys(): if not k.startswith(prefix + 'classification_heads.'): continue head_name = k[len(prefix + 'classification_heads.'):].split('.')[0] - num_classes = state_dict[ - prefix + 'classification_heads.' + head_name + '.out_proj.weight' - ].size(0) - inner_dim = state_dict[ - prefix + 'classification_heads.' + head_name + '.dense.weight' - ].size(0) - - if head_name not in current_head_names: - print("WARNING: deleting classification head ({}) from checkpoint not present in current model: {}".format(head_name, k)) - keys_to_delete.append(k) - elif ( - num_classes != self.classification_heads[head_name].out_proj.out_features - or inner_dim != self.classification_heads[head_name].dense.out_features - ): - print("WARNING: deleting classification head ({}) from checkpoint with different dimensions than current model: {}".format(head_name, k)) - keys_to_delete.append(k) - + num_classes = state_dict[prefix + 'classification_heads.' + head_name + '.out_proj.weight'].size(0) + inner_dim = state_dict[prefix + 'classification_heads.' + head_name + '.dense.weight'].size(0) + + if getattr(self.args, 'load_checkpoint_heads', False): + if head_name not in current_head_names: + self.register_classification_head(head_name, num_classes, inner_dim) + else: + if head_name not in current_head_names: + print( + 'WARNING: deleting classification head ({}) from checkpoint ' + 'not present in current model: {}'.format(head_name, k) + ) + keys_to_delete.append(k) + elif ( + num_classes != self.classification_heads[head_name].out_proj.out_features + or inner_dim != self.classification_heads[head_name].dense.out_features + ): + print( + 'WARNING: deleting classification head ({}) from checkpoint ' + 'with different dimensions than current model: {}'.format(head_name, k) + ) + keys_to_delete.append(k) for k in keys_to_delete: del state_dict[k] From 3bbdc5543c4ec56a7b60a91a2cb9683fa15e5208 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Mon, 12 Aug 2019 07:13:55 -0700 Subject: [PATCH 081/213] Update --restore-file logic (partially fixes #999) Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1007 Differential Revision: D16762490 Pulled By: myleott fbshipit-source-id: d67137bcf581887850323d188bb4ea643a35ac9e --- fairseq/checkpoint_utils.py | 6 +++--- fairseq/options.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py index 0e080c9ae3..d84c75248a 100644 --- a/fairseq/checkpoint_utils.py +++ b/fairseq/checkpoint_utils.py @@ -97,10 +97,10 @@ def load_checkpoint(args, trainer): if args.distributed_rank == 0: os.makedirs(args.save_dir, exist_ok=True) - if os.path.isabs(args.restore_file): - checkpoint_path = args.restore_file + if args.restore_file == 'checkpoint_last.pt': + checkpoint_path = os.path.join(args.save_dir, 'checkpoint_last.pt') else: - checkpoint_path = os.path.join(args.save_dir, args.restore_file) + checkpoint_path = 
args.restore_file extra_state = trainer.load_checkpoint( checkpoint_path, diff --git a/fairseq/options.py b/fairseq/options.py index 006f9b6c05..e6b10fd9b0 100644 --- a/fairseq/options.py +++ b/fairseq/options.py @@ -361,7 +361,8 @@ def add_checkpoint_args(parser): group.add_argument('--save-dir', metavar='DIR', default='checkpoints', help='path to save checkpoints') group.add_argument('--restore-file', default='checkpoint_last.pt', - help='filename in save-dir from which to load checkpoint') + help='filename from which to load checkpoint ' + '(default: /checkpoint_last.pt') group.add_argument('--reset-dataloader', action='store_true', help='if set, does not reload dataloader state from the checkpoint') group.add_argument('--reset-lr-scheduler', action='store_true', From 969f4474bce2ebc850db012408dd394e426d3655 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Mon, 12 Aug 2019 07:41:49 -0700 Subject: [PATCH 082/213] Remove LAMB optimizer (at least until we can test it more) Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1008 Differential Revision: D16763315 Pulled By: myleott fbshipit-source-id: d4bad8384eec273f2d5de4ed29fb8d158ab9187c --- fairseq/optim/lamb.py | 144 ------------------------------------------ 1 file changed, 144 deletions(-) delete mode 100644 fairseq/optim/lamb.py diff --git a/fairseq/optim/lamb.py b/fairseq/optim/lamb.py deleted file mode 100644 index e49a96f101..0000000000 --- a/fairseq/optim/lamb.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -""" -LAMB optimizer from github.com/cybertronai/pytorch-lamb. -""" - -import math - -import torch -import torch.optim - -from . import FairseqOptimizer, register_optimizer - - -@register_optimizer('lamb') -class FairseqLamb(FairseqOptimizer): - - def __init__(self, args, params): - super().__init__(args, params) - self._optimizer = Lamb(params, **self.optimizer_config) - - @staticmethod - def add_args(parser): - """Add optimizer-specific arguments to the parser.""" - # fmt: off - parser.add_argument('--lamb-betas', default='(0.9, 0.999)', metavar='B', - help='betas for LAMB optimizer') - parser.add_argument('--lamb-eps', type=float, default=1e-8, metavar='D', - help='epsilon for LAMB optimizer') - parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD', - help='weight decay') - # fmt: on - - @property - def optimizer_config(self): - """ - Return a kwarg dictionary that will be used to override optimizer - args stored in checkpoints. This allows us to load a checkpoint and - resume training using a different set of optimizer args, e.g., with a - different learning rate. - """ - return { - 'lr': self.args.lr[0], - 'betas': eval(self.args.lamb_betas), - 'eps': self.args.lamb_eps, - 'weight_decay': self.args.weight_decay, - } - - -class Lamb(torch.optim.Optimizer): - r"""Implements Lamb algorithm. - It has been proposed in `Reducing BERT Pre-Training Time from 3 Days to 76 Minutes`_. 
- Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square (default: (0.9, 0.999)) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - adam (bool, optional): always use trust ratio = 1, which turns this into - Adam. Useful for comparison purposes. - .. _Reducing BERT Pre-Training Time from 3 Days to 76 Minutes: - https://arxiv.org/abs/1904.00962 - """ - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, - weight_decay=0, adam=False): - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay) - self.adam = adam - super(Lamb, self).__init__(params, defaults) - - def step(self, closure=None): - """Performs a single optimization step. - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instad.') - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p.data) - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] - - state['step'] += 1 - - if group['weight_decay'] != 0: - grad.add_(group['weight_decay'], p.data) - - # Decay the first and second moment running average coefficient - exp_avg.mul_(beta1).add_(1 - beta1, grad) - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - denom = exp_avg_sq.sqrt().add_(group['eps']) - - bias_correction1 = 1 - beta1 ** state['step'] - bias_correction2 = 1 - beta2 ** state['step'] - # Apply bias to lr to avoid broadcast. - step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 - - adam_step = exp_avg / denom - # L2 norm uses sum, but here since we're dividing, use mean to avoid overflow. 
- r1 = p.data.pow(2).mean().sqrt() - r2 = adam_step.pow(2).mean().sqrt() - r = 1 if r1 == 0 or r2 == 0 else min(r1/r2, 10) - state['r1'] = r1 - state['r2'] = r2 - state['r'] = r - if self.adam: - r = 1 - - p.data.add_(-step_size * r, adam_step) - - return loss From 2b68e91f231a2b7997664e1418f30b808d889963 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Mon, 12 Aug 2019 09:05:01 -0700 Subject: [PATCH 083/213] Lint Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/817 Differential Revision: D16762905 Pulled By: myleott fbshipit-source-id: d920595bec44ed26b72dfc6fbc15c0aa107b4e56 --- fairseq/__init__.py | 14 +++++++------- fairseq/checkpoint_utils.py | 1 - fairseq/criterions/binary_cross_entropy.py | 2 +- fairseq/data/data_utils.py | 5 +++-- fairseq/data/encoders/fastbpe.py | 1 + fairseq/data/encoders/gpt2_bpe_utils.py | 1 - fairseq/distributed_utils.py | 2 -- fairseq/hub_utils.py | 1 - fairseq/models/__init__.py | 12 ++++++------ fairseq/models/fairseq_model.py | 2 +- fairseq/models/roberta/hub_interface.py | 2 -- fairseq/models/transformer.py | 3 ++- fairseq/models/wav2vec.py | 2 +- fairseq/optim/__init__.py | 2 +- fairseq/optim/adam.py | 2 +- fairseq/optim/bmuf.py | 4 ---- fairseq/options.py | 2 +- fairseq/progress_bar.py | 2 +- fairseq/sequence_generator.py | 2 +- fairseq/tasks/audio_pretraining.py | 8 ++++---- fairseq/tasks/cross_lingual_lm.py | 1 - fairseq/tasks/language_modeling.py | 4 ---- fairseq/tasks/masked_lm.py | 4 ---- fairseq/tasks/multilingual_translation.py | 2 -- fairseq/tasks/sentence_ranking.py | 1 - fairseq/tasks/translation_moe.py | 2 -- fairseq/utils.py | 1 - 27 files changed, 31 insertions(+), 54 deletions(-) diff --git a/fairseq/__init__.py b/fairseq/__init__.py index 1699f2bfca..cbb8d08a0e 100644 --- a/fairseq/__init__.py +++ b/fairseq/__init__.py @@ -6,10 +6,10 @@ __all__ = ['pdb'] __version__ = '0.7.2' -import fairseq.criterions -import fairseq.models -import fairseq.modules -import fairseq.optim -import fairseq.optim.lr_scheduler -import fairseq.pdb -import fairseq.tasks +import fairseq.criterions # noqa +import fairseq.models # noqa +import fairseq.modules # noqa +import fairseq.optim # noqa +import fairseq.optim.lr_scheduler # noqa +import fairseq.pdb # noqa +import fairseq.tasks # noqa diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py index d84c75248a..c812136781 100644 --- a/fairseq/checkpoint_utils.py +++ b/fairseq/checkpoint_utils.py @@ -3,7 +3,6 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -import argparse from collections import OrderedDict from typing import Union import collections diff --git a/fairseq/criterions/binary_cross_entropy.py b/fairseq/criterions/binary_cross_entropy.py index d1f758f511..fba2cdcba4 100644 --- a/fairseq/criterions/binary_cross_entropy.py +++ b/fairseq/criterions/binary_cross_entropy.py @@ -68,4 +68,4 @@ def aggregate_logging_outputs(logging_outputs): } if sample_size != ntokens: agg_output['nll_loss'] = loss_sum / ntokens / math.log(2) - return agg_output \ No newline at end of file + return agg_output diff --git a/fairseq/data/data_utils.py b/fairseq/data/data_utils.py index bd2c5d35c9..38c385018d 100644 --- a/fairseq/data/data_utils.py +++ b/fairseq/data/data_utils.py @@ -149,8 +149,9 @@ def check_size(idx): else: # Hacky as heck, for the specific case of multilingual training with RoundRobin. 
if isinstance(size_fn(idx), dict) and isinstance(max_positions, tuple): - return all(a is None or b is None or a <= b - for a, b in zip(size_fn(idx).values(), max_positions) + return all( + a is None or b is None or a <= b + for a, b in zip(size_fn(idx).values(), max_positions) ) # For MultiCorpusSampledDataset, will generalize it later if not isinstance(size_fn(idx), Iterable): diff --git a/fairseq/data/encoders/fastbpe.py b/fairseq/data/encoders/fastbpe.py index ed39b1bca9..376e22cd85 100644 --- a/fairseq/data/encoders/fastbpe.py +++ b/fairseq/data/encoders/fastbpe.py @@ -6,6 +6,7 @@ from fairseq import file_utils from fairseq.data.encoders import register_bpe + @register_bpe('fastbpe') class fastBPE(object): diff --git a/fairseq/data/encoders/gpt2_bpe_utils.py b/fairseq/data/encoders/gpt2_bpe_utils.py index ae98dbc708..90b7b199d5 100644 --- a/fairseq/data/encoders/gpt2_bpe_utils.py +++ b/fairseq/data/encoders/gpt2_bpe_utils.py @@ -7,7 +7,6 @@ from functools import lru_cache import json -import os @lru_cache() diff --git a/fairseq/distributed_utils.py b/fairseq/distributed_utils.py index e854b85195..6da6ae115b 100644 --- a/fairseq/distributed_utils.py +++ b/fairseq/distributed_utils.py @@ -3,7 +3,6 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from collections import namedtuple import os import pickle import socket @@ -12,7 +11,6 @@ import torch import torch.distributed as dist -from torch import nn from fairseq import utils diff --git a/fairseq/hub_utils.py b/fairseq/hub_utils.py index a1d37cd25b..4e44a69f97 100644 --- a/fairseq/hub_utils.py +++ b/fairseq/hub_utils.py @@ -7,7 +7,6 @@ import argparse import copy import os -from typing import List import torch from torch import nn diff --git a/fairseq/models/__init__.py b/fairseq/models/__init__.py index 9cc884cc3a..36a4ecd1bc 100644 --- a/fairseq/models/__init__.py +++ b/fairseq/models/__init__.py @@ -3,11 +3,6 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -MODEL_REGISTRY = {} -ARCH_MODEL_REGISTRY = {} -ARCH_MODEL_INV_REGISTRY = {} -ARCH_CONFIG_REGISTRY = {} - import argparse import importlib import os @@ -28,6 +23,12 @@ from .distributed_fairseq_model import DistributedFairseqModel +MODEL_REGISTRY = {} +ARCH_MODEL_REGISTRY = {} +ARCH_MODEL_INV_REGISTRY = {} +ARCH_CONFIG_REGISTRY = {} + + __all__ = [ 'BaseFairseqModel', 'CompositeEncoder', @@ -43,7 +44,6 @@ ] - def build_model(args, task): return ARCH_MODEL_REGISTRY[args.arch].build_model(args, task) diff --git a/fairseq/models/fairseq_model.py b/fairseq/models/fairseq_model.py index 37f1aa456f..fc53a7c9d7 100644 --- a/fairseq/models/fairseq_model.py +++ b/fairseq/models/fairseq_model.py @@ -6,7 +6,6 @@ Base classes for various fairseq models. """ -import os from typing import Dict, List, Optional import torch @@ -259,6 +258,7 @@ def __init__(self, *args, **kwargs): stacklevel=4, ) + class FairseqMultiModel(BaseFairseqModel): """Base class for combining multiple encoder-decoder models.""" diff --git a/fairseq/models/roberta/hub_interface.py b/fairseq/models/roberta/hub_interface.py index fc53384062..6d3e5674a0 100644 --- a/fairseq/models/roberta/hub_interface.py +++ b/fairseq/models/roberta/hub_interface.py @@ -3,8 +3,6 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-from typing import List - import numpy as np import torch import torch.nn as nn diff --git a/fairseq/models/transformer.py b/fairseq/models/transformer.py index c5edba6461..c9ba537070 100644 --- a/fairseq/models/transformer.py +++ b/fairseq/models/transformer.py @@ -20,7 +20,6 @@ from fairseq.modules import ( AdaptiveSoftmax, LayerNorm, - MultiheadAttention, PositionalEmbedding, SinusoidalPositionalEmbedding, TransformerDecoderLayer, @@ -51,6 +50,7 @@ class TransformerModel(FairseqEncoderDecoderModel): @classmethod def hub_models(cls): + # fmt: off return { 'transformer.wmt14.en-fr': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-fr.joined-dict.transformer.tar.bz2', 'transformer.wmt16.en-de': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt16.en-de.joined-dict.transformer.tar.bz2', @@ -64,6 +64,7 @@ def hub_models(cls): 'transformer.wmt19.de-en.single_model': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.single_model.tar.gz', 'transformer.wmt19.ru-en.single_model': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.single_model.tar.gz', } + # fmt: on def __init__(self, encoder, decoder): super().__init__(encoder, decoder) diff --git a/fairseq/models/wav2vec.py b/fairseq/models/wav2vec.py index 050d4216ae..62807764ef 100644 --- a/fairseq/models/wav2vec.py +++ b/fairseq/models/wav2vec.py @@ -351,7 +351,7 @@ def forward(self, x): residual = x x = conv(x) if self.skip_connections: - if rproj != None: + if rproj is not None: residual = rproj(residual) x = (x + residual) * self.residual_scale return x diff --git a/fairseq/optim/__init__.py b/fairseq/optim/__init__.py index d8306c4ef6..268291be76 100644 --- a/fairseq/optim/__init__.py +++ b/fairseq/optim/__init__.py @@ -9,7 +9,7 @@ from fairseq import registry from fairseq.optim.fairseq_optimizer import FairseqOptimizer from fairseq.optim.fp16_optimizer import FP16Optimizer, MemoryEfficientFP16Optimizer -from fairseq.optim.bmuf import FairseqBMUF +from fairseq.optim.bmuf import FairseqBMUF # noqa __all__ = [ diff --git a/fairseq/optim/adam.py b/fairseq/optim/adam.py index e60f2fdf6a..0df1182066 100644 --- a/fairseq/optim/adam.py +++ b/fairseq/optim/adam.py @@ -19,7 +19,7 @@ def __init__(self, args, params): super().__init__(args, params) if torch.cuda.is_available(): try: - from apex.optimizers import FusedAdam as _FusedAdam + from apex.optimizers import FusedAdam as _FusedAdam # noqa self._optimizer = FusedAdam(params, **self.optimizer_config) except ImportError: self._optimizer = Adam(params, **self.optimizer_config) diff --git a/fairseq/optim/bmuf.py b/fairseq/optim/bmuf.py index deec08ea74..12e18adc03 100644 --- a/fairseq/optim/bmuf.py +++ b/fairseq/optim/bmuf.py @@ -3,10 +3,6 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
- -import sys -import time - import torch import torch.distributed as dist diff --git a/fairseq/options.py b/fairseq/options.py index e6b10fd9b0..b02a5778f0 100644 --- a/fairseq/options.py +++ b/fairseq/options.py @@ -350,7 +350,7 @@ def add_optimization_args(parser): group.add_argument('--min-lr', default=-1, type=float, metavar='LR', help='stop training when the learning rate reaches this minimum') group.add_argument('--use-bmuf', default=False, action='store_true', - help="specify global optimizer for syncing models on different GPUs/Shards") + help='specify global optimizer for syncing models on different GPUs/shards') # fmt: on return group diff --git a/fairseq/progress_bar.py b/fairseq/progress_bar.py index 59715f548d..dacd0bedb2 100644 --- a/fairseq/progress_bar.py +++ b/fairseq/progress_bar.py @@ -11,7 +11,6 @@ import json from numbers import Number import os -import re import sys from fairseq import distributed_utils @@ -19,6 +18,7 @@ g_tbmf_wrapper = None + def build_progress_bar(args, iterator, epoch=None, prefix=None, default='tqdm', no_progress_bar='none'): if args.log_format is None: args.log_format = no_progress_bar if args.no_progress_bar else default diff --git a/fairseq/sequence_generator.py b/fairseq/sequence_generator.py index 0a8ffa1843..3b100b9615 100644 --- a/fairseq/sequence_generator.py +++ b/fairseq/sequence_generator.py @@ -7,7 +7,7 @@ import torch -from fairseq import search, utils +from fairseq import search from fairseq.models import FairseqIncrementalDecoder diff --git a/fairseq/tasks/audio_pretraining.py b/fairseq/tasks/audio_pretraining.py index e4bf0d79f6..76e072866b 100644 --- a/fairseq/tasks/audio_pretraining.py +++ b/fairseq/tasks/audio_pretraining.py @@ -47,12 +47,12 @@ def load_dataset(self, split, **kwargs): manifest = os.path.join(self.args.data, '{}.tsv'.format(split)) self.datasets[split] = RawAudioDataset(manifest, - sample_rate=self.args.sample_rate, - max_sample_size=self.args.max_sample_size, - min_sample_size=self.args.min_sample_size) + sample_rate=self.args.sample_rate, + max_sample_size=self.args.max_sample_size, + min_sample_size=self.args.min_sample_size) @property def target_dictionary(self): """Return the :class:`~fairseq.data.Dictionary` for the language model.""" - return None \ No newline at end of file + return None diff --git a/fairseq/tasks/cross_lingual_lm.py b/fairseq/tasks/cross_lingual_lm.py index c173f0ad16..b171b761c1 100644 --- a/fairseq/tasks/cross_lingual_lm.py +++ b/fairseq/tasks/cross_lingual_lm.py @@ -16,7 +16,6 @@ from fairseq.data import ( ConcatDataset, data_utils, - indexed_dataset, TokenBlockDataset, ) diff --git a/fairseq/tasks/language_modeling.py b/fairseq/tasks/language_modeling.py index 066dd89544..f6d0d70b62 100644 --- a/fairseq/tasks/language_modeling.py +++ b/fairseq/tasks/language_modeling.py @@ -3,22 +3,18 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-import itertools import os -import numpy as np import torch from fairseq import utils from fairseq.data import ( - ConcatDataset, data_utils, Dictionary, MonolingualDataset, TokenBlockDataset, TransformEosDataset, TruncatedDictionary, - indexed_dataset ) from fairseq.tasks import FairseqTask, register_task diff --git a/fairseq/tasks/masked_lm.py b/fairseq/tasks/masked_lm.py index 240ec0a3b5..cd677dd0ac 100644 --- a/fairseq/tasks/masked_lm.py +++ b/fairseq/tasks/masked_lm.py @@ -3,20 +3,16 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -import itertools import os import numpy as np import torch -import torch.nn.functional as F from fairseq.data import ( - ConcatDataset, data_utils, Dictionary, encoders, IdDataset, - indexed_dataset, MaskTokensDataset, NestedDictionaryDataset, NumelDataset, diff --git a/fairseq/tasks/multilingual_translation.py b/fairseq/tasks/multilingual_translation.py index 87deadf4f0..7b359ea868 100644 --- a/fairseq/tasks/multilingual_translation.py +++ b/fairseq/tasks/multilingual_translation.py @@ -4,7 +4,6 @@ # LICENSE file in the root directory of this source tree. from collections import OrderedDict -import copy import os import torch @@ -15,7 +14,6 @@ LanguagePairDataset, RoundRobinZipDatasets, TransformEosLangPairDataset, - indexed_dataset, ) from fairseq.models import FairseqMultiModel from fairseq.tasks.translation import load_langpair_dataset diff --git a/fairseq/tasks/sentence_ranking.py b/fairseq/tasks/sentence_ranking.py index 259423318f..99dff68e1b 100644 --- a/fairseq/tasks/sentence_ranking.py +++ b/fairseq/tasks/sentence_ranking.py @@ -15,7 +15,6 @@ NestedDictionaryDataset, NumSamplesDataset, NumelDataset, - OffsetTokensDataset, PrependTokenDataset, RawLabelDataset, RightPadDataset, diff --git a/fairseq/tasks/translation_moe.py b/fairseq/tasks/translation_moe.py index cd8b985bb1..ae8817a306 100644 --- a/fairseq/tasks/translation_moe.py +++ b/fairseq/tasks/translation_moe.py @@ -3,8 +3,6 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-import contextlib - import torch from fairseq import modules, utils diff --git a/fairseq/utils.py b/fairseq/utils.py index 4d9da12d62..1b664cbfe3 100644 --- a/fairseq/utils.py +++ b/fairseq/utils.py @@ -155,7 +155,6 @@ def replace_unk(hypo_str, src_str, alignment, align_dict, unk): def post_process_prediction(hypo_tokens, src_str, alignment, align_dict, tgt_dict, remove_bpe=None): - from fairseq import tokenizer hypo_str = tgt_dict.string(hypo_tokens, remove_bpe) if align_dict is not None: hypo_str = replace_unk(hypo_str, src_str, alignment, align_dict, tgt_dict.unk_string()) From d00366405b774671d2a4ca59a8dcf2ed0bb59119 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Mon, 12 Aug 2019 14:29:32 -0700 Subject: [PATCH 084/213] Minor fixes for RACE finetuning (#818) Summary: - remove unnecessary extra spaces in RACE data in preprocessing - fix finetuning instructions (add `--truncate-sequence` and add `--dropout` params) - close file handle in SentenceRankingTask Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/818 Differential Revision: D16770055 Pulled By: myleott fbshipit-source-id: 2c80084e92cdf8692f2ea7e43f7c344c402b9e61 --- examples/roberta/README.finetune_race.md | 32 +++++++++++++----------- examples/roberta/preprocess_RACE.py | 15 ++++++++--- examples/roberta/preprocess_RACE.sh | 3 +-- fairseq/data/truncate_dataset.py | 1 + fairseq/tasks/sentence_ranking.py | 30 +++++++++++----------- 5 files changed, 46 insertions(+), 35 deletions(-) diff --git a/examples/roberta/README.finetune_race.md b/examples/roberta/README.finetune_race.md index 320e101487..dc3f1b9f6f 100644 --- a/examples/roberta/README.finetune_race.md +++ b/examples/roberta/README.finetune_race.md @@ -4,38 +4,42 @@ ### 2) Preprocess RACE data: ```bash -python ./examples/roberta/preprocess_RACE.py +python ./examples/roberta/preprocess_RACE.py --input-dir --output-dir ./examples/roberta/preprocess_RACE.sh ``` ### 3) Fine-tuning on RACE: ```bash -MAX_EPOCHS=5 # epoch number +MAX_EPOCH=5 # Number of training epochs. LR=1e-05 # Peak LR for fixed LR scheduler. NUM_CLASSES=4 -MAX_SENTENCES=2 # batch size +MAX_SENTENCES=1 # Batch size per GPU. +UPDATE_FREQ=8 # Accumulate gradients to simulate training on 8 GPUs. 
+DATA_DIR=/path/to/race-output-dir ROBERTA_PATH=/path/to/roberta/model.pt -CUDA_VISIBLE_DEVICES=0 python train.py / \ +CUDA_VISIBLE_DEVICES=0,1 fairseq-train $DATA_DIR \ --restore-file $ROBERTA_PATH \ - --max-positions 512 \ - --max-sentences $MAX_SENTENCES \ - --task sentence_ranking \ --reset-optimizer --reset-dataloader --reset-meters \ - --required-batch-size-multiple 1 \ + --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ + --task sentence_ranking \ + --num-classes $NUM_CLASSES \ --init-token 0 --separator-token 2 \ + --max-option-length 128 \ + --max-positions 512 \ + --truncate-sequence \ --arch roberta_large \ + --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \ --criterion sentence_ranking \ - --num-classes $NUM_CLASSES \ - --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ + --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 \ --clip-norm 0.0 \ --lr-scheduler fixed --lr $LR \ --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ - --max-epoch 10 \ - --update-freq 8 \ - --find-unused-parameters \ - --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric; + --max-sentences $MAX_SENTENCES \ + --required-batch-size-multiple 1 \ + --update-freq $UPDATE_FREQ \ + --max-epoch $MAX_EPOCH ``` **Note:** diff --git a/examples/roberta/preprocess_RACE.py b/examples/roberta/preprocess_RACE.py index 4c9bba707b..f6f606a389 100644 --- a/examples/roberta/preprocess_RACE.py +++ b/examples/roberta/preprocess_RACE.py @@ -4,9 +4,11 @@ # # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. + import argparse import json import os +import re class InputExample: @@ -37,6 +39,7 @@ def get_examples(data_dir, set_type): options = cur_data["options"] questions = cur_data["questions"] context = cur_data["article"].replace("\n", " ") + context = re.sub(r'\s+', ' ', context) for i in range(len(answers)): label = ord(answers[i]) - ord("A") qa_list = [] @@ -47,6 +50,7 @@ def get_examples(data_dir, set_type): qa_cat = question.replace("_", option) else: qa_cat = " ".join([question, option]) + qa_cat = re.sub(r'\s+', ' ', qa_cat) qa_list.append(qa_cat) examples.append(InputExample(context, qa_list, label)) @@ -68,12 +72,15 @@ def main(): ) args = parser.parse_args() + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir, exist_ok=True) + for set_type in ["train", "dev", "test-middle", "test-high"]: examples = get_examples(args.input_dir, set_type) - qa_file_paths = [args.output_dir + set_type + ".input" + str(i + 1) for i in range(4)] + qa_file_paths = [os.path.join(args.output_dir, set_type + ".input" + str(i + 1)) for i in range(4)] qa_files = [open(qa_file_path, 'w') for qa_file_path in qa_file_paths] - outf_context_path = args.output_dir + set_type + ".input0" - outf_label_path = args.output_dir + set_type + ".label" + outf_context_path = os.path.join(args.output_dir, set_type + ".input0") + outf_label_path = os.path.join(args.output_dir, set_type + ".label") outf_context = open(outf_context_path, 'w') outf_label = open(outf_label_path, 'w') for example in examples: @@ -81,7 +88,7 @@ def main(): for i in range(4): qa_files[i].write(example.qa_list[i] + '\n') outf_label.write(str(example.label) + '\n') - + for f in qa_files: f.close() outf_label.close() diff --git a/examples/roberta/preprocess_RACE.sh b/examples/roberta/preprocess_RACE.sh index 0957549169..932d2ab6e5 100755 --- a/examples/roberta/preprocess_RACE.sh +++ 
b/examples/roberta/preprocess_RACE.sh @@ -42,7 +42,6 @@ for INPUT_TYPE in $INPUT_TYPES do LANG="input$INPUT_TYPE" fairseq-preprocess \ - --dataset-impl cached \ --only-source \ --trainpref "$RACE_DATA_FOLDER/train.$INPUT_TYPE.bpe" \ --validpref "$RACE_DATA_FOLDER/dev.$INPUT_TYPE.bpe" \ @@ -57,4 +56,4 @@ mkdir -p "$OUT_DATA_FOLDER/label" cp "$RACE_DATA_FOLDER/train.label" "$OUT_DATA_FOLDER/label/" cp "$RACE_DATA_FOLDER/dev.label" "$OUT_DATA_FOLDER/label/valid.label" cp "$RACE_DATA_FOLDER/test-middle.label" "$OUT_DATA_FOLDER/label/test.label" -cp "$RACE_DATA_FOLDER/test-high.label" "$OUT_DATA_FOLDER/label/test1.label" \ No newline at end of file +cp "$RACE_DATA_FOLDER/test-high.label" "$OUT_DATA_FOLDER/label/test1.label" diff --git a/fairseq/data/truncate_dataset.py b/fairseq/data/truncate_dataset.py index 36d3745658..efd1c6d1cb 100644 --- a/fairseq/data/truncate_dataset.py +++ b/fairseq/data/truncate_dataset.py @@ -12,6 +12,7 @@ class TruncateDataset(BaseWrapperDataset): def __init__(self, dataset, truncation_length): super().__init__(dataset) + assert truncation_length is not None self.truncation_length = truncation_length self.dataset = dataset diff --git a/fairseq/tasks/sentence_ranking.py b/fairseq/tasks/sentence_ranking.py index 99dff68e1b..76e7294c07 100644 --- a/fairseq/tasks/sentence_ranking.py +++ b/fairseq/tasks/sentence_ranking.py @@ -39,16 +39,16 @@ def add_args(parser): """Add task-specific arguments to the parser.""" parser.add_argument('data', metavar='FILE', help='file prefix for data') - parser.add_argument('--num-classes', type=int, default=2, + parser.add_argument('--num-classes', type=int, help='number of sentences to be ranked') - parser.add_argument('--init-token', type=int, default=None, + parser.add_argument('--init-token', type=int, help='add token at the beginning of each batch item') - parser.add_argument('--separator-token', type=int, default=None, + parser.add_argument('--separator-token', type=int, help='add separator token between inputs') - parser.add_argument('--no-shuffle', action='store_true', default=False) - parser.add_argument('--truncate-sequence', action='store_true', default=False, - help='Truncate sequence to max_sequence_length') - parser.add_argument('--max-option-length', type=int, default=None, + parser.add_argument('--no-shuffle', action='store_true') + parser.add_argument('--truncate-sequence', action='store_true', + help='Truncate sequence to max_positions') + parser.add_argument('--max-option-length', type=int, help='max length for each option') def __init__(self, args, dictionary): @@ -71,8 +71,6 @@ def setup_task(cls, args, **kwargs): assert args.criterion == 'sentence_ranking', \ 'Must set --criterion=sentence_ranking' - args.tokens_per_sample = args.max_positions - # load data dictionary data_dict = cls.load_dictionary( args, @@ -115,7 +113,8 @@ def make_dataset(type, dictionary): for input_option in input_options: if self.args.init_token is not None: input_option = PrependTokenDataset(input_option, self.args.init_token) - input_option = TruncateDataset(input_option, self.args.max_option_length) + if self.args.max_option_length is not None: + input_option = TruncateDataset(input_option, self.args.max_option_length) src_token = ConcatSentencesDataset(input_option, input0) if self.args.truncate_sequence: src_token = TruncateDataset(src_token, self.args.max_positions) @@ -145,11 +144,12 @@ def make_dataset(type, dictionary): label_path = '{}.label'.format(get_path('label', split)) if os.path.exists(label_path): - dataset.update( - 
target=RawLabelDataset([ - int(x.strip()) for x in open(label_path).readlines() - ]) - ) + with open(label_path) as h: + dataset.update( + target=RawLabelDataset([ + int(x.strip()) for x in h.readlines() + ]) + ) nested_dataset = NestedDictionaryDataset( dataset, From 0563d879245d3c6bb04f2302782c935794635924 Mon Sep 17 00:00:00 2001 From: Ilia Kulikov Date: Mon, 12 Aug 2019 15:36:41 -0700 Subject: [PATCH 085/213] ignore files starting with . e.g. .ipynb_checkpoints (#819) Summary: .ipynb_checkpoints folder in models folders crashed the importlib now there is a check for this Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/819 Differential Revision: D16772192 Pulled By: myleott fbshipit-source-id: 01c956aef4ed312bc7645c31c83dbf98af89d931 --- fairseq/models/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fairseq/models/__init__.py b/fairseq/models/__init__.py index 36a4ecd1bc..6386e7491c 100644 --- a/fairseq/models/__init__.py +++ b/fairseq/models/__init__.py @@ -123,7 +123,7 @@ def register_model_arch_fn(fn): models_dir = os.path.dirname(__file__) for file in os.listdir(models_dir): path = os.path.join(models_dir, file) - if not file.startswith('_') and (file.endswith('.py') or os.path.isdir(path)): + if not file.startswith('_') and not file.startswith('.') and (file.endswith('.py') or os.path.isdir(path)): model_name = file[:file.find('.py')] if file.endswith('.py') else file module = importlib.import_module('fairseq.models.' + model_name) From 577e4fa78a295fd7cd3ee7e9fd4b936ca800ebea Mon Sep 17 00:00:00 2001 From: Siddharth Shah Date: Mon, 12 Aug 2019 17:30:55 -0700 Subject: [PATCH 086/213] fix cosine scheduler docstring Summary: as title Reviewed By: myleott Differential Revision: D16773845 fbshipit-source-id: 2d10e197c31f94d894430559327289a4d03e33f7 --- fairseq/optim/lr_scheduler/cosine_lr_scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py b/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py index 206b79a009..9137e11b78 100644 --- a/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py +++ b/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py @@ -16,7 +16,7 @@ class CosineSchedule(FairseqLRScheduler): We also support a warmup phase where we linearly increase the learning rate from some initial learning rate (``--warmup-init-lr``) until the configured - learning rate (``--lr``). + max learning rate (``--max-lr``). During warmup:: From a171c2dda06f54ec404da9c496d396856fb1a5fc Mon Sep 17 00:00:00 2001 From: Naman Goyal Date: Tue, 13 Aug 2019 06:43:14 -0700 Subject: [PATCH 087/213] added readme code for inference with GLUE finetuned model Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/820 Differential Revision: D16783469 fbshipit-source-id: d5af8ba6a6685608d67b72d584952b8e43eabf9f --- examples/roberta/README.finetune_glue.md | 32 ++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/examples/roberta/README.finetune_glue.md b/examples/roberta/README.finetune_glue.md index d44a5aee53..52a974de6c 100644 --- a/examples/roberta/README.finetune_glue.md +++ b/examples/roberta/README.finetune_glue.md @@ -65,3 +65,35 @@ a) `--total-num-updates` is used by `--polynomial_decay` scheduler and is calcul b) Above cmd-args and hyperparams are tested on one Nvidia `V100` GPU with `32gb` of memory for each task. Depending on the GPU memory resources available to you, you can use increase `--update-freq` and reduce `--max-sentences`. 
c) All the settings in above table are suggested settings based on our hyperparam search within a fixed search space (for careful comparison across models). You might be able to find better metrics with wider hyperparam search. + +### Inference on GLUE task +After training the model as mentioned in previous step, you can perform inference with checkpoints in `checkpoints/` directory using following python code snippet: + +```python +from fairseq.models.roberta import RobertaModel + +roberta = RobertaModel.from_pretrained( + 'checkpoints/', + checkpoint_file='checkpoint_best.pt', + data_name_or_path='RTE-bin' +) + +label_fn = lambda label: roberta.task.label_dictionary.string( + [label + roberta.task.target_dictionary.nspecial] +) +ncorrect, nsamples = 0, 0 +roberta.cuda() +roberta.eval() +with open('glue_data/RTE/dev.tsv') as fin: + fin.readline() + for index, line in enumerate(fin): + tokens = line.strip().split('\t') + sent1, sent2, target = tokens[1], tokens[2], tokens[3] + tokens = roberta.encode(sent1, sent2) + prediction = roberta.predict('sentence_classification_head', tokens).argmax().item() + prediction_label = label_fn(prediction) + ncorrect += int(prediction_label == target) + nsamples += 1 +print('| Accuracy: ', float(ncorrect)/float(nsamples)) + +``` From a33ac060de722d921d33dbc6e0c7c93bbff9ee9d Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Tue, 13 Aug 2019 07:49:25 -0700 Subject: [PATCH 088/213] Add Commonsense QA task Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1014 Differential Revision: D16784120 Pulled By: myleott fbshipit-source-id: 946c0e33b594f8378e4ab6482ce49efcb36e1743 --- README.md | 46 ++--- examples/roberta/README.cqa.md | 99 ++++++++++ ...ion.md => README.custom_classification.md} | 0 ...README.finetune_glue.md => README.glue.md} | 0 examples/roberta/README.md | 5 +- ...README.finetune_race.md => README.race.md} | 0 examples/roberta/README.wsc.md | 34 ++-- examples/roberta/commonsense_qa/__init__.py | 6 + .../commonsense_qa/commonsense_qa_task.py | 174 ++++++++++++++++++ .../commonsense_qa/download_cqa_data.sh | 14 ++ fairseq/criterions/sentence_ranking.py | 53 ++++-- fairseq/data/list_dataset.py | 2 +- fairseq/models/roberta/hub_interface.py | 9 +- fairseq/models/roberta/model.py | 4 +- 14 files changed, 387 insertions(+), 59 deletions(-) create mode 100644 examples/roberta/README.cqa.md rename examples/roberta/{README.finetune_custom_classification.md => README.custom_classification.md} (100%) rename examples/roberta/{README.finetune_glue.md => README.glue.md} (100%) rename examples/roberta/{README.finetune_race.md => README.race.md} (100%) create mode 100644 examples/roberta/commonsense_qa/__init__.py create mode 100644 examples/roberta/commonsense_qa/commonsense_qa_task.py create mode 100644 examples/roberta/commonsense_qa/download_cqa_data.sh diff --git a/README.md b/README.md index 2b1510c276..b17b42bfed 100644 --- a/README.md +++ b/README.md @@ -15,22 +15,22 @@ modeling and other text generation tasks. Fairseq provides reference implementations of various sequence-to-sequence models, including: - **Convolutional Neural Networks (CNN)** - - [Dauphin et al. (2017): Language Modeling with Gated Convolutional Networks](examples/language_model/conv_lm/README.md) - - [Gehring et al. (2017): Convolutional Sequence to Sequence Learning](examples/conv_seq2seq/README.md) - - [Edunov et al. 
(2018): Classical Structured Prediction Losses for Sequence to Sequence Learning](https://github.com/pytorch/fairseq/tree/classic_seqlevel) - - [Fan et al. (2018): Hierarchical Neural Story Generation](examples/stories/README.md) - - **_New_** [wav2vec: Unsupervised Pre-training for Speech Recognition (Schneider et al., 2019)](examples/wav2vec/README.md) + - [Language Modeling with Gated Convolutional Networks (Dauphin et al., 2017)](examples/language_model/conv_lm/README.md) + - [Convolutional Sequence to Sequence Learning (Gehring et al., 2017)](examples/conv_seq2seq/README.md) + - [Classical Structured Prediction Losses for Sequence to Sequence Learning (Edunov et al., 2018)](https://github.com/pytorch/fairseq/tree/classic_seqlevel) + - [Hierarchical Neural Story Generation (Fan et al., 2018)](examples/stories/README.md) + - [wav2vec: Unsupervised Pre-training for Speech Recognition (Schneider et al., 2019)](examples/wav2vec/README.md) - **LightConv and DynamicConv models** - - [Wu et al. (2019): Pay Less Attention with Lightweight and Dynamic Convolutions](examples/pay_less_attention_paper/README.md) + - [Pay Less Attention with Lightweight and Dynamic Convolutions (Wu et al., 2019)](examples/pay_less_attention_paper/README.md) - **Long Short-Term Memory (LSTM) networks** - - Luong et al. (2015): Effective Approaches to Attention-based Neural Machine Translation + - Effective Approaches to Attention-based Neural Machine Translation (Luong et al., 2015) - **Transformer (self-attention) networks** - - Vaswani et al. (2017): Attention Is All You Need - - [Ott et al. (2018): Scaling Neural Machine Translation](examples/scaling_nmt/README.md) - - [Edunov et al. (2018): Understanding Back-Translation at Scale](examples/backtranslation/README.md) - - [Baevski and Auli (2018): Adaptive Input Representations for Neural Language Modeling](examples/language_model/transformer_lm/README.md) - - [Shen et al. (2019): Mixture Models for Diverse Machine Translation: Tricks of the Trade](examples/translation_moe/README.md) - - **_New_** [Liu et al. (2019): RoBERTa: A Robustly Optimized BERT Pretraining Approach](examples/roberta/README.md) + - Attention Is All You Need (Vaswani et al., 2017) + - [Scaling Neural Machine Translation (Ott et al., 2018)](examples/scaling_nmt/README.md) + - [Understanding Back-Translation at Scale (Edunov et al., 2018)](examples/backtranslation/README.md) + - [Adaptive Input Representations for Neural Language Modeling (Baevski and Auli, 2018)](examples/language_model/transformer_lm/README.md) + - [Mixture Models for Diverse Machine Translation: Tricks of the Trade (Shen et al., 2019)](examples/translation_moe/README.md) + - [RoBERTa: A Robustly Optimized BERT Pretraining Approach (Liu et al., 2019)](examples/roberta/README.md) **Additionally:** - multi-GPU (distributed) training on one machine or across multiple machines @@ -96,16 +96,16 @@ as well as example training and evaluation commands. - [Language Modeling](examples/language_model/README.md): convolutional models are available We also have more detailed READMEs to reproduce results from specific papers: -- [Liu et al. (2019): RoBERTa: A Robustly Optimized BERT Pretraining Approach](examples/roberta/README.md) -- [Schneider et al. (2019): wav2vec: Unsupervised Pre-training for Speech Recognition](examples/wav2vec/README.md) -- [Shen et al. (2019) Mixture Models for Diverse Machine Translation: Tricks of the Trade](examples/translation_moe/README.md) -- [Wu et al. 
(2019): Pay Less Attention with Lightweight and Dynamic Convolutions](examples/pay_less_attention_paper/README.md) -- [Edunov et al. (2018): Understanding Back-Translation at Scale](examples/backtranslation/README.md) -- [Edunov et al. (2018): Classical Structured Prediction Losses for Sequence to Sequence Learning](https://github.com/pytorch/fairseq/tree/classic_seqlevel) -- [Fan et al. (2018): Hierarchical Neural Story Generation](examples/stories/README.md) -- [Ott et al. (2018): Scaling Neural Machine Translation](examples/scaling_nmt/README.md) -- [Gehring et al. (2017): Convolutional Sequence to Sequence Learning](examples/conv_seq2seq/README.md) -- [Dauphin et al. (2017): Language Modeling with Gated Convolutional Networks](examples/language_model/conv_lm/README.md) +- [RoBERTa: A Robustly Optimized BERT Pretraining Approach (Liu et al., 2019)](examples/roberta/README.md) +- [wav2vec: Unsupervised Pre-training for Speech Recognition (Schneider et al., 2019)](examples/wav2vec/README.md) +- [Mixture Models for Diverse Machine Translation: Tricks of the Trade (Shen et al., 2019)](examples/translation_moe/README.md) +- [Pay Less Attention with Lightweight and Dynamic Convolutions (Wu et al., 2019)](examples/pay_less_attention_paper/README.md) +- [Understanding Back-Translation at Scale (Edunov et al., 2018)](examples/backtranslation/README.md) +- [Classical Structured Prediction Losses for Sequence to Sequence Learning (Edunov et al., 2018)](https://github.com/pytorch/fairseq/tree/classic_seqlevel) +- [Hierarchical Neural Story Generation (Fan et al., 2018)](examples/stories/README.md) +- [Scaling Neural Machine Translation (Ott et al., 2018)](examples/scaling_nmt/README.md) +- [Convolutional Sequence to Sequence Learning (Gehring et al., 2017)](examples/conv_seq2seq/README.md) +- [Language Modeling with Gated Convolutional Networks (Dauphin et al., 2017)](examples/language_model/conv_lm/README.md) # Join the fairseq community diff --git a/examples/roberta/README.cqa.md b/examples/roberta/README.cqa.md new file mode 100644 index 0000000000..f6b6036f30 --- /dev/null +++ b/examples/roberta/README.cqa.md @@ -0,0 +1,99 @@ +# Finetuning RoBERTa on Commonsense QA + +We follow a similar approach to [finetuning RACE](README.race.md). Specifically +for each question we construct five inputs, one for each of the five candidate +answer choices. Each input is constructed by concatenating the question and +candidate answer. We then encode each input and pass the resulting "[CLS]" +representations through a fully-connected layer to predict the correct answer. +We train with a standard cross-entropy loss. + +We also found it helpful to prepend a prefix of `Q:` to the question and `A:` to +the input. The complete input format is: +``` + Q: Where would I not want a fox? A: hen house +``` + +Our final submission is based on a hyperparameter search over the learning rate +(1e-5, 2e-5, 3e-5), batch size (8, 16), number of training steps (2000, 3000, +4000) and random seed. We selected the model with the best performance on the +development set after 100 trials. + +### 1) Download the data from Commonsense QA website (https://www.tau-nlp.org/commonsenseqa) +```bash +bash examples/roberta/commonsense_qa/download_cqa_data.sh +``` + +### 2) Finetune + +```bash +MAX_UPDATES=3000 # Number of training steps. +WARMUP_UPDATES=150 # Linearly increase LR over this many steps. +LR=1e-05 # Peak LR for polynomial LR scheduler. +MAX_SENTENCES=16 # Batch size. +SEED=1 # Random seed. 
+ROBERTA_PATH=/path/to/roberta/model.pt
+DATA_DIR=data/CommonsenseQA
+
+# we use the --user-dir option to load the task from
+# the examples/roberta/commonsense_qa directory:
+FAIRSEQ_PATH=/path/to/fairseq
+FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/commonsense_qa
+
+CUDA_VISIBLE_DEVICES=0 fairseq-train --fp16 \
+    $DATA_DIR \
+    --user-dir $FAIRSEQ_USER_DIR \
+    --restore-file $ROBERTA_PATH \
+    --reset-optimizer --reset-dataloader --reset-meters \
+    --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \
+    --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
+    --task commonsense_qa --init-token 0 --bpe gpt2 \
+    --arch roberta_large --max-positions 512 \
+    --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
+    --criterion sentence_ranking --num-classes 5 \
+    --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 --clip-norm 0.0 \
+    --lr-scheduler polynomial_decay --lr $LR \
+    --warmup-updates $WARMUP_UPDATES --total-num-update $MAX_UPDATES \
+    --max-sentences $MAX_SENTENCES \
+    --max-update $MAX_UPDATES \
+    --log-format simple --log-interval 25 \
+    --seed $SEED
+```
+
+The above command assumes training on 1 GPU with 32GB of RAM. For GPUs with
+less memory, decrease `--max-sentences` and increase `--update-freq`
+accordingly to compensate.
+
+### 3) Evaluate
+```python
+import json
+import torch
+from fairseq.models.roberta import RobertaModel
+from examples.roberta import commonsense_qa  # load the Commonsense QA task
+roberta = RobertaModel.from_pretrained('checkpoints', 'checkpoint_best.pt', 'data/CommonsenseQA')
+roberta.eval()  # disable dropout
+roberta.cuda()  # use the GPU (optional)
+nsamples, ncorrect = 0, 0
+with open('data/CommonsenseQA/valid.jsonl') as h:
+    for line in h:
+        example = json.loads(line)
+        scores = []
+        for choice in example['question']['choices']:
+            input = roberta.encode(
+                'Q: ' + example['question']['stem'],
+                'A: ' + choice['text'],
+                no_separator=True
+            )
+            score = roberta.predict('sentence_classification_head', input, return_logits=True)
+            scores.append(score)
+        pred = torch.cat(scores).argmax()
+        answer = ord(example['answerKey']) - ord('A')
+        nsamples += 1
+        if pred == answer:
+            ncorrect += 1
+
+print('Accuracy: ' + str(ncorrect / float(nsamples)))
+# Accuracy: 0.7846027846027847
+```
+
+The above snippet is not batched, which makes it quite slow. See [instructions
+for batched prediction with RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta#batched-prediction).
diff --git a/examples/roberta/README.finetune_custom_classification.md b/examples/roberta/README.custom_classification.md similarity index 100% rename from examples/roberta/README.finetune_custom_classification.md rename to examples/roberta/README.custom_classification.md diff --git a/examples/roberta/README.finetune_glue.md b/examples/roberta/README.glue.md similarity index 100% rename from examples/roberta/README.finetune_glue.md rename to examples/roberta/README.glue.md diff --git a/examples/roberta/README.md b/examples/roberta/README.md index 5b80fe94cf..1b9545d85e 100644 --- a/examples/roberta/README.md +++ b/examples/roberta/README.md @@ -215,9 +215,10 @@ print('| Accuracy: ', float(ncorrect)/float(nsamples)) ## Finetuning -- [Finetuning on GLUE](README.finetune_glue.md) -- [Finetuning on custom classification tasks (e.g., IMDB)](README.finetune_custom_classification.md) +- [Finetuning on GLUE](README.glue.md) +- [Finetuning on custom classification tasks (e.g., IMDB)](README.custom_classification.md) - [Finetuning on Winograd Schema Challenge (WSC)](README.wsc.md) +- [Finetuning on Commonsense QA (CQA)](README.cqa.md) - Finetuning on SQuAD: coming soon ## Pretraining using your own data diff --git a/examples/roberta/README.finetune_race.md b/examples/roberta/README.race.md similarity index 100% rename from examples/roberta/README.finetune_race.md rename to examples/roberta/README.race.md diff --git a/examples/roberta/README.wsc.md b/examples/roberta/README.wsc.md index b1437d1de7..d279b89037 100644 --- a/examples/roberta/README.wsc.md +++ b/examples/roberta/README.wsc.md @@ -43,24 +43,24 @@ ROBERTA_PATH=/path/to/roberta/model.pt FAIRSEQ_PATH=/path/to/fairseq FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/wsc -cd fairseq CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train WSC/ \ - --restore-file $ROBERTA_PATH \ - --reset-optimizer --reset-dataloader --reset-meters \ - --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \ - --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ - --valid-subset val \ - --fp16 --ddp-backend no_c10d \ - --user-dir $FAIRSEQ_USER_DIR \ - --task wsc --criterion wsc --wsc-cross-entropy \ - --arch roberta_large --bpe gpt2 --max-positions 512 \ - --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \ - --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 \ - --lr-scheduler polynomial_decay --lr $LR \ - --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_NUM_UPDATES \ - --max-sentences $MAX_SENTENCES \ - --max-update $TOTAL_NUM_UPDATES \ - --log-format simple --log-interval 100 + --restore-file $ROBERTA_PATH \ + --reset-optimizer --reset-dataloader --reset-meters \ + --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \ + --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ + --valid-subset val \ + --fp16 --ddp-backend no_c10d \ + --user-dir $FAIRSEQ_USER_DIR \ + --task wsc --criterion wsc --wsc-cross-entropy \ + --arch roberta_large --bpe gpt2 --max-positions 512 \ + --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \ + --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 \ + --lr-scheduler polynomial_decay --lr $LR \ + --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_NUM_UPDATES \ + --max-sentences $MAX_SENTENCES \ + --max-update $TOTAL_NUM_UPDATES \ + --log-format simple --log-interval 100 \ + --seed $SEED ``` The above command assumes training on 4 GPUs, but you can achieve the same diff --git a/examples/roberta/commonsense_qa/__init__.py 
b/examples/roberta/commonsense_qa/__init__.py new file mode 100644 index 0000000000..42d21f35eb --- /dev/null +++ b/examples/roberta/commonsense_qa/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from . import commonsense_qa_task # noqa diff --git a/examples/roberta/commonsense_qa/commonsense_qa_task.py b/examples/roberta/commonsense_qa/commonsense_qa_task.py new file mode 100644 index 0000000000..39a16d2948 --- /dev/null +++ b/examples/roberta/commonsense_qa/commonsense_qa_task.py @@ -0,0 +1,174 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import json +import os + +import numpy as np +import torch + +from fairseq.data import ( + data_utils, + Dictionary, + encoders, + IdDataset, + ListDataset, + NestedDictionaryDataset, + NumSamplesDataset, + NumelDataset, + RawLabelDataset, + RightPadDataset, + SortDataset, +) +from fairseq.tasks import FairseqTask, register_task + + +@register_task('commonsense_qa') +class CommonsenseQATask(FairseqTask): + """Task to finetune RoBERTa for Commonsense QA.""" + + @staticmethod + def add_args(parser): + """Add task-specific arguments to the parser.""" + parser.add_argument('data', metavar='DIR', + help='path to data directory; we load .jsonl') + parser.add_argument('--init-token', type=int, default=None, + help='add token at the beginning of each batch item') + parser.add_argument('--num-classes', type=int, default=5) + + def __init__(self, args, vocab): + super().__init__(args) + self.vocab = vocab + self.mask = vocab.add_symbol('') + + self.bpe = encoders.build_bpe(args) + + @classmethod + def load_dictionary(cls, filename): + """Load the dictionary from the filename + + Args: + filename (str): the filename + """ + dictionary = Dictionary.load(filename) + dictionary.add_symbol('') + return dictionary + + @classmethod + def setup_task(cls, args, **kwargs): + assert args.criterion == 'sentence_ranking', 'Must set --criterion=sentence_ranking' + + # load data and label dictionaries + vocab = cls.load_dictionary(os.path.join(args.data, 'dict.txt')) + print('| dictionary: {} types'.format(len(vocab))) + + return cls(args, vocab) + + def load_dataset(self, split, epoch=0, combine=False, data_path=None, return_only=False, **kwargs): + """Load a given dataset split. 
+ + Args: + split (str): name of the split (e.g., train, valid, test) + """ + + def binarize(s, append_bos=False): + if self.bpe is not None: + s = self.bpe.encode(s) + tokens = self.vocab.encode_line( + s, append_eos=True, add_if_not_exist=False, + ).long() + if append_bos and self.args.init_token is not None: + tokens = torch.cat([tokens.new([self.args.init_token]), tokens]) + return tokens + + if data_path is None: + data_path = os.path.join(self.args.data, split + '.jsonl') + if not os.path.exists(data_path): + raise FileNotFoundError('Cannot find data: {}'.format(data_path)) + + src_tokens = [[] for i in range(self.args.num_classes)] + src_lengths = [[] for i in range(self.args.num_classes)] + labels = [] + + with open(data_path) as h: + for line in h: + example = json.loads(line.strip()) + if 'answerKey' in example: + label = ord(example['answerKey']) - ord('A') + labels.append(label) + question = example['question']['stem'] + assert len(example['question']['choices']) == self.args.num_classes + # format: ` Q: Where would I not want a fox? A: hen house ` + question = 'Q: ' + question + question_toks = binarize(question, append_bos=True) + for i, choice in enumerate(example['question']['choices']): + src = 'A: ' + choice['text'] + src_bin = torch.cat([question_toks, binarize(src)]) + src_tokens[i].append(src_bin) + src_lengths[i].append(len(src_bin)) + assert all(len(src_tokens[0]) == len(src_tokens[i]) for i in range(self.args.num_classes)) + assert len(src_tokens[0]) == len(src_lengths[0]) + assert len(labels) == 0 or len(labels) == len(src_tokens[0]) + + for i in range(self.args.num_classes): + src_lengths[i] = np.array(src_lengths[i]) + src_tokens[i] = ListDataset(src_tokens[i], src_lengths[i]) + src_lengths[i] = ListDataset(src_lengths[i]) + + dataset = { + 'id': IdDataset(), + 'nsentences': NumSamplesDataset(), + 'ntokens': NumelDataset(src_tokens[0], reduce=True), + } + + for i in range(self.args.num_classes): + dataset.update({ + 'net_input{}'.format(i + 1): { + 'src_tokens': RightPadDataset( + src_tokens[i], + pad_idx=self.source_dictionary.pad(), + ), + 'src_lengths': src_lengths[i], + } + }) + + if len(labels) > 0: + dataset.update({'target': RawLabelDataset(labels)}) + + dataset = NestedDictionaryDataset( + dataset, + sizes=[np.maximum.reduce([src_token.sizes for src_token in src_tokens])], + ) + + with data_utils.numpy_seed(self.args.seed): + dataset = SortDataset( + dataset, + # shuffle + sort_order=[np.random.permutation(len(dataset))], + ) + + print('| Loaded {} with {} samples'.format(split, len(dataset))) + + self.datasets[split] = dataset + return self.datasets[split] + + def build_model(self, args): + from fairseq import models + model = models.build_model(args, self) + + model.register_classification_head( + 'sentence_classification_head', + num_classes=1, + ) + + return model + + @property + def source_dictionary(self): + return self.vocab + + @property + def target_dictionary(self): + return self.vocab diff --git a/examples/roberta/commonsense_qa/download_cqa_data.sh b/examples/roberta/commonsense_qa/download_cqa_data.sh new file mode 100644 index 0000000000..5f300093fa --- /dev/null +++ b/examples/roberta/commonsense_qa/download_cqa_data.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +OUTDIR=data/CommonsenseQA + +mkdir -p $OUTDIR + +wget -O $OUTDIR/train.jsonl https://s3.amazonaws.com/commensenseqa/train_rand_split.jsonl +wget -O $OUTDIR/valid.jsonl https://s3.amazonaws.com/commensenseqa/dev_rand_split.jsonl +wget -O $OUTDIR/test.jsonl https://s3.amazonaws.com/commensenseqa/test_rand_split_no_answers.jsonl +wget -O $OUTDIR/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt diff --git a/fairseq/criterions/sentence_ranking.py b/fairseq/criterions/sentence_ranking.py index fef4c93bba..05edca9723 100644 --- a/fairseq/criterions/sentence_ranking.py +++ b/fairseq/criterions/sentence_ranking.py @@ -16,6 +16,24 @@ @register_criterion('sentence_ranking') class SentenceRankingCriterion(FairseqCriterion): + def __init__(self, args, task): + super().__init__(args, task) + if self.args.save_predictions is not None: + self.prediction_h = open(self.args.save_predictions, 'w') + else: + self.prediction_h = None + + def __del__(self): + if self.prediction_h is not None: + self.prediction_h.close() + + @staticmethod + def add_args(parser): + # fmt: off + parser.add_argument('--save-predictions', metavar='FILE', + help='file to save predictions to') + # fmt: on + def forward(self, model, sample, reduce=True): """Compute ranking loss for the given sample. @@ -28,20 +46,32 @@ def forward(self, model, sample, reduce=True): for idx in range(self.args.num_classes): score, _ = model( **sample['net_input{idx}'.format(idx=idx+1)], - features_only=True, classification_head_name='sentence_classification_head', ) scores.append(score) logits = torch.cat(scores, dim=1) - targets = model.get_targets(sample, [logits]).view(-1) - sample_size = targets.numel() + sample_size = logits.size(0) - loss = F.nll_loss( - F.log_softmax(logits, dim=-1, dtype=torch.float32), - targets, - reduction='sum', - ) + if 'target' in sample: + targets = model.get_targets(sample, [logits]).view(-1) + loss = F.nll_loss( + F.log_softmax(logits, dim=-1, dtype=torch.float32), + targets, + reduction='sum', + ) + else: + targets = None + loss = torch.tensor(0.0, requires_grad=True) + + if self.prediction_h is not None: + preds = logits.argmax(dim=1) + for i, (id, pred) in enumerate(zip(sample['id'].tolist(), preds.tolist())): + if targets is not None: + label = targets[i].item() + print('{}\t{}\t{}'.format(id, pred, label), file=self.prediction_h) + else: + print('{}\t{}'.format(id, pred), file=self.prediction_h) logging_output = { 'loss': utils.item(loss.data) if reduce else loss.data, @@ -49,9 +79,10 @@ def forward(self, model, sample, reduce=True): 'nsentences': sample_size, 'sample_size': sample_size, } - logging_output.update( - ncorrect=(logits.max(dim=1)[1] == targets).sum().item() - ) + if targets is not None: + logging_output.update( + ncorrect=(logits.max(dim=1)[1] == targets).sum().item() + ) return loss, sample_size, logging_output @staticmethod diff --git a/fairseq/data/list_dataset.py b/fairseq/data/list_dataset.py index f753727abf..4d3b01d7bf 100644 --- a/fairseq/data/list_dataset.py +++ b/fairseq/data/list_dataset.py @@ -8,7 +8,7 @@ class ListDataset(BaseWrapperDataset): - def __init__(self, dataset, sizes): + def __init__(self, dataset, sizes=None): super().__init__(dataset) self._sizes = sizes diff --git a/fairseq/models/roberta/hub_interface.py b/fairseq/models/roberta/hub_interface.py index 6d3e5674a0..e40e4ab92a 100644 --- a/fairseq/models/roberta/hub_interface.py +++ b/fairseq/models/roberta/hub_interface.py @@ -33,7 +33,7 @@ def __init__(self, args, task, model): def device(self): return 
self._float_tensor.device - def encode(self, sentence: str, *addl_sentences) -> torch.LongTensor: + def encode(self, sentence: str, *addl_sentences, no_separator=False) -> torch.LongTensor: """ BPE-encode a sentence (or multiple sentences). @@ -56,7 +56,8 @@ def encode(self, sentence: str, *addl_sentences) -> torch.LongTensor: """ bpe_sentence = ' ' + self.bpe.encode(sentence) + ' ' for s in addl_sentences: - bpe_sentence += ' ' + self.bpe.encode(s) + ' ' + bpe_sentence += (' ' if not no_separator else '') + bpe_sentence += ' ' + self.bpe.encode(s) + ' ' tokens = self.task.source_dictionary.encode_line(bpe_sentence, append_eos=False) return tokens.long() @@ -99,9 +100,11 @@ def register_classification_head( name, num_classes=num_classes, embedding_size=embedding_size, **kwargs ) - def predict(self, head: str, tokens: torch.LongTensor): + def predict(self, head: str, tokens: torch.LongTensor, return_logits: bool = False): features = self.extract_features(tokens) logits = self.model.classification_heads[head](features) + if return_logits: + return logits return F.log_softmax(logits, dim=-1) def extract_features_aligned_to_words(self, sentence: str, return_all_hiddens: bool = False) -> torch.Tensor: diff --git a/fairseq/models/roberta/model.py b/fairseq/models/roberta/model.py index 2bbe919bdb..bf5e7c4ef5 100644 --- a/fairseq/models/roberta/model.py +++ b/fairseq/models/roberta/model.py @@ -93,8 +93,8 @@ def build_model(cls, args, task): return cls(args, encoder) def forward(self, src_tokens, features_only=False, return_all_hiddens=False, classification_head_name=None, **kwargs): - assert classification_head_name is None or features_only, \ - 'If passing classification_head_name argument, features_only must be set to True' + if classification_head_name is not None: + features_only = True x, extra = self.decoder(src_tokens, features_only, return_all_hiddens, **kwargs) From d015d23a1f3fbe01a12d9ee6d07ae0a59b6241f8 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Tue, 13 Aug 2019 13:03:40 -0700 Subject: [PATCH 089/213] Add fairseq-validate Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/765 Differential Revision: D16763357 Pulled By: myleott fbshipit-source-id: 758b03158e486ee82786e2d5bf4e46073b50c503 --- fairseq/options.py | 8 ++++ setup.py | 3 +- tests/test_binaries.py | 31 +++++++++++++ validate.py | 100 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 141 insertions(+), 1 deletion(-) create mode 100644 validate.py diff --git a/fairseq/options.py b/fairseq/options.py index b02a5778f0..1bd54d5797 100644 --- a/fairseq/options.py +++ b/fairseq/options.py @@ -48,6 +48,14 @@ def get_eval_lm_parser(default_task='language_modeling'): return parser +def get_validation_parser(default_task=None): + parser = get_parser('Validation', default_task) + add_dataset_args(parser, train=True) + group = parser.add_argument_group('Evaluation') + add_common_eval_args(group) + return parser + + def eval_str_list(x, type=float): if x is None: return None diff --git a/setup.py b/setup.py index 83b3a7ee54..6457333cb4 100644 --- a/setup.py +++ b/setup.py @@ -60,8 +60,9 @@ 'fairseq-generate = fairseq_cli.generate:cli_main', 'fairseq-interactive = fairseq_cli.interactive:cli_main', 'fairseq-preprocess = fairseq_cli.preprocess:cli_main', - 'fairseq-train = fairseq_cli.train:cli_main', 'fairseq-score = fairseq_cli.score:main', + 'fairseq-train = fairseq_cli.train:cli_main', + 'fairseq-validate = fairseq_cli.validate:cli_main', ], }, ) diff --git a/tests/test_binaries.py 
b/tests/test_binaries.py index d7f80f86ba..27c9ff149a 100644 --- a/tests/test_binaries.py +++ b/tests/test_binaries.py @@ -20,6 +20,7 @@ import generate import interactive import eval_lm +import validate class TestTranslation(unittest.TestCase): @@ -476,6 +477,21 @@ def train_translation_model(data_dir, arch, extra_flags=None, task='translation' ) train.main(train_args) + # test validation + validate_parser = options.get_validation_parser() + validate_args = options.parse_args_and_arch( + validate_parser, + [ + '--task', task, + data_dir, + '--path', os.path.join(data_dir, 'checkpoint_last.pt'), + '--valid-subset', 'valid', + '--max-tokens', '500', + '--no-progress-bar', + ] + ) + validate.main(validate_args) + def generate_main(data_dir, extra_flags=None): generate_parser = options.get_generation_parser() @@ -541,6 +557,21 @@ def train_language_model(data_dir, arch, extra_flags=None): ) train.main(train_args) + # test validation + validate_parser = options.get_validation_parser() + validate_args = options.parse_args_and_arch( + validate_parser, + [ + '--task', 'language_modeling', + data_dir, + '--path', os.path.join(data_dir, 'checkpoint_last.pt'), + '--valid-subset', 'valid', + '--max-tokens', '500', + '--no-progress-bar', + ] + ) + validate.main(validate_args) + def eval_lm_main(data_dir): eval_lm_parser = options.get_eval_lm_parser() diff --git a/validate.py b/validate.py new file mode 100644 index 0000000000..ed8f41e400 --- /dev/null +++ b/validate.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 -u +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. 
+ +import torch + +from fairseq import checkpoint_utils, options, progress_bar, utils + + +def main(args, override_args=None): + utils.import_user_module(args) + + use_fp16 = args.fp16 + use_cuda = torch.cuda.is_available() and not args.cpu + + if override_args is not None: + overrides = vars(override_args) + overrides.update(eval(getattr(override_args, 'model_overrides', '{}'))) + else: + overrides = None + + # Load ensemble + print('| loading model(s) from {}'.format(args.path)) + models, model_args, task = checkpoint_utils.load_model_ensemble_and_task( + [args.path], + arg_overrides=overrides, + ) + model = models[0] + + # Move models to GPU + for model in models: + if use_fp16: + model.half() + if use_cuda: + model.cuda() + + # Print args + print(model_args) + + # Build criterion + criterion = task.build_criterion(model_args) + criterion.eval() + + # Load valid dataset (we load training data below, based on the latest checkpoint) + for subset in args.valid_subset.split(','): + try: + task.load_dataset(subset, combine=False, epoch=0) + dataset = task.dataset(subset) + except KeyError: + raise Exception('Cannot find dataset: ' + subset) + + # Initialize data iterator + itr = task.get_batch_iterator( + dataset=dataset, + max_tokens=args.max_tokens, + max_sentences=args.max_sentences, + max_positions=utils.resolve_max_positions( + task.max_positions(), + *[m.max_positions() for m in models], + ), + ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, + required_batch_size_multiple=args.required_batch_size_multiple, + seed=args.seed, + num_workers=args.num_workers, + ).next_epoch_itr(shuffle=False) + progress = progress_bar.build_progress_bar( + args, itr, + prefix='valid on \'{}\' subset'.format(subset), + no_progress_bar='simple' + ) + + log_outputs = [] + for i, sample in enumerate(progress): + sample = utils.move_to_cuda(sample) if use_cuda else sample + _loss, _sample_size, log_output = task.valid_step(sample, model, criterion) + progress.log(log_output, step=i) + log_outputs.append(log_output) + + log_output = task.aggregate_logging_outputs(log_outputs, criterion) + + progress.print(log_output, tag=subset, step=i) + + +def cli_main(): + parser = options.get_validation_parser() + args = options.parse_args_and_arch(parser) + + # only override args that are explicitly given on the command line + override_parser = options.get_validation_parser() + override_args = options.parse_args_and_arch(override_parser, suppress_defaults=True) + + main(args, override_args) + + +if __name__ == '__main__': + cli_main() From baa8ce119d56939c751ca81f6d0ddbab6feefb66 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Tue, 13 Aug 2019 17:34:29 -0700 Subject: [PATCH 090/213] Updates for PyTorch 1.2 masking/bool behavior Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/821 Differential Revision: D16790120 Pulled By: myleott fbshipit-source-id: 2fb5070172636561d08596a29f08c93df07548bf --- fairseq/data/mask_tokens_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fairseq/data/mask_tokens_dataset.py b/fairseq/data/mask_tokens_dataset.py index b73da6b0ca..84d313a096 100644 --- a/fairseq/data/mask_tokens_dataset.py +++ b/fairseq/data/mask_tokens_dataset.py @@ -127,7 +127,7 @@ def __getitem__(self, index: int): if self.mask_whole_words is not None: mask = np.repeat(mask, word_lens) new_item = np.full(len(mask), self.pad_idx) - new_item[mask] = item[torch.from_numpy(mask.astype(np.uint8))] + new_item[mask] = item[torch.from_numpy(mask.astype(np.uint8)) 
== 1] return torch.from_numpy(new_item) # decide unmasking and random replacement From 7c89e13f64897b4caf24c83faec72fb82711e418 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Tue, 13 Aug 2019 20:33:07 -0700 Subject: [PATCH 091/213] Fix tests Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/822 Differential Revision: D16800078 Pulled By: myleott fbshipit-source-id: b86e08e01f2fe13c64b77f1d23a5f6800f252bf7 --- tests/speech_recognition/asr_test_base.py | 5 +- tests/test_binaries.py | 68 ++++++++++++----------- 2 files changed, 40 insertions(+), 33 deletions(-) diff --git a/tests/speech_recognition/asr_test_base.py b/tests/speech_recognition/asr_test_base.py index 4fc7f78cac..6302959a81 100644 --- a/tests/speech_recognition/asr_test_base.py +++ b/tests/speech_recognition/asr_test_base.py @@ -172,7 +172,10 @@ def check_encoder_output(encoder_output, batch_size=None): "encoder_padding_mask must be a torch.Tensor" + _current_postion_info() ) return False, msg - if mask.dtype != torch.uint8: + if ( + mask.dtype != torch.uint8 + and (not hasattr(torch, 'bool') or mask.dtype != torch.bool) + ): msg = ( "encoder_padding_mask must have dtype of uint8" + _current_postion_info() diff --git a/tests/test_binaries.py b/tests/test_binaries.py index 27c9ff149a..b517278273 100644 --- a/tests/test_binaries.py +++ b/tests/test_binaries.py @@ -151,7 +151,7 @@ def test_transformer(self): '--decoder-layers', '2', '--encoder-embed-dim', '8', '--decoder-embed-dim', '8', - ]) + ], run_validation=True) generate_main(data_dir) def test_lightconv(self): @@ -257,7 +257,9 @@ def test_transformer_lm(self): with tempfile.TemporaryDirectory('test_transformer_lm') as data_dir: create_dummy_data(data_dir) preprocess_lm_data(data_dir) - train_language_model(data_dir, 'transformer_lm', ['--add-bos-token']) + train_language_model( + data_dir, 'transformer_lm', ['--add-bos-token'], run_validation=True, + ) eval_lm_main(data_dir) @@ -457,7 +459,7 @@ def preprocess_translation_data(data_dir, extra_flags=None): preprocess.main(preprocess_args) -def train_translation_model(data_dir, arch, extra_flags=None, task='translation'): +def train_translation_model(data_dir, arch, extra_flags=None, task='translation', run_validation=False): train_parser = options.get_training_parser() train_args = options.parse_args_and_arch( train_parser, @@ -477,20 +479,21 @@ def train_translation_model(data_dir, arch, extra_flags=None, task='translation' ) train.main(train_args) - # test validation - validate_parser = options.get_validation_parser() - validate_args = options.parse_args_and_arch( - validate_parser, - [ - '--task', task, - data_dir, - '--path', os.path.join(data_dir, 'checkpoint_last.pt'), - '--valid-subset', 'valid', - '--max-tokens', '500', - '--no-progress-bar', - ] - ) - validate.main(validate_args) + if run_validation: + # test validation + validate_parser = options.get_validation_parser() + validate_args = options.parse_args_and_arch( + validate_parser, + [ + '--task', task, + data_dir, + '--path', os.path.join(data_dir, 'checkpoint_last.pt'), + '--valid-subset', 'valid', + '--max-tokens', '500', + '--no-progress-bar', + ] + ) + validate.main(validate_args) def generate_main(data_dir, extra_flags=None): @@ -534,7 +537,7 @@ def preprocess_lm_data(data_dir): preprocess.main(preprocess_args) -def train_language_model(data_dir, arch, extra_flags=None): +def train_language_model(data_dir, arch, extra_flags=None, run_validation=False): train_parser = options.get_training_parser() train_args = 
options.parse_args_and_arch( train_parser, @@ -557,20 +560,21 @@ def train_language_model(data_dir, arch, extra_flags=None): ) train.main(train_args) - # test validation - validate_parser = options.get_validation_parser() - validate_args = options.parse_args_and_arch( - validate_parser, - [ - '--task', 'language_modeling', - data_dir, - '--path', os.path.join(data_dir, 'checkpoint_last.pt'), - '--valid-subset', 'valid', - '--max-tokens', '500', - '--no-progress-bar', - ] - ) - validate.main(validate_args) + if run_validation: + # test validation + validate_parser = options.get_validation_parser() + validate_args = options.parse_args_and_arch( + validate_parser, + [ + '--task', 'language_modeling', + data_dir, + '--path', os.path.join(data_dir, 'checkpoint_last.pt'), + '--valid-subset', 'valid', + '--max-tokens', '500', + '--no-progress-bar', + ] + ) + validate.main(validate_args) def eval_lm_main(data_dir): From ffffe04ea12679bdc12fbbafdc6406dd3e6c9943 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Wed, 14 Aug 2019 05:00:05 -0700 Subject: [PATCH 092/213] v0.7.2 -> v0.8.0 (#1017) Summary: Changelog: - Relicensed under MIT license - Add RoBERTa - Add wav2vec - Add WMT'19 models - Add initial ASR code - Changed torch.hub interface (`generate` renamed to `translate`) - Add `--tokenizer` and `--bpe` - f812e52: Renamed data.transforms -> data.encoders - 654affc: New Dataset API (optional) - `47fd985`: Deprecate old Masked LM components - `5f78106`: Set mmap as default dataset format and infer format automatically - Misc fixes for sampling - Misc fixes to support PyTorch 1.2 Pull Request resolved: https://github.com/pytorch/fairseq/pull/1017 Differential Revision: D16799880 Pulled By: myleott fbshipit-source-id: 45ad8bc531724a53063cbc24ca1c93f715cdc5a7 --- docs/conf.py | 4 ++-- fairseq/__init__.py | 2 +- setup.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 7f108940d0..d6ee5c4ebf 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -60,9 +60,9 @@ # built documents. # # The short X.Y version. -version = '0.7.2' +version = '0.8.0' # The full version, including alpha/beta/rc tags. -release = '0.7.2' +release = '0.8.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/fairseq/__init__.py b/fairseq/__init__.py index cbb8d08a0e..08e29ab85a 100644 --- a/fairseq/__init__.py +++ b/fairseq/__init__.py @@ -4,7 +4,7 @@ # LICENSE file in the root directory of this source tree. 
__all__ = ['pdb'] -__version__ = '0.7.2' +__version__ = '0.8.0' import fairseq.criterions # noqa import fairseq.models # noqa diff --git a/setup.py b/setup.py index 6457333cb4..59d3410af0 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ setup( name='fairseq', - version='0.7.2', + version='0.8.0', description='Facebook AI Research Sequence-to-Sequence Toolkit', url='https://github.com/pytorch/fairseq', classifiers=[ From b870468689a6d903fb6c2464ca18e07ce73444fb Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Wed, 14 Aug 2019 08:24:36 -0700 Subject: [PATCH 093/213] Update READMEs Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/823 Differential Revision: D16804995 Pulled By: myleott fbshipit-source-id: abac5dc0ed6b7bfe2309ba273456e54b37340b2c --- README.md | 39 ++++---- examples/language_model/README.md | 73 +++++++------- examples/language_model/conv_lm/README.md | 23 ++++- .../language_model/transformer_lm/README.md | 2 +- examples/roberta/README.cqa.md | 4 +- examples/roberta/README.md | 35 ++++--- examples/roberta/README.pretraining.md | 95 +++++++++++++++++++ examples/roberta/README.wsc.md | 18 ++-- 8 files changed, 200 insertions(+), 89 deletions(-) create mode 100644 examples/roberta/README.pretraining.md diff --git a/README.md b/README.md index b17b42bfed..45dce65cf0 100644 --- a/README.md +++ b/README.md @@ -6,10 +6,10 @@ modeling and other text generation tasks. ### What's New: +- August 2019: [WMT'19 models released](examples/wmt19/README.md) - July 2019: fairseq relicensed under MIT license -- July 2019: [RoBERTa models and code release](examples/roberta/README.md) -- June 2019: [wav2vec models and code release](examples/wav2vec/README.md) -- April 2019: [fairseq demo paper @ NAACL 2019](https://arxiv.org/abs/1904.01038) +- July 2019: [RoBERTa models and code released](examples/roberta/README.md) +- June 2019: [wav2vec models and code released](examples/wav2vec/README.md) ### Features: @@ -31,6 +31,7 @@ Fairseq provides reference implementations of various sequence-to-sequence model - [Adaptive Input Representations for Neural Language Modeling (Baevski and Auli, 2018)](examples/language_model/transformer_lm/README.md) - [Mixture Models for Diverse Machine Translation: Tricks of the Trade (Shen et al., 2019)](examples/translation_moe/README.md) - [RoBERTa: A Robustly Optimized BERT Pretraining Approach (Liu et al., 2019)](examples/roberta/README.md) + - [Facebook FAIR's WMT19 News Translation Task Submission (Ng et al., 2019)](examples/wmt19/README.md) **Additionally:** - multi-GPU (distributed) training on one machine or across multiple machines @@ -49,38 +50,33 @@ translation and language modeling datasets. # Requirements and Installation -* [PyTorch](http://pytorch.org/) version >= 1.0.0 +* [PyTorch](http://pytorch.org/) version >= 1.1.0 * Python version >= 3.5 * For training new models, you'll also need an NVIDIA GPU and [NCCL](https://github.com/NVIDIA/nccl) +* **For faster training** install NVIDIA's [apex](https://github.com/NVIDIA/apex) library with the `--cuda_ext` option -Please follow the instructions here to install PyTorch: https://github.com/pytorch/pytorch#installation. - -If you use Docker make sure to increase the shared memory size either with -`--ipc=host` or `--shm-size` as command line options to `nvidia-docker run`. 
- -After PyTorch is installed, you can install fairseq with `pip`: -``` +To install fairseq: +```bash pip install fairseq ``` -On MacOS, -``` + +On MacOS: +```bash CFLAGS="-stdlib=libc++" pip install fairseq ``` + +If you use Docker make sure to increase the shared memory size either with +`--ipc=host` or `--shm-size` as command line options to `nvidia-docker run`. + **Installing from source** To install fairseq from source and develop locally: -``` +```bash git clone https://github.com/pytorch/fairseq cd fairseq pip install --editable . ``` -**Improved training speed** - -Training speed can be further improved by installing NVIDIA's -[apex](https://github.com/NVIDIA/apex) library with the `--cuda_ext` option. -fairseq will automatically switch to the faster modules provided by apex. - # Getting Started The [full documentation](https://fairseq.readthedocs.io/) contains instructions @@ -93,9 +89,10 @@ We provide pre-trained models and pre-processed, binarized test sets for several as well as example training and evaluation commands. - [Translation](examples/translation/README.md): convolutional and transformer models are available -- [Language Modeling](examples/language_model/README.md): convolutional models are available +- [Language Modeling](examples/language_model/README.md): convolutional and transformer models are available We also have more detailed READMEs to reproduce results from specific papers: +- [Facebook FAIR's WMT19 News Translation Task Submission (Ng et al., 2019)](examples/wmt19/README.md) - [RoBERTa: A Robustly Optimized BERT Pretraining Approach (Liu et al., 2019)](examples/roberta/README.md) - [wav2vec: Unsupervised Pre-training for Speech Recognition (Schneider et al., 2019)](examples/wav2vec/README.md) - [Mixture Models for Diverse Machine Translation: Tricks of the Trade (Shen et al., 2019)](examples/translation_moe/README.md) diff --git a/examples/language_model/README.md b/examples/language_model/README.md index 180714de49..a103755228 100644 --- a/examples/language_model/README.md +++ b/examples/language_model/README.md @@ -27,58 +27,57 @@ en_lm.sample('Barack Obama', beam=1, sampling=True, sampling_topk=10, temperatur # "Barack Obama is coming to Sydney and New Zealand (...)" ``` -## Training a new model with the CLI tools +## Training a transformer language model with the CLI tools -These scripts provide an example of pre-processing data for the Language Modeling task. +### 1) Preprocess the data -### prepare-wikitext-103.sh - -Provides an example of pre-processing for [WikiText-103 language modeling task](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/): - -Example usage: - -Prepare data: +First download and prepare the [WikiText-103 dataset](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/): ```bash cd examples/language_model/ bash prepare-wikitext-103.sh cd ../.. 
+``` -# Binarize the dataset: +Next preprocess/binarize the data: +```bash TEXT=examples/language_model/wikitext-103 - -fairseq-preprocess --only-source \ - --trainpref $TEXT/wiki.train.tokens --validpref $TEXT/wiki.valid.tokens --testpref $TEXT/wiki.test.tokens \ - --destdir data-bin/wikitext-103 +fairseq-preprocess \ + --only-source \ + --trainpref $TEXT/wiki.train.tokens \ + --validpref $TEXT/wiki.valid.tokens \ + --testpref $TEXT/wiki.test.tokens \ + --destdir data-bin/wikitext-103 \ + --workers 20 ``` -Train a transformer language model with adaptive inputs ([Baevski and Auli (2018): Adaptive Input Representations for Neural Language Modeling](transformer_lm/README.md)): +### 2) Train a language model + +Next we'll train a transformer language model using [adaptive inputs](transformer_lm/README.md): ```bash -# If it runs out of memory, try to reduce max-tokens and tokens-per-sample -mkdir -p checkpoints/transformer_wikitext-103 -fairseq-train --task language_modeling data-bin/wikitext-103 \ - --save-dir checkpoints/transformer_wikitext-103 --arch transformer_lm_wiki103 \ +fairseq-train --task language_modeling \ + data-bin/wikitext-103 \ + --save-dir checkpoints/transformer_wikitext-103 \ + --arch transformer_lm_wiki103 \ --max-update 286000 --max-lr 1.0 --t-mult 2 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 \ --warmup-updates 16000 --warmup-init-lr 1e-07 --min-lr 1e-09 --optimizer nag --lr 0.0001 --clip-norm 0.1 \ --criterion adaptive_loss --max-tokens 3072 --update-freq 3 --tokens-per-sample 3072 --seed 1 \ --sample-break-mode none --skip-invalid-size-inputs-valid-test --ddp-backend=no_c10d - -# Evaluate: -fairseq-eval-lm data-bin/wikitext-103 --path 'checkpoints/transformer_wiki103/checkpoint_best.pt' \ - --sample-break-mode complete --max-tokens 3072 --context-window 2560 --softmax-batch 1024 ``` -Train a convolutional language model ([Dauphin et al. (2017): Language Modeling with Gated Convolutional Networks](conv_lm/README.md)): -``` -# If it runs out of memory, try to reduce max-tokens and tokens-per-sample -mkdir -p checkpoints/fconv_wikitext-103 -fairseq-train --task language_modeling data-bin/wikitext-103 \ - --save-dir checkpoints/fconv_wikitext-103 \ - --max-epoch 35 --arch fconv_lm_dauphin_wikitext103 --optimizer nag \ - --lr 1.0 --lr-scheduler reduce_lr_on_plateau --lr-shrink 0.5 \ - --clip-norm 0.1 --dropout 0.2 --weight-decay 5e-06 --criterion adaptive_loss \ - --adaptive-softmax-cutoff 10000,20000,200000 --max-tokens 1024 --tokens-per-sample 1024 \ - --ddp-backend=no_c10d - -# Evaluate: -fairseq-eval-lm data-bin/wikitext-103 --path 'checkpoints/fconv_wiki103/checkpoint_best.pt' +If the above command runs out of memory, try reducing `--max-tokens` (max number +of tokens per batch) or `--tokens-per-sample` (max sequence length). You can +also increase `--update-freq` to accumulate gradients and simulate training on +more GPUs. + +### 3) Evaluate +```bash +fairseq-eval-lm data-bin/wikitext-103 \ + --path checkpoints/transformer_wiki103/checkpoint_best.pt \ + --sample-break-mode complete --max-tokens 3072 \ + --context-window 2560 --softmax-batch 1024 ``` + +## Convolutional language models + +Please see the [convolutional LM README](conv_lm/README.md) for instructions to +train convolutional language models. 
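
As a quick sanity check, the checkpoint trained above can also be loaded directly in Python. The snippet below is a minimal sketch that reuses `checkpoint_utils.load_model_ensemble_and_task` (the same helper the `validate.py` script added earlier in this series relies on); the checkpoint path and the `data` override are the placeholder locations from this tutorial, so adjust them to your own setup.

```python
# Minimal sketch: load the WikiText-103 checkpoint trained above for inspection.
# Paths are the tutorial placeholders; adjust them to your setup.
from fairseq import checkpoint_utils

models, model_args, task = checkpoint_utils.load_model_ensemble_and_task(
    ['checkpoints/transformer_wikitext-103/checkpoint_best.pt'],
    arg_overrides={'data': 'data-bin/wikitext-103'},
)
model = models[0]
model.eval()  # disable dropout for evaluation

print(model_args.arch)              # architecture recorded in the checkpoint
print(len(task.target_dictionary))  # vocabulary size built by fairseq-preprocess
```
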
diff --git a/examples/language_model/conv_lm/README.md b/examples/language_model/conv_lm/README.md index e52e18a7b1..83ac0b454b 100644 --- a/examples/language_model/conv_lm/README.md +++ b/examples/language_model/conv_lm/README.md @@ -2,8 +2,27 @@ ## Example usage -See the [language modeling README](../README.md) for instructions on reproducing results for WikiText-103 -using the `fconv_lm_dauphin_wikitext103` model architecture. +First download and preprocess the data following the main [language modeling +README](../README.md). + +Then to train a convolutional LM using the `fconv_lm_dauphin_wikitext103` +architecture: +```bash +fairseq-train --task language_modeling \ + data-bin/wikitext-103 \ + --save-dir checkpoints/fconv_wikitext-103 \ + --arch fconv_lm_dauphin_wikitext103 \ + --max-epoch 35 \ --optimizer nag \ + --lr 1.0 --lr-scheduler reduce_lr_on_plateau --lr-shrink 0.5 \ + --clip-norm 0.1 --dropout 0.2 --weight-decay 5e-06 --criterion adaptive_loss \ + --adaptive-softmax-cutoff 10000,20000,200000 --max-tokens 1024 --tokens-per-sample 1024 \ + --ddp-backend=no_c10d +``` + +And evaluate with: +```bash +fairseq-eval-lm data-bin/wikitext-103 --path checkpoints/fconv_wiki103/checkpoint_best.pt +``` ## Citation diff --git a/examples/language_model/transformer_lm/README.md b/examples/language_model/transformer_lm/README.md index 9cdf6c7a0b..3eb8c76d93 100644 --- a/examples/language_model/transformer_lm/README.md +++ b/examples/language_model/transformer_lm/README.md @@ -1,4 +1,4 @@ -# Adaptive Input Representations for Neural Language Modeling (Baevski and Auli; 2018) +# Adaptive Input Representations for Neural Language Modeling (Baevski and Auli, 2018) ## Pre-trained models diff --git a/examples/roberta/README.cqa.md b/examples/roberta/README.cqa.md index f6b6036f30..b008fce17d 100644 --- a/examples/roberta/README.cqa.md +++ b/examples/roberta/README.cqa.md @@ -8,7 +8,7 @@ representations through a fully-connected layer to predict the correct answer. We train with a standard cross-entropy loss. We also found it helpful to prepend a prefix of `Q:` to the question and `A:` to -the input. The complete input format is: +the answer. The complete input format is: ``` Q: Where would I not want a fox? A: hen house ``` @@ -18,7 +18,7 @@ Our final submission is based on a hyperparameter search over the learning rate 4000) and random seed. We selected the model with the best performance on the development set after 100 trials. -### 1) Download the data from Commonsense QA website (https://www.tau-nlp.org/commonsenseqa) +### 1) Download data from the Commonsense QA website (https://www.tau-nlp.org/commonsenseqa) ```bash bash examples/roberta/commonsense_qa/download_cqa_data.sh ``` diff --git a/examples/roberta/README.md b/examples/roberta/README.md index 1b9545d85e..f09a35b333 100644 --- a/examples/roberta/README.md +++ b/examples/roberta/README.md @@ -2,20 +2,24 @@ https://arxiv.org/abs/1907.11692 -## Introduction +### Introduction -**RoBERTa** iterates on BERT's pretraining procedure, including training the model longer, with bigger batches over more data; removing the next sentence prediction objective; training on longer sequences; and dynamically changing the masking pattern applied to the training data. See the associated paper for more details. 
+RoBERTa iterates on BERT's pretraining procedure, including training the model longer, with bigger batches over more data; removing the next sentence prediction objective; training on longer sequences; and dynamically changing the masking pattern applied to the training data. See the associated paper for more details. -## Pre-trained models +### What's New: + +- August 2019: Added [tutorial for pretraining RoBERTa using your own data](README.pretraining.md). + +### Pre-trained models Model | Description | # params | Download ---|---|---|--- `roberta.base` | RoBERTa using the BERT-base architecture | 125M | [roberta.base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz) `roberta.large` | RoBERTa using the BERT-large architecture | 355M | [roberta.large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz) `roberta.large.mnli` | `roberta.large` finetuned on [MNLI](http://www.nyu.edu/projects/bowman/multinli) | 355M | [roberta.large.mnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.mnli.tar.gz) -`roberta.large.wsc` | `roberta.large` finetuned on [WSC](https://cs.nyu.edu/faculty/davise/papers/WinogradSchemas/WS.html) | 355M | [roberta.large.wsc.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.wsc.tar.gz) +`roberta.large.wsc` | `roberta.large` finetuned on [WSC](README.wsc.md) | 355M | [roberta.large.wsc.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.wsc.tar.gz) -## Results +### Results ##### Results on GLUE tasks (dev set, single model, single-task finetuning) @@ -44,7 +48,7 @@ Model | Accuracy | Middle | High ---|---|---|--- `roberta.large` | 83.2 | 86.5 | 81.3 -## Example usage +### Example usage ##### Load RoBERTa from torch.hub (PyTorch >= 1.1): ```python @@ -53,7 +57,7 @@ roberta = torch.hub.load('pytorch/fairseq', 'roberta.large') roberta.eval() # disable dropout (or leave in train mode to finetune) ``` -##### Load RoBERTa (for PyTorch 1.0): +##### Load RoBERTa (for PyTorch 1.0 or custom models): ```python # Download roberta.large model wget https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz @@ -61,7 +65,7 @@ tar -xzvf roberta.large.tar.gz # Load the model in fairseq from fairseq.models.roberta import RobertaModel -roberta = RobertaModel.from_pretrained('/path/to/roberta.large') +roberta = RobertaModel.from_pretrained('/path/to/roberta.large', checkpoint_file='model.pt') roberta.eval() # disable dropout (or leave in train mode to finetune) ``` @@ -120,7 +124,7 @@ roberta.cuda() roberta.predict('new_task', tokens) # tensor([[-1.1050, -1.0672, -1.1245]], device='cuda:0', grad_fn=) ``` -## Advanced usage +### Advanced usage #### Filling masks: @@ -212,8 +216,7 @@ print('| Accuracy: ', float(ncorrect)/float(nsamples)) # Expected output: 0.9060 ``` - -## Finetuning +### Finetuning - [Finetuning on GLUE](README.glue.md) - [Finetuning on custom classification tasks (e.g., IMDB)](README.custom_classification.md) @@ -221,15 +224,11 @@ print('| Accuracy: ', float(ncorrect)/float(nsamples)) - [Finetuning on Commonsense QA (CQA)](README.cqa.md) - Finetuning on SQuAD: coming soon -## Pretraining using your own data - -You can use the [`masked_lm` task](/fairseq/tasks/masked_lm.py) to pretrain RoBERTa from scratch, or to continue pretraining RoBERTa starting from one of the released checkpoints. - -Data should be preprocessed following the [language modeling example](/examples/language_model). +### Pretraining using your own data -A more detailed tutorial is coming soon. 
+See the [tutorial for pretraining RoBERTa using your own data](README.pretraining.md). -## Citation +### Citation ```bibtex @article{liu2019roberta, diff --git a/examples/roberta/README.pretraining.md b/examples/roberta/README.pretraining.md new file mode 100644 index 0000000000..843d7ce377 --- /dev/null +++ b/examples/roberta/README.pretraining.md @@ -0,0 +1,95 @@ +# Pretraining RoBERTa using your own data + +This tutorial will walk you through pretraining RoBERTa over your own data. + +### 1) Preprocess the data. + +Data should be preprocessed following the [language modeling format](/examples/language_model). + +We'll use the [WikiText-103 dataset](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/) +to demonstrate how to preprocess raw text data with the GPT-2 BPE. Of course +this dataset is quite small, so the resulting pretrained model will perform +poorly, but it gives the general idea. + +First download the dataset: +```bash +wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip +unzip wikitext-103-raw-v1.zip +``` + +Next encode it with the GPT-2 BPE: +```bash +mkdir -p gpt2_bpe +wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json +wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe +for SPLIT in train valid test; do \ + python -m examples.roberta.multiprocessing_bpe_encoder \ + --encoder-json gpt2_bpe/encoder.json \ + --vocab-bpe gpt2_bpe/vocab.bpe \ + --inputs wikitext-103-raw/wiki.${SPLIT}.raw \ + --outputs wikitext-103-raw/wiki.${SPLIT}.bpe \ + --keep-empty \ + --workers 60; \ +done +``` + +Finally preprocess/binarize the data using the GPT-2 fairseq dictionary: +```bash +wget -O gpt2_bpe/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt +fairseq-preprocess \ + --only-source \ + --srcdict gpt2_bpe/dict.txt \ + --trainpref wikitext-103-raw/wiki.train.bpe \ + --validpref wikitext-103-raw/wiki.valid.bpe \ + --testpref wikitext-103-raw/wiki.test.bpe \ + --destdir data-bin/wikitext-103 \ + --workers 60 +``` + +### 2) Train RoBERTa base +```bash +TOTAL_UPDATES=125000 # Total number of training steps +WARMUP_UPDATES=10000 # Warmup the learning rate over this many updates +PEAK_LR=0.0005 # Peak learning rate, adjust as needed +TOKENS_PER_SAMPLE=512 # Max sequence length +MAX_POSITIONS=512 # Num. positional embeddings (usually same as above) +MAX_SENTENCES=16 # Number of sequences per batch (batch size) +UPDATE_FREQ=16 # Increase the batch size 16x + +DATA_DIR=data-bin/wikitext-103 + +fairseq-train --fp16 $DATA_DIR \ + --task masked_lm --criterion masked_lm \ + --arch roberta_base --sample-break-mode complete --tokens-per-sample $TOKENS_PER_SAMPLE \ + --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-6 --clip-norm 0.0 \ + --lr-scheduler polynomial_decay --lr $PEAK_LR --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_UPDATES \ + --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \ + --max-sentences $MAX_SENTENCES --update-freq $UPDATE_FREQ \ + --max-update $TOTAL_UPDATES --log-format simple --log-interval 1 +``` + +The above command assumes training on 8x32GB V100 GPUs. Each GPU uses a batch +size of 16 sequences (`$MAX_SENTENCES`) and accumulates gradients to further +increase the batch size by 16x (`$UPDATE_FREQ`), for a total batch size of 2048 +sequences. If you have fewer GPUs or GPUs with less memory you may need to +reduce `$MAX_SENTENCES` and increase `$UPDATE_FREQ` to compensate. 
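
For reference, the effective batch size works out as follows; this small sketch only spells out the arithmetic under the 8x V100 assumption stated above.

```python
# Effective batch size for the pretraining command above (illustrative only).
num_gpus = 8           # assumed 8x 32GB V100, as stated above
max_sentences = 16     # $MAX_SENTENCES: sequences per GPU per forward/backward
update_freq = 16       # $UPDATE_FREQ: gradient accumulation steps
effective_batch = num_gpus * max_sentences * update_freq
print(effective_batch)  # 2048 sequences per optimizer update
```
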
Alternatively +if you have more GPUs you can decrease `$UPDATE_FREQ` accordingly to increase +training speed. + +Also note that the learning rate and batch size are tightly connected and need +to be adjusted together. We generally recommend increasing the learning rate as +you increase the batch size according to the following table (although it's also +dataset dependent, so don't rely on the following values too closely): + +batch size | peak learning rate +---|--- +256 | 0.0001 +2048 | 0.0005 +8192 | 0.0007 + +### 3) Load your pretrained model +```python +from fairseq.models.roberta import RobertaModel +roberta = RobertaModel.from_pretrained('checkpoints', 'checkpoint_best.pt', 'path/to/data') +assert isinstance(roberta.model, torch.nn.Module) +``` diff --git a/examples/roberta/README.wsc.md b/examples/roberta/README.wsc.md index d279b89037..1df64299f0 100644 --- a/examples/roberta/README.wsc.md +++ b/examples/roberta/README.wsc.md @@ -4,21 +4,23 @@ The following instructions can be used to finetune RoBERTa on the WSC training data provided by [SuperGLUE](https://super.gluebenchmark.com/). Note that there is high variance in the results. For our GLUE/SuperGLUE -submission we swept over the learning rate, batch size and total number of -updates, as well as the random seed. Out of ~100 runs we chose the best 7 models -and ensembled them. +submission we swept over the learning rate (1e-5, 2e-5, 3e-5), batch size (16, +32, 64) and total number of updates (500, 1000, 2000, 3000), as well as the +random seed. Out of ~100 runs we chose the best 7 models and ensembled them. -**Note:** The instructions below use a slightly different loss function than +**Approach:** The instructions below use a slightly different loss function than what's described in the original RoBERTa arXiv paper. In particular, [Kocijan et al. (2019)](https://arxiv.org/abs/1905.06290) introduce a margin ranking loss between `(query, candidate)` pairs with tunable hyperparameters alpha and beta. This is supported in our code as well with the `--wsc-alpha` and `--wsc-beta` arguments. However, we achieved slightly better (and more robust) results on the development set by instead using a single cross entropy loss term -over the log-probabilities for the query and all candidates. This reduces the -number of hyperparameters and our best model achieved 92.3% development set -accuracy, compared to ~90% accuracy for the margin loss. Later versions of the -RoBERTa arXiv paper will describe this updated formulation. +over the log-probabilities for the query and all mined candidates. **The +candidates are mined using spaCy from each input sentence in isolation, so the +approach remains strictly pointwise.** This reduces the number of +hyperparameters and our best model achieved 92.3% development set accuracy, +compared to ~90% accuracy for the margin loss. Later versions of the RoBERTa +arXiv paper will describe this updated formulation. ### 1) Download the WSC data from the SuperGLUE website: ```bash From f840564da943f3f95bf80e93fd49be6f93d98348 Mon Sep 17 00:00:00 2001 From: Nathan Ng Date: Wed, 14 Aug 2019 10:45:52 -0700 Subject: [PATCH 094/213] initial light and dynamic convolution kernels (#547) Summary: CUDA code for light/dynamicconv kernels, including pytorch modules. Modules can be built by running setup.py in each respective folder, and can then be imported and used like any other module. 
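
As a rough illustration of that interface, the sketch below constructs the `DynamicConv` wrapper this patch adds to `fairseq.modules`. The sizes are arbitrary; if the compiled CUDA extension is installed, the module and input need to live on the GPU because the fused layer has no CPU kernel, otherwise the call falls back to the pure-PyTorch `DynamicConv1dTBC`.

```python
# Rough usage sketch of the new module interface (sizes are arbitrary).
import torch
from fairseq.modules import DynamicConv

conv = DynamicConv(
    input_size=512,     # channel dimension C
    kernel_size=31,
    padding_l=15,       # 'same'-style padding, as in the encoder layers
    num_heads=8,
    weight_softmax=True,
    weight_dropout=0.1,
)
x = torch.randn(100, 2, 512)  # (time, batch, channels) layout used by fairseq
# move conv and x to .cuda() if the compiled dynamicconv kernels are installed
y = conv(x)                   # output keeps the input shape
assert y.shape == x.shape
```
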
Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/547 Reviewed By: myleott, shubho Differential Revision: D15703660 Pulled By: nng555 fbshipit-source-id: e9c913753be3a1cd571965f7200df6678b644520 --- .gitignore | 2 + examples/pay_less_attention_paper/README.md | 16 +- fairseq/models/lightconv.py | 38 +- fairseq/modules/__init__.py | 10 +- fairseq/modules/cuda_utils.cu | 202 ++++++++++ fairseq/modules/dynamic_convolution.py | 21 +- fairseq/modules/dynamicconv_layer/__init__.py | 8 + .../dynamicconv_layer/cuda_function_gen.py | 223 +++++++++++ .../dynamicconv_layer/dynamicconv_cuda.cpp | 49 +++ .../dynamicconv_layer/dynamicconv_cuda.cuh | 49 +++ .../dynamicconv_cuda_kernel.cu | 167 ++++++++ .../dynamicconv_layer/dynamicconv_layer.py | 205 ++++++++++ .../dynamicconv_layer/dynamiconv_cpu.cpp | 35 ++ fairseq/modules/dynamicconv_layer/setup.py | 17 + fairseq/modules/lightconv_layer/__init__.py | 8 + .../lightconv_layer/cuda_function_gen.py | 289 ++++++++++++++ .../lightconv_layer/lightconv_cuda.cpp | 47 +++ .../lightconv_layer/lightconv_cuda.cuh | 82 ++++ .../lightconv_layer/lightconv_cuda_kernel.cu | 374 ++++++++++++++++++ .../lightconv_layer/lightconv_layer.py | 113 ++++++ fairseq/modules/lightconv_layer/setup.py | 14 + fairseq/modules/lightweight_convolution.py | 15 + fairseq/modules/unfold.py | 1 - 23 files changed, 1958 insertions(+), 27 deletions(-) create mode 100644 fairseq/modules/cuda_utils.cu create mode 100644 fairseq/modules/dynamicconv_layer/__init__.py create mode 100644 fairseq/modules/dynamicconv_layer/cuda_function_gen.py create mode 100644 fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp create mode 100644 fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh create mode 100644 fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu create mode 100644 fairseq/modules/dynamicconv_layer/dynamicconv_layer.py create mode 100644 fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp create mode 100644 fairseq/modules/dynamicconv_layer/setup.py create mode 100644 fairseq/modules/lightconv_layer/__init__.py create mode 100644 fairseq/modules/lightconv_layer/cuda_function_gen.py create mode 100644 fairseq/modules/lightconv_layer/lightconv_cuda.cpp create mode 100644 fairseq/modules/lightconv_layer/lightconv_cuda.cuh create mode 100644 fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu create mode 100644 fairseq/modules/lightconv_layer/lightconv_layer.py create mode 100644 fairseq/modules/lightconv_layer/setup.py diff --git a/.gitignore b/.gitignore index ec0572d4db..7e4a2d4128 100644 --- a/.gitignore +++ b/.gitignore @@ -111,6 +111,8 @@ ENV/ # Generated files fairseq/temporal_convolution_tbc +fairseq/modules/*_layer/*_forward.cu +fairseq/modules/*_layer/*_backward.cu # data data-bin/ diff --git a/examples/pay_less_attention_paper/README.md b/examples/pay_less_attention_paper/README.md index bd66d705b7..97ab847fc0 100644 --- a/examples/pay_less_attention_paper/README.md +++ b/examples/pay_less_attention_paper/README.md @@ -1,5 +1,5 @@ # Pay Less Attention with Lightweight and Dynamic Convolutions (Wu et al., 2019) -This page contains pointers to pre-trained models as well as instructions on how to train new models for [our paper](https://openreview.net/pdf?id=SkVhlh09tX) +This page contains pointers to pre-trained models as well as instructions on how to train new models for [our paper](https://arxiv.org/abs/1901.10430) ## Citation: ```bibtex @@ -8,7 +8,7 @@ This page contains pointers to pre-trained models as well as instructions on how author = 
{Felix Wu and Angela Fan and Alexei Baevski and Yann Dauphin and Michael Auli}, booktitle = {International Conference on Learning Representations}, year = {2019}, - url = {https://openreview.net/forum?id=SkVhlh09tX}, + url = {https://arxiv.org/abs/1901.10430}, } ``` @@ -39,6 +39,18 @@ To use the model without GLU, please set `--encoder-glu 0 --decoder-glu 0`. For LightConv, please use `--encoder-conv-type lightweight --decoder-conv-type lightweight`, otherwise the default is DynamicConv. For best BLEU results, lenpen may need to be manually tuned. +To use the CUDA kernels, first install the PyTorch modules using the commands below +```sh +# to install lightconv +python fairseq/modules/lightconv_layer/cuda_function_gen.py +python fairseq/modules/lightconv_layer/setup.py install + +# to install dynamicconv +python fairseq/modules/dynamicconv_layer/cuda_function_gen.py +python fairseq/modules/dynamicconv_layer/setup.py install +``` +Once the CUDA modules are installed, they will automatically be used instead of the PyTorch modules. + ### IWSLT14 De-En Training and evaluating DynamicConv (without GLU) on a GPU: ```sh diff --git a/fairseq/models/lightconv.py b/fairseq/models/lightconv.py index 20ff4f0e6a..44d52dcd81 100644 --- a/fairseq/models/lightconv.py +++ b/fairseq/models/lightconv.py @@ -4,6 +4,7 @@ # LICENSE file in the root directory of this source tree. import math +import sys import torch import torch.nn as nn @@ -19,10 +20,10 @@ ) from fairseq.modules import ( AdaptiveSoftmax, - DynamicConv1dTBC, + DynamicConv, LayerNorm, PositionalEmbedding, - LightweightConv1dTBC, + LightweightConv, MultiheadAttention, ) @@ -173,7 +174,6 @@ def build_embedding(dictionary, embed_dim, path=None): decoder = LightConvDecoder(args, tgt_dict, decoder_embed_tokens) return LightConvModel(encoder, decoder) - class LightConvEncoder(FairseqEncoder): """ LightConv encoder consisting of *args.encoder_layers* layers. 
Each layer @@ -447,15 +447,15 @@ def __init__(self, args, kernel_size=0): self.linear1 = Linear(self.embed_dim, self.conv_dim) self.act = None if args.encoder_conv_type == 'lightweight': - self.conv = LightweightConv1dTBC(self.conv_dim, kernel_size, padding_l=padding_l, - weight_softmax=args.weight_softmax, - num_heads=args.encoder_attention_heads, - weight_dropout=args.weight_dropout) + self.conv = LightweightConv(self.conv_dim, kernel_size, padding_l=padding_l, + weight_softmax=args.weight_softmax, + num_heads=args.encoder_attention_heads, + weight_dropout=args.weight_dropout) elif args.encoder_conv_type == 'dynamic': - self.conv = DynamicConv1dTBC(self.conv_dim, kernel_size, padding_l=padding_l, - weight_softmax=args.weight_softmax, - num_heads=args.encoder_attention_heads, - weight_dropout=args.weight_dropout) + self.conv = DynamicConv(self.conv_dim, kernel_size, padding_l=padding_l, + weight_softmax=args.weight_softmax, + num_heads=args.encoder_attention_heads, + weight_dropout=args.weight_dropout) else: raise NotImplementedError self.linear2 = Linear(self.conv_dim, self.embed_dim) @@ -535,15 +535,15 @@ def __init__(self, args, no_encoder_attn=False, kernel_size=0): self.linear1 = Linear(self.embed_dim, self.conv_dim) self.act = None if args.decoder_conv_type == 'lightweight': - self.conv = LightweightConv1dTBC(self.conv_dim, kernel_size, padding_l=kernel_size-1, - weight_softmax=args.weight_softmax, - num_heads=args.decoder_attention_heads, - weight_dropout=args.weight_dropout) + self.conv = LightweightConv(self.conv_dim, kernel_size, padding_l=kernel_size-1, + weight_softmax=args.weight_softmax, + num_heads=args.decoder_attention_heads, + weight_dropout=args.weight_dropout) elif args.decoder_conv_type == 'dynamic': - self.conv = DynamicConv1dTBC(self.conv_dim, kernel_size, padding_l=kernel_size-1, - weight_softmax=args.weight_softmax, - num_heads=args.decoder_attention_heads, - weight_dropout=args.weight_dropout) + self.conv = DynamicConv(self.conv_dim, kernel_size, padding_l=kernel_size-1, + weight_softmax=args.weight_softmax, + num_heads=args.decoder_attention_heads, + weight_dropout=args.weight_dropout) else: raise NotImplementedError self.linear2 = Linear(self.conv_dim, self.embed_dim) diff --git a/fairseq/modules/__init__.py b/fairseq/modules/__init__.py index 6458f7d02f..ecfdc3d697 100644 --- a/fairseq/modules/__init__.py +++ b/fairseq/modules/__init__.py @@ -9,13 +9,15 @@ from .character_token_embedder import CharacterTokenEmbedder from .conv_tbc import ConvTBC from .downsampled_multihead_attention import DownsampledMultiHeadAttention -from .dynamic_convolution import DynamicConv1dTBC +from .dynamic_convolution import DynamicConv, DynamicConv1dTBC +#from .dynamicconv_layer import DynamicconvLayer from .gelu import gelu, gelu_accurate from .grad_multiply import GradMultiply from .highway import Highway from .layer_norm import LayerNorm from .learned_positional_embedding import LearnedPositionalEmbedding -from .lightweight_convolution import LightweightConv1dTBC +from .lightweight_convolution import LightweightConv, LightweightConv1dTBC +#from .lightconv_layer import LightconvLayer from .linearized_convolution import LinearizedConvolution from .logsumexp_moe import LogSumExpMoE from .mean_pool_gating_network import MeanPoolGatingNetwork @@ -36,14 +38,18 @@ 'CharacterTokenEmbedder', 'ConvTBC', 'DownsampledMultiHeadAttention', +# 'DyamicconvLayer', 'DynamicConv1dTBC', + 'DynamicConv', 'gelu', 'gelu_accurate', 'GradMultiply', 'Highway', 'LayerNorm', 'LearnedPositionalEmbedding', 
+# 'LightconvLayer', 'LightweightConv1dTBC', + 'LightweightConv', 'LinearizedConvolution', 'LogSumExpMoE', 'MeanPoolGatingNetwork', diff --git a/fairseq/modules/cuda_utils.cu b/fairseq/modules/cuda_utils.cu new file mode 100644 index 0000000000..596ff125f9 --- /dev/null +++ b/fairseq/modules/cuda_utils.cu @@ -0,0 +1,202 @@ +/** + * Copyright (c) 2018-present, Facebook, Inc. + * All rights reserved. + * + */ + + +template +constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) { + return (a + b - 1) / b; +} + + +template +__inline__ __device__ +void zeroSharedMem(scalar_t* data) { + /* + Given an array of length FS + SB, zero out the first padding_l and last + (FS - padding_l) values in the array + */ + + int tid = threadIdx.x; + + if (FS < SB) { + + // zero all if we have enough threads in a block to do all of them + if (tid < padding_l || tid > SB - FS + padding_l - 1) { + data[tid] = scalar_t(0.0); + } + } else { + + // otherwise zero out one block at a time + const int numIterations = divUp(FS, SB); + for (int i = 0; i < numIterations; i++) { + int offset = i * SB; + if (tid + offset < padding_l) { + data[tid + offset] = scalar_t(0.0); + } else if (tid + offset < FS) { + data[SB + tid + offset] = scalar_t(0.0); + } + } + } +} + +template +__inline__ __device__ +scalar_t warpReduce(scalar_t data) { + /* + Reduce an array within each warp. After processing all values in warp will + caontain the sum of all original values in that warp. + + data - pointer to data to reduce + */ + data += __shfl_xor_sync(SHFL_MASK, data, 16); + data += __shfl_xor_sync(SHFL_MASK, data, 8); + data += __shfl_xor_sync(SHFL_MASK, data, 4); + data += __shfl_xor_sync(SHFL_MASK, data, 2); + data += __shfl_xor_sync(SHFL_MASK, data, 1); + return data; +} + +template +__inline__ __device__ +scalar_t blockReduce(scalar_t data) { + /* + Reduce an entire array on the block level. After processing, the + first value in the array will contain the reduced sum. + + data - pointer to data to reduce + */ + + static __shared__ scalar_t warpSum[32]; + const int tid = threadIdx.x; + int wid = tid / 32; + int lane = tid % 32; + + __syncthreads(); + + // reduce each warp then write to shared memory + scalar_t sum = warpReduce(data); + if (lane == 0) { + warpSum[wid] = sum; + } + + __syncthreads(); + + scalar_t v; + // perform final sum of partial warp sums + if (tid < blockDim.x / 32) { + v = warpSum[lane]; + } else { + v = scalar_t(0.0); + } + + if (wid == 0) { + v = warpReduce(v); + } + __syncthreads(); + + return v; +} + +void checkCudaStatus(cudaError_t status, int lineNumber = -1) { + + if (status != cudaSuccess) { + std::cout << cudaGetErrorString(status) + << " at line " << lineNumber << std::endl; + std::cout << "Exiting" << std::endl; + exit(1); + } +} + +template +__device__ +void load_input_to_shared(const scalar_t* input, // global memory + int inputOffset, int sequenceLength, + int iteration, int numIterations, + bool no_prev, scalar_t* output /* shared memory */) { + /* + Load a block size of input into shared memory with + right and left overhang of total size FS. 
If previously + loaded memory, overlap will be shifted over to reduce + global memory access + + input - pointer to start of channel sequence + inputOffset - how far in the sequence to start loading + sequenceLength - total length of sequence + iteration - which block of sequence we are loading + numIterations - total number of blocks to load + no_prev - whether to load the whole block if the previous block + wasn't loaded + output - shared memory to write input to + */ + + const int tid = threadIdx.x; + + // Load the left "overhang" of input + if (iteration > 0) { + if (padding_l < SB) { + + // load all at once + if (tid < padding_l) { + output[tid] = (no_prev) ? input[inputOffset - padding_l + tid] : output[tid + SB]; + } + } else { + + // load in chunks of size SB + int numIterations = divUp(padding_l, SB); + for (int i = 0; i < numIterations; i++) { + int offset = i * SB; + if ((tid + offset) < padding_l) { + output[tid + offset] = (no_prev) ? input[inputOffset - padding_l + tid + offset] : output[tid + offset + SB]; + } + } + } + } + + // Load the right "overhang" of input + if (iteration < (numIterations - 1)) { + const int elementsLeft = sequenceLength - (iteration+1) * SB; + + if ((FS - padding_l) < SB) { + + // load all at once + if (tid < (FS - padding_l)) { + output[padding_l + SB + tid] = (tid < elementsLeft) ? input[inputOffset + SB + tid] : scalar_t(0.0); + } + } else { + + // load in chunks of size SB + int numIterations = divUp(FS - padding_l, SB); + for (int i = 0; i < numIterations; i++) { + int offset = i * SB; + if ((tid + offset) < (FS - padding_l)) { + output[padding_l + SB + tid + offset] = ((tid + offset) < elementsLeft) ? input[inputOffset + SB + tid + offset] : scalar_t(0.0); + } + } + } + } + + // We should also clear out the right "overhang" + if (iteration == (numIterations - 1)) { + if ((FS - padding_l) < SB) { + + // clear out all at once + if (tid < (FS - padding_l)) { + output[padding_l + SB + tid] = scalar_t(0.0); + } + } else { + + // clear in chunks of size SB + int numIterations = divUp(FS - padding_l, SB); + for (int i = 0; i < numIterations; i++) { + int offset = i * SB; + if ((tid + offset) < (FS - padding_l)) { + output[padding_l + SB + tid + offset] = scalar_t(0.0); + } + } + } + } + output[tid + padding_l] = ((inputOffset + tid) < sequenceLength) ? 
input[inputOffset + tid] : scalar_t(0.0); +} diff --git a/fairseq/modules/dynamic_convolution.py b/fairseq/modules/dynamic_convolution.py index a8fa47225d..7fbd3f37e1 100644 --- a/fairseq/modules/dynamic_convolution.py +++ b/fairseq/modules/dynamic_convolution.py @@ -10,6 +10,23 @@ from fairseq import utils from .unfold import unfold1d +def DynamicConv(input_size, kernel_size=1, padding_l=None, num_heads=1, + weight_dropout=0., weight_softmax=False, + renorm_padding=False, bias=False, conv_bias=False, + query_size=None, in_proj=False): + if torch.cuda.is_available(): + try: + from fairseq.modules.dynamicconv_layer import DynamicconvLayer + return DynamicconvLayer(input_size, kernel_size=kernel_size, + padding_l=padding_l, num_heads=num_heads, + weight_dropout=weight_dropout, + weight_softmax=weight_softmax, bias=bias) + except ImportError as e: + print(e) + return DynamicConv1dTBC(input_size, kernel_size=kernel_size, + padding_l=padding_l, num_heads=num_heads, + weight_dropout=weight_dropout, + weight_softmax=weight_softmax, bias=bias) def Linear(in_features, out_features, bias=True): m = nn.Linear(in_features, out_features, bias) @@ -90,7 +107,6 @@ def forward(self, x, incremental_state=None, query=None, unfold=None): if query is None: query = x - if unfold: output = self._forward_unfolded(x, incremental_state, query) else: @@ -193,8 +209,7 @@ def _forward_expanded(self, x, incremental_stat, query): # turn the convolution filters into band matrices weight_expanded = weight.new_zeros(B*H, T, T+K-1, requires_grad=False) weight_expanded.as_strided((B*H, T, K), (T*(T+K-1), T+K, 1)).copy_(weight) - weight_expanded = weight_expanded.narrow(2, P, T) # B*H x T x T - + weight_expanded = weight_expanded.narrow(2, P, T) # B*H x T x T output = torch.bmm(weight_expanded, x) output = output.transpose(0, 1).contiguous().view(T, B, C) return output diff --git a/fairseq/modules/dynamicconv_layer/__init__.py b/fairseq/modules/dynamicconv_layer/__init__.py new file mode 100644 index 0000000000..c62ffac86c --- /dev/null +++ b/fairseq/modules/dynamicconv_layer/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +from .dynamicconv_layer import DynamicconvLayer diff --git a/fairseq/modules/dynamicconv_layer/cuda_function_gen.py b/fairseq/modules/dynamicconv_layer/cuda_function_gen.py new file mode 100644 index 0000000000..caf151e4a1 --- /dev/null +++ b/fairseq/modules/dynamicconv_layer/cuda_function_gen.py @@ -0,0 +1,223 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + + +def gen_forward(): + + kernels = [3, 5, 7, 15, 31, 63, 127, 255] + blocks = [32, 64, 128, 256] + + head = """ +/** + * Copyright (c) 2018-present, Facebook, Inc. + * All rights reserved. 
+ * + */ + +#include "dynamicconv_cuda.cuh" + +std::vector dynamicconv_cuda_forward(at::Tensor input, at::Tensor weight, int padding_l) { + + at::DeviceGuard g(input.device()); + const auto minibatch = input.size(0); + const auto numFeatures = input.size(1); + const auto sequenceLength = input.size(2); + + const auto numHeads = weight.size(1); + const auto filterSize = weight.size(2); + + const auto numFiltersInBlock = numFeatures / numHeads; + const dim3 blocks(minibatch, numFeatures); + + auto output = at::zeros_like(input); + auto stream = at::cuda::getCurrentCUDAStream(); +""" + + switch = """ + switch(filterSize) { +""" + + case_k = """ + case {k}: +""" + + main_block = """ + if (padding_l == {pad}) {{ + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "dynamicconv_forward", ([&] {{ + dynamicconv_forward_kernel<{k}, {b_size}, {pad}, scalar_t> + <<>>( + input.data(), + weight.data(), + minibatch, + sequenceLength, + numFeatures, + numFiltersInBlock, + numHeads, + output.data()); + }})); + }} else +""" + + bad_padding = """ + { + std::cout << "WARNING: Unsupported padding size - skipping forward pass" << std::endl; + } + break;\n +""" + + end = """ + default: + std::cout << "WARNING: Unsupported filter length passed - skipping forward pass" << std::endl; + } + + return {output}; +} +""" + + with open("dynamicconv_cuda_forward.cu", 'w') as forward: + forward.write(head) + forward.write(switch) + for k in kernels: + b_size = 32 + for b in blocks: + if b > k: + b_size = b + break + forward.write(case_k.format(k=k)) + for pad in [k // 2, k - 1]: + forward.write(main_block.format(k=k, b_size=b_size, pad=pad)) + forward.write(bad_padding) + forward.write(end) + + +def gen_backward(): + + kernels = [3, 5, 7, 15, 31, 63, 127, 255] + thresh = [512, 512, 512, 512, 512, 380, 256, 256] + min_block = [64, 64, 64, 64, 64, 64, 128, 256] + seqs = [32 * x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]] + + head = """ +/** + * Copyright (c) 2018-present, Facebook, Inc. + * All rights reserved. 
+ * + */ + +#include "dynamicconv_cuda.cuh" + +std::vector dynamicconv_cuda_backward(at::Tensor gradOutput, int padding_l, at::Tensor input, at::Tensor weight) { + + at::DeviceGuard g(input.device()); + const auto minibatch = input.size(0); + const auto numFeatures = input.size(1); + const auto sequenceLength = input.size(2); + + const auto numHeads = weight.size(1); + const auto filterSize = weight.size(2); + + const auto numFiltersInBlock = numFeatures / numHeads; + auto numChunks = 1; + + auto gradInput = at::zeros_like(input); + auto gradWeight = at::zeros_like(weight); + auto stream = at::cuda::getCurrentCUDAStream(); + + dim3 blocks(minibatch, numHeads, numChunks); +""" + + sequence_if = """ + if (sequenceLength < {seq}) {{ + switch(filterSize) {{ +""" + + case_k = """ + case {k}: +""" + + chunks_reset = """ + numChunks = int(ceilf(sequenceLength/float({b_size}))); + blocks = dim3(minibatch, numHeads, numChunks); +""" + + main_block = """ + if (padding_l == {p}) {{ + AT_DISPATCH_FLOATING_TYPES_AND_HALF(gradOutput.scalar_type(), "dynamicconv_backward", ([&] {{ + dynamicconv_backward_kernel<{k}, {b_size}, {p}, scalar_t> + <<>>( + gradOutput.data(), + input.data(), + weight.data(), + minibatch, + sequenceLength, + numFeatures, + numFiltersInBlock, + numHeads, + gradWeight.data(), + gradInput.data()); + }})); + }} else +""" + + bad_padding = """ + { + std::cout << "WARNING: Unsupported padding size - skipping backward pass" << std::endl; + } + break;\n +""" + + bad_filter = """ + default: + std::cout << "WARNING: Unsupported filter length passed - skipping backward pass" << std::endl; + } +""" + + con_else = """ + } else +""" + + final_else = """ + { + switch(filterSize) { +""" + + last_return = """ + } + return {gradInput, gradWeight}; +} +""" + + with open("dynamicconv_cuda_backward.cu", 'w') as backward: + backward.write(head) + for seq in seqs: + backward.write(sequence_if.format(seq=seq)) + for k, t, m in zip(kernels, thresh, min_block): + backward.write(case_k.format(k=k)) + if seq <= t: + b_size = seq + else: + b_size = m + backward.write(chunks_reset.format(b_size=b_size)) + for p in [k // 2, k - 1]: + backward.write(main_block.format(k=k, b_size=b_size, p=p)) + backward.write(bad_padding) + backward.write(bad_filter) + backward.write(con_else) + backward.write(final_else) + for k, m in zip(kernels, min_block): + backward.write(case_k.format(k=k)) + backward.write(chunks_reset.format(b_size=m)) + for p in [k // 2, k - 1]: + backward.write(main_block.format(k=k, b_size=m, p=p)) + backward.write(bad_padding) + backward.write(bad_filter) + backward.write(last_return) + + +if __name__ == "__main__": + gen_forward() + gen_backward() diff --git a/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp new file mode 100644 index 0000000000..b76c9e7fe2 --- /dev/null +++ b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp @@ -0,0 +1,49 @@ +#include +#include + +std::vector dynamicconv_cuda_forward( + at::Tensor input, + at::Tensor filters, + int padding_l); + +std::vector dynamicconv_cuda_backward( + at::Tensor gradOutput, + int padding_l, + at::Tensor input, + at::Tensor filters); + + +#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +std::vector dynamicconv_forward( + at::Tensor input, + at::Tensor filters, + int padding_l) { + + CHECK_INPUT(input); + 
CHECK_INPUT(filters); + + return dynamicconv_cuda_forward(input, filters, + padding_l); +} + +std::vector dynamicconv_backward( + at::Tensor gradOutput, + int padding_l, + at::Tensor input, + at::Tensor filters) { + + CHECK_INPUT(gradOutput); + CHECK_INPUT(input); + CHECK_INPUT(filters); + + return dynamicconv_cuda_backward(gradOutput, padding_l, + input, filters); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("forward", &dynamicconv_forward, "dynamicconv forward (CUDA)"); + m.def("backward", &dynamicconv_backward, "dynamicconv backward (CUDA)"); +} diff --git a/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh new file mode 100644 index 0000000000..5d6ed575f3 --- /dev/null +++ b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh @@ -0,0 +1,49 @@ +/** + * Copyright (c) 2018-present, Facebook, Inc. + * All rights reserved. + * + */ +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define SHFL_MASK 0xffffffff + +template +__global__ +void dynamicconv_forward_kernel(const scalar_t* input, + const scalar_t* weight, + int minibatch, + int sequenceLength, + int numFeatures, + int numFiltersInBlock, + int numHeads, + scalar_t* output); + +template +__global__ +void dynamicconv_backward_kernel( + const scalar_t* gradOutput, // B * C * T + const scalar_t* input, // B * C * T + const scalar_t* weight, + int minibatch, + int sequenceLength, + int numFeatures, + int numFiltersInBlock, + int numHeads, + scalar_t* gradWeight, + scalar_t* gradInput); // B * H * k * T diff --git a/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu new file mode 100644 index 0000000000..f29e6ded06 --- /dev/null +++ b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu @@ -0,0 +1,167 @@ +/** + * Copyright (c) 2018-present, Facebook, Inc. + * All rights reserved. 
+ * + */ + +#include "dynamicconv_cuda.cuh" +#include "dynamicconv_cuda_forward.cu" +#include "dynamicconv_cuda_backward.cu" +#include "../cuda_utils.cu" + +// FS is filter size and kernels are specialized for filter sizes +template +__global__ +void dynamicconv_forward_kernel(const scalar_t* input, + const scalar_t* weight, + int minibatch, + int sequenceLength, + int numFeatures, + int numFiltersInBlock, + int numHeads, + scalar_t* output) { + assert(blockDim.x == SB); + + const int tid = threadIdx.x; + const int batchIdx = blockIdx.x; + const int featureIdx = blockIdx.y; + const int head = featureIdx / numFiltersInBlock; + + const int IOOffset = batchIdx * numFeatures * sequenceLength + + featureIdx * sequenceLength; + const scalar_t* inputFeature = &input[IOOffset]; + scalar_t* outputFeature = &output[IOOffset]; + + scalar_t filter[FS]; + + __shared__ scalar_t tempInput[SB + FS]; + zeroSharedMem(tempInput); + + const int numIterations = divUp(sequenceLength, SB); + + for (int i = 0; i < numIterations; ++i) { + __syncthreads(); + const int inputOffset = i * SB; + load_input_to_shared(inputFeature, inputOffset, + sequenceLength, i, + numIterations, false, tempInput); + __syncthreads(); + if (inputOffset + tid < sequenceLength) { + + #pragma unroll + for (int k = 0; k < FS; ++k) { + const int filterOffset = batchIdx * numHeads * FS * sequenceLength + + head * FS * sequenceLength + + k * sequenceLength + + i * SB + tid; + filter[k] = weight[filterOffset]; + } + + scalar_t out = scalar_t(0.0); + #pragma unroll + for (int k = 0; k < FS; ++k) { + out += filter[k] * tempInput[tid + k]; + } + + outputFeature[inputOffset + tid] = out; + + } + } +} + +template +__global__ +void dynamicconv_backward_kernel( + const scalar_t* gradOutput, // B * C * T + const scalar_t* input, // B * C * T + const scalar_t* weight, + int minibatch, + int sequenceLength, + int numFeatures, + int numFiltersInBlock, + int numHeads, + scalar_t* gradWeight, + scalar_t* gradInput) { // B * H * k * T + + assert(blockDim.x == SB); + + // each block operates on a single batch and filter head + const int tid = threadIdx.x; + const int batchIdx = blockIdx.x; + const int headIdx = blockIdx.y; + const int chunkIdx = blockIdx.z; + + const int numChunks = divUp(sequenceLength, SB); + const int inputOffset = chunkIdx * SB; + + // initialize shared memory for output gradient and input + __shared__ scalar_t tempGradOutput[SB + FS]; + __shared__ scalar_t tempInput[SB + FS]; + const int padding = FS - padding_l - 1; + + zeroSharedMem(tempGradOutput); + zeroSharedMem(tempInput); + + // initialize local filter and weight gradient sum arrays + scalar_t tempGradSum[FS]; + scalar_t bfilter[FS]; + for (int k = 0; k < FS; ++k) { + tempGradSum[k] = scalar_t(0.0); + + int idxOffset = inputOffset + tid + k - padding; + if (idxOffset >= 0 && idxOffset < sequenceLength) { + int bfilterOffset = batchIdx * numHeads * FS * sequenceLength + + headIdx * FS * sequenceLength + + (FS - k - 1) * sequenceLength + + idxOffset; + bfilter[k] = weight[bfilterOffset]; + } else { + bfilter[k] = scalar_t(0.0); + } + } + + + // iterate over filter block + for (int featureIdx = 0; featureIdx < numFiltersInBlock; ++featureIdx) { + __syncthreads(); + + // load input and output gradient for this channel and chunk + const int IOOffset = batchIdx * numFeatures * sequenceLength + + (headIdx * numFiltersInBlock + featureIdx) * sequenceLength; + const scalar_t* inputFeature = &input[IOOffset]; + const scalar_t* gradOutputFeature = &gradOutput[IOOffset]; + scalar_t* 
gradInputFeature = &gradInput[IOOffset]; + + load_input_to_shared(gradOutputFeature, inputOffset, + sequenceLength, chunkIdx, + numChunks, true, tempGradOutput); + load_input_to_shared(inputFeature, inputOffset, + sequenceLength, chunkIdx, + numChunks, true, tempInput); + __syncthreads(); + + // sum input and weight gradients + scalar_t out = scalar_t(0.0); + #pragma unroll + for (int k = 0; k < FS; ++k) { + tempGradSum[k] += tempInput[tid + k] * tempGradOutput[tid + padding]; + out += bfilter[k] * tempGradOutput[tid + k]; + } + + if (inputOffset + tid < sequenceLength) { + gradInputFeature[inputOffset + tid] = out; + } + } + + const int gradOffset = batchIdx * numHeads * FS * sequenceLength + + headIdx * FS * sequenceLength; + scalar_t *gradWeightFeature = &gradWeight[gradOffset]; + + // write weight gradient + if (inputOffset + tid < sequenceLength) { + for (int k = 0; k < FS; ++k) { + const int outputOffset = k * sequenceLength + inputOffset + tid; + gradWeightFeature[outputOffset] = tempGradSum[k]; + } + } +} diff --git a/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py b/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py new file mode 100644 index 0000000000..d50e13c0d2 --- /dev/null +++ b/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py @@ -0,0 +1,205 @@ +import torch +from torch import nn +from torch.autograd import Function +import torch.nn.functional as F +import dynamicconv_cuda +from fairseq import utils + + +class dynamicconvFunction(Function): + + @staticmethod + def forward(ctx, x, weights, padding_l): + ctx.padding_l = padding_l + outputs = dynamicconv_cuda.forward(x, weights, padding_l) + variables = [x, weights] + ctx.save_for_backward(*variables) + return outputs[0] + + @staticmethod + def backward(ctx, grad_output): + outputs = dynamicconv_cuda.backward( + grad_output.contiguous(), + ctx.padding_l, + *ctx.saved_variables) + grad_input, grad_weights = outputs + return grad_input, grad_weights, None + + +class DynamicconvLayer(nn.Module): + def __init__( + self, + input_size, + kernel_size=1, + padding_l=None, + weight_softmax=False, + num_heads=1, + weight_dropout=0., + bias=False, + renorm_padding=False, + conv_bias=False, + query_size=None): + + super(DynamicconvLayer, self).__init__() + self.input_size = input_size + self.query_size = input_size if query_size is None else query_size + self.kernel_size = kernel_size + self.padding_l = padding_l + self.num_heads = num_heads + self.weight_softmax = weight_softmax + self.weight_dropout = weight_dropout + self.renorm_padding = renorm_padding + self.bias = bias + + self.weight_linear = nn.Linear(input_size, num_heads * kernel_size, bias) + if conv_bias: + self.conv_bias = nn.Parameter(torch.Tensor(input_size)) + else: + self.conv_bias = None + self.reset_parameters() + + def reset_parameters(self): + nn.init.xavier_uniform_(self.weight_linear.weight) + if self.conv_bias is not None: + nn.init.constant_(self.conv_bias, 0.) + nn.init.constant_(self.weight_linaer.bias, 0.) 
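+
+    # Note on the two execution paths used by forward() below: the input is
+    # expected in (T, B, C) layout. With an incremental_state (inference) the
+    # layer falls back to the unfolded/expanded BMM implementations further
+    # down; during training the predicted weights are reshaped to (B, H, K, T),
+    # the input to (B, C, T), and the fused CUDA kernel is invoked through
+    # dynamicconvFunction.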
+ + def forward(self, x, incremental_state=None, query=None, unfold=None): + + T, B, C = x.size() + K, H = self.kernel_size, self.num_heads + R = C // H + + # during inference time, incremental BMM is faster + if incremental_state is not None: + unfold = x.size(0) > 512 if unfold is None else unfold # use unfold mode as default for long sequence to save memory + unfold = unfold or (incremental_state is not None) + assert query is None + + if query is None: + query = x + if unfold: + output = self._forward_unfolded(x, incremental_state, query) + else: + output = self._forward_expanded(x, incremental_state, query) + + if self.conv_bias is not None: + output = output + self.conv_bias.view(1, 1, -1) + + return output + + # during training time, use CUDA kernel + else: + weight = self.weight_linear(x).view(T, B, H, K) + if self.weight_softmax: + weight = F.softmax(weight, dim=-1) + if self.weight_dropout: + weight = F.dropout(weight, self.weight_dropout, training=self.training) + + weight = weight.permute(1, 2, 3, 0).contiguous() + self.filters = weight + x = x.permute(1, 2, 0).contiguous() + output = dynamicconvFunction.apply(x, weight, self.padding_l).permute(2, 0, 1) + if self.conv_bias is not None: + output = output + self.conv_bias.view(1, 1, -1) + return output + + def reorder_incremental_state(self, incremental_state, new_order): + input_buffer = self._get_input_buffer(incremental_state) + if input_buffer is not None: + input_buffer = input_buffer.index_select(1, new_order) + self._set_input_buffer(incremental_state, input_buffer) + + def _get_input_buffer(self, incremental_state): + return utils.get_incremental_state(self, incremental_state, 'input_buffer') + + def _set_input_buffer(self, incremental_state, new_buffer): + return utils.set_incremental_state(self, incremental_state, 'input_buffer', new_buffer) + + def _forward_unfolded(self, x, incremental_state, query): + '''The conventional implementation of convolutions. 
+ Unfolding the input by having a window shifting to the right.''' + T, B, C = x.size() + K, H = self.kernel_size, self.num_heads + R = C // H + assert R * H == C == self.input_size + + weight = self.weight_linear(query).view(T*B*H, -1) + + # renorm_padding is only implemented in _forward_expanded + assert not self.renorm_padding or incremental_state is not None + + if incremental_state is not None: + input_buffer = self._get_input_buffer(incremental_state) + if input_buffer is None: + input_buffer = x.new() + x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3) + if self.kernel_size > 1: + self._set_input_buffer(incremental_state, x_unfold[:, :, :, -self.kernel_size+1:]) + x_unfold = x_unfold.view(T*B*H, R, -1) + else: + padding_l = self.padding_l + if K > T and padding_l == K-1: + weight = weight.narrow(1, K-T, T) + K, padding_l = T, T-1 + # unfold the input: T x B x C --> T' x B x C x K + x_unfold = unfold1d(x, K, padding_l, 0) + x_unfold = x_unfold.view(T*B*H, R, K) + + if self.weight_softmax and not self.renorm_padding: + weight = F.softmax(weight, dim=1) + weight = weight.narrow(1, 0, K) + + if incremental_state is not None: + weight = weight[:, -x_unfold.size(2):] + K = weight.size(1) + + if self.weight_softmax and self.renorm_padding: + weight = F.softmax(weight, dim=1) + + weight = F.dropout(weight, self.weight_dropout, training=self.training, inplace=False) + + output = torch.bmm(x_unfold, weight.unsqueeze(2)) # T*B*H x R x 1 + output = output.view(T, B, C) + return output + + def _forward_expanded(self, x, incremental_stat, query): + '''Turn the convolution filters into band matrices and do matrix multiplication. + This is faster when the sequence is short, but less memory efficient. + This is not used in the decoder during inference. 
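+        Concretely, the predicted filters are copied into a (B*H, T, T+K-1)
+        band matrix with as_strided, narrowed back to (B*H, T, T), and applied
+        to the input (viewed as (B*H, T, R)) with a single bmm.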
+ ''' + T, B, C = x.size() + K, H = self.kernel_size, self.num_heads + R = C // H + assert R * H == C == self.input_size + weight = self.weight_linear(query).view(T*B*H, -1) + + if not self.renorm_padding: + if self.weight_softmax: + weight = F.softmax(weight, dim=1) + weight = F.dropout(weight, self.weight_dropout, training=self.training, inplace=False) + weight = weight.narrow(1, 0, K).contiguous() + weight = weight.view(T, B*H, K).transpose(0, 1) + + x = x.view(T, B*H, R).transpose(0, 1) + if self.weight_softmax and self.renorm_padding: + # turn the convolution filters into band matrices + weight_expanded = weight.new(B*H, T, T+K-1).fill_(float('-inf')) + weight_expanded.as_strided((B*H, T, K), (T*(T+K-1), T+K, 1)).copy_(weight) + weight_expanded = weight_expanded.narrow(2, self.padding_l, T) + # normalize the weight over valid positions like self-attention + weight_expanded = F.softmax(weight_expanded, dim=2) + weight_expanded = F.dropout(weight_expanded, self.weight_dropout, training=self.training, inplace=False) + else: + P = self.padding_l + # For efficieny, we cut the kernel size and reduce the padding when the kernel is larger than the length + if K > T and P == K-1: + weight = weight.narrow(2, K-T, T) + K, P = T, T-1 + # turn the convolution filters into band matrices + weight_expanded = weight.new_zeros(B*H, T, T+K-1, requires_grad=False) + weight_expanded.as_strided((B*H, T, K), (T*(T+K-1), T+K, 1)).copy_(weight) + weight_expanded = weight_expanded.narrow(2, P, T) # B*H x T x T + output = torch.bmm(weight_expanded, x) + output = output.transpose(0, 1).contiguous().view(T, B, C) + return output diff --git a/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp b/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp new file mode 100644 index 0000000000..8a6af4285d --- /dev/null +++ b/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp @@ -0,0 +1,35 @@ +#include +#include + +std::vector dynamicconv_cpu_forward( + float* input, + float* filters, + int padding_l); + +std::vector dynamicconv_cpu_backward( + float* gradOutput, + int padding_l, + float* input, + float* filters); + +std::vector dynamicconv_forward( + float* input, + float* filters, + int padding_l) { + + return dynamicconv_cpu_forward(input, filters, padding_l); +} + +std::vector dynamicconv_backward( + float* gradOutput, + int padding_l, + float* input, + float* filters) { + + return dynamicconv_cpu_backward(gradOutput, padding_l, input, filters); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("forward", &dynamicconv_forward, "dynamicconv forward (CPU)"); + m.def("backward", &dynamicconv_backward, "dynamicconv backward (CPU)"); +} diff --git a/fairseq/modules/dynamicconv_layer/setup.py b/fairseq/modules/dynamicconv_layer/setup.py new file mode 100644 index 0000000000..00ce29bc75 --- /dev/null +++ b/fairseq/modules/dynamicconv_layer/setup.py @@ -0,0 +1,17 @@ +from setuptools import setup +from torch.utils.cpp_extension import CUDAExtension, BuildExtension + +setup( + name='dynamicconv_layer', + ext_modules=[ + CUDAExtension( + name='dynamicconv_cuda', + sources=[ + 'dynamicconv_cuda.cpp', + 'dynamicconv_cuda_kernel.cu', + ], + ), + ], + cmdclass={ + 'build_ext': BuildExtension + }) diff --git a/fairseq/modules/lightconv_layer/__init__.py b/fairseq/modules/lightconv_layer/__init__.py new file mode 100644 index 0000000000..95fe76c7cd --- /dev/null +++ b/fairseq/modules/lightconv_layer/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. 
+# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +from .lightconv_layer import LightconvLayer diff --git a/fairseq/modules/lightconv_layer/cuda_function_gen.py b/fairseq/modules/lightconv_layer/cuda_function_gen.py new file mode 100644 index 0000000000..1bb3a1a0dd --- /dev/null +++ b/fairseq/modules/lightconv_layer/cuda_function_gen.py @@ -0,0 +1,289 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + + +def gen_forward(): + + kernels = [3, 5, 7, 15, 31, 63, 127, 255] + seqs = [32 * x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]] + + head = """ +/** + * Copyright (c) 2018-present, Facebook, Inc. + * All rights reserved. + * + */ + +#include "lightconv_cuda.cuh" + +std::vector lightconv_cuda_forward(at::Tensor input, at::Tensor filters, int padding_l) { + + at::DeviceGuard g(input.device()); + const auto minibatch = input.size(0); + const auto numFeatures = input.size(1); + const auto sequenceLength = input.size(2); + + const auto numHeads = filters.size(0); + const auto filterSize = filters.size(1); + + const auto numFiltersInBlock = numFeatures / numHeads; + + const dim3 blocks(minibatch, numFeatures); + + auto output = at::zeros_like(input); + auto stream = at::cuda::getCurrentCUDAStream(); +""" + + sequence_if = """ + if (sequenceLength <= {seq}) {{ + switch(filterSize) {{ +""" + + case_k = """ + case {k}: +""" + + main_block = """ + if (padding_l == {pad}) {{ + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "lightconv_forward", ([&] {{ + lightconv_forward_kernel<{k}, {b_size}, {pad}, scalar_t> + <<>>( + input.data(), + filters.data(), + minibatch, + sequenceLength, + numFeatures, + numFiltersInBlock, + output.data()); + }})); + }} else +""" + + bad_padding = """ + { + std::cout << "WARNING: Unsupported padding size - skipping forward pass" << std::endl; + } + break; +""" + + bad_filter = """ + default: + std::cout << "WARNING: Unsupported filter length passed - skipping forward pass" << std::endl; + } +""" + + con_else = """ + } else +""" + + final_else = """ + { + switch(filterSize) { +""" + + final_return = """ + } + + return {output}; +} +""" + + with open("lightconv_cuda_forward.cu", 'w') as forward: + forward.write(head) + for seq in seqs: + forward.write(sequence_if.format(seq=seq)) + for k in kernels: + forward.write(case_k.format(k=k)) + for pad in [k // 2, k - 1]: + forward.write(main_block.format(k=k, b_size=seq, pad=pad)) + forward.write(bad_padding) + forward.write(bad_filter) + forward.write(con_else) + + forward.write(final_else) + for k in kernels: + forward.write(case_k.format(k=k)) + for pad in [k // 2, k - 1]: + forward.write(main_block.format(k=k, b_size=seq, pad=pad)) + forward.write(bad_padding) + forward.write(bad_filter) + forward.write(final_return) + + +def gen_backward(): + + head = """ +/** + * Copyright (c) 2018-present, Facebook, Inc. + * All rights reserved. 
+ * + */ + +#include "lightconv_cuda.cuh" + +std::vector lightconv_cuda_backward( + at::Tensor gradOutput, + int padding_l, + at::Tensor input, + at::Tensor filters) { + + // gradWrtInput + const int minibatch = input.size(0); + const int numFeatures = input.size(1); + const int sequenceLength = input.size(2); + + const int numHeads = filters.size(0); + const int filterSize = filters.size(1); + + const dim3 gradBlocks(minibatch, numFeatures); + const dim3 weightGradFirstpassShortBlocks(minibatch, numHeads); + const dim3 weightGradSecondpassBlocks(numHeads, filterSize); + + const int numFiltersInBlock = numFeatures / numHeads; + + auto gradInput = at::zeros_like(input); + auto gradFilters = at::zeros_like(filters); + + at::DeviceGuard g(input.device()); + auto stream = at::cuda::getCurrentCUDAStream(); + + switch(filterSize) { +""" + + sequence_if = """ + if (sequenceLength <= {seq}) {{ +""" + + case_k = """ + case {k}: +""" + + main_block = """ + if (padding_l == {p}) {{ + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "lightconv_backward", ([&] {{ + lightconv_grad_wrt_input_kernel<{k}, {b_size}, {p}, scalar_t> + <<>>( + gradOutput.data(), + filters.data(), + minibatch, + sequenceLength, + numFeatures, + numFiltersInBlock, + gradInput.data()); + +""" + + weight_grad_short = """ + at::Tensor tempSumGradFilters = at::zeros({{minibatch, numHeads, filterSize}}, input.options().dtype(at::kFloat)); + lightconv_grad_wrt_weights_firstpass_short_kernel<{k}, {b_size}, {p}, scalar_t> + <<>>( + input.data(), + gradOutput.data(), + minibatch, + sequenceLength, + numFeatures, + numFiltersInBlock, + numHeads, + tempSumGradFilters.data() + ); + + lightconv_grad_wrt_weights_secondpass_short_kernel<{k}, {b_size}, scalar_t> + <<>>( + tempSumGradFilters.data(), + minibatch, + numFiltersInBlock, + gradFilters.data() + ); + }})); + }} else +""" + + weight_grad = """ + at::Tensor tempSumGradFilters = at::zeros({{minibatch, numFeatures, filterSize}}, input.options().dtype(at::kFloat)); + lightconv_grad_wrt_weights_firstpass_kernel<{k}, {b_size}, {p}, scalar_t> + <<>>( + input.data(), + gradOutput.data(), + minibatch, + sequenceLength, + numFeatures, + numFiltersInBlock, + tempSumGradFilters.data() + ); + + lightconv_grad_wrt_weights_secondpass_kernel<{k}, {b_size}, scalar_t> + <<>>( + tempSumGradFilters.data(), + minibatch, + numFiltersInBlock, + gradFilters.data() + ); + }})); + }} else +""" + + bad_padding = """ + { + std::cout << "WARNING: Unsupported padding size - skipping backward pass" << std::endl; + } +""" + + breakout = """ + break; +""" + + bad_filter = """ + default: + std::cout << "WARNING: Unsupported filter length passed - skipping backward pass" << std::endl; +""" + + con_else = """ + } else +""" + + final_else = """ + { + switch(filterSize) { +""" + + last_return = """ + } + return {gradInput, gradFilters}; +} +""" + + kernels = [3, 5, 7, 15, 31, 63, 127, 255] + seqs = [32 * x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]] + thresh = [32, 32, 64, 128, 256, -1, -1, -1] + max_mem = [-1, -1, -1, -1, -1, 192, 96, 64] + + with open("lightconv_cuda_backward.cu", 'w') as backward: + backward.write(head) + for (k, t, mem) in zip(kernels, thresh, max_mem): + backward.write(case_k.format(k=k)) + for seq in seqs: + if (t == -1 or seq <= t) and (mem == -1 or seq < mem): + backward.write(sequence_if.format(seq=seq)) + for p in [k // 2, k - 1]: + backward.write(main_block.format(k=k, b_size=seq, p=p)) + backward.write(weight_grad_short.format(k=k, b_size=seq, p=p)) + 
backward.write(bad_padding) + else: + for p in [k // 2, k - 1]: + backward.write(main_block.format(k=k, b_size=32, p=p)) + backward.write(weight_grad.format(k=k, b_size=32, p=p)) + backward.write(bad_padding) + backward.write(breakout) + break + backward.write(con_else) + backward.write(bad_filter) + backward.write(last_return) + + +if __name__ == "__main__": + gen_forward() + gen_backward() diff --git a/fairseq/modules/lightconv_layer/lightconv_cuda.cpp b/fairseq/modules/lightconv_layer/lightconv_cuda.cpp new file mode 100644 index 0000000000..3dc1765bf0 --- /dev/null +++ b/fairseq/modules/lightconv_layer/lightconv_cuda.cpp @@ -0,0 +1,47 @@ +#include +#include + +std::vector lightconv_cuda_forward( + at::Tensor input, + at::Tensor filters, + int padding_l); + +std::vector lightconv_cuda_backward( + at::Tensor gradOutput, + int padding_l, + at::Tensor input, + at::Tensor filters); + + +#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +std::vector lightconv_forward( + at::Tensor input, + at::Tensor filters, + int padding_l) { + + CHECK_INPUT(input); + CHECK_INPUT(filters); + + return lightconv_cuda_forward(input, filters, padding_l); +} + +std::vector lightconv_backward( + at::Tensor gradOutput, + int padding_l, + at::Tensor input, + at::Tensor filters) { + + CHECK_INPUT(gradOutput); + CHECK_INPUT(input); + CHECK_INPUT(filters); + + return lightconv_cuda_backward(gradOutput, padding_l, input, filters); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("forward", &lightconv_forward, "lighconv forward (CUDA)"); + m.def("backward", &lightconv_backward, "lighconv backward (CUDA)"); +} diff --git a/fairseq/modules/lightconv_layer/lightconv_cuda.cuh b/fairseq/modules/lightconv_layer/lightconv_cuda.cuh new file mode 100644 index 0000000000..f4c5fec437 --- /dev/null +++ b/fairseq/modules/lightconv_layer/lightconv_cuda.cuh @@ -0,0 +1,82 @@ +/** + * Copyright (c) 2018-present, Facebook, Inc. + * All rights reserved. 
+ * + */ + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define SHFL_MASK 0xffffffff + +template +__global__ +void lightconv_forward_kernel(const scalar_t* input, + const scalar_t* filters, + int minibatch, int sequenceLength, + int numFeatures, int numFiltersInBlock, + scalar_t* output); + +template +__global__ +void lightconv_grad_wrt_input_kernel( + const scalar_t* input, + const scalar_t* filters, + int minibatch, + int sequenceLength, + int numFeatures, + int numFiltersInBlock, + scalar_t* output); + +template +__global__ +void lightconv_grad_wrt_weights_firstpass_short_kernel( + const scalar_t* input, + const scalar_t* gradInput, + int minibatch, + int sequenceLength, + int numFeatures, + int numFiltersInBlock, + int numHeads, + float* output); + +template +__global__ +void lightconv_grad_wrt_weights_secondpass_short_kernel( + const float* input, + const int minibatch, + const int numFiltersInBlock, + scalar_t* output); + +template +__global__ +void lightconv_grad_wrt_weights_firstpass_kernel( + const scalar_t* input, + const scalar_t* gradInput, + int minibatch, + int sequenceLength, + int numFeatures, + int numFiltersInBlock, + float* output); + +template +__global__ +void lightconv_grad_wrt_weights_secondpass_kernel( + const float* input, + const int minibatch, + const int numFiltersInBlock, + scalar_t* output); + diff --git a/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu b/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu new file mode 100644 index 0000000000..8e17e27af1 --- /dev/null +++ b/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu @@ -0,0 +1,374 @@ +/** + * Copyright (c) 2018-present, Facebook, Inc. + * All rights reserved. 
+ * + */ + +#include "lightconv_cuda.cuh" +#include "lightconv_cuda_forward.cu" +#include "lightconv_cuda_backward.cu" +#include "../cuda_utils.cu" + +template +__global__ +void lightconv_forward_kernel(const scalar_t* input, + const scalar_t* filters, + int minibatch, int sequenceLength, + int numFeatures, int numFiltersInBlock, + scalar_t* output) { + + const int tid = threadIdx.x; + const int batchIdx = blockIdx.x; + const int featureIdx = blockIdx.y; + const int filterIdx = featureIdx / numFiltersInBlock; + + const int IOOffset = numFeatures * sequenceLength * batchIdx + featureIdx * sequenceLength; + const scalar_t* inputFeature = &input[IOOffset]; + scalar_t* outputFeature = &output[IOOffset]; + const scalar_t* inputFilter = &filters[filterIdx * FS]; + + assert(blockDim.x == SB); + + scalar_t filter[FS]; + #pragma unroll + for (int i = 0; i < FS; ++i) { + filter[i] = inputFilter[i]; + } + + __shared__ scalar_t temp[SB + FS]; + zeroSharedMem(temp); + + const int numIterations = divUp(sequenceLength, SB); + + for (int i = 0; i < numIterations; ++i) { + // Read input into shared memory + const int inputOffset = i * SB; + + load_input_to_shared(inputFeature, inputOffset, sequenceLength, + i, numIterations, (numIterations == 1), temp); + + __syncthreads(); + + scalar_t out = 0; + #pragma unroll + for (int j = 0; j < FS; ++j) { + out += filter[j] * temp[tid + j]; + } + + // Write output + const int outputOffset = inputOffset; + if ((outputOffset + tid) < sequenceLength) { + outputFeature[outputOffset + tid] = out; + } + + __syncthreads(); + } +} + +template +__global__ +void lightconv_grad_wrt_input_kernel( + const scalar_t* input, + const scalar_t* filters, + int minibatch, + int sequenceLength, + int numFeatures, + int numFiltersInBlock, + scalar_t* output) { + + // input grad kernel is similar to forward kernel + const int tid = threadIdx.x; + const int batchIdx = blockIdx.x; + const int featureIdx = blockIdx.y; + const int filterIdx = featureIdx / numFiltersInBlock; + + const int IOOffset = numFeatures * sequenceLength * batchIdx + featureIdx * sequenceLength; + const scalar_t* inputFeature = &input[IOOffset]; + scalar_t* outputFeature = &output[IOOffset]; + const scalar_t* inputFilter = &filters[filterIdx * FS]; + + assert(blockDim.x == SB); + + scalar_t filter[FS]; + + // The only change is loading the filter in reverse + #pragma unroll + for (int i = 0; i < FS; ++i) { + filter[i] = inputFilter[FS - i - 1]; + } + + __shared__ scalar_t temp[SB + FS]; + const int padding = FS - padding_l - 1; + zeroSharedMem(temp); + + __syncthreads(); + + const int numIterations = divUp(sequenceLength, SB); + + for (int i = 0; i < numIterations; ++i) { + // Read input into shared memory + const int inputOffset = i * SB; + + load_input_to_shared(inputFeature, inputOffset, sequenceLength, + i, numIterations, false, temp); + + __syncthreads(); + + scalar_t out = 0; + #pragma unroll + for (int j = 0; j < FS; ++j) { + out += filter[j] * temp[tid + j]; + } + + // Write output + const int outputOffset = inputOffset; + if ((outputOffset + tid) < sequenceLength) { + outputFeature[outputOffset + tid] = out; + } + + __syncthreads(); + } +} + +// This is by far the most expensive kernel in terms of time taken. 
+// Can be 16x slower than the forward or grad_wrt_input when filter size is 31 +template +__global__ +void lightconv_grad_wrt_weights_firstpass_short_kernel( + const scalar_t* input, + const scalar_t* gradInput, + int minibatch, + int sequenceLength, + int numFeatures, + int numFiltersInBlock, + int numHeads, + float* output) { + + const int tid = threadIdx.x; + const int batchIdx = blockIdx.x; + const int filterIdx = blockIdx.y; + + const int numIterations = divUp(sequenceLength, SB); + + float* tempOutputGradWeight = &output[filterIdx * FS * minibatch]; + + assert(blockDim.x == SB); + + __shared__ scalar_t tempInput[SB + FS]; + __shared__ scalar_t tempGradInput[SB + FS]; + + // local weight accumulation + float accumWeights[FS]; + + // Initialize memory + for (int i = 0; i < FS; ++i) { + accumWeights[i] = float(0.0); + } + + + // loop over each sequence within filterblock + for (int idxInFilterBlock = 0; idxInFilterBlock < numFiltersInBlock; ++idxInFilterBlock) { + + const int featureOffset = batchIdx * numFeatures * sequenceLength + (filterIdx * numFiltersInBlock + idxInFilterBlock) * sequenceLength; + const scalar_t* inputFeature = &input[featureOffset]; + const scalar_t* gradInputFeature = &gradInput[featureOffset]; + + zeroSharedMem(tempInput); + zeroSharedMem(tempGradInput); + __syncthreads(); + + for (int i = 0; i < numIterations; ++i) { + + const int inputOffset = i * SB; + + load_input_to_shared(inputFeature, inputOffset, sequenceLength, + i, numIterations, false, tempInput); + load_input_to_shared(gradInputFeature, inputOffset, sequenceLength, + i, numIterations, false, tempGradInput); + + __syncthreads(); + + const int gradIndex = (FS/2) + tid; + scalar_t tempGrad = tempGradInput[gradIndex]; + + #pragma unroll + for (int j = 0; j < FS; j++) { + const int inputIndex = tid + j; + accumWeights[j] += tempInput[inputIndex] * tempGrad; + } + + __syncthreads(); + + } + + } + + // Row-major sum + for (int filterWeightIdx = 0; filterWeightIdx < FS; ++filterWeightIdx) { + + float temp; + if (tid < sequenceLength) { + temp = accumWeights[filterWeightIdx]; + } else { + temp = float(0.0); + } + + const int outputOffset = filterWeightIdx * minibatch + batchIdx; + + temp = blockReduce(temp); + + if (tid == 0) { + tempOutputGradWeight[outputOffset] = temp; + } + } +} + +template +__global__ +void lightconv_grad_wrt_weights_secondpass_short_kernel( + const float* input, + const int minibatch, + const int numFiltersInBlock, + scalar_t* output) { + + assert(blockDim.x == SB); + + const int tid = threadIdx.x; + + const int filterIdx = blockIdx.x; + const int filterWeightIdx = blockIdx.y; + + const int inputOffset = filterIdx * FS * minibatch + + filterWeightIdx * minibatch; + const float* tempInput = &input[inputOffset]; + + // read into shared memory for reduction + int readIndex = tid; + + float sum = 0.0; + while (readIndex < minibatch) { + sum += tempInput[readIndex]; + readIndex += SB; + } + + float temp = blockReduce(sum); + + if (tid == 0) { + output[blockIdx.x * FS + blockIdx.y] = temp; + } +} + +// This is by far the most expensive kernel in terms of time taken. 
+// Can be 16x slower than the forward or grad_wrt_input when filter size is 31 +template +__global__ +void lightconv_grad_wrt_weights_firstpass_kernel( + const scalar_t* input, + const scalar_t* gradInput, + int minibatch, + int sequenceLength, + int numFeatures, + int numFiltersInBlock, + float* output) { + + assert(blockDim.x == SB); + + const int tid = threadIdx.x; + const int batchIdx = blockIdx.x; + const int featureIdx = blockIdx.y; + const int filterIdx = featureIdx / numFiltersInBlock; + const int idxInFilterBlock = featureIdx % numFiltersInBlock; + + const int numIterations = divUp(sequenceLength, SB); + + float temp; + + __shared__ scalar_t tempInput[SB + FS]; + __shared__ scalar_t tempGradInput[SB + FS]; + zeroSharedMem(tempInput); + zeroSharedMem(tempGradInput); + __syncthreads(); + + float accumWeights[FS]; + + for (int i = 0; i < FS; ++i) { + accumWeights[i] = float(0.0); + } + + const int IOOffset = batchIdx * numFeatures * sequenceLength + featureIdx * sequenceLength; + const scalar_t* inputFeature = &input[IOOffset]; + const scalar_t* gradInputFeature = &gradInput[IOOffset]; + float* tempOutputGradWeight = &output[filterIdx * FS * minibatch * numFiltersInBlock]; + + for (int i = 0; i < numIterations; ++i) { + const int inputOffset = i * SB; + + load_input_to_shared(inputFeature, inputOffset, sequenceLength, + i, numIterations, false, tempInput); + load_input_to_shared(gradInputFeature, inputOffset, sequenceLength, + i, numIterations, false, tempGradInput); + __syncthreads(); + + #pragma unroll + for (int j = 0; j < FS; ++j) { + accumWeights[j] += tempInput[tid + j] * tempGradInput[tid + (FS/2)]; + } + + __syncthreads(); + } + + // Row-major sum + for (int filterWeightIdx = 0; filterWeightIdx < FS; ++filterWeightIdx) { + + // Write to shared memory before reduction + if (tid < sequenceLength) { + temp = accumWeights[filterWeightIdx]; + } else { + temp = float(0.0); + } + + temp = blockReduce(temp); + + const int outputOffset = filterWeightIdx * minibatch * numFiltersInBlock + + batchIdx * numFiltersInBlock + + idxInFilterBlock; + + if (tid == 0) { + tempOutputGradWeight[outputOffset] = temp; + } + } +} + +template +__global__ +void lightconv_grad_wrt_weights_secondpass_kernel( + const float* input, + const int minibatch, + const int numFiltersInBlock, + scalar_t* output) { + + assert(blockDim.x == SB); + const int tid = threadIdx.x; + + // What is the id within a minibatch + const int filterIdx = blockIdx.x; + const int filterWeightIdx = blockIdx.y; + + const int inputOffset = filterIdx * FS * minibatch * numFiltersInBlock + + filterWeightIdx * minibatch * numFiltersInBlock; + const float* tempInput = &input[inputOffset]; + + int readIndex = tid; + + float sum = float(0.0); + while (readIndex < (minibatch * numFiltersInBlock)) { + sum += tempInput[readIndex]; + readIndex += SB; + } + + float temp = blockReduce(sum); + + if (tid == 0) { + output[blockIdx.x * FS + blockIdx.y] = temp; + } +} diff --git a/fairseq/modules/lightconv_layer/lightconv_layer.py b/fairseq/modules/lightconv_layer/lightconv_layer.py new file mode 100644 index 0000000000..8728128277 --- /dev/null +++ b/fairseq/modules/lightconv_layer/lightconv_layer.py @@ -0,0 +1,113 @@ +import torch +from torch import nn +from torch.autograd import Function +import torch.nn.functional as F +import time + +import lightconv_cuda +from fairseq import utils + +class lightconvFunction(Function): + + @staticmethod + def forward(ctx, x, weights, padding_l): + ctx.padding_l = padding_l + outputs = lightconv_cuda.forward(x, 
weights, padding_l) + variables = [x, weights] + ctx.save_for_backward(*variables) + return outputs[0] + + @staticmethod + def backward(ctx, grad_output): + outputs = lightconv_cuda.backward( + grad_output.contiguous(), + ctx.padding_l, + *ctx.saved_variables) + grad_input, grad_weights = outputs + return grad_input, grad_weights, None + +class LightconvLayer(nn.Module): + def __init__( + self, + input_size, + kernel_size=1, + padding_l=None, + weight_softmax=False, + num_heads=1, + weight_dropout=0., + bias=False): + super(LightconvLayer, self).__init__() + self.input_size = input_size + self.kernel_size = kernel_size + self.padding_l = padding_l + self.num_heads = num_heads + self.weight_softmax = weight_softmax + self.weight_dropout = weight_dropout + + self.weight = nn.Parameter(torch.Tensor(num_heads, kernel_size)) + if bias: + self.bias = nn.Parameter(torch.Tensor(input_size)) + else: + self.bias = None + self.reset_parameters() + + def reset_parameters(self): + nn.init.xavier_uniform_(self.weight) + if self.bias is not None: + nn.init.constant_(self.bias, 0.) + + def forward(self, x, incremental_state=None): + + # during inference time, incremental BMM is faster + if incremental_state is not None: + T, B, C = x.size() + K, H = self.kernel_size, self.num_heads + R = C // H + input_buffer = self._get_input_buffer(incremental_state) + if input_buffer is None: + input_buffer = x.new() + x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3) + if self.kernel_size > 1: + self._set_input_buffer(incremental_state, x_unfold[:, :, :, -self.kernel_size+1:]) + x_unfold = x_unfold.view(T*B*H, R, -1) + + weight = self.weight + if self.weight_softmax: + weight = F.softmax(weight.float(), dim=1).type_as(weight) + + weight = weight[:, -x_unfold.size(2):] + + K = weight.size(1) + + weight = weight.view(1, H, K).expand(T*B, H, K).contiguous().view(T*B*H, K, 1) + + weight = F.dropout(weight, self.weight_dropout, training=self.training) + output = torch.bmm(x_unfold, weight) # T*B*H x R x 1 + output = output.view(T, B, C) + return output + + # during training time, use CUDA kernel + else: + x = x.permute(1, 2, 0).contiguous() + weight = self.weight + if self.weight_softmax: + weight = F.softmax(self.weight, -1) + if self.weight_dropout: + weight = F.dropout(weight, self.weight_dropout, training=self.training) + return lightconvFunction.apply(x, weight, self.padding_l).permute(2, 0, 1) + + def reorder_incremental_state(self, incremental_state, new_order): + input_buffer = self._get_input_buffer(incremental_state) + if input_buffer is not None: + input_buffer = input_buffer.index_select(1, new_order) + self._set_input_buffer(incremental_state, input_buffer) + + def _get_input_buffer(self, incremental_state): + return utils.get_incremental_state(self, incremental_state, 'input_buffer') + + def _set_input_buffer(self, incremental_state, new_buffer): + return utils.set_incremental_state(self, incremental_state, 'input_buffer', new_buffer) + + def half(self): + print("HALF") + return self._apply(lambda t: t.half() if t.is_floating_point() else t) diff --git a/fairseq/modules/lightconv_layer/setup.py b/fairseq/modules/lightconv_layer/setup.py new file mode 100644 index 0000000000..c2a928ed82 --- /dev/null +++ b/fairseq/modules/lightconv_layer/setup.py @@ -0,0 +1,14 @@ +from setuptools import setup +from torch.utils.cpp_extension import CUDAExtension, BuildExtension + +setup( + name='lightconv_layer', + ext_modules=[ + CUDAExtension('lightconv_cuda', [ + 'lightconv_cuda.cpp', + 'lightconv_cuda_kernel.cu', 
+ ]), + ], + cmdclass={ + 'build_ext': BuildExtension + }) diff --git a/fairseq/modules/lightweight_convolution.py b/fairseq/modules/lightweight_convolution.py index 6191d49501..95d0418af6 100644 --- a/fairseq/modules/lightweight_convolution.py +++ b/fairseq/modules/lightweight_convolution.py @@ -10,6 +10,21 @@ from fairseq import utils from fairseq.modules.unfold import unfold1d +def LightweightConv(input_size, kernel_size=1, padding_l=None, num_heads=1, + weight_dropout=0., weight_softmax=False, bias=False): + if torch.cuda.is_available(): + try: + from fairseq.modules.lightconv_layer import LightconvLayer + return LightconvLayer(input_size, kernel_size=kernel_size, + padding_l=padding_l, num_heads=num_heads, + weight_dropout=weight_dropout, + weight_softmax=weight_softmax, bias=bias) + except ImportError as e: + print(e) + return LightweightConv1dTBC(input_size, kernel_size=kernel_size, + padding_l=padding_l, num_heads=num_heads, + weight_dropout=weight_dropout, + weight_softmax=weight_softmax, bias=bias) class LightweightConv1d(nn.Module): '''Lightweight Convolution assuming the input is BxCxT diff --git a/fairseq/modules/unfold.py b/fairseq/modules/unfold.py index 3a142db698..eff6ab575b 100644 --- a/fairseq/modules/unfold.py +++ b/fairseq/modules/unfold.py @@ -5,7 +5,6 @@ import torch.nn.functional as F - def unfold1d(x, kernel_size, padding_l, pad_value=0): '''unfold T x B x C to T x B x C x K''' if kernel_size > 1: From 1d44cc8520fc7d2fb4957fcab32d102de6b30626 Mon Sep 17 00:00:00 2001 From: ngoyal2707 Date: Thu, 15 Aug 2019 06:50:44 -0700 Subject: [PATCH 095/213] added effcient wsc task/criterion for winogrande (#825) Summary: 1) So far getting `78%` on winogrande validation dataset comapred to `63.5%` in the paper. 2) Will upgrade readme once everything is finalized. Questions: 1) Should I just call `binary_wsc_task` instead of `winogrande` to be less specific to dataset and be generic? 
Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/825 Differential Revision: D16810159 fbshipit-source-id: cfde73561fa4caaaa63a4773c0aecd12ce1fa518 --- examples/roberta/README.md | 8 +- .../README.md} | 0 .../roberta/{README.wsc.md => wsc/README.md} | 40 +++++ examples/roberta/wsc/wsc_criterion.py | 91 +++++++--- examples/roberta/wsc/wsc_task.py | 167 +++++++++++++++--- examples/roberta/wsc/wsc_utils.py | 17 ++ 6 files changed, 265 insertions(+), 58 deletions(-) rename examples/roberta/{README.cqa.md => commonsense_qa/README.md} (100%) rename examples/roberta/{README.wsc.md => wsc/README.md} (69%) diff --git a/examples/roberta/README.md b/examples/roberta/README.md index f09a35b333..022ea0e3c1 100644 --- a/examples/roberta/README.md +++ b/examples/roberta/README.md @@ -17,7 +17,7 @@ Model | Description | # params | Download `roberta.base` | RoBERTa using the BERT-base architecture | 125M | [roberta.base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz) `roberta.large` | RoBERTa using the BERT-large architecture | 355M | [roberta.large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz) `roberta.large.mnli` | `roberta.large` finetuned on [MNLI](http://www.nyu.edu/projects/bowman/multinli) | 355M | [roberta.large.mnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.mnli.tar.gz) -`roberta.large.wsc` | `roberta.large` finetuned on [WSC](README.wsc.md) | 355M | [roberta.large.wsc.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.wsc.tar.gz) +`roberta.large.wsc` | `roberta.large` finetuned on [WSC](wsc/README.md) | 355M | [roberta.large.wsc.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.wsc.tar.gz) ### Results @@ -168,7 +168,7 @@ roberta.disambiguate_pronoun('The city councilmen refused the demonstrators a pe # 'demonstrators' ``` -See the [RoBERTA Winograd Schema Challenge (WSC) README](README.wsc.md) for more details on how to train this model. +See the [RoBERTA Winograd Schema Challenge (WSC) README](wsc/README.md) for more details on how to train this model. 
#### Extract features aligned to words: @@ -220,8 +220,8 @@ print('| Accuracy: ', float(ncorrect)/float(nsamples)) - [Finetuning on GLUE](README.glue.md) - [Finetuning on custom classification tasks (e.g., IMDB)](README.custom_classification.md) -- [Finetuning on Winograd Schema Challenge (WSC)](README.wsc.md) -- [Finetuning on Commonsense QA (CQA)](README.cqa.md) +- [Finetuning on Winograd Schema Challenge (WSC)](wsc/README.md) +- [Finetuning on Commonsense QA (CQA)](commonsense_qa/README.md) - Finetuning on SQuAD: coming soon ### Pretraining using your own data diff --git a/examples/roberta/README.cqa.md b/examples/roberta/commonsense_qa/README.md similarity index 100% rename from examples/roberta/README.cqa.md rename to examples/roberta/commonsense_qa/README.md diff --git a/examples/roberta/README.wsc.md b/examples/roberta/wsc/README.md similarity index 69% rename from examples/roberta/README.wsc.md rename to examples/roberta/wsc/README.md index 1df64299f0..0d3f62a07f 100644 --- a/examples/roberta/README.wsc.md +++ b/examples/roberta/wsc/README.md @@ -83,3 +83,43 @@ for sentence, label in wsc_utils.jsonl_iterator('WSC/val.jsonl', eval=True): print('Accuracy: ' + str(ncorrect / float(nsamples))) # Accuracy: 0.9230769230769231 ``` + +## RoBERTa training on WinoGrande dataset +We have also provided `winogrande` task and criterion for finetuning on the +[WinoGrande](https://mosaic.allenai.org/projects/winogrande) like datasets +where there are always two candidates and one is correct. +It's more efficient implementation for such subcases. + +```bash +TOTAL_NUM_UPDATES=23750 # Total number of training steps. +WARMUP_UPDATES=2375 # Linearly increase LR over this many steps. +LR=1e-05 # Peak LR for polynomial LR scheduler. +MAX_SENTENCES=32 # Batch size per GPU. +SEED=1 # Random seed. 
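+
+# The winogrande task loads <split>.jsonl from the data directory and a
+# dict.txt vocabulary, so winogrande_1.0/ below is assumed to contain
+# dict.txt, train.jsonl and val.jsonl (val is selected via --valid-subset).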
+ROBERTA_PATH=/path/to/roberta/model.pt + +# we use the --user-dir option to load the task and criterion +# from the examples/roberta/wsc directory: +FAIRSEQ_PATH=/path/to/fairseq +FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/wsc + +cd fairseq +CUDA_VISIBLE_DEVICES=0 fairseq-train winogrande_1.0/ \ + --restore-file $ROBERTA_PATH \ + --reset-optimizer --reset-dataloader --reset-meters \ + --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \ + --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ + --valid-subset val \ + --fp16 --ddp-backend no_c10d \ + --user-dir $FAIRSEQ_USER_DIR \ + --task winogrande --criterion winogrande \ + --wsc-margin-alpha 5.0 --wsc-margin-beta 0.4 \ + --arch roberta_large --bpe gpt2 --max-positions 512 \ + --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \ + --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 \ + --lr-scheduler polynomial_decay --lr $LR \ + --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_NUM_UPDATES \ + --max-sentences $MAX_SENTENCES \ + --max-update $TOTAL_NUM_UPDATES \ + --log-format simple --log-interval 100 +``` diff --git a/examples/roberta/wsc/wsc_criterion.py b/examples/roberta/wsc/wsc_criterion.py index c5b6507f9a..63a5d419e3 100644 --- a/examples/roberta/wsc/wsc_criterion.py +++ b/examples/roberta/wsc/wsc_criterion.py @@ -39,30 +39,46 @@ def add_args(parser): parser.add_argument('--save-predictions', metavar='FILE', help='file to save predictions to') - def forward(self, model, sample, reduce=True): - - def get_masked_input(tokens, mask): - masked_tokens = tokens.clone() - masked_tokens[mask] = self.task.mask - return masked_tokens - - def get_lprobs(tokens, mask): - logits, _ = model(src_tokens=get_masked_input(tokens, mask)) - lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float) - scores = lprobs.gather(2, tokens.unsqueeze(-1)).squeeze(-1) - mask = mask.type_as(scores) - scores = (scores * mask).sum(dim=-1) / mask.sum(dim=-1) - return scores + def get_masked_input(self, tokens, mask): + masked_tokens = tokens.clone() + masked_tokens[mask] = self.task.mask + return masked_tokens + + def get_lprobs(self, model, tokens, mask): + logits, _ = model(src_tokens=self.get_masked_input(tokens, mask)) + lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float) + scores = lprobs.gather(2, tokens.unsqueeze(-1)).squeeze(-1) + mask = mask.type_as(scores) + scores = (scores * mask).sum(dim=-1) / mask.sum(dim=-1) + return scores + + def get_loss(self, query_lprobs, cand_lprobs): + if self.args.wsc_cross_entropy: + return F.cross_entropy( + torch.cat([query_lprobs, cand_lprobs]).unsqueeze(0), + query_lprobs.new([0]).long(), + ) + else: + return ( + - query_lprobs + + self.args.wsc_margin_alpha * ( + cand_lprobs - query_lprobs + self.args.wsc_margin_beta + ).clamp(min=0) + ).sum() + def forward(self, model, sample, reduce=True): # compute loss and accuracy loss, nloss = 0., 0 ncorrect, nqueries = 0, 0 + for i, label in enumerate(sample['labels']): - query_lprobs = get_lprobs( + query_lprobs = self.get_lprobs( + model, sample['query_tokens'][i].unsqueeze(0), sample['query_masks'][i].unsqueeze(0), ) - cand_lprobs = get_lprobs( + cand_lprobs = self.get_lprobs( + model, sample['candidate_tokens'][i], sample['candidate_masks'][i], ) @@ -77,18 +93,7 @@ def get_lprobs(tokens, mask): if label: # only compute a loss for positive instances nloss += 1 - if self.args.wsc_cross_entropy: - loss += F.cross_entropy( - torch.cat([query_lprobs, cand_lprobs]).unsqueeze(0), - 
query_lprobs.new([0]).long(), - ) - else: - loss += ( - - query_lprobs - + self.args.wsc_margin_alpha * ( - cand_lprobs - query_lprobs + self.args.wsc_margin_beta - ).clamp(min=0) - ).sum() + loss += self.get_loss(query_lprobs, cand_lprobs) id = sample['id'][i].item() if self.prediction_h is not None: @@ -129,3 +134,33 @@ def aggregate_logging_outputs(logging_outputs): agg_output['accuracy'] = ncorrect / float(nqueries) return agg_output + + +@register_criterion('winogrande') +class WinograndeCriterion(WSCCriterion): + def forward(self, model, sample, reduce=True): + # compute loss and accuracy + query_lprobs = self.get_lprobs( + model, + sample['query_tokens'], + sample['query_masks'], + ) + cand_lprobs = self.get_lprobs( + model, + sample['candidate_tokens'], + sample['candidate_masks'], + ) + pred = query_lprobs >= cand_lprobs + loss = self.get_loss(query_lprobs, cand_lprobs) + + sample_size = sample['query_tokens'].size(0) + ncorrect = pred.sum().item() + logging_output = { + 'loss': utils.item(loss.data) if reduce else loss.data, + 'ntokens': sample['ntokens'], + 'nsentences': sample['nsentences'], + 'sample_size': sample_size, + 'ncorrect': ncorrect, + 'nqueries': sample_size, + } + return loss, sample_size, logging_output diff --git a/examples/roberta/wsc/wsc_task.py b/examples/roberta/wsc/wsc_task.py index 7fd09fc77c..2af2b338cb 100644 --- a/examples/roberta/wsc/wsc_task.py +++ b/examples/roberta/wsc/wsc_task.py @@ -21,6 +21,7 @@ NestedDictionaryDataset, NumSamplesDataset, NumelDataset, + PadDataset, SortDataset, ) from fairseq.tasks import FairseqTask, register_task @@ -77,25 +78,35 @@ def setup_task(cls, args, **kwargs): return cls(args, vocab) + def binarize(self, s: str, append_eos: bool = False): + if self.tokenizer is not None: + s = self.tokenizer.encode(s) + if self.bpe is not None: + s = self.bpe.encode(s) + tokens = self.vocab.encode_line( + s, append_eos=append_eos, add_if_not_exist=False, + ).long() + if self.args.init_token is not None: + tokens = torch.cat([tokens.new([self.args.init_token]), tokens]) + return tokens + + def binarize_with_mask(self, txt, prefix, suffix, leading_space, trailing_space): + toks = self.binarize( + prefix + leading_space + txt + trailing_space + suffix, + append_eos=True, + ) + mask = torch.zeros_like(toks, dtype=torch.uint8) + mask_start = len(self.binarize(prefix)) + mask_size = len(self.binarize(leading_space + txt)) + mask[mask_start:mask_start + mask_size] = 1 + return toks, mask + def load_dataset(self, split, epoch=0, combine=False, data_path=None, return_only=False, **kwargs): """Load a given dataset split. 
Args: split (str): name of the split (e.g., train, valid, test) """ - - def binarize(s: str, append_eos: bool = False): - if self.tokenizer is not None: - s = self.tokenizer.encode(s) - if self.bpe is not None: - s = self.bpe.encode(s) - tokens = self.vocab.encode_line( - s, append_eos=append_eos, add_if_not_exist=False, - ).long() - if self.args.init_token is not None: - tokens = torch.cat([tokens.new([self.args.init_token]), tokens]) - return tokens - if data_path is None: data_path = os.path.join(self.args.data, split + '.jsonl') if not os.path.exists(data_path): @@ -126,19 +137,10 @@ def binarize(s: str, append_eos: bool = False): exact_match=False, ) - def binarize_with_mask(txt): - toks = binarize( - prefix + leading_space + txt + trailing_space + suffix, - append_eos=True, - ) - mask = torch.zeros_like(toks, dtype=torch.uint8) - mask_start = len(binarize(prefix)) - mask_size = len(binarize(leading_space + txt)) - mask[mask_start:mask_start + mask_size] = 1 - return toks, mask - if query is not None: - query_toks, query_mask = binarize_with_mask(query) + query_toks, query_mask = self.binarize_with_mask( + query, prefix, suffix, leading_space, trailing_space + ) query_len = len(query_toks) else: query_toks, query_mask, query_len = None, None, 0 @@ -149,7 +151,9 @@ def binarize_with_mask(txt): cand_toks, cand_masks = [], [] for cand_span in cand_spans: - toks, mask = binarize_with_mask(cand_span.text) + toks, mask = self.binarize_with_mask( + cand_span.text, prefix, suffix, leading_space, trailing_space, + ) cand_toks.append(toks) cand_masks.append(mask) @@ -258,3 +262,114 @@ def source_dictionary(self): @property def target_dictionary(self): return self.vocab + + +@register_task('winogrande') +class WinograndeTask(WSCTask): + """ + Task for WinoGrande dataset. Efficient implementation for Winograd schema + tasks with exactly two candidates, one of which is correct. + """ + @classmethod + def setup_task(cls, args, **kwargs): + assert args.criterion == 'winogrande', 'Must set --criterion=winogrande' + + # load data and label dictionaries + vocab = cls.load_dictionary(os.path.join(args.data, 'dict.txt')) + print('| dictionary: {} types'.format(len(vocab))) + + return cls(args, vocab) + + + def load_dataset(self, split, epoch=0, combine=False, data_path=None, return_only=False, **kwargs): + """Load a given dataset split. 
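+
+        Each example is binarized twice: once with the query option filled
+        into the blank and once with the alternative candidate (during
+        training the query is the correct option). A mask marks the
+        filled-in span so the criterion can compare the two candidates'
+        log-probabilities.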
+ + Args: + split (str): name of the split (e.g., train, valid, test) + """ + if data_path is None: + data_path = os.path.join(self.args.data, split + '.jsonl') + if not os.path.exists(data_path): + raise FileNotFoundError('Cannot find data: {}'.format(data_path)) + + query_tokens = [] + query_masks = [] + query_lengths = [] + candidate_tokens = [] + candidate_masks = [] + candidate_lengths = [] + + itr = wsc_utils.winogrande_jsonl_iterator(data_path, eval=split=='test') + + for sample in itr: + sentence, pronoun_span, query, cand_text = sample + prefix = sentence[:pronoun_span[0]].rstrip() + suffix = sentence[pronoun_span[1]:] + + leading_space = ' ' if sentence[:pronoun_span[0]].endswith(' ') else '' + trailing_space = '' + + if query is not None: + query_toks, query_mask = self.binarize_with_mask( + query, prefix, suffix, leading_space, trailing_space, + ) + query_len = len(query_toks) + else: + query_toks, query_mask, query_len = None, None, 0 + + query_tokens.append(query_toks) + query_masks.append(query_mask) + query_lengths.append(query_len) + + cand_toks, cand_mask = self.binarize_with_mask( + cand_text, prefix, suffix, leading_space, trailing_space, + ) + + candidate_tokens.append(cand_toks) + candidate_masks.append(cand_mask) + candidate_lengths.append(cand_toks.size(0)) + + query_lengths = np.array(query_lengths) + + def get_pad_dataset_fn(tokens, length, pad_idx): + return PadDataset( + ListDataset(tokens, length), + pad_idx=pad_idx, + left_pad=False, + ) + + query_tokens = get_pad_dataset_fn(query_tokens, query_lengths, self.vocab.pad()) + query_masks = get_pad_dataset_fn(query_masks, query_lengths, 0) + + candidate_lengths = np.array(candidate_lengths) + candidate_tokens = get_pad_dataset_fn(candidate_tokens, candidate_lengths, self.vocab.pad()) + candidate_masks = get_pad_dataset_fn(candidate_masks, candidate_lengths, 0) + + dataset = { + 'id': IdDataset(), + 'query_tokens': query_tokens, + 'query_masks': query_masks, + 'candidate_tokens': candidate_tokens, + 'candidate_masks': candidate_masks, + 'nsentences': NumSamplesDataset(), + 'ntokens': NumelDataset(query_tokens, reduce=True), + } + + nested_dataset = NestedDictionaryDataset( + dataset, + sizes=[query_lengths], + ) + + with data_utils.numpy_seed(self.args.seed): + shuffle = np.random.permutation(len(query_tokens)) + dataset = SortDataset( + nested_dataset, + # shuffle + sort_order=[shuffle], + ) + + if return_only: + return dataset + + self.datasets[split] = dataset + return self.datasets[split] diff --git a/examples/roberta/wsc/wsc_utils.py b/examples/roberta/wsc/wsc_utils.py index ef388665fd..2d4822479e 100644 --- a/examples/roberta/wsc/wsc_utils.py +++ b/examples/roberta/wsc/wsc_utils.py @@ -190,6 +190,23 @@ def strip_pronoun(x): yield sentence, pronoun_span, query, sample.get('label', None) +def winogrande_jsonl_iterator(input_fname, eval=False): + with open(input_fname) as fin: + for line in fin: + sample = json.loads(line.strip()) + sentence, option1, option2 = sample['sentence'], sample['option1'],\ + sample['option2'] + + pronoun_span = (sentence.index('_'), sentence.index('_') + 1) + + if eval: + query, cand = option1, option2 + else: + query = option1 if sample['answer'] == '1' else option2 + cand = option2 if sample['answer'] == '1' else option1 + yield sentence, pronoun_span, query, cand + + def filter_noun_chunks(chunks, exclude_pronouns=False, exclude_query=None, exact_match=False): if exclude_pronouns: chunks = [ From ac66df47b5394e730aa05efa50ed7ec6103388bb Mon Sep 17 00:00:00 2001 From: Myle Ott Date: 
Thu, 15 Aug 2019 09:45:46 -0700 Subject: [PATCH 096/213] Update README Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/826 Differential Revision: D16830402 Pulled By: myleott fbshipit-source-id: 25afaa6d9de7b51cc884e3f417c8e6b349f5a7bc --- examples/roberta/README.md | 50 ++++++--- examples/roberta/README.pretraining.md | 2 +- examples/scaling_nmt/README.md | 36 ++++--- examples/translation/README.md | 141 ++++++++++++------------- 4 files changed, 129 insertions(+), 100 deletions(-) diff --git a/examples/roberta/README.md b/examples/roberta/README.md index 022ea0e3c1..15119a345a 100644 --- a/examples/roberta/README.md +++ b/examples/roberta/README.md @@ -2,7 +2,7 @@ https://arxiv.org/abs/1907.11692 -### Introduction +## Introduction RoBERTa iterates on BERT's pretraining procedure, including training the model longer, with bigger batches over more data; removing the next sentence prediction objective; training on longer sequences; and dynamically changing the masking pattern applied to the training data. See the associated paper for more details. @@ -10,7 +10,7 @@ RoBERTa iterates on BERT's pretraining procedure, including training the model l - August 2019: Added [tutorial for pretraining RoBERTa using your own data](README.pretraining.md). -### Pre-trained models +## Pre-trained models Model | Description | # params | Download ---|---|---|--- @@ -19,9 +19,10 @@ Model | Description | # params | Download `roberta.large.mnli` | `roberta.large` finetuned on [MNLI](http://www.nyu.edu/projects/bowman/multinli) | 355M | [roberta.large.mnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.mnli.tar.gz) `roberta.large.wsc` | `roberta.large` finetuned on [WSC](wsc/README.md) | 355M | [roberta.large.wsc.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.wsc.tar.gz) -### Results +## Results -##### Results on GLUE tasks (dev set, single model, single-task finetuning) +**[GLUE (Wang et al., 2019)](https://gluebenchmark.com/)** +_(dev set, single model, single-task finetuning)_ Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B ---|---|---|---|---|---|---|---|--- @@ -29,26 +30,51 @@ Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B `roberta.large` | 90.2 | 94.7 | 92.2 | 86.6 | 96.4 | 90.9 | 68.0 | 92.4 `roberta.large.mnli` | 90.2 | - | - | - | - | - | - | - -##### Results on SuperGLUE tasks (dev set, single model, single-task finetuning) +**[SuperGLUE (Wang et al., 2019)](https://super.gluebenchmark.com/)** +_(dev set, single model, single-task finetuning)_ Model | BoolQ | CB | COPA | MultiRC | RTE | WiC | WSC ---|---|---|---|---|---|---|--- `roberta.large` | 86.9 | 98.2 | 94.0 | 85.7 | 89.5 | 75.6 | - `roberta.large.wsc` | - | - | - | - | - | - | 91.3 -##### Results on SQuAD (dev set) +**[SQuAD (Rajpurkar et al., 2018)](https://rajpurkar.github.io/SQuAD-explorer/)** +_(dev set, no additional data used)_ Model | SQuAD 1.1 EM/F1 | SQuAD 2.0 EM/F1 ---|---|--- `roberta.large` | 88.9/94.6 | 86.5/89.4 -##### Results on Reading Comprehension (RACE, test set) +**[RACE (Lai et al., 2017)](http://www.qizhexie.com/data/RACE_leaderboard.html)** +_(test set)_ Model | Accuracy | Middle | High ---|---|---|--- `roberta.large` | 83.2 | 86.5 | 81.3 -### Example usage +**[HellaSwag (Zellers et al., 2019)](https://rowanzellers.com/hellaswag/)** +_(test set)_ + +Model | Overall | In-domain | Zero-shot | ActivityNet | WikiHow +---|---|---|---|---|--- +`roberta.large` | 85.2 | 87.3 | 83.1 | 74.6 | 90.9 + +**[Commonsense QA (Talmor et 
al., 2019)](https://www.tau-nlp.org/commonsenseqa)** +_(test set)_ + +Model | Accuracy +---|--- +`roberta.large` (single model) | 72.1 +`roberta.large` (ensemble) | 72.5 + +**[Winogrande (Sakaguchi et al., 2019)](https://arxiv.org/abs/1907.10641)** +_(test set)_ + +Model | Accuracy +---|--- +`roberta.large` | 78.1 + +## Example usage ##### Load RoBERTa from torch.hub (PyTorch >= 1.1): ```python @@ -124,7 +150,7 @@ roberta.cuda() roberta.predict('new_task', tokens) # tensor([[-1.1050, -1.0672, -1.1245]], device='cuda:0', grad_fn=) ``` -### Advanced usage +## Advanced usage #### Filling masks: @@ -216,7 +242,7 @@ print('| Accuracy: ', float(ncorrect)/float(nsamples)) # Expected output: 0.9060 ``` -### Finetuning +## Finetuning - [Finetuning on GLUE](README.glue.md) - [Finetuning on custom classification tasks (e.g., IMDB)](README.custom_classification.md) @@ -224,11 +250,11 @@ print('| Accuracy: ', float(ncorrect)/float(nsamples)) - [Finetuning on Commonsense QA (CQA)](commonsense_qa/README.md) - Finetuning on SQuAD: coming soon -### Pretraining using your own data +## Pretraining using your own data See the [tutorial for pretraining RoBERTa using your own data](README.pretraining.md). -### Citation +## Citation ```bibtex @article{liu2019roberta, diff --git a/examples/roberta/README.pretraining.md b/examples/roberta/README.pretraining.md index 843d7ce377..0e82bc93fb 100644 --- a/examples/roberta/README.pretraining.md +++ b/examples/roberta/README.pretraining.md @@ -2,7 +2,7 @@ This tutorial will walk you through pretraining RoBERTa over your own data. -### 1) Preprocess the data. +### 1) Preprocess the data Data should be preprocessed following the [language modeling format](/examples/language_model). diff --git a/examples/scaling_nmt/README.md b/examples/scaling_nmt/README.md index 1e47917baf..a1d40ea623 100644 --- a/examples/scaling_nmt/README.md +++ b/examples/scaling_nmt/README.md @@ -11,45 +11,57 @@ Model | Description | Dataset | Download ## Training a new model on WMT'16 En-De -Please first download the [preprocessed WMT'16 En-De data provided by Google](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8). +First download the [preprocessed WMT'16 En-De data provided by Google](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8). + Then: -1. Extract the WMT'16 En-De data: +##### 1. Extract the WMT'16 En-De data ```bash TEXT=wmt16_en_de_bpe32k mkdir -p $TEXT tar -xzvf wmt16_en_de.tar.gz -C $TEXT ``` -2. Preprocess the dataset with a joined dictionary: +##### 2. Preprocess the dataset with a joined dictionary ```bash -fairseq-preprocess --source-lang en --target-lang de \ +fairseq-preprocess \ + --source-lang en --target-lang de \ --trainpref $TEXT/train.tok.clean.bpe.32000 \ --validpref $TEXT/newstest2013.tok.bpe.32000 \ --testpref $TEXT/newstest2014.tok.bpe.32000 \ --destdir data-bin/wmt16_en_de_bpe32k \ --nwordssrc 32768 --nwordstgt 32768 \ - --joined-dictionary + --joined-dictionary \ + --workers 20 ``` -3. Train a model: +##### 3. 
Train a model ```bash -fairseq-train data-bin/wmt16_en_de_bpe32k \ +fairseq-train \ + data-bin/wmt16_en_de_bpe32k \ --arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \ --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ - --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 \ - --lr 0.0005 --min-lr 1e-09 \ - --dropout 0.3 --weight-decay 0.0 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ + --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \ + --dropout 0.3 --weight-decay 0.0 \ + --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --max-tokens 3584 \ --fp16 ``` -Note that the `--fp16` flag requires you have CUDA 9.1 or greater and a Volta GPU. +Note that the `--fp16` flag requires you have CUDA 9.1 or greater and a Volta GPU or newer. If you want to train the above model with big batches (assuming your machine has 8 GPUs): -- add `--update-freq 16` to simulate training on 8*16=128 GPUs +- add `--update-freq 16` to simulate training on 8x16=128 GPUs - increase the learning rate; 0.001 works well for big batches +##### 4. Evaluate +```bash +fairseq-generate \ + data-bin/wmt16_en_de_bpe32k \ + --path checkpoints/checkpoint_best.pt \ + --beam 4 --lenpen 0.6 --remove-bpe +``` + ## Citation ```bibtex diff --git a/examples/translation/README.md b/examples/translation/README.md index a43f0af1ad..b93115147a 100644 --- a/examples/translation/README.md +++ b/examples/translation/README.md @@ -1,5 +1,8 @@ # Neural Machine Translation +This README contains instructions for [using pretrained translation models](#example-usage-torchhub) +as well as [training new models](#training-a-new-model). + ## Pre-trained models Model | Description | Dataset | Download @@ -56,132 +59,119 @@ fairseq-score --sys /tmp/gen.out.sys --ref /tmp/gen.out.ref # BLEU4 = 40.83, 67.5/46.9/34.4/25.5 (BP=1.000, ratio=1.006, syslen=83262, reflen=82787) ``` -## Preprocessing - -These scripts provide an example of pre-processing data for the NMT task. +## Training a new model -### prepare-iwslt14.sh +### IWSLT'14 German to English (Transformer) -Provides an example of pre-processing for IWSLT'14 German to English translation task: ["Report on the 11th IWSLT evaluation campaign" by Cettolo et al.](http://workshop2014.iwslt.org/downloads/proceeding.pdf) +The following instructions can be used to train a Transformer model on the [IWSLT'14 German to English dataset](http://workshop2014.iwslt.org/downloads/proceeding.pdf). -Example usage: +First download and preprocess the data: ```bash +# Download and prepare the data cd examples/translation/ bash prepare-iwslt14.sh cd ../.. 
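# (Editorial sketch, not part of the original instructions.) Optionally sanity-check
# the prepared corpus before binarizing; prepare-iwslt14.sh is assumed to write its
# output to iwslt14.tokenized.de-en, matching the $TEXT path used below, with the
# usual fairseq train.de/train.en naming for the source/target sides.
wc -l examples/translation/iwslt14.tokenized.de-en/train.{de,en}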
-# Binarize the dataset: +# Preprocess/binarize the data TEXT=examples/translation/iwslt14.tokenized.de-en fairseq-preprocess --source-lang de --target-lang en \ --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \ - --destdir data-bin/iwslt14.tokenized.de-en + --destdir data-bin/iwslt14.tokenized.de-en \ + --workers 20 +``` -# Train the model (better for a single GPU setup): -mkdir -p checkpoints/fconv -CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/iwslt14.tokenized.de-en \ - --lr 0.25 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 \ +Next we'll train a Transformer translation model over this data: +```bash +CUDA_VISIBLE_DEVICES=0 fairseq-train \ + data-bin/iwslt14.tokenized.de-en \ + --arch transformer_iwslt_de_en --share-decoder-input-output-embed \ + --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ + --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \ + --dropout 0.3 --weight-decay 0.0001 \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ - --lr-scheduler fixed --force-anneal 200 \ - --arch fconv_iwslt_de_en --save-dir checkpoints/fconv - -# Generate: -fairseq-generate data-bin/iwslt14.tokenized.de-en \ - --path checkpoints/fconv/checkpoint_best.pt \ - --batch-size 128 --beam 5 --remove-bpe - + --max-tokens 4096 ``` -To train transformer model on IWSLT'14 German to English: +Finally we can evaluate our trained model: ```bash -# Preparation steps are the same as for fconv model. - -# Train the model (better for a single GPU setup): -mkdir -p checkpoints/transformer -CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/iwslt14.tokenized.de-en \ - -a transformer_iwslt_de_en --optimizer adam --lr 0.0005 -s de -t en \ - --label-smoothing 0.1 --dropout 0.3 --max-tokens 4000 \ - --min-lr '1e-09' --lr-scheduler inverse_sqrt --weight-decay 0.0001 \ - --criterion label_smoothed_cross_entropy --max-update 50000 \ - --warmup-updates 4000 --warmup-init-lr '1e-07' \ - --adam-betas '(0.9, 0.98)' --save-dir checkpoints/transformer - -# Average 10 latest checkpoints: -python scripts/average_checkpoints.py --inputs checkpoints/transformer \ - --num-epoch-checkpoints 10 --output checkpoints/transformer/model.pt - -# Generate: fairseq-generate data-bin/iwslt14.tokenized.de-en \ - --path checkpoints/transformer/model.pt \ + --path checkpoints/checkpoint_best.pt \ --batch-size 128 --beam 5 --remove-bpe ``` -### prepare-wmt14en2de.sh - -The WMT English to German dataset can be preprocessed using the `prepare-wmt14en2de.sh` script. -By default it will produce a dataset that was modeled after ["Attention Is All You Need" (Vaswani et al., 2017)](https://arxiv.org/abs/1706.03762), but with news-commentary-v12 data from WMT'17. +### WMT'14 English to German (Convolutional) -To use only data available in WMT'14 or to replicate results obtained in the original ["Convolutional Sequence to Sequence Learning" (Gehring et al., 2017)](https://arxiv.org/abs/1705.03122) paper, please use the `--icml17` option. +The following instructions can be used to train a Convolutional translation model on the WMT English to German dataset. +See the [Scaling NMT README](../scaling_nmt/README.md) for instructions to train a Transformer translation model on this data. -```bash -bash prepare-wmt14en2de.sh --icml17 -``` +The WMT English to German dataset can be preprocessed using the `prepare-wmt14en2de.sh` script. 
+By default it will produce a dataset that was modeled after [Attention Is All You Need (Vaswani et al., 2017)](https://arxiv.org/abs/1706.03762), but with additional news-commentary-v12 data from WMT'17. -Example usage: +To use only data available in WMT'14 or to replicate results obtained in the original [Convolutional Sequence to Sequence Learning (Gehring et al., 2017)](https://arxiv.org/abs/1705.03122) paper, please use the `--icml17` option. ```bash +# Download and prepare the data cd examples/translation/ +# WMT'17 data: bash prepare-wmt14en2de.sh +# or to use WMT'14 data: +# bash prepare-wmt14en2de.sh --icml17 cd ../.. -# Binarize the dataset: +# Binarize the dataset TEXT=examples/translation/wmt17_en_de -fairseq-preprocess --source-lang en --target-lang de \ +fairseq-preprocess \ + --source-lang en --target-lang de \ --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \ - --destdir data-bin/wmt17_en_de --thresholdtgt 0 --thresholdsrc 0 + --destdir data-bin/wmt17_en_de --thresholdtgt 0 --thresholdsrc 0 \ + --workers 20 -# Train the model: -# If it runs out of memory, try to set --max-tokens 1500 instead +# Train the model mkdir -p checkpoints/fconv_wmt_en_de -fairseq-train data-bin/wmt17_en_de \ +fairseq-train \ + data-bin/wmt17_en_de \ + --arch fconv_wmt_en_de \ --lr 0.5 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --lr-scheduler fixed --force-anneal 50 \ - --arch fconv_wmt_en_de --save-dir checkpoints/fconv_wmt_en_de + --save-dir checkpoints/fconv_wmt_en_de -# Generate: +# Evaluate fairseq-generate data-bin/wmt17_en_de \ - --path checkpoints/fconv_wmt_en_de/checkpoint_best.pt --beam 5 --remove-bpe + --path checkpoints/fconv_wmt_en_de/checkpoint_best.pt \ + --beam 5 --remove-bpe ``` -### prepare-wmt14en2fr.sh - -Provides an example of pre-processing for the WMT'14 English to French translation task. - -Example usage: - +### WMT'14 English to French ```bash +# Download and prepare the data cd examples/translation/ bash prepare-wmt14en2fr.sh cd ../.. 
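# (Editorial note, not part of the original instructions.) The fairseq-preprocess
# call below passes --workers 60, which assumes a machine with many CPU cores;
# check how many are available first and lower the value if needed.
nproc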
-# Binarize the dataset: +# Binarize the dataset TEXT=examples/translation/wmt14_en_fr -fairseq-preprocess --source-lang en --target-lang fr \ +fairseq-preprocess \ + --source-lang en --target-lang fr \ --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \ - --destdir data-bin/wmt14_en_fr --thresholdtgt 0 --thresholdsrc 0 + --destdir data-bin/wmt14_en_fr --thresholdtgt 0 --thresholdsrc 0 \ + --workers 60 -# Train the model: -# If it runs out of memory, try to set --max-tokens 1000 instead +# Train the model mkdir -p checkpoints/fconv_wmt_en_fr -fairseq-train data-bin/wmt14_en_fr \ +fairseq-train \ + data-bin/wmt14_en_fr \ --lr 0.5 --clip-norm 0.1 --dropout 0.1 --max-tokens 3000 \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --lr-scheduler fixed --force-anneal 50 \ - --arch fconv_wmt_en_fr --save-dir checkpoints/fconv_wmt_en_fr - -# Generate: -fairseq-generate data-bin/fconv_wmt_en_fr \ - --path checkpoints/fconv_wmt_en_fr/checkpoint_best.pt --beam 5 --remove-bpe + --arch fconv_wmt_en_fr \ + --save-dir checkpoints/fconv_wmt_en_fr + +# Evaluate +fairseq-generate \ + data-bin/fconv_wmt_en_fr \ + --path checkpoints/fconv_wmt_en_fr/checkpoint_best.pt \ + --beam 5 --remove-bpe ``` ## Multilingual Translation @@ -253,7 +243,8 @@ grep ^H iwslt17.test.${SRC}-en.en.sys | cut -f3 \ | sacrebleu --test-set iwslt17 --language-pair ${SRC}-en ``` -### Argument format during inference +##### Argument format during inference + During inference it is required to specify a single `--source-lang` and `--target-lang`, which indicates the inference langauge direction. `--lang-pairs`, `--encoder-langtok`, `--decoder-langtok` have to be set to From 49177c99c45f7d6e99a8f1500d16396e2d7b4519 Mon Sep 17 00:00:00 2001 From: Nathan Ng Date: Thu, 15 Aug 2019 10:03:44 -0700 Subject: [PATCH 097/213] Backward reranking public (#667) Summary: Implementation of noisy channel model reranking for release with paper Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/667 Reviewed By: michaelauli Differential Revision: D15901665 Pulled By: nng555 fbshipit-source-id: 2de2c518be8e5828ffad72db3e741b0940623373 --- .gitignore | 3 + eval_lm.py | 6 +- examples/__init__.py | 10 + examples/noisychannel/README.md | 72 +++ examples/noisychannel/__init__.py | 8 + examples/noisychannel/rerank.py | 283 ++++++++++ examples/noisychannel/rerank_generate.py | 246 +++++++++ examples/noisychannel/rerank_options.py | 128 +++++ examples/noisychannel/rerank_score_bw.py | 95 ++++ examples/noisychannel/rerank_score_lm.py | 48 ++ examples/noisychannel/rerank_tune.py | 85 +++ examples/noisychannel/rerank_utils.py | 646 +++++++++++++++++++++++ fairseq/models/transformer.py | 2 +- 13 files changed, 1629 insertions(+), 3 deletions(-) create mode 100644 examples/__init__.py create mode 100644 examples/noisychannel/README.md create mode 100644 examples/noisychannel/__init__.py create mode 100644 examples/noisychannel/rerank.py create mode 100644 examples/noisychannel/rerank_generate.py create mode 100644 examples/noisychannel/rerank_options.py create mode 100644 examples/noisychannel/rerank_score_bw.py create mode 100644 examples/noisychannel/rerank_score_lm.py create mode 100644 examples/noisychannel/rerank_tune.py create mode 100644 examples/noisychannel/rerank_utils.py diff --git a/.gitignore b/.gitignore index 7e4a2d4128..84ae18d953 100644 --- a/.gitignore +++ b/.gitignore @@ -116,3 +116,6 @@ fairseq/modules/*_layer/*_backward.cu # data data-bin/ + +# reranking +examples/reranking/rerank_data diff 
--git a/eval_lm.py b/eval_lm.py index e2da64fc1d..febed5ac8b 100644 --- a/eval_lm.py +++ b/eval_lm.py @@ -146,8 +146,9 @@ def main(parsed_args): hypos = scorer.generate(models, sample) gen_timer.stop(sample['ntokens']) - for hypos_i in hypos: + for i, hypos_i in enumerate(hypos): hypo = hypos_i[0] + sample_id = sample['id'][i] tokens = hypo['tokens'] tgt_len = tokens.numel() @@ -199,7 +200,8 @@ def main(parsed_args): is_bpe = False w = '' if args.output_word_probs: - print('\t'.join('{} [{:2f}]'.format(x[0], x[1]) for x in word_prob)) + print(str(int(sample_id)) + " " + + ('\t'.join('{} [{:2f}]'.format(x[0], x[1]) for x in word_prob))) wps_meter.update(sample['ntokens']) t.log({'wps': round(wps_meter.avg)}) diff --git a/examples/__init__.py b/examples/__init__.py new file mode 100644 index 0000000000..906098c1e3 --- /dev/null +++ b/examples/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +__version__ = '0.7.2' + +import examples.noisychannel # noqa diff --git a/examples/noisychannel/README.md b/examples/noisychannel/README.md new file mode 100644 index 0000000000..a5dd0b9de3 --- /dev/null +++ b/examples/noisychannel/README.md @@ -0,0 +1,72 @@ +# Simple and Effective Noisy Channel Modeling for Neural Machine Translation (Yee et al., 2019) +This page contains pointers to pre-trained models as well as instructions on how to run the reranking scripts. + +## Citation: +```bibtex +@inproceedings{yee2018simple, + title = {Simple and Effective Noisy Channel Modeling for Neural Machine Translation}, + author = {Kyra Yee and Yann Dauphin and Michael Auli}, + booktitle = {Conference on Empirical Methods in Natural Language Processing}, + year = {2019}, +} +``` + +## Pre-trained Models: + +Model | Description | Download +---|---|--- +`transformer.noisychannel.de-en` | De->En Forward Model | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/noisychannel/forward_de2en.tar.bz2) +`transformer.noisychannel.en-de` | En->De Channel Model | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/noisychannel/backward_en2de.tar.bz2) +`transformer_lm.noisychannel.en` | En Language model | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/noisychannel/reranking_en_lm.tar.bz2) + +Test Data: [newstest_wmt17](https://dl.fbaipublicfiles.com/fairseq/models/noisychannel/wmt17test.tar.bz2) + +## Example usage + +``` +mkdir rerank_example +curl https://dl.fbaipublicfiles.com/fairseq/models/noisychannel/forward_de2en.tar.bz2 | tar xvjf - -C rerank_example +curl https://dl.fbaipublicfiles.com/fairseq/models/noisychannel/backward_en2de.tar.bz2 | tar xvjf - -C rerank_example +curl https://dl.fbaipublicfiles.com/fairseq/models/noisychannel/reranking_en_lm.tar.bz2 | tar xvjf - -C rerank_example +curl https://dl.fbaipublicfiles.com/fairseq/models/noisychannel/wmt17test.tar.bz2 | tar xvjf - -C rerank_example + +beam=50 +num_trials=1000 +fw_name=fw_model_ex +bw_name=bw_model_ex +lm_name=lm_ex +data_dir=rerank_example/hyphen-splitting-mixed-case-wmt17test-wmt14bpe +data_dir_name=wmt17 +lm=rerank_example/lm/checkpoint_best.pt +lm_bpe_code=rerank_example/lm/bpe32k.code +lm_dict=rerank_example/lm/dict.txt +batch_size=32 +bw=rerank_example/backward_en2de.pt +fw=rerank_example/forward_de2en.pt + +# reranking with P(T|S) 
P(S|T) and P(T) +python examples/noisychannel/rerank_tune.py $data_dir --tune-param lenpen weight1 weight3 \ + --lower-bound 0 0 0 --upper-bound 3 3 3 --data-dir-name $data_dir_name \ + --num-trials $num_trials --source-lang de --target-lang en --gen-model $fw \ + -n $beam --batch-size $batch_size --score-model2 $fw --score-model1 $bw \ + --backwards1 --weight2 1 \ + -lm $lm --lm-dict $lm_dict --lm-name en_newscrawl --lm-bpe-code $lm_bpe_code \ + --model2-name $fw_name --model1-name $bw_name --gen-model-name $fw_name + +# reranking with P(T|S) and P(T) +python examples/noisychannel/rerank_tune.py $data_dir --tune-param lenpen weight3 \ + --lower-bound 0 0 --upper-bound 3 3 --data-dir-name $data_dir_name \ + --num-trials $num_trials --source-lang de --target-lang en --gen-model $fw \ + -n $beam --batch-size $batch_size --score-model1 $fw \ + -lm $lm --lm-dict $lm_dict --lm-name en_newscrawl --lm-bpe-code $lm_bpe_code \ + --model1-name $fw_name --gen-model-name $fw_name + +# to run with a preconfigured set of hyperparameters for the lenpen and model weights, using rerank.py instead. +python examples/noisychannel/rerank.py $data_dir \ + --lenpen 0.269 --weight1 1 --weight2 0.929 --weight3 0.831 \ + --data-dir-name $data_dir_name --source-lang de --target-lang en --gen-model $fw \ + -n $beam --batch-size $batch_size --score-model2 $fw --score-model1 $bw --backwards1 \ + -lm $lm --lm-dict $lm_dict --lm-name en_newscrawl --lm-bpe-code $lm_bpe_code \ + --model2-name $fw_name --model1-name $bw_name --gen-model-name $fw_name +``` + diff --git a/examples/noisychannel/__init__.py b/examples/noisychannel/__init__.py new file mode 100644 index 0000000000..b10ddbd812 --- /dev/null +++ b/examples/noisychannel/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. 
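# (Editorial sketch, not part of this patch.) The weights tuned in the README
# commands above map onto the combination computed later in rerank_utils.get_score:
# weight1 scales the channel model score log P(S|T), weight2 the direct model score
# log P(T|S), weight3 the language model score log P(T), and the weighted sum is
# divided by a length penalty. The numeric scores in the usage comment are made-up
# placeholders; only the hyperparameter values come from the README example.
def combined_score(channel_score, forward_score, lm_score,
                   weight1, weight2, weight3, target_len, lenpen):
    # weighted sum of the three log-probabilities, then length normalization
    score = weight1 * channel_score + weight2 * forward_score + weight3 * lm_score
    return score / (target_len ** lenpen)

# e.g., with the preconfigured values from the README above:
# combined_score(-12.3, -10.7, -25.4, 1.0, 0.929, 0.831, target_len=18, lenpen=0.269)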
+ +from .rerank_options import * diff --git a/examples/noisychannel/rerank.py b/examples/noisychannel/rerank.py new file mode 100644 index 0000000000..c17d64b4a1 --- /dev/null +++ b/examples/noisychannel/rerank.py @@ -0,0 +1,283 @@ +import rerank_utils +import rerank_generate +import rerank_score_bw +import rerank_score_lm +from fairseq import bleu, options +from fairseq.data import dictionary +from examples.noisychannel import rerank_options +from multiprocessing import Pool + +import math +import numpy as np + + +def score_target_hypo(args, a, b, c, lenpen, target_outfile, hypo_outfile, write_hypos, normalize): + + print("lenpen", lenpen, "weight1", a, "weight2", b, "weight3", c) + gen_output_lst, bitext1_lst, bitext2_lst, lm_res_lst = load_score_files(args) + dict = dictionary.Dictionary() + scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk()) + + ordered_hypos = {} + ordered_targets = {} + + for shard_id in range(len(bitext1_lst)): + bitext1 = bitext1_lst[shard_id] + bitext2 = bitext2_lst[shard_id] + gen_output = gen_output_lst[shard_id] + lm_res = lm_res_lst[shard_id] + + total = len(bitext1.rescore_source.keys()) + source_lst = [] + hypo_lst = [] + score_lst = [] + reference_lst = [] + j = 1 + best_score = -math.inf + + for i in range(total): + # length is measured in terms of words, not bpe tokens, since models may not share the same bpe + target_len = len(bitext1.rescore_hypo[i].split()) + + if lm_res is not None: + lm_score = lm_res.score[i] + else: + lm_score = 0 + + if bitext2 is not None: + bitext2_score = bitext2.rescore_score[i] + bitext2_backwards = bitext2.backwards + else: + bitext2_score = None + bitext2_backwards = None + + score = rerank_utils.get_score(a, b, c, target_len, + bitext1.rescore_score[i], bitext2_score, lm_score=lm_score, + lenpen=lenpen, src_len=bitext1.source_lengths[i], + tgt_len=bitext1.target_lengths[i], bitext1_backwards=bitext1.backwards, + bitext2_backwards=bitext2_backwards, normalize=normalize) + + if score > best_score: + best_score = score + best_hypo = bitext1.rescore_hypo[i] + + if j == gen_output.num_hypos[i] or j == args.num_rescore: + j = 1 + hypo_lst.append(best_hypo) + score_lst.append(best_score) + source_lst.append(bitext1.rescore_source[i]) + reference_lst.append(bitext1.rescore_target[i]) + + best_score = -math.inf + best_hypo = "" + else: + j += 1 + + gen_keys = list(sorted(gen_output.no_bpe_target.keys())) + + for key in range(len(gen_keys)): + if args.prefix_len is None: + assert hypo_lst[key] in gen_output.no_bpe_hypo[gen_keys[key]], \ + ("pred and rescore hypo mismatch: i: " + str(key) + ", " + str(hypo_lst[key]) + str(gen_keys[key]) + + str(gen_output.no_bpe_hypo[key])) + sys_tok = dict.encode_line(hypo_lst[key]) + ref_tok = dict.encode_line(gen_output.no_bpe_target[gen_keys[key]]) + scorer.add(ref_tok, sys_tok) + + else: + full_hypo = rerank_utils.get_full_from_prefix(hypo_lst[key], gen_output.no_bpe_hypo[gen_keys[key]]) + sys_tok = dict.encode_line(full_hypo) + ref_tok = dict.encode_line(gen_output.no_bpe_target[gen_keys[key]]) + scorer.add(ref_tok, sys_tok) + + # if only one set of hyper parameters is provided, write the predictions to a file + if write_hypos: + # recover the orinal ids from n best list generation + for key in range(len(gen_output.no_bpe_target)): + if args.prefix_len is None: + assert hypo_lst[key] in gen_output.no_bpe_hypo[gen_keys[key]], \ + "pred and rescore hypo mismatch:"+"i:"+str(key)+str(hypo_lst[key]) + str(gen_output.no_bpe_hypo[key]) + ordered_hypos[gen_keys[key]] = hypo_lst[key] + 
ordered_targets[gen_keys[key]] = gen_output.no_bpe_target[gen_keys[key]] + + else: + full_hypo = rerank_utils.get_full_from_prefix(hypo_lst[key], gen_output.no_bpe_hypo[gen_keys[key]]) + ordered_hypos[gen_keys[key]] = full_hypo + ordered_targets[gen_keys[key]] = gen_output.no_bpe_target[gen_keys[key]] + + # write the hypos in the original order from nbest list generation + if args.num_shards == (len(bitext1_lst)): + with open(target_outfile, 'w') as t: + with open(hypo_outfile, 'w') as h: + for key in range(len(ordered_hypos)): + t.write(ordered_targets[key]) + h.write(ordered_hypos[key]) + + res = scorer.result_string(4) + if write_hypos: + print(res) + score = rerank_utils.parse_bleu_scoring(res) + return score + + +def match_target_hypo(args, target_outfile, hypo_outfile): + """combine scores from the LM and bitext models, and write the top scoring hypothesis to a file""" + if len(args.weight1) == 1: + res = score_target_hypo(args, args.weight1[0], args.weight2[0], + args.weight3[0], args.lenpen[0], target_outfile, + hypo_outfile, True, args.normalize) + rerank_scores = [res] + else: + print("launching pool") + with Pool(32) as p: + rerank_scores = p.starmap(score_target_hypo, + [(args, args.weight1[i], args.weight2[i], args.weight3[i], + args.lenpen[i], target_outfile, hypo_outfile, + False, args.normalize) for i in range(len(args.weight1))]) + + if len(rerank_scores) > 1: + best_index = np.argmax(rerank_scores) + best_score = rerank_scores[best_index] + print("best score", best_score) + print("best lenpen", args.lenpen[best_index]) + print("best weight1", args.weight1[best_index]) + print("best weight2", args.weight2[best_index]) + print("best weight3", args.weight3[best_index]) + return args.lenpen[best_index], args.weight1[best_index], \ + args.weight2[best_index], args.weight3[best_index], best_score + + else: + return args.lenpen[0], args.weight1[0], args.weight2[0], args.weight3[0], rerank_scores[0] + + +def load_score_files(args): + if args.all_shards: + shard_ids = list(range(args.num_shards)) + else: + shard_ids = [args.shard_id] + + gen_output_lst = [] + bitext1_lst = [] + bitext2_lst = [] + lm_res1_lst = [] + + for shard_id in shard_ids: + using_nbest = args.nbest_list is not None + pre_gen, left_to_right_preprocessed_dir, right_to_left_preprocessed_dir, \ + backwards_preprocessed_dir, lm_preprocessed_dir = \ + rerank_utils.get_directories(args.data_dir_name, args.num_rescore, args.gen_subset, + args.gen_model_name, shard_id, args.num_shards, args.sampling, + args.prefix_len, args.target_prefix_frac, args.source_prefix_frac) + + rerank1_is_gen = args.gen_model == args.score_model1 and args.source_prefix_frac is None + rerank2_is_gen = args.gen_model == args.score_model2 and args.source_prefix_frac is None + + score1_file = rerank_utils.rescore_file_name(pre_gen, args.prefix_len, args.model1_name, + target_prefix_frac=args.target_prefix_frac, + source_prefix_frac=args.source_prefix_frac, + backwards=args.backwards1) + if args.score_model2 is not None: + score2_file = rerank_utils.rescore_file_name(pre_gen, args.prefix_len, args.model2_name, + target_prefix_frac=args.target_prefix_frac, + source_prefix_frac=args.source_prefix_frac, + backwards=args.backwards2) + if args.language_model is not None: + lm_score_file = rerank_utils.rescore_file_name(pre_gen, args.prefix_len, args.lm_name, lm_file=True) + + # get gen output + predictions_bpe_file = pre_gen+"/generate_output_bpe.txt" + if using_nbest: + print("Using predefined n-best list from interactive.py") + predictions_bpe_file 
= args.nbest_list + gen_output = rerank_utils.BitextOutputFromGen(predictions_bpe_file, bpe_symbol=args.remove_bpe, + nbest=using_nbest, prefix_len=args.prefix_len, + target_prefix_frac=args.target_prefix_frac) + + if rerank1_is_gen: + bitext1 = gen_output + else: + bitext1 = rerank_utils.BitextOutput(score1_file, args.backwards1, args.right_to_left1, + args.remove_bpe, args.prefix_len, args.target_prefix_frac, + args.source_prefix_frac) + + if args.score_model2 is not None or args.nbest_list is not None: + if rerank2_is_gen: + bitext2 = gen_output + else: + bitext2 = rerank_utils.BitextOutput(score2_file, args.backwards2, args.right_to_left2, + args.remove_bpe, args.prefix_len, args.target_prefix_frac, + args.source_prefix_frac) + + assert bitext2.source_lengths == bitext1.source_lengths, \ + "source lengths for rescoring models do not match" + assert bitext2.target_lengths == bitext1.target_lengths, \ + "target lengths for rescoring models do not match" + else: + if args.diff_bpe: + assert args.score_model2 is None + bitext2 = gen_output + else: + bitext2 = None + + if args.language_model is not None: + lm_res1 = rerank_utils.LMOutput(lm_score_file, args.lm_dict, args.prefix_len, + args.remove_bpe, args.target_prefix_frac) + else: + lm_res1 = None + + gen_output_lst.append(gen_output) + bitext1_lst.append(bitext1) + bitext2_lst.append(bitext2) + lm_res1_lst.append(lm_res1) + return gen_output_lst, bitext1_lst, bitext2_lst, lm_res1_lst + + +def rerank(args): + if type(args.lenpen) is not list: + args.lenpen = [args.lenpen] + if type(args.weight1) is not list: + args.weight1 = [args.weight1] + if type(args.weight2) is not list: + args.weight2 = [args.weight2] + if type(args.weight3) is not list: + args.weight3 = [args.weight3] + if args.all_shards: + shard_ids = list(range(args.num_shards)) + else: + shard_ids = [args.shard_id] + + for shard_id in shard_ids: + pre_gen, left_to_right_preprocessed_dir, right_to_left_preprocessed_dir, \ + backwards_preprocessed_dir, lm_preprocessed_dir = \ + rerank_utils.get_directories(args.data_dir_name, args.num_rescore, args.gen_subset, + args.gen_model_name, shard_id, args.num_shards, args.sampling, + args.prefix_len, args.target_prefix_frac, args.source_prefix_frac) + rerank_generate.gen_and_reprocess_nbest(args) + rerank_score_bw.score_bw(args) + rerank_score_lm.score_lm(args) + + if args.write_hypos is None: + write_targets = pre_gen+"/matched_targets" + write_hypos = pre_gen+"/matched_hypos" + else: + write_targets = args.write_hypos+"_targets" + args.gen_subset + write_hypos = args.write_hypos+"_hypos" + args.gen_subset + + if args.all_shards: + write_targets += "_all_shards" + write_hypos += "_all_shards" + + best_lenpen, best_weight1, best_weight2, best_weight3, best_score = \ + match_target_hypo(args, write_targets, write_hypos) + + return best_lenpen, best_weight1, best_weight2, best_weight3, best_score + + +def cli_main(): + parser = rerank_options.get_reranking_parser() + args = options.parse_args_and_arch(parser) + rerank(args) + + +if __name__ == '__main__': + cli_main() diff --git a/examples/noisychannel/rerank_generate.py b/examples/noisychannel/rerank_generate.py new file mode 100644 index 0000000000..27dcdb5995 --- /dev/null +++ b/examples/noisychannel/rerank_generate.py @@ -0,0 +1,246 @@ +#!/usr/bin/env python3 -u +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. 
An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import rerank_utils +import os +import subprocess +from examples.noisychannel import rerank_options +from fairseq import options +import generate +import preprocess +from contextlib import redirect_stdout + +""" +Generate n-best translations using a trained model. +""" + +def gen_and_reprocess_nbest(args): + if args.score_dict_dir is None: + args.score_dict_dir = args.data + if args.prefix_len is not None: + assert args.right_to_left1 is False, "prefix length not compatible with right to left models" + assert args.right_to_left2 is False, "prefix length not compatible with right to left models" + + if args.nbest_list is not None: + assert args.score_model2 is None + + if args.backwards1: + scorer1_src = args.target_lang + scorer1_tgt = args.source_lang + else: + scorer1_src = args.source_lang + scorer1_tgt = args.target_lang + + store_data = os.path.join(os.path.dirname(__file__))+"/rerank_data/"+args.data_dir_name + if not os.path.exists(store_data): + os.makedirs(store_data) + + pre_gen, left_to_right_preprocessed_dir, right_to_left_preprocessed_dir, \ + backwards_preprocessed_dir, lm_preprocessed_dir = \ + rerank_utils.get_directories(args.data_dir_name, args.num_rescore, args.gen_subset, + args.gen_model_name, args.shard_id, args.num_shards, + args.sampling, args.prefix_len, args.target_prefix_frac, + args.source_prefix_frac) + assert not (args.right_to_left1 and args.backwards1), "backwards right to left not supported" + assert not (args.right_to_left2 and args.backwards2), "backwards right to left not supported" + assert not (args.prefix_len is not None and args.target_prefix_frac is not None), \ + "target prefix frac and target prefix len incompatible" + + # make directory to store generation results + if not os.path.exists(pre_gen): + os.makedirs(pre_gen) + + rerank1_is_gen = args.gen_model == args.score_model1 and args.source_prefix_frac is None + rerank2_is_gen = args.gen_model == args.score_model2 and args.source_prefix_frac is None + + if args.nbest_list is not None: + rerank2_is_gen = True + + # make directories to store preprossed nbest list for reranking + if not os.path.exists(left_to_right_preprocessed_dir): + os.makedirs(left_to_right_preprocessed_dir) + if not os.path.exists(right_to_left_preprocessed_dir): + os.makedirs(right_to_left_preprocessed_dir) + if not os.path.exists(lm_preprocessed_dir): + os.makedirs(lm_preprocessed_dir) + if not os.path.exists(backwards_preprocessed_dir): + os.makedirs(backwards_preprocessed_dir) + + score1_file = rerank_utils.rescore_file_name(pre_gen, args.prefix_len, args.model1_name, + target_prefix_frac=args.target_prefix_frac, + source_prefix_frac=args.source_prefix_frac, + backwards=args.backwards1) + if args.score_model2 is not None: + score2_file = rerank_utils.rescore_file_name(pre_gen, args.prefix_len, args.model2_name, + target_prefix_frac=args.target_prefix_frac, + source_prefix_frac=args.source_prefix_frac, + backwards=args.backwards2) + + predictions_bpe_file = pre_gen+"/generate_output_bpe.txt" + + using_nbest = args.nbest_list is not None + + if using_nbest: + print("Using predefined n-best list from interactive.py") + predictions_bpe_file = args.nbest_list + + else: + if not os.path.isfile(predictions_bpe_file): + print("STEP 1: generate predictions using the p(T|S) model with bpe") + print(args.data) + param1 = [args.data, + "--path", args.gen_model, + "--shard-id", str(args.shard_id), + "--num-shards", str(args.num_shards), + 
"--nbest", str(args.num_rescore), + "--batch-size", str(args.batch_size), + "--beam", str(args.num_rescore), + "--max-sentences", str(args.num_rescore), + "--gen-subset", args.gen_subset, + "--source-lang", args.source_lang, + "--target-lang", args.target_lang] + if args.sampling: + param1 += ["--sampling"] + + gen_parser = options.get_generation_parser() + input_args = options.parse_args_and_arch(gen_parser, param1) + + print(input_args) + with open(predictions_bpe_file, 'w') as f: + with redirect_stdout(f): + generate.main(input_args) + + gen_output = rerank_utils.BitextOutputFromGen(predictions_bpe_file, bpe_symbol=args.remove_bpe, + nbest=using_nbest, prefix_len=args.prefix_len, + target_prefix_frac=args.target_prefix_frac) + + if args.diff_bpe: + rerank_utils.write_reprocessed(gen_output.no_bpe_source, gen_output.no_bpe_hypo, + gen_output.no_bpe_target, pre_gen+"/source_gen_bpe."+args.source_lang, + pre_gen+"/target_gen_bpe."+args.target_lang, + pre_gen+"/reference_gen_bpe."+args.target_lang) + bitext_bpe = args.rescore_bpe_code + bpe_src_param = ["-c", bitext_bpe, + "--input", pre_gen+"/source_gen_bpe."+args.source_lang, + "--output", pre_gen+"/rescore_data."+args.source_lang] + bpe_tgt_param = ["-c", bitext_bpe, + "--input", pre_gen+"/target_gen_bpe."+args.target_lang, + "--output", pre_gen+"/rescore_data."+args.target_lang] + + subprocess.call(["python", + os.path.join(os.path.dirname(__file__), + "subword-nmt/subword_nmt/apply_bpe.py")] + bpe_src_param, + shell=False) + + subprocess.call(["python", + os.path.join(os.path.dirname(__file__), + "subword-nmt/subword_nmt/apply_bpe.py")] + bpe_tgt_param, + shell=False) + + if (not os.path.isfile(score1_file) and not rerank1_is_gen) or \ + (args.score_model2 is not None and not os.path.isfile(score2_file) and not rerank2_is_gen): + print("STEP 2: process the output of generate.py so we have clean text files with the translations") + + rescore_file = "/rescore_data" + if args.prefix_len is not None: + prefix_len_rescore_file = rescore_file + "prefix"+str(args.prefix_len) + if args.target_prefix_frac is not None: + target_prefix_frac_rescore_file = rescore_file + "target_prefix_frac"+str(args.target_prefix_frac) + if args.source_prefix_frac is not None: + source_prefix_frac_rescore_file = rescore_file + "source_prefix_frac"+str(args.source_prefix_frac) + + if not args.right_to_left1 or not args.right_to_left2: + if not args.diff_bpe: + rerank_utils.write_reprocessed(gen_output.source, gen_output.hypo, gen_output.target, + pre_gen+rescore_file+"."+args.source_lang, + pre_gen+rescore_file+"."+args.target_lang, + pre_gen+"/reference_file", bpe_symbol=args.remove_bpe) + if args.prefix_len is not None: + bw_rescore_file = prefix_len_rescore_file + rerank_utils.write_reprocessed(gen_output.source, gen_output.hypo, gen_output.target, + pre_gen+prefix_len_rescore_file+"."+args.source_lang, + pre_gen+prefix_len_rescore_file+"."+args.target_lang, + pre_gen+"/reference_file", prefix_len=args.prefix_len, + bpe_symbol=args.remove_bpe) + elif args.target_prefix_frac is not None: + bw_rescore_file = target_prefix_frac_rescore_file + rerank_utils.write_reprocessed(gen_output.source, gen_output.hypo, gen_output.target, + pre_gen+target_prefix_frac_rescore_file+"."+args.source_lang, + pre_gen+target_prefix_frac_rescore_file+"."+args.target_lang, + pre_gen+"/reference_file", bpe_symbol=args.remove_bpe, + target_prefix_frac=args.target_prefix_frac) + else: + bw_rescore_file = rescore_file + + if args.source_prefix_frac is not None: + fw_rescore_file = 
source_prefix_frac_rescore_file + rerank_utils.write_reprocessed(gen_output.source, gen_output.hypo, gen_output.target, + pre_gen+source_prefix_frac_rescore_file+"."+args.source_lang, + pre_gen+source_prefix_frac_rescore_file+"."+args.target_lang, + pre_gen+"/reference_file", bpe_symbol=args.remove_bpe, + source_prefix_frac=args.source_prefix_frac) + else: + fw_rescore_file = rescore_file + + if args.right_to_left1 or args.right_to_left2: + rerank_utils.write_reprocessed(gen_output.source, gen_output.hypo, gen_output.target, + pre_gen+"/right_to_left_rescore_data."+args.source_lang, + pre_gen+"/right_to_left_rescore_data."+args.target_lang, + pre_gen+"/right_to_left_reference_file", + right_to_left=True, bpe_symbol=args.remove_bpe) + + print("STEP 3: binarize the translations") + if not args.right_to_left1 or args.score_model2 is not None and not args.right_to_left2 or not rerank1_is_gen: + + if args.backwards1 or args.backwards2: + if args.backwards_score_dict_dir is not None: + bw_dict = args.backwards_score_dict_dir + else: + bw_dict = args.score_dict_dir + bw_preprocess_param = ["--source-lang", scorer1_src, + "--target-lang", scorer1_tgt, + "--trainpref", pre_gen+bw_rescore_file, + "--srcdict", bw_dict + "/dict." + scorer1_src + ".txt", + "--tgtdict", bw_dict + "/dict." + scorer1_tgt + ".txt", + "--destdir", backwards_preprocessed_dir] + preprocess_parser = options.get_preprocessing_parser() + input_args = preprocess_parser.parse_args(bw_preprocess_param) + preprocess.main(input_args) + + preprocess_param = ["--source-lang", scorer1_src, + "--target-lang", scorer1_tgt, + "--trainpref", pre_gen+fw_rescore_file, + "--srcdict", args.score_dict_dir+"/dict."+scorer1_src+".txt", + "--tgtdict", args.score_dict_dir+"/dict."+scorer1_tgt+".txt", + "--destdir", left_to_right_preprocessed_dir] + preprocess_parser = options.get_preprocessing_parser() + input_args = preprocess_parser.parse_args(preprocess_param) + preprocess.main(input_args) + + if args.right_to_left1 or args.right_to_left2: + preprocess_param = ["--source-lang", scorer1_src, + "--target-lang", scorer1_tgt, + "--trainpref", pre_gen+"/right_to_left_rescore_data", + "--srcdict", args.score_dict_dir+"/dict."+scorer1_src+".txt", + "--tgtdict", args.score_dict_dir+"/dict."+scorer1_tgt+".txt", + "--destdir", right_to_left_preprocessed_dir] + preprocess_parser = options.get_preprocessing_parser() + input_args = preprocess_parser.parse_args(preprocess_param) + preprocess.main(input_args) + + return gen_output + + +def cli_main(): + parser = rerank_options.get_reranking_parser() + args = options.parse_args_and_arch(parser) + gen_and_reprocess_nbest(args) + + +if __name__ == '__main__': + cli_main() diff --git a/examples/noisychannel/rerank_options.py b/examples/noisychannel/rerank_options.py new file mode 100644 index 0000000000..1f8c748b90 --- /dev/null +++ b/examples/noisychannel/rerank_options.py @@ -0,0 +1,128 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. 
+ +from fairseq import options + + +def get_reranking_parser(default_task='translation'): + parser = options.get_parser('Generation and reranking', default_task) + add_reranking_args(parser) + return parser + + +def get_tuning_parser(default_task='translation'): + parser = options.get_parser('Reranking tuning', default_task) + add_reranking_args(parser) + add_tuning_args(parser) + return parser + + +def add_reranking_args(parser): + group = parser.add_argument_group("Reranking") + # fmt: off + group.add_argument('--score-model1', '-s1', type=str, metavar='FILE', required=True, + help='path to first model or ensemble of models for rescoring') + group.add_argument('--score-model2', '-s2', type=str, metavar='FILE', required=False, + help='path to second model or ensemble of models for rescoring') + group.add_argument('--num-rescore', '-n', type=int, metavar='N', default=10, + help='the number of candidate hypothesis to rescore') + group.add_argument('-bz', '--batch-size', type=int, metavar='N', default=128, + help='batch size for generating the nbest list') + group.add_argument('--gen-subset', default='test', metavar='SET', choices=['test', 'train', 'valid'], + help='data subset to generate (train, valid, test)') + group.add_argument('--gen-model', default=None, metavar='FILE', + help='the model to generate translations') + group.add_argument('-b1', '--backwards1', action='store_true', + help='whether or not the first model group is backwards') + group.add_argument('-b2', '--backwards2', action='store_true', + help='whether or not the second model group is backwards') + group.add_argument('-a', '--weight1', default=1, nargs='+', type=float, + help='the weight(s) of the first model') + group.add_argument('-b', '--weight2', default=1, nargs='+', type=float, + help='the weight(s) of the second model, or the gen model if using nbest from interactive.py') + group.add_argument('-c', '--weight3', default=1, nargs='+', type=float, + help='the weight(s) of the third model') + + # lm arguments + group.add_argument('-lm', '--language-model', default=None, metavar='FILE', + help='language model for target language to rescore translations') + group.add_argument('--lm-dict', default=None, metavar='FILE', + help='the dict of the language model for the target language') + group.add_argument('--lm-name', default=None, + help='the name of the language model for the target language') + group.add_argument('--lm-bpe-code', default=None, metavar='FILE', + help='the bpe code for the language model for the target language') + group.add_argument('--data-dir-name', default=None, + help='name of data directory') + group.add_argument('--lenpen', default=1, nargs='+', type=float, + help='length penalty: <1.0 favors shorter, >1.0 favors longer sentences') + group.add_argument('--score-dict-dir', default=None, + help='the directory with dictionaries for the scoring models') + group.add_argument('--right-to-left1', action='store_true', + help='whether the first model group is a right to left model') + group.add_argument('--right-to-left2', action='store_true', + help='whether the second model group is a right to left model') + group.add_argument('--remove-bpe', default='@@ ', + help='the bpe symbol, used for the bitext and LM') + group.add_argument('--prefix-len', default=None, type=int, + help='the length of the target prefix to use in rescoring (in terms of words wo bpe)') + group.add_argument('--sampling', action='store_true', + help='use sampling instead of beam search for generating n best list') + 
group.add_argument('--diff-bpe', action='store_true', + help='bpe for rescoring and nbest list not the same') + group.add_argument('--rescore-bpe-code', default=None, + help='bpe code for rescoring models') + group.add_argument('--nbest-list', default=None, + help='use predefined nbest list in interactive.py format') + group.add_argument('--write-hypos', default=None, + help='filename prefix to write hypos to') + group.add_argument('--ref-translation', default=None, + help='reference translation to use with nbest list from interactive.py') + group.add_argument('--backwards-score-dict-dir', default=None, + help='the directory with dictionaries for the backwards model,' + 'if None then it is assumed the fw and backwards models share dictionaries') + + # extra scaling args + group.add_argument('--gen-model-name', default=None, + help='the name of the models that generated the nbest list') + group.add_argument('--model1-name', default=None, + help='the name of the set for model1 group ') + group.add_argument('--model2-name', default=None, + help='the name of the set for model2 group') + group.add_argument('--shard-id', default=0, type=int, + help='the id of the shard to generate') + group.add_argument('--num-shards', default=1, type=int, + help='the number of shards to generate across') + group.add_argument('--all-shards', action='store_true', + help='use all shards') + group.add_argument('--target-prefix-frac', default=None, type=float, + help='the fraction of the target prefix to use in rescoring (in terms of words wo bpe)') + group.add_argument('--source-prefix-frac', default=None, type=float, + help='the fraction of the source prefix to use in rescoring (in terms of words wo bpe)') + group.add_argument('--normalize', action='store_true', + help='whether to normalize by src and target len') + + return group + + +def add_tuning_args(parser): + group = parser.add_argument_group("Tuning") + + group.add_argument('--lower-bound', default=[-0.7], nargs='+', type=float, + help='lower bound of search space') + group.add_argument('--upper-bound', default=[3], nargs='+', type=float, + help='upper bound of search space') + group.add_argument('--tune-param', default=['lenpen'], nargs='+', + choices=['lenpen', 'weight1', 'weight2', 'weight3'], + help='the parameter(s) to tune') + group.add_argument('--tune-subset', default='valid', choices=['valid', 'test', 'train'], + help='the subset to tune on ') + group.add_argument('--num-trials', default=1000, type=int, + help='number of trials to do for random search') + group.add_argument('--share-weights', action='store_true', + help='share weight2 and weight 3') + return group diff --git a/examples/noisychannel/rerank_score_bw.py b/examples/noisychannel/rerank_score_bw.py new file mode 100644 index 0000000000..c1558022a9 --- /dev/null +++ b/examples/noisychannel/rerank_score_bw.py @@ -0,0 +1,95 @@ +import rerank_utils +import os +from fairseq import options +from examples.noisychannel import rerank_options +from contextlib import redirect_stdout +import generate + + +def score_bw(args): + if args.backwards1: + scorer1_src = args.target_lang + scorer1_tgt = args.source_lang + else: + scorer1_src = args.source_lang + scorer1_tgt = args.target_lang + + if args.score_model2 is not None: + if args.backwards2: + scorer2_src = args.target_lang + scorer2_tgt = args.source_lang + else: + scorer2_src = args.source_lang + scorer2_tgt = args.target_lang + + rerank1_is_gen = args.gen_model == args.score_model1 and args.source_prefix_frac is None + rerank2_is_gen = 
args.gen_model == args.score_model2 and args.source_prefix_frac is None + + pre_gen, left_to_right_preprocessed_dir, right_to_left_preprocessed_dir, \ + backwards_preprocessed_dir, lm_preprocessed_dir = \ + rerank_utils.get_directories(args.data_dir_name, args.num_rescore, args.gen_subset, + args.gen_model_name, args.shard_id, args.num_shards, + args.sampling, args.prefix_len, args.target_prefix_frac, + args.source_prefix_frac) + + score1_file = rerank_utils.rescore_file_name(pre_gen, args.prefix_len, args.model1_name, + target_prefix_frac=args.target_prefix_frac, + source_prefix_frac=args.source_prefix_frac, + backwards=args.backwards1) + + if args.score_model2 is not None: + score2_file = rerank_utils.rescore_file_name(pre_gen, args.prefix_len, args.model2_name, + target_prefix_frac=args.target_prefix_frac, + source_prefix_frac=args.source_prefix_frac, + backwards=args.backwards2) + + if args.right_to_left1: + rerank_data1 = right_to_left_preprocessed_dir + elif args.backwards1: + rerank_data1 = backwards_preprocessed_dir + else: + rerank_data1 = left_to_right_preprocessed_dir + + gen_param = ["--batch-size", str(128), "--score-reference", "--gen-subset", "train"] + if not rerank1_is_gen and not os.path.isfile(score1_file): + print("STEP 4: score the translations for model 1") + + model_param1 = ["--path", args.score_model1, "--source-lang", scorer1_src, "--target-lang", scorer1_tgt] + gen_model1_param = [rerank_data1] + gen_param + model_param1 + + gen_parser = options.get_generation_parser() + input_args = options.parse_args_and_arch(gen_parser, gen_model1_param) + + with open(score1_file, 'w') as f: + with redirect_stdout(f): + generate.main(input_args) + + if args.score_model2 is not None and not os.path.isfile(score2_file) and not rerank2_is_gen: + print("STEP 4: score the translations for model 2") + + if args.right_to_left2: + rerank_data2 = right_to_left_preprocessed_dir + elif args.backwards2: + rerank_data2 = backwards_preprocessed_dir + else: + rerank_data2 = left_to_right_preprocessed_dir + + model_param2 = ["--path", args.score_model2, "--source-lang", scorer2_src, "--target-lang", scorer2_tgt] + gen_model2_param = [rerank_data2] + gen_param + model_param2 + + gen_parser = options.get_generation_parser() + input_args = options.parse_args_and_arch(gen_parser, gen_model2_param) + + with open(score2_file, 'w') as f: + with redirect_stdout(f): + generate.main(input_args) + + +def cli_main(): + parser = rerank_options.get_reranking_parser() + args = options.parse_args_and_arch(parser) + score_bw(args) + + +if __name__ == '__main__': + cli_main() diff --git a/examples/noisychannel/rerank_score_lm.py b/examples/noisychannel/rerank_score_lm.py new file mode 100644 index 0000000000..e35e1da6c0 --- /dev/null +++ b/examples/noisychannel/rerank_score_lm.py @@ -0,0 +1,48 @@ +import rerank_utils +import os +from fairseq import options +from examples.noisychannel import rerank_options + + +def score_lm(args): + using_nbest = args.nbest_list is not None + pre_gen, left_to_right_preprocessed_dir, right_to_left_preprocessed_dir, \ + backwards_preprocessed_dir, lm_preprocessed_dir = \ + rerank_utils.get_directories(args.data_dir_name, args.num_rescore, args.gen_subset, + args.gen_model_name, args.shard_id, args.num_shards, + args.sampling, args.prefix_len, args.target_prefix_frac, + args.source_prefix_frac) + + predictions_bpe_file = pre_gen+"/generate_output_bpe.txt" + if using_nbest: + print("Using predefined n-best list from interactive.py") + predictions_bpe_file = args.nbest_list + + 
gen_output = rerank_utils.BitextOutputFromGen(predictions_bpe_file, bpe_symbol=args.remove_bpe, nbest=using_nbest) + + if args.language_model is not None: + lm_score_file = rerank_utils.rescore_file_name(pre_gen, args.prefix_len, args.lm_name, lm_file=True) + + if args.language_model is not None and not os.path.isfile(lm_score_file): + print("STEP 4.5: language modeling for P(T)") + if args.lm_bpe_code is None: + bpe_status = "no bpe" + elif args.lm_bpe_code == "shared": + bpe_status = "shared" + else: + bpe_status = "different" + + rerank_utils.lm_scoring(lm_preprocessed_dir, bpe_status, gen_output, pre_gen, + args.lm_dict, args.lm_name, args.language_model, + args.lm_bpe_code, 128, lm_score_file, args.target_lang, + args.source_lang, prefix_len=args.prefix_len) + + +def cli_main(): + parser = rerank_options.get_reranking_parser() + args = options.parse_args_and_arch(parser) + score_lm(args) + + +if __name__ == '__main__': + cli_main() diff --git a/examples/noisychannel/rerank_tune.py b/examples/noisychannel/rerank_tune.py new file mode 100644 index 0000000000..805d875796 --- /dev/null +++ b/examples/noisychannel/rerank_tune.py @@ -0,0 +1,85 @@ +import rerank +import argparse +import numpy as np +import random +from examples.noisychannel import rerank_options +from fairseq import options + + +def random_search(args): + param_values = [] + tuneable_parameters = ['lenpen', 'weight1', 'weight2', 'weight3'] + initial_params = [args.lenpen, args.weight1, args.weight2, args.weight3] + for i, elem in enumerate(initial_params): + if type(elem) is not list: + initial_params[i] = [elem] + else: + initial_params[i] = elem + + tune_parameters = args.tune_param.copy() + for i in range(len(args.tune_param)): + assert args.upper_bound[i] >= args.lower_bound[i] + index = tuneable_parameters.index(args.tune_param[i]) + del tuneable_parameters[index] + del initial_params[index] + + tune_parameters += tuneable_parameters + param_values += initial_params + random.seed(args.seed) + + random_params = np.array([[random.uniform(args.lower_bound[i], args.upper_bound[i]) + for i in range(len(args.tune_param))] + for k in range(args.num_trials)]) + set_params = np.array([[initial_params[i][0] + for i in range(len(tuneable_parameters))] + for k in range(args.num_trials)]) + random_params = np.concatenate((random_params, set_params), 1) + + rerank_args = vars(args).copy() + if args.nbest_list: + rerank_args['gen_subset'] = 'test' + else: + rerank_args['gen_subset'] = args.tune_subset + + for k in range(len(tune_parameters)): + rerank_args[tune_parameters[k]] = list(random_params[:, k]) + + if args.share_weights: + k = tune_parameters.index('weight2') + rerank_args['weight3'] = list(random_params[:, k]) + + rerank_args = argparse.Namespace(**rerank_args) + best_lenpen, best_weight1, best_weight2, best_weight3, best_score = rerank.rerank(rerank_args) + rerank_args = vars(args).copy() + rerank_args['lenpen'] = [best_lenpen] + rerank_args['weight1'] = [best_weight1] + rerank_args['weight2'] = [best_weight2] + rerank_args['weight3'] = [best_weight3] + + # write the hypothesis from the valid set from the best trial + + if args.gen_subset != "valid": + rerank_args['gen_subset'] = "valid" + rerank_args = argparse.Namespace(**rerank_args) + rerank.rerank(rerank_args) + + # test with the best hyperparameters on gen subset + rerank_args = vars(args).copy() + rerank_args['gen_subset'] = args.gen_subset + rerank_args['lenpen'] = [best_lenpen] + rerank_args['weight1'] = [best_weight1] + rerank_args['weight2'] = [best_weight2] + 
rerank_args['weight3'] = [best_weight3] + rerank_args = argparse.Namespace(**rerank_args) + rerank.rerank(rerank_args) + + +def cli_main(): + parser = rerank_options.get_tuning_parser() + args = options.parse_args_and_arch(parser) + + random_search(args) + + +if __name__ == '__main__': + cli_main() diff --git a/examples/noisychannel/rerank_utils.py b/examples/noisychannel/rerank_utils.py new file mode 100644 index 0000000000..9b8bb7bec2 --- /dev/null +++ b/examples/noisychannel/rerank_utils.py @@ -0,0 +1,646 @@ +import subprocess +import os +import re +from fairseq import options +import eval_lm +import preprocess +from contextlib import redirect_stdout +import math + + +def reprocess(fle): + # takes in a file of generate.py translation generate_output + # returns a source dict and hypothesis dict, where keys are the ID num (as a string) + # and values and the corresponding source and translation. There may be several translations + # per source, so the values for hypothesis_dict are lists. + # parses output of generate.py + + with open(fle, 'r') as f: + txt = f.read() + + """reprocess generate.py output""" + p = re.compile(r"[STHP][-]\d+\s*") + hp = re.compile(r"(\s*[-]?\d+[.]?\d+\s*)|(\s*(-inf)\s*)") + source_dict = {} + hypothesis_dict = {} + score_dict = {} + target_dict = {} + pos_score_dict = {} + lines = txt.split("\n") + + for line in lines: + line += "\n" + prefix = re.search(p, line) + if prefix is not None: + assert len(prefix.group()) > 2, "prefix id not found" + _, j = prefix.span() + id_num = prefix.group()[2:] + id_num = int(id_num) + line_type = prefix.group()[0] + if line_type == "H": + h_txt = line[j:] + hypo = re.search(hp, h_txt) + assert hypo is not None, ("regular expression failed to find the hypothesis scoring") + _, i = hypo.span() + score = hypo.group() + if id_num in hypothesis_dict: + hypothesis_dict[id_num].append(h_txt[i:]) + score_dict[id_num].append(float(score)) + else: + hypothesis_dict[id_num] = [h_txt[i:]] + score_dict[id_num] = [float(score)] + + elif line_type == "S": + source_dict[id_num] = (line[j:]) + elif line_type == "T": + target_dict[id_num] = (line[j:]) + elif line_type == "P": + pos_scores = (line[j:]).split() + pos_scores = [float(x) for x in pos_scores] + if id_num in pos_score_dict: + pos_score_dict[id_num].append(pos_scores) + else: + pos_score_dict[id_num] = [pos_scores] + + return source_dict, hypothesis_dict, score_dict, target_dict, pos_score_dict + + +def reprocess_nbest(fle): + """reprocess interactive.py output""" + with open(fle, 'r') as f: + txt = f.read() + + source_dict = {} + hypothesis_dict = {} + score_dict = {} + target_dict = {} + pos_score_dict = {} + lines = txt.split("\n") + + hp = re.compile(r'[-]?\d+[.]?\d+') + j = -1 + + for _i, line in enumerate(lines): + line += "\n" + line_type = line[0] + + if line_type == "H": + hypo = re.search(hp, line) + _, start_index = hypo.span() + score = hypo.group() + if j in score_dict: + score_dict[j].append(float(score)) + hypothesis_dict[j].append(line[start_index:].strip("\t")) + else: + score_dict[j] = [float(score)] + hypothesis_dict[j] = [line[start_index:].strip("\t")] + elif line_type == "O": + j += 1 + source_dict[j] = line[2:] + # we don't have the targets for interactive.py + target_dict[j] = "filler" + + elif line_type == "P": + pos_scores = [float(pos_score) for pos_score in line.split()[1:]] + if j in pos_score_dict: + pos_score_dict[j].append(pos_scores) + else: + pos_score_dict[j] = [pos_scores] + + assert source_dict.keys() == hypothesis_dict.keys() + assert 
source_dict.keys() == pos_score_dict.keys() + assert source_dict.keys() == score_dict.keys() + + return source_dict, hypothesis_dict, score_dict, target_dict, pos_score_dict + + +def write_reprocessed(sources, hypos, targets, source_outfile, + hypo_outfile, target_outfile, right_to_left=False, + prefix_len=None, bpe_symbol=None, + target_prefix_frac=None, source_prefix_frac=None): + + """writes nbest hypothesis for rescoring""" + assert not (prefix_len is not None and target_prefix_frac is not None), \ + "in writing reprocessed, only one type of prefix may be used" + assert not (prefix_len is not None and source_prefix_frac is not None), \ + "in writing reprocessed, only one type of prefix may be used" + assert not (target_prefix_frac is not None and source_prefix_frac is not None), \ + "in writing reprocessed, only one type of prefix may be used" + + with open(source_outfile, 'w') as source_file, \ + open(hypo_outfile, 'w') as hypo_file, \ + open(target_outfile, 'w') as target_file: + + assert len(sources) == len(hypos), "sources and hypos list length mismatch" + if right_to_left: + for i in range(len(sources)): + for j in range(len(hypos[i])): + if prefix_len is None: + hypo_file.write(make_right_to_left(hypos[i][j])+"\n") + else: + raise NotImplementedError() + source_file.write(make_right_to_left(sources[i])+"\n") + target_file.write(make_right_to_left(targets[i])+"\n") + else: + for i in sorted(sources.keys()): + for j in range(len(hypos[i])): + if prefix_len is not None: + shortened = get_prefix_no_bpe(hypos[i][j], bpe_symbol, prefix_len)+"\n" + hypo_file.write(shortened) + source_file.write(sources[i]) + target_file.write(targets[i]) + elif target_prefix_frac is not None: + num_words, shortened, num_bpe_tokens = \ + calc_length_from_frac(hypos[i][j], target_prefix_frac, bpe_symbol) + shortened += "\n" + hypo_file.write(shortened) + source_file.write(sources[i]) + target_file.write(targets[i]) + elif source_prefix_frac is not None: + num_words, shortened, num_bpe_tokensn = \ + calc_length_from_frac(sources[i], source_prefix_frac, bpe_symbol) + shortened += "\n" + hypo_file.write(hypos[i][j]) + source_file.write(shortened) + target_file.write(targets[i]) + else: + hypo_file.write(hypos[i][j]) + source_file.write(sources[i]) + target_file.write(targets[i]) + + +def calc_length_from_frac(bpe_sentence, prefix_frac, bpe_symbol): + # return number of words, (not bpe tokens) that we want + no_bpe_sen = remove_bpe(bpe_sentence, bpe_symbol) + len_sen = len(no_bpe_sen.split()) + + num_words = math.ceil(len_sen * prefix_frac) + prefix = get_prefix_no_bpe(bpe_sentence, bpe_symbol, num_words) + num_bpe_tokens = len(prefix.split()) + return num_words, prefix, num_bpe_tokens + + +def get_prefix(sentence, prefix_len): + """assuming no bpe, gets the prefix of the sentence with prefix_len words""" + tokens = sentence.strip("\n").split() + if prefix_len >= len(tokens): + return sentence.strip("\n") + else: + return " ".join(tokens[:prefix_len]) + + +def get_prefix_no_bpe(sentence, bpe_symbol, prefix_len): + if bpe_symbol is None: + return get_prefix(sentence, prefix_len) + else: + return " ".join(get_prefix_from_len(sentence.split(), bpe_symbol, prefix_len)) + + +def get_prefix_from_len(sentence, bpe_symbol, prefix_len): + """get the prefix of sentence with bpe, with prefix len in terms of words, not bpe tokens""" + bpe_count = sum([bpe_symbol.strip(" ") in t for t in sentence[:prefix_len]]) + if bpe_count == 0: + return sentence[:prefix_len] + else: + return 
sentence[:prefix_len]+get_prefix_from_len(sentence[prefix_len:], bpe_symbol, bpe_count) + + +def get_num_bpe_tokens_from_len(sentence, bpe_symbol, prefix_len): + """given a prefix length in terms of words, return the number of bpe tokens""" + prefix = get_prefix_no_bpe(sentence, bpe_symbol, prefix_len) + assert len(remove_bpe(prefix, bpe_symbol).split()) <= prefix_len + return len(prefix.split(" ")) + + +def make_right_to_left(line): + tokens = line.split() + tokens.reverse() + new_line = " ".join(tokens) + return new_line + + +def remove_bpe(line, bpe_symbol): + line = line.replace("\n", '') + line = (line + ' ').replace(bpe_symbol, '').rstrip() + return line+("\n") + + +def remove_bpe_dict(pred_dict, bpe_symbol): + new_dict = {} + for i in pred_dict: + if type(pred_dict[i]) == list: + new_list = [remove_bpe(elem, bpe_symbol) for elem in pred_dict[i]] + new_dict[i] = new_list + else: + new_dict[i] = remove_bpe(pred_dict[i], bpe_symbol) + return new_dict + + +def parse_bleu_scoring(line): + p = re.compile(r'(BLEU4 = )\d+[.]\d+') + res = re.search(p, line) + assert res is not None, line + return float(res.group()[8:]) + + +def get_full_from_prefix(hypo_prefix, hypos): + """given a hypo prefix, recover the first hypo from the list of complete hypos beginning with that prefix""" + for hypo in hypos: + hypo_prefix = hypo_prefix.strip("\n") + len_prefix = len(hypo_prefix) + if hypo[:len_prefix] == hypo_prefix: + return hypo + # no match found + raise Exception() + + +def get_score(a, b, c, target_len, bitext_score1, bitext_score2=None, lm_score=None, + lenpen=None, src_len=None, tgt_len=None, bitext1_backwards=False, + bitext2_backwards=False, normalize=False): + if bitext1_backwards: + bitext1_norm = src_len + else: + bitext1_norm = tgt_len + if bitext_score2 is not None: + if bitext2_backwards: + bitext2_norm = src_len + else: + bitext2_norm = tgt_len + else: + bitext2_norm = 1 + bitext_score2 = 0 + if normalize: + score = a*bitext_score1/bitext1_norm + b*bitext_score2/bitext2_norm+c*lm_score/src_len + else: + score = a*bitext_score1 + b*bitext_score2+c*lm_score + + if lenpen is not None: + score /= (target_len) ** float(lenpen) + + return score + + +class BitextOutput(object): + def __init__(self, output_file, backwards, right_to_left, bpe_symbol, + prefix_len=None, target_prefix_frac=None, source_prefix_frac=None): + """process output from rescoring""" + source, hypo, score, target, pos_score = reprocess(output_file) + if backwards: + self.hypo_fracs = source_prefix_frac + else: + self.hypo_fracs = target_prefix_frac + + # remove length penalty so we can use raw scores + score, num_bpe_tokens = get_score_from_pos(pos_score, prefix_len, hypo, bpe_symbol, self.hypo_fracs, backwards) + source_lengths = {} + target_lengths = {} + + assert hypo.keys() == source.keys(), "key mismatch" + if backwards: + tmp = hypo + hypo = source + source = tmp + for i in source: + # since we are reranking, there should only be one hypo per source sentence + if backwards: + len_src = len(source[i][0].split()) + # record length without + if len_src == num_bpe_tokens[i][0] - 1: + source_lengths[i] = num_bpe_tokens[i][0] - 1 + else: + source_lengths[i] = num_bpe_tokens[i][0] + + target_lengths[i] = len(hypo[i].split()) + + source[i] = remove_bpe(source[i][0], bpe_symbol) + target[i] = remove_bpe(target[i], bpe_symbol) + hypo[i] = remove_bpe(hypo[i], bpe_symbol) + + score[i] = float(score[i][0]) + pos_score[i] = pos_score[i][0] + + else: + len_tgt = len(hypo[i][0].split()) + # record length without + if len_tgt == 
num_bpe_tokens[i][0] - 1: + target_lengths[i] = num_bpe_tokens[i][0] - 1 + else: + target_lengths[i] = num_bpe_tokens[i][0] + + source_lengths[i] = len(source[i].split()) + + if right_to_left: + source[i] = remove_bpe(make_right_to_left(source[i]), bpe_symbol) + target[i] = remove_bpe(make_right_to_left(target[i]), bpe_symbol) + hypo[i] = remove_bpe(make_right_to_left(hypo[i][0]), bpe_symbol) + score[i] = float(score[i][0]) + pos_score[i] = pos_score[i][0] + else: + assert len(hypo[i]) == 1, "expected only one hypothesis per source sentence" + source[i] = remove_bpe(source[i], bpe_symbol) + target[i] = remove_bpe(target[i], bpe_symbol) + hypo[i] = remove_bpe(hypo[i][0], bpe_symbol) + score[i] = float(score[i][0]) + pos_score[i] = pos_score[i][0] + + self.rescore_source = source + self.rescore_hypo = hypo + self.rescore_score = score + self.rescore_target = target + self.rescore_pos_score = pos_score + self.backwards = backwards + self.right_to_left = right_to_left + self.target_lengths = target_lengths + self.source_lengths = source_lengths + + +class BitextOutputFromGen(object): + def __init__(self, predictions_bpe_file, bpe_symbol=None, nbest=False, prefix_len=None, target_prefix_frac=None): + if nbest: + pred_source, pred_hypo, pred_score, pred_target, pred_pos_score = reprocess_nbest(predictions_bpe_file) + else: + pred_source, pred_hypo, pred_score, pred_target, pred_pos_score = reprocess(predictions_bpe_file) + + assert len(pred_source) == len(pred_hypo) + assert len(pred_source) == len(pred_score) + assert len(pred_source) == len(pred_target) + assert len(pred_source) == len(pred_pos_score) + + # remove length penalty so we can use raw scores + pred_score, num_bpe_tokens = get_score_from_pos(pred_pos_score, prefix_len, pred_hypo, + bpe_symbol, target_prefix_frac, False) + + self.source = pred_source + self.target = pred_target + self.score = pred_score + self.pos_score = pred_pos_score + self.hypo = pred_hypo + self.target_lengths = {} + self.source_lengths = {} + + self.no_bpe_source = remove_bpe_dict(pred_source.copy(), bpe_symbol) + self.no_bpe_hypo = remove_bpe_dict(pred_hypo.copy(), bpe_symbol) + self.no_bpe_target = remove_bpe_dict(pred_target.copy(), bpe_symbol) + + # indexes to match those from the rescoring models + self.rescore_source = {} + self.rescore_target = {} + self.rescore_pos_score = {} + self.rescore_hypo = {} + self.rescore_score = {} + self.num_hypos = {} + self.backwards = False + self.right_to_left = False + + index = 0 + + for i in sorted(pred_source.keys()): + for j in range(len(pred_hypo[i])): + + self.target_lengths[index] = len(self.hypo[i][j].split()) + self.source_lengths[index] = len(self.source[i].split()) + + self.rescore_source[index] = self.no_bpe_source[i] + self.rescore_target[index] = self.no_bpe_target[i] + self.rescore_hypo[index] = self.no_bpe_hypo[i][j] + self.rescore_score[index] = float(pred_score[i][j]) + self.rescore_pos_score[index] = pred_pos_score[i][j] + self.num_hypos[index] = len(pred_hypo[i]) + index += 1 + + +def get_score_from_pos(pos_score_dict, prefix_len, hypo_dict, bpe_symbol, hypo_frac, backwards): + score_dict = {} + num_bpe_tokens_dict = {} + assert prefix_len is None or hypo_frac is None + for key in pos_score_dict: + score_dict[key] = [] + num_bpe_tokens_dict[key] = [] + for i in range(len(pos_score_dict[key])): + if prefix_len is not None and not backwards: + num_bpe_tokens = get_num_bpe_tokens_from_len(hypo_dict[key][i], bpe_symbol, prefix_len) + score_dict[key].append(sum(pos_score_dict[key][i][:num_bpe_tokens])) + 
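The `get_score` helper defined above is the heart of the noisy-channel reranking objective: it linearly combines the forward (direct) model score, the backward (channel) model score and the language-model score, optionally length-normalizing each term, and finally applies a length penalty. Below is a minimal, self-contained sketch of that combination for a single hypothesis; the weights and log-probabilities are made-up numbers and the penalty simply uses the target length, so treat it as an illustration rather than the repository's function.

```python
def combined_score(a, b, c, fw_score, bw_score, lm_score,
                   src_len, tgt_len, lenpen=None, normalize=False):
    """Toy version of the weighted rescoring objective:
    a * log P(T|S) + b * log P(S|T) + c * log P(T)."""
    if normalize:
        # forward model normalized by target length, channel model and LM
        # by source length, mirroring the normalizers chosen above
        score = a * fw_score / tgt_len + b * bw_score / src_len + c * lm_score / src_len
    else:
        score = a * fw_score + b * bw_score + c * lm_score
    if lenpen is not None:
        score /= tgt_len ** float(lenpen)  # standard length penalty
    return score


# made-up log-probabilities for one 20-token hypothesis
print(combined_score(a=1.0, b=0.5, c=0.3,
                     fw_score=-12.4, bw_score=-15.1, lm_score=-20.7,
                     src_len=18, tgt_len=20, lenpen=1.0))
```

The weights `a`, `b`, `c` and `lenpen` are the kinds of knobs the random search in `rerank_tune.py` explores.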
num_bpe_tokens_dict[key].append(num_bpe_tokens) + elif hypo_frac is not None: + num_words, shortened, hypo_prefix_len = calc_length_from_frac(hypo_dict[key][i], hypo_frac, bpe_symbol) + score_dict[key].append(sum(pos_score_dict[key][i][:hypo_prefix_len])) + num_bpe_tokens_dict[key].append(hypo_prefix_len) + else: + score_dict[key].append(sum(pos_score_dict[key][i])) + num_bpe_tokens_dict[key].append(len(pos_score_dict[key][i])) + return score_dict, num_bpe_tokens_dict + + +class LMOutput(object): + def __init__(self, lm_score_file, lm_dict=None, prefix_len=None, bpe_symbol=None, target_prefix_frac=None): + lm_sentences, lm_sen_scores, lm_sen_pos_scores, lm_no_bpe_sentences, lm_bpe_tokens = \ + parse_lm(lm_score_file, prefix_len=prefix_len, + bpe_symbol=bpe_symbol, target_prefix_frac=target_prefix_frac) + + self.sentences = lm_sentences + self.score = lm_sen_scores + self.pos_score = lm_sen_pos_scores + self.lm_dict = lm_dict + self.no_bpe_sentences = lm_no_bpe_sentences + self.bpe_tokens = lm_bpe_tokens + + +def parse_lm(input_file, prefix_len=None, bpe_symbol=None, target_prefix_frac=None): + """parse output of eval_lm""" + with open(input_file, 'r') as f: + text = f.readlines() + text = text[7:] + cleaned_text = text[:-2] + + sentences = {} + sen_scores = {} + sen_pos_scores = {} + no_bpe_sentences = {} + num_bpe_tokens_dict = {} + for _i, line in enumerate(cleaned_text): + tokens = line.split() + if tokens[0].isdigit(): + line_id = int(tokens[0]) + scores = [float(x[1:-1]) for x in tokens[2::2]] + sentences[line_id] = " ".join(tokens[1::2][:-1])+"\n" + if bpe_symbol is not None: + # exclude symbol to match output from generate.py + bpe_sen = " ".join(tokens[1::2][:-1])+"\n" + no_bpe_sen = remove_bpe(bpe_sen, bpe_symbol) + no_bpe_sentences[line_id] = no_bpe_sen + + if prefix_len is not None: + num_bpe_tokens = get_num_bpe_tokens_from_len(bpe_sen, bpe_symbol, prefix_len) + sen_scores[line_id] = sum(scores[:num_bpe_tokens]) + num_bpe_tokens_dict[line_id] = num_bpe_tokens + elif target_prefix_frac is not None: + num_words, shortened, target_prefix_len = calc_length_from_frac(bpe_sen, target_prefix_frac, + bpe_symbol) + sen_scores[line_id] = sum(scores[:target_prefix_len]) + num_bpe_tokens_dict[line_id] = target_prefix_len + else: + sen_scores[line_id] = sum(scores) + num_bpe_tokens_dict[line_id] = len(scores) + + sen_pos_scores[line_id] = scores + + return sentences, sen_scores, sen_pos_scores, no_bpe_sentences, num_bpe_tokens_dict + + +def get_directories(data_dir_name, num_rescore, gen_subset, + fw_name, shard_id, num_shards, + sampling=False, prefix_len=None, + target_prefix_frac=None, source_prefix_frac=None): + nbest_file_id = "nbest_" + str(num_rescore) + \ + "_subset_" + gen_subset + \ + "_fw_name_" + fw_name + \ + "_shard_" + str(shard_id) + \ + "_of_" + str(num_shards) + + if sampling: + nbest_file_id += "_sampling" + + # the directory containing all information for this nbest list + pre_gen = os.path.join(os.path.dirname(__file__))+"/rerank_data/"+data_dir_name+"/"+nbest_file_id + # the directory to store the preprocessed nbest list, for left to right rescoring + left_to_right_preprocessed_dir = pre_gen+"/left_to_right_preprocessed" + if source_prefix_frac is not None: + left_to_right_preprocessed_dir = left_to_right_preprocessed_dir + "/prefix_frac" + str(source_prefix_frac) + # the directory to store the preprocessed nbest list, for right to left rescoring + right_to_left_preprocessed_dir = pre_gen+"/right_to_left_preprocessed" + # the directory to store the preprocessed nbest 
list, for backwards rescoring + backwards_preprocessed_dir = pre_gen+"/backwards" + if target_prefix_frac is not None: + backwards_preprocessed_dir = backwards_preprocessed_dir+"/prefix_frac"+str(target_prefix_frac) + elif prefix_len is not None: + backwards_preprocessed_dir = backwards_preprocessed_dir+"/prefix_"+str(prefix_len) + + # the directory to store the preprocessed nbest list, for rescoring with P(T) + lm_preprocessed_dir = pre_gen+"/lm_preprocessed" + + return pre_gen, left_to_right_preprocessed_dir, right_to_left_preprocessed_dir, \ + backwards_preprocessed_dir, lm_preprocessed_dir + + +def lm_scoring(preprocess_directory, bpe_status, gen_output, pre_gen, + cur_lm_dict, cur_lm_name, cur_language_model, cur_lm_bpe_code, + batch_size, lm_score_file, target_lang, source_lang, prefix_len=None): + if prefix_len is not None: + assert bpe_status == "different", "bpe status must be different to use prefix len" + if bpe_status == "no bpe": + # run lm on output without bpe + write_reprocessed(gen_output.no_bpe_source, gen_output.no_bpe_hypo, + gen_output.no_bpe_target, pre_gen+"/rescore_data_no_bpe.de", + pre_gen+"/rescore_data_no_bpe.en", pre_gen+"/reference_file_no_bpe") + + preprocess_lm_param = ["--only-source", + "--trainpref", pre_gen+"/rescore_data_no_bpe."+target_lang, + "--srcdict", cur_lm_dict, + "--destdir", preprocess_directory] + preprocess_parser = options.get_preprocessing_parser() + input_args = preprocess_parser.parse_args(preprocess_lm_param) + preprocess.main(input_args) + + eval_lm_param = [preprocess_directory, + "--path", cur_language_model, + "--output-word-probs", + "--batch-size", str(batch_size), + "--max-tokens", "1024", + "--sample-break-mode", "eos", + "--gen-subset", "train"] + + eval_lm_parser = options.get_eval_lm_parser() + input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param) + + with open(lm_score_file, 'w') as f: + with redirect_stdout(f): + eval_lm.main(input_args) + + elif bpe_status == "shared": + preprocess_lm_param = ["--only-source", + "--trainpref", pre_gen+"/rescore_data."+target_lang, + "--srcdict", cur_lm_dict, + "--destdir", preprocess_directory] + preprocess_parser = options.get_preprocessing_parser() + input_args = preprocess_parser.parse_args(preprocess_lm_param) + preprocess.main(input_args) + + eval_lm_param = [preprocess_directory, + "--path", cur_language_model, + "--output-word-probs", + "--batch-size", str(batch_size), + "--sample-break-mode", "eos", + "--gen-subset", "train"] + + eval_lm_parser = options.get_eval_lm_parser() + input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param) + + with open(lm_score_file, 'w') as f: + with redirect_stdout(f): + eval_lm.main(input_args) + + elif bpe_status == "different": + rescore_file = pre_gen+"/rescore_data_no_bpe" + rescore_bpe = pre_gen+"/rescore_data_new_bpe" + + rescore_file += "." + rescore_bpe += "." 
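The `lm_scoring` branches above all follow the same pattern: binarize the n-best text with `preprocess`, then run `eval_lm` with `--output-word-probs` and capture its stdout in a file that `parse_lm` later reads. Below is a minimal sketch of just the scoring half of that pattern; the paths are placeholders, and it assumes `eval_lm` is importable from the repository root exactly as in the script above.

```python
from contextlib import redirect_stdout

from fairseq import options
import eval_lm  # top-level script, importable when running from the repo root


def score_with_lm(data_dir, lm_checkpoint, out_file, batch_size=32):
    """Run eval_lm programmatically and capture per-word scores to a file."""
    eval_lm_args = [
        data_dir,                        # binarized n-best list (preprocess.py output)
        "--path", lm_checkpoint,         # language model checkpoint
        "--output-word-probs",           # emit the per-token scores parse_lm() expects
        "--batch-size", str(batch_size),
        "--sample-break-mode", "eos",
        "--gen-subset", "train",
    ]
    parser = options.get_eval_lm_parser()
    parsed_args = options.parse_args_and_arch(parser, eval_lm_args)
    with open(out_file, "w") as f, redirect_stdout(f):
        eval_lm.main(parsed_args)


# hypothetical paths, for illustration only:
# score_with_lm("rerank_data/lm_preprocessed", "lm/checkpoint_best.pt", "lm_scores.txt")
```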
+ + write_reprocessed(gen_output.no_bpe_source, gen_output.no_bpe_hypo, + gen_output.no_bpe_target, rescore_file+source_lang, + rescore_file+target_lang, pre_gen+"/reference_file_no_bpe", + bpe_symbol=None) + + # apply LM bpe to nbest list + bpe_src_param = ["-c", cur_lm_bpe_code, + "--input", rescore_file+target_lang, + "--output", rescore_bpe+target_lang] + subprocess.call(["python", + os.path.join(os.path.dirname(__file__), + "subword-nmt/subword_nmt/apply_bpe.py")] + bpe_src_param, + shell=False) + # uncomment to use fastbpe instead of subword-nmt bpe + # bpe_src_param = [rescore_bpe+target_lang, rescore_file+target_lang, cur_lm_bpe_code] + # subprocess.call(["/private/home/edunov/fastBPE/fast", "applybpe"] + bpe_src_param, shell=False) + + preprocess_dir = preprocess_directory + + preprocess_lm_param = ["--only-source", + "--trainpref", rescore_bpe+target_lang, + "--srcdict", cur_lm_dict, + "--destdir", preprocess_dir] + preprocess_parser = options.get_preprocessing_parser() + input_args = preprocess_parser.parse_args(preprocess_lm_param) + preprocess.main(input_args) + + eval_lm_param = [preprocess_dir, + "--path", cur_language_model, + "--output-word-probs", + "--batch-size", str(batch_size), + "--max-tokens", "1024", + "--sample-break-mode", "eos", + "--gen-subset", "train"] + + eval_lm_parser = options.get_eval_lm_parser() + input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param) + + with open(lm_score_file, 'w') as f: + with redirect_stdout(f): + eval_lm.main(input_args) + + +def rescore_file_name(nbest_dir, prefix_len, scorer_name, lm_file=False, + target_prefix_frac=None, source_prefix_frac=None, backwards=None): + if lm_file: + score_file = nbest_dir+"/lm_score_translations_model_"+scorer_name+".txt" + else: + score_file = nbest_dir+"/"+scorer_name+"_score_translations.txt" + if backwards: + if prefix_len is not None: + score_file += "prefix_len"+str(prefix_len) + elif target_prefix_frac is not None: + score_file += "target_prefix_frac"+str(target_prefix_frac) + else: + if source_prefix_frac is not None: + score_file += "source_prefix_frac"+str(source_prefix_frac) + return score_file diff --git a/fairseq/models/transformer.py b/fairseq/models/transformer.py index c9ba537070..7fedc77550 100644 --- a/fairseq/models/transformer.py +++ b/fairseq/models/transformer.py @@ -498,7 +498,7 @@ def upgrade_state_dict_named(self, state_dict, name): del state_dict[k] version_key = '{}.version'.format(name) - if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) < 2: + if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) <= 2: # earlier checkpoints did not normalize after the stack of layers self.layer_norm = None self.normalize = False From a8e321116b2b64d9530413c9add9622955fe2498 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Thu, 15 Aug 2019 12:02:25 -0700 Subject: [PATCH 098/213] Update README Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/827 Differential Revision: D16833252 Pulled By: myleott fbshipit-source-id: 8eded8cc651002dfd60869fc2383d305ed335d3a --- examples/roberta/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/roberta/README.md b/examples/roberta/README.md index 15119a345a..e4d9e4fee1 100644 --- a/examples/roberta/README.md +++ b/examples/roberta/README.md @@ -8,6 +8,8 @@ RoBERTa iterates on BERT's pretraining procedure, including training the model l ### What's New: +- August 2019: RoBERTa is now supported in the [pytorch-transformers 
library](https://github.com/huggingface/pytorch-transformers). +- August 2019: Added [tutorial for finetuning on WinoGrande](https://github.com/pytorch/fairseq/tree/master/examples/roberta/wsc#roberta-training-on-winogrande-dataset). - August 2019: Added [tutorial for pretraining RoBERTa using your own data](README.pretraining.md). ## Pre-trained models From ed27ed8bacc2e1837f18942230b109f6815c73b6 Mon Sep 17 00:00:00 2001 From: Nayan Singhal Date: Thu, 15 Aug 2019 14:24:43 -0700 Subject: [PATCH 099/213] BMUF Resetting local state param Summary: BMUF 1) Resetting BMUF parameters after warmup. 2) Resetting local param state after warmup. 3) Allowing user to pass block momentum value instead of gpu derived Block Momentum. Reviewed By: skritika, mrshenli Differential Revision: D16692026 fbshipit-source-id: d02eaf29d0e4b37007418166ec937d4bf5fe6aca --- fairseq/optim/bmuf.py | 66 +++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 30 deletions(-) diff --git a/fairseq/optim/bmuf.py b/fairseq/optim/bmuf.py index 12e18adc03..756374d569 100644 --- a/fairseq/optim/bmuf.py +++ b/fairseq/optim/bmuf.py @@ -26,11 +26,12 @@ def __init__(self, args, params, optimizer): self.params = params self._num_updates = 0 self.sync_iter = self.args.global_sync_iter - self.block_momentum = 1 - 1.0 / self.args.distributed_world_size + self.block_momentum = self.args.block_momentum self.block_lr = self.args.block_lr self._reset_local_data() self.warmup_iteration = self.args.warmup_iterations self.use_nbm = self.args.use_nbm + self.initial_state = self._optimizer.state_dict() @staticmethod def add_args(parser): @@ -38,6 +39,12 @@ def add_args(parser): parser.add_argument( "--block-lr", default=1, type=float, help="block learning rate for bmuf" ) + parser.add_argument( + "--block-momentum", + default=0.875, + type=float, + help="block momentum for bmuf", + ) parser.add_argument( "--global-sync-iter", default=10, @@ -85,8 +92,9 @@ def clip_grad_norm(self, max_norm): """Clips gradient norm.""" return self._optimizer.clip_grad_norm(max_norm) - def _sync_block(self): - if self.get_num_updates() % self.sync_iter == 0: + def _block_sync(self): + # Update the global model using local models from all GPUs. 
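A small observation on the new `--block-momentum` flag introduced above: its default of 0.875 is exactly what the removed formula `1 - 1/world_size` produced for the common 8-GPU configuration, so that setup behaves as before, while other world sizes now need the flag set explicitly. The snippet below just makes that relationship visible; the world sizes are illustrative.

```python
def derived_block_momentum(world_size):
    # the value BMUF previously derived from the number of workers
    return 1.0 - 1.0 / world_size


for world_size in (2, 4, 8, 16):
    print(world_size, derived_block_momentum(world_size))
# 2 -> 0.5, 4 -> 0.75, 8 -> 0.875 (the new default), 16 -> 0.9375
```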
+ if self._is_bmuf_iter(): if self.block_momentum != 0: self._BM_before_sync() @@ -95,33 +103,33 @@ def _sync_block(self): if self.block_momentum != 0: self._BM_after_sync() - def _broadcast_model(self, rootRank=0): - if ( - self.warmup_iteration != 0 - and self.get_num_updates() % self.warmup_iteration == 0 - ): - self.warmup_iteration = 0 + def _is_warmup_end(self): + if self.get_num_updates() == self.warmup_iteration: + return True + return False - # broadcast the local model - for param in self.params: - dist.broadcast(param.data, rootRank) + def _is_bmuf_iter(self): + if self.get_num_updates() % self.sync_iter == 0: + return True + return False + + def _warmup_sync(self, rootRank=0): + # broadcast the local model to all GPUs + for param in self.params: + dist.broadcast(param.data, src=rootRank) - # Also, broadcast the local parameters - for param in ( - self.params_localprev - + self.smoothed_grads_localprev - + self.grads_localprev - ): - dist.broadcast(param, src=rootRank) + # Reset the local optimizer state and local bmuf related param + self._optimizer.load_state_dict(self.initial_state) + self._reset_local_data() def step(self, closure=None): """Performs a single optimization step.""" self._optimizer.step(closure) self.set_num_updates(self.get_num_updates() + 1) - if self.warmup_iteration != 0: - self._broadcast_model() + if self._is_warmup_end(): + self._warmup_sync() else: - self._sync_block() + self._block_sync() def zero_grad(self): """Clears the gradients of all optimized parameters.""" @@ -137,6 +145,7 @@ def set_num_updates(self, num_updates): @torch.no_grad() def _reset_local_data(self): + """Resetting all the BMUF specific params.""" self.params_localprev = [torch.zeros_like(p.data) for p in self.params] self.smoothed_grads_localprev = [ @@ -144,12 +153,13 @@ def _reset_local_data(self): ] self.grads_localprev = [p.data.new_zeros(p.data.size()) for p in self.params] - # initialize + # saving the global model locally for calculating gradient during bmuf sync for param, copy_param in zip(self.params, self.params_localprev): copy_param.copy_(param.data) @torch.no_grad() def _BM_before_sync(self): + """Calculate grad between previously synced model and currrent local model.""" # prev_param is basically the global copy from the previously finished # synchronisation. param.data is local parameter after block_sync_freq # for the local gpu. so grad is difference between previously synced @@ -160,6 +170,7 @@ def _BM_before_sync(self): self.grads_localprev[index] = prev_param - param.data def _allreduce_parameter(self): + """Average gradient from all the GPUs. """ for index, param in enumerate(self.params): sync_para = ( param.data if self.block_momentum == 0 else self.grads_localprev[index] @@ -182,13 +193,8 @@ def _BM_after_sync(self): # prev_param is basically last syncrhornized parameter. though # smoothed_grad is local, all processes will have same value of # smoothed_grad and hence param is globally synchronized copy. 
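Putting `_BM_before_sync`, `_allreduce_parameter` and the smoothed-gradient update together, one block synchronization for a single parameter tensor looks roughly like the toy simulation below. It replaces the all-reduce with an explicit average over simulated workers and omits the optional Nesterov-style partial update (`use_nbm`); it is a sketch of the algorithm in this patch, not the module's API.

```python
import torch


def bmuf_block_update(global_prev, local_params, smoothed_grad,
                      block_momentum=0.875, block_lr=1.0):
    """One BMUF block synchronization for a single tensor (toy version)."""
    # _BM_before_sync: block "gradient" = previously synced copy - local copy
    grads = [global_prev - p for p in local_params]
    # _allreduce_parameter: average the block gradients over all workers
    avg_grad = torch.stack(grads).mean(dim=0)
    # exponential smoothing with the block momentum, then apply it to the
    # previously synced parameters to get the new global model
    smoothed_grad = block_momentum * smoothed_grad + block_lr * avg_grad
    new_global = global_prev - smoothed_grad
    return new_global, smoothed_grad


# toy example: 4 workers whose local copies drifted apart during local steps
start = torch.zeros(3)
workers = [start + 0.1 * (i + 1) * torch.ones(3) for i in range(4)]
print(bmuf_block_update(start, workers, smoothed_grad=torch.zeros(3)))
```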
- # This is essentially a first-order infinite impulse response (IIR) - # filter with the gain (1 - BM)*BM_lr: - # smoothed_grad(t)=BM * smoothed_grad(t-1) + (1 - BM)*BM_lr*grad(t) - smoothed_grad = ( - smoothed_grad * self.block_momentum - + grad * (1 - self.block_momentum) * self.block_lr - ) + # smoothed_grad(t)=BM * smoothed_grad(t-1) + BM_lr*grad(t) + smoothed_grad = smoothed_grad * self.block_momentum + grad * self.block_lr param.data.copy_(prev_param - smoothed_grad) # A Nesterov momentum here is to do a partial weight update before # calculating the gradient From a3cfd51dd92c60381c13435f8761ca66691bea6b Mon Sep 17 00:00:00 2001 From: Naman Goyal Date: Fri, 16 Aug 2019 07:46:08 -0700 Subject: [PATCH 100/213] added hf bert bpe Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/829 Differential Revision: D16856693 fbshipit-source-id: 545bbf4815f5c40e72a6ed241312a51dc90e34a1 --- fairseq/data/encoders/hf_bert_bpe.py | 51 ++++++++++++++++++++++++++++ fairseq/models/roberta/model.py | 4 +-- 2 files changed, 53 insertions(+), 2 deletions(-) create mode 100644 fairseq/data/encoders/hf_bert_bpe.py diff --git a/fairseq/data/encoders/hf_bert_bpe.py b/fairseq/data/encoders/hf_bert_bpe.py new file mode 100644 index 0000000000..40c69d53c5 --- /dev/null +++ b/fairseq/data/encoders/hf_bert_bpe.py @@ -0,0 +1,51 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from fairseq.data.encoders import register_bpe + + +@register_bpe('bert') +class BertBPE(object): + + @staticmethod + def add_args(parser): + # fmt: off + parser.add_argument('--bpe-cased', action='store_true', + help='set for cased BPE', + default=False) + parser.add_argument('--bpe-vocab-file', type=str, + help='bpe vocab file.') + # fmt: on + + def __init__(self, args): + try: + from pytorch_transformers import BertTokenizer + from pytorch_transformers.tokenization_utils import clean_up_tokenization + except ImportError: + raise ImportError( + 'Please install 1.0.0 version of pytorch_transformers' + 'with: pip install pytorch-transformers' + ) + + if 'bpe_vocab_file' in args: + self.bert_tokenizer = BertTokenizer( + args.bpe_vocab_file, + do_lower_case=not args.bpe_cased + ) + else: + vocab_file_name = 'bert-base-cased' if args.bpe_cased else 'bert-base-uncased' + self.bert_tokenizer = BertTokenizer.from_pretrained(vocab_file_name) + self.clean_up_tokenization = clean_up_tokenization + + def encode(self, x: str) -> str: + return ' '.join(self.bert_tokenizer.tokenize(x)) + + def decode(self, x: str) -> str: + return self.clean_up_tokenization( + self.bert_tokenizer.convert_tokens_to_string(x.split(' ')) + ) + + def is_beginning_of_word(self, x: str) -> bool: + return not x.startswith('##') diff --git a/fairseq/models/roberta/model.py b/fairseq/models/roberta/model.py index bf5e7c4ef5..e5528dfc9c 100644 --- a/fairseq/models/roberta/model.py +++ b/fairseq/models/roberta/model.py @@ -127,14 +127,14 @@ def supported_targets(self): return {'self'} @classmethod - def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='.', **kwargs): + def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='.', bpe='gpt2', **kwargs): from fairseq import hub_utils x = hub_utils.from_pretrained( model_name_or_path, checkpoint_file, data_name_or_path, archive_map=cls.hub_models(), - bpe='gpt2', + bpe=bpe, load_checkpoint_heads=True, 
**kwargs, ) From 851c022610b27da3beaa4e40a6834b5fb3b44f44 Mon Sep 17 00:00:00 2001 From: Naman Goyal Date: Fri, 16 Aug 2019 12:08:09 -0700 Subject: [PATCH 101/213] added check in token block dataset for multiple consecutive blank lines Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/830 Differential Revision: D16861799 fbshipit-source-id: d85deaf78ec5b9c23eafd4145a96252e3901fa22 --- fairseq/data/token_block_dataset.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fairseq/data/token_block_dataset.py b/fairseq/data/token_block_dataset.py index 3d69cfcda4..73a8cc93a1 100644 --- a/fairseq/data/token_block_dataset.py +++ b/fairseq/data/token_block_dataset.py @@ -48,6 +48,13 @@ def __init__( assert len(dataset) == len(sizes) assert len(dataset) > 0 sizes = np.array(sizes, dtype=int) + + assert np.all(np.diff((sizes == document_sep_len).nonzero()) != 1),\ + ( + "Found multiple blank lines in the dataset, please remove them" + " (eg. cat -s raw.txt) and preprocess the data again." + ) + if break_mode is None or break_mode == 'none': total_size = sum(sizes) length = math.ceil(total_size / block_size) From 732d15a98ac7435c2f391b00e54f6959595d8dd3 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Fri, 16 Aug 2019 20:59:48 -0700 Subject: [PATCH 102/213] implement tri-stage lr_scheduler (#1028) Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1028 Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/831 tri-stage lr-scheduler consisted of 3 stages: 1. warmup; 2. hold; 3. (exponentially) decay; used in https://arxiv.org/pdf/1904.08779.pdf Reviewed By: myleott Differential Revision: D16806206 fbshipit-source-id: 40e472ec382449a0fb711f8ee980f14d27d2114a --- .../lr_scheduler/tri_stage_lr_scheduler.py | 160 ++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py diff --git a/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py b/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py new file mode 100644 index 0000000000..b5f99c54c7 --- /dev/null +++ b/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py @@ -0,0 +1,160 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from . 
import FairseqLRScheduler, register_lr_scheduler +import math + + +@register_lr_scheduler('tri_stage') +class TriStageLRSchedule(FairseqLRScheduler): + """Tristage learning rate schedulr + + Implement the learning rate scheduler in https://arxiv.org/pdf/1904.08779.pdf + + Similar to inverse_squre_root scheduler, but tri_stage learning rate employs + three stages LR scheduling: + + - warmup stage, starting from `lr` * `init_lr_scale`, linearly + increased to `lr` in `warmup_steps` iterations + + - hold stage, after `warmup_steps`, keep the LR as `lr` for `hold_steps` + iterations + + - decay stage, after hold stage, decay LR exponetially to + `lr` * `final_lr_scale` in `decay_steps`; + after that LR is keep as `final_lr_scale` * `lr` + + During warmup:: + + init_lr = args.init_lr_scale * args.lr + lrs = torch.linspace(init_lr, args.lr, args.warmup_steps) + lr = lrs[update_num] + + During hold:: + + lr = args.lr + + During decay:: + + decay_factor = - math.log(args.final_lr_scale) / args.decay_steps + lr = args.lr * exp(- (update_num - warmup_steps - decay_steps) * decay_factor) + + After that:: + + lr = args.lr * args.final_lr_scale + """ + + def __init__(self, args, optimizer): + super().__init__(args, optimizer) + if len(args.lr) > 1: + raise ValueError( + 'Cannot use a fixed learning rate schedule with tri-stage lr.' + ' Consider --lr-scheduler=fixed instead.' + ) + + # calculate LR at each point + self.peak_lr = args.lr[0] + self.init_lr = args.init_lr_scale * args.lr[0] + self.final_lr = args.final_lr_scale * args.lr[0] + + # remember the steps at each stage + self.warmup_steps = args.warmup_steps + self.hold_steps = args.hold_steps + self.decay_steps = args.decay_steps + + self.warmup_rate = (self.peak_lr - self.init_lr) / self.warmup_steps + self.decay_factor = -math.log(args.final_lr_scale) / args.decay_steps + + # initial learning rate + self.lr = self.init_lr + self.optimizer.set_lr(self.lr) + + @staticmethod + def add_args(parser): + """Add arguments to the parser for this LR scheduler.""" + # fmt: off + parser.add_argument( + '--warmup-steps', + default=4000, + type=int, + metavar='N', + help='warmup the learning rate linearly for the first N updates' + ) + parser.add_argument( + '--hold-steps', + default=20000, + type=int, + metavar='N', + help='steps in hold stage.' + ) + parser.add_argument( + '--decay-steps', + default=60000, + type=int, + metavar='N', + help='steps in decay stages' + ) + parser.add_argument( + '--init-lr-scale', + default=0.01, + type=float, + help=""" + initial learning rate scale during warmup phase; default is 0.01""") + parser.add_argument( + '--final-lr-scale', + default=0.01, + type=float, + help="final learning rate scale; default to 0.01" + ) + # fmt: on + + def _decide_stage(self, update_step): + """ + return stage, and the corresponding steps within the current stage + """ + if update_step < self.warmup_steps: + # warmup state + return 0, update_step + + offset = self.warmup_steps + + if update_step < offset + self.hold_steps: + # hold stage + return 1, update_step - offset + + offset += self.hold_steps + + if update_step <= offset + self.decay_steps: + # decay stage + return 2, update_step - offset + + offset += self.decay_steps + + # still here ? 
constant lr stage + return 3, update_step - offset + + def step(self, epoch, val_loss=None): + """Update the learning rate at the end of the given epoch.""" + super().step(epoch, val_loss) + # we don't change the learning rate at epoch boundaries + return self.optimizer.get_lr() + + def step_update(self, num_updates): + """Update the learning rate after each update.""" + stage, steps_in_stage = self._decide_stage(num_updates) + if stage == 0: + self.lr = self.init_lr + self.warmup_rate * steps_in_stage + elif stage == 1: + self.lr = self.peak_lr + elif stage == 2: + self.lr = self.peak_lr * math.exp(-self.decay_factor * steps_in_stage) + elif stage == 3: + self.lr = self.final_lr + else: + raise ValueError("Undefined stage") + + self.optimizer.set_lr(self.lr) + + return self.lr From 0c75c7603185a1de8309ac91d496f88c7858624f Mon Sep 17 00:00:00 2001 From: Chunting Zhou Date: Mon, 19 Aug 2019 07:28:41 -0700 Subject: [PATCH 103/213] Fix bug (the returned value has a dimension mismatch) in label-smoothed-cross-entropy for MoE (#1037) Summary: MoE will encounter a dimension mismatch bug when using label-smoothed cross entropy as the criterion, which occurs at [https://github.com/pytorch/fairseq/blob/master/fairseq/tasks/translation_moe.py#L125](url). This is a fix to the bug. Pull Request resolved: https://github.com/pytorch/fairseq/pull/1037 Differential Revision: D16892674 Pulled By: myleott fbshipit-source-id: a73bc03d2280356667d02422d22ad11d968d0c65 --- fairseq/criterions/label_smoothed_cross_entropy.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fairseq/criterions/label_smoothed_cross_entropy.py b/fairseq/criterions/label_smoothed_cross_entropy.py index 6687718725..92fdf8c242 100644 --- a/fairseq/criterions/label_smoothed_cross_entropy.py +++ b/fairseq/criterions/label_smoothed_cross_entropy.py @@ -16,9 +16,9 @@ def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=None, reduce=T nll_loss = -lprobs.gather(dim=-1, index=target) smooth_loss = -lprobs.sum(dim=-1, keepdim=True) if ignore_index is not None: - non_pad_mask = target.ne(ignore_index) - nll_loss = nll_loss[non_pad_mask] - smooth_loss = smooth_loss[non_pad_mask] + pad_mask = target.eq(ignore_index) + nll_loss[pad_mask] = nll_loss[pad_mask] * 0. + smooth_loss[pad_mask] = smooth_loss[pad_mask] * 0. 
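To make the three stages of the new scheduler concrete, the sketch below reproduces the arithmetic of `_decide_stage` and `step_update` as a plain function, using deliberately small made-up step counts so the transitions are easy to see; the class defaults are much larger (4000/20000/60000).

```python
import math


def tri_stage_lr(step, peak_lr=5e-4, init_lr_scale=0.01, final_lr_scale=0.01,
                 warmup_steps=100, hold_steps=200, decay_steps=300):
    """Learning rate at `step` under the tri-stage schedule described above."""
    init_lr = init_lr_scale * peak_lr
    final_lr = final_lr_scale * peak_lr
    warmup_rate = (peak_lr - init_lr) / warmup_steps
    decay_factor = -math.log(final_lr_scale) / decay_steps

    if step < warmup_steps:                    # stage 0: linear warmup
        return init_lr + warmup_rate * step
    step -= warmup_steps
    if step < hold_steps:                      # stage 1: hold at the peak
        return peak_lr
    step -= hold_steps
    if step <= decay_steps:                    # stage 2: exponential decay
        return peak_lr * math.exp(-decay_factor * step)
    return final_lr                            # stage 3: constant floor


for s in (0, 50, 100, 250, 450, 1000):
    print(s, tri_stage_lr(s))   # ramps to 5e-4, holds, then decays towards 5e-6
```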
else: nll_loss = nll_loss.squeeze(-1) smooth_loss = smooth_loss.squeeze(-1) From 02cb5a43dbf33ac81aa3ba2f28e69b3b98adc223 Mon Sep 17 00:00:00 2001 From: freewym Date: Mon, 19 Aug 2019 07:33:18 -0700 Subject: [PATCH 104/213] remove shlex.quote in scripts/spm_train.py (#972) Summary: to resolve the issue https://github.com/pytorch/fairseq/issues/971 Pull Request resolved: https://github.com/pytorch/fairseq/pull/972 Differential Revision: D16892827 Pulled By: myleott fbshipit-source-id: baf277961f1e292f4593eefe31e3541aa9d0d8c4 --- scripts/spm_train.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/spm_train.py b/scripts/spm_train.py index e95a66bfa6..9db668fd41 100644 --- a/scripts/spm_train.py +++ b/scripts/spm_train.py @@ -7,11 +7,10 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import shlex import sys import sentencepiece as spm if __name__ == "__main__": - spm.SentencePieceTrainer.Train(" ".join(map(shlex.quote, sys.argv[1:]))) + spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) From 79460d34da272354c4f4ae0036245623287006ff Mon Sep 17 00:00:00 2001 From: Trinkle23897 <463003665@qq.com> Date: Mon, 19 Aug 2019 07:38:48 -0700 Subject: [PATCH 105/213] add constrains when checking multiple consecutive blank lines (#1031) Summary: It will cause runtime error on some standard datasets (e.g. wikitext-103). Details: After preprocessing to wikitext-103 folder with current master branch, I use fairseq-train and get the following Error: ```bash Traceback (most recent call last): File "/home/trinkle/.local/bin/fairseq-train", line 11, in load_entry_point('fairseq', 'console_scripts', 'fairseq-train')() File "/data/git/Transformer/fairseq/fairseq_cli/train.py", line 321, in cli_main main(args) File "/data/git/Transformer/fairseq/fairseq_cli/train.py", line 46, in main task.load_dataset(valid_sub_split, combine=False, epoch=0) File "/data/git/Transformer/fairseq/fairseq/tasks/language_modeling.py", line 167, in load_dataset break_mode=self.args.sample_break_mode, include_targets=True, File "/data/git/Transformer/fairseq/fairseq/data/token_block_dataset.py", line 54, in init "Found multiple blank lines in the dataset, please remove them" AssertionError: Found multiple blank lines in the dataset, please remove them (eg. cat -s raw.txt) and preprocess the data again. ``` It's because these datasets have multiple blank lines. The assertion is added in https://github.com/pytorch/fairseq/commit/851c022610b27da3beaa4e40a6834b5fb3b44f44, however, adding this assertion is not a good way. Pull Request resolved: https://github.com/pytorch/fairseq/pull/1031 Differential Revision: D16892942 Pulled By: myleott fbshipit-source-id: 90c41b7d98a7b78f506bb57320f9f6b901e05d5b --- fairseq/data/token_block_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fairseq/data/token_block_dataset.py b/fairseq/data/token_block_dataset.py index 73a8cc93a1..0b5ee5da4a 100644 --- a/fairseq/data/token_block_dataset.py +++ b/fairseq/data/token_block_dataset.py @@ -49,7 +49,7 @@ def __init__( assert len(dataset) > 0 sizes = np.array(sizes, dtype=int) - assert np.all(np.diff((sizes == document_sep_len).nonzero()) != 1),\ + assert break_mode != 'complete_doc' or np.all(np.diff((sizes == document_sep_len).nonzero()) != 1),\ ( "Found multiple blank lines in the dataset, please remove them" " (eg. cat -s raw.txt) and preprocess the data again." 
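The consecutive-blank-line check added above, and then restricted to `complete_doc` mode in the follow-up fix, is a compact NumPy idiom: sentences whose size equals `document_sep_len` are document separators, and two separators at adjacent indices mean an empty document. A small illustration with toy sizes (the value 1 for a blank line is an assumption for the example):

```python
import numpy as np

document_sep_len = 1                   # assumed: a blank line binarizes to a lone EOS
break_mode = 'complete_doc'

# toy per-sentence token counts; entries equal to 1 are document separators
sizes = np.array([12, 7, 1, 9, 4, 1, 1, 5])      # separators at indices 2, 5 and 6

sep_idx = (sizes == document_sep_len).nonzero()[0]   # -> array([2, 5, 6])
consecutive = np.any(np.diff(sep_idx) == 1)          # indices 5 and 6 are adjacent
would_fail = (break_mode == 'complete_doc') and consecutive
print(would_fail)   # True: the dataset assertion would ask you to `cat -s` the raw text
```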
From 2eb53b8ef1c9f5033c669f4ebec41106a29368f9 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Mon, 19 Aug 2019 15:03:43 -0700 Subject: [PATCH 106/213] Add instructions to resume training from released RoBERTa models (fixes #1034) Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1041 Differential Revision: D16904073 Pulled By: myleott fbshipit-source-id: 22e5e25a15f7a0b6f2d827d98c953a6cec07610e --- examples/roberta/README.pretraining.md | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/examples/roberta/README.pretraining.md b/examples/roberta/README.pretraining.md index 0e82bc93fb..527d4a2e57 100644 --- a/examples/roberta/README.pretraining.md +++ b/examples/roberta/README.pretraining.md @@ -68,17 +68,20 @@ fairseq-train --fp16 $DATA_DIR \ --max-update $TOTAL_UPDATES --log-format simple --log-interval 1 ``` -The above command assumes training on 8x32GB V100 GPUs. Each GPU uses a batch -size of 16 sequences (`$MAX_SENTENCES`) and accumulates gradients to further -increase the batch size by 16x (`$UPDATE_FREQ`), for a total batch size of 2048 -sequences. If you have fewer GPUs or GPUs with less memory you may need to -reduce `$MAX_SENTENCES` and increase `$UPDATE_FREQ` to compensate. Alternatively -if you have more GPUs you can decrease `$UPDATE_FREQ` accordingly to increase -training speed. +**Note:** You can optionally resume training the released RoBERTa base model by +adding `--restore-file /path/to/roberta.base/model.pt`. -Also note that the learning rate and batch size are tightly connected and need -to be adjusted together. We generally recommend increasing the learning rate as -you increase the batch size according to the following table (although it's also +**Note:** The above command assumes training on 8x32GB V100 GPUs. Each GPU uses +a batch size of 16 sequences (`$MAX_SENTENCES`) and accumulates gradients to +further increase the batch size by 16x (`$UPDATE_FREQ`), for a total batch size +of 2048 sequences. If you have fewer GPUs or GPUs with less memory you may need +to reduce `$MAX_SENTENCES` and increase `$UPDATE_FREQ` to compensate. +Alternatively if you have more GPUs you can decrease `$UPDATE_FREQ` accordingly +to increase training speed. + +**Note:** The learning rate and batch size are tightly connected and need to be +adjusted together. 
We generally recommend increasing the learning rate as you +increase the batch size according to the following table (although it's also dataset dependent, so don't rely on the following values too closely): batch size | peak learning rate From 6ce55e4b011275e43404034832b40648b1483ff6 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Mon, 19 Aug 2019 15:04:41 -0700 Subject: [PATCH 107/213] Small fixes Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/835 Differential Revision: D16904038 Pulled By: myleott fbshipit-source-id: 2c9d0b913f8d688297ac80fcabd905bd1397f66a --- eval_lm.py | 6 +++-- examples/__init__.py | 12 ++++------ examples/noisychannel/__init__.py | 10 ++++---- examples/noisychannel/rerank.py | 8 ++++--- examples/noisychannel/rerank_generate.py | 20 ++++++++-------- examples/noisychannel/rerank_options.py | 8 +++---- examples/noisychannel/rerank_tune.py | 14 ++++++----- examples/noisychannel/rerank_utils.py | 4 ++-- examples/roberta/wsc/wsc_task.py | 4 ++-- .../models/vggtransformer.py | 4 ++-- fairseq/models/lightconv.py | 2 +- fairseq/modules/__init__.py | 4 ---- fairseq/modules/cuda_utils.cu | 7 +++--- fairseq/modules/dynamic_convolution.py | 4 +++- fairseq/modules/dynamicconv_layer/__init__.py | 10 ++++---- .../dynamicconv_layer/cuda_function_gen.py | 18 +++++++-------- .../dynamicconv_layer/dynamicconv_cuda.cpp | 7 ++++++ .../dynamicconv_layer/dynamicconv_cuda.cuh | 8 ++++--- .../dynamicconv_cuda_kernel.cu | 7 +++--- .../dynamicconv_layer/dynamicconv_layer.py | 11 +++++++-- fairseq/modules/dynamicconv_layer/setup.py | 6 +++++ fairseq/modules/lightconv_layer/__init__.py | 10 ++++---- .../lightconv_layer/cuda_function_gen.py | 18 +++++++-------- .../lightconv_layer/lightconv_cuda.cpp | 7 ++++++ .../lightconv_layer/lightconv_cuda.cuh | 7 +++--- .../lightconv_layer/lightconv_cuda_kernel.cu | 7 +++--- .../lightconv_layer/lightconv_layer.py | 10 ++++++-- fairseq/modules/lightconv_layer/setup.py | 6 +++++ fairseq/modules/lightweight_convolution.py | 2 ++ fairseq/modules/unfold.py | 1 + scripts/average_checkpoints.py | 4 ++-- scripts/compare_namespaces.py | 2 +- scripts/count_docs.py | 1 - scripts/shard_docs.py | 2 -- scripts/split_train_valid_docs.py | 8 +++++-- scripts/spm_encode.py | 2 +- scripts/wav2vec_featurize.py | 23 +++++++++++-------- tests/speech_recognition/test_collaters.py | 8 +++---- .../speech_recognition/test_cross_entropy.py | 8 +++---- tests/test_average_checkpoints.py | 1 - validate.py | 9 ++++---- 41 files changed, 176 insertions(+), 134 deletions(-) diff --git a/eval_lm.py b/eval_lm.py index febed5ac8b..f7add27eba 100644 --- a/eval_lm.py +++ b/eval_lm.py @@ -200,8 +200,10 @@ def main(parsed_args): is_bpe = False w = '' if args.output_word_probs: - print(str(int(sample_id)) + " " + - ('\t'.join('{} [{:2f}]'.format(x[0], x[1]) for x in word_prob))) + print( + str(int(sample_id)) + " " + + ('\t'.join('{} [{:2f}]'.format(x[0], x[1]) for x in word_prob)) + ) wps_meter.update(sample['ntokens']) t.log({'wps': round(wps_meter.avg)}) diff --git a/examples/__init__.py b/examples/__init__.py index 906098c1e3..35b0568cb7 100644 --- a/examples/__init__.py +++ b/examples/__init__.py @@ -1,10 +1,8 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. 
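The pretraining note above couples three knobs: per-GPU sentences, gradient accumulation and GPU count multiply into the effective batch size, and the peak learning rate is then chosen for that product. A quick sanity check of the arithmetic; only the 8-GPU row is stated in the README, the alternative configuration is illustrative.

```python
def effective_batch_size(num_gpus, max_sentences, update_freq):
    # sequences contributing to every optimizer step
    return num_gpus * max_sentences * update_freq


# documented reference setup: 8 x V100, 16 sentences per GPU, 16x accumulation
print(effective_batch_size(8, 16, 16))    # 2048 sequences, as in the README

# fewer GPUs: raise UPDATE_FREQ to keep the same effective batch size
print(effective_batch_size(2, 16, 64))    # still 2048
```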
+# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. -__version__ = '0.7.2' +__version__ = '0.8.0' -import examples.noisychannel # noqa +import examples.noisychannel # noqa diff --git a/examples/noisychannel/__init__.py b/examples/noisychannel/__init__.py index b10ddbd812..89f1aef4f6 100644 --- a/examples/noisychannel/__init__.py +++ b/examples/noisychannel/__init__.py @@ -1,8 +1,6 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. -from .rerank_options import * +from .rerank_options import * # noqa diff --git a/examples/noisychannel/rerank.py b/examples/noisychannel/rerank.py index c17d64b4a1..46f31b25c0 100644 --- a/examples/noisychannel/rerank.py +++ b/examples/noisychannel/rerank.py @@ -77,9 +77,11 @@ def score_target_hypo(args, a, b, c, lenpen, target_outfile, hypo_outfile, write for key in range(len(gen_keys)): if args.prefix_len is None: - assert hypo_lst[key] in gen_output.no_bpe_hypo[gen_keys[key]], \ - ("pred and rescore hypo mismatch: i: " + str(key) + ", " + str(hypo_lst[key]) + str(gen_keys[key]) + - str(gen_output.no_bpe_hypo[key])) + assert hypo_lst[key] in gen_output.no_bpe_hypo[gen_keys[key]], ( + "pred and rescore hypo mismatch: i: " + str(key) + ", " + + str(hypo_lst[key]) + str(gen_keys[key]) + + str(gen_output.no_bpe_hypo[key]) + ) sys_tok = dict.encode_line(hypo_lst[key]) ref_tok = dict.encode_line(gen_output.no_bpe_target[gen_keys[key]]) scorer.add(ref_tok, sys_tok) diff --git a/examples/noisychannel/rerank_generate.py b/examples/noisychannel/rerank_generate.py index 27dcdb5995..3d692b6cc8 100644 --- a/examples/noisychannel/rerank_generate.py +++ b/examples/noisychannel/rerank_generate.py @@ -1,23 +1,23 @@ #!/usr/bin/env python3 -u -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. -import rerank_utils +""" +Generate n-best translations using a trained model. +""" + +from contextlib import redirect_stdout import os import subprocess + +import rerank_utils from examples.noisychannel import rerank_options from fairseq import options import generate import preprocess -from contextlib import redirect_stdout -""" -Generate n-best translations using a trained model. -""" def gen_and_reprocess_nbest(args): if args.score_dict_dir is None: diff --git a/examples/noisychannel/rerank_options.py b/examples/noisychannel/rerank_options.py index 1f8c748b90..41a80d88d1 100644 --- a/examples/noisychannel/rerank_options.py +++ b/examples/noisychannel/rerank_options.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. 
# -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from fairseq import options diff --git a/examples/noisychannel/rerank_tune.py b/examples/noisychannel/rerank_tune.py index 805d875796..437f056cb7 100644 --- a/examples/noisychannel/rerank_tune.py +++ b/examples/noisychannel/rerank_tune.py @@ -27,12 +27,14 @@ def random_search(args): param_values += initial_params random.seed(args.seed) - random_params = np.array([[random.uniform(args.lower_bound[i], args.upper_bound[i]) - for i in range(len(args.tune_param))] - for k in range(args.num_trials)]) - set_params = np.array([[initial_params[i][0] - for i in range(len(tuneable_parameters))] - for k in range(args.num_trials)]) + random_params = np.array([ + [random.uniform(args.lower_bound[i], args.upper_bound[i]) for i in range(len(args.tune_param))] + for k in range(args.num_trials) + ]) + set_params = np.array([ + [initial_params[i][0] for i in range(len(tuneable_parameters))] + for k in range(args.num_trials) + ]) random_params = np.concatenate((random_params, set_params), 1) rerank_args = vars(args).copy() diff --git a/examples/noisychannel/rerank_utils.py b/examples/noisychannel/rerank_utils.py index 9b8bb7bec2..c64b18216d 100644 --- a/examples/noisychannel/rerank_utils.py +++ b/examples/noisychannel/rerank_utils.py @@ -128,8 +128,8 @@ def write_reprocessed(sources, hypos, targets, source_outfile, "in writing reprocessed, only one type of prefix may be used" with open(source_outfile, 'w') as source_file, \ - open(hypo_outfile, 'w') as hypo_file, \ - open(target_outfile, 'w') as target_file: + open(hypo_outfile, 'w') as hypo_file, \ + open(target_outfile, 'w') as target_file: assert len(sources) == len(hypos), "sources and hypos list length mismatch" if right_to_left: diff --git a/examples/roberta/wsc/wsc_task.py b/examples/roberta/wsc/wsc_task.py index 2af2b338cb..312361faeb 100644 --- a/examples/roberta/wsc/wsc_task.py +++ b/examples/roberta/wsc/wsc_task.py @@ -270,6 +270,7 @@ class WinograndeTask(WSCTask): Task for WinoGrande dataset. Efficient implementation for Winograd schema tasks with exactly two candidates, one of which is correct. """ + @classmethod def setup_task(cls, args, **kwargs): assert args.criterion == 'winogrande', 'Must set --criterion=winogrande' @@ -280,7 +281,6 @@ def setup_task(cls, args, **kwargs): return cls(args, vocab) - def load_dataset(self, split, epoch=0, combine=False, data_path=None, return_only=False, **kwargs): """Load a given dataset split. 
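The reshaped comprehensions in `rerank_tune.py` above draw, for every trial, one uniform sample per tuned weight inside its bounds and then append the initial values of the parameters that stay fixed. The sketch below repeats that sampling pattern with hypothetical bounds and weight names, just to show the shape of the resulting trial matrix.

```python
import random

import numpy as np

random.seed(1)

tune_param = ['weight1', 'weight2', 'weight3']   # hypothetical tuned weights
lower_bound = [0.0, 0.0, 0.0]
upper_bound = [2.0, 2.0, 2.0]
fixed_initial = [1.0]                             # e.g. a weight left at its default
num_trials = 4

random_params = np.array([
    [random.uniform(lower_bound[i], upper_bound[i]) for i in range(len(tune_param))]
    for _ in range(num_trials)
])
set_params = np.array([fixed_initial for _ in range(num_trials)])

# one row per trial: sampled tuned weights followed by the fixed values
trials = np.concatenate((random_params, set_params), 1)
print(trials.shape)   # (4, 4)
```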
@@ -299,7 +299,7 @@ def load_dataset(self, split, epoch=0, combine=False, data_path=None, return_onl candidate_masks = [] candidate_lengths = [] - itr = wsc_utils.winogrande_jsonl_iterator(data_path, eval=split=='test') + itr = wsc_utils.winogrande_jsonl_iterator(data_path, eval=(split == 'test')) for sample in itr: sentence, pronoun_span, query, cand_text = sample diff --git a/examples/speech_recognition/models/vggtransformer.py b/examples/speech_recognition/models/vggtransformer.py index 7b208a3b91..3a078ec6ff 100644 --- a/examples/speech_recognition/models/vggtransformer.py +++ b/examples/speech_recognition/models/vggtransformer.py @@ -13,7 +13,7 @@ from fairseq.models import ( FairseqEncoder, FairseqIncrementalDecoder, - FairseqModel, + FairseqEncoderDecoderModel, register_model, register_model_architecture, ) @@ -23,7 +23,7 @@ @register_model("asr_vggtransformer") -class VGGTransformerModel(FairseqModel): +class VGGTransformerModel(FairseqEncoderDecoderModel): """ Transformers with convolutional context for ASR https://arxiv.org/abs/1904.11660 diff --git a/fairseq/models/lightconv.py b/fairseq/models/lightconv.py index 44d52dcd81..087eee8a28 100644 --- a/fairseq/models/lightconv.py +++ b/fairseq/models/lightconv.py @@ -4,7 +4,6 @@ # LICENSE file in the root directory of this source tree. import math -import sys import torch import torch.nn as nn @@ -174,6 +173,7 @@ def build_embedding(dictionary, embed_dim, path=None): decoder = LightConvDecoder(args, tgt_dict, decoder_embed_tokens) return LightConvModel(encoder, decoder) + class LightConvEncoder(FairseqEncoder): """ LightConv encoder consisting of *args.encoder_layers* layers. Each layer diff --git a/fairseq/modules/__init__.py b/fairseq/modules/__init__.py index ecfdc3d697..7b38fdae30 100644 --- a/fairseq/modules/__init__.py +++ b/fairseq/modules/__init__.py @@ -10,14 +10,12 @@ from .conv_tbc import ConvTBC from .downsampled_multihead_attention import DownsampledMultiHeadAttention from .dynamic_convolution import DynamicConv, DynamicConv1dTBC -#from .dynamicconv_layer import DynamicconvLayer from .gelu import gelu, gelu_accurate from .grad_multiply import GradMultiply from .highway import Highway from .layer_norm import LayerNorm from .learned_positional_embedding import LearnedPositionalEmbedding from .lightweight_convolution import LightweightConv, LightweightConv1dTBC -#from .lightconv_layer import LightconvLayer from .linearized_convolution import LinearizedConvolution from .logsumexp_moe import LogSumExpMoE from .mean_pool_gating_network import MeanPoolGatingNetwork @@ -38,7 +36,6 @@ 'CharacterTokenEmbedder', 'ConvTBC', 'DownsampledMultiHeadAttention', -# 'DyamicconvLayer', 'DynamicConv1dTBC', 'DynamicConv', 'gelu', @@ -47,7 +44,6 @@ 'Highway', 'LayerNorm', 'LearnedPositionalEmbedding', -# 'LightconvLayer', 'LightweightConv1dTBC', 'LightweightConv', 'LinearizedConvolution', diff --git a/fairseq/modules/cuda_utils.cu b/fairseq/modules/cuda_utils.cu index 596ff125f9..516f1d9244 100644 --- a/fairseq/modules/cuda_utils.cu +++ b/fairseq/modules/cuda_utils.cu @@ -1,7 +1,8 @@ /** - * Copyright (c) 2018-present, Facebook, Inc. - * All rights reserved. - * + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
*/ diff --git a/fairseq/modules/dynamic_convolution.py b/fairseq/modules/dynamic_convolution.py index 7fbd3f37e1..19ffb0bf5b 100644 --- a/fairseq/modules/dynamic_convolution.py +++ b/fairseq/modules/dynamic_convolution.py @@ -10,6 +10,7 @@ from fairseq import utils from .unfold import unfold1d + def DynamicConv(input_size, kernel_size=1, padding_l=None, num_heads=1, weight_dropout=0., weight_softmax=False, renorm_padding=False, bias=False, conv_bias=False, @@ -28,6 +29,7 @@ def DynamicConv(input_size, kernel_size=1, padding_l=None, num_heads=1, weight_dropout=weight_dropout, weight_softmax=weight_softmax, bias=bias) + def Linear(in_features, out_features, bias=True): m = nn.Linear(in_features, out_features, bias) nn.init.xavier_uniform_(m.weight) @@ -209,7 +211,7 @@ def _forward_expanded(self, x, incremental_stat, query): # turn the convolution filters into band matrices weight_expanded = weight.new_zeros(B*H, T, T+K-1, requires_grad=False) weight_expanded.as_strided((B*H, T, K), (T*(T+K-1), T+K, 1)).copy_(weight) - weight_expanded = weight_expanded.narrow(2, P, T) # B*H x T x T + weight_expanded = weight_expanded.narrow(2, P, T) # B*H x T x T output = torch.bmm(weight_expanded, x) output = output.transpose(0, 1).contiguous().view(T, B, C) return output diff --git a/fairseq/modules/dynamicconv_layer/__init__.py b/fairseq/modules/dynamicconv_layer/__init__.py index c62ffac86c..22dc6f403d 100644 --- a/fairseq/modules/dynamicconv_layer/__init__.py +++ b/fairseq/modules/dynamicconv_layer/__init__.py @@ -1,8 +1,6 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. -from .dynamicconv_layer import DynamicconvLayer +from .dynamicconv_layer import DynamicconvLayer # noqa diff --git a/fairseq/modules/dynamicconv_layer/cuda_function_gen.py b/fairseq/modules/dynamicconv_layer/cuda_function_gen.py index caf151e4a1..926d6ca846 100644 --- a/fairseq/modules/dynamicconv_layer/cuda_function_gen.py +++ b/fairseq/modules/dynamicconv_layer/cuda_function_gen.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. def gen_forward(): @@ -13,9 +11,10 @@ def gen_forward(): head = """ /** - * Copyright (c) 2018-present, Facebook, Inc. - * All rights reserved. + * Copyright (c) Facebook, Inc. and its affiliates. * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. */ #include "dynamicconv_cuda.cuh" @@ -103,9 +102,10 @@ def gen_backward(): head = """ /** - * Copyright (c) 2018-present, Facebook, Inc. - * All rights reserved. + * Copyright (c) Facebook, Inc. and its affiliates. * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
*/ #include "dynamicconv_cuda.cuh" diff --git a/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp index b76c9e7fe2..ebd4df0e96 100644 --- a/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp +++ b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp @@ -1,3 +1,10 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + #include #include diff --git a/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh index 5d6ed575f3..2196259433 100644 --- a/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh +++ b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh @@ -1,8 +1,10 @@ /** - * Copyright (c) 2018-present, Facebook, Inc. - * All rights reserved. - * + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. */ + #include #include diff --git a/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu index f29e6ded06..300d35b647 100644 --- a/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu +++ b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu @@ -1,7 +1,8 @@ /** - * Copyright (c) 2018-present, Facebook, Inc. - * All rights reserved. - * + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. */ #include "dynamicconv_cuda.cuh" diff --git a/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py b/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py index d50e13c0d2..3e51f09fa6 100644 --- a/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py +++ b/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py @@ -1,9 +1,16 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
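Both the pure-PyTorch dynamic convolution touched earlier and the CUDA-backed `dynamicconv_layer` fall back to the same `_forward_expanded` trick: each position's convolution weights are written into a band matrix via `as_strided`, after which the whole convolution is a single batched matrix multiply. The standalone sketch below reproduces that expansion with toy shapes and checks it against an explicit loop; it illustrates the trick, it is not the module itself.

```python
import torch

B, H, T, K, R = 2, 4, 6, 3, 8        # batch, heads, time steps, kernel size, head dim
P = K - 1                            # left padding -> causal convolution
BH = B * H

weight = torch.softmax(torch.randn(BH, T, K), dim=-1)   # per-position filters
x = torch.randn(BH, T, R)                                # inputs, one row per time step

# write each time step's K weights into a band of a (T, T+K-1) matrix ...
band = weight.new_zeros(BH, T, T + K - 1)
band.as_strided((BH, T, K), (T * (T + K - 1), T + K, 1)).copy_(weight)
# ... then keep the T columns selected by the padding, so row t holds the
# weights aligned with the input positions it convolves over
band = band.narrow(2, P, T)                               # BH x T x T
out_bmm = torch.bmm(band, x)                              # BH x T x R

# reference: the same causal convolution written as an explicit loop
out_ref = torch.zeros_like(out_bmm)
for t in range(T):
    for k in range(K):
        src = t + k - P
        if 0 <= src < T:
            out_ref[:, t] += weight[:, t, k, None] * x[:, src]

print(torch.allclose(out_bmm, out_ref, atol=1e-5))        # True
```

The band matrix is O(T²) per head, so this path trades memory for one large `bmm`, which pays off for short sequences.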
+ import torch from torch import nn from torch.autograd import Function import torch.nn.functional as F + import dynamicconv_cuda from fairseq import utils +from fairseq.modules.unfold import unfold1d class dynamicconvFunction(Function): @@ -68,7 +75,7 @@ def forward(self, x, incremental_state=None, query=None, unfold=None): T, B, C = x.size() K, H = self.kernel_size, self.num_heads - R = C // H + # R = C // H # during inference time, incremental BMM is faster if incremental_state is not None: @@ -199,7 +206,7 @@ def _forward_expanded(self, x, incremental_stat, query): # turn the convolution filters into band matrices weight_expanded = weight.new_zeros(B*H, T, T+K-1, requires_grad=False) weight_expanded.as_strided((B*H, T, K), (T*(T+K-1), T+K, 1)).copy_(weight) - weight_expanded = weight_expanded.narrow(2, P, T) # B*H x T x T + weight_expanded = weight_expanded.narrow(2, P, T) # B*H x T x T output = torch.bmm(weight_expanded, x) output = output.transpose(0, 1).contiguous().view(T, B, C) return output diff --git a/fairseq/modules/dynamicconv_layer/setup.py b/fairseq/modules/dynamicconv_layer/setup.py index 00ce29bc75..4d789c3283 100644 --- a/fairseq/modules/dynamicconv_layer/setup.py +++ b/fairseq/modules/dynamicconv_layer/setup.py @@ -1,3 +1,9 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + from setuptools import setup from torch.utils.cpp_extension import CUDAExtension, BuildExtension diff --git a/fairseq/modules/lightconv_layer/__init__.py b/fairseq/modules/lightconv_layer/__init__.py index 95fe76c7cd..3b2a99c122 100644 --- a/fairseq/modules/lightconv_layer/__init__.py +++ b/fairseq/modules/lightconv_layer/__init__.py @@ -1,8 +1,6 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. -from .lightconv_layer import LightconvLayer +from .lightconv_layer import LightconvLayer # noqa diff --git a/fairseq/modules/lightconv_layer/cuda_function_gen.py b/fairseq/modules/lightconv_layer/cuda_function_gen.py index 1bb3a1a0dd..afec9e19e7 100644 --- a/fairseq/modules/lightconv_layer/cuda_function_gen.py +++ b/fairseq/modules/lightconv_layer/cuda_function_gen.py @@ -1,9 +1,7 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. def gen_forward(): @@ -13,9 +11,10 @@ def gen_forward(): head = """ /** - * Copyright (c) 2018-present, Facebook, Inc. - * All rights reserved. + * Copyright (c) Facebook, Inc. and its affiliates. * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
*/ #include "lightconv_cuda.cuh" @@ -118,9 +117,10 @@ def gen_backward(): head = """ /** - * Copyright (c) 2018-present, Facebook, Inc. - * All rights reserved. + * Copyright (c) Facebook, Inc. and its affiliates. * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. */ #include "lightconv_cuda.cuh" diff --git a/fairseq/modules/lightconv_layer/lightconv_cuda.cpp b/fairseq/modules/lightconv_layer/lightconv_cuda.cpp index 3dc1765bf0..4bf6b5ad36 100644 --- a/fairseq/modules/lightconv_layer/lightconv_cuda.cpp +++ b/fairseq/modules/lightconv_layer/lightconv_cuda.cpp @@ -1,3 +1,10 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + #include #include diff --git a/fairseq/modules/lightconv_layer/lightconv_cuda.cuh b/fairseq/modules/lightconv_layer/lightconv_cuda.cuh index f4c5fec437..3cae57b68f 100644 --- a/fairseq/modules/lightconv_layer/lightconv_cuda.cuh +++ b/fairseq/modules/lightconv_layer/lightconv_cuda.cuh @@ -1,7 +1,8 @@ /** - * Copyright (c) 2018-present, Facebook, Inc. - * All rights reserved. - * + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. */ #include diff --git a/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu b/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu index 8e17e27af1..8ee83a56c8 100644 --- a/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu +++ b/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu @@ -1,7 +1,8 @@ /** - * Copyright (c) 2018-present, Facebook, Inc. - * All rights reserved. - * + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. */ #include "lightconv_cuda.cuh" diff --git a/fairseq/modules/lightconv_layer/lightconv_layer.py b/fairseq/modules/lightconv_layer/lightconv_layer.py index 8728128277..3daff29d0f 100644 --- a/fairseq/modules/lightconv_layer/lightconv_layer.py +++ b/fairseq/modules/lightconv_layer/lightconv_layer.py @@ -1,12 +1,17 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ import torch from torch import nn from torch.autograd import Function import torch.nn.functional as F -import time import lightconv_cuda from fairseq import utils + class lightconvFunction(Function): @staticmethod @@ -26,6 +31,7 @@ def backward(ctx, grad_output): grad_input, grad_weights = outputs return grad_input, grad_weights, None + class LightconvLayer(nn.Module): def __init__( self, @@ -82,7 +88,7 @@ def forward(self, x, incremental_state=None): weight = weight.view(1, H, K).expand(T*B, H, K).contiguous().view(T*B*H, K, 1) weight = F.dropout(weight, self.weight_dropout, training=self.training) - output = torch.bmm(x_unfold, weight) # T*B*H x R x 1 + output = torch.bmm(x_unfold, weight) # T*B*H x R x 1 output = output.view(T, B, C) return output diff --git a/fairseq/modules/lightconv_layer/setup.py b/fairseq/modules/lightconv_layer/setup.py index c2a928ed82..0eac1df03c 100644 --- a/fairseq/modules/lightconv_layer/setup.py +++ b/fairseq/modules/lightconv_layer/setup.py @@ -1,3 +1,9 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + from setuptools import setup from torch.utils.cpp_extension import CUDAExtension, BuildExtension diff --git a/fairseq/modules/lightweight_convolution.py b/fairseq/modules/lightweight_convolution.py index 95d0418af6..037dd17925 100644 --- a/fairseq/modules/lightweight_convolution.py +++ b/fairseq/modules/lightweight_convolution.py @@ -10,6 +10,7 @@ from fairseq import utils from fairseq.modules.unfold import unfold1d + def LightweightConv(input_size, kernel_size=1, padding_l=None, num_heads=1, weight_dropout=0., weight_softmax=False, bias=False): if torch.cuda.is_available(): @@ -26,6 +27,7 @@ def LightweightConv(input_size, kernel_size=1, padding_l=None, num_heads=1, weight_dropout=weight_dropout, weight_softmax=weight_softmax, bias=bias) + class LightweightConv1d(nn.Module): '''Lightweight Convolution assuming the input is BxCxT This is just an example that explains LightConv clearer than the TBC version. 
diff --git a/fairseq/modules/unfold.py b/fairseq/modules/unfold.py index eff6ab575b..3a142db698 100644 --- a/fairseq/modules/unfold.py +++ b/fairseq/modules/unfold.py @@ -5,6 +5,7 @@ import torch.nn.functional as F + def unfold1d(x, kernel_size, padding_l, pad_value=0): '''unfold T x B x C to T x B x C x K''' if kernel_size > 1: diff --git a/scripts/average_checkpoints.py b/scripts/average_checkpoints.py index e5e9bce156..190c59ad08 100644 --- a/scripts/average_checkpoints.py +++ b/scripts/average_checkpoints.py @@ -121,9 +121,9 @@ def main(): num = args.num_epoch_checkpoints assert args.checkpoint_upper_bound is None or args.num_epoch_checkpoints is not None, \ - '--checkpoint-upper-bound requires --num-epoch-checkpoints' + '--checkpoint-upper-bound requires --num-epoch-checkpoints' assert args.num_epoch_checkpoints is None or args.num_update_checkpoints is None, \ - 'Cannot combine --num-epoch-checkpoints and --num-update-checkpoints' + 'Cannot combine --num-epoch-checkpoints and --num-update-checkpoints' if num is not None: args.inputs = last_n_checkpoints( diff --git a/scripts/compare_namespaces.py b/scripts/compare_namespaces.py index 52ba2b9fb4..db5121189a 100644 --- a/scripts/compare_namespaces.py +++ b/scripts/compare_namespaces.py @@ -1,7 +1,7 @@ #!/usr/bin/env python """Helper script to compare two argparse.Namespace objects.""" -from argparse import Namespace +from argparse import Namespace # noqa def main(): diff --git a/scripts/count_docs.py b/scripts/count_docs.py index 13640f4b6f..8d185398a7 100644 --- a/scripts/count_docs.py +++ b/scripts/count_docs.py @@ -10,7 +10,6 @@ import argparse import gzip -import random import sys import numpy as np diff --git a/scripts/shard_docs.py b/scripts/shard_docs.py index f1adac72aa..e30d4a1229 100644 --- a/scripts/shard_docs.py +++ b/scripts/shard_docs.py @@ -10,8 +10,6 @@ import argparse import contextlib -import random -import sys def main(): diff --git a/scripts/split_train_valid_docs.py b/scripts/split_train_valid_docs.py index 41fb979ad1..9adf99634c 100644 --- a/scripts/split_train_valid_docs.py +++ b/scripts/split_train_valid_docs.py @@ -19,6 +19,8 @@ def main(): parser.add_argument('sample_output', help='train output file') parser.add_argument('remainder_output', help='valid output file') parser.add_argument('-k', type=int, help="remainder size") + parser.add_argument('--lines', action='store_true', + help='split lines instead of docs') args = parser.parse_args() assert args.k is not None @@ -48,6 +50,8 @@ def update_sample(doc): update_sample(doc) else: doc.append(line) + if args.lines: + update_sample(doc) if i % 1000000 == 0: print(i, file=sys.stderr, end="", flush=True) elif i % 100000 == 0: @@ -61,7 +65,7 @@ def update_sample(doc): with open(args.sample_output, 'w', encoding='utf-8') as out: first = True for doc in sample: - if not first: + if not first and not args.lines: out.write("\n") first = False for line in doc: @@ -70,7 +74,7 @@ def update_sample(doc): with open(args.remainder_output, 'w', encoding='utf-8') as out: first = True for doc in remainder: - if not first: + if not first and not args.lines: out.write("\n") first = False for line in doc: diff --git a/scripts/spm_encode.py b/scripts/spm_encode.py index eda9d62204..e1cb54192a 100644 --- a/scripts/spm_encode.py +++ b/scripts/spm_encode.py @@ -30,7 +30,7 @@ def main(): args = parser.parse_args() assert len(args.inputs) == len(args.outputs), \ - "number of input and output paths should match" + "number of input and output paths should match" sp = 
spm.SentencePieceProcessor() sp.Load(args.model) diff --git a/scripts/wav2vec_featurize.py b/scripts/wav2vec_featurize.py index 70764d7933..31e12433f9 100644 --- a/scripts/wav2vec_featurize.py +++ b/scripts/wav2vec_featurize.py @@ -1,22 +1,27 @@ -""" Helper script to pre-compute embeddings for a wav2letter++ dataset +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +""" +Helper script to pre-compute embeddings for a wav2letter++ dataset """ -import glob, os -import tqdm +import argparse +import glob +import os from shutil import copy -import soundfile as sf - import h5py +import soundfile as sf import numpy as np - import torch from torch import nn +import tqdm from fairseq.models.wav2vec import Wav2VecModel -import argparse - def read_audio(fname): """ Load an audio file and return PCM along with the sample rate """ @@ -228,4 +233,4 @@ def __repr__(self): if not args.no_copy_labels: print("Copying label data...") writer.copy_labels() - print("Done.") \ No newline at end of file + print("Done.") diff --git a/tests/speech_recognition/test_collaters.py b/tests/speech_recognition/test_collaters.py index efb0e58792..6a5029a48f 100644 --- a/tests/speech_recognition/test_collaters.py +++ b/tests/speech_recognition/test_collaters.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import unittest diff --git a/tests/speech_recognition/test_cross_entropy.py b/tests/speech_recognition/test_cross_entropy.py index 11daf4166f..508d490e01 100644 --- a/tests/speech_recognition/test_cross_entropy.py +++ b/tests/speech_recognition/test_cross_entropy.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. # -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. from examples.speech_recognition.criterions.cross_entropy_acc import CrossEntropyWithAccCriterion from .asr_test_base import CrossEntropyCriterionTestBase diff --git a/tests/test_average_checkpoints.py b/tests/test_average_checkpoints.py index 21f12cb421..8ed298c3c9 100644 --- a/tests/test_average_checkpoints.py +++ b/tests/test_average_checkpoints.py @@ -14,7 +14,6 @@ from torch import nn - from scripts.average_checkpoints import average_checkpoints diff --git a/validate.py b/validate.py index ed8f41e400..f768e8cce7 100644 --- a/validate.py +++ b/validate.py @@ -1,10 +1,9 @@ #!/usr/bin/env python3 -u -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. 
# -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch From c81fed46ac7868c6d80206ff71c6f6cfe93aee22 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Mon, 19 Aug 2019 15:19:24 -0700 Subject: [PATCH 108/213] Back out "[fairseq][PR] Fix bug (the returned value has a dimension mismatch) in label-smoothed-cross-entropy for MoE" (#837) Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/837 Original commit changeset: a73bc03d2280 Differential Revision: D16904372 fbshipit-source-id: b4c4047b2686ba47258cdf0783059726134c920a --- fairseq/criterions/label_smoothed_cross_entropy.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fairseq/criterions/label_smoothed_cross_entropy.py b/fairseq/criterions/label_smoothed_cross_entropy.py index 92fdf8c242..6687718725 100644 --- a/fairseq/criterions/label_smoothed_cross_entropy.py +++ b/fairseq/criterions/label_smoothed_cross_entropy.py @@ -16,9 +16,9 @@ def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=None, reduce=T nll_loss = -lprobs.gather(dim=-1, index=target) smooth_loss = -lprobs.sum(dim=-1, keepdim=True) if ignore_index is not None: - pad_mask = target.eq(ignore_index) - nll_loss[pad_mask] = nll_loss[pad_mask] * 0. - smooth_loss[pad_mask] = smooth_loss[pad_mask] * 0. + non_pad_mask = target.ne(ignore_index) + nll_loss = nll_loss[non_pad_mask] + smooth_loss = smooth_loss[non_pad_mask] else: nll_loss = nll_loss.squeeze(-1) smooth_loss = smooth_loss.squeeze(-1) From 4812f64b651ab64881510d38d4e35ce4ce22b04f Mon Sep 17 00:00:00 2001 From: Dmytro Okhonko Date: Tue, 20 Aug 2019 12:35:57 -0700 Subject: [PATCH 109/213] Fix method has same name as property Summary: Training is failing sometimes because `self.collater` can be both method and property for AsrDataset https://github.com/pytorch/fairseq/issues/1036 Reviewed By: jcai1 Differential Revision: D16919945 fbshipit-source-id: b34ba54e4dae315b7c723996610a348a8e3031af --- examples/speech_recognition/data/asr_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/speech_recognition/data/asr_dataset.py b/examples/speech_recognition/data/asr_dataset.py index a848370607..b95b71d6af 100644 --- a/examples/speech_recognition/data/asr_dataset.py +++ b/examples/speech_recognition/data/asr_dataset.py @@ -72,7 +72,7 @@ def __getitem__(self, index): frame_shift=self.frame_shift ) output_cmvn = data_utils.apply_mv_norm(output) - self.collater = Seq2SeqCollater( + self.s2s_collater = Seq2SeqCollater( 0, 1, pad_index=self.tgt_dict.pad(), eos_index=self.tgt_dict.eos(), move_eos_to_beginning=True ) @@ -91,7 +91,7 @@ def collater(self, samples): Returns: dict: a mini-batch suitable for forwarding with a Model """ - return self.collater.collate(samples) + return self.s2s_collater.collate(samples) def num_tokens(self, index): return self.frame_sizes[index] From 9e5edc104a40708912f4cf463cadbaf912c1a263 Mon Sep 17 00:00:00 2001 From: Arya McCarthy Date: Tue, 20 Aug 2019 14:24:28 -0700 Subject: [PATCH 110/213] Give path when checkpoint can't be found (#1040) Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1040 Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/836 Reviewed By: myleott, 
liezl200 Differential Revision: D16889252 fbshipit-source-id: 45a1b6c1217fb099f0350096e38e1c7d83ea0a64 --- fairseq/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fairseq/trainer.py b/fairseq/trainer.py index 507687e266..ce0e74dc9f 100644 --- a/fairseq/trainer.py +++ b/fairseq/trainer.py @@ -150,8 +150,8 @@ def load_checkpoint( self.get_model().load_state_dict(state['model'], strict=True) except Exception: raise Exception( - 'Cannot load model parameters from checkpoint, ' - 'please ensure that the architectures match.' + 'Cannot load model parameters from checkpoint {}; ' + 'please ensure that the architectures match.'.format(filename) ) extra_state = state['extra_state'] From 7a31fe068c8015e4ee9f8d45887efcd722b53fc0 Mon Sep 17 00:00:00 2001 From: Siddharth Dalmia Date: Tue, 20 Aug 2019 21:07:19 -0700 Subject: [PATCH 111/213] vggblock support without pooling and pooling_kernel_size missing self (#839) Summary: 1) VggBlock was not supported if pooling kernel size was None. 2) Since we modify pooling kernel size by using _pair. We should use self.pooling_kernel_size. But I agree it doesn't matter as pytorch is robust to this. Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/839 Differential Revision: D16934112 Pulled By: okhonko fbshipit-source-id: b6b95163b0e7f7203d76d535f01a41912382bdc3 --- fairseq/modules/vggblock.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fairseq/modules/vggblock.py b/fairseq/modules/vggblock.py index 193026b16f..ee5ee19a34 100644 --- a/fairseq/modules/vggblock.py +++ b/fairseq/modules/vggblock.py @@ -103,11 +103,12 @@ def __init__( input_dim = per_channel_dim self.layers.append(nn.ReLU()) - pool_op = nn.MaxPool2d(kernel_size=pooling_kernel_size, ceil_mode=True) - self.layers.append(pool_op) - self.total_output_dim, self.output_dim = infer_conv_output_dim( - pool_op, input_dim, out_channels - ) + if self.pooling_kernel_size is not None: + pool_op = nn.MaxPool2d(kernel_size=self.pooling_kernel_size, ceil_mode=True) + self.layers.append(pool_op) + self.total_output_dim, self.output_dim = infer_conv_output_dim( + pool_op, input_dim, out_channels + ) def forward(self, x): for i, _ in enumerate(self.layers): From a2f5361d7021ca49184545170d6dddc032647f3d Mon Sep 17 00:00:00 2001 From: alexeib Date: Wed, 21 Aug 2019 09:53:50 -0700 Subject: [PATCH 112/213] Multiset (#838) Summary: Adds ability to tag individual examples with the names of their datasets, along with some minor miscellaneous fixes and improvements Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/838 Differential Revision: D16919175 Pulled By: alexeib fbshipit-source-id: 4bf493299645bae63f3ee6382e15f18a9f73666c --- fairseq/data/__init__.py | 12 +- fairseq/data/concat_dataset.py | 4 + fairseq/data/fairseq_dataset.py | 3 + fairseq/data/prepend_dataset.py | 28 ++++ fairseq/data/replace_dataset.py | 26 ++++ fairseq/data/sharded_dataset.py | 60 ++++++++ fairseq/data/subsample_dataset.py | 57 ++++++++ fairseq/data/token_block_dataset.py | 122 +++++++++------- fairseq/tasks/language_modeling.py | 94 ++++++++----- fairseq/tasks/tagged_language_modeling.py | 164 ++++++++++++++++++++++ train.py | 1 - 11 files changed, 482 insertions(+), 89 deletions(-) create mode 100644 fairseq/data/prepend_dataset.py create mode 100644 fairseq/data/replace_dataset.py create mode 100644 fairseq/data/sharded_dataset.py create mode 100644 fairseq/data/subsample_dataset.py create mode 100644 
fairseq/tasks/tagged_language_modeling.py diff --git a/fairseq/data/__init__.py b/fairseq/data/__init__.py index f97eaa9fab..f3ecd7d178 100644 --- a/fairseq/data/__init__.py +++ b/fairseq/data/__init__.py @@ -27,11 +27,15 @@ from .num_samples_dataset import NumSamplesDataset from .offset_tokens_dataset import OffsetTokensDataset from .pad_dataset import LeftPadDataset, PadDataset, RightPadDataset +from .prepend_dataset import PrependDataset from .prepend_token_dataset import PrependTokenDataset from .raw_label_dataset import RawLabelDataset +from .replace_dataset import ReplaceDataset from .round_robin_zip_datasets import RoundRobinZipDatasets +from .sharded_dataset import ShardedDataset from .sort_dataset import SortDataset from .strip_token_dataset import StripTokenDataset +from .subsample_dataset import SubsampleDataset from .token_block_dataset import TokenBlockDataset from .transform_eos_dataset import TransformEosDataset from .transform_eos_lang_pair_dataset import TransformEosLangPairDataset @@ -72,14 +76,18 @@ 'NumSamplesDataset', "OffsetTokensDataset", 'PadDataset', + 'PrependDataset', 'PrependTokenDataset', 'RawAudioDataset', - "RawLabelDataset", + 'RawLabelDataset', + 'ReplaceDataset', 'RightPadDataset', 'RoundRobinZipDatasets', + 'ShardedDataset', 'ShardedIterator', 'SortDataset', - "StripTokenDataset", + 'StripTokenDataset', + 'SubsampleDataset', 'TokenBlockDataset', 'TransformEosDataset', 'TransformEosLangPairDataset', diff --git a/fairseq/data/concat_dataset.py b/fairseq/data/concat_dataset.py index 659af9ae75..b61ebbe46d 100644 --- a/fairseq/data/concat_dataset.py +++ b/fairseq/data/concat_dataset.py @@ -64,6 +64,10 @@ def size(self, idx: int): def num_tokens(self, index: int): return np.max(self.size(index)) + def attr(self, attr: str, index: int): + dataset_idx = bisect.bisect_right(self.cumulative_sizes, index) + return getattr(self.datasets[dataset_idx], attr, None) + @property def sizes(self): return np.concatenate( diff --git a/fairseq/data/fairseq_dataset.py b/fairseq/data/fairseq_dataset.py index 6144beca4e..ca6fd47dc1 100644 --- a/fairseq/data/fairseq_dataset.py +++ b/fairseq/data/fairseq_dataset.py @@ -47,6 +47,9 @@ def supports_prefetch(self): """Whether this dataset supports prefetching.""" return False + def attr(self, attr: str, index: int): + return getattr(self, attr, None) + def prefetch(self, indices): """Prefetch the data required for this epoch.""" raise NotImplementedError diff --git a/fairseq/data/prepend_dataset.py b/fairseq/data/prepend_dataset.py new file mode 100644 index 0000000000..ad74784d2d --- /dev/null +++ b/fairseq/data/prepend_dataset.py @@ -0,0 +1,28 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import numpy as np +import torch + +from . 
import BaseWrapperDataset + + +class PrependDataset(BaseWrapperDataset): + def __init__(self, dataset, prepend_getter, ensure_first_token_is=None): + super().__init__(dataset) + self.prepend_getter = prepend_getter + self.ensure_first_token = ensure_first_token_is + + def __getitem__(self, idx): + item = self.dataset[idx] + is_tuple = isinstance(item, tuple) + src = item[0] if is_tuple else item + + assert self.ensure_first_token is None or src[0] == self.ensure_first_token + prepend_idx = self.prepend_getter(self.dataset, idx) + assert isinstance(prepend_idx, int) + src[0] = prepend_idx + item = tuple((src,) + item[1:]) if is_tuple else src + return item diff --git a/fairseq/data/replace_dataset.py b/fairseq/data/replace_dataset.py new file mode 100644 index 0000000000..670b812f45 --- /dev/null +++ b/fairseq/data/replace_dataset.py @@ -0,0 +1,26 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from . import BaseWrapperDataset + + +class ReplaceDataset(BaseWrapperDataset): + def __init__(self, dataset, replace_map, offset=0): + super().__init__(dataset) + assert len(replace_map) > 0 + self.replace_map = replace_map + self.offset = offset + + def __getitem__(self, index): + item = self.dataset[index] + is_tuple = isinstance(item, tuple) + src = item[0] if is_tuple else item + + for k, v in self.replace_map.items(): + src_off = src[self.offset:] + src_off.masked_fill_(src_off == k, v) + + item = tuple((src,) + item[1:]) if is_tuple else src + return item diff --git a/fairseq/data/sharded_dataset.py b/fairseq/data/sharded_dataset.py new file mode 100644 index 0000000000..f7ef4e36cd --- /dev/null +++ b/fairseq/data/sharded_dataset.py @@ -0,0 +1,60 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import itertools +import os +import random + +from . import BaseWrapperDataset +from fairseq.data import data_utils + + +class ShardedDataset(BaseWrapperDataset): + """A :class:`~fairseq.data.FairseqDataset` wrapper that appends/prepends/strips EOS. + + Loads a dataset which has been sharded into multiple files. 
each shard is only loaded for each specific epoch + + """ + + def __init__( + self, + dictionary, + dataset_impl: str, + path: str, + split: str, + epoch: int, + name: str = None, + combine: bool = False, + seed: int = 0, + ): + self._name = name if name is not None else os.path.basename(path) + num_shards = 0 + for i in itertools.count(): + if not os.path.exists(os.path.join(path, "shard" + str(i))): + break + num_shards += 1 + + if num_shards > 0 and split == "train": + random.seed(seed ^ epoch) + shard = random.randint(0, num_shards - 1) + split_path = os.path.join(path, "shard" + str(shard), split) + else: + split_path = os.path.join(path, split) + if os.path.isdir(split_path): + split_path = os.path.join(split_path, split) + + dataset = data_utils.load_indexed_dataset( + split_path, dictionary, dataset_impl, combine=combine + ) + if dataset is None: + raise FileNotFoundError( + "Dataset not found: {} ({})".format(split, split_path) + ) + + super().__init__(dataset) + + @property + def name(self): + return self._name diff --git a/fairseq/data/subsample_dataset.py b/fairseq/data/subsample_dataset.py new file mode 100644 index 0000000000..91c4b0dd35 --- /dev/null +++ b/fairseq/data/subsample_dataset.py @@ -0,0 +1,57 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import numpy as np + +from . import BaseWrapperDataset + + +class SubsampleDataset(BaseWrapperDataset): + def __init__(self, dataset, size_ratio): + super().__init__(dataset) + assert size_ratio < 1 + self.actual_size = np.ceil(len(dataset) * size_ratio).astype(int) + self.indices = np.random.choice( + range(len(self.dataset)), self.actual_size, replace=False + ) + print( + f"subsampled dataset from {len(self.dataset)} to {self.actual_size} (ratio={size_ratio})" + ) + + def __getitem__(self, index): + return self.dataset[self.indices[index]] + + def __len__(self): + return self.actual_size + + def collater(self, samples): + return self.dataset.collater(samples) + + @property + def sizes(self): + return self.dataset.sizes[self.indices] + + @property + def name(self): + return self.dataset.name + + def num_tokens(self, index): + return self.dataset.num_tokens(self.indices[index]) + + def size(self, index): + return self.dataset.size(self.indices[index]) + + def ordered_indices(self): + """Return an ordered list of indices. 
Batches will be constructed based + on this order.""" + if self.shuffle: + order = [np.random.permutation(len(self))] + else: + order = [np.arange(len(self))] + order.append(self.sizes) + return np.lexsort(order) + + def prefetch(self, indices): + self.dataset.prefetch(self.indices[indices]) diff --git a/fairseq/data/token_block_dataset.py b/fairseq/data/token_block_dataset.py index 0b5ee5da4a..6dd2cc8615 100644 --- a/fairseq/data/token_block_dataset.py +++ b/fairseq/data/token_block_dataset.py @@ -35,8 +35,15 @@ class TokenBlockDataset(FairseqDataset): """ def __init__( - self, dataset, sizes, block_size, pad, eos, break_mode=None, - include_targets=False, document_sep_len=1, + self, + dataset, + sizes, + block_size, + pad, + eos, + break_mode=None, + include_targets=False, + document_sep_len=1, ): super().__init__() self.dataset = dataset @@ -49,13 +56,7 @@ def __init__( assert len(dataset) > 0 sizes = np.array(sizes, dtype=int) - assert break_mode != 'complete_doc' or np.all(np.diff((sizes == document_sep_len).nonzero()) != 1),\ - ( - "Found multiple blank lines in the dataset, please remove them" - " (eg. cat -s raw.txt) and preprocess the data again." - ) - - if break_mode is None or break_mode == 'none': + if break_mode is None or break_mode == "none": total_size = sum(sizes) length = math.ceil(total_size / block_size) @@ -65,7 +66,7 @@ def block_at(i): return (start, end) slice_indices = [block_at(i) for i in range(length)] - elif break_mode == 'complete': + elif break_mode == "complete": tok_idx = 0 sz_idx = 0 curr_size = 0 @@ -79,7 +80,7 @@ def block_at(i): curr_size = 0 if curr_size > 0: slice_indices.append((tok_idx, tok_idx + curr_size)) - elif break_mode == 'complete_doc': + elif break_mode == "complete_doc": tok_idx = 0 sz_idx = 0 curr_size = 0 @@ -92,15 +93,16 @@ def block_at(i): curr_size += sizes[sz_idx] sz_idx += 1 else: - slice_indices.append((tok_idx, tok_idx + curr_size)) + if curr_size > 1: + slice_indices.append((tok_idx, tok_idx + curr_size)) tok_idx += curr_size curr_size = 0 if sizes[sz_idx] == document_sep_len: tok_idx += sizes[sz_idx] sz_idx += 1 - if curr_size > 0: + if curr_size > 1: slice_indices.append((tok_idx, tok_idx + curr_size)) - elif break_mode == 'eos': + elif break_mode == "eos": slice_indices = np.empty((len(sizes), 2), dtype=int) if not torch.is_tensor(sizes): sizes = torch.tensor(sizes) @@ -109,19 +111,21 @@ def block_at(i): if len(cumsum) > 1: slice_indices[1:] = cumsum.unfold(0, 2, 1) else: - raise ValueError('Invalid break_mode: ' + break_mode) + raise ValueError("Invalid break_mode: " + break_mode) slice_indices = np.array(slice_indices, dtype=int) self._sizes = slice_indices[:, 1] - slice_indices[:, 0] # build index mapping block indices to the underlying dataset indices - if break_mode == 'eos': + if break_mode == "eos": # much faster version for eos break mode block_to_dataset_index = np.stack( [ np.arange(len(sizes)), # starting index in dataset - np.zeros(len(sizes), dtype=np.long), # starting offset within starting index - np.arange(len(sizes)) # ending index in dataset + np.zeros( + len(sizes), dtype=np.long + ), # starting offset within starting index + np.arange(len(sizes)), # ending index in dataset ], 1, ) @@ -133,9 +137,10 @@ def block_at(i): start_ds_idx = ds.current_index start_offset = ds.current_offset if e <= s: - continue - ds.seek(e - 1) - end_ds_idx = ds.current_index + end_ds_idx = start_ds_idx + else: + ds.seek(e - 1) + end_ds_idx = ds.current_index block_to_dataset_index[i] = ( start_ds_idx, # starting index in dataset 
start_offset, # starting offset within starting index @@ -158,11 +163,17 @@ def sizes(self): def block_to_dataset_index(self): return self._block_to_dataset_index.array + def attr(self, attr: str, index: int): + start_ds_idx, _, _ = self.block_to_dataset_index[index] + return self.dataset.attr(attr, start_ds_idx) + def __getitem__(self, index): start_ds_idx, start_offset, end_ds_idx = self.block_to_dataset_index[index] - buffer = torch.cat([ - self.dataset[idx] for idx in range(start_ds_idx, end_ds_idx + 1) - ]) + + buffer = torch.cat( + [self.dataset[idx] for idx in range(start_ds_idx, end_ds_idx + 1)] + ) + slice_s, slice_e = self.slice_indices[index] length = slice_e - slice_s s, e = start_offset, start_offset + length @@ -173,16 +184,19 @@ def __getitem__(self, index): # *source* is shifted right by 1 (maybe left-padded with eos) # *past_target* is shifted right by 2 (left-padded as needed) if s == 0: - source = torch.cat([item.new([self.eos]), buffer[0:e - 1]]) - past_target = torch.cat([item.new([self.pad, self.eos]), buffer[0:e - 2]]) + source = torch.cat([item.new([self.eos]), buffer[0 : e - 1]]) + past_target = torch.cat( + [item.new([self.pad, self.eos]), buffer[0 : e - 2]] + ) else: - source = buffer[s - 1:e - 1] + source = buffer[s - 1 : e - 1] if s == 1: - past_target = torch.cat([item.new([self.eos]), buffer[0:e - 2]]) + past_target = torch.cat([item.new([self.eos]), buffer[0 : e - 2]]) else: - past_target = buffer[s - 2:e - 2] + past_target = buffer[s - 2 : e - 2] return source, item, past_target + return item def __len__(self): @@ -190,15 +204,17 @@ def __len__(self): @property def supports_prefetch(self): - return getattr(self.dataset, 'supports_prefetch', False) + return getattr(self.dataset, "supports_prefetch", False) def prefetch(self, indices): - self.dataset.prefetch({ - ds_idx - for index in indices - for start_ds_idx, _, end_ds_idx in [self.block_to_dataset_index[index]] - for ds_idx in range(start_ds_idx, end_ds_idx + 1) - }) + self.dataset.prefetch( + { + ds_idx + for index in indices + for start_ds_idx, _, end_ds_idx in [self.block_to_dataset_index[index]] + for ds_idx in range(start_ds_idx, end_ds_idx + 1) + } + ) class DatasetSearcher(object): @@ -216,17 +232,25 @@ def reset(self): def seek(self, i): assert i >= 0 - if i < self.current_i: - self.reset() - if i > self.current_i: - to_consume = i - self.current_i - remaining = self.sizes[self.current_index] - self.current_offset - if remaining > to_consume: - self.current_offset += to_consume - self.current_i += to_consume - else: - self.current_i += remaining - self.current_index += 1 - self.current_offset = 0 - self.seek(i) + + def step(): + if i < self.current_i: + self.reset() + if i > self.current_i: + to_consume = i - self.current_i + remaining = self.sizes[self.current_index] - self.current_offset + if remaining > to_consume: + self.current_offset += to_consume + self.current_i += to_consume + else: + assert remaining > 0 + self.current_i += remaining + self.current_index += 1 + self.current_offset = 0 + return True + return False + + not_done = True + while not_done: + not_done = step() assert self.current_i == i diff --git a/fairseq/tasks/language_modeling.py b/fairseq/tasks/language_modeling.py index f6d0d70b62..91235b488a 100644 --- a/fairseq/tasks/language_modeling.py +++ b/fairseq/tasks/language_modeling.py @@ -19,7 +19,7 @@ from fairseq.tasks import FairseqTask, register_task -@register_task('language_modeling') +@register_task("language_modeling") class LanguageModelingTask(FairseqTask): """ Train 
a language model. @@ -87,7 +87,7 @@ def __init__(self, args, dictionary, output_dictionary=None, targets=None): self.output_dictionary = output_dictionary or dictionary if targets is None: - targets = ['future'] + targets = ["future"] self.targets = targets @classmethod @@ -97,38 +97,44 @@ def setup_task(cls, args, **kwargs): Args: args (argparse.Namespace): parsed command-line arguments """ - if getattr(args, 'raw_text', False): - utils.deprecation_warning('--raw-text is deprecated, please use --dataset-impl=raw') - args.dataset_impl = 'raw' - elif getattr(args, 'lazy_load', False): - utils.deprecation_warning('--lazy-load is deprecated, please use --dataset-impl=lazy') - args.dataset_impl = 'lazy' + if getattr(args, "raw_text", False): + utils.deprecation_warning( + "--raw-text is deprecated, please use --dataset-impl=raw" + ) + args.dataset_impl = "raw" + elif getattr(args, "lazy_load", False): + utils.deprecation_warning( + "--lazy-load is deprecated, please use --dataset-impl=lazy" + ) + args.dataset_impl = "lazy" dictionary = None output_dictionary = None if args.data: - paths = args.data.split(':') + paths = args.data.split(":") assert len(paths) > 0 - dictionary = Dictionary.load(os.path.join(paths[0], 'dict.txt')) - print('| dictionary: {} types'.format(len(dictionary))) + dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt")) + print("| dictionary: {} types".format(len(dictionary))) output_dictionary = dictionary if args.output_dictionary_size >= 0: - output_dictionary = TruncatedDictionary(dictionary, args.output_dictionary_size) + output_dictionary = TruncatedDictionary( + dictionary, args.output_dictionary_size + ) # upgrade old checkpoints - if hasattr(args, 'exclude_self_target'): + if hasattr(args, "exclude_self_target"): args.self_target = not args.exclude_self_target targets = [] - if getattr(args, 'self_target', False): - targets.append('self') - if getattr(args, 'future_target', False): - targets.append('future') - if getattr(args, 'past_target', False): - targets.append('past') + if getattr(args, "self_target", False): + targets.append("self") + if getattr(args, "future_target", False): + targets.append("future") + if getattr(args, "past_target", False): + targets.append("past") if len(targets) == 0: # standard language modeling - targets = ['future'] + targets = ["future"] return cls(args, dictionary, output_dictionary, targets=targets) @@ -137,7 +143,9 @@ def build_model(self, args): for target in self.targets: if target not in model.supported_targets: - raise ValueError('Unsupported language modeling target: {}'.format(target)) + raise ValueError( + "Unsupported language modeling target: {}".format(target) + ) return model @@ -147,32 +155,44 @@ def load_dataset(self, split, epoch=0, combine=False, **kwargs): Args: split (str): name of the split (e.g., train, valid, test) """ - paths = self.args.data.split(':') + paths = self.args.data.split(":") assert len(paths) > 0 + data_path = paths[epoch % len(paths)] split_path = os.path.join(data_path, split) dataset = data_utils.load_indexed_dataset( - split_path, - self.dictionary, - self.args.dataset_impl, - combine=combine, + split_path, self.dictionary, self.args.dataset_impl, combine=combine ) if dataset is None: - raise FileNotFoundError('Dataset not found: {} ({})'.format(split, split_path)) + raise FileNotFoundError( + "Dataset not found: {} ({})".format(split, split_path) + ) dataset = TokenBlockDataset( - dataset, dataset.sizes, self.args.tokens_per_sample, - pad=self.dictionary.pad(), 
eos=self.dictionary.eos(), - break_mode=self.args.sample_break_mode, include_targets=True, + dataset, + dataset.sizes, + self.args.tokens_per_sample, + pad=self.dictionary.pad(), + eos=self.dictionary.eos(), + break_mode=self.args.sample_break_mode, + include_targets=True, ) - add_eos_for_other_targets = self.args.sample_break_mode is not None and self.args.sample_break_mode != 'none' + add_eos_for_other_targets = ( + self.args.sample_break_mode is not None + and self.args.sample_break_mode != "none" + ) self.datasets[split] = MonolingualDataset( - dataset, dataset.sizes, self.dictionary, self.output_dictionary, - add_eos_for_other_targets=add_eos_for_other_targets, shuffle=True, - targets=self.targets, add_bos_token=self.args.add_bos_token, + dataset, + dataset.sizes, + self.dictionary, + self.output_dictionary, + add_eos_for_other_targets=add_eos_for_other_targets, + shuffle=True, + targets=self.targets, + add_bos_token=self.args.add_bos_token, ) def build_dataset_for_inference(self, src_tokens, src_lengths): @@ -184,7 +204,7 @@ def build_dataset_for_inference(self, src_tokens, src_lengths): block_size=None, pad=self.source_dictionary.pad(), eos=self.source_dictionary.eos(), - break_mode='eos', + break_mode="eos", include_targets=False, ), src_lengths, @@ -202,9 +222,9 @@ def build_dataset_for_inference(self, src_tokens, src_lengths): def inference_step(self, generator, models, sample, prefix_tokens=None): with torch.no_grad(): - if prefix_tokens is None and sample['net_input']['src_tokens'].nelement(): + if prefix_tokens is None and sample["net_input"]["src_tokens"].nelement(): # note: EOS has already been removed in build_dataset_for_inference - prefix_tokens = sample['net_input']['src_tokens'] + prefix_tokens = sample["net_input"]["src_tokens"] return generator.generate(models, sample, prefix_tokens=prefix_tokens) @property diff --git a/fairseq/tasks/tagged_language_modeling.py b/fairseq/tasks/tagged_language_modeling.py new file mode 100644 index 0000000000..3c49ef7664 --- /dev/null +++ b/fairseq/tasks/tagged_language_modeling.py @@ -0,0 +1,164 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import numpy as np + +import os + +from fairseq.data import ( + ConcatDataset, + data_utils, + MonolingualDataset, + PrependDataset, + ReplaceDataset, + ShardedDataset, + SubsampleDataset, + TokenBlockDataset, +) +from fairseq.tasks import register_task + +from fairseq.tasks.language_modeling import LanguageModelingTask + + +@register_task("tagged_language_modeling") +class TaggedLanguageModelingTask(LanguageModelingTask): + """ + Like the language modeling task, but prepends tags to each sample + """ + + @staticmethod + def add_args(parser): + """Add task-specific arguments to the parser.""" + LanguageModelingTask.add_args(parser) + parser.add_argument( + "--multiple-datasets", + action="store_true", + help="if set, treats paths in data as separate datasets to be combined, " + "rather than as splits of a single dataset", + ) + parser.add_argument( + "--prepend-ds-name", + action="store_true", + help="if set and multiple-datasets is also set, prepends the name of the ds instead of " + "bos/eos token", + ) + parser.add_argument( + "--generic-ds-name-chance", + type=float, + metavar="P", + default=0, + help='if multiple datasets is used, sets the prepended ds name to "generic" ' + "this percentage of time", + ) + parser.add_argument( + "--subsample-splits", + type=str, + metavar="SPLITS", + default="valid", + help="if multiple datasets is used, subsamples specified split(colon separated) to " + "the size of the smallest split", + ) + + def __init__(self, args, dictionary, output_dictionary=None, targets=None): + super().__init__(args, dictionary, output_dictionary, targets) + self.subsample_splits = ( + set() + if args.subsample_splits is None + else set(args.subsample_splits.split(":")) + ) + + def make_prepended_ds(self, dataset): + def ds_name(dataset, index): + if ( + self.args.generic_ds_name_chance > 0 + and np.random.rand() <= self.args.generic_ds_name_chance + ): + ds_name = "generic" + else: + ds_name = dataset.attr("name", index) + assert ds_name is not None + return self.dictionary.indices[ds_name] + + dataset = PrependDataset( + dataset, prepend_getter=ds_name, ensure_first_token_is=self.dictionary.eos() + ) + return dataset + + def load_dataset(self, split, epoch=0, combine=False, **kwargs): + """Load a given dataset split. 
+ + Args: + split (str): name of the split (e.g., train, valid, test) + """ + paths = self.args.data.split(":") + assert len(paths) > 0 + + if self.args.multiple_datasets: + if len(paths) == 1: + paths = [os.path.join(paths[0], p) for p in next(os.walk(paths[0]))[1]] + datasets = [ + ShardedDataset( + self.dictionary, + self.args.dataset_impl, + path, + split, + epoch, + combine=combine, + ) + for path in paths + ] + + if split in self.subsample_splits: + sizes = [sum(d.sizes) for d in datasets] + min_sz = min(sizes) + ratios = [min_sz / sz for sz in sizes] + datasets = [ + SubsampleDataset(d, r) if r < 1 else d + for d, r in zip(datasets, ratios) + ] + + dataset = ConcatDataset(datasets) + else: + data_path = paths[epoch % len(paths)] + split_path = os.path.join(data_path, split) + + dataset = data_utils.load_indexed_dataset( + split_path, self.dictionary, self.args.dataset_impl, combine=combine + ) + if dataset is None: + raise FileNotFoundError( + "Dataset not found: {} ({})".format(split, split_path) + ) + + dataset = TokenBlockDataset( + dataset, + dataset.sizes, + self.args.tokens_per_sample, + pad=self.dictionary.pad(), + eos=self.dictionary.eos(), + break_mode=self.args.sample_break_mode, + include_targets=True, + ) + + if self.args.prepend_ds_name: + dataset = self.make_prepended_ds(dataset) + + dataset = ReplaceDataset(dataset, { self.dictionary.eos(): self.dictionary.indices['\\n'] }, offset=1) + + add_eos_for_other_targets = ( + self.args.sample_break_mode is not None + and self.args.sample_break_mode != "none" + ) + + self.datasets[split] = MonolingualDataset( + dataset, + dataset.sizes, + self.dictionary, + self.output_dictionary, + add_eos_for_other_targets=add_eos_for_other_targets, + shuffle=True, + targets=self.targets, + add_bos_token=self.args.add_bos_token, + ) diff --git a/train.py b/train.py index b73e362d5d..afe9c10232 100644 --- a/train.py +++ b/train.py @@ -73,7 +73,6 @@ def main(args, init_distributed=False): lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() - valid_losses = [None] valid_subsets = args.valid_subset.split(',') while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update: # train for one epoch From ba5f829f64d2adf9de4502dab94abf0a1a54106b Mon Sep 17 00:00:00 2001 From: Jeff Cai Date: Wed, 21 Aug 2019 13:41:41 -0700 Subject: [PATCH 113/213] Parameterized criterions (#808) Summary: Support criterion with parameters, such as AutoSegmentationCriterion (ASG) used in wav2letter which has a transition matrix parameter. This is needed to integrate wav2letter's ASG into PySpeech. With this diff, parameters in criterions will be: (1) updated by optimizers, with a configurable learning rate (2) saved and loaded from checkpoints, preserving backward compatibility for criterions without parameters (3) synchronized across nodes in distributed training. 
Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/808 Reviewed By: jcai1 Differential Revision: D16934097 Pulled By: okhonko fbshipit-source-id: 121ec9382459385c6f9cbef3a8274bec1a434038 --- fairseq/checkpoint_utils.py | 3 ++ fairseq/models/distributed_fairseq_model.py | 6 +-- fairseq/optim/__init__.py | 7 +-- fairseq/optim/adadelta.py | 2 +- fairseq/optim/adafactor.py | 2 +- fairseq/optim/adagrad.py | 2 +- fairseq/optim/adam.py | 2 +- fairseq/optim/adamax.py | 2 +- fairseq/optim/bmuf.py | 5 +-- fairseq/optim/fairseq_optimizer.py | 15 ++++--- fairseq/optim/fp16_optimizer.py | 11 ++--- fairseq/optim/nag.py | 2 +- fairseq/optim/sgd.py | 2 +- fairseq/trainer.py | 49 ++++++++++++++++----- fairseq/utils.py | 8 ++++ 15 files changed, 79 insertions(+), 39 deletions(-) diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py index c812136781..4a8855d540 100644 --- a/fairseq/checkpoint_utils.py +++ b/fairseq/checkpoint_utils.py @@ -222,6 +222,7 @@ def save_state( filename, args, model_state_dict, criterion, optimizer, lr_scheduler, num_updates, optim_history=None, extra_state=None, ): + from fairseq import utils if optim_history is None: optim_history = [] if extra_state is None: @@ -239,6 +240,8 @@ def save_state( ], 'extra_state': extra_state, } + if utils.has_parameters(criterion): + state_dict['criterion'] = criterion.state_dict() if not args.no_save_optimizer_state: state_dict['last_optimizer_state'] = convert_state_dict_type(optimizer.state_dict()) torch_persistent_save(state_dict, filename) diff --git a/fairseq/models/distributed_fairseq_model.py b/fairseq/models/distributed_fairseq_model.py index e858717d97..dbf384367b 100644 --- a/fairseq/models/distributed_fairseq_model.py +++ b/fairseq/models/distributed_fairseq_model.py @@ -5,7 +5,7 @@ import inspect -from torch.nn import parallel +import torch.nn as nn from fairseq.legacy_distributed_data_parallel import LegacyDistributedDataParallel from fairseq.models import BaseFairseqModel @@ -25,9 +25,9 @@ def DistributedFairseqModel(args, model): model (BaseFairseqModel): model to wrap """ # determine which DDP class to extend - assert isinstance(model, BaseFairseqModel) + assert isinstance(model, nn.Module) if args.ddp_backend == 'c10d': - ddp_class = parallel.DistributedDataParallel + ddp_class = nn.parallel.DistributedDataParallel init_kwargs = dict( module=model, device_ids=[args.device_id], diff --git a/fairseq/optim/__init__.py b/fairseq/optim/__init__.py index 268291be76..2b8334d8c2 100644 --- a/fairseq/optim/__init__.py +++ b/fairseq/optim/__init__.py @@ -19,18 +19,13 @@ ] -_build_optimizer, register_optimizer, OPTIMIZER_REGISTRY = registry.setup_registry( +build_optimizer, register_optimizer, OPTIMIZER_REGISTRY = registry.setup_registry( '--optimizer', base_class=FairseqOptimizer, default='nag', ) -def build_optimizer(args, params, *extra_args, **extra_kwargs): - params = list(filter(lambda p: p.requires_grad, params)) - return _build_optimizer(args, params, *extra_args, **extra_kwargs) - - # automatically import any Python files in the optim/ directory for file in os.listdir(os.path.dirname(__file__)): if file.endswith('.py') and not file.startswith('_'): diff --git a/fairseq/optim/adadelta.py b/fairseq/optim/adadelta.py index 27079a402b..8a9d54cc84 100644 --- a/fairseq/optim/adadelta.py +++ b/fairseq/optim/adadelta.py @@ -11,7 +11,7 @@ @register_optimizer('adadelta') class Adadelta(FairseqOptimizer): def __init__(self, args, params): - super().__init__(args, params) + super().__init__(args) 
self._optimizer = torch.optim.Adadelta(params, **self.optimizer_config) @staticmethod diff --git a/fairseq/optim/adafactor.py b/fairseq/optim/adafactor.py index 1c026244b5..680ac371b9 100644 --- a/fairseq/optim/adafactor.py +++ b/fairseq/optim/adafactor.py @@ -13,7 +13,7 @@ @register_optimizer('adafactor') class FairseqAdafactor(FairseqOptimizer): def __init__(self, args, params): - super().__init__(args, params) + super().__init__(args) self._optimizer = Adafactor(params, **self.optimizer_config) @staticmethod diff --git a/fairseq/optim/adagrad.py b/fairseq/optim/adagrad.py index 15b3a1c25a..5dead3b25d 100644 --- a/fairseq/optim/adagrad.py +++ b/fairseq/optim/adagrad.py @@ -11,7 +11,7 @@ @register_optimizer('adagrad') class Adagrad(FairseqOptimizer): def __init__(self, args, params): - super().__init__(args, params) + super().__init__(args) self._optimizer = torch.optim.Adagrad(params, **self.optimizer_config) @staticmethod diff --git a/fairseq/optim/adam.py b/fairseq/optim/adam.py index 0df1182066..51a282380c 100644 --- a/fairseq/optim/adam.py +++ b/fairseq/optim/adam.py @@ -16,7 +16,7 @@ class FairseqAdam(FairseqOptimizer): def __init__(self, args, params): - super().__init__(args, params) + super().__init__(args) if torch.cuda.is_available(): try: from apex.optimizers import FusedAdam as _FusedAdam # noqa diff --git a/fairseq/optim/adamax.py b/fairseq/optim/adamax.py index 2a2e7698ad..a22f7cda8c 100644 --- a/fairseq/optim/adamax.py +++ b/fairseq/optim/adamax.py @@ -12,7 +12,7 @@ @register_optimizer('adamax') class FairseqAdamax(FairseqOptimizer): def __init__(self, args, params): - super().__init__(args, params) + super().__init__(args) self._optimizer = Adamax(params, **self.optimizer_config) @staticmethod diff --git a/fairseq/optim/bmuf.py b/fairseq/optim/bmuf.py index 756374d569..651fe7e604 100644 --- a/fairseq/optim/bmuf.py +++ b/fairseq/optim/bmuf.py @@ -19,11 +19,10 @@ class FairseqBMUF(FairseqOptimizer): model-update filtering """ - def __init__(self, args, params, optimizer): + def __init__(self, args, optimizer): - super().__init__(args, params) + super().__init__(args) self._optimizer = optimizer - self.params = params self._num_updates = 0 self.sync_iter = self.args.global_sync_iter self.block_momentum = self.args.block_momentum diff --git a/fairseq/optim/fairseq_optimizer.py b/fairseq/optim/fairseq_optimizer.py index 58bc7fc2d7..030b1fe4a0 100644 --- a/fairseq/optim/fairseq_optimizer.py +++ b/fairseq/optim/fairseq_optimizer.py @@ -10,10 +10,9 @@ class FairseqOptimizer(object): - def __init__(self, args, params): + def __init__(self, args): super().__init__() self.args = args - self.params = list(params) @staticmethod def add_args(parser): @@ -39,6 +38,13 @@ def optimizer_config(self): """ raise NotImplementedError + @property + def params(self): + """Return an iterable of the parameters held by the optimizer.""" + for param_group in self.optimizer.param_groups: + for p in param_group['params']: + yield p + def __getstate__(self): return self._optimizer.__getstate__() @@ -93,9 +99,8 @@ def step(self, closure=None): def zero_grad(self): """Clears the gradients of all optimized parameters.""" - for group in self.optimizer.param_groups: - for p in group['params']: - p.grad = None + for p in self.params: + p.grad = None self.optimizer.zero_grad() @property diff --git a/fairseq/optim/fp16_optimizer.py b/fairseq/optim/fp16_optimizer.py index b3ae1ef49c..194e0f4f44 100644 --- a/fairseq/optim/fp16_optimizer.py +++ b/fairseq/optim/fp16_optimizer.py @@ -60,7 +60,8 @@ class 
FP16Optimizer(optim.FairseqOptimizer): """ def __init__(self, args, params, fp32_optimizer, fp32_params): - super().__init__(args, params) + super().__init__(args) + self.fp16_params = params self.fp32_optimizer = fp32_optimizer self.fp32_params = fp32_params @@ -149,7 +150,7 @@ def _sync_fp16_grads_to_fp32(self, multiply_grads=1.): if self._needs_sync: # copy FP16 grads to FP32 offset = 0 - for p in self.params: + for p in self.fp16_params: if not p.requires_grad: continue grad_data = p.grad.data if p.grad is not None else p.data.new_zeros(p.data.shape) @@ -196,7 +197,7 @@ def step(self, closure=None): # copy FP32 params back into FP16 model offset = 0 - for p in self.params: + for p in self.fp16_params: if not p.requires_grad: continue numel = p.data.numel() @@ -205,7 +206,7 @@ def step(self, closure=None): def zero_grad(self): """Clears the gradients of all optimized parameters.""" - for p in self.params: + for p in self.fp16_params: p.grad = None self._needs_sync = False @@ -232,7 +233,7 @@ def __init__(self, args, params, optimizer): 'Unsupported optimizer: {}'.format(optimizer.__class__.__name__) ) - super().__init__(args, params) + super().__init__(args) self.wrapped_optimizer = optimizer if getattr(args, 'fp16_scale_window', None) is None: diff --git a/fairseq/optim/nag.py b/fairseq/optim/nag.py index c916b6fadb..25c2609873 100644 --- a/fairseq/optim/nag.py +++ b/fairseq/optim/nag.py @@ -12,7 +12,7 @@ @register_optimizer('nag') class FairseqNAG(FairseqOptimizer): def __init__(self, args, params): - super().__init__(args, params) + super().__init__(args) self._optimizer = NAG(params, **self.optimizer_config) @staticmethod diff --git a/fairseq/optim/sgd.py b/fairseq/optim/sgd.py index c34b9590dd..0efb283c63 100644 --- a/fairseq/optim/sgd.py +++ b/fairseq/optim/sgd.py @@ -11,7 +11,7 @@ @register_optimizer('sgd') class SGD(FairseqOptimizer): def __init__(self, args, params): - super().__init__(args, params) + super().__init__(args) self._optimizer = torch.optim.SGD(params, **self.optimizer_config) @staticmethod diff --git a/fairseq/trainer.py b/fairseq/trainer.py index ce0e74dc9f..58448c83a7 100644 --- a/fairseq/trainer.py +++ b/fairseq/trainer.py @@ -36,13 +36,14 @@ def __init__(self, args, task, model, criterion, dummy_batch=None, oom_batch=Non self.task = task # copy model and criterion to current device - self.criterion = criterion + self._criterion = criterion self._model = model self.cuda = torch.cuda.is_available() and not args.cpu if args.fp16: + self._criterion = self._criterion.half() self._model = self._model.half() if self.cuda: - self.criterion = self.criterion.cuda() + self._criterion = self._criterion.cuda() self._model = self._model.cuda() self._dummy_batch = dummy_batch @@ -53,6 +54,7 @@ def __init__(self, args, task, model, criterion, dummy_batch=None, oom_batch=Non self._optim_history = None self._optimizer = None self._prev_grad_norm = None + self._wrapped_criterion = None self._wrapped_model = None self.init_meters(args) @@ -75,6 +77,21 @@ def init_meters(self, args): self.meters['wall'] = TimeMeter() # wall time in seconds self.meters['train_wall'] = StopwatchMeter() # train wall time in seconds + @property + def criterion(self): + if self._wrapped_criterion is None: + if ( + utils.has_parameters(self._criterion) + and self.args.distributed_world_size > 1 + and not self.args.use_bmuf + ): + self._wrapped_criterion = models.DistributedFairseqModel( + self.args, self._criterion + ) + else: + self._wrapped_criterion = self._criterion + return self._wrapped_criterion 
+ @property def model(self): if self._wrapped_model is None: @@ -99,7 +116,13 @@ def lr_scheduler(self): return self._lr_scheduler def _build_optimizer(self): - params = list(filter(lambda p: p.requires_grad, self.model.parameters())) + params = list( + filter( + lambda p: p.requires_grad, + chain(self.model.parameters(), self.criterion.parameters()), + ) + ) + if self.args.fp16: if self.cuda and torch.cuda.get_device_capability(0)[0] < 7: print('| WARNING: your device does NOT support faster training with --fp16, ' @@ -114,7 +137,7 @@ def _build_optimizer(self): self._optimizer = optim.build_optimizer(self.args, params) if self.args.use_bmuf: - self._optimizer = optim.FairseqBMUF(self.args, params, self._optimizer) + self._optimizer = optim.FairseqBMUF(self.args, self._optimizer) # We should initialize the learning rate scheduler immediately after # building the optimizer, so that the initial learning rate is set. @@ -126,7 +149,7 @@ def save_checkpoint(self, filename, extra_state): if distributed_utils.is_master(self.args): # only save one checkpoint extra_state['train_meters'] = self.meters checkpoint_utils.save_state( - filename, self.args, self.get_model().state_dict(), self.criterion, + filename, self.args, self.get_model().state_dict(), self.get_criterion(), self.optimizer, self.lr_scheduler, self.get_num_updates(), self._optim_history, extra_state, ) @@ -148,6 +171,8 @@ def load_checkpoint( # load model parameters try: self.get_model().load_state_dict(state['model'], strict=True) + if utils.has_parameters(self.get_criterion()): + self.get_criterion().load_state_dict(state['criterion'], strict=True) except Exception: raise Exception( 'Cannot load model parameters from checkpoint {}; ' @@ -164,7 +189,7 @@ def load_checkpoint( # only reload optimizer and lr_scheduler if they match last_optim = self._optim_history[-1] - assert last_optim['criterion_name'] == self.criterion.__class__.__name__, \ + assert last_optim['criterion_name'] == self.get_criterion().__class__.__name__, \ 'Criterion does not match; please reset the optimizer (--reset-optimizer).' assert last_optim['optimizer_name'] == self.optimizer.__class__.__name__, \ 'Optimizer does not match; please reset the optimizer (--reset-optimizer).' 
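A quick sketch of what this refactor enables: criterions that own trainable weights are now optimized jointly with the model and saved/loaded with checkpoints. The modules below are stand-ins, not fairseq classes; `has_parameters` mirrors the helper this patch adds to `fairseq/utils.py`.

```python
from itertools import chain

import torch
import torch.nn as nn


def has_parameters(module):
    # same trick as the new fairseq.utils.has_parameters helper
    try:
        next(module.parameters())
        return True
    except StopIteration:
        return False


model = nn.Linear(8, 4)      # stand-in for a FairseqModel
criterion = nn.Linear(4, 1)  # stand-in for a criterion with its own weights

# trainable parameters from both the model and the criterion go to one optimizer
params = [p for p in chain(model.parameters(), criterion.parameters()) if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.1)

print(has_parameters(criterion))       # True, so its state dict is also checkpointed
print(sum(p.numel() for p in params))  # counts model + criterion parameters
```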
@@ -322,9 +347,9 @@ def maybe_no_sync(): # aggregate logging outputs and sample sizes logging_output = self.task.aggregate_logging_outputs( - logging_outputs, self.criterion + logging_outputs, self.get_criterion() ) - sample_size = self.task.grad_denom(sample_sizes, self.criterion) + sample_size = self.task.grad_denom(sample_sizes, self.get_criterion()) if not all(k in logging_output for k in ['ntokens', 'nsentences']): raise Exception(( @@ -424,10 +449,10 @@ def valid_step(self, sample, raise_oom=False): # aggregate logging outputs and sample sizes logging_output = self.task.aggregate_logging_outputs( - logging_output, self.criterion + logging_output, self.get_criterion() ) sample_size = self.task.grad_denom( - sample_size, self.criterion + sample_size, self.get_criterion() ) # update meters for validation @@ -477,6 +502,10 @@ def get_model(self): """Get the (non-wrapped) model instance.""" return self._model + def get_criterion(self): + """Get the (non-wrapped) criterion instance.""" + return self._criterion + def get_meter(self, name): """Get a specific meter by name.""" if name not in self.meters: diff --git a/fairseq/utils.py b/fairseq/utils.py index 1b664cbfe3..1af2394434 100644 --- a/fairseq/utils.py +++ b/fairseq/utils.py @@ -351,3 +351,11 @@ def eval(model): model.eval() yield model.train(is_training) + + +def has_parameters(module): + try: + next(module.parameters()) + return True + except StopIteration: + return False From 93057cc099041665f53744cda297a6935ebd9f5c Mon Sep 17 00:00:00 2001 From: Trinkle23897 <463003665@qq.com> Date: Wed, 21 Aug 2019 15:19:34 -0700 Subject: [PATCH 114/213] fix string format to work in python 3.5 (#1050) Summary: change string fromat in fairseq/data/subsample_dataset.py#20 Pull Request resolved: https://github.com/pytorch/fairseq/pull/1050 Differential Revision: D16946060 Pulled By: okhonko fbshipit-source-id: 0eabf22e7ffd4f658b6d18c87dc6e59c81a355c7 --- fairseq/data/subsample_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fairseq/data/subsample_dataset.py b/fairseq/data/subsample_dataset.py index 91c4b0dd35..983a611393 100644 --- a/fairseq/data/subsample_dataset.py +++ b/fairseq/data/subsample_dataset.py @@ -17,7 +17,7 @@ def __init__(self, dataset, size_ratio): range(len(self.dataset)), self.actual_size, replace=False ) print( - f"subsampled dataset from {len(self.dataset)} to {self.actual_size} (ratio={size_ratio})" + "subsampled dataset from {} to {} (ratio={})".format(len(self.dataset), self.actual_size, size_ratio) ) def __getitem__(self, index): From 3c2cf3b02a29f44e540648bcecd5ff663ad6f2b5 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Wed, 21 Aug 2019 17:41:23 -0700 Subject: [PATCH 115/213] Misc changes Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/840 Differential Revision: D16947645 Pulled By: myleott fbshipit-source-id: e869789bc22bbf5cb08d9adfa44f9fc09b3805af --- examples/language_model/README.md | 8 +- examples/roberta/README.md | 7 + examples/roberta/README.pretraining.md | 2 +- fairseq/tasks/tagged_language_modeling.py | 164 ---------------------- 4 files changed, 15 insertions(+), 166 deletions(-) delete mode 100644 fairseq/tasks/tagged_language_modeling.py diff --git a/examples/language_model/README.md b/examples/language_model/README.md index a103755228..6199e69ece 100644 --- a/examples/language_model/README.md +++ b/examples/language_model/README.md @@ -12,7 +12,7 @@ Model | Description | Dataset | Download ## Example usage -Sampling from a language model using 
PyTorch Hub: +To sample from a language model using PyTorch Hub: ```python import torch @@ -25,6 +25,12 @@ en_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt19.en', tokenizer=' # Sample from the language model en_lm.sample('Barack Obama', beam=1, sampling=True, sampling_topk=10, temperature=0.8) # "Barack Obama is coming to Sydney and New Zealand (...)" + +# The same interface can be used with custom models as well +from fairseq.models.transformer_lm import TransformerLanguageModel +custom_lm = TransformerLanguageModel.from_pretrained('/path/to/model/dir', 'checkpoint100.pt', tokenizer='moses', bpe='fastbpe') +custom_lm.sample('Barack Obama', beam=5) +# "Barack Obama (...)" ``` ## Training a transformer language model with the CLI tools diff --git a/examples/roberta/README.md b/examples/roberta/README.md index e4d9e4fee1..9006e4f193 100644 --- a/examples/roberta/README.md +++ b/examples/roberta/README.md @@ -76,6 +76,13 @@ Model | Accuracy ---|--- `roberta.large` | 78.1 +**[XNLI (Conneau et al., 2018)](https://arxiv.org/abs/1809.05053)** +_(TRANSLATE-TEST)_ + +Model | en | fr | es | de | el | bg | ru | tr | ar | vi | th | zh | hi | sw | ur +---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|--- +`roberta.large.mnli` | 91.3 | 82.91 | 84.27 | 81.24 | 81.74 | 83.13 | 78.28 | 76.79 | 76.64 | 74.17 | 74.05 | 77.5 | 70.9 | 66.65 | 66.81 + ## Example usage ##### Load RoBERTa from torch.hub (PyTorch >= 1.1): diff --git a/examples/roberta/README.pretraining.md b/examples/roberta/README.pretraining.md index 527d4a2e57..43bdf17676 100644 --- a/examples/roberta/README.pretraining.md +++ b/examples/roberta/README.pretraining.md @@ -54,7 +54,7 @@ PEAK_LR=0.0005 # Peak learning rate, adjust as needed TOKENS_PER_SAMPLE=512 # Max sequence length MAX_POSITIONS=512 # Num. positional embeddings (usually same as above) MAX_SENTENCES=16 # Number of sequences per batch (batch size) -UPDATE_FREQ=16 # Increase the batch size 16x +UPDATE_FREQ=16 # Increase the batch size 16x DATA_DIR=data-bin/wikitext-103 diff --git a/fairseq/tasks/tagged_language_modeling.py b/fairseq/tasks/tagged_language_modeling.py deleted file mode 100644 index 3c49ef7664..0000000000 --- a/fairseq/tasks/tagged_language_modeling.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import numpy as np - -import os - -from fairseq.data import ( - ConcatDataset, - data_utils, - MonolingualDataset, - PrependDataset, - ReplaceDataset, - ShardedDataset, - SubsampleDataset, - TokenBlockDataset, -) -from fairseq.tasks import register_task - -from fairseq.tasks.language_modeling import LanguageModelingTask - - -@register_task("tagged_language_modeling") -class TaggedLanguageModelingTask(LanguageModelingTask): - """ - Like the language modeling task, but prepends tags to each sample - """ - - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - LanguageModelingTask.add_args(parser) - parser.add_argument( - "--multiple-datasets", - action="store_true", - help="if set, treats paths in data as separate datasets to be combined, " - "rather than as splits of a single dataset", - ) - parser.add_argument( - "--prepend-ds-name", - action="store_true", - help="if set and multiple-datasets is also set, prepends the name of the ds instead of " - "bos/eos token", - ) - parser.add_argument( - "--generic-ds-name-chance", - type=float, - metavar="P", - default=0, - help='if multiple datasets is used, sets the prepended ds name to "generic" ' - "this percentage of time", - ) - parser.add_argument( - "--subsample-splits", - type=str, - metavar="SPLITS", - default="valid", - help="if multiple datasets is used, subsamples specified split(colon separated) to " - "the size of the smallest split", - ) - - def __init__(self, args, dictionary, output_dictionary=None, targets=None): - super().__init__(args, dictionary, output_dictionary, targets) - self.subsample_splits = ( - set() - if args.subsample_splits is None - else set(args.subsample_splits.split(":")) - ) - - def make_prepended_ds(self, dataset): - def ds_name(dataset, index): - if ( - self.args.generic_ds_name_chance > 0 - and np.random.rand() <= self.args.generic_ds_name_chance - ): - ds_name = "generic" - else: - ds_name = dataset.attr("name", index) - assert ds_name is not None - return self.dictionary.indices[ds_name] - - dataset = PrependDataset( - dataset, prepend_getter=ds_name, ensure_first_token_is=self.dictionary.eos() - ) - return dataset - - def load_dataset(self, split, epoch=0, combine=False, **kwargs): - """Load a given dataset split. 
- - Args: - split (str): name of the split (e.g., train, valid, test) - """ - paths = self.args.data.split(":") - assert len(paths) > 0 - - if self.args.multiple_datasets: - if len(paths) == 1: - paths = [os.path.join(paths[0], p) for p in next(os.walk(paths[0]))[1]] - datasets = [ - ShardedDataset( - self.dictionary, - self.args.dataset_impl, - path, - split, - epoch, - combine=combine, - ) - for path in paths - ] - - if split in self.subsample_splits: - sizes = [sum(d.sizes) for d in datasets] - min_sz = min(sizes) - ratios = [min_sz / sz for sz in sizes] - datasets = [ - SubsampleDataset(d, r) if r < 1 else d - for d, r in zip(datasets, ratios) - ] - - dataset = ConcatDataset(datasets) - else: - data_path = paths[epoch % len(paths)] - split_path = os.path.join(data_path, split) - - dataset = data_utils.load_indexed_dataset( - split_path, self.dictionary, self.args.dataset_impl, combine=combine - ) - if dataset is None: - raise FileNotFoundError( - "Dataset not found: {} ({})".format(split, split_path) - ) - - dataset = TokenBlockDataset( - dataset, - dataset.sizes, - self.args.tokens_per_sample, - pad=self.dictionary.pad(), - eos=self.dictionary.eos(), - break_mode=self.args.sample_break_mode, - include_targets=True, - ) - - if self.args.prepend_ds_name: - dataset = self.make_prepended_ds(dataset) - - dataset = ReplaceDataset(dataset, { self.dictionary.eos(): self.dictionary.indices['\\n'] }, offset=1) - - add_eos_for_other_targets = ( - self.args.sample_break_mode is not None - and self.args.sample_break_mode != "none" - ) - - self.datasets[split] = MonolingualDataset( - dataset, - dataset.sizes, - self.dictionary, - self.output_dictionary, - add_eos_for_other_targets=add_eos_for_other_targets, - shuffle=True, - targets=self.targets, - add_bos_token=self.args.add_bos_token, - ) From 8c509a94faf84effa8cf652977bcd4f004cce6a1 Mon Sep 17 00:00:00 2001 From: Nathan Ng Date: Thu, 22 Aug 2019 12:35:14 -0700 Subject: [PATCH 116/213] Add links to cuda models (#828) Summary: Add links to pre-trained cuda models in pay less attention Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/828 Reviewed By: michaelauli Differential Revision: D16833577 Pulled By: nng555 fbshipit-source-id: 1556aa77fd87ea259812de8ef65963257c370f9b --- examples/pay_less_attention_paper/README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/examples/pay_less_attention_paper/README.md b/examples/pay_less_attention_paper/README.md index 97ab847fc0..62e74ff747 100644 --- a/examples/pay_less_attention_paper/README.md +++ b/examples/pay_less_attention_paper/README.md @@ -29,6 +29,26 @@ LightConv | [WMT14 English-French](http://statmt.org/wmt14/translation-task.html DynamicConv | [WMT14 English-French](http://statmt.org/wmt14/translation-task.html#Download) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt14.en-fr.joined-dict.dynamicconv-glu.tar.bz2) | newstest2014:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.en-fr.joined-dict.newstest2014.tar.bz2) LightConv | [WMT17 Chinese-English](http://statmt.org/wmt17/translation-task.html#Download) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt17.zh-en.lightconv-glu.tar.bz2) | newstest2017:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt17.zh-en.newstest2017.tar.bz2) DynamicConv | [WMT17 Chinese-English](http://statmt.org/wmt17/translation-task.html#Download) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt17.zh-en.dynamicconv-glu.tar.bz2) | newstest2017:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt17.zh-en.newstest2017.tar.bz2) +LightConv (CUDA module) | [WMT17 English-German](http://statmt.org/wmt17/translation-task.html#Download) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt17.en-de.joined-dict.transformer.light-conv-cuda-glu.tar.gz) | newstest2014 (shared vocab):
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2) +DynamicConv (CUDA module) | [WMT17 English-German](http://statmt.org/wmt17/translation-task.html#Download) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt17.en-de.joined-dict.transformer.dynamic-conv-cuda-glu.tar.gz) | newstest2014:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2) + +### Memory-Efficient CUDA Kernels + +Since the PyTorch implementations of Light/Dynamic conv are quite memory intensive, we have developed CUDA kernels that implement the light and dynamic convolution operator in a memory-efficient and performant manner. For large sequence lengths, these kernels save about 50% memory compared to the PyTorch equivalent. + +To install the kernels, use the commands below. Once installed, they will automatically be used in place of the PyTorch implementations whenever a light or dynamic convolution is used. + +```sh +# to install lightconv +cd fairseq/modules/lightconv_layer +python cuda_function_gen.py +python setup.py install + +# to install dynamicconv +cd fairseq/modules/dynamicconv_layer +python cuda_function_gen.py +python setup.py install +``` ### Preprocessing the training datasets From d4c9136ce0d6101f52307098595addfcc6a53db5 Mon Sep 17 00:00:00 2001 From: Nathan Ng Date: Thu, 22 Aug 2019 16:23:02 -0700 Subject: [PATCH 117/213] Fix year in noisy channel citation (#842) Summary: 2018->2019 Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/842 Differential Revision: D16973530 Pulled By: nng555 fbshipit-source-id: 00207b79821ac0257a53a0581a84582130e1bff5 --- examples/noisychannel/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/noisychannel/README.md b/examples/noisychannel/README.md index a5dd0b9de3..9d101aa874 100644 --- a/examples/noisychannel/README.md +++ b/examples/noisychannel/README.md @@ -3,7 +3,7 @@ This page contains pointers to pre-trained models as well as instructions on how ## Citation: ```bibtex -@inproceedings{yee2018simple, +@inproceedings{yee2019simple, title = {Simple and Effective Noisy Channel Modeling for Neural Machine Translation}, author = {Kyra Yee and Yann Dauphin and Michael Auli}, booktitle = {Conference on Empirical Methods in Natural Language Processing}, From 6e2bd794e02d61e07fbf2173e7d172600f9cc276 Mon Sep 17 00:00:00 2001 From: Alexei Baevski Date: Thu, 22 Aug 2019 18:33:34 -0700 Subject: [PATCH 118/213] wav2vec everstore support Summary: changes for internal support Differential Revision: D16646887 fbshipit-source-id: ac5bf6c32901819726249422324eae32a0a6e148 --- fairseq/data/__init__.py | 6 +- fairseq/data/audio/raw_audio_dataset.py | 132 +++++++++++++++--------- fairseq/tasks/audio_pretraining.py | 10 +- 3 files changed, 89 insertions(+), 59 deletions(-) diff --git a/fairseq/data/__init__.py b/fairseq/data/__init__.py index f3ecd7d178..1da1799c34 100644 --- a/fairseq/data/__init__.py +++ b/fairseq/data/__init__.py @@ -9,7 +9,7 @@ from .base_wrapper_dataset import BaseWrapperDataset -from .audio.raw_audio_dataset import RawAudioDataset +from .audio.raw_audio_dataset import FileAudioDataset from .backtranslation_dataset import BacktranslationDataset from .concat_dataset import ConcatDataset from .concat_sentences_dataset import ConcatSentencesDataset @@ -78,9 +78,9 @@ 'PadDataset', 'PrependDataset', 'PrependTokenDataset', - 'RawAudioDataset', - 'RawLabelDataset', 'ReplaceDataset', + 'FileAudioDataset', + "RawLabelDataset", 'RightPadDataset', 'RoundRobinZipDatasets', 'ShardedDataset', diff --git a/fairseq/data/audio/raw_audio_dataset.py b/fairseq/data/audio/raw_audio_dataset.py index 59bee89066..40cbc20680 100644 --- a/fairseq/data/audio/raw_audio_dataset.py +++ b/fairseq/data/audio/raw_audio_dataset.py @@ -7,6 +7,7 @@ import os import numpy as np 
import sys + import torch import torch.nn.functional as F @@ -14,61 +15,71 @@ class RawAudioDataset(FairseqDataset): - - def __init__(self, manifest_path, sample_rate, max_sample_size=None, min_sample_size=None, - shuffle=True): + def __init__( + self, + sample_rate, + max_sample_size=None, + min_sample_size=None, + shuffle=True, + min_length=0, + ): super().__init__() self.sample_rate = sample_rate - self.fnames = [] self.sizes = [] - self.max_sample_size = max_sample_size if max_sample_size is not None else sys.maxsize - self.min_sample_size = min_sample_size if min_sample_size is not None else self.max_sample_size - - with open(manifest_path, 'r') as f: - self.root_dir = f.readline().strip() - for line in f: - items = line.strip().split('\t') - assert len(items) == 2, line - self.fnames.append(items[0]) - self.sizes.append(int(items[1])) + self.max_sample_size = ( + max_sample_size if max_sample_size is not None else sys.maxsize + ) + self.min_sample_size = ( + min_sample_size if min_sample_size is not None else self.max_sample_size + ) + self.min_length = min_length self.shuffle = shuffle def __getitem__(self, index): - fname = os.path.join(self.root_dir, self.fnames[index]) - import soundfile as sf + raise NotImplementedError() - wav, curr_sample_rate = sf.read(fname) - feats = torch.from_numpy(wav).float() + def __len__(self): + return len(self.sizes) + + def postprocess(self, feats, curr_sample_rate): + def resample(x, factor): + return F.interpolate(x.view(1, 1, -1), scale_factor=factor).squeeze() if feats.dim() == 2: feats = feats.mean(-1) if curr_sample_rate != self.sample_rate: factor = self.sample_rate / curr_sample_rate - feats = self.resample(feats, factor) + feats = resample(feats, factor) assert feats.dim() == 1, feats.dim() + return feats - return { - 'id': index, - 'source': feats, - } + def crop_to_max_size(self, wav, target_size): + size = len(wav) + diff = size - target_size + if diff <= 0: + return wav - def resample(self, x, factor): - return F.interpolate(x.view(1, 1, -1), scale_factor=factor).squeeze() - - def __len__(self): - return len(self.fnames) + start = np.random.randint(0, diff + 1) + end = size - diff + start + return wav[start:end] def collater(self, samples): + samples = [ + s for s in samples if s["source"] is not None and len(s["source"]) > 0 + ] if len(samples) == 0: return {} - sources = [s['source'] for s in samples] + sources = [s["source"] for s in samples] sizes = [len(s) for s in sources] target_size = min(min(sizes), self.max_sample_size) + if target_size < self.min_length: + return {} + if self.min_sample_size < target_size: target_size = np.random.randint(self.min_sample_size, target_size + 1) @@ -79,32 +90,13 @@ def collater(self, samples): if diff == 0: collated_sources[i] = source else: - start = np.random.randint(0, diff + 1) - end = size - diff + start - collated_sources[i] = source[start:end] + collated_sources[i] = self.crop_to_max_size(source, target_size) return { - 'id': torch.LongTensor([s['id'] for s in samples]), - 'net_input': { - 'source': collated_sources, - }, + "id": torch.LongTensor([s["id"] for s in samples]), + "net_input": {"source": collated_sources}, } - def get_dummy_batch( - self, num_tokens, max_positions, src_len=2048, tgt_len=128, - ): - """Return a dummy batch with a given number of tokens.""" - if isinstance(max_positions, float) or isinstance(max_positions, int): - src_len = min(src_len, max_positions) - bsz = num_tokens // src_len - return self.collater([ - { - 'id': i, - 'source': torch.rand(src_len), - } - 
for i in range(bsz) - ]) - def num_tokens(self, index): return self.size(index) @@ -124,3 +116,41 @@ def ordered_indices(self): order.append(self.sizes) return np.lexsort(order) + + +class FileAudioDataset(RawAudioDataset): + def __init__( + self, + manifest_path, + sample_rate, + max_sample_size=None, + min_sample_size=None, + shuffle=True, + min_length=0, + ): + super().__init__( + sample_rate=sample_rate, + max_sample_size=max_sample_size, + min_sample_size=min_sample_size, + shuffle=shuffle, + min_length=min_length, + ) + + self.fnames = [] + + with open(manifest_path, "r") as f: + self.root_dir = f.readline().strip() + for line in f: + items = line.strip().split("\t") + assert len(items) == 2, line + self.fnames.append(items[0]) + self.sizes.append(int(items[1])) + + def __getitem__(self, index): + import soundfile as sf + + fname = os.path.join(self.root_dir, self.fnames[index]) + wav, curr_sample_rate = sf.read(fname) + feats = torch.from_numpy(wav).float() + feats = self.postprocess(feats, curr_sample_rate) + return {"id": index, "source": feats} diff --git a/fairseq/tasks/audio_pretraining.py b/fairseq/tasks/audio_pretraining.py index 76e072866b..e161c224e9 100644 --- a/fairseq/tasks/audio_pretraining.py +++ b/fairseq/tasks/audio_pretraining.py @@ -5,7 +5,7 @@ import os -from fairseq.data import RawAudioDataset +from fairseq.data import FileAudioDataset from . import FairseqTask, register_task @@ -46,10 +46,10 @@ def load_dataset(self, split, **kwargs): """ manifest = os.path.join(self.args.data, '{}.tsv'.format(split)) - self.datasets[split] = RawAudioDataset(manifest, - sample_rate=self.args.sample_rate, - max_sample_size=self.args.max_sample_size, - min_sample_size=self.args.min_sample_size) + self.datasets[split] = FileAudioDataset(manifest, + sample_rate=self.args.sample_rate, + max_sample_size=self.args.max_sample_size, + min_sample_size=self.args.min_sample_size) @property def target_dictionary(self): From 4fc39538aec5141aa41f5d6d7dc0097e7c0f7b48 Mon Sep 17 00:00:00 2001 From: Naman Goyal Date: Fri, 23 Aug 2019 07:31:18 -0700 Subject: [PATCH 119/213] Cythonize token block dataset (#834) Summary: Cythonized token block dataset code, it's `> 100x` faster. Token block for entire `bookwiki+CC+stories+openweb` is just ~`39.9` seconds. TODO: 1) I think, I can make it 2x more faster. 2) cleanup. EDIT History: ~~First pass at parellelizing `token_block_dataset`. The code feels somewhat complicated and cluttered. This is 2-3x faster though on my tests on `bookwiki` dataset with both `complete` and `complete_doc` modes. myleott Can you take a look for correctness as I am still not 100% sure that I am not missing corner cases.~~ Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/834 Test Plan: Imported from GitHub, without a `Test Plan:` line. 
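For context on what the new Cython helpers compute, here is a rough pure-Python reference of the "complete" break mode (consecutive sentences are packed into blocks of at most `block_size` tokens without splitting a sentence). This is a sketch for illustration only, not code from the patch:

```python
import numpy as np


def slice_indices_complete(sizes, block_size):
    # pack consecutive sentences into blocks of <= block_size tokens,
    # never splitting a sentence across two blocks
    slices, tok_idx, sz_idx, curr_size = [], 0, 0, 0
    while sz_idx < len(sizes):
        if curr_size + sizes[sz_idx] <= block_size or curr_size == 0:
            curr_size += sizes[sz_idx]
            sz_idx += 1
        else:
            slices.append((tok_idx, tok_idx + curr_size))
            tok_idx += curr_size
            curr_size = 0
    if curr_size > 0:
        slices.append((tok_idx, tok_idx + curr_size))
    return np.array(slices, dtype=np.int64)


print(slice_indices_complete(np.array([3, 4, 5, 2]), block_size=8))
# [[ 0  7]   <- sentences of length 3 and 4
#  [ 7 14]]  <- sentences of length 5 and 2
```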
Test workflow: f133816198 Reviewed By: myleott Differential Revision: D16970257 Pulled By: myleott fbshipit-source-id: ec45a308193c9e9f3e7075336c15df4723228d6f --- fairseq/data/data_utils.py | 46 ++---- fairseq/data/data_utils_fast.pyx | 67 +++++++++ fairseq/data/token_block_dataset.py | 134 +++-------------- fairseq/data/token_block_utils_fast.pyx | 184 ++++++++++++++++++++++++ fairseq/tasks/fairseq_task.py | 2 + setup.py | 6 +- 6 files changed, 285 insertions(+), 154 deletions(-) create mode 100644 fairseq/data/data_utils_fast.pyx create mode 100644 fairseq/data/token_block_utils_fast.pyx diff --git a/fairseq/data/data_utils.py b/fairseq/data/data_utils.py index 38c385018d..dee29528a4 100644 --- a/fairseq/data/data_utils.py +++ b/fairseq/data/data_utils.py @@ -12,6 +12,10 @@ import os import numpy as np +import sys +import types + +from fairseq.data.data_utils_fast import batch_by_size_fast def infer_language_pair(path): @@ -196,45 +200,13 @@ def batch_by_size( required_batch_size_multiple (int, optional): require batch size to be a multiple of N (default: 1). """ - max_tokens = max_tokens if max_tokens is not None else float('Inf') - max_sentences = max_sentences if max_sentences is not None else float('Inf') + max_tokens = max_tokens if max_tokens is not None else sys.maxsize + max_sentences = max_sentences if max_sentences is not None else sys.maxsize bsz_mult = required_batch_size_multiple - batch = [] - - def is_batch_full(num_tokens): - if len(batch) == 0: - return False - if len(batch) == max_sentences: - return True - if num_tokens > max_tokens: - return True - return False - - sample_len = 0 - sample_lens = [] - for idx in indices: - sample_lens.append(num_tokens_fn(idx)) - sample_len = max(sample_len, sample_lens[-1]) - assert sample_len <= max_tokens, ( - "sentence at index {} of size {} exceeds max_tokens " - "limit of {}!".format(idx, sample_len, max_tokens) - ) - num_tokens = (len(batch) + 1) * sample_len - if is_batch_full(num_tokens): - mod_len = max( - bsz_mult * (len(batch) // bsz_mult), - len(batch) % bsz_mult, - ) - yield batch[:mod_len] - batch = batch[mod_len:] - sample_lens = sample_lens[mod_len:] - sample_len = max(sample_lens) if len(sample_lens) > 0 else 0 - - batch.append(idx) - - if len(batch) > 0: - yield batch + if isinstance(indices, types.GeneratorType): + indices = np.fromiter(indices, dtype=np.int64, count=-1) + return batch_by_size_fast(indices, num_tokens_fn, max_tokens, max_sentences, bsz_mult) def process_bpe_symbol(sentence: str, bpe_symbol: str): diff --git a/fairseq/data/data_utils_fast.pyx b/fairseq/data/data_utils_fast.pyx new file mode 100644 index 0000000000..a9c6e57b34 --- /dev/null +++ b/fairseq/data/data_utils_fast.pyx @@ -0,0 +1,67 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import numpy as np + +cimport cython +cimport numpy as np + +DTYPE = np.int64 +ctypedef np.int64_t DTYPE_t + + +cdef _is_batch_full(list batch, long num_tokens, long max_tokens, long max_sentences): + if len(batch) == 0: + return 0 + if len(batch) == max_sentences: + return 1 + if num_tokens > max_tokens: + return 1 + return 0 + + +@cython.cdivision(True) +cpdef list batch_by_size_fast( + np.ndarray[DTYPE_t, ndim=1] indices, + num_tokens_fn, + long max_tokens, + long max_sentences, + int bsz_mult, +): + cdef long sample_len = 0 + cdef list sample_lens = [] + cdef list batch = [] + cdef list batches = [] + cdef long mod_len + cdef long i + cdef long idx + cdef long num_tokens + cdef DTYPE_t[:] indices_view = indices + + for i in range(len(indices_view)): + idx = indices_view[i] + num_tokens = num_tokens_fn(idx) + sample_lens.append(num_tokens) + sample_len = max(sample_len, num_tokens) + + assert sample_len <= max_tokens, ( + "sentence at index {} of size {} exceeds max_tokens " + "limit of {}!".format(idx, sample_len, max_tokens) + ) + num_tokens = (len(batch) + 1) * sample_len + + if _is_batch_full(batch, num_tokens, max_tokens, max_sentences): + mod_len = max( + bsz_mult * (len(batch) // bsz_mult), + len(batch) % bsz_mult, + ) + batches.append(batch[:mod_len]) + batch = batch[mod_len:] + sample_lens = sample_lens[mod_len:] + sample_len = max(sample_lens) if len(sample_lens) > 0 else 0 + batch.append(idx) + if len(batch) > 0: + batches.append(batch) + return batches diff --git a/fairseq/data/token_block_dataset.py b/fairseq/data/token_block_dataset.py index 6dd2cc8615..eddbea43ba 100644 --- a/fairseq/data/token_block_dataset.py +++ b/fairseq/data/token_block_dataset.py @@ -3,11 +3,14 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -import math - import numpy as np import torch +from fairseq.data.token_block_utils_fast import ( + _get_slice_indices_fast, + _get_block_to_dataset_index_fast, +) + from fairseq.data import FairseqDataset, plasma_utils @@ -33,7 +36,6 @@ class TokenBlockDataset(FairseqDataset): 'complete_doc' break mode). Typically 1 if the sentences have eos and 0 otherwise. 
""" - def __init__( self, dataset, @@ -50,70 +52,22 @@ def __init__( self.pad = pad self.eos = eos self.include_targets = include_targets - slice_indices = [] assert len(dataset) == len(sizes) assert len(dataset) > 0 - sizes = np.array(sizes, dtype=int) - if break_mode is None or break_mode == "none": - total_size = sum(sizes) - length = math.ceil(total_size / block_size) + if isinstance(sizes, list): + sizes = np.array(sizes, dtype=np.int64) + else: + sizes = sizes.astype(np.int64) - def block_at(i): - start = i * block_size - end = min(start + block_size, total_size) - return (start, end) + break_mode = break_mode if break_mode is not None else 'none' - slice_indices = [block_at(i) for i in range(length)] - elif break_mode == "complete": - tok_idx = 0 - sz_idx = 0 - curr_size = 0 - while sz_idx < len(sizes): - if curr_size + sizes[sz_idx] <= block_size or curr_size == 0: - curr_size += sizes[sz_idx] - sz_idx += 1 - else: - slice_indices.append((tok_idx, tok_idx + curr_size)) - tok_idx += curr_size - curr_size = 0 - if curr_size > 0: - slice_indices.append((tok_idx, tok_idx + curr_size)) - elif break_mode == "complete_doc": - tok_idx = 0 - sz_idx = 0 - curr_size = 0 - while sz_idx < len(sizes): - if ( - (curr_size + sizes[sz_idx] <= block_size or curr_size == 0) - # an empty sentence indicates end-of-document: - and sizes[sz_idx] != document_sep_len - ): - curr_size += sizes[sz_idx] - sz_idx += 1 - else: - if curr_size > 1: - slice_indices.append((tok_idx, tok_idx + curr_size)) - tok_idx += curr_size - curr_size = 0 - if sizes[sz_idx] == document_sep_len: - tok_idx += sizes[sz_idx] - sz_idx += 1 - if curr_size > 1: - slice_indices.append((tok_idx, tok_idx + curr_size)) - elif break_mode == "eos": - slice_indices = np.empty((len(sizes), 2), dtype=int) - if not torch.is_tensor(sizes): - sizes = torch.tensor(sizes) - cumsum = torch.cumsum(sizes, dim=0) - slice_indices[0] = [0, sizes[0]] - if len(cumsum) > 1: - slice_indices[1:] = cumsum.unfold(0, 2, 1) - else: - raise ValueError("Invalid break_mode: " + break_mode) + # For "eos" break-mode, block_size is not required parameters. 
+ if break_mode == "eos" and block_size is None: + block_size = 0 - slice_indices = np.array(slice_indices, dtype=int) + slice_indices = _get_slice_indices_fast(sizes, break_mode, block_size, document_sep_len) self._sizes = slice_indices[:, 1] - slice_indices[:, 0] # build index mapping block indices to the underlying dataset indices @@ -130,23 +84,10 @@ def block_at(i): 1, ) else: - ds = DatasetSearcher(sizes) - block_to_dataset_index = np.empty((len(slice_indices), 3), dtype=int) - for i, (s, e) in enumerate(slice_indices): - ds.seek(s) - start_ds_idx = ds.current_index - start_offset = ds.current_offset - if e <= s: - end_ds_idx = start_ds_idx - else: - ds.seek(e - 1) - end_ds_idx = ds.current_index - block_to_dataset_index[i] = ( - start_ds_idx, # starting index in dataset - start_offset, # starting offset within starting index - end_ds_idx, # ending index in dataset - ) - + block_to_dataset_index = _get_block_to_dataset_index_fast( + sizes, + slice_indices, + ) self._slice_indices = plasma_utils.PlasmaArray(slice_indices) self._sizes = plasma_utils.PlasmaArray(self._sizes) self._block_to_dataset_index = plasma_utils.PlasmaArray(block_to_dataset_index) @@ -215,42 +156,3 @@ def prefetch(self, indices): for ds_idx in range(start_ds_idx, end_ds_idx + 1) } ) - - -class DatasetSearcher(object): - """Helper for mapping "flat" indices to indices and offsets in an - underlying dataset.""" - - def __init__(self, sizes): - self.sizes = sizes - self.reset() - - def reset(self): - self.current_index = 0 # index in underlying dataset - self.current_offset = 0 # offset within current index in underlying dataset - self.current_i = 0 # "flat" index - - def seek(self, i): - assert i >= 0 - - def step(): - if i < self.current_i: - self.reset() - if i > self.current_i: - to_consume = i - self.current_i - remaining = self.sizes[self.current_index] - self.current_offset - if remaining > to_consume: - self.current_offset += to_consume - self.current_i += to_consume - else: - assert remaining > 0 - self.current_i += remaining - self.current_index += 1 - self.current_offset = 0 - return True - return False - - not_done = True - while not_done: - not_done = step() - assert self.current_i == i diff --git a/fairseq/data/token_block_utils_fast.pyx b/fairseq/data/token_block_utils_fast.pyx new file mode 100644 index 0000000000..bf3b0ecf07 --- /dev/null +++ b/fairseq/data/token_block_utils_fast.pyx @@ -0,0 +1,184 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import numpy as np +import torch +from itertools import chain +from libc.math cimport ceil + +cimport cython +cimport numpy as np + +DTYPE = np.int64 +ctypedef np.int64_t DTYPE_t + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.nonecheck(False) +cdef np.ndarray[DTYPE_t, ndim=2] _get_slice_indices_none_mode(np.ndarray[DTYPE_t, ndim=1] sizes, int block_size): + cdef DTYPE_t total_size = sizes.sum() + cdef DTYPE_t length = ceil(total_size / block_size) + cdef np.ndarray[DTYPE_t, ndim=2] slice_indices = np.zeros([length, 2], dtype=DTYPE) + cdef DTYPE_t[:, :] slice_indices_view = slice_indices + cdef DTYPE_t i + cdef DTYPE_t start + cdef DTYPE_t end + for i in range(length): + start = i * block_size + end = min(start + block_size, total_size) + slice_indices_view[i][0] = start + slice_indices_view[i][1] = end + return slice_indices + + +cdef np.ndarray[DTYPE_t, ndim=2] _fast_convert_to_np_array(list list_of_list): + """ + Faster function to convert DTYPE_t list of list. + Only fast when there are huge number of rows and low number of columns. + """ + cdef np.ndarray[DTYPE_t, ndim=1] flat = np.fromiter(chain.from_iterable(list_of_list), DTYPE, -1) + return flat.reshape((len(list_of_list), -1)) + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.nonecheck(False) +cpdef np.ndarray[DTYPE_t, ndim=2] _get_slice_indices_fast(np.ndarray[DTYPE_t, ndim=1] sizes, str break_mode, int block_size, int document_sep_len): + cdef DTYPE_t tok_idx = 0 + cdef DTYPE_t sz_idx = 0 + cdef DTYPE_t curr_size = 0 + cdef DTYPE_t i = 0 + cdef DTYPE_t length + cdef DTYPE_t total_size + cdef DTYPE_t[:] sizes_view = sizes + cdef np.ndarray[DTYPE_t, ndim=2] slice_indices + cdef list slice_indices_list = [] + + if break_mode is None or break_mode == 'none': + slice_indices = _get_slice_indices_none_mode(sizes, block_size) + elif break_mode == 'complete': + while sz_idx < len(sizes_view): + if curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0: + curr_size += sizes_view[sz_idx] + sz_idx += 1 + else: + slice_indices_list.append((tok_idx, tok_idx + curr_size)) + tok_idx += curr_size + curr_size = 0 + if curr_size > 0: + slice_indices_list.append((tok_idx, tok_idx + curr_size)) + slice_indices = _fast_convert_to_np_array(slice_indices_list) + elif break_mode == 'complete_doc': + while sz_idx < len(sizes_view): + if ( + (curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0) + # an empty sentence indicates end-of-document: + and sizes_view[sz_idx] != document_sep_len + ): + curr_size += sizes_view[sz_idx] + sz_idx += 1 + else: + # Only keep non-empty documents. 
+ if curr_size > 1: + slice_indices_list.append((tok_idx, tok_idx + curr_size)) + tok_idx += curr_size + curr_size = 0 + if sizes_view[sz_idx] == document_sep_len: + tok_idx += sizes_view[sz_idx] + sz_idx += 1 + if curr_size > 1: + slice_indices_list.append((tok_idx, tok_idx + curr_size)) + slice_indices = _fast_convert_to_np_array(slice_indices_list) + elif break_mode == 'eos': + slice_indices = np.zeros((len(sizes), 2), dtype=DTYPE) + cumsum = sizes.cumsum(axis=0) + slice_indices[1:, 0] = cumsum[:-1] + slice_indices[:, 1] = cumsum + else: + raise ValueError('Invalid break_mode: ' + break_mode) + return slice_indices + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.nonecheck(False) +cpdef np.ndarray[DTYPE_t, ndim=2] _get_block_to_dataset_index_fast(np.ndarray[DTYPE_t, ndim=1] sizes, np.ndarray[DTYPE_t, ndim=2] slice_indices): + cdef DTYPE_t start_ds_idx + cdef DTYPE_t start_offset + cdef DTYPE_t end_ds_idx + cdef DTYPE_t i + cdef DTYPE_t s + cdef DTYPE_t e + cdef DatasetSearcher ds = DatasetSearcher(sizes) + cdef np.ndarray[DTYPE_t, ndim=2] block_to_dataset_index = np.zeros([len(slice_indices), 3], dtype=DTYPE) + cdef DTYPE_t[:, :] block_to_dataset_index_view = block_to_dataset_index + cdef DTYPE_t[:, :] slice_indices_view = slice_indices + cdef Py_ssize_t x_max = slice_indices.shape[0] + + for i in range(x_max): + s = slice_indices_view[i][0] + e = slice_indices_view[i][1] + ds.seek(s) + start_ds_idx = ds.current_index + start_offset = ds.current_offset + if e <= s: + end_ds_idx = start_ds_idx + else: + ds.seek(e - 1) + end_ds_idx = ds.current_index + block_to_dataset_index_view[i][0] = start_ds_idx # starting index in dataset + block_to_dataset_index_view[i][1] = start_offset # starting offset within starting index + block_to_dataset_index_view[i][2] = end_ds_idx # ending index in dataset + return block_to_dataset_index + + +cdef class DatasetSearcher(object): + """Helper for mapping "flat" indices to indices and offsets in an + underlying dataset.""" + cdef DTYPE_t current_i + cdef DTYPE_t current_offset + cdef DTYPE_t current_index + cdef DTYPE_t[:] sizes + + def __init__(self, DTYPE_t[:] sizes): + self.sizes = sizes + self.reset() + + cdef reset(self): + self.current_offset = 0 # offset within current index in underlying dataset + self.current_i = 0 # "flat" index + self.current_index = 0 # index in underlying dataset + + @cython.boundscheck(False) + @cython.wraparound(False) + @cython.nonecheck(False) + cdef int step(self, DTYPE_t i): + cdef DTYPE_t to_consume + cdef DTYPE_t remaining + if i < self.current_i: + self.reset() + if i > self.current_i: + to_consume = i - self.current_i + remaining = self.sizes[self.current_index] - self.current_offset + if remaining > to_consume: + self.current_offset += to_consume + self.current_i += to_consume + else: + assert remaining > 0 + self.current_i += remaining + self.current_index += 1 + self.current_offset = 0 + return 1 + return 0 + + @cython.boundscheck(False) + @cython.wraparound(False) + @cython.nonecheck(False) + cdef seek(self, DTYPE_t i): + cdef int not_done = 1 + while not_done == 1: + not_done = self.step(i) + assert self.current_i == i diff --git a/fairseq/tasks/fairseq_task.py b/fairseq/tasks/fairseq_task.py index 3dea071629..cb7c1a8966 100644 --- a/fairseq/tasks/fairseq_task.py +++ b/fairseq/tasks/fairseq_task.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+import numpy as np import torch from fairseq import tokenizer @@ -134,6 +135,7 @@ def get_batch_iterator( indices = data_utils.filter_by_size( indices, dataset.size, max_positions, raise_exception=(not ignore_invalid_inputs), ) + indices = np.fromiter(indices, dtype=np.int64, count=-1) # create mini-batches with given size constraints batch_sampler = data_utils.batch_by_size( diff --git a/setup.py b/setup.py index 59d3410af0..537898dcc6 100644 --- a/setup.py +++ b/setup.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. from setuptools import setup, find_packages, Extension +from Cython.Build import cythonize import sys @@ -27,6 +28,8 @@ extra_compile_args=extra_compile_args, ) +token_block_utils = cythonize("fairseq/data/token_block_utils_fast.pyx") +data_utils_fast = cythonize("fairseq/data/data_utils_fast.pyx", language="c++") setup( name='fairseq', @@ -52,7 +55,7 @@ 'tqdm', ], packages=find_packages(exclude=['scripts', 'tests']), - ext_modules=[bleu], + ext_modules=token_block_utils + data_utils_fast + [bleu], test_suite='tests', entry_points={ 'console_scripts': [ @@ -65,4 +68,5 @@ 'fairseq-validate = fairseq_cli.validate:cli_main', ], }, + zip_safe=False, ) From 833f053dd73e1f9ff5f898b2c2aabf4e5b0ae865 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Fri, 23 Aug 2019 07:33:38 -0700 Subject: [PATCH 120/213] Suppress leaked semaphore warnings Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/844 Differential Revision: D16985131 Pulled By: myleott fbshipit-source-id: 66ba3b9aa0cdf329a1e38fc09786f34906afdb43 --- fairseq/data/iterators.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fairseq/data/iterators.py b/fairseq/data/iterators.py index 0f1ec7e404..7bae6ab355 100644 --- a/fairseq/data/iterators.py +++ b/fairseq/data/iterators.py @@ -5,6 +5,7 @@ import itertools import math +import os import numpy as np import torch @@ -248,6 +249,9 @@ def shuffle_batches(batches, seed): if offset > 0 and offset >= len(batches): return None + if self.num_workers > 0: + os.environ['PYTHONWARNINGS'] = 'ignore:semaphore_tracker:UserWarning' + return CountingIterator( torch.utils.data.DataLoader( self.dataset, From 8a8c0691baf1f9b626f99ce5e326bc710e004b71 Mon Sep 17 00:00:00 2001 From: Naman Goyal Date: Mon, 26 Aug 2019 07:18:01 -0700 Subject: [PATCH 121/213] fix cython dependency in the setup (#847) Summary: Fixes broken build for `pytext` https://github.com/pytorch/fairseq/commit/4fc39538aec5141aa41f5d6d7dc0097e7c0f7b48 Earlier version of setup tools required `cython` to be installed before even starting setup.py. This one fixes it. More details: https://github.com/pypa/setuptools/blob/master/CHANGES.rst#180 and https://stackoverflow.com/questions/37471313/setup-requires-with-cython Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/847 Differential Revision: D16997450 fbshipit-source-id: 5f65026c228a1b94280ca73937078ee3e21ce4f8 --- setup.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 537898dcc6..af47c58d5e 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,6 @@ # LICENSE file in the root directory of this source tree. 
from setuptools import setup, find_packages, Extension -from Cython.Build import cythonize import sys @@ -28,8 +27,8 @@ extra_compile_args=extra_compile_args, ) -token_block_utils = cythonize("fairseq/data/token_block_utils_fast.pyx") -data_utils_fast = cythonize("fairseq/data/data_utils_fast.pyx", language="c++") +token_block_utils = [Extension("fairseq.data.token_block_utils_fast", ["fairseq/data/token_block_utils_fast.pyx"])] +data_utils_fast = [Extension("fairseq.data.data_utils_fast", ["fairseq/data/data_utils_fast.pyx"], language="c++")] setup( name='fairseq', @@ -45,6 +44,10 @@ ], long_description=readme, long_description_content_type='text/markdown', + setup_requires=[ + 'cython', + 'setuptools>=18.0', + ], install_requires=[ 'cffi', 'fastBPE', From 3ab8e0fd73dc85eb62acbf2e0f25a350d33fd2d7 Mon Sep 17 00:00:00 2001 From: Alexei Baevski Date: Mon, 26 Aug 2019 18:30:17 -0700 Subject: [PATCH 122/213] wav2vec everstore support fix Summary: fixes some merge issues that prevented wav2vec from training properly Reviewed By: myleott Differential Revision: D16981120 fbshipit-source-id: cad39aaf2f44daabcbafe7b4e8735d055b3842a7 --- train.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/train.py b/train.py index afe9c10232..ce05403767 100644 --- a/train.py +++ b/train.py @@ -117,6 +117,10 @@ def train(args, trainer, task, epoch_itr): valid_subsets = args.valid_subset.split(',') max_update = args.max_update or math.inf for i, samples in enumerate(progress, start=epoch_itr.iterations_in_epoch): + samples = [s for s in samples if len(s) > 0] + if len(samples) == 0: + continue + log_output = trainer.train_step(samples) if log_output is None: continue From 396ff7f59f027a98d2df9951ed15045bdd91554b Mon Sep 17 00:00:00 2001 From: Naman Goyal Date: Tue, 27 Aug 2019 07:10:35 -0700 Subject: [PATCH 123/213] installing numpy headers for cython Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/848 Differential Revision: D17060283 fbshipit-source-id: c7e61cae76a0566cc3e2ddc3ab4d48f8dec9d777 --- fairseq/data/data_utils.py | 8 +++-- fairseq/data/token_block_dataset.py | 15 ++++++---- setup.py | 46 +++++++++++++++++++++++++---- 3 files changed, 57 insertions(+), 12 deletions(-) diff --git a/fairseq/data/data_utils.py b/fairseq/data/data_utils.py index dee29528a4..22c8c60db6 100644 --- a/fairseq/data/data_utils.py +++ b/fairseq/data/data_utils.py @@ -15,8 +15,6 @@ import sys import types -from fairseq.data.data_utils_fast import batch_by_size_fast - def infer_language_pair(path): """Infer language pair from filename: .-.(...).idx""" @@ -200,6 +198,12 @@ def batch_by_size( required_batch_size_multiple (int, optional): require batch size to be a multiple of N (default: 1). 
""" + try: + from fairseq.data.data_utils_fast import batch_by_size_fast + except ImportError: + raise ImportError( + 'Please build Cython components with: `pip install --editable .`' + ) max_tokens = max_tokens if max_tokens is not None else sys.maxsize max_sentences = max_sentences if max_sentences is not None else sys.maxsize bsz_mult = required_batch_size_multiple diff --git a/fairseq/data/token_block_dataset.py b/fairseq/data/token_block_dataset.py index eddbea43ba..192b7c155a 100644 --- a/fairseq/data/token_block_dataset.py +++ b/fairseq/data/token_block_dataset.py @@ -6,11 +6,6 @@ import numpy as np import torch -from fairseq.data.token_block_utils_fast import ( - _get_slice_indices_fast, - _get_block_to_dataset_index_fast, -) - from fairseq.data import FairseqDataset, plasma_utils @@ -47,6 +42,16 @@ def __init__( include_targets=False, document_sep_len=1, ): + try: + from fairseq.data.token_block_utils_fast import ( + _get_slice_indices_fast, + _get_block_to_dataset_index_fast, + ) + except ImportError: + raise ImportError( + 'Please build Cython components with: `pip install --editable .`' + ) + super().__init__() self.dataset = dataset self.pad = pad diff --git a/setup.py b/setup.py index af47c58d5e..d900b94654 100644 --- a/setup.py +++ b/setup.py @@ -15,9 +15,12 @@ readme = f.read() if sys.platform == 'darwin': - extra_compile_args = ['-stdlib=libc++'] + extra_compile_args = ['-stdlib=libc++', '-O3'] + extra_link_args = ['-stdlib=libc++'] else: - extra_compile_args = ['-std=c++11'] + extra_compile_args = ['-std=c++11', '-O3'] + extra_link_args = ['-std=c++11'] + bleu = Extension( 'fairseq.libbleu', sources=[ @@ -27,8 +30,39 @@ extra_compile_args=extra_compile_args, ) -token_block_utils = [Extension("fairseq.data.token_block_utils_fast", ["fairseq/data/token_block_utils_fast.pyx"])] -data_utils_fast = [Extension("fairseq.data.data_utils_fast", ["fairseq/data/data_utils_fast.pyx"], language="c++")] + +def get_cython_modules(): + token_block_utils = Extension( + "fairseq.data.token_block_utils_fast", + ["fairseq/data/token_block_utils_fast.pyx"], + extra_compile_args=extra_compile_args, + extra_link_args=extra_link_args, + ) + data_utils_fast = Extension( + "fairseq.data.data_utils_fast", + ["fairseq/data/data_utils_fast.pyx"], + language="c++", + extra_compile_args=extra_compile_args, + extra_link_args=extra_link_args, + ) + return [token_block_utils, data_utils_fast] + + +def my_build_ext(pars): + """ + Delay loading of numpy headers. 
+ More details: https://stackoverflow.com/questions/54117786/add-numpy-get-include-argument-to-setuptools-without-preinstalled-numpy + """ + from setuptools.command.build_ext import build_ext as _build_ext + + class build_ext(_build_ext): + def finalize_options(self): + _build_ext.finalize_options(self) + __builtins__.__NUMPY_SETUP__ = False + import numpy + self.include_dirs.append(numpy.get_include()) + return build_ext(pars) + setup( name='fairseq', @@ -45,6 +79,7 @@ long_description=readme, long_description_content_type='text/markdown', setup_requires=[ + 'numpy', 'cython', 'setuptools>=18.0', ], @@ -58,7 +93,7 @@ 'tqdm', ], packages=find_packages(exclude=['scripts', 'tests']), - ext_modules=token_block_utils + data_utils_fast + [bleu], + ext_modules=get_cython_modules() + [bleu], test_suite='tests', entry_points={ 'console_scripts': [ @@ -71,5 +106,6 @@ 'fairseq-validate = fairseq_cli.validate:cli_main', ], }, + cmdclass={'build_ext': my_build_ext}, zip_safe=False, ) From 920b85d4bd39e181229db5639c701c854c83ec5c Mon Sep 17 00:00:00 2001 From: Sosuke Kobayashi Date: Tue, 27 Aug 2019 08:39:33 -0700 Subject: [PATCH 124/213] Minor update of README.md of language model example (#1063) Summary: With this white space, the command might fail. ``` fairseq-preprocess: error: unrecognized arguments: zsh: command not found: --destdir ``` Pull Request resolved: https://github.com/pytorch/fairseq/pull/1063 Differential Revision: D17072516 Pulled By: myleott fbshipit-source-id: 68bb9d05b40b215b18aceac2bff3f5ec1ef2f537 --- examples/language_model/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/language_model/README.md b/examples/language_model/README.md index 6199e69ece..8c7da50f38 100644 --- a/examples/language_model/README.md +++ b/examples/language_model/README.md @@ -51,7 +51,7 @@ fairseq-preprocess \ --only-source \ --trainpref $TEXT/wiki.train.tokens \ --validpref $TEXT/wiki.valid.tokens \ - --testpref $TEXT/wiki.test.tokens \ + --testpref $TEXT/wiki.test.tokens \ --destdir data-bin/wikitext-103 \ --workers 20 ``` From d2410c4207b3a32cd1147236982abec2273a3d69 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Tue, 27 Aug 2019 10:06:26 -0700 Subject: [PATCH 125/213] Minor cleanup for setup.py Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1078 Differential Revision: D17072514 Pulled By: myleott fbshipit-source-id: 69a8c8c9cc7caa7e04c414329a5d79e6e1a6621c --- fairseq/data/data_utils.py | 6 +++-- hubconf.py | 1 + setup.py | 51 +++++++++++++++++++------------------- 3 files changed, 30 insertions(+), 28 deletions(-) diff --git a/fairseq/data/data_utils.py b/fairseq/data/data_utils.py index 22c8c60db6..9d72d93e6b 100644 --- a/fairseq/data/data_utils.py +++ b/fairseq/data/data_utils.py @@ -10,11 +10,11 @@ import contextlib import itertools import os - -import numpy as np import sys import types +import numpy as np + def infer_language_pair(path): """Infer language pair from filename: .-.(...).idx""" @@ -204,12 +204,14 @@ def batch_by_size( raise ImportError( 'Please build Cython components with: `pip install --editable .`' ) + max_tokens = max_tokens if max_tokens is not None else sys.maxsize max_sentences = max_sentences if max_sentences is not None else sys.maxsize bsz_mult = required_batch_size_multiple if isinstance(indices, types.GeneratorType): indices = np.fromiter(indices, dtype=np.int64, count=-1) + return batch_by_size_fast(indices, num_tokens_fn, max_tokens, max_sentences, bsz_mult) diff --git a/hubconf.py b/hubconf.py index 
34179c9dba..c13977085d 100644 --- a/hubconf.py +++ b/hubconf.py @@ -11,6 +11,7 @@ dependencies = [ + 'numpy', 'regex', 'requests', 'torch', diff --git a/setup.py b/setup.py index d900b94654..9ec2d7360d 100644 --- a/setup.py +++ b/setup.py @@ -11,47 +11,45 @@ if sys.version_info < (3,): sys.exit('Sorry, Python3 is required for fairseq.') + with open('README.md') as f: readme = f.read() + if sys.platform == 'darwin': extra_compile_args = ['-stdlib=libc++', '-O3'] - extra_link_args = ['-stdlib=libc++'] else: extra_compile_args = ['-std=c++11', '-O3'] - extra_link_args = ['-std=c++11'] - -bleu = Extension( - 'fairseq.libbleu', - sources=[ - 'fairseq/clib/libbleu/libbleu.cpp', - 'fairseq/clib/libbleu/module.cpp', - ], - extra_compile_args=extra_compile_args, -) -def get_cython_modules(): - token_block_utils = Extension( - "fairseq.data.token_block_utils_fast", - ["fairseq/data/token_block_utils_fast.pyx"], +extensions = [ + Extension( + 'fairseq.libbleu', + sources=[ + 'fairseq/clib/libbleu/libbleu.cpp', + 'fairseq/clib/libbleu/module.cpp', + ], extra_compile_args=extra_compile_args, - extra_link_args=extra_link_args, - ) - data_utils_fast = Extension( - "fairseq.data.data_utils_fast", - ["fairseq/data/data_utils_fast.pyx"], - language="c++", + ), + Extension( + 'fairseq.data.data_utils_fast', + sources=['fairseq/data/data_utils_fast.pyx'], + language='c++', extra_compile_args=extra_compile_args, - extra_link_args=extra_link_args, - ) - return [token_block_utils, data_utils_fast] + ), + Extension( + 'fairseq.data.token_block_utils_fast', + sources=['fairseq/data/token_block_utils_fast.pyx'], + language='c++', + extra_compile_args=extra_compile_args, + ), +] def my_build_ext(pars): """ Delay loading of numpy headers. - More details: https://stackoverflow.com/questions/54117786/add-numpy-get-include-argument-to-setuptools-without-preinstalled-numpy + More details: https://stackoverflow.com/a/54138355 """ from setuptools.command.build_ext import build_ext as _build_ext @@ -81,6 +79,7 @@ def finalize_options(self): setup_requires=[ 'numpy', 'cython', + 'numpy', 'setuptools>=18.0', ], install_requires=[ @@ -93,7 +92,7 @@ def finalize_options(self): 'tqdm', ], packages=find_packages(exclude=['scripts', 'tests']), - ext_modules=get_cython_modules() + [bleu], + ext_modules=extensions, test_suite='tests', entry_points={ 'console_scripts': [ From 108f94bc2aaf32ba1882dc9fd8f014496fe8f0c5 Mon Sep 17 00:00:00 2001 From: Naman Goyal Date: Wed, 28 Aug 2019 11:35:15 -0700 Subject: [PATCH 126/213] use numpy function for filter by size when possible (#845) Summary: For general Masked language modeling use-case, this is much faster, (`3 minutes vs 1 sec`). Let me know what you think about it myleott, if you don't like all the special case checking, we can think of reorganizing the dataset APIs to always have `sizes` as property calculated in `__init__`. 
Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/845 Reviewed By: myleott Differential Revision: D16993769 Pulled By: myleott fbshipit-source-id: 161bba62af2965190c07c47e838ee967cb886e88 --- fairseq/data/data_utils.py | 63 +++++++++++++++++++++++------------ fairseq/tasks/fairseq_task.py | 3 +- 2 files changed, 42 insertions(+), 24 deletions(-) diff --git a/fairseq/data/data_utils.py b/fairseq/data/data_utils.py index 9d72d93e6b..ac1aa330a6 100644 --- a/fairseq/data/data_utils.py +++ b/fairseq/data/data_utils.py @@ -124,18 +124,7 @@ def collect_filtered(function, iterable, filtered): filtered.append(el) -def filter_by_size(indices, size_fn, max_positions, raise_exception=False): - """ - Filter indices based on their size. - - Args: - indices (List[int]): ordered list of dataset indices - size_fn (callable): function that returns the size of a given index - max_positions (tuple): filter elements larger than this size. - Comparisons are done component-wise. - raise_exception (bool, optional): if ``True``, raise an exception if - any elements are filtered (default: False). - """ +def _filter_by_size_dynamic(indices, size_fn, max_positions, raise_exception=False): def check_size(idx): if isinstance(max_positions, float) or isinstance(max_positions, int): return size_fn(idx) <= max_positions @@ -158,25 +147,55 @@ def check_size(idx): # For MultiCorpusSampledDataset, will generalize it later if not isinstance(size_fn(idx), Iterable): return all(size_fn(idx) <= b for b in max_positions) - return all(a is None or b is None or a <= b - for a, b in zip(size_fn(idx), max_positions)) - + return all( + a is None or b is None or a <= b + for a, b in zip(size_fn(idx), max_positions) + ) ignored = [] itr = collect_filtered(check_size, indices, ignored) + indices = np.fromiter(itr, dtype=np.int64, count=-1) + return indices, ignored - for idx in itr: - if len(ignored) > 0 and raise_exception: - raise Exception(( - 'Size of sample #{} is invalid (={}) since max_positions={}, ' - 'skip this example with --skip-invalid-size-inputs-valid-test' - ).format(ignored[0], size_fn(ignored[0]), max_positions)) - yield idx +def filter_by_size(indices, dataset, max_positions, raise_exception=False): + """ + Filter indices based on their size. + + Args: + indices (List[int]): ordered list of dataset indices + dataset (FairseqDataset): fairseq dataset instance + max_positions (tuple): filter elements larger than this size. + Comparisons are done component-wise. + raise_exception (bool, optional): if ``True``, raise an exception if + any elements are filtered (default: False). 
+ """ + if isinstance(max_positions, float) or isinstance(max_positions, int): + if hasattr(dataset, 'sizes') and isinstance(dataset.sizes, np.ndarray): + ignored = indices[dataset.sizes > max_positions].tolist() + indices = indices[dataset.sizes <= max_positions] + elif ( + hasattr(dataset, 'sizes') and + isinstance(dataset.sizes, list) and + len(dataset.sizes) == 1 + ): + ignored = indices[dataset.sizes[0] > max_positions].tolist() + indices = indices[dataset.sizes[0] <= max_positions] + else: + indices, ignored = _filter_by_size_dynamic(indices, dataset.size, max_positions) + else: + indices, ignored = _filter_by_size_dynamic(indices, dataset.size, max_positions) + + if len(ignored) > 0 and raise_exception: + raise Exception(( + 'Size of sample #{} is invalid (={}) since max_positions={}, ' + 'skip this example with --skip-invalid-size-inputs-valid-test' + ).format(ignored[0], dataset.size(ignored[0]), max_positions)) if len(ignored) > 0: print(( '| WARNING: {} samples have invalid sizes and will be skipped, ' 'max_positions={}, first few sample ids={}' ).format(len(ignored), max_positions, ignored[:10])) + return indices def batch_by_size( diff --git a/fairseq/tasks/fairseq_task.py b/fairseq/tasks/fairseq_task.py index cb7c1a8966..1e2b623be8 100644 --- a/fairseq/tasks/fairseq_task.py +++ b/fairseq/tasks/fairseq_task.py @@ -133,9 +133,8 @@ def get_batch_iterator( # filter examples that are too large if max_positions is not None: indices = data_utils.filter_by_size( - indices, dataset.size, max_positions, raise_exception=(not ignore_invalid_inputs), + indices, dataset, max_positions, raise_exception=(not ignore_invalid_inputs), ) - indices = np.fromiter(indices, dtype=np.int64, count=-1) # create mini-batches with given size constraints batch_sampler = data_utils.batch_by_size( From 0a96d22f2ea7b1158e8da2d348e93ddb91ef1f7d Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Wed, 28 Aug 2019 21:42:30 -0700 Subject: [PATCH 127/213] Fix multi-gpu training (fixes #1088) Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1089 Differential Revision: D17108918 Pulled By: myleott fbshipit-source-id: 818c77a5bbf3b146028991aca64d79b93f144b28 --- train.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/train.py b/train.py index ce05403767..afe9c10232 100644 --- a/train.py +++ b/train.py @@ -117,10 +117,6 @@ def train(args, trainer, task, epoch_itr): valid_subsets = args.valid_subset.split(',') max_update = args.max_update or math.inf for i, samples in enumerate(progress, start=epoch_itr.iterations_in_epoch): - samples = [s for s in samples if len(s) > 0] - if len(samples) == 0: - continue - log_output = trainer.train_step(samples) if log_output is None: continue From 8777465b8fdceeacb1d2fc8535b46c4971527f42 Mon Sep 17 00:00:00 2001 From: Paul O'Shannessy Date: Thu, 29 Aug 2019 23:19:10 -0700 Subject: [PATCH 128/213] Adopt Contributor Covenant Summary: In order to foster healthy open source communities, we're adopting the [Contributor Covenant](https://www.contributor-covenant.org/). It has been built by open source community members and represents a shared understanding of what is expected from a healthy community. 
Reviewed By: josephsavona, danobi, rdzhabarov Differential Revision: D17104640 fbshipit-source-id: d210000de686c5f0d97d602b50472d5869bc6a49 --- CODE_OF_CONDUCT.md | 77 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 1 deletion(-) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index ac27d8a51b..d1abc700d2 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,2 +1,77 @@ # Code of Conduct -Facebook has adopted a Code of Conduct that we expect project participants to adhere to. Please [read the full text](https://code.fb.com/codeofconduct) so that you can understand what actions will and will not be tolerated. \ No newline at end of file + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to make participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all project spaces, and it also applies when +an individual is representing the project or its community in public spaces. +Examples of representing a project or community include using an official +project e-mail address, posting via an official social media account, or acting +as an appointed representative at an online or offline event. Representation of +a project may be further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at . All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. 
+ +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq + From 4a7cd5825717b0b0c96a6463e0dd2d7b18dd4331 Mon Sep 17 00:00:00 2001 From: alexeib Date: Fri, 30 Aug 2019 16:23:40 -0700 Subject: [PATCH 129/213] set numpy seed explicitly + other minor fixes (#850) Summary: not setting the numpy seed explicitly at the beginning was an extremely annoying bug to find. it it caused different gpus to have a different view of data if some randomization was used in the dataset (e.g. subsample dataset) Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/850 Differential Revision: D17085006 Pulled By: alexeib fbshipit-source-id: 62bb2116369fb703df878e6bc24c06f1ea4e75a0 --- fairseq/data/replace_dataset.py | 24 +++++++++++++++++------- fairseq/data/subsample_dataset.py | 13 +++++++++++-- train.py | 2 ++ 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/fairseq/data/replace_dataset.py b/fairseq/data/replace_dataset.py index 670b812f45..3bc52f0fb5 100644 --- a/fairseq/data/replace_dataset.py +++ b/fairseq/data/replace_dataset.py @@ -7,20 +7,30 @@ class ReplaceDataset(BaseWrapperDataset): - def __init__(self, dataset, replace_map, offset=0): + """Replaces tokens found in the dataset by a specified replacement token + + Args: + dataset (~torch.utils.data.Dataset): dataset to replace tokens in + replace_map(Dictionary[int,int]): map of token to replace -> replacement token + offsets (List[int]): do not replace tokens before (from left if pos, right if neg) this offset. should be + as many as the number of objects returned by the underlying dataset __getitem__ method. + """ + + def __init__(self, dataset, replace_map, offsets): super().__init__(dataset) assert len(replace_map) > 0 self.replace_map = replace_map - self.offset = offset + self.offsets = offsets def __getitem__(self, index): item = self.dataset[index] is_tuple = isinstance(item, tuple) - src = item[0] if is_tuple else item + srcs = item if is_tuple else [item] - for k, v in self.replace_map.items(): - src_off = src[self.offset:] - src_off.masked_fill_(src_off == k, v) + for offset, src in zip(self.offsets, srcs): + for k, v in self.replace_map.items(): + src_off = src[offset:] if offset >= 0 else src[:offset] + src_off.masked_fill_(src_off == k, v) - item = tuple((src,) + item[1:]) if is_tuple else src + item = srcs if is_tuple else srcs[0] return item diff --git a/fairseq/data/subsample_dataset.py b/fairseq/data/subsample_dataset.py index 983a611393..f1c2942e52 100644 --- a/fairseq/data/subsample_dataset.py +++ b/fairseq/data/subsample_dataset.py @@ -9,15 +9,24 @@ class SubsampleDataset(BaseWrapperDataset): + """Subsamples a given dataset by a specified ratio. Subsampling is done on the number of examples + + Args: + dataset (~torch.utils.data.Dataset): dataset to subsample + size_ratio(float): the ratio to subsample to. 
must be between 0 and 1 (exclusive) + """ + def __init__(self, dataset, size_ratio): super().__init__(dataset) assert size_ratio < 1 self.actual_size = np.ceil(len(dataset) * size_ratio).astype(int) self.indices = np.random.choice( - range(len(self.dataset)), self.actual_size, replace=False + list(range(len(self.dataset))), self.actual_size, replace=False ) print( - "subsampled dataset from {} to {} (ratio={})".format(len(self.dataset), self.actual_size, size_ratio) + "subsampled dataset from {} to {} (ratio={})".format( + len(self.dataset), self.actual_size, size_ratio + ) ) def __getitem__(self, index): diff --git a/train.py b/train.py index afe9c10232..e4f0f7a5d2 100644 --- a/train.py +++ b/train.py @@ -9,6 +9,7 @@ import collections import math +import numpy as np import random import torch @@ -28,6 +29,7 @@ def main(args, init_distributed=False): # Initialize CUDA and distributed training if torch.cuda.is_available() and not args.cpu: torch.cuda.set_device(args.device_id) + np.random.seed(args.seed) torch.manual_seed(args.seed) if init_distributed: args.distributed_rank = distributed_utils.distributed_init(args) From c1951aa277618848bb007cd9459e153a4d84a1d1 Mon Sep 17 00:00:00 2001 From: alexeib Date: Sat, 31 Aug 2019 01:01:00 -0700 Subject: [PATCH 130/213] add missing colorize dataset Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/851 Differential Revision: D17145769 Pulled By: alexeib fbshipit-source-id: 9dd26799d044ae5386e8204a129b5e3fc66d6e85 --- fairseq/data/__init__.py | 2 ++ fairseq/data/colorize_dataset.py | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 fairseq/data/colorize_dataset.py diff --git a/fairseq/data/__init__.py b/fairseq/data/__init__.py index 1da1799c34..440d356b70 100644 --- a/fairseq/data/__init__.py +++ b/fairseq/data/__init__.py @@ -11,6 +11,7 @@ from .audio.raw_audio_dataset import FileAudioDataset from .backtranslation_dataset import BacktranslationDataset +from .colorize_dataset import ColorizeDataset from .concat_dataset import ConcatDataset from .concat_sentences_dataset import ConcatSentencesDataset from .id_dataset import IdDataset @@ -51,6 +52,7 @@ __all__ = [ 'BacktranslationDataset', 'BaseWrapperDataset', + 'ColorizeDataset', 'ConcatDataset', 'ConcatSentencesDataset', 'CountingIterator', diff --git a/fairseq/data/colorize_dataset.py b/fairseq/data/colorize_dataset.py new file mode 100644 index 0000000000..89e0e04142 --- /dev/null +++ b/fairseq/data/colorize_dataset.py @@ -0,0 +1,24 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch + +from . 
import BaseWrapperDataset + + +class ColorizeDataset(BaseWrapperDataset): + """ Adds 'colors' property to net input that is obtained from the provided color getter for use by models """ + def __init__(self, dataset, color_getter): + super().__init__(dataset) + self.color_getter = color_getter + + def collater(self, samples): + base_collate = super().collater(samples) + if len(base_collate) > 0: + base_collate["net_input"]["colors"] = torch.tensor( + list(self.color_getter(self.dataset, s["id"]) for s in samples), + dtype=torch.long, + ) + return base_collate From 746e59a26239a7cb96e42b879b81b887eeeb7454 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sat, 31 Aug 2019 13:43:00 -0700 Subject: [PATCH 131/213] Improve support for `python setup.py build_ext --inplace` Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/852 Differential Revision: D17147452 Pulled By: myleott fbshipit-source-id: 5fd9c7da3cc019c7beec98d41db1aef1329ee57a --- fairseq/data/data_utils.py | 9 +++------ fairseq/data/token_block_dataset.py | 3 ++- setup.py | 24 +++++++++--------------- train.py | 2 +- 4 files changed, 15 insertions(+), 23 deletions(-) diff --git a/fairseq/data/data_utils.py b/fairseq/data/data_utils.py index ac1aa330a6..234d2e921a 100644 --- a/fairseq/data/data_utils.py +++ b/fairseq/data/data_utils.py @@ -173,11 +173,7 @@ def filter_by_size(indices, dataset, max_positions, raise_exception=False): if hasattr(dataset, 'sizes') and isinstance(dataset.sizes, np.ndarray): ignored = indices[dataset.sizes > max_positions].tolist() indices = indices[dataset.sizes <= max_positions] - elif ( - hasattr(dataset, 'sizes') and - isinstance(dataset.sizes, list) and - len(dataset.sizes) == 1 - ): + elif hasattr(dataset, 'sizes') and isinstance(dataset.sizes, list) and len(dataset.sizes) == 1: ignored = indices[dataset.sizes[0] > max_positions].tolist() indices = indices[dataset.sizes[0] <= max_positions] else: @@ -221,7 +217,8 @@ def batch_by_size( from fairseq.data.data_utils_fast import batch_by_size_fast except ImportError: raise ImportError( - 'Please build Cython components with: `pip install --editable .`' + 'Please build Cython components with: `pip install --editable .` ' + 'or `python setup.py build_ext --inplace`' ) max_tokens = max_tokens if max_tokens is not None else sys.maxsize diff --git a/fairseq/data/token_block_dataset.py b/fairseq/data/token_block_dataset.py index 192b7c155a..f60b078d42 100644 --- a/fairseq/data/token_block_dataset.py +++ b/fairseq/data/token_block_dataset.py @@ -49,7 +49,8 @@ def __init__( ) except ImportError: raise ImportError( - 'Please build Cython components with: `pip install --editable .`' + 'Please build Cython components with: `pip install --editable .` ' + 'or `python setup.py build_ext --inplace`' ) super().__init__() diff --git a/setup.py b/setup.py index 9ec2d7360d..3a721685bd 100644 --- a/setup.py +++ b/setup.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. from setuptools import setup, find_packages, Extension +from setuptools.command.build_ext import build_ext import sys @@ -46,20 +47,13 @@ ] -def my_build_ext(pars): - """ - Delay loading of numpy headers. 
- More details: https://stackoverflow.com/a/54138355 - """ - from setuptools.command.build_ext import build_ext as _build_ext - - class build_ext(_build_ext): - def finalize_options(self): - _build_ext.finalize_options(self) - __builtins__.__NUMPY_SETUP__ = False - import numpy - self.include_dirs.append(numpy.get_include()) - return build_ext(pars) +class CustomBuildExtCommand(build_ext): + """Source: https://stackoverflow.com/a/42163080""" + def run(self): + # Import numpy here, only when headers are needed + import numpy + self.include_dirs.append(numpy.get_include()) + super().run() setup( @@ -105,6 +99,6 @@ def finalize_options(self): 'fairseq-validate = fairseq_cli.validate:cli_main', ], }, - cmdclass={'build_ext': my_build_ext}, + cmdclass={'build_ext': CustomBuildExtCommand}, zip_safe=False, ) diff --git a/train.py b/train.py index e4f0f7a5d2..31c9507760 100644 --- a/train.py +++ b/train.py @@ -9,9 +9,9 @@ import collections import math -import numpy as np import random +import numpy as np import torch from fairseq import checkpoint_utils, distributed_utils, options, progress_bar, tasks, utils From 8d4588b1bacc10e9df70ba28bd3a308b2ca894d6 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sat, 31 Aug 2019 16:52:03 -0700 Subject: [PATCH 132/213] Cleaner handling of numpy-based extensions in setup.py Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/853 Differential Revision: D17147879 Pulled By: myleott fbshipit-source-id: b1f5e838533de62ade52fa82112ea5308734c70f --- setup.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/setup.py b/setup.py index 3a721685bd..1ec74a1633 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,6 @@ # LICENSE file in the root directory of this source tree. 
from setuptools import setup, find_packages, Extension -from setuptools.command.build_ext import build_ext import sys @@ -23,6 +22,23 @@ extra_compile_args = ['-std=c++11', '-O3'] +class NumpyExtension(Extension): + """Source: https://stackoverflow.com/a/54128391""" + + def __init__(self, *args, **kwargs): + self.__include_dirs = [] + super().__init__(*args, **kwargs) + + @property + def include_dirs(self): + import numpy + return self.__include_dirs + [numpy.get_include()] + + @include_dirs.setter + def include_dirs(self, dirs): + self.__include_dirs = dirs + + extensions = [ Extension( 'fairseq.libbleu', @@ -32,13 +48,13 @@ ], extra_compile_args=extra_compile_args, ), - Extension( + NumpyExtension( 'fairseq.data.data_utils_fast', sources=['fairseq/data/data_utils_fast.pyx'], language='c++', extra_compile_args=extra_compile_args, ), - Extension( + NumpyExtension( 'fairseq.data.token_block_utils_fast', sources=['fairseq/data/token_block_utils_fast.pyx'], language='c++', @@ -47,15 +63,6 @@ ] -class CustomBuildExtCommand(build_ext): - """Source: https://stackoverflow.com/a/42163080""" - def run(self): - # Import numpy here, only when headers are needed - import numpy - self.include_dirs.append(numpy.get_include()) - super().run() - - setup( name='fairseq', version='0.8.0', @@ -71,7 +78,6 @@ def run(self): long_description=readme, long_description_content_type='text/markdown', setup_requires=[ - 'numpy', 'cython', 'numpy', 'setuptools>=18.0', @@ -99,6 +105,5 @@ def run(self): 'fairseq-validate = fairseq_cli.validate:cli_main', ], }, - cmdclass={'build_ext': CustomBuildExtCommand}, zip_safe=False, ) From 20dfba73b9993864b24b456b4e46e6e3994114c0 Mon Sep 17 00:00:00 2001 From: Naman Goyal Date: Sun, 1 Sep 2019 12:14:08 -0700 Subject: [PATCH 133/213] fixed numpy based size filtering (#854) Summary: This bug got introduced in my [commit](https://github.com/fairinternal/fairseq-py/commit/9624f9651478bcb88022decf7e1b0685b410133b) for fast numpy based size filtering. 
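To make the alignment issue concrete, here is a tiny hypothetical illustration (made-up arrays, not code from this patch): when `indices` is a permutation or subset of the dataset, the length mask has to be built from `sizes[indices]` so that it lines up with `indices`, rather than from `sizes` directly.

```python
# Hypothetical repro of the alignment bug being fixed (illustrative only).
import numpy as np

sizes = np.array([10, 900, 20])    # length of each dataset example
indices = np.array([2, 0, 1])      # e.g. indices already sorted by length
max_positions = 100

wrong = indices[sizes <= max_positions]           # mask aligned to dataset order
right = indices[sizes[indices] <= max_positions]  # mask aligned to `indices`
print(wrong, right)  # prints [2 1] [2 0]; the buggy version keeps the 900-token example
```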
Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/854 Differential Revision: D17150350 fbshipit-source-id: cb564119543e116d6a17784d1c22e9bce7059a0c --- fairseq/data/data_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fairseq/data/data_utils.py b/fairseq/data/data_utils.py index 234d2e921a..66b880fa35 100644 --- a/fairseq/data/data_utils.py +++ b/fairseq/data/data_utils.py @@ -171,11 +171,11 @@ def filter_by_size(indices, dataset, max_positions, raise_exception=False): """ if isinstance(max_positions, float) or isinstance(max_positions, int): if hasattr(dataset, 'sizes') and isinstance(dataset.sizes, np.ndarray): - ignored = indices[dataset.sizes > max_positions].tolist() - indices = indices[dataset.sizes <= max_positions] + ignored = indices[dataset.sizes[indices] > max_positions].tolist() + indices = indices[dataset.sizes[indices] <= max_positions] elif hasattr(dataset, 'sizes') and isinstance(dataset.sizes, list) and len(dataset.sizes) == 1: - ignored = indices[dataset.sizes[0] > max_positions].tolist() - indices = indices[dataset.sizes[0] <= max_positions] + ignored = indices[dataset.sizes[0][indices] > max_positions].tolist() + indices = indices[dataset.sizes[0][indices] <= max_positions] else: indices, ignored = _filter_by_size_dynamic(indices, dataset.size, max_positions) else: From 6c00b338c54b17bfd5343a4eabcb4d0df160764e Mon Sep 17 00:00:00 2001 From: altale <578340750@qq.com> Date: Tue, 3 Sep 2019 07:44:02 -0700 Subject: [PATCH 134/213] Fix an error in the command about Hierarchical Neural Story Generation (#1099) Summary: When I try to reproduce the experiment in _Hierarchical Neural Story Generation_, I found the command about generation cannot be executed. It said that **fairseq-generate: error: unrecognized arguments: --sampling-temperature 0.8** In the document, I find: ``` --temperature temperature for generation Default: 1.0 ``` And I don't find a parameter named `--sampling-temperature`, so I think the parameter `--sampling-temperature` should be changed to `--temperature` Pull Request resolved: https://github.com/pytorch/fairseq/pull/1099 Differential Revision: D17163065 Pulled By: myleott fbshipit-source-id: 25c430eeee4703f8ec30353825ffec4bb973da0d --- examples/stories/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/stories/README.md b/examples/stories/README.md index 625439e81a..a0a063787f 100644 --- a/examples/stories/README.md +++ b/examples/stories/README.md @@ -52,7 +52,7 @@ fairseq-train data-bin/writingPrompts -a fconv_self_att_wp --lr 0.25 --clip-norm # Generate: # Note: to load the pretrained model at generation time, you need to pass in a model-override argument to communicate to the fusion model at generation time where you have placed the pretrained checkpoint. By default, it will load the exact path of the fusion model's pretrained model from training time. You should use model-override if you have moved the pretrained model (or are using our provided models). If you are generating from a non-fusion model, the model-override argument is not necessary. 
-fairseq-generate data-bin/writingPrompts --path /path/to/trained/model/checkpoint_best.pt --batch-size 32 --beam 1 --sampling --sampling-topk 10 --sampling-temperature 0.8 --nbest 1 --model-overrides "{'pretrained_checkpoint':'/path/to/pretrained/model/checkpoint'}" +fairseq-generate data-bin/writingPrompts --path /path/to/trained/model/checkpoint_best.pt --batch-size 32 --beam 1 --sampling --sampling-topk 10 --temperature 0.8 --nbest 1 --model-overrides "{'pretrained_checkpoint':'/path/to/pretrained/model/checkpoint'}" ``` ## Citation From 1f0f7cd82ce88ef1c074c2e30ac5417f8d104cb1 Mon Sep 17 00:00:00 2001 From: Naman Goyal Date: Tue, 3 Sep 2019 09:06:44 -0700 Subject: [PATCH 135/213] added cython to install_requires Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/856 Reviewed By: myleott Differential Revision: D17162411 Pulled By: myleott fbshipit-source-id: e70ecc802398bbba2b5326e9700f2121c422fd18 --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 1ec74a1633..8f4604be11 100644 --- a/setup.py +++ b/setup.py @@ -84,6 +84,7 @@ def include_dirs(self, dirs): ], install_requires=[ 'cffi', + 'cython', 'fastBPE', 'numpy', 'regex', From 1566cfb9634fd06c551df510ac24f36c4bfca5a6 Mon Sep 17 00:00:00 2001 From: Peng-Jen Chen Date: Tue, 3 Sep 2019 20:18:03 -0700 Subject: [PATCH 136/213] Fix multilingual translation bug for to-many case Summary: The logic for adding decoder side language token was wrongly implemented. The way we inject the language token is by replacing the eos symbol with language token symbol. However, the parameter for source / target eos symbol was not set correctly. Reviewed By: tangyuq Differential Revision: D17129108 fbshipit-source-id: 6fae385b787370656fd7ca7ab74e6bb91fe5463b --- fairseq/tasks/multilingual_translation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fairseq/tasks/multilingual_translation.py b/fairseq/tasks/multilingual_translation.py index 7b359ea868..b0cf304104 100644 --- a/fairseq/tasks/multilingual_translation.py +++ b/fairseq/tasks/multilingual_translation.py @@ -214,8 +214,9 @@ def language_pair_dataset(lang_pair): ) return self.alter_dataset_langtok( langpair_dataset, - src_eos=self.dicts[tgt].eos(), + src_eos=self.dicts[src].eos(), src_lang=src, + tgt_eos=self.dicts[tgt].eos(), tgt_lang=tgt, ) @@ -239,6 +240,7 @@ def build_dataset_for_inference(self, src_tokens, src_lengths): ), src_eos=self.source_dictionary.eos(), src_lang=self.args.source_lang, + tgt_eos=self.target_dictionary.eos(), tgt_lang=self.args.target_lang, ), )]), From 3e3fe72299980f53262880e24e372ed7d785093c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roman=20R=C3=A4dle?= Date: Thu, 5 Sep 2019 15:33:53 -0700 Subject: [PATCH 137/213] Return predicted token for RoBERTa filling mask Summary: Added the `predicted_token` to each `topk` filled output item Updated RoBERTa filling mask example in README.md Reviewed By: myleott Differential Revision: D17188810 fbshipit-source-id: 5fdc57ff2c13239dabf13a8dad43ae9a55e8931c --- examples/roberta/README.md | 6 +++--- fairseq/models/roberta/hub_interface.py | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/roberta/README.md b/examples/roberta/README.md index 9006e4f193..1b8d637ccb 100644 --- a/examples/roberta/README.md +++ b/examples/roberta/README.md @@ -167,13 +167,13 @@ RoBERTa can be used to fill `` tokens in the input. 
Some examples from the [Natural Questions dataset](https://ai.google.com/research/NaturalQuestions/): ```python roberta.fill_mask('The first Star wars movie came out in ', topk=3) -# [('The first Star wars movie came out in 1977', 0.9504712224006653), ('The first Star wars movie came out in 1978', 0.009986752644181252), ('The first Star wars movie came out in 1979', 0.00957468245178461)] +# [('The first Star wars movie came out in 1977', 0.9504708051681519, ' 1977'), ('The first Star wars movie came out in 1978', 0.009986862540245056, ' 1978'), ('The first Star wars movie came out in 1979', 0.009574787691235542, ' 1979')] roberta.fill_mask('Vikram samvat calender is official in ', topk=3) -# [('Vikram samvat calender is official in India', 0.21878768503665924), ('Vikram samvat calender is official in Delhi', 0.08547217398881912), ('Vikram samvat calender is official in Gujarat', 0.07556255906820297)] +# [('Vikram samvat calender is official in India', 0.21878819167613983, ' India'), ('Vikram samvat calender is official in Delhi', 0.08547237515449524, ' Delhi'), ('Vikram samvat calender is official in Gujarat', 0.07556215673685074, ' Gujarat')] roberta.fill_mask(' is the common currency of the European Union', topk=3) -# [('Euro is the common currency of the European Union', 0.945650577545166), ('euro is the common currency of the European Union', 0.025747718289494514), ('€ is the common currency of the European Union', 0.011183015070855618)] +# [('Euro is the common currency of the European Union', 0.9456493854522705, 'Euro'), ('euro is the common currency of the European Union', 0.025748178362846375, 'euro'), ('€ is the common currency of the European Union', 0.011183084920048714, '€')] ``` #### Pronoun disambiguation (Winograd Schema Challenge): diff --git a/fairseq/models/roberta/hub_interface.py b/fairseq/models/roberta/hub_interface.py index e40e4ab92a..216b6fd90f 100644 --- a/fairseq/models/roberta/hub_interface.py +++ b/fairseq/models/roberta/hub_interface.py @@ -174,11 +174,13 @@ def fill_mask(self, masked_input: str, topk: int = 5): ' {0}'.format(masked_token), predicted_token ), values[index].item(), + predicted_token, )) else: topk_filled_outputs.append(( masked_input.replace(masked_token, predicted_token), values[index].item(), + predicted_token, )) return topk_filled_outputs From 1fd8943e94cd303b8bbdc64f6aeb064732ecaf9b Mon Sep 17 00:00:00 2001 From: Nayan Singhal Date: Thu, 12 Sep 2019 10:56:16 -0700 Subject: [PATCH 138/213] Average local optimizer param after warmup and during bmuf sync Summary: We have seen that averaging the local param instead of doing reset or broadcast after warmup improves the WER. Reviewed By: skritika Differential Revision: D16739278 fbshipit-source-id: 75033d2d25f9a88fd6dd325d0d9d4c856d22d947 --- fairseq/optim/adam.py | 12 ++++ fairseq/optim/bmuf.py | 108 +++++++++++++++++------------ fairseq/optim/fairseq_optimizer.py | 3 + 3 files changed, 78 insertions(+), 45 deletions(-) diff --git a/fairseq/optim/adam.py b/fairseq/optim/adam.py index 51a282380c..80de7f00d9 100644 --- a/fairseq/optim/adam.py +++ b/fairseq/optim/adam.py @@ -8,6 +8,7 @@ import torch import torch.optim +import torch.distributed as dist from . 
import FairseqOptimizer, register_optimizer @@ -53,6 +54,17 @@ def optimizer_config(self): 'weight_decay': self.args.weight_decay, } + def average_params(self): + """Reduce Params is only used during BMUF distributed training.""" + state_dict = self.optimizer.state_dict() + total_gpus = float(dist.get_world_size()) + + for _, value in state_dict["state"].items(): + value["exp_avg"] /= total_gpus + value["exp_avg_sq"] /= total_gpus + dist.all_reduce(value["exp_avg"], op=dist.ReduceOp.SUM) + dist.all_reduce(value["exp_avg_sq"], op=dist.ReduceOp.SUM) + class Adam(torch.optim.Optimizer): """Implements Adam algorithm. diff --git a/fairseq/optim/bmuf.py b/fairseq/optim/bmuf.py index 651fe7e604..289b7e5831 100644 --- a/fairseq/optim/bmuf.py +++ b/fairseq/optim/bmuf.py @@ -31,6 +31,7 @@ def __init__(self, args, optimizer): self.warmup_iteration = self.args.warmup_iterations self.use_nbm = self.args.use_nbm self.initial_state = self._optimizer.state_dict() + self.average_sync = self.args.average_sync @staticmethod def add_args(parser): @@ -62,6 +63,12 @@ def add_args(parser): action="store_true", help="Specify whether you want to use classical BM / Nesterov BM", ) + parser.add_argument( + "--average-sync", + default=True, + action="store_true", + help="Specify whether you want to average the local momentum after each sync", + ) @property def optimizer(self): @@ -91,34 +98,50 @@ def clip_grad_norm(self, max_norm): """Clips gradient norm.""" return self._optimizer.clip_grad_norm(max_norm) + def average_params(self): + self._optimizer.average_params() + def _block_sync(self): - # Update the global model using local models from all GPUs. - if self._is_bmuf_iter(): - if self.block_momentum != 0: - self._BM_before_sync() + # Update the global model using local models from all GPUs + # (Step-1) Calculate grad between previously synced model and + # currrent local model + if self.block_momentum != 0: + self._calc_grad() + + # (Step-2) Average gradient from all GPUs + self._avg_grad_from_all_gpus() - self._allreduce_parameter() + # (Step-3) Calculate global momentum and update the global model + if self.block_momentum != 0: + self._update_global_model() - if self.block_momentum != 0: - self._BM_after_sync() + # (Step-4) Average local optimizer params + if self.average_sync: + self.average_params() def _is_warmup_end(self): + # Check whether train iterations is equal to warmup iter if self.get_num_updates() == self.warmup_iteration: return True return False def _is_bmuf_iter(self): + # Check whether train iterations is equal to bmuf sync iter if self.get_num_updates() % self.sync_iter == 0: return True return False - def _warmup_sync(self, rootRank=0): - # broadcast the local model to all GPUs + def _warmup_sync(self, root_rank=0): + # Broadcast the local model to all gpus for param in self.params: - dist.broadcast(param.data, src=rootRank) + dist.broadcast(param.data, src=root_rank) + + # Update local optimizer state + if self.average_sync: + self._optimizer.average_params() + else: + self._optimizer.load_state_dict(self.initial_state) - # Reset the local optimizer state and local bmuf related param - self._optimizer.load_state_dict(self.initial_state) self._reset_local_data() def step(self, closure=None): @@ -127,7 +150,7 @@ def step(self, closure=None): self.set_num_updates(self.get_num_updates() + 1) if self._is_warmup_end(): self._warmup_sync() - else: + elif self._is_bmuf_iter(): self._block_sync() def zero_grad(self): @@ -144,61 +167,56 @@ def set_num_updates(self, num_updates): @torch.no_grad() def 
_reset_local_data(self): - """Resetting all the BMUF specific params.""" - self.params_localprev = [torch.zeros_like(p.data) for p in self.params] - - self.smoothed_grads_localprev = [ - p.data.new_zeros(p.data.size()) for p in self.params - ] - self.grads_localprev = [p.data.new_zeros(p.data.size()) for p in self.params] + # (Step-0) Initialize global momentum parameters and store global copy on each gpu + self.global_params = [torch.zeros_like(p.data) for p in self.params] + self.smoothed_grads = [p.data.new_zeros(p.data.size()) for p in self.params] + self.grads = [p.data.new_zeros(p.data.size()) for p in self.params] # saving the global model locally for calculating gradient during bmuf sync - for param, copy_param in zip(self.params, self.params_localprev): - copy_param.copy_(param.data) + for param, global_param in zip(self.params, self.global_params): + global_param.copy_(param.data) @torch.no_grad() - def _BM_before_sync(self): - """Calculate grad between previously synced model and currrent local model.""" - # prev_param is basically the global copy from the previously finished + def _calc_grad(self): + # global_params is basically the global copy from the previously finished # synchronisation. param.data is local parameter after block_sync_freq # for the local gpu. so grad is difference between previously synced # model and currrent local model. - for index, (param, prev_param) in enumerate( - zip(self.params, self.params_localprev) + for index, (param, global_param) in enumerate( + zip(self.params, self.global_params) ): - self.grads_localprev[index] = prev_param - param.data + self.grads[index] = global_param - param.data - def _allreduce_parameter(self): - """Average gradient from all the GPUs. """ + def _avg_grad_from_all_gpus(self): for index, param in enumerate(self.params): - sync_para = ( - param.data if self.block_momentum == 0 else self.grads_localprev[index] - ) + sync_para = param.data if self.block_momentum == 0 else self.grads[index] sync_para /= float(dist.get_world_size()) dist.all_reduce(sync_para, op=dist.ReduceOp.SUM) @torch.no_grad() - def _BM_after_sync(self): - for index, (param, prev_param, smoothed_grad, grad) in enumerate( + def _update_global_model(self): + for index, (param, global_param, smoothed_grad, grad) in enumerate( zip( self.params, - self.params_localprev, - self.smoothed_grads_localprev, - # all machines would share the same value of smoothed_grad, since it is + self.global_params, + self.smoothed_grads, + # all gpus would share the same value of smoothed_grad, since it is # always computed on synchronized gradients. - self.grads_localprev, + self.grads, ) ): - # prev_param is basically last syncrhornized parameter. though + # global_param is basically last syncrhornized parameter. though # smoothed_grad is local, all processes will have same value of # smoothed_grad and hence param is globally synchronized copy. - # smoothed_grad(t)=BM * smoothed_grad(t-1) + BM_lr*grad(t) - smoothed_grad = smoothed_grad * self.block_momentum + grad * self.block_lr - param.data.copy_(prev_param - smoothed_grad) + # smoothed_grad(t) = BM * smoothed_grad(t-1) + BM_lr * grad(t) + smoothed_grad = self.block_momentum * smoothed_grad + self.block_lr * grad + param.data.copy_(global_param - smoothed_grad) + # A Nesterov momentum here is to do a partial weight update before # calculating the gradient if self.use_nbm: param.data.copy_(param.data - self.block_momentum * smoothed_grad) + # backup for the next synchronization. 
- self.smoothed_grads_localprev[index] = smoothed_grad - prev_param.copy_(param.data) + self.smoothed_grads[index] = smoothed_grad + global_param.copy_(param.data) diff --git a/fairseq/optim/fairseq_optimizer.py b/fairseq/optim/fairseq_optimizer.py index 030b1fe4a0..b850d69788 100644 --- a/fairseq/optim/fairseq_optimizer.py +++ b/fairseq/optim/fairseq_optimizer.py @@ -108,3 +108,6 @@ def supports_memory_efficient_fp16(self): if hasattr(self.optimizer, 'supports_memory_efficient_fp16'): return self.optimizer.supports_memory_efficient_fp16 return False + + def average_params(self): + pass From e1ba32aae224897bde5dd421f6be04f1e93f72eb Mon Sep 17 00:00:00 2001 From: Naman Goyal Date: Mon, 16 Sep 2019 12:52:05 -0700 Subject: [PATCH 139/213] added fast stats sync option (#858) Summary: Added `--fast-stat-sync` option. This avoids pickle and achieves `~7%` more `wps` on 16 nodes. It is less flexible as it just aggregates only basic stats and it ignores the aggregate function defined by criterion. Let me know what you think myleott Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/858 Differential Revision: D17398770 fbshipit-source-id: 36261a1d970e67deeda8211af8f009ef9b4f9c14 --- fairseq/criterions/cross_entropy.py | 1 + fairseq/criterions/masked_lm.py | 1 + fairseq/options.py | 3 ++ fairseq/trainer.py | 72 ++++++++++++++++++++++++----- 4 files changed, 66 insertions(+), 11 deletions(-) diff --git a/fairseq/criterions/cross_entropy.py b/fairseq/criterions/cross_entropy.py index d6e8ff545f..1996e9edf3 100644 --- a/fairseq/criterions/cross_entropy.py +++ b/fairseq/criterions/cross_entropy.py @@ -30,6 +30,7 @@ def forward(self, model, sample, reduce=True): sample_size = sample['target'].size(0) if self.args.sentence_avg else sample['ntokens'] logging_output = { 'loss': utils.item(loss.data) if reduce else loss.data, + 'nll_loss': utils.item(loss.data) if reduce else loss.data, 'ntokens': sample['ntokens'], 'nsentences': sample['target'].size(0), 'sample_size': sample_size, diff --git a/fairseq/criterions/masked_lm.py b/fairseq/criterions/masked_lm.py index a133b5fa41..d8907eba5f 100644 --- a/fairseq/criterions/masked_lm.py +++ b/fairseq/criterions/masked_lm.py @@ -47,6 +47,7 @@ def forward(self, model, sample, reduce=True): logging_output = { 'loss': utils.item(loss.data) if reduce else loss.data, + 'nll_loss': utils.item(loss.data) if reduce else loss.data, 'ntokens': sample['ntokens'], 'nsentences': sample['nsentences'], 'sample_size': sample_size, diff --git a/fairseq/options.py b/fairseq/options.py index 1bd54d5797..54c7863908 100644 --- a/fairseq/options.py +++ b/fairseq/options.py @@ -332,6 +332,9 @@ def add_distributed_training_args(parser): group.add_argument('--find-unused-parameters', default=False, action='store_true', help='disable unused parameter detection (not applicable to ' 'no_c10d ddp-backend') + group.add_argument('--fast-stat-sync', default=False, action='store_true', + help='Enable fast sync of stats between nodes, this hardcodes to ' + 'sync only some default stats from logging_output.') # fmt: on return group diff --git a/fairseq/trainer.py b/fairseq/trainer.py index 58448c83a7..cadfe8edc6 100644 --- a/fairseq/trainer.py +++ b/fairseq/trainer.py @@ -57,6 +57,11 @@ def __init__(self, args, task, model, criterion, dummy_batch=None, oom_batch=Non self._wrapped_criterion = None self._wrapped_model = None + # Fast stats sync avoids memcpy and is 7% faster when tested on 16 nodes. + # It is less flexible and syncs only the default stats. 
+ self._all_reduce_list = [0.0] * 6 + self.fast_stat_sync = args.fast_stat_sync + self.init_meters(args) def init_meters(self, args): @@ -292,6 +297,13 @@ def maybe_no_sync(): if not ignore_grad: logging_outputs.append(logging_output) sample_sizes.append(sample_size) + + if self.fast_stat_sync: + self._all_reduce_list[0] += sample_size + self._all_reduce_list[1] += logging_output.get('nsentences', 0.0) + self._all_reduce_list[2] += logging_output.get('loss', 0.0) + self._all_reduce_list[3] += logging_output.get('nll_loss', 0.0) + self._all_reduce_list[4] += logging_output.get('ntokens', 0.0) except RuntimeError as e: if 'out of memory' in str(e): msg = ( @@ -311,6 +323,10 @@ def maybe_no_sync(): else: raise e + if self.fast_stat_sync: + self._all_reduce_list[5] += ooms + + if ooms > 0 and self._oom_batch is not None: self.handle_ooms(ooms) @@ -318,13 +334,30 @@ def maybe_no_sync(): return None # gather logging outputs from all replicas - if self.args.distributed_world_size > 1 and ( - (not self.args.use_bmuf) - or ( - self.args.use_bmuf - and (self.get_num_updates() + 1) % self.args.global_sync_iter == 0 + if self.fast_stat_sync: + # rework all_gather_list + all_reduce_list_tensor = torch.cuda.DoubleTensor(self._all_reduce_list) + if self._sync_stats(): + torch.distributed.all_reduce(all_reduce_list_tensor) + # Normalize loss and nll_loss by "sample_size" + # and convert to log base 2 + all_reduce_list_tensor[2:4].div_( + ( + all_reduce_list_tensor[0:1] * + torch.log(torch.cuda.DoubleTensor([2])) + ) ) - ): + self._all_reduce_list = all_reduce_list_tensor.tolist() + logging_output = {} + [ + sample_size, + logging_output['nsentences'], + logging_output['loss'], + logging_output['nll_loss'], + logging_output['ntokens'], + ooms, + ] = self._all_reduce_list + elif self._sync_stats(): logging_outputs, sample_sizes, ooms, prev_norms = \ zip(*distributed_utils.all_gather_list( [logging_outputs, sample_sizes, ooms, self._prev_grad_norm], @@ -345,11 +378,12 @@ def maybe_no_sync(): self.zero_grad() return None - # aggregate logging outputs and sample sizes - logging_output = self.task.aggregate_logging_outputs( - logging_outputs, self.get_criterion() - ) - sample_size = self.task.grad_denom(sample_sizes, self.get_criterion()) + if not self.fast_stat_sync: + # aggregate logging outputs and sample sizes + logging_output = self.task.aggregate_logging_outputs( + logging_outputs, self.get_criterion() + ) + sample_size = self.task.grad_denom(sample_sizes, self.get_criterion()) if not all(k in logging_output for k in ['ntokens', 'nsentences']): raise Exception(( @@ -400,6 +434,7 @@ def maybe_no_sync(): self.meters['loss_scale'].reset() self.meters['loss_scale'].update(self.optimizer.scaler.loss_scale) + self.clear_buffered_stats() self.meters['train_wall'].stop() return logging_output @@ -484,6 +519,9 @@ def handle_ooms(self, number_of_ooms): def zero_grad(self): self.optimizer.zero_grad() + def clear_buffered_stats(self): + self._all_reduce_list = [0.0] * 6 + def lr_step(self, epoch, val_loss=None): """Adjust the learning rate based on the validation loss.""" self.lr_scheduler.step(epoch, val_loss) @@ -545,3 +583,15 @@ def _set_seed(self): torch.manual_seed(seed) if self.cuda: torch.cuda.manual_seed(seed) + + def _sync_stats(self): + return ( + self.args.distributed_world_size > 1 and + ( + (not self.args.use_bmuf) or + ( + self.args.use_bmuf + and (self.get_num_updates() + 1) % self.args.global_sync_iter == 0 + ) + ) + ) From a3882abfbd5e68c2d5d667be63fb670f3783a65b Mon Sep 17 00:00:00 2001 From: Myle 
Ott Date: Tue, 17 Sep 2019 13:39:15 -0700 Subject: [PATCH 140/213] Update README.md Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1140 Differential Revision: D17431506 Pulled By: myleott fbshipit-source-id: b47dae303d7e76daa5b49795476b5e48d7b090ad --- examples/roberta/commonsense_qa/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/roberta/commonsense_qa/README.md b/examples/roberta/commonsense_qa/README.md index b008fce17d..a984b9a242 100644 --- a/examples/roberta/commonsense_qa/README.md +++ b/examples/roberta/commonsense_qa/README.md @@ -39,7 +39,7 @@ DATA_DIR=data/CommonsenseQA FAIRSEQ_PATH=/path/to/fairseq FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/commonsense_qa -CUDA_VISIBLE_DEVICES=0 fairseq-train --fp16 \ +CUDA_VISIBLE_DEVICES=0 fairseq-train --fp16 --ddp-backend=no_c10d \ $DATA_DIR \ --user-dir $FAIRSEQ_USER_DIR \ --restore-file $ROBERTA_PATH \ @@ -51,7 +51,7 @@ CUDA_VISIBLE_DEVICES=0 fairseq-train --fp16 \ --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \ --criterion sentence_ranking --num-classes 5 \ --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 --clip-norm 0.0 \ - --lr-scheduler polynomial_decay --lr $LR + --lr-scheduler polynomial_decay --lr $LR \ --warmup-updates $WARMUP_UPDATES --total-num-update $MAX_UPDATES \ --max-sentences $MAX_SENTENCES \ --max-update $MAX_UPDATES \ From 31dd13fa651cb6a604c8f1f513394792b6678354 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Tue, 17 Sep 2019 13:59:46 -0700 Subject: [PATCH 141/213] Fix link to RACE fine-tuning instructions. Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1125 Differential Revision: D17431557 Pulled By: myleott fbshipit-source-id: f712e5355d8dbb0a8f1170674d62e2b6880295b4 --- examples/roberta/commonsense_qa/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/roberta/commonsense_qa/README.md b/examples/roberta/commonsense_qa/README.md index a984b9a242..7302794805 100644 --- a/examples/roberta/commonsense_qa/README.md +++ b/examples/roberta/commonsense_qa/README.md @@ -1,6 +1,6 @@ # Finetuning RoBERTa on Commonsense QA -We follow a similar approach to [finetuning RACE](README.race.md). Specifically +We follow a similar approach to [finetuning RACE](../README.race.md). Specifically for each question we construct five inputs, one for each of the five candidate answer choices. Each input is constructed by concatenating the question and candidate answer. We then encode each input and pass the resulting "[CLS]" From 718677ebb044e27aaf1a30640c2f7ab6b8fa8509 Mon Sep 17 00:00:00 2001 From: Naman Goyal Date: Wed, 18 Sep 2019 10:05:01 -0700 Subject: [PATCH 142/213] dont project maske tokens for mlm loss (#859) Summary: This saves ~4-5gb gpu memory while training roberta large with `seq_len=512`. 
I am able to fit `--max-sentences=16` on `volta32gb` for `roberta-large` Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/859 Differential Revision: D17435814 fbshipit-source-id: 2663909768fac0ef0102107613770ee01b1f8c00 --- fairseq/criterions/masked_lm.py | 8 ++++++-- fairseq/models/roberta/model.py | 17 ++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/fairseq/criterions/masked_lm.py b/fairseq/criterions/masked_lm.py index d8907eba5f..4eae5c3849 100644 --- a/fairseq/criterions/masked_lm.py +++ b/fairseq/criterions/masked_lm.py @@ -30,8 +30,11 @@ def forward(self, model, sample, reduce=True): 3) logging outputs to display while training """ # compute MLM loss - logits = model(**sample['net_input'], return_all_hiddens=False)[0] + masked_tokens = sample['target'].ne(self.padding_idx) + logits = model(**sample['net_input'], masked_tokens=masked_tokens)[0] targets = model.get_targets(sample, [logits]) + targets = targets[masked_tokens] + loss = F.nll_loss( F.log_softmax( logits.view(-1, logits.size(-1)), @@ -43,7 +46,7 @@ def forward(self, model, sample, reduce=True): ignore_index=self.padding_idx, ) - sample_size = targets.ne(self.padding_idx).int().sum().item() + sample_size = masked_tokens.int().sum().item() logging_output = { 'loss': utils.item(loss.data) if reduce else loss.data, @@ -64,6 +67,7 @@ def aggregate_logging_outputs(logging_outputs): agg_output = { 'loss': loss / sample_size / math.log(2), + 'nll_loss': sum(log.get('nll_loss', 0) for log in logging_outputs) / sample_size / math.log(2) if ntokens > 0 else 0., 'ntokens': ntokens, 'nsentences': nsentences, 'sample_size': sample_size, diff --git a/fairseq/models/roberta/model.py b/fairseq/models/roberta/model.py index e5528dfc9c..7b9cbba532 100644 --- a/fairseq/models/roberta/model.py +++ b/fairseq/models/roberta/model.py @@ -201,14 +201,17 @@ def __init__(self, embed_dim, output_dim, activation_fn, weight=None): self.weight = weight self.bias = nn.Parameter(torch.zeros(output_dim)) - def forward(self, features, **kwargs): + def forward(self, features, masked_tokens=None, **kwargs): + # Only project the unmasked tokens while training, + # saves both memory and computation + if masked_tokens is not None: + features = features[masked_tokens, :] + x = self.dense(features) x = self.activation_fn(x) x = self.layer_norm(x) - # project back to size of vocabulary with bias x = F.linear(x, self.weight) + self.bias - return x @@ -265,7 +268,7 @@ def __init__(self, args, dictionary): weight=self.sentence_encoder.embed_tokens.weight, ) - def forward(self, src_tokens, features_only=False, return_all_hiddens=False, **unused): + def forward(self, src_tokens, features_only=False, return_all_hiddens=False, masked_tokens=None, **unused): """ Args: src_tokens (LongTensor): input tokens of shape `(batch, src_len)` @@ -283,7 +286,7 @@ def forward(self, src_tokens, features_only=False, return_all_hiddens=False, **u """ x, extra = self.extract_features(src_tokens, return_all_hiddens) if not features_only: - x = self.output_layer(x) + x = self.output_layer(x, masked_tokens=masked_tokens) return x, extra def extract_features(self, src_tokens, return_all_hiddens=False, **unused): @@ -293,8 +296,8 @@ def extract_features(self, src_tokens, return_all_hiddens=False, **unused): features = inner_states[-1] return features, {'inner_states': inner_states if return_all_hiddens else None} - def output_layer(self, features, **unused): - return self.lm_head(features) + def output_layer(self, features, 
masked_tokens=None, **unused): + return self.lm_head(features, masked_tokens) def max_positions(self): """Maximum output length supported by the encoder.""" From 8dbee4ab8fdce42be0ac72dab100d8e1f8434979 Mon Sep 17 00:00:00 2001 From: Akhilesh Gotmare Date: Wed, 18 Sep 2019 15:20:27 -0700 Subject: [PATCH 143/213] Minor fix to make adafactor work for >2d conv kernels (#1122) Summary: missing .unsqueeze(-1) in line 124, without this change we'll encounter runtime error for >2d convolutional kernels, with this fix, we're applying adafactor's 2d logic to the two final dimensions. Pull Request resolved: https://github.com/pytorch/fairseq/pull/1122 Differential Revision: D17431662 Pulled By: myleott fbshipit-source-id: e7435e77270a9252f75f01b2457ef0048f5bcf36 --- fairseq/optim/adafactor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fairseq/optim/adafactor.py b/fairseq/optim/adafactor.py index 680ac371b9..00e6ed3138 100644 --- a/fairseq/optim/adafactor.py +++ b/fairseq/optim/adafactor.py @@ -121,7 +121,7 @@ def _rms(self, tensor): return tensor.norm(2) / (tensor.numel() ** 0.5) def _approx_sq_grad(self, exp_avg_sq_row, exp_avg_sq_col, output): - r_factor = (exp_avg_sq_row / exp_avg_sq_row.mean(dim=-1)).rsqrt_().unsqueeze(-1) + r_factor = (exp_avg_sq_row / exp_avg_sq_row.mean(dim=-1).unsqueeze(-1)).rsqrt_().unsqueeze(-1) c_factor = exp_avg_sq_col.unsqueeze(-2).rsqrt() torch.mul(r_factor, c_factor, out=output) From f994c9b8b5655b889bc58d9b20d51e538a470c3c Mon Sep 17 00:00:00 2001 From: Jerry Ma Date: Wed, 18 Sep 2019 15:57:15 -0700 Subject: [PATCH 144/213] Add autogenerated cython files to gitignore (#860) Summary: `python setup.py build_ext --inplace` generates C++ source files directly in the Python source tree. They should most likely be ignored by git. Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/860 Differential Revision: D17460597 Pulled By: jma127 fbshipit-source-id: 72a29d438ebb57627b68ec7e9a2a77c8a36f1c21 --- .gitignore | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 84ae18d953..0e8c3afb0c 100644 --- a/.gitignore +++ b/.gitignore @@ -110,12 +110,16 @@ ENV/ .mypy_cache/ # Generated files -fairseq/temporal_convolution_tbc -fairseq/modules/*_layer/*_forward.cu -fairseq/modules/*_layer/*_backward.cu +/fairseq/temporal_convolution_tbc +/fairseq/modules/*_layer/*_forward.cu +/fairseq/modules/*_layer/*_backward.cu # data data-bin/ # reranking -examples/reranking/rerank_data +/examples/reranking/rerank_data + +# Cython-generated C++ source files +/fairseq/data/data_utils_fast.cpp +/fairseq/data/token_block_utils_fast.cpp From 0eaaf3551659de40dcb273b7a0a3272e50a6e38a Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Wed, 18 Sep 2019 18:25:18 -0700 Subject: [PATCH 145/213] Add cython language_level hints Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1147 Differential Revision: D17468447 Pulled By: myleott fbshipit-source-id: 0dbac04b92c8df74ad991d5e92cd02036d662369 --- fairseq/data/data_utils_fast.pyx | 1 + fairseq/data/token_block_utils_fast.pyx | 1 + 2 files changed, 2 insertions(+) diff --git a/fairseq/data/data_utils_fast.pyx b/fairseq/data/data_utils_fast.pyx index a9c6e57b34..468c07292d 100644 --- a/fairseq/data/data_utils_fast.pyx +++ b/fairseq/data/data_utils_fast.pyx @@ -1,3 +1,4 @@ +# cython: language_level=3 # Copyright (c) Facebook, Inc. and its affiliates. 
# # This source code is licensed under the MIT license found in the diff --git a/fairseq/data/token_block_utils_fast.pyx b/fairseq/data/token_block_utils_fast.pyx index bf3b0ecf07..06160d683f 100644 --- a/fairseq/data/token_block_utils_fast.pyx +++ b/fairseq/data/token_block_utils_fast.pyx @@ -1,3 +1,4 @@ +# cython: language_level=3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the From a8a85c267693f450c8039f4cfec217df8390edde Mon Sep 17 00:00:00 2001 From: Jerry Ma Date: Thu, 19 Sep 2019 10:34:23 -0700 Subject: [PATCH 146/213] Add dataset class for weighted sampling with replacement. (#861) Summary: As discussed with Naman earlier today. Weighted sampling with replacement can be done on a per-epoch basis using `set_epoch()` functionality, which generates the samples as a function of random seed and epoch. Additionally, `FairseqTask` needs to set the starting epoch for the dataset at the very beginning of iterator construction. Not yet implemented is the per-epoch iterator construction, which is necessary to actually regenerate the batches for each epoch. Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/861 Differential Revision: D17460687 Pulled By: jma127 fbshipit-source-id: 1c2a54f04ac96b3561c100a6fd66a9fccbe3c658 --- fairseq/data/__init__.py | 1 + fairseq/data/resampling_dataset.py | 128 +++++++++++++++++++++++++++++ fairseq/tasks/fairseq_task.py | 3 + tests/test_resampling_dataset.py | 104 +++++++++++++++++++++++ 4 files changed, 236 insertions(+) create mode 100644 fairseq/data/resampling_dataset.py create mode 100644 tests/test_resampling_dataset.py diff --git a/fairseq/data/__init__.py b/fairseq/data/__init__.py index 440d356b70..a84cfbb538 100644 --- a/fairseq/data/__init__.py +++ b/fairseq/data/__init__.py @@ -41,6 +41,7 @@ from .transform_eos_dataset import TransformEosDataset from .transform_eos_lang_pair_dataset import TransformEosLangPairDataset from .truncate_dataset import TruncateDataset +from .resampling_dataset import ResamplingDataset from .iterators import ( CountingIterator, diff --git a/fairseq/data/resampling_dataset.py b/fairseq/data/resampling_dataset.py new file mode 100644 index 0000000000..8327889405 --- /dev/null +++ b/fairseq/data/resampling_dataset.py @@ -0,0 +1,128 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import numpy as np + +from . import BaseWrapperDataset, plasma_utils + + +class ResamplingDataset(BaseWrapperDataset): + """Randomly samples from a given dataset at each epoch. + + Sampling is done with or without replacement, depending on the "replace" + parameter. + + Optionally, the epoch size can be rescaled. This is potentially desirable + to increase per-epoch coverage of the base dataset (since sampling with + replacement means that many items in the dataset will be left out). In the + case of sampling without replacement, size_ratio should be strictly less + than 1. + + Args: + dataset (~torch.utils.data.Dataset): dataset on which to sample. + weights (List[float]): list of probability weights + (default: None, which corresponds to uniform sampling). + replace (bool): sampling mode; True for "with replacement", or False + for "without replacement" (default: True) + size_ratio (float): the ratio to subsample to; must be positive + (default: 1.0). 
+ batch_by_size (bool): whether or not to batch by sequence length + (default: True). + seed (int): RNG seed to use (default: 0). + epoch (int): starting epoch number (default: 0). + """ + + def __init__( + self, + dataset, + weights=None, + replace=True, + size_ratio=1.0, + batch_by_size=True, + seed=0, + epoch=0, + ): + super().__init__(dataset) + + if weights is None: + self.weights = None + + else: + assert len(weights) == len(dataset) + weights_arr = np.array(weights, dtype=np.float64) + weights_arr /= weights_arr.sum() + self.weights = plasma_utils.PlasmaArray(weights_arr) + + self.replace = replace + + assert size_ratio > 0.0 + if not self.replace: + assert size_ratio < 1.0 + self.size_ratio = float(size_ratio) + self.actual_size = np.ceil(len(dataset) * self.size_ratio).astype(int) + + self.batch_by_size = batch_by_size + self.seed = seed + + self._cur_epoch = None + self._cur_indices = None + + self.set_epoch(epoch) + + def __getitem__(self, index): + return self.dataset[self._cur_indices.array[index]] + + def __len__(self): + return self.actual_size + + @property + def sizes(self): + return self.dataset.sizes[self._cur_indices.array] + + def num_tokens(self, index): + return self.dataset.num_tokens(self._cur_indices.array[index]) + + def size(self, index): + return self.dataset.size(self._cur_indices.array[index]) + + def ordered_indices(self): + if self.batch_by_size: + order = [ + np.arange(len(self)), + self.sizes, + ] # No need to handle `self.shuffle == True` + return np.lexsort(order) + else: + return np.arange(len(self)) + + def prefetch(self, indices): + self.dataset.prefetch(self._cur_indices.array[indices]) + + def set_epoch(self, epoch): + super().set_epoch(epoch) + + if epoch == self._cur_epoch: + return + + self._cur_epoch = epoch + + # Generate a weighted sample of indices as a function of the + # random seed and the current epoch. + + rng = np.random.RandomState( + [ + 42, # magic number + self.seed % (2 ** 32), # global seed + self._cur_epoch, # epoch index + ] + ) + self._cur_indices = plasma_utils.PlasmaArray( + rng.choice( + len(self.dataset), + self.actual_size, + replace=self.replace, + p=(None if self.weights is None else self.weights.array), + ) + ) diff --git a/fairseq/tasks/fairseq_task.py b/fairseq/tasks/fairseq_task.py index 1e2b623be8..0806b66cce 100644 --- a/fairseq/tasks/fairseq_task.py +++ b/fairseq/tasks/fairseq_task.py @@ -126,6 +126,9 @@ def get_batch_iterator( """ assert isinstance(dataset, FairseqDataset) + # initialize the dataset with the correct starting epoch + dataset.set_epoch(epoch) + # get indices ordered by example size with data_utils.numpy_seed(seed): indices = dataset.ordered_indices() diff --git a/tests/test_resampling_dataset.py b/tests/test_resampling_dataset.py new file mode 100644 index 0000000000..0d142f5a8d --- /dev/null +++ b/tests/test_resampling_dataset.py @@ -0,0 +1,104 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
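+
+# These tests check that ResamplingDataset orders batches by size only when
+# batch_by_size is set, and that over many epochs each example's empirical
+# sampling frequency stays within 2% of the probability implied by its weight.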
+ +import collections +import unittest + +import numpy as np + +from fairseq.data import ListDataset, ResamplingDataset + + +class TestResamplingDataset(unittest.TestCase): + def setUp(self): + self.strings = ["ab", "c", "def", "ghij"] + self.weights = [4.0, 2.0, 7.0, 1.5] + self.size_ratio = 2 + self.dataset = ListDataset( + self.strings, np.array([len(s) for s in self.strings]) + ) + + def _test_common(self, resampling_dataset, iters): + assert len(self.dataset) == len(self.strings) == len(self.weights) + assert len(resampling_dataset) == self.size_ratio * len(self.strings) + + results = {"ordered_by_size": True, "max_distribution_diff": 0.0} + + totalfreqs = 0 + freqs = collections.defaultdict(int) + + for epoch_num in range(iters): + resampling_dataset.set_epoch(epoch_num) + + indices = resampling_dataset.ordered_indices() + assert len(indices) == len(resampling_dataset) + + prev_size = -1 + + for i in indices: + cur_size = resampling_dataset.size(i) + # Make sure indices map to same sequences within an epoch + assert resampling_dataset[i] == resampling_dataset[i] + + # Make sure length of sequence is correct + assert cur_size == len(resampling_dataset[i]) + + freqs[resampling_dataset[i]] += 1 + totalfreqs += 1 + + if prev_size > cur_size: + results["ordered_by_size"] = False + + prev_size = cur_size + + assert set(freqs.keys()) == set(self.strings) + for s, weight in zip(self.strings, self.weights): + freq = freqs[s] / totalfreqs + expected_freq = weight / sum(self.weights) + results["max_distribution_diff"] = max( + results["max_distribution_diff"], abs(expected_freq - freq) + ) + + return results + + def test_resampling_dataset_batch_by_size_false(self): + resampling_dataset = ResamplingDataset( + self.dataset, + self.weights, + size_ratio=self.size_ratio, + batch_by_size=False, + seed=0, + ) + + results = self._test_common(resampling_dataset, iters=1000) + + # For batch_by_size = False, the batches should be returned in + # arbitrary order of size. + assert not results["ordered_by_size"] + + # Allow tolerance in distribution error of 2%. + assert results["max_distribution_diff"] < 0.02 + + def test_resampling_dataset_batch_by_size_true(self): + resampling_dataset = ResamplingDataset( + self.dataset, + self.weights, + size_ratio=self.size_ratio, + batch_by_size=True, + seed=0, + ) + + results = self._test_common(resampling_dataset, iters=1000) + + # For batch_by_size = True, the batches should be returned in + # increasing order of size. + assert results["ordered_by_size"] + + # Allow tolerance in distribution error of 2%. + assert results["max_distribution_diff"] < 0.02 + + +if __name__ == "__main__": + unittest.main() From 32335404f09c47cccbfbf731abc4c510d0eef043 Mon Sep 17 00:00:00 2001 From: Naman Goyal Date: Fri, 20 Sep 2019 09:33:29 -0700 Subject: [PATCH 147/213] added multilingual masked LM training (#849) Summary: The multilingual-RoBERTa training is working with aconneau XLM data. Two pieces remaining: 1) `XLM` limits batch to be from same language, I am not 100% sure about the reason for that, but should be easy to implement, basically we can add `batch_by_size_and_language` instead of default `batch_by_size` function. If it's not critical, I would want to leave it out as it keeps the code very clean and simple. 2) `sample_ratio` in `ConcatDataset` works with `int` by tiling the datasets based on ratio. Currently I am handling it by sounding off the ratio to `first decimal` and then multiplying by `10`. 
We can see if some such simple heuristics are good enough, there are other options (we can talk about them offline). Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/849 Differential Revision: D17162460 fbshipit-source-id: d967f3d872f7a1f0aa4ea418bd362b68af9e432f --- fairseq/checkpoint_utils.py | 4 +- fairseq/data/__init__.py | 8 +- fairseq/data/concat_dataset.py | 12 +- fairseq/data/resampling_dataset.py | 2 + fairseq/models/roberta/model.py | 13 +- fairseq/tasks/fairseq_task.py | 10 +- fairseq/tasks/multilingual_masked_lm.py | 302 ++++++++++++++++++++++++ fairseq/trainer.py | 7 +- train.py | 6 +- 9 files changed, 348 insertions(+), 16 deletions(-) create mode 100644 fairseq/tasks/multilingual_masked_lm.py diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py index 4a8855d540..6b83bf07b2 100644 --- a/fairseq/checkpoint_utils.py +++ b/fairseq/checkpoint_utils.py @@ -120,10 +120,10 @@ def load_checkpoint(args, trainer): if extra_state is not None and not args.reset_dataloader: # restore iterator from checkpoint itr_state = extra_state['train_iterator'] - epoch_itr = trainer.get_train_iterator(epoch=itr_state['epoch']) + epoch_itr = trainer.get_train_iterator(epoch=itr_state['epoch'], load_dataset=True) epoch_itr.load_state_dict(itr_state) else: - epoch_itr = trainer.get_train_iterator(epoch=0) + epoch_itr = trainer.get_train_iterator(epoch=0, load_dataset=True) trainer.lr_step(epoch_itr.epoch) diff --git a/fairseq/data/__init__.py b/fairseq/data/__init__.py index a84cfbb538..aec8b819e7 100644 --- a/fairseq/data/__init__.py +++ b/fairseq/data/__init__.py @@ -32,6 +32,7 @@ from .prepend_token_dataset import PrependTokenDataset from .raw_label_dataset import RawLabelDataset from .replace_dataset import ReplaceDataset +from .resampling_dataset import ResamplingDataset from .round_robin_zip_datasets import RoundRobinZipDatasets from .sharded_dataset import ShardedDataset from .sort_dataset import SortDataset @@ -77,13 +78,14 @@ 'NoisingDataset', 'NumelDataset', 'NumSamplesDataset', - "OffsetTokensDataset", + 'OffsetTokensDataset', 'PadDataset', 'PrependDataset', 'PrependTokenDataset', 'ReplaceDataset', 'FileAudioDataset', - "RawLabelDataset", + 'RawLabelDataset', + 'ResamplingDataset' 'RightPadDataset', 'RoundRobinZipDatasets', 'ShardedDataset', @@ -94,6 +96,6 @@ 'TokenBlockDataset', 'TransformEosDataset', 'TransformEosLangPairDataset', - "TruncateDataset", + 'TruncateDataset', 'TruncatedDictionary', ] diff --git a/fairseq/data/concat_dataset.py b/fairseq/data/concat_dataset.py index b61ebbe46d..26405670bb 100644 --- a/fairseq/data/concat_dataset.py +++ b/fairseq/data/concat_dataset.py @@ -70,9 +70,15 @@ def attr(self, attr: str, index: int): @property def sizes(self): - return np.concatenate( - [np.tile(ds.sizes, sr) for ds, sr in zip(self.datasets, self.sample_ratios)] - ) + _dataset_sizes = [] + for ds, sr in zip(self.datasets, self.sample_ratios): + if isinstance(ds.sizes, np.ndarray): + _dataset_sizes.append(np.tile(ds.sizes, sr)) + else: + # Only support underlying dataset with single size array. 
+ assert isinstance(ds.sizes, list) + _dataset_sizes.append(np.tile(ds.sizes[0], sr)) + return np.concatenate(_dataset_sizes) @property def supports_prefetch(self): diff --git a/fairseq/data/resampling_dataset.py b/fairseq/data/resampling_dataset.py index 8327889405..8a2c48f83f 100644 --- a/fairseq/data/resampling_dataset.py +++ b/fairseq/data/resampling_dataset.py @@ -79,6 +79,8 @@ def __len__(self): @property def sizes(self): + if isinstance(self.dataset.sizes, list): + return [s[self._cur_indices.array] for s in self.dataset.sizes] return self.dataset.sizes[self._cur_indices.array] def num_tokens(self, index): diff --git a/fairseq/models/roberta/model.py b/fairseq/models/roberta/model.py index 7b9cbba532..ac94a04845 100644 --- a/fairseq/models/roberta/model.py +++ b/fairseq/models/roberta/model.py @@ -291,7 +291,8 @@ def forward(self, src_tokens, features_only=False, return_all_hiddens=False, mas def extract_features(self, src_tokens, return_all_hiddens=False, **unused): inner_states, _ = self.sentence_encoder( - src_tokens, last_state_only=not return_all_hiddens, + src_tokens, + last_state_only=not return_all_hiddens, ) features = inner_states[-1] return features, {'inner_states': inner_states if return_all_hiddens else None} @@ -332,3 +333,13 @@ def roberta_large_architecture(args): args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 4096) args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 16) base_architecture(args) + + +@register_model_architecture('roberta', 'xlm') +def xlm_architecture(args): + args.encoder_layers = getattr(args, 'encoder_layers', 16) + args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 1280) + args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 1280*4) + args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 16) + + base_architecture(args) diff --git a/fairseq/tasks/fairseq_task.py b/fairseq/tasks/fairseq_task.py index 0806b66cce..be95a633bb 100644 --- a/fairseq/tasks/fairseq_task.py +++ b/fairseq/tasks/fairseq_task.py @@ -24,6 +24,7 @@ def add_args(parser): def __init__(self, args): self.args = args self.datasets = {} + self.epoch_iter = None @classmethod def load_dictionary(cls, filename): @@ -124,6 +125,12 @@ def get_batch_iterator( ~fairseq.iterators.EpochBatchIterator: a batched iterator over the given dataset split """ + # For default fairseq task, return same iterator across epochs + # as datasets are not dynamic, can be overridden in task specific + # setting. + if self.epoch_iter is not None: + return self.epoch_iter + assert isinstance(dataset, FairseqDataset) # initialize the dataset with the correct starting epoch @@ -146,7 +153,7 @@ def get_batch_iterator( ) # return a reusable, sharded iterator - return iterators.EpochBatchIterator( + self.epoch_iter = iterators.EpochBatchIterator( dataset=dataset, collate_fn=dataset.collater, batch_sampler=batch_sampler, @@ -156,6 +163,7 @@ def get_batch_iterator( num_workers=num_workers, epoch=epoch, ) + return self.epoch_iter def build_model(self, args): """ diff --git a/fairseq/tasks/multilingual_masked_lm.py b/fairseq/tasks/multilingual_masked_lm.py new file mode 100644 index 0000000000..fd7bdde42a --- /dev/null +++ b/fairseq/tasks/multilingual_masked_lm.py @@ -0,0 +1,302 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
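+
+# Multilingual masked LM task: loads one sub-directory of binarized data per
+# language, applies masking per language, and (for the train split) resamples
+# languages with the --multilang-sampling-alpha temperature so that
+# low-resource languages can be upsampled.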
+ +import os + +import numpy as np +import torch + +from fairseq.data import ( + data_utils, + Dictionary, + encoders, + ConcatDataset, + IdDataset, + MaskTokensDataset, + NestedDictionaryDataset, + NumelDataset, + NumSamplesDataset, + PadDataset, + PrependTokenDataset, + RawLabelDataset, + ResamplingDataset, + SortDataset, + TokenBlockDataset, +) +from fairseq.tasks import FairseqTask, register_task + +@register_task('multilingual_masked_lm') +class MultiLingualMaskedLMTask(FairseqTask): + """Task for training masked language models (e.g., BERT, RoBERTa).""" + + @staticmethod + def add_args(parser): + """Add task-specific arguments to the parser.""" + parser.add_argument('data', help='colon separated path to data directories list, \ + will be iterated upon during epochs in round-robin manner') + parser.add_argument('--sample-break-mode', default='complete', + choices=['none', 'complete', 'complete_doc', 'eos'], + help='If omitted or "none", fills each sample with tokens-per-sample ' + 'tokens. If set to "complete", splits samples only at the end ' + 'of sentence, but may include multiple sentences per sample. ' + '"complete_doc" is similar but respects doc boundaries. ' + 'If set to "eos", includes only one sentence per sample.') + parser.add_argument('--tokens-per-sample', default=512, type=int, + help='max number of total tokens over all segments ' + 'per sample for BERT dataset') + parser.add_argument('--mask-prob', default=0.15, type=float, + help='probability of replacing a token with mask') + parser.add_argument('--leave-unmasked-prob', default=0.1, type=float, + help='probability that a masked token is unmasked') + parser.add_argument('--random-token-prob', default=0.1, type=float, + help='probability of replacing a token with a random token') + parser.add_argument('--freq-weighted-replacement', action='store_true', + help='sample random replacement words based on word frequencies') + parser.add_argument('--mask-whole-words', default=False, action='store_true', + help='mask whole words; you may also want to set --bpe') + parser.add_argument('--multilang-sampling-alpha', type=float, default=1.0, + help='smoothing alpha for sample rations across multiple datasets') + + def __init__(self, args, dictionary): + super().__init__(args) + self.dictionary = dictionary + self.seed = args.seed + + # add mask token + self.mask_idx = dictionary.add_symbol('') + + @classmethod + def setup_task(cls, args, **kwargs): + paths = args.data.split(':') + assert len(paths) > 0 + dictionary = Dictionary.load(os.path.join(paths[0], 'dict.txt')) + print('| dictionary: {} types'.format(len(dictionary))) + return cls(args, dictionary) + + def _get_whole_word_mask(self): + # create masked input and targets + if self.args.mask_whole_words: + bpe = encoders.build_bpe(self.args) + if bpe is not None: + + def is_beginning_of_word(i): + if i < self.source_dictionary.nspecial: + # special elements are always considered beginnings + return True + tok = self.source_dictionary[i] + if tok.startswith('madeupword'): + return True + try: + return bpe.is_beginning_of_word(tok) + except ValueError: + return True + + mask_whole_words = torch.ByteTensor(list( + map(is_beginning_of_word, range(len(self.source_dictionary))) + )) + else: + mask_whole_words = None + return mask_whole_words + + def _get_sample_prob(self, dataset_lens): + """ + Get smoothed sampling porbability by languages. This helps low resource + languages by upsampling them. 
+ """ + prob = dataset_lens / dataset_lens.sum() + smoothed_prob = prob ** self.args.multilang_sampling_alpha + smoothed_prob = smoothed_prob / smoothed_prob.sum() + return smoothed_prob + + def load_dataset(self, split, epoch=0, combine=False): + """Load a given dataset split. + + Args: + split (str): name of the split (e.g., train, valid, test) + """ + paths = self.args.data.split(':') + assert len(paths) > 0 + data_path = paths[epoch % len(paths)] + + languages = [ + name for name in os.listdir(data_path) + if os.path.isdir(os.path.join(data_path, name)) + ] + print("| Training on {0} languages: {1}".format(len(languages), languages)) + print("| Language to id mapping: ", { + lang: id for id, lang in enumerate(languages) + } + ) + + mask_whole_words = self._get_whole_word_mask() + lang_datasets = [] + for lang_id, language in enumerate(languages): + split_path = os.path.join(data_path, language, split) + + dataset = data_utils.load_indexed_dataset( + split_path, + self.source_dictionary, + self.args.dataset_impl, + combine=combine, + ) + if dataset is None: + raise FileNotFoundError('Dataset not found: {} ({})'.format(split, split_path)) + + # create continuous blocks of tokens + dataset = TokenBlockDataset( + dataset, + dataset.sizes, + self.args.tokens_per_sample - 1, # one less for + pad=self.source_dictionary.pad(), + eos=self.source_dictionary.eos(), + break_mode=self.args.sample_break_mode, + ) + print('| loaded {} blocks from: {}'.format(len(dataset), split_path)) + + # prepend beginning-of-sentence token (, equiv. to [CLS] in BERT) + dataset = PrependTokenDataset(dataset, self.source_dictionary.bos()) + + src_dataset, tgt_dataset = MaskTokensDataset.apply_mask( + dataset, + self.source_dictionary, + pad_idx=self.source_dictionary.pad(), + mask_idx=self.mask_idx, + seed=self.args.seed, + mask_prob=self.args.mask_prob, + leave_unmasked_prob=self.args.leave_unmasked_prob, + random_token_prob=self.args.random_token_prob, + freq_weighted_replacement=self.args.freq_weighted_replacement, + mask_whole_words=mask_whole_words, + ) + + lang_dataset = NestedDictionaryDataset( + { + 'net_input': { + 'src_tokens': PadDataset( + src_dataset, + pad_idx=self.source_dictionary.pad(), + left_pad=False, + ), + 'src_lengths': NumelDataset(src_dataset, reduce=False), + }, + 'target': PadDataset( + tgt_dataset, + pad_idx=self.source_dictionary.pad(), + left_pad=False, + ), + 'nsentences': NumSamplesDataset(), + 'ntokens': NumelDataset(src_dataset, reduce=True), + 'lang_id': RawLabelDataset([lang_id] * src_dataset.sizes.shape[0]), + }, + sizes=[src_dataset.sizes], + ) + lang_datasets.append(lang_dataset) + + if split == self.args.train_subset: + # For train subset, additionally up or down sample languages. 
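+            # Temperature-based resampling: _get_sample_prob gives each language
+            # probability p_i proportional to (n_i / N) ** alpha, and each dataset
+            # is then resized by size_ratio_i = p_i * N / n_i, so alpha < 1
+            # upsamples low-resource languages and alpha = 1 keeps original sizes.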
+ dataset_lengths = np.array( + [len(d) for d in lang_datasets], + dtype=float, + ) + sample_probs = self._get_sample_prob(dataset_lengths) + print("| Sample probability by language: ", { + lang: "{0:.4f}".format(sample_probs[id]) + for id, lang in enumerate(languages) + } + ) + size_ratio = (sample_probs * dataset_lengths.sum()) / dataset_lengths + print("| Up/Down Sampling ratio by language: ", { + lang: "{0:.2f}".format(size_ratio[id]) + for id, lang in enumerate(languages) + } + ) + + resampled_lang_datasets = [ + ResamplingDataset( + lang_datasets[i], + size_ratio=size_ratio[i], + seed=self.args.seed, + epoch=epoch, + replace=size_ratio[i] >= 1.0, + ) + for i, d in enumerate(lang_datasets) + ] + dataset = ConcatDataset(resampled_lang_datasets) + else: + dataset = ConcatDataset(lang_datasets) + lang_splits = [split] + for lang_id, lang_dataset in enumerate(lang_datasets): + split_name = split + '_' + languages[lang_id] + lang_splits.append(split_name) + self.datasets[split_name] = lang_dataset + + # [TODO]: This is hacky for now to print validation ppl for each + # language individually. Maybe need task API changes to allow it + # in more generic ways. + if split in self.args.valid_subset: + self.args.valid_subset = self.args.valid_subset.replace( + split, ','.join(lang_splits) + ) + + with data_utils.numpy_seed(self.args.seed + epoch): + shuffle = np.random.permutation(len(dataset)) + + self.datasets[split] = SortDataset( + dataset, + sort_order=[ + shuffle, + dataset.sizes, + ], + ) + + def build_dataset_for_inference(self, src_tokens, src_lengths, sort=True): + src_dataset = PadDataset( + TokenBlockDataset( + src_tokens, + src_lengths, + self.args.tokens_per_sample - 1, # one less for + pad=self.source_dictionary.pad(), + eos=self.source_dictionary.eos(), + break_mode='eos', + ), + pad_idx=self.source_dictionary.pad(), + left_pad=False, + ) + src_dataset = PrependTokenDataset(src_dataset, self.source_dictionary.bos()) + src_dataset = NestedDictionaryDataset( + { + 'id': IdDataset(), + 'net_input': { + 'src_tokens': src_dataset, + 'src_lengths': NumelDataset(src_dataset, reduce=False), + }, + }, + sizes=src_lengths, + ) + if sort: + src_dataset = SortDataset(src_dataset, sort_order=[src_lengths]) + return src_dataset + + def get_batch_iterator( + self, dataset, max_tokens=None, max_sentences=None, max_positions=None, + ignore_invalid_inputs=False, required_batch_size_multiple=1, + seed=1, num_shards=1, shard_id=0, num_workers=0, epoch=0, + ): + # Recreate epoch iterator every epoch cause the underlying + # datasets are dynamic due to sampling. 
+ self.epoch_iter = None + return super().get_batch_iterator( + dataset, max_tokens, max_sentences, max_positions, + ignore_invalid_inputs, required_batch_size_multiple, + seed, num_shards, shard_id, num_workers, epoch, + ) + + @property + def source_dictionary(self): + return self.dictionary + + @property + def target_dictionary(self): + return self.dictionary diff --git a/fairseq/trainer.py b/fairseq/trainer.py index cadfe8edc6..8e911a2174 100644 --- a/fairseq/trainer.py +++ b/fairseq/trainer.py @@ -225,10 +225,11 @@ def load_checkpoint( return extra_state - def get_train_iterator(self, epoch, combine=True): + def get_train_iterator(self, epoch, combine=True, load_dataset=True): """Return an EpochBatchIterator over the training set for a given epoch.""" - print('| loading train data for epoch {}'.format(epoch)) - self.task.load_dataset(self.args.train_subset, epoch=epoch, combine=combine) + if load_dataset: + print('| loading train data for epoch {}'.format(epoch)) + self.task.load_dataset(self.args.train_subset, epoch=epoch, combine=combine) return self.task.get_batch_iterator( dataset=self.task.dataset(self.args.train_subset), max_tokens=self.args.max_tokens, diff --git a/train.py b/train.py index 31c9507760..db04dc2190 100644 --- a/train.py +++ b/train.py @@ -92,9 +92,9 @@ def main(args, init_distributed=False): if epoch_itr.epoch % args.save_interval == 0: checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) - if ':' in getattr(args, 'data', ''): - # sharded data: get train iterator for next epoch - epoch_itr = trainer.get_train_iterator(epoch_itr.epoch) + reload_dataset = ':' in getattr(args, 'data', '') + # sharded data: get train iterator for next epoch + epoch_itr = trainer.get_train_iterator(epoch_itr.epoch, load_dataset=reload_dataset) train_meter.stop() print('| done training in {:.1f} seconds'.format(train_meter.sum)) From e869c80df6bc13163e9a120ce1ea7fecc2e4702c Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Fri, 20 Sep 2019 14:51:32 -0700 Subject: [PATCH 148/213] Update README.race.md Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1155 Differential Revision: D17509762 Pulled By: myleott fbshipit-source-id: 4de535289c1f35abff0d8142d8580f3ede039f47 --- examples/roberta/README.race.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/roberta/README.race.md b/examples/roberta/README.race.md index dc3f1b9f6f..b398092206 100644 --- a/examples/roberta/README.race.md +++ b/examples/roberta/README.race.md @@ -19,7 +19,7 @@ UPDATE_FREQ=8 # Accumulate gradients to simulate training on 8 GPUs. 
DATA_DIR=/path/to/race-output-dir ROBERTA_PATH=/path/to/roberta/model.pt -CUDA_VISIBLE_DEVICES=0,1 fairseq-train $DATA_DIR \ +CUDA_VISIBLE_DEVICES=0,1 fairseq-train $DATA_DIR --ddp-backend=no_c10d \ --restore-file $ROBERTA_PATH \ --reset-optimizer --reset-dataloader --reset-meters \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ From 10f9349e8a1f624255166aeef1c9c721de93041c Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Fri, 20 Sep 2019 15:11:03 -0700 Subject: [PATCH 149/213] Remove extraneous call to RNG in multi-GPU code path Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/865 Differential Revision: D17510276 Pulled By: myleott fbshipit-source-id: 24119402ad5fe95a1312fadb77bafe49a9197c6b --- fairseq/distributed_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fairseq/distributed_utils.py b/fairseq/distributed_utils.py index 6da6ae115b..9cfe20d120 100644 --- a/fairseq/distributed_utils.py +++ b/fairseq/distributed_utils.py @@ -87,7 +87,7 @@ def distributed_init(args): socket.gethostname(), args.distributed_rank), flush=True) # perform a dummy all-reduce to initialize the NCCL communicator - dist.all_reduce(torch.rand(1).cuda()) + dist.all_reduce(torch.zeros(1).cuda()) suppress_output(is_master(args)) From 3b09b98b662d4bbde50ebd067f7a14268b2eab1e Mon Sep 17 00:00:00 2001 From: Naman Goyal Date: Mon, 23 Sep 2019 11:32:20 -0700 Subject: [PATCH 150/213] fixed train valid epoch iter Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/866 Differential Revision: D17517115 fbshipit-source-id: fd6921e642c99e37fce6ad58b24c93e70a5364e5 --- fairseq/tasks/fairseq_task.py | 12 ++++++------ fairseq/tasks/multilingual_masked_lm.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fairseq/tasks/fairseq_task.py b/fairseq/tasks/fairseq_task.py index be95a633bb..ba5695785d 100644 --- a/fairseq/tasks/fairseq_task.py +++ b/fairseq/tasks/fairseq_task.py @@ -24,7 +24,7 @@ def add_args(parser): def __init__(self, args): self.args = args self.datasets = {} - self.epoch_iter = None + self.dataset_to_epoch_iter = {} @classmethod def load_dictionary(cls, filename): @@ -120,7 +120,6 @@ def get_batch_iterator( (default: 0). epoch (int, optional): the epoch to start the iterator from (default: 0). - Returns: ~fairseq.iterators.EpochBatchIterator: a batched iterator over the given dataset split @@ -128,8 +127,8 @@ def get_batch_iterator( # For default fairseq task, return same iterator across epochs # as datasets are not dynamic, can be overridden in task specific # setting. 
- if self.epoch_iter is not None: - return self.epoch_iter + if dataset in self.dataset_to_epoch_iter: + return self.dataset_to_epoch_iter[dataset] assert isinstance(dataset, FairseqDataset) @@ -153,7 +152,7 @@ def get_batch_iterator( ) # return a reusable, sharded iterator - self.epoch_iter = iterators.EpochBatchIterator( + epoch_iter = iterators.EpochBatchIterator( dataset=dataset, collate_fn=dataset.collater, batch_sampler=batch_sampler, @@ -163,7 +162,8 @@ def get_batch_iterator( num_workers=num_workers, epoch=epoch, ) - return self.epoch_iter + self.dataset_to_epoch_iter[dataset] = epoch_iter + return epoch_iter def build_model(self, args): """ diff --git a/fairseq/tasks/multilingual_masked_lm.py b/fairseq/tasks/multilingual_masked_lm.py index fd7bdde42a..407f83e2eb 100644 --- a/fairseq/tasks/multilingual_masked_lm.py +++ b/fairseq/tasks/multilingual_masked_lm.py @@ -286,7 +286,7 @@ def get_batch_iterator( ): # Recreate epoch iterator every epoch cause the underlying # datasets are dynamic due to sampling. - self.epoch_iter = None + self.dataset_to_epoch_iter = None return super().get_batch_iterator( dataset, max_tokens, max_sentences, max_positions, ignore_invalid_inputs, required_batch_size_multiple, From 3f4fc5016334255d6908b20202267ca0b0287335 Mon Sep 17 00:00:00 2001 From: Jerry Ma Date: Mon, 23 Sep 2019 12:25:44 -0700 Subject: [PATCH 151/213] Miscellaneous documentation improvements: (#868) Summary: - More clearly document the correspondence between FairseqAdam and torch.optim.AdamW - Add ResamplingDataset to Sphinx docs Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/868 Differential Revision: D17523244 Pulled By: jma127 fbshipit-source-id: 8e7b34b24889b2c8f70b09a52a625d2af135734b --- docs/data.rst | 2 ++ fairseq/optim/adam.py | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/docs/data.rst b/docs/data.rst index a2a464ecb5..6a390cb336 100644 --- a/docs/data.rst +++ b/docs/data.rst @@ -30,6 +30,8 @@ provide additional functionality: :members: .. autoclass:: fairseq.data.ConcatDataset :members: +.. autoclass:: fairseq.data.ResamplingDataset + :members: .. autoclass:: fairseq.data.RoundRobinZipDatasets :members: .. autoclass:: fairseq.data.TransformEosDataset diff --git a/fairseq/optim/adam.py b/fairseq/optim/adam.py index 80de7f00d9..e60a7db551 100644 --- a/fairseq/optim/adam.py +++ b/fairseq/optim/adam.py @@ -15,6 +15,12 @@ @register_optimizer('adam') class FairseqAdam(FairseqOptimizer): + """Adam optimizer for fairseq. + + Important note: this optimizer corresponds to the "AdamW" variant of + Adam in its weight decay behavior. As such, it is most closely + analogous to torch.optim.AdamW from PyTorch. 
+ """ def __init__(self, args, params): super().__init__(args) From 2ed65b68e06cae536ec7687286829033fbf241d7 Mon Sep 17 00:00:00 2001 From: Naman Goyal Date: Mon, 23 Sep 2019 16:13:00 -0700 Subject: [PATCH 152/213] fixed corner case in mlm criterion when all tokens get masked Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/869 Reviewed By: myleott Differential Revision: D17531776 Pulled By: myleott fbshipit-source-id: 349c9449a0a7db5d3bb8449561302d4220cfa60c --- fairseq/criterions/masked_lm.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fairseq/criterions/masked_lm.py b/fairseq/criterions/masked_lm.py index 4eae5c3849..eb2fcf3d3a 100644 --- a/fairseq/criterions/masked_lm.py +++ b/fairseq/criterions/masked_lm.py @@ -31,9 +31,18 @@ def forward(self, model, sample, reduce=True): """ # compute MLM loss masked_tokens = sample['target'].ne(self.padding_idx) + sample_size = masked_tokens.int().sum().item() + + # (Rare case) When all tokens are masked, the model results in empty + # tensor and gives CUDA error. + if sample_size == 0: + masked_tokens = None + logits = model(**sample['net_input'], masked_tokens=masked_tokens)[0] targets = model.get_targets(sample, [logits]) - targets = targets[masked_tokens] + + if sample_size != 0: + targets = targets[masked_tokens] loss = F.nll_loss( F.log_softmax( @@ -45,9 +54,6 @@ def forward(self, model, sample, reduce=True): reduction='sum', ignore_index=self.padding_idx, ) - - sample_size = masked_tokens.int().sum().item() - logging_output = { 'loss': utils.item(loss.data) if reduce else loss.data, 'nll_loss': utils.item(loss.data) if reduce else loss.data, From fa7dea6bf847404f24b9591aeef8bc96e6bf90c5 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Tue, 24 Sep 2019 06:14:24 -0700 Subject: [PATCH 153/213] Issue 1146: Minor fix to roberta pre-training readme (#1165) Summary: This is to make this instructions a little more generalizable, since in some systems, bash will parse the spaces within quotes Addressing https://github.com/pytorch/fairseq/issues/1146 Pull Request resolved: https://github.com/pytorch/fairseq/pull/1165 Differential Revision: D17547810 Pulled By: myleott fbshipit-source-id: 5a026d42f678126b5ca8bc4477ba8f26ea549dcd --- examples/roberta/README.pretraining.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/roberta/README.pretraining.md b/examples/roberta/README.pretraining.md index 43bdf17676..9e627335c1 100644 --- a/examples/roberta/README.pretraining.md +++ b/examples/roberta/README.pretraining.md @@ -61,7 +61,7 @@ DATA_DIR=data-bin/wikitext-103 fairseq-train --fp16 $DATA_DIR \ --task masked_lm --criterion masked_lm \ --arch roberta_base --sample-break-mode complete --tokens-per-sample $TOKENS_PER_SAMPLE \ - --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-6 --clip-norm 0.0 \ + --optimizer adam --adam-betas '(0.9,0.98)' --adam-eps 1e-6 --clip-norm 0.0 \ --lr-scheduler polynomial_decay --lr $PEAK_LR --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_UPDATES \ --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \ --max-sentences $MAX_SENTENCES --update-freq $UPDATE_FREQ \ From e073ddfe46d71f80340c2600a7bf9aed2696c692 Mon Sep 17 00:00:00 2001 From: vineetk1 Date: Thu, 26 Sep 2019 08:18:36 -0700 Subject: [PATCH 154/213] PR for Issue #1154: Two comments in lstm.py seem to be incorrect Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1185 Differential Revision: D17602249 Pulled By: lematt1991 
fbshipit-source-id: bd515b7d2ebce8181a80684f45223a8db7c7e3cd --- fairseq/models/lstm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fairseq/models/lstm.py b/fairseq/models/lstm.py index 6b51350f96..0b3422d8bb 100644 --- a/fairseq/models/lstm.py +++ b/fairseq/models/lstm.py @@ -281,9 +281,9 @@ def __init__(self, input_embed_dim, source_embed_dim, output_embed_dim, bias=Fal def forward(self, input, source_hids, encoder_padding_mask): # input: bsz x input_embed_dim - # source_hids: srclen x bsz x output_embed_dim + # source_hids: srclen x bsz x source_embed_dim - # x: bsz x output_embed_dim + # x: bsz x source_embed_dim x = self.input_proj(input) # compute attention From 2314979ea59965caa1b1c566fd0e0e662ab0d212 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 27 Sep 2019 07:26:12 -0700 Subject: [PATCH 155/213] Update getting_started.rst (#1188) Summary: Hi, I think there is a minor mistake in the doc. `--distributed-no-spawn` argument is needed for distributed training on multiple machines without `slurm`. Otherwise, the program will start 8 jobs on each GPU, when `nproc_per_node=8`. Pull Request resolved: https://github.com/pytorch/fairseq/pull/1188 Differential Revision: D17627778 Pulled By: myleott fbshipit-source-id: 35ab6b650dc1132d7cb2d150e80d2ebf0caf3e69 --- docs/getting_started.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/getting_started.rst b/docs/getting_started.rst index a5fa17246c..54bff2d670 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -189,4 +189,4 @@ replacing ``node_rank=0`` with ``node_rank=1`` on the second node: --lr 0.0005 --min-lr 1e-09 \ --dropout 0.3 --weight-decay 0.0 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --max-tokens 3584 \ - --fp16 + --fp16 --distributed-no-spawn From 62e65c418b1914ccf448783f66eed3d2f4a41525 Mon Sep 17 00:00:00 2001 From: Louis Martin Date: Fri, 27 Sep 2019 07:34:11 -0700 Subject: [PATCH 156/213] Explain the language modelling format in RoBERTa pretraining readme Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1174 Differential Revision: D17627767 Pulled By: myleott fbshipit-source-id: 7b5f77146b8776a5967699e430136039c066c851 --- examples/roberta/README.pretraining.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/roberta/README.pretraining.md b/examples/roberta/README.pretraining.md index 9e627335c1..b841631d3e 100644 --- a/examples/roberta/README.pretraining.md +++ b/examples/roberta/README.pretraining.md @@ -4,7 +4,7 @@ This tutorial will walk you through pretraining RoBERTa over your own data. ### 1) Preprocess the data -Data should be preprocessed following the [language modeling format](/examples/language_model). +Data should be preprocessed following the [language modeling format](/examples/language_model), i.e. each document should be separated by an empty line (only useful with `--sample-break-mode complete_doc`). Lines will be concatenated as a 1D text stream during training. We'll use the [WikiText-103 dataset](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/) to demonstrate how to preprocess raw text data with the GPT-2 BPE. Of course From 6c1da0f74b1187060715219da306cc1af6505e1b Mon Sep 17 00:00:00 2001 From: Nayan Singhal Date: Fri, 27 Sep 2019 10:44:56 -0700 Subject: [PATCH 157/213] Fixing BMUF warmup and sync strategy Summary: Bmuf sync started happening even before warmup is done. 
This diff fixes the behavior and do bmuf sync once warmup is done or if it's zero. TODO: write a unit test case so that these problems can be figure out faster. Reviewed By: jay-mahadeokar Differential Revision: D17356277 fbshipit-source-id: 21500e6ed1225b97794e4ee203e5d7d04a2840f8 --- fairseq/optim/bmuf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fairseq/optim/bmuf.py b/fairseq/optim/bmuf.py index 289b7e5831..c4da08d89e 100644 --- a/fairseq/optim/bmuf.py +++ b/fairseq/optim/bmuf.py @@ -127,7 +127,9 @@ def _is_warmup_end(self): def _is_bmuf_iter(self): # Check whether train iterations is equal to bmuf sync iter - if self.get_num_updates() % self.sync_iter == 0: + if (self.get_num_updates() > self.warmup_iteration) and ( + self.get_num_updates() % self.sync_iter == 0 + ): return True return False From 86857a58bf2919c7bec3c29c58234aa4c434d566 Mon Sep 17 00:00:00 2001 From: Changhan Wang Date: Fri, 27 Sep 2019 13:56:47 -0700 Subject: [PATCH 158/213] Levenshtein Transformer paper code Summary: Code for our NeurIPS paper [Levenshtein Transformer](https://arxiv.org/abs/1905.11006) * Added Levenshtein Transformer model, task and criterion class * Added iterative NAT Transformer, insertion Transformer and CMLM Transformer model class for baselines * Add an option for prepending BOS to dictionary class and translation task class Reviewed By: myleott Differential Revision: D17297372 fbshipit-source-id: 54eca60831ae95dc721c2c34e882e1810ee575c7 --- README.md | 11 +- .../nonautoregressive_translation/README.md | 90 +++ .../nonautoregressive_translation/scripts.md | 148 ++++ fairseq/clib/libnat/edit_dist.cpp | 222 ++++++ fairseq/criterions/nat_loss.py | 190 ++++++ fairseq/data/dictionary.py | 5 +- fairseq/iterative_refinement_generator.py | 154 +++++ fairseq/models/cmlm_transformer.py | 136 ++++ fairseq/models/insertion_transformer.py | 259 +++++++ ...iterative_nonautoregressive_transformer.py | 196 ++++++ fairseq/models/levenshtein_transformer.py | 595 ++++++++++++++++ fairseq/models/model_utils.py | 62 ++ .../models/nonautoregressive_transformer.py | 640 ++++++++++++++++++ fairseq/models/transformer.py | 23 +- fairseq/modules/multihead_attention.py | 5 +- fairseq/modules/transformer_layer.py | 2 +- .../modules/transformer_sentence_encoder.py | 3 +- fairseq/options.py | 14 + fairseq/tasks/translation.py | 9 +- fairseq/tasks/translation_lev.py | 149 ++++ fairseq/utils.py | 8 + generate.py | 3 + setup.py | 8 + tests/test_binaries.py | 46 ++ train.py | 5 + 25 files changed, 2968 insertions(+), 15 deletions(-) create mode 100644 examples/nonautoregressive_translation/README.md create mode 100644 examples/nonautoregressive_translation/scripts.md create mode 100644 fairseq/clib/libnat/edit_dist.cpp create mode 100644 fairseq/criterions/nat_loss.py create mode 100644 fairseq/iterative_refinement_generator.py create mode 100644 fairseq/models/cmlm_transformer.py create mode 100644 fairseq/models/insertion_transformer.py create mode 100644 fairseq/models/iterative_nonautoregressive_transformer.py create mode 100644 fairseq/models/levenshtein_transformer.py create mode 100644 fairseq/models/model_utils.py create mode 100644 fairseq/models/nonautoregressive_transformer.py create mode 100644 fairseq/tasks/translation_lev.py diff --git a/README.md b/README.md index 45dce65cf0..c39ff22c97 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ modeling and other text generation tasks. 
### What's New: +- September 2019: [Nonautoregressive translation code released](examples/nonautoregressive_translation/README.md) - August 2019: [WMT'19 models released](examples/wmt19/README.md) - July 2019: fairseq relicensed under MIT license - July 2019: [RoBERTa models and code released](examples/roberta/README.md) @@ -32,6 +33,13 @@ Fairseq provides reference implementations of various sequence-to-sequence model - [Mixture Models for Diverse Machine Translation: Tricks of the Trade (Shen et al., 2019)](examples/translation_moe/README.md) - [RoBERTa: A Robustly Optimized BERT Pretraining Approach (Liu et al., 2019)](examples/roberta/README.md) - [Facebook FAIR's WMT19 News Translation Task Submission (Ng et al., 2019)](examples/wmt19/README.md) +- **Non-autoregressive Transformers** + - Non-Autoregressive Neural Machine Translation (Gu et al., 2017) + - Deterministic Non-Autoregressive Neural Sequence Modeling by Iterative Refinement (Lee et al. 2018) + - Insertion Transformer: Flexible Sequence Generation via Insertion Operations (Stern et al. 2019) + - Mask-Predict: Parallel Decoding of Conditional Masked Language Models (Ghazvininejad et al., 2019) + - [Levenshtein Transformer (Gu et al., 2019)](examples/nonautoregressive_translation/README.md) + **Additionally:** - multi-GPU (distributed) training on one machine or across multiple machines @@ -50,7 +58,7 @@ translation and language modeling datasets. # Requirements and Installation -* [PyTorch](http://pytorch.org/) version >= 1.1.0 +* [PyTorch](http://pytorch.org/) version >= 1.2.0 * Python version >= 3.5 * For training new models, you'll also need an NVIDIA GPU and [NCCL](https://github.com/NVIDIA/nccl) * **For faster training** install NVIDIA's [apex](https://github.com/NVIDIA/apex) library with the `--cuda_ext` option @@ -92,6 +100,7 @@ as well as example training and evaluation commands. - [Language Modeling](examples/language_model/README.md): convolutional and transformer models are available We also have more detailed READMEs to reproduce results from specific papers: +- [Levenshtein Transformer (Gu et al., 2019)](examples/nonautoregressive_translation/README.md) - [Facebook FAIR's WMT19 News Translation Task Submission (Ng et al., 2019)](examples/wmt19/README.md) - [RoBERTa: A Robustly Optimized BERT Pretraining Approach (Liu et al., 2019)](examples/roberta/README.md) - [wav2vec: Unsupervised Pre-training for Speech Recognition (Schneider et al., 2019)](examples/wav2vec/README.md) diff --git a/examples/nonautoregressive_translation/README.md b/examples/nonautoregressive_translation/README.md new file mode 100644 index 0000000000..5f030868cc --- /dev/null +++ b/examples/nonautoregressive_translation/README.md @@ -0,0 +1,90 @@ +# Non-autoregressive Neural Machine Translation (NAT) + +This page mainly includes instructions for reproducing results from the paper +* [Levenshtein Transformer (Gu et al., 2019)](https://arxiv.org/abs/1905.11006). + +We also provided our own implementations for several popular non-autoregressive-based models as reference:
+* [Non-Autoregressive Neural Machine Translation (Gu et al., 2017)](https://arxiv.org/abs/1711.02281)
+* [Deterministic Non-Autoregressive Neural Sequence Modeling by Iterative Refinement (Lee et al., 2018)](https://arxiv.org/abs/1802.06901)
+* [Insertion Transformer: Flexible Sequence Generation via Insertion Operations (Stern et al., 2019)](https://arxiv.org/abs/1902.03249)
+* [Mask-Predict: Parallel Decoding of Conditional Masked Language Models (Ghazvininejad et al., 2019)](https://arxiv.org/abs/1904.09324v2) + +## Dataset + +First, follow the [instructions to download and preprocess the WMT'14 En-De dataset](../translation#prepare-wmt14en2desh). +Make sure to learn a joint vocabulary by passing the `--joined-dictionary` option to `fairseq-preprocess`. + +### Knowledge Distillation +Following [Gu et al. 2019](https://arxiv.org/abs/1905.11006), [knowledge distillation](https://arxiv.org/abs/1606.07947) from an autoregressive model can effectively simplify the training data distribution, which is sometimes essential for NAT-based models to learn good translations. +The easiest way of performing distillation is to follow the [instructions of training a standard transformer model](../translation) on the same data, and then decode the training set to produce a distillation dataset for NAT. + +### Download +We also provided the preprocessed [original](http://dl.fbaipublicfiles.com/nat/original_dataset.zip) and [distillation](http://dl.fbaipublicfiles.com/nat/distill_dataset.zip) datasets. Please build the binarized dataset on your own. + + +## Train a model + +Then we can train a nonautoregressive model using the `translation_lev` task and a new criterion `nat_loss`. +Use the `--noise` flag to specify the input noise used on the target sentences. +In default, we run the task for *Levenshtein Transformer*, with `--noise='random_delete'`. Full scripts to run other models can also be found [here](./scripts.md). + +The following command will train a *Levenshtein Transformer* on the binarized dataset. + +```bash +fairseq-train \ + data-bin/wmt14_en_de_distill \ + --save-dir checkpoints \ + --ddp-backend=no_c10d \ + --task translation_lev \ + --criterion nat_loss \ + --arch levenshtein_transformer \ + --noise random_delete \ + --share-all-embeddings \ + --optimizer adam --adam-betas '(0.9,0.98)' \ + --lr 0.0005 --lr-scheduler inverse_sqrt \ + --min-lr '1e-09' --warmup-updates 10000 \ + --warmup-init-lr '1e-07' --label-smoothing 0.1 \ + --dropout 0.3 --weight-decay 0.01 \ + --decoder-learned-pos \ + --encoder-learned-pos \ + --apply-bert-init \ + --log-format 'simple' --log-interval 100 \ + --fixed-validation-seed 7 \ + --max-tokens 8000 \ + --save-interval-updates 10000 \ + --max-update 300000 +``` + +## Translate + +Once a model is trained, we can generate translations using an `iterative_refinement_generator` which will based on the model's initial output and iteratively read and greedily refine the translation until (1) the model predicts the same translations for two consecutive iterations; or (2) the generator reaches the maximum iterations (`--iter-decode-max-iter`). Use `--print-step` to check the actual # of iteration for each sentence. + +For *Levenshtein Transformer*, it sometimes helps to apply a `--iter-decode-eos-penalty` (typically, 0~3) to penalize the model finishing generation too early and generating too short translations. + + +For example, to generate with `--iter-decode-max-iter=9`: +```bash +fairseq-generate \ + data-bin/wmt14_en_de_distill \ + --gen-subset test \ + --task translation_lev \ + --path checkpoints/checkpoint_best.pt \ + --iter-decode-max-iter 9 \ + --iter-decode-eos-penalty 0 \ + --beam 1 --remove-bpe \ + --print-step \ + --batch-size 400 +``` +In the end of the generation, we can see the tokenized BLEU score for the translation. 
+ + +## Citation + +```bibtex +@article{gu2019levenshtein, + title={Levenshtein Transformer}, + author={Gu, Jiatao and Wang, Changhan and Zhao, Jake}, + journal={arXiv preprint arXiv:1905.11006}, + year={2019} +} +``` diff --git a/examples/nonautoregressive_translation/scripts.md b/examples/nonautoregressive_translation/scripts.md new file mode 100644 index 0000000000..2fda7f6204 --- /dev/null +++ b/examples/nonautoregressive_translation/scripts.md @@ -0,0 +1,148 @@ +# Examples of Training scripts for Non-autoregressive Machine Translation models + +### Non-autoregressive Transformer (NAT, Gu et al., 2017) +Note that we need to have an additional module to perform "length prediction" (`--length-loss-factor`) before generating the whole sequence. +```bash +fairseq-train \ + data-bin/wmt14_en_de_distill \ + --save-dir checkpoints \ + --ddp-backend=no_c10d \ + --task translation_lev \ + --criterion nat_loss \ + --arch nonautoregressive_transformer \ + --noise full_mask \ + --share-all-embeddings \ + --optimizer adam --adam-betas '(0.9,0.98)' \ + --lr 0.0005 --lr-scheduler inverse_sqrt \ + --min-lr '1e-09' --warmup-updates 10000 \ + --warmup-init-lr '1e-07' --label-smoothing 0.1 \ + --dropout 0.3 --weight-decay 0.01 \ + --decoder-learned-pos \ + --encoder-learned-pos \ + --pred-length-offset \ + --length-loss-factor 0.1 \ + --apply-bert-init \ + --log-format 'simple' --log-interval 100 \ + --fixed-validation-seed 7 \ + --max-tokens 8000 \ + --save-interval-updates 10000 \ + --max-update 300000 +``` + +### Non-autoregressive Transformer with Iterative Refinement (iNAT, Lee et al., 2018) +Note that `--train-step` means how many iterations of refinement we used during training, and `--dae-ratio` controls the ratio of denoising auto-encoder training described in the original paper. +```bash +fairseq-train \ + data-bin/wmt14_en_de_distill \ + --save-dir checkpoints \ + --ddp-backend=no_c10d \ + --task translation_lev \ + --criterion nat_loss \ + --arch nonautoregressive_transformer \ + --noise full_mask \ + --share-all-embeddings \ + --optimizer adam --adam-betas '(0.9,0.98)' \ + --lr 0.0005 --lr-scheduler inverse_sqrt \ + --min-lr '1e-09' --warmup-updates 10000 \ + --warmup-init-lr '1e-07' --label-smoothing 0.1 \ + --dropout 0.3 --weight-decay 0.01 \ + --decoder-learned-pos \ + --encoder-learned-pos \ + --pred-length-offset \ + --length-loss-factor 0.1 \ + --train-step 4 \ + --dae-ratio 0.5 \ + --stochastic-approx \ + --apply-bert-init \ + --log-format 'simple' --log-interval 100 \ + --fixed-validation-seed 7 \ + --max-tokens 8000 \ + --save-interval-updates 10000 \ + --max-update 300000 +``` + +### Insertion Transformer (InsT, Stern et al., 2019) +Note that we need to specify the "slot-loss" (uniform or balanced tree) described in the original paper. Here we use `--label-tau` to control the temperature. 
+ +```bash +fairseq-train \ + data-bin/wmt14_en_de_distill \ + --save-dir checkpoints \ + --ddp-backend=no_c10d \ + --task translation_lev \ + --criterion nat_loss \ + --arch insertion_transformer \ + --noise random_delete \ + --share-all-embeddings \ + --optimizer adam --adam-betas '(0.9,0.98)' \ + --lr 0.0005 --lr-scheduler inverse_sqrt \ + --min-lr '1e-09' --warmup-updates 10000 \ + --warmup-init-lr '1e-07' --label-smoothing 0.1 \ + --dropout 0.3 --weight-decay 0.01 \ + --decoder-learned-pos \ + --encoder-learned-pos \ + --pred-length-offset \ + --length-loss-factor 0.1 \ + --apply-bert-init \ + --log-format 'simple' --log-interval 100 \ + --fixed-validation-seed 7 \ + --max-tokens 8000 \ + --save-interval-updates 10000 \ + --max-update 300000 +``` + + +### Mask Predict (CMLM, Ghazvininejad et al., 2019) +```bash +fairseq-train \ + data-bin/wmt14_en_de_distill \ + --save-dir checkpoints \ + --ddp-backend=no_c10d \ + --task translation_lev \ + --criterion nat_loss \ + --arch cmlm_transformer \ + --noise random_mask \ + --share-all-embeddings \ + --optimizer adam --adam-betas '(0.9,0.98)' \ + --lr 0.0005 --lr-scheduler inverse_sqrt \ + --min-lr '1e-09' --warmup-updates 10000 \ + --warmup-init-lr '1e-07' --label-smoothing 0.1 \ + --dropout 0.3 --weight-decay 0.01 \ + --decoder-learned-pos \ + --encoder-learned-pos \ + --apply-bert-init \ + --log-format 'simple' --log-interval 100 \ + --fixed-validation-seed 7 \ + --max-tokens 8000 \ + --save-interval-updates 10000 \ + --max-update 300000 +``` + + + + +### Levenshtein Transformer (LevT, Gu et al., 2019) +```bash +fairseq-train \ + data-bin/wmt14_en_de_distill \ + --save-dir checkpoints \ + --ddp-backend=no_c10d \ + --task translation_lev \ + --criterion nat_loss \ + --arch levenshtein_transformer \ + --noise random_delete \ + --share-all-embeddings \ + --optimizer adam --adam-betas '(0.9,0.98)' \ + --lr 0.0005 --lr-scheduler inverse_sqrt \ + --min-lr '1e-09' --warmup-updates 10000 \ + --warmup-init-lr '1e-07' --label-smoothing 0.1 \ + --dropout 0.3 --weight-decay 0.01 \ + --decoder-learned-pos \ + --encoder-learned-pos \ + --apply-bert-init \ + --log-format 'simple' --log-interval 100 \ + --fixed-validation-seed 7 \ + --max-tokens 8000 \ + --save-interval-updates 10000 \ + --max-update 300000 +``` diff --git a/fairseq/clib/libnat/edit_dist.cpp b/fairseq/clib/libnat/edit_dist.cpp new file mode 100644 index 0000000000..966e9083bf --- /dev/null +++ b/fairseq/clib/libnat/edit_dist.cpp @@ -0,0 +1,222 @@ +/** + * Copyright 2017-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include // @manual=//caffe2:torch_extension +#include + +using namespace ::std; + +vector> edit_distance2_with_dp( + vector& x, + vector& y) { + uint32_t lx = x.size(); + uint32_t ly = y.size(); + vector> d(lx + 1, vector(ly + 1)); + for (uint32_t i = 0; i < lx + 1; i++) { + d[i][0] = i; + } + for (uint32_t j = 0; j < ly + 1; j++) { + d[0][j] = j; + } + for (uint32_t i = 1; i < lx + 1; i++) { + for (uint32_t j = 1; j < ly + 1; j++) { + d[i][j] = + min(min(d[i - 1][j], d[i][j - 1]) + 1, + d[i - 1][j - 1] + 2 * (x.at(i - 1) == y.at(j - 1) ? 
0 : 1));
+    }
+  }
+  return d;
+}
+
+vector<vector<uint32_t>> edit_distance2_backtracking(
+    vector<vector<uint32_t>>& d,
+    vector<uint32_t>& x,
+    vector<uint32_t>& y,
+    uint32_t terminal_symbol) {
+  vector<uint32_t> seq;
+  vector<vector<uint32_t>> edit_seqs(x.size() + 2, vector<uint32_t>());
+  /*
+  edit_seqs:
+  0~x.size() cell is the insertion sequences
+  last cell is the delete sequence
+  */
+
+  if (x.size() == 0) {
+    edit_seqs.at(0) = y;
+    return edit_seqs;
+  }
+
+  uint32_t i = d.size() - 1;
+  uint32_t j = d.at(0).size() - 1;
+
+  while ((i >= 0) && (j >= 0)) {
+    if ((i == 0) && (j == 0)) {
+      break;
+    }
+
+    if ((j > 0) && (d.at(i).at(j - 1) < d.at(i).at(j))) {
+      seq.push_back(1); // insert
+      seq.push_back(y.at(j - 1));
+      j--;
+    } else if ((i > 0) && (d.at(i - 1).at(j) < d.at(i).at(j))) {
+      seq.push_back(2); // delete
+      seq.push_back(x.at(i - 1));
+      i--;
+    } else {
+      seq.push_back(3); // keep
+      seq.push_back(x.at(i - 1));
+      i--;
+      j--;
+    }
+  }
+
+  uint32_t prev_op, op, s, word;
+  prev_op = 0, s = 0;
+  for (uint32_t k = 0; k < seq.size() / 2; k++) {
+    op = seq.at(seq.size() - 2 * k - 2);
+    word = seq.at(seq.size() - 2 * k - 1);
+    if (prev_op != 1) {
+      s++;
+    }
+    if (op == 1) // insert
+    {
+      edit_seqs.at(s - 1).push_back(word);
+    } else if (op == 2) // delete
+    {
+      edit_seqs.at(x.size() + 1).push_back(1);
+    } else {
+      edit_seqs.at(x.size() + 1).push_back(0);
+    }
+
+    prev_op = op;
+  }
+
+  for (uint32_t k = 0; k < edit_seqs.size(); k++) {
+    if (edit_seqs[k].size() == 0) {
+      edit_seqs[k].push_back(terminal_symbol);
+    }
+  }
+  return edit_seqs;
+}
+
+vector<vector<uint32_t>> edit_distance2_backtracking_with_delete(
+    vector<vector<uint32_t>>& d,
+    vector<uint32_t>& x,
+    vector<uint32_t>& y,
+    uint32_t terminal_symbol,
+    uint32_t deletion_symbol) {
+  vector<uint32_t> seq;
+  vector<vector<uint32_t>> edit_seqs(x.size() + 1, vector<uint32_t>());
+  /*
+  edit_seqs:
+  0~x.size() cell is the insertion sequences
+  last cell is the delete sequence
+  */
+
+  if (x.size() == 0) {
+    edit_seqs.at(0) = y;
+    return edit_seqs;
+  }
+
+  uint32_t i = d.size() - 1;
+  uint32_t j = d.at(0).size() - 1;
+
+  while ((i >= 0) && (j >= 0)) {
+    if ((i == 0) && (j == 0)) {
+      break;
+    }
+
+    if ((j > 0) && (d.at(i).at(j - 1) < d.at(i).at(j))) {
+      seq.push_back(1); // insert
+      seq.push_back(y.at(j - 1));
+      j--;
+    } else if ((i > 0) && (d.at(i - 1).at(j) < d.at(i).at(j))) {
+      seq.push_back(2); // delete
+      seq.push_back(x.at(i - 1));
+      i--;
+    } else {
+      seq.push_back(3); // keep
+      seq.push_back(x.at(i - 1));
+      i--;
+      j--;
+    }
+  }
+
+  uint32_t prev_op, op, s, word;
+  prev_op = 0, s = 0;
+  for (uint32_t k = 0; k < seq.size() / 2; k++) {
+    op = seq.at(seq.size() - 2 * k - 2);
+    word = seq.at(seq.size() - 2 * k - 1);
+    if (prev_op != 1) {
+      s++;
+    }
+    if (op == 1) // insert
+    {
+      edit_seqs.at(s - 1).push_back(word);
+    } else if (op == 2) // delete
+    {
+      edit_seqs.at(s - 1).push_back(deletion_symbol);
+    }
+
+    prev_op = op;
+  }
+
+  for (uint32_t k = 0; k < edit_seqs.size(); k++) {
+    if (edit_seqs.at(k).size() == 0) {
+      edit_seqs.at(k).push_back(terminal_symbol);
+    }
+  }
+  return edit_seqs;
+}
+
+vector<uint32_t> compute_ed2(
+    vector<vector<uint32_t>>& xs,
+    vector<vector<uint32_t>>& ys) {
+  vector<uint32_t> distances(xs.size());
+  for (uint32_t i = 0; i < xs.size(); i++) {
+    vector<vector<uint32_t>> d = edit_distance2_with_dp(xs.at(i), ys.at(i));
+    distances.at(i) = d.at(xs.at(i).size()).at(ys.at(i).size());
+  }
+  return distances;
+}
+
+vector<vector<vector<uint32_t>>> suggested_ed2_path(
+    vector<vector<uint32_t>>& xs,
+    vector<vector<uint32_t>>& ys,
+    uint32_t terminal_symbol) {
+  vector<vector<vector<uint32_t>>> seq(xs.size());
+  for (uint32_t i = 0; i < xs.size(); i++) {
+    vector<vector<uint32_t>> d = edit_distance2_with_dp(xs.at(i), ys.at(i));
+    seq.at(i) =
+        edit_distance2_backtracking(d, xs.at(i), ys.at(i), terminal_symbol);
+  }
+  return seq;
+}
+
+vector<vector<vector<uint32_t>>>
suggested_ed2_path_with_delete( + vector>& xs, + vector>& ys, + uint32_t terminal_symbol, + uint32_t deletion_symbol) { + vector>> seq(xs.size()); + for (uint32_t i = 0; i < xs.size(); i++) { + vector> d = edit_distance2_with_dp(xs.at(i), ys.at(i)); + seq.at(i) = edit_distance2_backtracking_with_delete( + d, xs.at(i), ys.at(i), terminal_symbol, deletion_symbol); + } + return seq; +} + +PYBIND11_MODULE(libnat, m) { + m.def("compute_ed2", &compute_ed2, "compute_ed2"); + m.def("suggested_ed2_path", &suggested_ed2_path, "suggested_ed2_path"); + m.def( + "suggested_ed2_path_with_delete", + &suggested_ed2_path_with_delete, + "suggested_ed2_path_with_delete"); +} diff --git a/fairseq/criterions/nat_loss.py b/fairseq/criterions/nat_loss.py new file mode 100644 index 0000000000..ccb25298f4 --- /dev/null +++ b/fairseq/criterions/nat_loss.py @@ -0,0 +1,190 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math + +import torch.nn.functional as F +from fairseq import utils +from torch import Tensor + +from . import FairseqCriterion, register_criterion + + +@register_criterion("nat_loss") +class LabelSmoothedDualImitationCriterion(FairseqCriterion): + @staticmethod + def add_args(parser): + """Add criterion-specific arguments to the parser.""" + # fmt: off + parser.add_argument( + '--label-smoothing', + default=0., + type=float, + metavar='D', + help='epsilon for label smoothing, 0 means no label smoothing') + # fmt: on + + def _compute_loss( + self, outputs, targets, masks=None, label_smoothing=0.0, name="loss", factor=1.0 + ): + """ + outputs: batch x len x d_model + targets: batch x len + masks: batch x len + + policy_logprob: if there is some policy + depends on the likelihood score as rewards. + """ + + def mean_ds(x: Tensor, dim=None) -> Tensor: + return ( + x.float().mean().type_as(x) + if dim is None + else x.float().mean(dim).type_as(x) + ) + + if masks is not None: + outputs, targets = outputs[masks], targets[masks] + + logits = F.log_softmax(outputs, dim=-1) + if targets.dim() == 1: + losses = F.nll_loss(logits, targets, reduction="none") + + else: # soft-labels + losses = F.kl_div(logits, targets, reduction="none") + losses = losses.float().sum(-1).type_as(losses) + + nll_loss = mean_ds(losses) + if label_smoothing > 0: + loss = nll_loss * (1 - label_smoothing) - mean_ds(logits) * label_smoothing + else: + loss = nll_loss + + loss = loss * factor + return {"name": name, "loss": loss, "nll_loss": nll_loss, "factor": factor} + + def _custom_loss(self, loss, name="loss"): + return {"name": name, "loss": loss, "factor": 1} + + def forward(self, model, sample, reduce=True): + """Compute the loss for the given sample. 
+ Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + nsentences, ntokens = sample["nsentences"], sample["ntokens"] + + # B x T + src_tokens, src_lengths = ( + sample["net_input"]["src_tokens"], + sample["net_input"]["src_lengths"], + ) + tgt_tokens, prev_output_tokens = sample["target"], sample["prev_target"] + + outputs = model(src_tokens, src_lengths, prev_output_tokens, tgt_tokens) + losses = [] + if "mask_ins_out" in outputs: + mask_ins_losses = self._compute_loss( + outputs["mask_ins_out"], + outputs["mask_ins_tgt"], + outputs["mask_ins_mask"], + name="m_ins-loss", + factor=1 if "mask_ins_w" not in outputs else outputs["mask_ins_w"], + ) + losses += [mask_ins_losses] + + if "word_ins_out" in outputs: + word_ins_losses = self._compute_loss( + outputs["word_ins_out"], + outputs["word_ins_tgt"], + outputs["word_ins_mask"], + self.args.label_smoothing, + name="w_ins-loss", + factor=1 if "word_ins_w" not in outputs else outputs["word_ins_w"], + ) + + losses += [word_ins_losses] + nll_loss = word_ins_losses["nll_loss"] + + if "word_del_out" in outputs: + word_del_losses = self._compute_loss( + outputs["word_del_out"], + outputs["word_del_tgt"], + outputs["word_del_mask"], + 0.01, + name="w_del-loss", + factor=1 if "word_del_w" not in outputs else outputs["word_del_w"], + ) + + losses += [word_del_losses] + + if "length_out" in outputs: + length_losses = self._compute_loss( + outputs["length_out"], + outputs["length_tgt"], + name="len-loss", + factor=1 if "length_w" not in outputs else outputs["length_w"], + ) + + losses += [length_losses] + + for w in outputs: + if "-loss" in w: + losses += [self._custom_loss(outputs[w], w)] + + loss = sum(l["loss"] for l in losses) + + # NOTE: as we are summing up per token mlm loss and per sentence nsp loss + # we don't need to use sample_size as denominator for the gradient + # here sample_size is just used for logging + sample_size = 1 + logging_output = { + "loss": utils.item(loss.data) if reduce else loss.data, + "nll_loss": utils.item(nll_loss.data) if reduce else nll_loss.data, + "ntokens": ntokens, + "nsentences": nsentences, + "sample_size": sample_size, + } + + for l in losses: + logging_output[l["name"]] = ( + utils.item(l["loss"].data / l["factor"]) + if reduce + else l[["loss"]].data / l["factor"] + ) + + return loss, sample_size, logging_output + + @staticmethod + def aggregate_logging_outputs(logging_outputs): + """Aggregate logging outputs from data parallel training.""" + ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) + nsentences = sum(log.get("nsentences", 0) for log in logging_outputs) + sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) + loss = sum(log.get("loss", 0) for log in logging_outputs) + nll_loss = sum(log.get("nll_loss", 0) for log in logging_outputs) + + results = { + "loss": loss / sample_size / math.log(2) if sample_size > 0 else 0.0, + "nll_loss": nll_loss / sample_size / math.log(2) + if sample_size > 0 + else 0.0, + "ntokens": ntokens, + "nsentences": nsentences, + "sample_size": sample_size, + } + + for key in logging_outputs[0]: + if key[-5:] == "-loss": + results[key[:-5]] = ( + sum(log.get(key, 0) for log in logging_outputs) + / sample_size + / math.log(2) + if sample_size > 0 + else 0.0 + ) + + return results diff --git a/fairseq/data/dictionary.py b/fairseq/data/dictionary.py index 417105e50b..5d135ba123 100644 --- a/fairseq/data/dictionary.py +++ 
b/fairseq/data/dictionary.py @@ -74,7 +74,10 @@ def token_string(i): else: return self[i] - sent = ' '.join(token_string(i) for i in tensor if i != self.eos()) + if hasattr(self, 'bos_index'): + sent = ' '.join(token_string(i) for i in tensor if (i != self.eos()) and (i != self.bos())) + else: + sent = ' '.join(token_string(i) for i in tensor if i != self.eos()) return data_utils.process_bpe_symbol(sent, bpe_symbol) def unk_string(self, escape=False): diff --git a/fairseq/iterative_refinement_generator.py b/fairseq/iterative_refinement_generator.py new file mode 100644 index 0000000000..aee4884187 --- /dev/null +++ b/fairseq/iterative_refinement_generator.py @@ -0,0 +1,154 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch + +from fairseq.models.model_utils import skip_tensors as _skip + + +class IterativeRefinementGenerator(object): + def __init__(self, + tgt_dict, + eos_penalty=0., + max_iter=10, + max_ratio=2, + decoding_format=None, + retain_dropout=False, + adaptive=True): + """ + Generates translations based on iterative refinement. + + Args: + tgt_dict: target dictionary + eos_penalty: if > 0.0, it penalized early-stopping in decoding + max_iter: maximum number of refinement iterations + max_ratio: generate sequences of maximum length ax, where x is the source length + decoding_format: decoding mode in {'unigram', 'ensemble', 'vote', 'dp', 'bs'} + retain_dropout: retaining dropout in the inference + adaptive: decoding with early stop + """ + self.bos = tgt_dict.bos() + self.pad = tgt_dict.pad() + self.unk = tgt_dict.unk() + self.eos = tgt_dict.eos() + self.vocab_size = len(tgt_dict) + self.eos_penalty = eos_penalty + self.max_iter = max_iter + self.max_ratio = max_ratio + self.decoding_format = decoding_format + self.retain_dropout = retain_dropout + self.adaptive = adaptive + + @torch.no_grad() + def generate(self, models, sample, prefix_tokens=None): + + # TODO: model ensemble + assert len(models) == 1, 'only support single model' + model = models[0] + if not self.retain_dropout: + model.eval() + + # TODO: better encoder inputs? 
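
As a brief aside on the `nat_loss` criterion added earlier in this patch: its label smoothing interpolates the token-level NLL with the mean negative log-probability over the whole vocabulary. A minimal, self-contained sketch of that computation (illustrative only; the shapes and the epsilon value here are arbitrary, not taken from the patch):

```python
import torch
import torch.nn.functional as F

# Illustrative sketch of the label smoothing in nat_loss._compute_loss;
# the shapes and epsilon below are arbitrary.
eps = 0.1
logits = torch.randn(7, 50)                      # 7 target positions, vocab of 50
targets = torch.randint(0, 50, (7,))
logp = F.log_softmax(logits, dim=-1)
nll = F.nll_loss(logp, targets, reduction="none").mean()
smoothed = nll * (1 - eps) - logp.mean() * eps   # (1-eps)*NLL + eps * uniform-prior term
print(float(nll), float(smoothed))
```
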
+ src_tokens = sample['net_input']['src_tokens'] + src_lengths = sample['net_input']['src_lengths'] + bsz, src_len = src_tokens.size() + sent_idxs = torch.arange(bsz, device=src_tokens.device) + + # encoding + encoder_out = model.forward_encoder([src_tokens, src_lengths]) + + # initialize buffers (very model specific, with length prediction or not) + prev_decoder_out = model.initialize_output_tokens( + encoder_out, src_tokens) + prev_out_tokens = prev_decoder_out['output_tokens'].clone() + + finalized = [[] for _ in range(bsz)] + + def is_a_loop(x, y, s, a): + b, l_x, l_y = x.size(0), x.size(1), y.size(1) + if l_x > l_y: + y = torch.cat([y, x.new_zeros(b, l_x - l_y).fill_(self.pad)], 1) + s = torch.cat([s, s.new_zeros(b, l_x - l_y)], 1) + if a is not None: + a = torch.cat([a, a.new_zeros(b, l_x - l_y, a.size(2))], 1) + elif l_x < l_y: + x = torch.cat([x, y.new_zeros(b, l_y - l_x).fill_(self.pad)], 1) + return (x == y).all(1), y, s, a + + def finalized_hypos(step, prev_out_token, prev_out_score, prev_out_attn): + cutoff = prev_out_token.ne(self.pad) + tokens = prev_out_token[cutoff] + scores = prev_out_score[cutoff] + if prev_out_attn is None: + hypo_attn, alignment = None, None + else: + hypo_attn = prev_out_attn[cutoff] + alignment = hypo_attn.max(dim=1)[1] + return { + 'steps': step, + 'tokens': tokens, + 'positional_scores': scores, + 'score': scores.mean(), + 'hypo_attn': hypo_attn, + 'alignment': alignment, + } + + for step in range(self.max_iter + 1): + + decoder_options = { + 'eos_penalty': self.eos_penalty, + 'max_ratio': self.max_ratio, + 'decoding_format': self.decoding_format + } + prev_decoder_out['step'] = step + prev_decoder_out['max_step'] = self.max_iter + 1 + + decoder_out = model.forward_decoder( + prev_decoder_out, encoder_out, **decoder_options + ) + + if self.adaptive: + # terminate if there is a loop + terminated, out_tokens, out_scores, out_attn = is_a_loop( + prev_out_tokens, decoder_out['output_tokens'], + decoder_out['output_scores'], decoder_out['attn']) + decoder_out['output_tokens'] = out_tokens + decoder_out['output_scores'] = out_scores + decoder_out['attn'] = out_attn + + else: + terminated = decoder_out['output_tokens'].new_zeros( + decoder_out['output_tokens'].size(0)).bool() + + if step == self.max_iter: # reach last iteration, terminate + terminated.fill_(1) + + # collect finalized sentences + finalized_idxs = sent_idxs[terminated] + finalized_tokens = decoder_out['output_tokens'][terminated] + finalized_scores = decoder_out['output_scores'][terminated] + finalized_attn = None if decoder_out['attn'] is None else decoder_out['attn'][terminated] + + for i in range(finalized_idxs.size(0)): + finalized[finalized_idxs[i]] = [ + finalized_hypos( + step, + finalized_tokens[i], + finalized_scores[i], + None if finalized_attn is None else finalized_attn[i] + ) + ] + # check if all terminated + if terminated.sum() == terminated.size(0): + break + + # for next step + prev_decoder_out = _skip(decoder_out, ~terminated) + encoder_out = _skip(encoder_out, ~terminated) + sent_idxs = _skip(sent_idxs, ~terminated) + + prev_out_tokens = prev_decoder_out['output_tokens'].clone() + + return finalized diff --git a/fairseq/models/cmlm_transformer.py b/fairseq/models/cmlm_transformer.py new file mode 100644 index 0000000000..f76c93fd0f --- /dev/null +++ b/fairseq/models/cmlm_transformer.py @@ -0,0 +1,136 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
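
A sketch of how the refinement generator above might be driven from user code. This is not runnable as-is: `task`, `models` (a single-element list) and `sample` are assumed to come from a standard fairseq setup and are not defined in this patch.

```python
from fairseq.iterative_refinement_generator import IterativeRefinementGenerator

# Hypothetical usage; `task`, `models` and `sample` are placeholders supplied by the caller.
generator = IterativeRefinementGenerator(
    task.target_dictionary,
    max_iter=10,     # at most 10 refinement passes per sentence
    max_ratio=2,     # hypotheses may grow up to 2x the source length
    adaptive=True,   # stop refining a sentence once it stops changing
)
hypos = generator.generate(models, sample)   # one hypothesis list per sentence
best = hypos[0][0]
print(best['steps'], task.target_dictionary.string(best['tokens']))
```
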
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +""" +This file implements: +Ghazvininejad, Marjan, et al. +"Constant-time machine translation with conditional masked language models." +arXiv preprint arXiv:1904.09324 (2019). +""" + +import torch +from fairseq.models import register_model, register_model_architecture +from fairseq.models.nonautoregressive_transformer import NATransformerModel + + +def _skeptical_unmasking(output_scores, output_masks, p): + sorted_index = output_scores.sort(-1)[1] + boundary_len = ( + (output_masks.sum(1, keepdim=True).type_as(output_scores) - 2) * p + ).long() + skeptical_mask = ( + torch.arange(output_masks.size(1), device=output_masks.device)[None, :] + < boundary_len + ) + return skeptical_mask.scatter(1, sorted_index, skeptical_mask) + + +@register_model("cmlm_transformer") +class CMLMNATransformerModel(NATransformerModel): + @staticmethod + def add_args(parser): + NATransformerModel.add_args(parser) + + def forward( + self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs + ): + assert not self.decoder.src_embedding_copy, "do not support embedding copy." + + encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) + length_out, length_tgt = self.decoder.forward_length_prediction( + encoder_out, tgt_tokens + ) + + word_ins_out, word_ins_tgt, _ = self.decoder( + prev_output_tokens, encoder_out=encoder_out, tgt_tokens=tgt_tokens + ) + word_ins_mask = prev_output_tokens.eq(self.unk) + return { + "word_ins_out": word_ins_out, + "word_ins_tgt": word_ins_tgt, + "word_ins_mask": word_ins_mask, + "length_out": length_out, + "length_tgt": length_tgt, + "length_w": self.decoder.length_loss_factor, + } + + def forward_decoder(self, decoder_out, encoder_out, decoding_format=None, **kwargs): + + step = decoder_out["step"] + max_step = decoder_out["max_step"] + + output_tokens = decoder_out["output_tokens"] + output_scores = decoder_out["output_scores"] + + # execute the decoder + output_masks = output_tokens.eq(self.unk) + _scores, _tokens = self.decoder( + output_tokens, encoder_out=encoder_out, decoding_format=decoding_format + ) + output_tokens.masked_scatter_(output_masks, _tokens[output_masks]) + output_scores.masked_scatter_(output_masks, _scores[output_masks]) + + # skeptical decoding (depend on the maximum decoding steps.) 
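
Before the re-masking branch that follows, a short illustration of the mask-predict schedule it implements: the fraction of tokens re-masked decays linearly with the iteration counter, `<s>`/`</s>` are never re-masked, and the final iteration keeps everything. A self-contained sketch (illustrative; the sentence length and step count are arbitrary):

```python
def n_remasked(num_tokens: int, step: int, max_step: int) -> int:
    """How many of the lowest-scoring tokens are re-masked after iteration `step`."""
    if step + 1 >= max_step:                 # last iteration: keep all predictions
        return 0
    ratio = 1.0 - (step + 1) / max_step      # linearly decaying re-mask ratio
    return int((num_tokens - 2) * ratio)     # <s> and </s> are excluded

# For a 12-token hypothesis decoded with max_step=10 this re-masks roughly
# 9, 8, 7, ... tokens per iteration, down to 0 on the final one.
print([n_remasked(12, s, 10) for s in range(10)])
```
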
+ if (step + 1) < max_step: + skeptical_mask = _skeptical_unmasking( + output_scores, output_tokens.ne(self.pad), 1 - (step + 1) / max_step + ) + + output_tokens.masked_fill_(skeptical_mask, self.unk) + output_scores.masked_fill_(skeptical_mask, 0.0) + + return {"output_tokens": output_tokens, "output_scores": output_scores} + + +@register_model_architecture("cmlm_transformer", "cmlm_transformer") +def base_architecture(args): + args.encoder_embed_path = getattr(args, "encoder_embed_path", None) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) + args.encoder_layers = getattr(args, "encoder_layers", 6) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) + args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) + args.decoder_embed_path = getattr(args, "decoder_embed_path", None) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) + args.decoder_ffn_embed_dim = getattr( + args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim + ) + args.decoder_layers = getattr(args, "decoder_layers", 6) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) + args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) + args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) + args.attention_dropout = getattr(args, "attention_dropout", 0.0) + args.activation_dropout = getattr(args, "activation_dropout", 0.0) + args.activation_fn = getattr(args, "activation_fn", "relu") + args.dropout = getattr(args, "dropout", 0.1) + args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) + args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) + args.share_decoder_input_output_embed = getattr( + args, "share_decoder_input_output_embed", False + ) + args.share_all_embeddings = getattr(args, "share_all_embeddings", True) + args.no_token_positional_embeddings = getattr( + args, "no_token_positional_embeddings", False + ) + args.adaptive_input = getattr(args, "adaptive_input", False) + args.apply_bert_init = getattr(args, "apply_bert_init", False) + + args.decoder_output_dim = getattr( + args, "decoder_output_dim", args.decoder_embed_dim + ) + args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) + + # --- special arguments --- + args.sg_length_pred = getattr(args, "sg_length_pred", False) + args.pred_length_offset = getattr(args, "pred_length_offset", False) + args.length_loss_factor = getattr(args, "length_loss_factor", 0.1) + args.ngram_predictor = getattr(args, "ngram_predictor", 1) + args.src_embedding_copy = getattr(args, "src_embedding_copy", False) + + +@register_model_architecture("cmlm_transformer", "cmlm_transformer_wmt_en_de") +def iter_nat_wmt_en_de(args): + base_architecture(args) diff --git a/fairseq/models/insertion_transformer.py b/fairseq/models/insertion_transformer.py new file mode 100644 index 0000000000..5f5868a550 --- /dev/null +++ b/fairseq/models/insertion_transformer.py @@ -0,0 +1,259 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import numpy as np +import torch +import torch.nn.functional as F +from fairseq import libnat +from fairseq.models import register_model, register_model_architecture +from fairseq.models.levenshtein_transformer import ( + LevenshteinTransformerDecoder, + LevenshteinTransformerModel, +) +from fairseq.models.transformer import Linear, TransformerModel +from fairseq.modules.transformer_sentence_encoder import init_bert_params + + +class NegativeDistanceScore(object): + def __init__(self): + + # pre-compute some values + self.scores = {} + + self.scores[0.5] = self.compute_score_full(50, 0.5) + self.scores[1.0] = self.compute_score_full(50, 1.0) + self.scores[2.0] = self.compute_score_full(50, 2.0) + + def __call__(self, i, L, tau): + if (tau is None) or (tau > 1000): + return 1 / L + + if tau in self.scores: + if L < self.scores[tau].shape[0]: + return self.scores[tau][L - 1, i] + return self.compute_score(L, tau)[i] + + def compute_score(self, L, tau): + s = np.array([-abs(L / 2 - i) / tau for i in range(L)]) + s = np.exp(s - s.max()) + return s / s.sum() + + def compute_score_full(self, L, tau): + s = -abs(np.arange(0, L - 1)[:, None] / 2 - np.arange(L)[None, :]) / tau + s = np.tril(s, 0) + np.triu(s - float("inf"), 1) + s = np.exp(s - s.max(1, keepdims=True)) + return s / s.sum(1, keepdims=True) + + +neg_scorer = NegativeDistanceScore() + + +def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx, vocab_size, tau=None): + B = in_tokens.size(0) + T = in_tokens.size(1) + V = vocab_size + + with torch.cuda.device_of(in_tokens): + in_tokens_list = [ + [t for t in s if t != padding_idx] for i, s in enumerate(in_tokens.tolist()) + ] + out_tokens_list = [ + [t for t in s if t != padding_idx] + for i, s in enumerate(out_tokens.tolist()) + ] + + full_labels = libnat.suggested_ed2_path( + in_tokens_list, out_tokens_list, padding_idx + ) + insert_labels = [a[:-1] for a in full_labels] + + # numericalize1 + insert_label_tensors = in_tokens.new_zeros(B * (T - 1) * V).float() + insert_index, insert_labels = zip( + *[ + (w + (j + i * (T - 1)) * V, neg_scorer(k, len(label), tau)) + for i, labels in enumerate(insert_labels) + for j, label in enumerate(labels[1:-1]) + for k, w in enumerate(label) + ] + ) # HACK 1:-1 + insert_index, insert_labels = [ + torch.tensor(list(a), device=in_tokens.device) + for a in [insert_index, insert_labels] + ] + insert_label_tensors.scatter_(0, insert_index.long(), insert_labels) + insert_label_tensors = insert_label_tensors.view(B, T - 1, V) + + return insert_label_tensors + + +def _apply_ins_words(in_tokens, in_scores, word_ins_pred, word_ins_scores, padding_idx): + + padding_masks = in_tokens[:, 1:].eq(padding_idx) + word_ins_scores.masked_fill_(padding_masks, 0.0) + word_ins_pred.masked_fill_(padding_masks, padding_idx) + + in_coords = torch.arange(in_tokens.size(1), device=in_tokens.device) + in_coords = in_coords.unsqueeze(0).repeat(in_tokens.size(0), 1).type_as(in_scores) + + # shift all padding predictions to infinite + out_coords = (in_coords[:, 1:] - 0.5).masked_fill( + word_ins_pred.eq(padding_idx), float("inf") + ) + out_coords = torch.cat([in_coords, out_coords], 1).sort(-1)[1] + out_tokens = torch.cat([in_tokens, word_ins_pred], 1).gather(1, out_coords) + out_scores = torch.cat([in_scores, word_ins_scores], 1).gather(1, out_coords) + return out_tokens, out_scores + + +@register_model("insertion_transformer") +class InsertionTransformerModel(LevenshteinTransformerModel): + def __init__(self, encoder, decoder): + super().__init__(encoder, decoder) + + 
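
An aside on `_apply_ins_words` above: predicted words are spliced between the existing tokens by giving every insertion slot a half-integer coordinate and sorting once. A self-contained sketch of that trick (illustrative; padding handling is omitted and the token ids are arbitrary):

```python
import torch

in_tokens = torch.tensor([[0, 11, 12, 2]])        # <s> A B </s>
word_ins_pred = torch.tensor([[21, 22, 23]])      # one predicted word per adjacent slot
in_coords = torch.arange(4).float().unsqueeze(0)  # existing tokens sit at 0, 1, 2, 3
out_coords = in_coords[:, 1:] - 0.5               # inserted words sit at 0.5, 1.5, 2.5
order = torch.cat([in_coords, out_coords], 1).sort(-1)[1]
out_tokens = torch.cat([in_tokens, word_ins_pred], 1).gather(1, order)
print(out_tokens)  # tensor([[ 0, 21, 11, 22, 12, 23,  2]])
```
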
@staticmethod + def add_args(parser): + TransformerModel.add_args(parser) + parser.add_argument( + "--apply-bert-init", + action="store_true", + help="use custom param initialization for BERT", + ) + parser.add_argument("--label-tau", default=None, type=float) + + @classmethod + def build_decoder(cls, args, tgt_dict, embed_tokens): + decoder = InsertionTransformerDecoder(args, tgt_dict, embed_tokens) + if getattr(args, "apply_bert_init", False): + decoder.apply(init_bert_params) + return decoder + + def forward( + self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs + ): + + assert tgt_tokens is not None, "forward function only supports training." + + # encoding + encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) + + # generate training labels for insertion + word_ins_out = self.decoder.forward_word_ins( + prev_output_tokens, encoder_out=encoder_out + ) + word_ins_tgt = _get_ins_targets( + prev_output_tokens, + tgt_tokens, + self.pad, + self.unk, + len(self.tgt_dict), + tau=self.decoder.label_tau, + ).type_as(word_ins_out) + word_ins_masks = prev_output_tokens[:, 1:].ne(self.pad) + + return { + "word_ins_out": word_ins_out, + "word_ins_tgt": word_ins_tgt, + "word_ins_mask": word_ins_masks, + } + + def forward_decoder( + self, decoder_out, encoder_out, eos_penalty=0.0, max_ratio=None, **kwargs + ): + + output_tokens = decoder_out["output_tokens"] + output_scores = decoder_out["output_scores"] + # TODO: decoding for InsertionTransformer + word_ins_out = self.decoder.forward_word_ins( + output_tokens, encoder_out=encoder_out + ) + word_ins_score = F.log_softmax(word_ins_out, 2) + if eos_penalty > 0.0: + word_ins_score[:, :, self.pad] -= eos_penalty + word_ins_score, word_ins_pred = word_ins_score.max(-1) + output_tokens, output_scores = _apply_ins_words( + output_tokens, output_scores, word_ins_pred, word_ins_score, self.pad + ) + + # delete some unnecessary paddings + cut_off = output_tokens.ne(self.pad).sum(1).max() + output_tokens = output_tokens[:, :cut_off] + output_scores = output_scores[:, :cut_off] + return {"output_tokens": output_tokens, "output_scores": output_scores} + + +class InsertionTransformerDecoder(LevenshteinTransformerDecoder): + def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): + # use the TransformerDecoder's __init__ + super(LevenshteinTransformerDecoder, self).__init__( + args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn + ) + + self.dictionary = dictionary + self.bos = dictionary.bos() + self.unk = dictionary.unk() + self.eos = dictionary.eos() + self.pool_out = Linear(self.output_embed_dim * 2, self.output_embed_dim) + + self.label_tau = getattr(args, "label_tau", None) + + def forward_word_ins(self, prev_output_tokens, encoder_out=None): + features, _ = self.extract_features(prev_output_tokens, encoder_out=encoder_out) + features = self.pool_out( + torch.cat([features[:, :-1, :], features[:, 1:, :]], 2) + ) + return self.output_layer(features) + + def forward_mask_ins(self, *args, **kwargs): + raise NotImplementedError + + def forward_word_del(self, *args, **kwargs): + raise NotImplementedError + + def forward_word_del_mask_ins(self, *args, **kwargs): + raise NotImplementedError + + +@register_model_architecture("insertion_transformer", "insertion_transformer") +def base_architecture(args): + args.encoder_embed_path = getattr(args, "encoder_embed_path", None) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) + args.encoder_ffn_embed_dim = getattr(args, 
"encoder_ffn_embed_dim", 2048) + args.encoder_layers = getattr(args, "encoder_layers", 6) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) + args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) + args.decoder_embed_path = getattr(args, "decoder_embed_path", None) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) + args.decoder_ffn_embed_dim = getattr( + args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim + ) + args.decoder_layers = getattr(args, "decoder_layers", 6) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) + args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) + args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) + args.attention_dropout = getattr(args, "attention_dropout", 0.0) + args.activation_dropout = getattr(args, "activation_dropout", 0.0) + args.activation_fn = getattr(args, "activation_fn", "relu") + args.dropout = getattr(args, "dropout", 0.1) + args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) + args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) + args.share_decoder_input_output_embed = getattr( + args, "share_decoder_input_output_embed", False + ) + args.share_all_embeddings = getattr(args, "share_all_embeddings", False) + args.no_token_positional_embeddings = getattr( + args, "no_token_positional_embeddings", False + ) + args.adaptive_input = getattr(args, "adaptive_input", False) + args.apply_bert_init = getattr(args, "apply_bert_init", False) + + args.decoder_output_dim = getattr( + args, "decoder_output_dim", args.decoder_embed_dim + ) + args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) + + # special for insertion transformer + args.label_tau = getattr(args, "label_tau", None) diff --git a/fairseq/models/iterative_nonautoregressive_transformer.py b/fairseq/models/iterative_nonautoregressive_transformer.py new file mode 100644 index 0000000000..73585db354 --- /dev/null +++ b/fairseq/models/iterative_nonautoregressive_transformer.py @@ -0,0 +1,196 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import torch +from fairseq.models import register_model, register_model_architecture +from fairseq.models.nonautoregressive_transformer import NATransformerModel + + +def _sequential_poisoning(s, V, beta=0.33, bos=2, eos=3, pad=1): + # s: input batch + # V: vocabulary size + rand_words = torch.randint(low=4, high=V, size=s.size(), device=s.device) + choices = torch.rand(size=s.size(), device=s.device) + choices.masked_fill_((s == pad) | (s == bos) | (s == eos), 1) + + replace = choices < beta / 3 + repeat = (choices >= beta / 3) & (choices < beta * 2 / 3) + swap = (choices >= beta * 2 / 3) & (choices < beta) + safe = choices >= beta + + for i in range(s.size(1) - 1): + rand_word = rand_words[:, i] + next_word = s[:, i + 1] + self_word = s[:, i] + + replace_i = replace[:, i] + swap_i = swap[:, i] & (next_word != 3) + repeat_i = repeat[:, i] & (next_word != 3) + safe_i = safe[:, i] | ((next_word == 3) & (~replace_i)) + + s[:, i] = ( + self_word * (safe_i | repeat_i).long() + + next_word * swap_i.long() + + rand_word * replace_i.long() + ) + s[:, i + 1] = ( + next_word * (safe_i | replace_i).long() + + self_word * (swap_i | repeat_i).long() + ) + return s + + +def gumbel_noise(input, TINY=1e-8): + return input.new_zeros(*input.size()).uniform_().add_( + TINY).log_().neg_().add_(TINY).log_().neg_() + + +@register_model("iterative_nonautoregressive_transformer") +class IterNATransformerModel(NATransformerModel): + @staticmethod + def add_args(parser): + NATransformerModel.add_args(parser) + parser.add_argument("--train-step", type=int, + help="number of refinement iterations during training") + parser.add_argument("--dae-ratio", type=float, + help="the probability of switching to the denoising auto-encoder loss") + parser.add_argument("--stochastic-approx", action="store_true", + help="sampling from the decoder as the inputs for next iteration") + + @classmethod + def build_model(cls, args, task): + model = super().build_model(args, task) + model.train_step = getattr(args, "train_step", 4) + model.dae_ratio = getattr(args, "dae_ratio", 0.5) + model.stochastic_approx = getattr(args, "stochastic_approx", False) + return model + + def forward( + self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs + ): + + B, T = prev_output_tokens.size() + + # encoding + encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) + length_out, length_tgt = self.decoder.forward_length_prediction( + encoder_out, tgt_tokens + ) + word_ins_outs, word_ins_tgts, word_ins_masks = [], [], [] + for t in range(self.train_step): + word_ins_out, word_ins_tgt, word_ins_mask = self.decoder( + prev_output_tokens, + encoder_out=encoder_out, + tgt_tokens=tgt_tokens, + step=t, + ) + + word_ins_outs.append(word_ins_out) + word_ins_tgts.append(word_ins_tgt) + word_ins_masks.append(word_ins_mask) + + if t < (self.train_step - 1): + # prediction for next iteration + if self.stochastic_approx: + word_ins_prediction = ( + word_ins_out + gumbel_noise(word_ins_out) + ).max(-1)[1] + else: + word_ins_prediction = word_ins_out.max(-1)[1] + + prev_output_tokens = prev_output_tokens.masked_scatter( + word_ins_mask, word_ins_prediction[word_ins_mask] + ) + + if self.dae_ratio > 0: + # we do not perform denoising for the first iteration + corrputed = ( + torch.rand(size=(B,), device=prev_output_tokens.device) + < self.dae_ratio + ) + corrputed_tokens = _sequential_poisoning( + tgt_tokens[corrputed], + len(self.tgt_dict), + 0.33, + self.bos, + self.eos, + self.pad, + ) + prev_output_tokens[corrputed] = 
corrputed_tokens + + # concat everything + word_ins_out = torch.cat(word_ins_outs, 0) + word_ins_tgt = torch.cat(word_ins_tgts, 0) + word_ins_mask = torch.cat(word_ins_masks, 0) + + return { + "word_ins_out": word_ins_out, + "word_ins_tgt": word_ins_tgt, + "word_ins_mask": word_ins_mask, + "length_out": length_out, + "length_tgt": length_tgt, + "length_w": self.decoder.length_loss_factor, + } + + +@register_model_architecture( + "iterative_nonautoregressive_transformer", "iterative_nonautoregressive_transformer" +) +def base_architecture(args): + args.encoder_embed_path = getattr(args, "encoder_embed_path", None) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) + args.encoder_layers = getattr(args, "encoder_layers", 6) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) + args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) + args.decoder_embed_path = getattr(args, "decoder_embed_path", None) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) + args.decoder_ffn_embed_dim = getattr( + args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim + ) + args.decoder_layers = getattr(args, "decoder_layers", 6) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) + args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) + args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) + args.attention_dropout = getattr(args, "attention_dropout", 0.0) + args.activation_dropout = getattr(args, "activation_dropout", 0.0) + args.activation_fn = getattr(args, "activation_fn", "relu") + args.dropout = getattr(args, "dropout", 0.1) + args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) + args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) + args.share_decoder_input_output_embed = getattr( + args, "share_decoder_input_output_embed", False + ) + args.share_all_embeddings = getattr(args, "share_all_embeddings", False) + args.no_token_positional_embeddings = getattr( + args, "no_token_positional_embeddings", False + ) + args.adaptive_input = getattr(args, "adaptive_input", False) + args.apply_bert_init = getattr(args, "apply_bert_init", False) + + args.decoder_output_dim = getattr( + args, "decoder_output_dim", args.decoder_embed_dim + ) + args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) + + # --- special arguments --- + args.sg_length_pred = getattr(args, "sg_length_pred", False) + args.pred_length_offset = getattr(args, "pred_length_offset", False) + args.length_loss_factor = getattr(args, "length_loss_factor", 0.1) + args.ngram_predictor = getattr(args, "ngram_predictor", 1) + args.src_embedding_copy = getattr(args, "src_embedding_copy", False) + + args.train_step = getattr(args, "train_step", 4) + args.dae_ratio = getattr(args, "dae_ratio", 0.5) + args.stochastic_approx = getattr(args, "stochastic_approx", False) + + +@register_model_architecture( + "iterative_nonautoregressive_transformer", + "iterative_nonautoregressive_transformer_wmt_en_de", +) +def iter_nat_wmt_en_de(args): + base_architecture(args) diff --git a/fairseq/models/levenshtein_transformer.py b/fairseq/models/levenshtein_transformer.py new file mode 100644 index 0000000000..876bf01a0f --- /dev/null +++ b/fairseq/models/levenshtein_transformer.py @@ 
-0,0 +1,595 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn.functional as F +from fairseq import libnat +from fairseq.models import register_model, register_model_architecture +from fairseq.models.model_utils import fill_tensors as _fill, skip_tensors as _skip +from fairseq.models.transformer import ( + Embedding, + TransformerDecoder, + TransformerEncoder, + TransformerModel, +) +from fairseq.modules.transformer_sentence_encoder import init_bert_params + + +def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx): + in_seq_len, out_seq_len = in_tokens.size(1), out_tokens.size(1) + + with torch.cuda.device_of(in_tokens): + in_tokens_list = [ + [t for t in s if t != padding_idx] for i, s in enumerate(in_tokens.tolist()) + ] + out_tokens_list = [ + [t for t in s if t != padding_idx] + for i, s in enumerate(out_tokens.tolist()) + ] + + full_labels = libnat.suggested_ed2_path( + in_tokens_list, out_tokens_list, padding_idx + ) + mask_inputs = [ + [len(c) if c[0] != padding_idx else 0 for c in a[:-1]] for a in full_labels + ] + + # generate labels + masked_tgt_masks = [] + for mask_input in mask_inputs: + mask_label = [] + for beam_size in mask_input[1:-1]: # HACK 1:-1 + mask_label += [0] + [1 for _ in range(beam_size)] + masked_tgt_masks.append( + mask_label + [0 for _ in range(out_seq_len - len(mask_label))] + ) + mask_ins_targets = [ + mask_input[1:-1] + [0 for _ in range(in_seq_len - 1 - len(mask_input[1:-1]))] + for mask_input in mask_inputs + ] + + # transform to tensor + masked_tgt_masks = torch.tensor( + masked_tgt_masks, device=out_tokens.device + ).bool() + mask_ins_targets = torch.tensor(mask_ins_targets, device=in_tokens.device) + masked_tgt_tokens = out_tokens.masked_fill(masked_tgt_masks, unk_idx) + return masked_tgt_masks, masked_tgt_tokens, mask_ins_targets + + +def _get_del_targets(in_tokens, out_tokens, padding_idx): + out_seq_len = out_tokens.size(1) + + with torch.cuda.device_of(in_tokens): + in_tokens_list = [ + [t for t in s if t != padding_idx] for i, s in enumerate(in_tokens.tolist()) + ] + out_tokens_list = [ + [t for t in s if t != padding_idx] + for i, s in enumerate(out_tokens.tolist()) + ] + + full_labels = libnat.suggested_ed2_path( + in_tokens_list, out_tokens_list, padding_idx + ) + word_del_targets = [b[-1] for b in full_labels] + word_del_targets = [ + labels + [0 for _ in range(out_seq_len - len(labels))] + for labels in word_del_targets + ] + + # transform to tensor + word_del_targets = torch.tensor(word_del_targets, device=out_tokens.device) + return word_del_targets + + +def _get_del_ins_targets(in_tokens, out_tokens, padding_idx): + in_seq_len, out_seq_len = in_tokens.size(1), out_tokens.size(1) + + with torch.cuda.device_of(in_tokens): + in_tokens_list = [ + [t for t in s if t != padding_idx] for i, s in enumerate(in_tokens.tolist()) + ] + out_tokens_list = [ + [t for t in s if t != padding_idx] + for i, s in enumerate(out_tokens.tolist()) + ] + + full_labels = libnat.suggested_ed2_path( + in_tokens_list, out_tokens_list, padding_idx + ) + + word_del_targets = [b[-1] for b in full_labels] + word_del_targets = [ + labels + [0 for _ in range(out_seq_len - len(labels))] + for labels in word_del_targets + ] + + mask_inputs = [ + [len(c) if c[0] != padding_idx else 0 for c in a[:-1]] for a in full_labels + ] + mask_ins_targets = [ + mask_input[1:-1] + [0 for _ in range(in_seq_len - 1 - 
len(mask_input[1:-1]))] + for mask_input in mask_inputs + ] + + # transform to tensor + mask_ins_targets = torch.tensor(mask_ins_targets, device=in_tokens.device) + word_del_targets = torch.tensor(word_del_targets, device=out_tokens.device) + return word_del_targets, mask_ins_targets + + +def _apply_ins_masks( + in_tokens, in_scores, mask_ins_pred, padding_idx, unk_idx, eos_idx +): + + in_masks = in_tokens.ne(padding_idx) + in_lengths = in_masks.sum(1) + + # HACK: hacky way to shift all the paddings to eos first. + in_tokens.masked_fill_(~in_masks, eos_idx) + mask_ins_pred.masked_fill_(~in_masks[:, 1:], 0) + + out_lengths = in_lengths + mask_ins_pred.sum(1) + out_max_len = out_lengths.max() + out_masks = ( + torch.arange(out_max_len, device=out_lengths.device)[None, :] + < out_lengths[:, None] + ) + + reordering = (mask_ins_pred + in_masks[:, 1:].long()).cumsum(1) + out_tokens = ( + in_tokens.new_zeros(in_tokens.size(0), out_max_len) + .fill_(padding_idx) + .masked_fill_(out_masks, unk_idx) + ) + out_tokens[:, 0] = in_tokens[:, 0] + out_tokens.scatter_(1, reordering, in_tokens[:, 1:]) + + out_scores = None + if in_scores is not None: + in_scores.masked_fill_(~in_masks, 0) + out_scores = in_scores.new_zeros(*out_tokens.size()) + out_scores[:, 0] = in_scores[:, 0] + out_scores.scatter_(1, reordering, in_scores[:, 1:]) + + return out_tokens, out_scores + + +def _apply_ins_words(in_tokens, in_scores, word_ins_pred, word_ins_scores, unk_idx): + word_ins_masks = in_tokens.eq(unk_idx) + out_tokens = in_tokens.masked_scatter(word_ins_masks, word_ins_pred[word_ins_masks]) + + if in_scores is not None: + out_scores = in_scores.masked_scatter( + word_ins_masks, word_ins_scores[word_ins_masks] + ) + else: + out_scores = None + + return out_tokens, out_scores + + +def _apply_del_words( + in_tokens, in_scores, in_attn, word_del_pred, padding_idx, bos_idx, eos_idx +): + # apply deletion to a tensor + in_masks = in_tokens.ne(padding_idx) + bos_eos_masks = in_tokens.eq(bos_idx) | in_tokens.eq(eos_idx) + + max_len = in_tokens.size(1) + word_del_pred.masked_fill_(~in_masks, 1) + word_del_pred.masked_fill_(bos_eos_masks, 0) + + reordering = ( + torch.arange(max_len, device=in_tokens.device)[None, :] + .expand_as(in_tokens) + .contiguous() + .masked_fill_(word_del_pred, max_len) + .sort(1)[1] + ) + + out_tokens = in_tokens.masked_fill(word_del_pred, padding_idx).gather(1, reordering) + + out_scores = None + if in_scores is not None: + out_scores = in_scores.masked_fill(word_del_pred, 0).gather(1, reordering) + + out_attn = None + if in_attn is not None: + _mask = word_del_pred[:, :, None].expand_as(in_attn) + _reordering = reordering[:, :, None].expand_as(in_attn) + out_attn = in_attn.masked_fill(_mask, 0.).gather(1, _reordering) + + return out_tokens, out_scores, out_attn + + +@register_model("levenshtein_transformer") +class LevenshteinTransformerModel(TransformerModel): + def __init__(self, encoder, decoder): + super().__init__(encoder, decoder) + self.tgt_dict = decoder.dictionary + self.bos = decoder.dictionary.bos() + self.eos = decoder.dictionary.eos() + self.pad = decoder.dictionary.pad() + self.unk = decoder.dictionary.unk() + + @staticmethod + def add_args(parser): + TransformerModel.add_args(parser) + parser.add_argument( + "--apply-bert-init", + action="store_true", + help="use custom param initialization for BERT", + ) + parser.add_argument( + "--early-exit", + default="6,6,6", + type=str, + help="number of decoder layers before mask_ins, word_ins and word_del heads", + ) + + @classmethod + def 
build_decoder(cls, args, tgt_dict, embed_tokens): + decoder = LevenshteinTransformerDecoder(args, tgt_dict, embed_tokens) + if getattr(args, "apply_bert_init", False): + decoder.apply(init_bert_params) + return decoder + + @classmethod + def build_encoder(cls, args, src_dict, embed_tokens): + encoder = TransformerEncoder(args, src_dict, embed_tokens) + if getattr(args, "apply_bert_init", False): + encoder.apply(init_bert_params) + return encoder + + def forward( + self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs + ): + + assert tgt_tokens is not None, "forward function only supports training." + + # encoding + encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) + + # generate training labels for insertion + masked_tgt_masks, masked_tgt_tokens, mask_ins_targets = _get_ins_targets( + prev_output_tokens, tgt_tokens, self.pad, self.unk + ) + mask_ins_targets = mask_ins_targets.clamp(min=0, max=255) # for safe prediction + mask_ins_masks = prev_output_tokens[:, 1:].ne(self.pad) + + mask_ins_out, _ = self.decoder.forward_mask_ins( + prev_output_tokens, encoder_out=encoder_out + ) + word_ins_out, _ = self.decoder.forward_word_ins( + masked_tgt_tokens, encoder_out=encoder_out + ) + + # make online prediction + word_predictions = F.log_softmax(word_ins_out, dim=-1).max(2)[1] + word_predictions.masked_scatter_( + ~masked_tgt_masks, tgt_tokens[~masked_tgt_masks] + ) + + # generate training labels for deletion + word_del_targets = _get_del_targets(word_predictions, tgt_tokens, self.pad) + word_del_out, _ = self.decoder.forward_word_del( + word_predictions, encoder_out) + + return { + "mask_ins_out": mask_ins_out, + "mask_ins_tgt": mask_ins_targets, + "mask_ins_mask": mask_ins_masks, + "word_ins_out": word_ins_out, + "word_ins_tgt": tgt_tokens, + "word_ins_mask": masked_tgt_masks, + "word_del_out": word_del_out, + "word_del_tgt": word_del_targets, + "word_del_mask": word_predictions.ne(self.pad), + } + + def forward_encoder(self, encoder_inputs): + return self.encoder(*encoder_inputs) + + def forward_decoder( + self, decoder_out, encoder_out, eos_penalty=0.0, max_ratio=None, **kwargs + ): + + output_tokens = decoder_out["output_tokens"] + output_scores = decoder_out["output_scores"] + attn = decoder_out["attn"] + + if max_ratio is None: + max_lens = output_tokens.new(output_tokens.size(0)).fill_(255) + else: + max_lens = ( + (~encoder_out["encoder_padding_mask"]).sum(1) * max_ratio + ).clamp(min=10) + + # delete words + # do not delete tokens if it is + can_del_word = output_tokens.ne(self.pad).sum(1) > 2 + if can_del_word.sum() != 0: # we cannot delete, skip + word_del_out, word_del_attn = self.decoder.forward_word_del( + _skip(output_tokens, can_del_word), _skip(encoder_out, can_del_word) + ) + word_del_score = F.log_softmax(word_del_out, 2) + word_del_pred = word_del_score.max(-1)[1].bool() + + _tokens, _scores, _attn = _apply_del_words( + output_tokens[can_del_word], + output_scores[can_del_word], + word_del_attn, + word_del_pred, + self.pad, + self.bos, + self.eos, + ) + output_tokens = _fill(output_tokens, can_del_word, _tokens, self.pad) + output_scores = _fill(output_scores, can_del_word, _scores, 0) + attn = _fill(attn, can_del_word, _attn, 0.) 
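
A short aside on the deletion step just applied: `_apply_del_words` removes flagged tokens without a Python loop by sorting a per-position key that sends deleted positions past the end of the sequence. A self-contained sketch (illustrative; the special-token masking done in the real helper is omitted):

```python
import torch

pad, bos, eos = 1, 0, 2
in_tokens = torch.tensor([[bos, 11, 12, 13, eos]])
word_del_pred = torch.tensor([[0, 0, 1, 0, 0]]).bool()   # delete the middle token
max_len = in_tokens.size(1)

keys = (torch.arange(max_len).expand_as(in_tokens).contiguous()
        .masked_fill(word_del_pred, max_len))            # deleted positions sort last
reordering = keys.sort(1)[1]
out_tokens = in_tokens.masked_fill(word_del_pred, pad).gather(1, reordering)
print(out_tokens)  # tensor([[ 0, 11, 13,  2,  1]]) -- token 12 removed, pad pushed to the end
```
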
+ + # insert placeholders + can_ins_mask = output_tokens.ne(self.pad).sum(1) < max_lens + if can_ins_mask.sum() != 0: + mask_ins_out, _ = self.decoder.forward_mask_ins( + _skip(output_tokens, can_ins_mask), _skip(encoder_out, can_ins_mask) + ) + mask_ins_score = F.log_softmax(mask_ins_out, 2) + if eos_penalty > 0.0: + mask_ins_score[:, :, 0] -= eos_penalty + mask_ins_pred = mask_ins_score.max(-1)[1] + mask_ins_pred = torch.min( + mask_ins_pred, max_lens[:, None].expand_as(mask_ins_pred) + ) + + _tokens, _scores = _apply_ins_masks( + output_tokens[can_ins_mask], + output_scores[can_ins_mask], + mask_ins_pred, + self.pad, + self.unk, + self.eos, + ) + output_tokens = _fill(output_tokens, can_ins_mask, _tokens, self.pad) + output_scores = _fill(output_scores, can_ins_mask, _scores, 0) + + # insert words + can_ins_word = output_tokens.eq(self.unk).sum(1) > 0 + if can_ins_word.sum() != 0: + word_ins_out, word_ins_attn = self.decoder.forward_word_ins( + _skip(output_tokens, can_ins_word), _skip(encoder_out, can_ins_word) + ) + word_ins_score = F.log_softmax(word_ins_out, 2) + word_ins_pred = word_ins_score.max(-1)[1] + + _tokens, _scores = _apply_ins_words( + output_tokens[can_ins_word], + output_scores[can_ins_word], + word_ins_pred, + word_ins_score, + self.unk, + ) + + output_tokens = _fill(output_tokens, can_ins_word, _tokens, self.pad) + output_scores = _fill(output_scores, can_ins_word, _scores, 0) + attn = _fill(attn, can_ins_word, word_ins_attn, 0.) + + # delete some unnecessary paddings + cut_off = output_tokens.ne(self.pad).sum(1).max() + output_tokens = output_tokens[:, :cut_off] + output_scores = output_scores[:, :cut_off] + attn = None if attn is None else attn[:, :cut_off, :] + return { + "output_tokens": output_tokens, + "output_scores": output_scores, + "attn": attn, + } + + def initialize_output_tokens(self, encoder_out, src_tokens): + initial_output_tokens = src_tokens.new_zeros(src_tokens.size(0), 2) + initial_output_tokens[:, 0] = self.bos + initial_output_tokens[:, 1] = self.eos + + initial_output_scores = initial_output_tokens.new_zeros( + *initial_output_tokens.size() + ).type_as(encoder_out["encoder_out"]) + return { + "output_tokens": initial_output_tokens, + "output_scores": initial_output_scores, + "attn": None, + } + + +class LevenshteinTransformerDecoder(TransformerDecoder): + def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): + super().__init__( + args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn + ) + self.dictionary = dictionary + self.bos = dictionary.bos() + self.unk = dictionary.unk() + self.eos = dictionary.eos() + + self.embed_mask_ins = Embedding(256, self.output_embed_dim * 2, None) + self.embed_word_del = Embedding(2, self.output_embed_dim, None) + # del_word, ins_mask, ins_word + self.early_exit = [int(i) for i in args.early_exit.split(',')] + assert len(self.early_exit) == 3 + + def extract_features( + self, prev_output_tokens, encoder_out=None, early_exit=None, **unused + ): + """ + Similar to *forward* but only return features. 
+ + Inputs: + prev_output_tokens: Tensor(B, T) + encoder_out: a dictionary of hidden states and masks + + Returns: + tuple: + - the decoder's features of shape `(batch, tgt_len, embed_dim)` + - a dictionary with any model-specific outputs + the LevenshteinTransformer decoder has full-attention to all generated tokens + """ + # embed positions + positions = ( + self.embed_positions(prev_output_tokens) + if self.embed_positions is not None + else None + ) + + # embed tokens and positions + x = self.embed_scale * self.embed_tokens(prev_output_tokens) + if self.project_in_dim is not None: + x = self.project_in_dim(x) + + if positions is not None: + x += positions + x = F.dropout(x, p=self.dropout, training=self.training) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + attn = None + inner_states = [x] + + # decoder layers + decoder_padding_mask = prev_output_tokens.eq(self.padding_idx) + for i, layer in enumerate(self.layers): + + # early exit from the decoder. + if (early_exit is not None) and (i >= early_exit): + break + + x, attn = layer( + x, + encoder_out["encoder_out"] if encoder_out is not None else None, + encoder_out["encoder_padding_mask"] + if encoder_out is not None + else None, + self_attn_mask=None, + self_attn_padding_mask=decoder_padding_mask, + ) + inner_states.append(x) + + if self.layer_norm: + x = self.layer_norm(x) + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + if self.project_out_dim is not None: + x = self.project_out_dim(x) + + return x, {"attn": attn, "inner_states": inner_states} + + def forward_mask_ins(self, prev_output_tokens, encoder_out=None): + features, extra = self.extract_features( + prev_output_tokens, encoder_out=encoder_out, early_exit=self.early_exit[1] + ) + features_cat = torch.cat([features[:, :-1, :], features[:, 1:, :]], 2) + return F.linear(features_cat, self.embed_mask_ins.weight), extra['attn'] + + def forward_word_ins(self, prev_output_tokens, encoder_out=None): + features, extra = self.extract_features( + prev_output_tokens, encoder_out=encoder_out, early_exit=self.early_exit[2] + ) + return self.output_layer(features), extra['attn'] + + def forward_word_del(self, prev_output_tokens, encoder_out=None): + features, extra = self.extract_features( + prev_output_tokens, encoder_out=encoder_out, early_exit=self.early_exit[0] + ) + return F.linear(features, self.embed_word_del.weight), extra['attn'] + + def forward_word_del_mask_ins(self, prev_output_tokens, encoder_out=None): + # merge the word-deletion and mask insertion into one operation, + assert self.early_exit[0] == self.early_exit[1], "must the same depth." 
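
For the mask-insertion head above, each of the `T-1` slots between adjacent decoder states is represented by the concatenation of its two neighbouring feature vectors and classified into 0..255 placeholder insertions. A shape-only sketch (illustrative; the dimensions are arbitrary):

```python
import torch
import torch.nn.functional as F

B, T, D, MAX_INS = 2, 5, 8, 256
features = torch.randn(B, T, D)                                       # decoder states
slots = torch.cat([features[:, :-1, :], features[:, 1:, :]], dim=2)   # B x (T-1) x 2D
embed_mask_ins = torch.nn.Embedding(MAX_INS, 2 * D)
logits = F.linear(slots, embed_mask_ins.weight)                       # B x (T-1) x 256
print(logits.shape)  # torch.Size([2, 4, 256])
```
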
+ features, extra = self.extract_features( + prev_output_tokens, encoder_out=encoder_out, early_exit=self.early_exit[2] + ) + features_cat = torch.cat([features[:, :-1, :], features[:, 1:, :]], 2) + f_word_del = F.linear(features, self.embed_word_del.weight) + f_mask_ins = F.linear(features_cat, self.embed_mask_ins.weight) + return f_word_del, f_mask_ins, extra['attn'] + + +@register_model_architecture("levenshtein_transformer", "levenshtein_transformer") +def base_architecture(args): + args.encoder_embed_path = getattr(args, "encoder_embed_path", None) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) + args.encoder_layers = getattr(args, "encoder_layers", 6) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) + args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) + args.decoder_embed_path = getattr(args, "decoder_embed_path", None) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) + args.decoder_ffn_embed_dim = getattr( + args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim + ) + args.decoder_layers = getattr(args, "decoder_layers", 6) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) + args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) + args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) + args.attention_dropout = getattr(args, "attention_dropout", 0.0) + args.activation_dropout = getattr(args, "activation_dropout", 0.0) + args.activation_fn = getattr(args, "activation_fn", "relu") + args.dropout = getattr(args, "dropout", 0.1) + args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) + args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) + args.share_decoder_input_output_embed = getattr( + args, "share_decoder_input_output_embed", False + ) + args.share_all_embeddings = getattr(args, "share_all_embeddings", False) + args.no_token_positional_embeddings = getattr( + args, "no_token_positional_embeddings", False + ) + args.adaptive_input = getattr(args, "adaptive_input", False) + args.apply_bert_init = getattr(args, "apply_bert_init", False) + + args.decoder_output_dim = getattr( + args, "decoder_output_dim", args.decoder_embed_dim + ) + args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) + + args.early_exit = getattr(args, "early_exit", "(6, 6, 6)") + + +@register_model_architecture( + "levenshtein_transformer", "levenshtein_transformer_wmt_en_de" +) +def levenshtein_transformer_wmt_en_de(args): + base_architecture(args) + + +# similar parameters used in the "Attention Is All You Need" paper (Vaswani et al., 2017) +@register_model_architecture( + "levenshtein_transformer", "levenshtein_transformer_vaswani_wmt_en_de_big" +) +def levenshtein_transformer_vaswani_wmt_en_de_big(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024) + args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096) + args.decoder_attention_heads = getattr(args, 
"decoder_attention_heads", 16) + args.dropout = getattr(args, "dropout", 0.3) + base_architecture(args) + + +# default parameters used in tensor2tensor implementation +@register_model_architecture( + "levenshtein_transformer", "levenshtein_transformer_wmt_en_de_big" +) +def levenshtein_transformer_wmt_en_de_big_t2t(args): + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True) + args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True) + args.attention_dropout = getattr(args, "attention_dropout", 0.1) + args.activation_dropout = getattr(args, "activation_dropout", 0.1) + levenshtein_transformer_vaswani_wmt_en_de_big(args) diff --git a/fairseq/models/model_utils.py b/fairseq/models/model_utils.py new file mode 100644 index 0000000000..8217731c9e --- /dev/null +++ b/fairseq/models/model_utils.py @@ -0,0 +1,62 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch + + +def skip_tensors(x, mask): + """ + Getting sliced (dim=0) tensor by mask. Supporting tensor and list/dict of tensors. + """ + if isinstance(x, int): + return x + + if x is None: + return None + + if isinstance(x, torch.Tensor): + if x.size(0) == mask.size(0): + return x[mask] + elif x.size(1) == mask.size(0): + return x[:, mask] + + if isinstance(x, list): + return [skip_tensors(x_i, mask) for x_i in x] + + if isinstance(x, dict): + return {k: skip_tensors(v, mask) for k, v in x.items()} + + raise NotImplementedError + + +def fill_tensors(x, mask, y, padding_idx): + """ + Filling tensor x with y at masked positions (dim=0). + """ + if x is None: + return y + assert x.dim() == y.dim() and mask.size(0) == x.size(0) + assert x.dim() == 2 or (x.dim() == 3 and x.size(2) == y.size(2)) + n_selected = mask.sum() + assert n_selected == y.size(0) + + if n_selected == x.size(0): + return y + + if x.size(1) < y.size(1): + dims = [x.size(0), y.size(1) - x.size(1)] + if x.dim() == 3: + dims.append(x.size(2)) + x = torch.cat([x, x.new_zeros(*dims).fill_(padding_idx)], 1) + x[mask] = y + elif x.size(1) > y.size(1): + x[mask] = padding_idx + if x.dim() == 2: + x[mask, :y.size(1)] = y + else: + x[mask, :y.size(1), :] = y + else: + x[mask] = y + return x diff --git a/fairseq/models/nonautoregressive_transformer.py b/fairseq/models/nonautoregressive_transformer.py new file mode 100644 index 0000000000..d45a5b443b --- /dev/null +++ b/fairseq/models/nonautoregressive_transformer.py @@ -0,0 +1,640 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import torch +import torch.nn.functional as F +from fairseq import utils +from fairseq.models import register_model, register_model_architecture +from fairseq.models.transformer import ( + Embedding, + TransformerDecoder, + TransformerDecoderLayer, + TransformerEncoder, + TransformerModel, +) +from fairseq.modules import MultiheadAttention +from fairseq.modules.transformer_sentence_encoder import init_bert_params + + +def _mean_pooling(enc_feats, src_masks): + # enc_feats: T x B x C + # src_masks: B x T or None + if src_masks is None: + enc_feats = enc_feats.mean(0) + else: + src_masks = (~src_masks).transpose(0, 1).type_as(enc_feats) + enc_feats = ( + (enc_feats / src_masks.sum(0)[None, :, None]) * src_masks[:, :, None] + ).sum(0) + return enc_feats + + +def _argmax(x, dim): + return (x == x.max(dim, keepdim=True)[0]).type_as(x) + + +def _dynamic_programming(tokens, scores): + N, B, T = tokens.size() + cum_scores = scores[:, :, 0].clone() # N x B + cum_choice = tokens.new_zeros(B, T) + + # forward + for t in range(T - 1): + score, choice = cum_scores.max(0) + cum_choice[:, t] = choice + cum_scores[0] = score + scores[0, :, t + 1] + cum_scores[1:] = cum_scores[:-1] + scores[1:, :, t + 1] + + # back-tracking + end_score, end_choice = cum_scores.max(0) + cum_choice[:, T - 1] = end_choice + for t in range(T - 2, -1, -1): + is_start = (cum_choice[:, t + 1] == 0).type_as(cum_choice) + cum_choice[:, t] = (cum_choice[:, t + 1] - 1) * ~is_start + cum_choice[ + :, t + ] * is_start + + # finalize the prediction + tokens = tokens.gather(0, cum_choice.unsqueeze(0)).squeeze(0) + scores = scores.gather(0, cum_choice.unsqueeze(0)).squeeze(0) + return scores, tokens + + +def _beam_search(tokens, scores, W=None): + N, B, T = tokens.size() + + if (W is None) or (W > N): + W = N + + +def _uniform_assignment(src_lens, trg_lens): + max_trg_len = trg_lens.max() + steps = (src_lens.float() - 1) / (trg_lens.float() - 1) # step-size + # max_trg_len + index_t = torch.arange(max_trg_len, device=trg_lens.device).float() + index_t = steps[:, None] * index_t[None, :] # batch_size X max_trg_len + index_t = torch.round(index_t).long().detach() + return index_t + + +@register_model("nonautoregressive_transformer") +class NATransformerModel(TransformerModel): + def __init__(self, encoder, decoder): + super().__init__(encoder, decoder) + self.tgt_dict = decoder.dictionary + self.bos = decoder.dictionary.bos() + self.eos = decoder.dictionary.eos() + self.pad = decoder.dictionary.pad() + self.unk = decoder.dictionary.unk() + + @staticmethod + def add_args(parser): + TransformerModel.add_args(parser) + parser.add_argument( + "--apply-bert-init", + action="store_true", + help="use custom param initialization for BERT", + ) + + # length prediction + parser.add_argument("--src-embedding-copy", action="store_true", + help="copy encoder word embeddings as the initial input of the decoder") + parser.add_argument("--pred-length-offset", action="store_true", + help="predicting the length difference between the target and source sentences") + parser.add_argument("--sg-length-pred", action="store_true", + help="stop the gradients back-propagated from the length predictor") + parser.add_argument("--length-loss-factor", type=float, + help="weights on the length prediction loss") + + # n-gram predictor + parser.add_argument( + "--ngram-predictor", + nargs="?", + const=4, + default=1, + type=int, + help="adding an additional n-gram predictor.", + ) + + @classmethod + def build_decoder(cls, args, tgt_dict, embed_tokens): + decoder = 
NATransformerDecoder(args, tgt_dict, embed_tokens) + if getattr(args, "apply_bert_init", False): + decoder.apply(init_bert_params) + return decoder + + @classmethod + def build_encoder(cls, args, src_dict, embed_tokens): + encoder = TransformerEncoder(args, src_dict, embed_tokens) + if getattr(args, "apply_bert_init", False): + encoder.apply(init_bert_params) + return encoder + + def forward( + self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs + ): + # encoding + encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) + length_out, length_tgt = self.decoder.forward_length_prediction( + encoder_out, tgt_tokens + ) + + word_ins_out, word_ins_tgt, word_ins_mask = self.decoder( + prev_output_tokens, encoder_out=encoder_out, tgt_tokens=tgt_tokens + ) + + return { + "word_ins_out": word_ins_out, + "word_ins_tgt": word_ins_tgt, + "word_ins_mask": word_ins_mask, + "length_out": length_out, + "length_tgt": length_tgt, + "length_w": self.decoder.length_loss_factor, + } + + def forward_encoder(self, encoder_inputs): + return self.encoder(*encoder_inputs) + + def forward_decoder(self, decoder_out, encoder_out, decoding_format=None, **kwargs): + step = decoder_out["step"] + output_tokens = decoder_out["output_tokens"] + output_scores = decoder_out["output_scores"] + + # execute the decoder + output_masks = output_tokens.ne(self.pad) + _scores, _tokens = self.decoder( + output_tokens, + encoder_out=encoder_out, + decoding_format=decoding_format, + step=step, + ) + output_tokens.masked_scatter_(output_masks, _tokens[output_masks]) + output_scores.masked_scatter_(output_masks, _scores[output_masks]) + + return {"output_tokens": output_tokens, "output_scores": output_scores} + + def initialize_output_tokens(self, encoder_out, src_tokens): + # length prediction + _, length_tgt = self.decoder.forward_length_prediction(encoder_out) + max_length = length_tgt.max() + idx_length = torch.arange(max_length, device=src_tokens.device) + + initial_output_tokens = src_tokens.new_zeros( + src_tokens.size(0), max_length + ).fill_(self.pad) + initial_output_tokens.masked_fill_( + idx_length[None, :] < length_tgt[:, None], self.unk + ) + initial_output_tokens[:, 0] = self.bos + initial_output_tokens.scatter_(1, length_tgt[:, None] - 1, self.eos) + + initial_output_scores = initial_output_tokens.new_zeros( + *initial_output_tokens.size() + ).type_as(encoder_out["encoder_out"]) + + return { + "output_tokens": initial_output_tokens, + "output_scores": initial_output_scores, + } + + +class NATransformerDecoder(TransformerDecoder): + def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): + super().__init__( + args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn + ) + + self.dictionary = dictionary + self.bos = dictionary.bos() + self.unk = dictionary.unk() + self.eos = dictionary.eos() + + self.encoder_embed_dim = args.encoder_embed_dim + self.sg_length_pred = getattr(args, "sg_length_pred", False) + self.pred_length_offset = getattr(args, "pred_length_offset", False) + self.length_loss_factor = getattr(args, "length_loss_factor", 0.1) + self.src_embedding_copy = getattr(args, "src_embedding_copy", False) + self.embed_length = Embedding(256, self.encoder_embed_dim, None) + + self.ngram_predictor = getattr(args, "ngram_predictor", 1) + self.ngram_layer = ( + None if (self.ngram_predictor == 1) else NgramDecoderLayer(args, True) + ) + + def forward( + self, + prev_output_tokens, + encoder_out=None, + tgt_tokens=None, + decoding_format=None, + step=0, + 
**kwargs + ): + + features, _ = self.extract_features( + prev_output_tokens, + encoder_out=encoder_out, + embedding_copy=(step == 0) & self.src_embedding_copy, + ) + + if tgt_tokens is not None: + if self.ngram_layer is None: + word_ins_mask = tgt_tokens.ne(self.padding_idx) + word_ins_tgt = tgt_tokens + else: + context_embeds, context_masks = self.forward_ngram_context(tgt_tokens) + features = self.ngram_layer(features, context_embeds=context_embeds) + word_ins_tgt = tgt_tokens[:, :, None].repeat(1, 1, self.ngram_predictor) + word_ins_mask = word_ins_tgt.ne(self.padding_idx) & context_masks + + return self.output_layer(features), word_ins_tgt, word_ins_mask + + else: + if self.ngram_layer is None: + return F.log_softmax(self.output_layer(features), -1).max(-1) + else: + # inner iterations + return self.forward_ngram_decoding( + features, prev_output_tokens.eq(self.padding_idx), decoding_format + ) + + def extract_features( + self, + prev_output_tokens, + encoder_out=None, + early_exit=None, + embedding_copy=False, + **unused + ): + """ + Similar to *forward* but only return features. + + Inputs: + prev_output_tokens: Tensor(B, T) + encoder_out: a dictionary of hidden states and masks + + Returns: + tuple: + - the decoder's features of shape `(batch, tgt_len, embed_dim)` + - a dictionary with any model-specific outputs + the LevenshteinTransformer decoder has full-attention to all generated tokens + """ + # embedding + if embedding_copy: + src_embd = encoder_out["encoder_embedding"] + src_mask = encoder_out["encoder_padding_mask"] + src_mask = ( + ~src_mask + if src_mask is not None + else prev_output_tokens.new_ones(*src_embd.size()[:2]).bool() + ) + + x, decoder_padding_mask = self.forward_embedding( + prev_output_tokens, + self.forward_copying_source( + src_embd, src_mask, prev_output_tokens.ne(self.padding_idx) + ), + ) + + else: + + x, decoder_padding_mask = self.forward_embedding(prev_output_tokens) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + attn = None + inner_states = [x] + + # decoder layers + for i, layer in enumerate(self.layers): + + # early exit from the decoder. 
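+            # (only the first `early_exit` decoder layers are run; the remaining layers are skipped)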
+ if (early_exit is not None) and (i >= early_exit): + break + + x, attn = layer( + x, + encoder_out["encoder_out"] if encoder_out is not None else None, + encoder_out["encoder_padding_mask"] + if encoder_out is not None + else None, + self_attn_mask=None, + self_attn_padding_mask=decoder_padding_mask, + ) + inner_states.append(x) + + if self.layer_norm: + x = self.layer_norm(x) + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + if self.project_out_dim is not None: + x = self.project_out_dim(x) + + return x, {"attn": attn, "inner_states": inner_states} + + def forward_ngram_context(self, tgt_tokens): + tgt_embeds = self.forward_embedding(tgt_tokens) + n_contexts = self.ngram_predictor - 1 + + # shifting the embeddings + # context_embeds: N x B x T x C + # context_masks: B x T x N + context_embeds = tgt_embeds.new_zeros(n_contexts, *tgt_embeds.size()) + context_masks = tgt_embeds.new_ones( + *tgt_embeds.size()[:2], self.ngram_predictor + ).bool() + + for k in range(n_contexts): + context_embeds[k, :, k + 1:] = tgt_embeds[:, : -k - 1] + context_masks[:, : k + 1, k + 1] = 0 + + return context_embeds, context_masks + + def forward_ngram_decoding(self, features, padding_mask=None, decoding_format=None): + context_embeds = None + scores, tokens = [], [] + ensemble_score = None + ensemble_index = None + + if decoding_format is None: + decoding_format = "ensemble" + + for k in range(self.ngram_predictor): + ngram_out = self.ngram_layer( + features, context_embeds=context_embeds, incremental=True + ) + ngram_scores = F.log_softmax(self.output_layer(ngram_out), -1) + max_score, max_token = ngram_scores.max(-1) + + if decoding_format == "vote": + ngram_scores = _argmax(ngram_scores, -1) + + if ensemble_score is None: + ensemble_score = ngram_scores + ensemble_index = ensemble_score.new_ones(*ensemble_score.size()[:2]) + else: + ensemble_index[:, k:] = ensemble_index[:, k:] + 1 + ensemble_score = ensemble_score + ngram_scores.masked_fill_( + (ensemble_index < k) + .unsqueeze(2) + .repeat(1, 1, ensemble_score.size(2)), + 0, + ) + max_score[:, :k] = float("-inf") + + if decoding_format == "unigram": + break + + scores.append(max_score.masked_fill_(padding_mask, 0)) + tokens.append(max_token.masked_fill_(padding_mask, self.padding_idx)) + + # context_embeds: N x B x T x C + if context_embeds is None: + context_embeds = self.forward_embedding(max_token).unsqueeze(0) + + else: + context_embeds = torch.cat( + [self.forward_embedding(max_token).unsqueeze(0), context_embeds], 0 + ) + + context_embeds[:, :, 1:] = context_embeds[:, :, :-1] + + if decoding_format != "dp": + ensemble_score = ensemble_score / ensemble_index.unsqueeze(2) + return ensemble_score.max(-1) + + else: + tokens = torch.cat([t.unsqueeze(0) for t in tokens], 0) + scores = torch.cat([s.unsqueeze(0) for s in scores], 0) + return _dynamic_programming(tokens, scores) + + def forward_embedding(self, prev_output_tokens, states=None): + # embed positions + positions = ( + self.embed_positions(prev_output_tokens) + if self.embed_positions is not None + else None + ) + + # embed tokens and positions + if states is None: + x = self.embed_scale * self.embed_tokens(prev_output_tokens) + if self.project_in_dim is not None: + x = self.project_in_dim(x) + else: + x = states + + if positions is not None: + x += positions + x = F.dropout(x, p=self.dropout, training=self.training) + decoder_padding_mask = prev_output_tokens.eq(self.padding_idx) + return x, decoder_padding_mask + + def forward_copying_source(self, src_embeds, src_masks, tgt_masks): + 
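+        # Copy source embeddings onto the target side: each non-padding target
+        # position is mapped to a source position by a uniform, length-proportional
+        # assignment (see _uniform_assignment) and receives that source embedding.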
length_sources = src_masks.sum(1)
+        length_targets = tgt_masks.sum(1)
+        mapped_inputs = _uniform_assignment(length_sources, length_targets).masked_fill(
+            ~tgt_masks, 0
+        )
+        copied_embedding = torch.gather(
+            src_embeds,
+            1,
+            mapped_inputs.unsqueeze(-1).expand(
+                *mapped_inputs.size(), src_embeds.size(-1)
+            ),
+        )
+        return copied_embedding
+
+    def forward_length_prediction(self, encoder_out, tgt_tokens=None):
+        enc_feats = encoder_out["encoder_out"]  # T x B x C
+        src_masks = encoder_out["encoder_padding_mask"]  # B x T or None
+
+        if self.pred_length_offset:
+            if src_masks is None:
+                src_lengs = enc_feats.new_ones(enc_feats.size(1)).fill_(
+                    enc_feats.size(0)
+                )
+            else:
+                src_lengs = (~src_masks).transpose(0, 1).type_as(enc_feats).sum(0)
+            src_lengs = src_lengs.long()
+
+        enc_feats = _mean_pooling(enc_feats, src_masks)
+        if self.sg_length_pred:
+            enc_feats = enc_feats.detach()
+
+        length_out = F.linear(enc_feats, self.embed_length.weight)
+
+        if tgt_tokens is not None:
+            # obtain the length target
+            tgt_lengs = tgt_tokens.ne(self.padding_idx).sum(1).long()
+            if self.pred_length_offset:
+                length_tgt = tgt_lengs - src_lengs + 128
+            else:
+                length_tgt = tgt_lengs
+            length_tgt = length_tgt.clamp(min=0, max=255)
+
+        else:
+            # predict the length target (greedy for now)
+            # TODO: implement length-beam
+            pred_lengs = length_out.max(-1)[1]
+            if self.pred_length_offset:
+                length_tgt = pred_lengs - 128 + src_lengs
+            else:
+                length_tgt = pred_lengs
+
+        return length_out, length_tgt
+
+
+class NgramDecoderLayer(TransformerDecoderLayer):
+    """
+    N-gram Decoder Layer:
+
+    This module can be plugged into the last layer of any non-autoregressive model.
+    It provides an alternative way to capture local n-gram information by running the block multiple times.
+    """
+
+    def __init__(self, args, no_encoder_attn=False):
+        super(NgramDecoderLayer, self).__init__(args, no_encoder_attn=no_encoder_attn)
+        self.self_attn = MultiheadAttention(
+            embed_dim=self.embed_dim,
+            num_heads=1,  # maybe n-gram does not need too many heads.
+ dropout=args.attention_dropout, + self_attention=False, + encoder_decoder_attention=True, + ) + + def forward( + self, + x, + encoder_out=None, + encoder_padding_mask=None, + context_embeds=None, + incremental=False, + ): + # x: T x B x C + # context_embeds: N x T x B x C + T, B, C = x.size() + + residual = x + x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True) + x = x.contiguous().view(1, T * B, C).contiguous() + + if context_embeds is not None: + N = context_embeds.size(0) + context_embeds = context_embeds.view(N, T * B, C).contiguous() + + if not incremental: + assert context_embeds is not None, "we need context for training" + # attn_weights: (n_head x T x B) x 1 x N + # v: (n_head x T x B) x N x (dim / n_head) + # -- move the attention computation outside -- + attn_weights, values = self.self_attn( + query=x, key=context_embeds, value=context_embeds, before_softmax=True + ) + + attn_weights = attn_weights.repeat(1, N, 1) + attn_masks = attn_weights.new_ones(N, N).triu_(1).bool() + attn_masks = attn_masks.unsqueeze(0).repeat(attn_weights.size(0), 1, 1) + + attn_weights = attn_weights.masked_fill(attn_masks, float("-inf")) + attn_weights = utils.softmax(attn_weights, dim=-1).type_as(attn_weights) + attn_weights = F.dropout( + attn_weights, p=self.self_attn.dropout, training=self.training + ) + + # (n_head x T x B) x N x (dim / n_head) + attn = torch.bmm(attn_weights, values) + attn = attn.transpose(0, 1).contiguous() + attn = attn.view(N, T * B, C).contiguous() + attn = attn.transpose(1, 0).contiguous() + attn = attn.view(T, B, N, C) + + residual = residual.unsqueeze(2) + x = self.self_attn.out_proj(attn) + x = F.dropout(x, p=self.dropout, training=self.training) + x = torch.cat([residual, residual + x], 2) + + else: + if context_embeds is None: + x = residual + + else: + x, _ = self.self_attn(query=x, key=context_embeds, value=context_embeds) + x = x.view(T, B, C) + x = F.dropout(x, p=self.dropout, training=self.training) + x = residual + x + + x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True) + + if self.encoder_attn is not None: + raise NotImplementedError + + residual = x + x = self.maybe_layer_norm(self.final_layer_norm, x, before=True) + x = self.activation_fn(self.fc1(x)) + x = F.dropout(x, p=self.activation_dropout, training=self.training) + x = self.fc2(x) + x = F.dropout(x, p=self.dropout, training=self.training) + x = residual + x + x = self.maybe_layer_norm(self.final_layer_norm, x, after=True) + return x + + +@register_model_architecture( + "nonautoregressive_transformer", "nonautoregressive_transformer" +) +def base_architecture(args): + args.encoder_embed_path = getattr(args, "encoder_embed_path", None) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) + args.encoder_layers = getattr(args, "encoder_layers", 6) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) + args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) + args.decoder_embed_path = getattr(args, "decoder_embed_path", None) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) + args.decoder_ffn_embed_dim = getattr( + args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim + ) + args.decoder_layers = getattr(args, "decoder_layers", 6) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) + 
args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) + args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) + args.attention_dropout = getattr(args, "attention_dropout", 0.0) + args.activation_dropout = getattr(args, "activation_dropout", 0.0) + args.activation_fn = getattr(args, "activation_fn", "relu") + args.dropout = getattr(args, "dropout", 0.1) + args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) + args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) + args.share_decoder_input_output_embed = getattr( + args, "share_decoder_input_output_embed", False + ) + args.share_all_embeddings = getattr(args, "share_all_embeddings", False) + args.no_token_positional_embeddings = getattr( + args, "no_token_positional_embeddings", False + ) + args.adaptive_input = getattr(args, "adaptive_input", False) + args.apply_bert_init = getattr(args, "apply_bert_init", False) + + args.decoder_output_dim = getattr( + args, "decoder_output_dim", args.decoder_embed_dim + ) + args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) + + # --- special arguments --- + args.sg_length_pred = getattr(args, "sg_length_pred", False) + args.pred_length_offset = getattr(args, "pred_length_offset", False) + args.length_loss_factor = getattr(args, "length_loss_factor", 0.1) + args.src_embedding_copy = getattr(args, "src_embedding_copy", False) + args.ngram_predictor = getattr(args, "ngram_predictor", 1) + + +@register_model_architecture( + "nonautoregressive_transformer", "nonautoregressive_transformer_wmt_en_de" +) +def nonautoregressive_transformer_wmt_en_de(args): + base_architecture(args) diff --git a/fairseq/models/transformer.py b/fairseq/models/transformer.py index 7fedc77550..dd10ae5357 100644 --- a/fairseq/models/transformer.py +++ b/fairseq/models/transformer.py @@ -172,7 +172,7 @@ def build_embedding(dictionary, embed_dim, path=None): encoder = cls.build_encoder(args, src_dict, encoder_embed_tokens) decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens) - return TransformerModel(encoder, decoder) + return cls(encoder, decoder) @classmethod def build_encoder(cls, args, src_dict, embed_tokens): @@ -222,7 +222,15 @@ def __init__(self, args, dictionary, embed_tokens): else: self.layer_norm = None - def forward(self, src_tokens, src_lengths): + def forward_embedding(self, src_tokens): + # embed tokens and positions + embed = self.embed_scale * self.embed_tokens(src_tokens) + if self.embed_positions is not None: + x = embed + self.embed_positions(src_tokens) + x = F.dropout(x, p=self.dropout, training=self.training) + return x, embed + + def forward(self, src_tokens, src_lengths, cls_input=None): """ Args: src_tokens (LongTensor): tokens in the source language of shape @@ -237,11 +245,7 @@ def forward(self, src_tokens, src_lengths): - **encoder_padding_mask** (ByteTensor): the positions of padding elements of shape `(batch, src_len)` """ - # embed tokens and positions - x = self.embed_scale * self.embed_tokens(src_tokens) - if self.embed_positions is not None: - x += self.embed_positions(src_tokens) - x = F.dropout(x, p=self.dropout, training=self.training) + x, encoder_embedding = self.forward_embedding(src_tokens) # B x T x C -> T x B x C x = x.transpose(0, 1) @@ -261,6 +265,7 @@ def forward(self, src_tokens, src_lengths): return { 'encoder_out': x, # T x B x C 'encoder_padding_mask': encoder_padding_mask, # B x T + 'encoder_embedding': encoder_embedding, # B x T x C } def 
reorder_encoder_out(self, encoder_out, new_order): @@ -332,7 +337,7 @@ def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): embed_dim = args.decoder_embed_dim self.output_embed_dim = args.decoder_output_dim - padding_idx = embed_tokens.padding_idx + self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens @@ -341,7 +346,7 @@ def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( - args.max_target_positions, embed_dim, padding_idx, + args.max_target_positions, embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None diff --git a/fairseq/modules/multihead_attention.py b/fairseq/modules/multihead_attention.py index 4da628655e..8c28255dfb 100644 --- a/fairseq/modules/multihead_attention.py +++ b/fairseq/modules/multihead_attention.py @@ -91,7 +91,7 @@ def reset_parameters(self): nn.init.xavier_normal_(self.bias_v) def forward(self, query, key, value, key_padding_mask=None, incremental_state=None, - need_weights=True, static_kv=False, attn_mask=None): + need_weights=True, static_kv=False, attn_mask=None, before_softmax=False): """Input shape: Time x Batch x Channel Timesteps can be masked by supplying a T x T mask in the @@ -239,6 +239,9 @@ def forward(self, query, key, value, key_padding_mask=None, incremental_state=No ) attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + if before_softmax: + return attn_weights, v + attn_weights = utils.softmax( attn_weights, dim=-1, onnx_trace=self.onnx_trace, ).type_as(attn_weights) diff --git a/fairseq/modules/transformer_layer.py b/fairseq/modules/transformer_layer.py index 5da4909ca2..f4a80cceea 100644 --- a/fairseq/modules/transformer_layer.py +++ b/fairseq/modules/transformer_layer.py @@ -83,7 +83,7 @@ def forward(self, x, encoder_padding_mask, attn_mask=None): residual = x x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True) if attn_mask is not None: - attn_mask = attn_mask.masked_fill(attn_mask.byte(), -1e8) + attn_mask = attn_mask.masked_fill(attn_mask.bool(), -1e8) # anything in original attn_mask = 1, becomes -1e8 # anything in original attn_mask = 0, becomes 0 # Note that we cannot use -inf here, because at some edge cases, diff --git a/fairseq/modules/transformer_sentence_encoder.py b/fairseq/modules/transformer_sentence_encoder.py index 1699291253..9be7ab3080 100644 --- a/fairseq/modules/transformer_sentence_encoder.py +++ b/fairseq/modules/transformer_sentence_encoder.py @@ -36,7 +36,8 @@ def init_bert_params(module): module.bias.data.zero_() if isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=0.02) - module.weight.data[module.padding_idx].zero_() + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() if isinstance(module, MultiheadAttention): module.in_proj_weight.data.normal_(mean=0.0, std=0.02) diff --git a/fairseq/options.py b/fairseq/options.py index 54c7863908..bb1e27aeb7 100644 --- a/fairseq/options.py +++ b/fairseq/options.py @@ -280,6 +280,8 @@ def add_dataset_args(parser, train=False, gen=False): ' (train, valid, valid1, test, test1)') group.add_argument('--validate-interval', type=int, default=1, metavar='N', help='validate every N epochs') + group.add_argument('--fixed-validation-seed', default=None, type=int, 
metavar='N',
+                       help='specified random seed for validation')
     group.add_argument('--disable-validation', action='store_true',
                        help='disable validation')
     group.add_argument('--max-tokens-valid', type=int, metavar='N',
@@ -493,6 +495,18 @@ def add_generation_args(parser):
                        help='strength of diversity penalty for Diverse Beam Search')
     group.add_argument('--print-alignment', action='store_true',
                        help='if set, uses attention feedback to compute and print alignment to source tokens')
+    group.add_argument('--print-step', action='store_true')
+
+    # arguments for iterative refinement generator
+    group.add_argument('--iter-decode-eos-penalty', default=0.0, type=float, metavar='N',
+                       help='if > 0.0, it penalizes early stopping in decoding.')
+    group.add_argument('--iter-decode-max-iter', default=10, type=int, metavar='N',
+                       help='maximum iterations for iterative refinement.')
+    group.add_argument('--iter-decode-force-max-iter', action='store_true',
+                       help='if set, run exactly the maximum number of iterations without early stopping')
+
+    # special decoding format for advanced decoding.
+    group.add_argument('--decoding-format', default=None, type=str, choices=['unigram', 'ensemble', 'vote', 'dp', 'bs'])
     # fmt: on
     return group
diff --git a/fairseq/tasks/translation.py b/fairseq/tasks/translation.py
index d3f51cb35c..f3d60403ba 100644
--- a/fairseq/tasks/translation.py
+++ b/fairseq/tasks/translation.py
@@ -12,6 +12,7 @@
     data_utils,
     indexed_dataset,
     LanguagePairDataset,
+    PrependTokenDataset,
 )
 from . import FairseqTask, register_task
@@ -22,7 +23,8 @@ def load_langpair_dataset(
     src, src_dict,
     tgt, tgt_dict,
     combine, dataset_impl, upsample_primary,
-    left_pad_source, left_pad_target, max_source_positions, max_target_positions,
+    left_pad_source, left_pad_target, max_source_positions,
+    max_target_positions, prepend_bos=False,
 ):
     def split_exists(split, src, tgt, lang, data_path):
         filename = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang))
@@ -67,6 +69,11 @@ def split_exists(split, src, tgt, lang, data_path):
         src_dataset = ConcatDataset(src_datasets, sample_ratios)
         tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios)
 
+    if prepend_bos:
+        assert hasattr(src_dict, "bos_index") and hasattr(tgt_dict, "bos_index")
+        src_dataset = PrependTokenDataset(src_dataset, src_dict.bos())
+        tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos())
+
     return LanguagePairDataset(
         src_dataset, src_dataset.sizes, src_dict,
         tgt_dataset, tgt_dataset.sizes, tgt_dict,
diff --git a/fairseq/tasks/translation_lev.py b/fairseq/tasks/translation_lev.py
new file mode 100644
index 0000000000..47d6a3ed4a
--- /dev/null
+++ b/fairseq/tasks/translation_lev.py
@@ -0,0 +1,149 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from fairseq.tasks import register_task
+from fairseq.tasks.translation import TranslationTask, load_langpair_dataset
+
+
+@register_task('translation_lev')
+class TranslationLevenshteinTask(TranslationTask):
+    """
+    Translation (Sequence Generation) task for Levenshtein Transformer
+    See `"Levenshtein Transformer" `_.
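+    Targets are corrupted with the noise selected by ``--noise`` (random_delete,
+    random_mask, full_mask or no_noise) during training, and decoding uses the
+    IterativeRefinementGenerator.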
+ """ + + @staticmethod + def add_args(parser): + """Add task-specific arguments to the parser.""" + # fmt: off + TranslationTask.add_args(parser) + parser.add_argument( + '--noise', + default='random_delete', + choices=['random_delete', 'random_mask', 'no_noise', 'full_mask']) + + def load_dataset(self, split, epoch=0, combine=False, **kwargs): + """Load a given dataset split. + + Args: + split (str): name of the split (e.g., train, valid, test) + """ + paths = self.args.data.split(':') + assert len(paths) > 0 + data_path = paths[epoch % len(paths)] + + # infer langcode + src, tgt = self.args.source_lang, self.args.target_lang + + self.datasets[split] = load_langpair_dataset( + data_path, split, src, self.src_dict, tgt, self.tgt_dict, + combine=combine, dataset_impl=self.args.dataset_impl, + upsample_primary=self.args.upsample_primary, + left_pad_source=self.args.left_pad_source, + left_pad_target=self.args.left_pad_target, + max_source_positions=self.args.max_source_positions, + max_target_positions=self.args.max_target_positions, + prepend_bos=True, + ) + + def inject_noise(self, target_tokens): + def _random_delete(target_tokens): + pad = self.tgt_dict.pad() + bos = self.tgt_dict.bos() + eos = self.tgt_dict.eos() + + max_len = target_tokens.size(1) + target_mask = target_tokens.eq(pad) + target_score = target_tokens.clone().float().uniform_() + target_score.masked_fill_( + target_tokens.eq(bos) | target_tokens.eq(eos), 0.0) + target_score.masked_fill_(target_mask, 1) + target_score, target_rank = target_score.sort(1) + target_length = target_mask.size(1) - target_mask.float().sum( + 1, keepdim=True) + + # do not delete and (we assign 0 score for them) + target_cutoff = 2 + ((target_length - 2) * target_score.new_zeros( + target_score.size(0), 1).uniform_()).long() + target_cutoff = target_score.sort(1)[1] >= target_cutoff + + prev_target_tokens = target_tokens.gather( + 1, target_rank).masked_fill_(target_cutoff, pad).gather( + 1, + target_rank.masked_fill_(target_cutoff, + max_len).sort(1)[1]) + prev_target_tokens = prev_target_tokens[:, :prev_target_tokens. 
+ ne(pad).sum(1).max()] + + return prev_target_tokens + + def _random_mask(target_tokens): + pad = self.tgt_dict.pad() + bos = self.tgt_dict.bos() + eos = self.tgt_dict.eos() + unk = self.tgt_dict.unk() + + target_mask = target_tokens.eq(bos) | target_tokens.eq( + eos) | target_tokens.eq(pad) + target_score = target_tokens.clone().float().uniform_() + target_score.masked_fill_(target_mask, 1.0) + + prev_target_tokens = target_tokens.masked_fill( + target_score < target_score.new_zeros(target_score.size(0), + 1).uniform_(), unk) + return prev_target_tokens + + def _full_mask(target_tokens): + pad = self.tgt_dict.pad() + bos = self.tgt_dict.bos() + eos = self.tgt_dict.eos() + unk = self.tgt_dict.unk() + + target_mask = target_tokens.eq(bos) | target_tokens.eq( + eos) | target_tokens.eq(pad) + return target_tokens.masked_fill(~target_mask, unk) + + if self.args.noise == 'random_delete': + return _random_delete(target_tokens) + elif self.args.noise == 'random_mask': + return _random_mask(target_tokens) + elif self.args.noise == 'full_mask': + return _full_mask(target_tokens) + elif self.args.noise == 'no_noise': + return target_tokens + else: + raise NotImplementedError + + def build_generator(self, args): + from fairseq.iterative_refinement_generator import IterativeRefinementGenerator + return IterativeRefinementGenerator( + self.target_dictionary, + eos_penalty=getattr(args, 'iter_decode_eos_penalty', 0.0), + max_iter=getattr(args, 'iter_decode_max_iter', 10), + decoding_format=getattr(args, 'decoding_format', None), + adaptive=not getattr(args, 'iter_decode_force_max_iter', False)) + + def train_step(self, + sample, + model, + criterion, + optimizer, + ignore_grad=False): + model.train() + sample['prev_target'] = self.inject_noise(sample['target']) + loss, sample_size, logging_output = criterion(model, sample) + if ignore_grad: + loss *= 0 + optimizer.backward(loss) + return loss, sample_size, logging_output + + def valid_step(self, sample, model, criterion): + model.eval() + with torch.no_grad(): + sample['prev_target'] = self.inject_noise(sample['target']) + loss, sample_size, logging_output = criterion(model, sample) + return loss, sample_size, logging_output diff --git a/fairseq/utils.py b/fairseq/utils.py index 1af2394434..80ecb6d083 100644 --- a/fairseq/utils.py +++ b/fairseq/utils.py @@ -359,3 +359,11 @@ def has_parameters(module): return True except StopIteration: return False + + +def set_torch_seed(seed): + # Set seed based on args.seed and the update number so that we get + # reproducible results when resuming from checkpoints + assert isinstance(seed, int) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) diff --git a/generate.py b/generate.py index c23cc79868..6de1a69abd 100644 --- a/generate.py +++ b/generate.py @@ -159,6 +159,9 @@ def main(args): ' '.join(map(lambda x: str(utils.item(x)), alignment)) )) + if args.print_step: + print('I-{}\t{}'.format(sample_id, hypo['steps'])) + # Score only the top hypothesis if has_target and j == 0: if align_dict is not None or args.remove_bpe is not None: diff --git a/setup.py b/setup.py index 8f4604be11..33849f8105 100644 --- a/setup.py +++ b/setup.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. 
from setuptools import setup, find_packages, Extension +from torch.utils import cpp_extension import sys @@ -60,6 +61,12 @@ def include_dirs(self, dirs): language='c++', extra_compile_args=extra_compile_args, ), + cpp_extension.CppExtension( + 'fairseq.libnat', + sources=[ + 'fairseq/clib/libnat/edit_dist.cpp', + ], + ) ] @@ -106,5 +113,6 @@ def include_dirs(self, dirs): 'fairseq-validate = fairseq_cli.validate:cli_main', ], }, + cmdclass={'build_ext': cpp_extension.BuildExtension}, zip_safe=False, ) diff --git a/tests/test_binaries.py b/tests/test_binaries.py index b517278273..8cede3c9fa 100644 --- a/tests/test_binaries.py +++ b/tests/test_binaries.py @@ -180,6 +180,52 @@ def test_dynamicconv(self): ]) generate_main(data_dir) + def test_levenshtein_transformer(self): + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory('test_levenshtein_transformer') as data_dir: + create_dummy_data(data_dir) + preprocess_translation_data(data_dir) + train_translation_model(data_dir, 'levenshtein_transformer', [ + '--apply-bert-init', '--early-exit', '6,6,6', + '--criterion', 'nat_loss' + ], task='translation_lev') + generate_main(data_dir) + + def test_nonautoregressive_transformer(self): + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory('test_nonautoregressive_transformer') as data_dir: + create_dummy_data(data_dir) + preprocess_translation_data(data_dir) + train_translation_model(data_dir, 'nonautoregressive_transformer', [ + '--apply-bert-init', '--src-embedding-copy', '--criterion', + 'nat_loss', '--noise', 'full_mask', '--pred-length-offset', + '--length-loss-factor', '0.1' + ], task='translation_lev') + generate_main(data_dir) + + def test_iterative_nonautoregressive_transformer(self): + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory('test_iterative_nonautoregressive_transformer') as data_dir: + create_dummy_data(data_dir) + preprocess_translation_data(data_dir) + train_translation_model(data_dir, 'iterative_nonautoregressive_transformer', [ + '--apply-bert-init', '--src-embedding-copy', '--criterion', + 'nat_loss', '--noise', 'full_mask', '--stochastic-approx', + '--dae-ratio', '0.5', '--train-step', '3' + ], task='translation_lev') + generate_main(data_dir) + + def test_insertion_transformer(self): + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory('test_insertion_transformer') as data_dir: + create_dummy_data(data_dir) + preprocess_translation_data(data_dir) + train_translation_model(data_dir, 'insertion_transformer', [ + '--apply-bert-init', '--criterion', 'nat_loss', '--noise', + 'random_mask' + ], task='translation_lev') + generate_main(data_dir) + def test_mixture_of_experts(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory('test_moe') as data_dir: diff --git a/train.py b/train.py index db04dc2190..3879375fe9 100644 --- a/train.py +++ b/train.py @@ -194,6 +194,11 @@ def get_training_stats(trainer): def validate(args, trainer, task, epoch_itr, subsets): """Evaluate the model on the validation set(s) and return the losses.""" + + if args.fixed_validation_seed is not None: + # set fixed seed for every validation + utils.set_torch_seed(args.fixed_validation_seed) + valid_losses = [] for subset in subsets: # Initialize data iterator From 1cb267ed9412e9f86b27e0ebd0cd0ebae3f5bc58 Mon Sep 17 00:00:00 2001 From: Aditya Chetan Date: Fri, 27 Sep 2019 16:06:03 -0700 Subject: [PATCH 159/213] Fixing example of batched predictions for Roberta 
(#1195) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: For batched predictions in Roberta, the README was giving an example that was pretty unclear. After a thorough discussion with ngoyal2707 in issue https://github.com/pytorch/fairseq/issues/1167 he gave a clear example of how batched predictions were supposed to be done. Since I spent a lot of time on this inconsistency, I thought that it might benefit the community if his solution was in the official README 😄 ! For for details, see issue https://github.com/pytorch/fairseq/issues/1167 Pull Request resolved: https://github.com/pytorch/fairseq/pull/1195 Differential Revision: D17639354 Pulled By: myleott fbshipit-source-id: 3eb60c5804a6481f533b19073da7880dfd0d522d --- examples/roberta/README.md | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/examples/roberta/README.md b/examples/roberta/README.md index 1b8d637ccb..68dc6701ea 100644 --- a/examples/roberta/README.md +++ b/examples/roberta/README.md @@ -146,11 +146,26 @@ logprobs = roberta.predict('new_task', tokens) # tensor([[-1.1050, -1.0672, -1. ##### Batched prediction: ```python +import torch from fairseq.data.data_utils import collate_tokens -sentences = ['Hello world.', 'Another unrelated sentence.'] -batch = collate_tokens([roberta.encode(sent) for sent in sentences], pad_idx=1) -logprobs = roberta.predict('new_task', batch) -assert logprobs.size() == torch.Size([2, 3]) + +roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli') +roberta.eval() + +batch_of_pairs = [ + ['Roberta is a heavily optimized version of BERT.', 'Roberta is not very optimized.'], + ['Roberta is a heavily optimized version of BERT.', 'Roberta is based on BERT.'], + ['potatoes are awesome.', 'I like to run.'], + ['Mars is very far from earth.', 'Mars is very close.'], +] + +batch = collate_tokens( + [roberta.encode(pair[0], pair[1]) for pair in batch_of_pairs], pad_idx=1 +) + +logprobs = roberta.predict('mnli', batch) +print(logprobs.argmax(dim=1)) +# tensor([0, 2, 1, 0]) ``` ##### Using the GPU: From ea1a410d590e63e6fd24942ab8376600c12e2194 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sat, 28 Sep 2019 08:56:03 -0700 Subject: [PATCH 160/213] RoBERTa now supported on TPU and TensorFlow via transformers library Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1197 Differential Revision: D17651374 Pulled By: myleott fbshipit-source-id: 5feb986de1e682eb83c4479f419ad51325718572 --- examples/roberta/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/roberta/README.md b/examples/roberta/README.md index 68dc6701ea..1661b604f7 100644 --- a/examples/roberta/README.md +++ b/examples/roberta/README.md @@ -8,6 +8,7 @@ RoBERTa iterates on BERT's pretraining procedure, including training the model l ### What's New: +- September 2019: TensorFlow and TPU support via the [transformers library](https://github.com/huggingface/transformers). - August 2019: RoBERTa is now supported in the [pytorch-transformers library](https://github.com/huggingface/pytorch-transformers). - August 2019: Added [tutorial for finetuning on WinoGrande](https://github.com/pytorch/fairseq/tree/master/examples/roberta/wsc#roberta-training-on-winogrande-dataset). - August 2019: Added [tutorial for pretraining RoBERTa using your own data](README.pretraining.md). 
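
For reference, loading RoBERTa through the transformers library mentioned above looks roughly like the sketch below; the class names and the `roberta-large` identifier are assumptions based on that library's 2.x releases and are not part of this patch:

```python
# Rough sketch (not part of this patch): RoBERTa via the transformers library.
# Class names and the 'roberta-large' identifier are assumptions; check the
# library's documentation for the exact API.
import torch
from transformers import RobertaModel, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
model = RobertaModel.from_pretrained('roberta-large')
model.eval()

input_ids = torch.tensor([tokenizer.encode('Hello world!')])  # (1, T), with special tokens
with torch.no_grad():
    last_hidden_state = model(input_ids)[0]                   # (1, T, hidden_dim)
print(last_hidden_state.shape)
```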
From 4ac2c5f2cc8a8b1f221f1e8e9b7839f07c25d997 Mon Sep 17 00:00:00 2001 From: Stephan Peitz Date: Sun, 29 Sep 2019 05:08:24 -0700 Subject: [PATCH 161/213] Implementation of the WeCNLP abstract "Cross+Self-Attention for Transformer Models" (#1097) Summary: This PR implements a new attention module which combines cross-attention (encoder-decoder attention) and the decoder self-attention. This work was accepted as an abstract at WeCNLP 2019 (https://www.wecnlp.ai/wecnlp-2019). Cross+Self-Attention reduces the amount of parameter and increases the inference speed without any degradation in translation quality. More details can be found in the attached [abstract](https://github.com/pytorch/fairseq/files/3561282/paper.pdf) Pull Request resolved: https://github.com/pytorch/fairseq/pull/1097 Differential Revision: D17653168 Pulled By: myleott fbshipit-source-id: deb834c2c78a229d7418ffbfea20ba3ce252991c --- fairseq/models/transformer.py | 66 ++++++++++++++++++++++++-- fairseq/modules/multihead_attention.py | 10 +++- fairseq/modules/transformer_layer.py | 34 ++++++++++--- tests/test_binaries.py | 22 ++++++++- 4 files changed, 120 insertions(+), 12 deletions(-) diff --git a/fairseq/models/transformer.py b/fairseq/models/transformer.py index dd10ae5357..910c2eda09 100644 --- a/fairseq/models/transformer.py +++ b/fairseq/models/transformer.py @@ -122,6 +122,13 @@ def add_args(parser): 'Must be used with adaptive_loss criterion'), parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D', help='sets adaptive softmax dropout for the tail projections') + # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019) + parser.add_argument('--no-cross-attention', default=False, action='store_true', + help='do not perform cross-attention') + parser.add_argument('--cross-self-attention', default=False, action='store_true', + help='perform cross+self-attention') + parser.add_argument('--layer-wise-attention', default=False, action='store_true', + help='perform layer-wise attention (cross-attention or cross+self-attention)') # fmt: on @classmethod @@ -180,7 +187,12 @@ def build_encoder(cls, args, src_dict, embed_tokens): @classmethod def build_decoder(cls, args, tgt_dict, embed_tokens): - return TransformerDecoder(args, tgt_dict, embed_tokens) + return TransformerDecoder( + args, + tgt_dict, + embed_tokens, + no_encoder_attn=getattr(args, 'no_cross_attention', False), + ) class TransformerEncoder(FairseqEncoder): @@ -211,6 +223,8 @@ def __init__(self, args, dictionary, embed_tokens): learned=args.encoder_learned_pos, ) if not args.no_token_positional_embeddings else None + self.layer_wise_attention = getattr(args, 'layer_wise_attention', False) + self.layers = nn.ModuleList([]) self.layers.extend([ TransformerEncoderLayer(args) @@ -230,13 +244,15 @@ def forward_embedding(self, src_tokens): x = F.dropout(x, p=self.dropout, training=self.training) return x, embed - def forward(self, src_tokens, src_lengths, cls_input=None): + def forward(self, src_tokens, src_lengths, cls_input=None, return_all_hiddens=False): """ Args: src_tokens (LongTensor): tokens in the source language of shape `(batch, src_len)` src_lengths (torch.LongTensor): lengths of each source sentence of shape `(batch)` + return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). 
Returns: dict: @@ -244,7 +260,13 @@ def forward(self, src_tokens, src_lengths, cls_input=None): shape `(src_len, batch, embed_dim)` - **encoder_padding_mask** (ByteTensor): the positions of padding elements of shape `(batch, src_len)` + - **encoder_states** (List[Tensor]): all intermediate + hidden states of shape `(src_len, batch, embed_dim)`. + Only populated if *return_all_hiddens* is True. """ + if self.layer_wise_attention: + return_all_hiddens = True + x, encoder_embedding = self.forward_embedding(src_tokens) # B x T x C -> T x B x C @@ -255,17 +277,24 @@ def forward(self, src_tokens, src_lengths, cls_input=None): if not encoder_padding_mask.any(): encoder_padding_mask = None + encoder_states = [] if return_all_hiddens else None + # encoder layers for layer in self.layers: x = layer(x, encoder_padding_mask) + if return_all_hiddens: + encoder_states.append(x) if self.layer_norm: x = self.layer_norm(x) + if return_all_hiddens: + encoder_states[-1] = x return { 'encoder_out': x, # T x B x C 'encoder_padding_mask': encoder_padding_mask, # B x T 'encoder_embedding': encoder_embedding, # B x T x C + 'encoder_states': encoder_states, # List[T x B x C] } def reorder_encoder_out(self, encoder_out, new_order): @@ -285,6 +314,9 @@ def reorder_encoder_out(self, encoder_out, new_order): if encoder_out['encoder_padding_mask'] is not None: encoder_out['encoder_padding_mask'] = \ encoder_out['encoder_padding_mask'].index_select(0, new_order) + if encoder_out.get('encoder_states', None) is not None: + for idx, state in enumerate(encoder_out['encoder_states']): + encoder_out['encoder_states'][idx] = state.index_select(1, new_order) return encoder_out def max_positions(self): @@ -293,6 +325,14 @@ def max_positions(self): return self.max_source_positions return min(self.max_source_positions, self.embed_positions.max_positions()) + def buffered_future_mask(self, tensor): + dim = tensor.size(0) + if not hasattr(self, '_future_mask') or self._future_mask is None or self._future_mask.device != tensor.device: + self._future_mask = torch.triu(utils.fill_with_neg_inf(tensor.new(dim, dim)), 1) + if self._future_mask.size(0) < dim: + self._future_mask = torch.triu(utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1) + return self._future_mask[:dim, :dim] + def upgrade_state_dict_named(self, state_dict, name): """Upgrade a (possibly old) state dict for new versions of fairseq.""" if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): @@ -350,6 +390,9 @@ def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None + self.cross_self_attention = getattr(args, 'cross_self_attention', False) + self.layer_wise_attention = getattr(args, 'layer_wise_attention', False) + self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) @@ -435,14 +478,26 @@ def extract_features(self, prev_output_tokens, encoder_out=None, incremental_sta inner_states = [x] + self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) + if not self_attn_padding_mask.any() and not self.cross_self_attention: + self_attn_padding_mask = None + # decoder layers - for layer in self.layers: + for idx, layer in enumerate(self.layers): + encoder_state = None + if encoder_out is not None: + if self.layer_wise_attention: + encoder_state = encoder_out['encoder_states'][idx] + else: + encoder_state = encoder_out['encoder_out'] + x, attn = layer( x, - encoder_out['encoder_out'] if 
encoder_out is not None else None, + encoder_state, encoder_out['encoder_padding_mask'] if encoder_out is not None else None, incremental_state, self_attn_mask=self.buffered_future_mask(x) if incremental_state is None else None, + self_attn_padding_mask=self_attn_padding_mask, ) inner_states.append(x) @@ -553,6 +608,9 @@ def base_architecture(args): args.share_all_embeddings = getattr(args, 'share_all_embeddings', False) args.no_token_positional_embeddings = getattr(args, 'no_token_positional_embeddings', False) args.adaptive_input = getattr(args, 'adaptive_input', False) + args.no_cross_attention = getattr(args, 'no_cross_attention', False) + args.cross_self_attention = getattr(args, 'cross_self_attention', False) + args.layer_wise_attention = getattr(args, 'layer_wise_attention', False) args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim) args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim) diff --git a/fairseq/modules/multihead_attention.py b/fairseq/modules/multihead_attention.py index 8c28255dfb..9aaea82484 100644 --- a/fairseq/modules/multihead_attention.py +++ b/fairseq/modules/multihead_attention.py @@ -186,8 +186,15 @@ def forward(self, query, key, value, key_padding_mask=None, incremental_state=No v = prev_value else: v = torch.cat((prev_value, v), dim=1) + if 'prev_key_padding_mask' in saved_state and saved_state['prev_key_padding_mask'] is not None: + prev_key_padding_mask = saved_state['prev_key_padding_mask'] + if static_kv: + key_padding_mask = prev_key_padding_mask + else: + key_padding_mask = torch.cat((prev_key_padding_mask, key_padding_mask), dim=1) saved_state['prev_key'] = k.view(bsz, self.num_heads, -1, self.head_dim) saved_state['prev_value'] = v.view(bsz, self.num_heads, -1, self.head_dim) + saved_state['prev_key_padding_mask'] = key_padding_mask self._set_input_buffer(incremental_state, saved_state) @@ -311,7 +318,8 @@ def reorder_incremental_state(self, incremental_state, new_order): input_buffer = self._get_input_buffer(incremental_state) if input_buffer is not None: for k in input_buffer.keys(): - input_buffer[k] = input_buffer[k].index_select(0, new_order) + if input_buffer[k] is not None: + input_buffer[k] = input_buffer[k].index_select(0, new_order) self._set_input_buffer(incremental_state, input_buffer) def _get_input_buffer(self, incremental_state): diff --git a/fairseq/modules/transformer_layer.py b/fairseq/modules/transformer_layer.py index f4a80cceea..63c6cdf552 100644 --- a/fairseq/modules/transformer_layer.py +++ b/fairseq/modules/transformer_layer.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+import torch import torch.nn as nn import torch.nn.functional as F from fairseq import utils @@ -134,13 +135,14 @@ class TransformerDecoderLayer(nn.Module): def __init__(self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False): super().__init__() self.embed_dim = args.decoder_embed_dim + self.cross_self_attention = getattr(args, 'cross_self_attention', False) self.self_attn = MultiheadAttention( embed_dim=self.embed_dim, num_heads=args.decoder_attention_heads, dropout=args.attention_dropout, add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, - self_attention=True + self_attention=not self.cross_self_attention, ) self.dropout = args.dropout self.activation_fn = utils.get_activation_fn( @@ -208,13 +210,27 @@ def forward( if prev_self_attn_state is not None: if incremental_state is None: incremental_state = {} - prev_key, prev_value = prev_self_attn_state + prev_key, prev_value = prev_self_attn_state[:2] saved_state = {"prev_key": prev_key, "prev_value": prev_value} + if len(prev_self_attn_state) >= 3: + saved_state["prev_key_padding_mask"] = prev_self_attn_state[2] self.self_attn._set_input_buffer(incremental_state, saved_state) + + if self.cross_self_attention and not (incremental_state is not None and "prev_key" in self.self_attn._get_input_buffer(incremental_state)): + if self_attn_mask is not None: + self_attn_mask = torch.cat((x.new(x.size(0), encoder_out.size(0)).zero_(), self_attn_mask), dim=1) + if self_attn_padding_mask is not None: + if encoder_padding_mask is None: + encoder_padding_mask = self_attn_padding_mask.new(encoder_out.size(1), encoder_out.size(0)).zero_() + self_attn_padding_mask = torch.cat((encoder_padding_mask, self_attn_padding_mask), dim=1) + y = torch.cat((encoder_out, x), dim=0) + else: + y = x + x, attn = self.self_attn( query=x, - key=x, - value=x, + key=y, + value=y, key_padding_mask=self_attn_padding_mask, incremental_state=incremental_state, need_weights=False, @@ -230,9 +246,12 @@ def forward( if prev_attn_state is not None: if incremental_state is None: incremental_state = {} - prev_key, prev_value = prev_attn_state + prev_key, prev_value = prev_attn_state[:2] saved_state = {"prev_key": prev_key, "prev_value": prev_value} + if len(prev_attn_state) >= 3: + saved_state["prev_key_padding_mask"] = prev_attn_state[2] self.encoder_attn._set_input_buffer(incremental_state, saved_state) + x, attn = self.encoder_attn( query=x, key=encoder_out, @@ -256,7 +275,10 @@ def forward( x = self.maybe_layer_norm(self.final_layer_norm, x, after=True) if self.onnx_trace and incremental_state is not None: saved_state = self.self_attn._get_input_buffer(incremental_state) - self_attn_state = saved_state["prev_key"], saved_state["prev_value"] + if self_attn_padding_mask is not None: + self_attn_state = saved_state["prev_key"], saved_state["prev_value"], saved_state["prev_key_padding_mask"] + else: + self_attn_state = saved_state["prev_key"], saved_state["prev_value"] return x, attn, self_attn_state return x, attn diff --git a/tests/test_binaries.py b/tests/test_binaries.py index 8cede3c9fa..f77806bd6a 100644 --- a/tests/test_binaries.py +++ b/tests/test_binaries.py @@ -154,6 +154,23 @@ def test_transformer(self): ], run_validation=True) generate_main(data_dir) + def test_transformer_cross_self_attention(self): + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory('test_transformer_cross_self_attention') as data_dir: + create_dummy_data(data_dir) + preprocess_translation_data(data_dir) + train_translation_model(data_dir, 
'transformer_iwslt_de_en', [ + '--encoder-layers', '2', + '--decoder-layers', '2', + '--encoder-embed-dim', '8', + '--decoder-embed-dim', '8', + '--decoder-embed-dim', '8', + '--no-cross-attention', + '--cross-self-attention', + '--layer-wise-attention', + ], run_validation=True) + generate_main(data_dir, extra_flags=[]) + def test_lightconv(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory('test_lightconv') as data_dir: @@ -543,6 +560,10 @@ def train_translation_model(data_dir, arch, extra_flags=None, task='translation' def generate_main(data_dir, extra_flags=None): + if extra_flags is None: + extra_flags = [ + '--print-alignment', + ] generate_parser = options.get_generation_parser() generate_args = options.parse_args_and_arch( generate_parser, @@ -554,7 +575,6 @@ def generate_main(data_dir, extra_flags=None): '--max-len-b', '5', '--gen-subset', 'valid', '--no-progress-bar', - '--print-alignment', ] + (extra_flags or []), ) From 13519720f3132c38d1d8f2145b73216d42a62a67 Mon Sep 17 00:00:00 2001 From: Guntupalli Venkata Sai Kalyan Date: Sun, 29 Sep 2019 14:51:52 -0700 Subject: [PATCH 162/213] fix typo in README of examples/translation Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1200 Differential Revision: D17659658 Pulled By: myleott fbshipit-source-id: 1863e6d60a439dbb7e71e5da68817c9d53649737 --- examples/translation/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/translation/README.md b/examples/translation/README.md index b93115147a..9807a13e9d 100644 --- a/examples/translation/README.md +++ b/examples/translation/README.md @@ -236,7 +236,7 @@ cat iwslt17.test.${SRC}-en.${SRC}.bpe \ | fairseq-interactive data-bin/iwslt17.de_fr.en.bpe16k/ \ --task multilingual_translation --source-lang ${SRC} --target-lang en \ --path checkpoints/multilingual_transformer/checkpoint_best.pt \ - --buffer 2000 --batch-size 128 \ + --buffer-size 2000 --batch-size 128 \ --beam 5 --remove-bpe=sentencepiece \ > iwslt17.test.${SRC}-en.en.sys grep ^H iwslt17.test.${SRC}-en.en.sys | cut -f3 \ From acb6fba005f45e363a6da98d7ce79c36c011d473 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Mon, 30 Sep 2019 02:09:13 -0700 Subject: [PATCH 163/213] Fix torch.hub to not depend on libnat Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/878 Differential Revision: D17661768 Pulled By: myleott fbshipit-source-id: 1e4c5f09eb14c40d491ca2459fd2adb8382fb6d2 --- fairseq/models/insertion_transformer.py | 9 ++++++++- fairseq/models/levenshtein_transformer.py | 23 ++++++++++++++++++++++- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/fairseq/models/insertion_transformer.py b/fairseq/models/insertion_transformer.py index 5f5868a550..1296333b5e 100644 --- a/fairseq/models/insertion_transformer.py +++ b/fairseq/models/insertion_transformer.py @@ -6,7 +6,7 @@ import numpy as np import torch import torch.nn.functional as F -from fairseq import libnat + from fairseq.models import register_model, register_model_architecture from fairseq.models.levenshtein_transformer import ( LevenshteinTransformerDecoder, @@ -51,6 +51,13 @@ def compute_score_full(self, L, tau): def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx, vocab_size, tau=None): + try: + from fairseq import libnat + except ImportError as e: + import sys + sys.stderr.write('ERROR: missing libnat. 
run `pip install --editable .`\n') + raise e + B = in_tokens.size(0) T = in_tokens.size(1) V = vocab_size diff --git a/fairseq/models/levenshtein_transformer.py b/fairseq/models/levenshtein_transformer.py index 876bf01a0f..2e17b790ec 100644 --- a/fairseq/models/levenshtein_transformer.py +++ b/fairseq/models/levenshtein_transformer.py @@ -5,7 +5,7 @@ import torch import torch.nn.functional as F -from fairseq import libnat + from fairseq.models import register_model, register_model_architecture from fairseq.models.model_utils import fill_tensors as _fill, skip_tensors as _skip from fairseq.models.transformer import ( @@ -18,6 +18,13 @@ def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx): + try: + from fairseq import libnat + except ImportError as e: + import sys + sys.stderr.write('ERROR: missing libnat. run `pip install --editable .`\n') + raise e + in_seq_len, out_seq_len = in_tokens.size(1), out_tokens.size(1) with torch.cuda.device_of(in_tokens): @@ -60,6 +67,13 @@ def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx): def _get_del_targets(in_tokens, out_tokens, padding_idx): + try: + from fairseq import libnat + except ImportError as e: + import sys + sys.stderr.write('ERROR: missing libnat. run `pip install --editable .`\n') + raise e + out_seq_len = out_tokens.size(1) with torch.cuda.device_of(in_tokens): @@ -86,6 +100,13 @@ def _get_del_targets(in_tokens, out_tokens, padding_idx): def _get_del_ins_targets(in_tokens, out_tokens, padding_idx): + try: + from fairseq import libnat + except ImportError as e: + import sys + sys.stderr.write('ERROR: missing libnat. run `pip install --editable .`\n') + raise e + in_seq_len, out_seq_len = in_tokens.size(1), out_tokens.size(1) with torch.cuda.device_of(in_tokens): From 1c6679294848f303a361cba7b306b760e299bd9c Mon Sep 17 00:00:00 2001 From: Sarthak Garg Date: Mon, 30 Sep 2019 06:56:15 -0700 Subject: [PATCH 164/213] Implementation of the paper "Jointly Learning to Align and Translate with Transformer Models" (#877) Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/877 This PR implements guided alignment training described in "Jointly Learning to Align and Translate with Transformer Models (https://arxiv.org/abs/1909.02074)". In summary, it allows for training selected heads of the Transformer Model with external alignments computed by Statistical Alignment Toolkits. During inference, attention probabilities from the trained heads can be used to extract reliable alignments. In our work, we did not see any regressions in the translation performance because of guided alignment training. 
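
Concretely, the guided alignment idea can be sketched independently of fairseq: take the attention distribution of a supervised cross-attention head and add a loss term that pushes it toward the externally computed alignment. The snippet below is a minimal illustration with assumed tensor names and shapes, not the criterion added by this patch:

```python
# Illustration only: a guided alignment loss in the spirit of this patch.
# `attn` is the attention distribution of one supervised cross-attention head,
# shape (batch, tgt_len, src_len); `align` is a {0,1} matrix of the same shape
# derived from an external aligner (e.g. fast_align). Names/shapes are assumptions.
import torch


def guided_alignment_loss(attn, align, eps=1e-9):
    align = align.float()
    norm = align.sum(dim=-1, keepdim=True)            # number of links per target position
    has_link = norm.squeeze(-1) > 0                    # supervise only aligned positions
    target_dist = align / norm.clamp(min=1.0)          # distribution over source positions
    ce = -(target_dist * (attn + eps).log()).sum(-1)   # cross-entropy per target position
    return ce[has_link].mean()

# The full training objective would then be: label-smoothed CE + lambda * alignment loss.
```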
Pull Request resolved: https://github.com/pytorch/fairseq/pull/1095 Differential Revision: D17170337 Pulled By: myleott fbshipit-source-id: daa418bef70324d7088dbb30aa2adf9f95774859 --- README.md | 2 + .../joint_alignment_translation/README.md | 89 ++++++++++ ...t18en2de_no_norm_no_escape_no_agressive.sh | 118 +++++++++++++ fairseq/binarizer.py | 16 ++ ...l_smoothed_cross_entropy_with_alignment.py | 90 ++++++++++ fairseq/data/language_pair_dataset.py | 65 +++++++- fairseq/models/fairseq_model.py | 3 + fairseq/models/transformer.py | 155 ++++++++++++++++-- fairseq/modules/multihead_attention.py | 50 ++++-- fairseq/modules/transformer_layer.py | 16 +- fairseq/options.py | 2 + fairseq/sequence_generator.py | 142 +++++++++++++--- fairseq/sequence_scorer.py | 9 +- fairseq/tasks/fairseq_task.py | 8 +- fairseq/tasks/translation.py | 12 +- fairseq/utils.py | 45 +++++ generate.py | 5 +- interactive.py | 5 +- preprocess.py | 87 +++++++++- tests/test_binaries.py | 41 ++++- 20 files changed, 899 insertions(+), 61 deletions(-) create mode 100644 examples/joint_alignment_translation/README.md create mode 100755 examples/joint_alignment_translation/prepare-wmt18en2de_no_norm_no_escape_no_agressive.sh create mode 100644 fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py diff --git a/README.md b/README.md index c39ff22c97..e05af20c6c 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ Fairseq provides reference implementations of various sequence-to-sequence model - [Mixture Models for Diverse Machine Translation: Tricks of the Trade (Shen et al., 2019)](examples/translation_moe/README.md) - [RoBERTa: A Robustly Optimized BERT Pretraining Approach (Liu et al., 2019)](examples/roberta/README.md) - [Facebook FAIR's WMT19 News Translation Task Submission (Ng et al., 2019)](examples/wmt19/README.md) + - [Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019)](examples/joint_alignment_translation/README.md ) - **Non-autoregressive Transformers** - Non-Autoregressive Neural Machine Translation (Gu et al., 2017) - Deterministic Non-Autoregressive Neural Sequence Modeling by Iterative Refinement (Lee et al. 2018) @@ -100,6 +101,7 @@ as well as example training and evaluation commands. - [Language Modeling](examples/language_model/README.md): convolutional and transformer models are available We also have more detailed READMEs to reproduce results from specific papers: +- [Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019)](examples/joint_alignment_translation/README.md ) - [Levenshtein Transformer (Gu et al., 2019)](examples/nonautoregressive_translation/README.md) - [Facebook FAIR's WMT19 News Translation Task Submission (Ng et al., 2019)](examples/wmt19/README.md) - [RoBERTa: A Robustly Optimized BERT Pretraining Approach (Liu et al., 2019)](examples/roberta/README.md) diff --git a/examples/joint_alignment_translation/README.md b/examples/joint_alignment_translation/README.md new file mode 100644 index 0000000000..cd9c0ea65f --- /dev/null +++ b/examples/joint_alignment_translation/README.md @@ -0,0 +1,89 @@ +# Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019) + +This page includes instructions for training models described in [Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019)](https://arxiv.org/abs/1909.02074). + +## Training a joint alignment-translation model on WMT'18 En-De + +##### 1. 
Extract and preprocess the WMT'18 En-De data +```bash +./prepare-wmt18en2de_no_norm_no_escape_no_agressive.sh +``` + +##### 2. Generate alignments from statistical alignment toolkits e.g. Giza++/FastAlign. +In this example, we use FastAlign. +```bash +git clone git@github.com:clab/fast_align.git +pushd fast_align +mkdir build +cd build +cmake .. +make +popd +ALIGN=fast_align/build/fast_align +paste bpe.32k/train.en bpe.32k/train.de | awk -F '\t' '{print $1 " ||| " $2}' > bpe.32k/train.en-de +$ALIGN -i bpe.32k/train.en-de -d -o -v > bpe.32k/train.align +``` + +##### 3. Preprocess the dataset with the above generated alignments. +```bash +fairseq-preprocess \ + --source-lang en --target-lang de \ + --trainpref bpe.32k/train \ + --validpref bpe.32k/valid \ + --testpref bpe.32k/test \ + --align-suffix align \ + --destdir binarized/ \ + --joined-dictionary \ + --workers 32 +``` + +##### 4. Train a model +```bash +fairseq-train \ + binarized \ + --arch transformer_wmt_en_de_big_align --share-all-embeddings \ + --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --activation-fn relu\ + --lr 0.0002 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \ + --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \ + --max-tokens 3500 --label-smoothing 0.1 \ + --save-dir ./checkpoints --log-interval 1000 --max-update 60000 \ + --keep-interval-updates -1 --save-interval-updates 0 \ + --load-alignments --criterion label_smoothed_cross_entropy_with_alignment \ + --fp16 +``` + +Note that the `--fp16` flag requires you have CUDA 9.1 or greater and a Volta GPU or newer. + +If you want to train the above model with big batches (assuming your machine has 8 GPUs): +- add `--update-freq 8` to simulate training on 8x8=64 GPUs +- increase the learning rate; 0.0007 works well for big batches + +##### 5. Evaluate and generate the alignments (BPE level) +```bash +fairseq-generate \ + binarized --gen-subset test --print-alignment \ + --source-lang en --target-lang de \ + --path checkpoints/checkpoint_best.pt --beam 5 --nbest 1 +``` + +##### 6. Other resources. +The code for: +1. preparing alignment test sets +2. converting BPE level alignments to token level alignments +3. symmetrizing bidirectional alignments +4. evaluating alignments using AER metric +can be found [here](https://github.com/lilt/alignment-scripts) + +## Citation + +```bibtex +@inproceedings{garg2019jointly, + title = {Jointly Learning to Align and Translate with Transformer Models}, + author = {Garg, Sarthak and Peitz, Stephan and Nallasamy, Udhyakumar and Paulik, Matthias}, + booktitle = {Conference on Empirical Methods in Natural Language Processing (EMNLP)}, + address = {Hong Kong}, + month = {November}, + url = {https://arxiv.org/abs/1909.02074}, + year = {2019}, +} +``` diff --git a/examples/joint_alignment_translation/prepare-wmt18en2de_no_norm_no_escape_no_agressive.sh b/examples/joint_alignment_translation/prepare-wmt18en2de_no_norm_no_escape_no_agressive.sh new file mode 100755 index 0000000000..e78ed66a15 --- /dev/null +++ b/examples/joint_alignment_translation/prepare-wmt18en2de_no_norm_no_escape_no_agressive.sh @@ -0,0 +1,118 @@ +#!/bin/bash + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +echo 'Cloning Moses github repository (for tokenization scripts)...' 
+git clone https://github.com/moses-smt/mosesdecoder.git + +SCRIPTS=mosesdecoder/scripts +TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl +CLEAN=$SCRIPTS/training/clean-corpus-n.perl +REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl + +URLS=( + "http://statmt.org/wmt13/training-parallel-europarl-v7.tgz" + "http://statmt.org/wmt13/training-parallel-commoncrawl.tgz" + "http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz" + "http://data.statmt.org/wmt18/translation-task/rapid2016.tgz" + "http://data.statmt.org/wmt17/translation-task/dev.tgz" + "http://statmt.org/wmt14/test-full.tgz" +) +CORPORA=( + "training/europarl-v7.de-en" + "commoncrawl.de-en" + "training-parallel-nc-v13/news-commentary-v13.de-en" + "rapid2016.de-en" +) + +if [ ! -d "$SCRIPTS" ]; then + echo "Please set SCRIPTS variable correctly to point to Moses scripts." + exit +fi + +src=en +tgt=de +lang=en-de +prep=wmt18_en_de +tmp=$prep/tmp +orig=orig +dev=dev/newstest2012 +codes=32000 +bpe=bpe.32k + +mkdir -p $orig $tmp $prep $bpe + +cd $orig + +for ((i=0;i<${#URLS[@]};++i)); do + url=${URLS[i]} + file=$(basename $url) + if [ -f $file ]; then + echo "$file already exists, skipping download" + else + wget "$url" + if [ -f $file ]; then + echo "$url successfully downloaded." + else + echo "$url not successfully downloaded." + exit 1 + fi + if [ ${file: -4} == ".tgz" ]; then + tar zxvf $file + elif [ ${file: -4} == ".tar" ]; then + tar xvf $file + fi + fi +done +cd .. + +echo "pre-processing train data..." +for l in $src $tgt; do + rm -rf $tmp/train.tags.$lang.tok.$l + for f in "${CORPORA[@]}"; do + cat $orig/$f.$l | \ + perl $REM_NON_PRINT_CHAR | \ + perl $TOKENIZER -threads 8 -l $l -no-escape >> $tmp/train.tags.$lang.tok.$l + done +done + +echo "pre-processing test data..." +for l in $src $tgt; do + if [ "$l" == "$src" ]; then + t="src" + else + t="ref" + fi + grep '\s*//g' | \ + sed -e 's/\s*<\/seg>\s*//g' | \ + sed -e "s/\’/\'/g" | \ + perl $TOKENIZER -threads 8 -l $l -no-escape > $tmp/test.$l + echo "" +done + +# apply length filtering before BPE +perl $CLEAN -ratio 1.5 $tmp/train.tags.$lang.tok $src $tgt $tmp/train 1 100 + +# use newstest2012 for valid +echo "pre-processing valid data..." 
+for l in $src $tgt; do + rm -rf $tmp/valid.$l + cat $orig/$dev.$l | \ + perl $REM_NON_PRINT_CHAR | \ + perl $TOKENIZER -threads 8 -l $l -no-escape >> $tmp/valid.$l +done + +mkdir output +mv $tmp/{train,valid,test}.{$src,$tgt} output + +#BPE +git clone git@github.com:glample/fastBPE.git +pushd fastBPE +g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast +popd +fastBPE/fast learnbpe $codes output/train.$src output/train.$tgt > $bpe/codes +for split in {train,valid,test}; do for lang in {en,de}; do fastBPE/fast applybpe $bpe/$split.$lang output/$split.$lang $bpe/codes; done; done diff --git a/fairseq/binarizer.py b/fairseq/binarizer.py index 44dcb256c4..744c5e3fc8 100644 --- a/fairseq/binarizer.py +++ b/fairseq/binarizer.py @@ -52,6 +52,22 @@ def replaced_consumer(word, idx): line = f.readline() return {'nseq': nseq, 'nunk': sum(replaced.values()), 'ntok': ntok, 'replaced': replaced} + @staticmethod + def binarize_alignments(filename, alignment_parser, consumer, offset=0, end=-1): + nseq = 0 + + with open(filename, 'r') as f: + f.seek(offset) + line = safe_readline(f) + while line: + if end > 0 and f.tell() > end: + break + ids = alignment_parser(line) + nseq += 1 + consumer(ids) + line = f.readline() + return {'nseq': nseq} + @staticmethod def find_offsets(filename, num_chunks): with open(filename, 'r', encoding='utf-8') as f: diff --git a/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py b/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py new file mode 100644 index 0000000000..2cb5621498 --- /dev/null +++ b/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py @@ -0,0 +1,90 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math + +from fairseq import utils + +from .label_smoothed_cross_entropy import LabelSmoothedCrossEntropyCriterion +from . import register_criterion + + +@register_criterion('label_smoothed_cross_entropy_with_alignment') +class LabelSmoothedCrossEntropyCriterionWithAlignment(LabelSmoothedCrossEntropyCriterion): + + def __init__(self, args, task): + super().__init__(args, task) + self.alignment_lambda = args.alignment_lambda + + @staticmethod + def add_args(parser): + """Add criterion-specific arguments to the parser.""" + super(LabelSmoothedCrossEntropyCriterionWithAlignment, + LabelSmoothedCrossEntropyCriterionWithAlignment).add_args(parser) + parser.add_argument('--alignment-lambda', default=0.05, type=float, metavar='D', + help='weight for the alignment loss') + + def forward(self, model, sample, reduce=True): + """Compute the loss for the given sample. + + Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + net_output = model(**sample['net_input']) + loss, nll_loss = self.compute_loss(model, net_output, sample, reduce=reduce) + sample_size = sample['target'].size(0) if self.args.sentence_avg else sample['ntokens'] + logging_output = { + 'loss': utils.item(loss.data) if reduce else loss.data, + 'nll_loss': utils.item(nll_loss.data) if reduce else nll_loss.data, + 'ntokens': sample['ntokens'], + 'nsentences': sample['target'].size(0), + 'sample_size': sample_size, + } + + alignment_loss = None + + # Compute alignment loss only for training set and non dummy batches. 
+ if 'alignments' in sample and sample['alignments'] is not None: + alignment_loss = self.compute_alignment_loss(sample, net_output) + + if alignment_loss is not None: + logging_output['alignment_loss'] = utils.item(alignment_loss.data) + loss += self.alignment_lambda * alignment_loss + + return loss, sample_size, logging_output + + def compute_alignment_loss(self, sample, net_output): + attn_prob = net_output[1]['attn'] + bsz, tgt_sz, src_sz = attn_prob.shape + attn = attn_prob.view(bsz * tgt_sz, src_sz) + + align = sample['alignments'] + align_weights = sample['align_weights'].float() + + if len(align) > 0: + # Alignment loss computation. align (shape [:, 2]) contains the src-tgt index pairs corresponding to + # the alignments. align_weights (shape [:]) contains the 1 / frequency of a tgt index for normalizing. + loss = -((attn[align[:, 1][:, None], align[:, 0][:, None]]).log() * align_weights[:, None]).sum() + else: + return None + + return loss + + @staticmethod + def aggregate_logging_outputs(logging_outputs): + """Aggregate logging outputs from data parallel training.""" + ntokens = sum(log.get('ntokens', 0) for log in logging_outputs) + nsentences = sum(log.get('nsentences', 0) for log in logging_outputs) + sample_size = sum(log.get('sample_size', 0) for log in logging_outputs) + return { + 'loss': sum(log.get('loss', 0) for log in logging_outputs) / sample_size / math.log(2) if sample_size > 0 else 0., + 'nll_loss': sum(log.get('nll_loss', 0) for log in logging_outputs) / ntokens / math.log(2) if ntokens > 0 else 0., + 'alignment_loss': sum(log.get('alignment_loss', 0) for log in logging_outputs) / sample_size / math.log(2) if sample_size > 0 else 0., + 'ntokens': ntokens, + 'nsentences': nsentences, + 'sample_size': sample_size, + } diff --git a/fairseq/data/language_pair_dataset.py b/fairseq/data/language_pair_dataset.py index 5fc1371aae..09c7193ab4 100644 --- a/fairseq/data/language_pair_dataset.py +++ b/fairseq/data/language_pair_dataset.py @@ -22,6 +22,28 @@ def merge(key, left_pad, move_eos_to_beginning=False): pad_idx, eos_idx, left_pad, move_eos_to_beginning, ) + def check_alignment(alignment, src_len, tgt_len): + if alignment is None or len(alignment) == 0: + return False + if alignment[:, 0].max().item() >= src_len - 1 or alignment[:, 1].max().item() >= tgt_len - 1: + print("| alignment size mismatch found, skipping alignment!") + return False + return True + + def compute_alignment_weights(alignments): + """ + Given a tensor of shape [:, 2] containing the source-target indices + corresponding to the alignments, a weight vector containing the + inverse frequency of each target index is computed. + For e.g. if alignments = [[5, 7], [2, 3], [1, 3], [4, 2]], then + a tensor containing [1., 0.5, 0.5, 1] should be returned (since target + index 3 is repeated twice) + """ + align_tgt = alignments[:, 1] + _, align_tgt_i, align_tgt_c = torch.unique(align_tgt, return_inverse=True, return_counts=True) + align_weights = align_tgt_c[align_tgt_i[np.arange(len(align_tgt))]] + return 1. 
/ align_weights.float() + id = torch.LongTensor([s['id'] for s in samples]) src_tokens = merge('source', left_pad=left_pad_source) # sort by descending source length @@ -35,6 +57,7 @@ def merge(key, left_pad, move_eos_to_beginning=False): if samples[0].get('target', None) is not None: target = merge('target', left_pad=left_pad_target) target = target.index_select(0, sort_order) + tgt_lengths = torch.LongTensor([s['target'].numel() for s in samples]).index_select(0, sort_order) ntokens = sum(len(s['target']) for s in samples) if input_feeding: @@ -61,6 +84,32 @@ def merge(key, left_pad, move_eos_to_beginning=False): } if prev_output_tokens is not None: batch['net_input']['prev_output_tokens'] = prev_output_tokens + + if samples[0].get('alignment', None) is not None: + bsz, tgt_sz = batch['target'].shape + src_sz = batch['net_input']['src_tokens'].shape[1] + + offsets = torch.zeros((len(sort_order), 2), dtype=torch.long) + offsets[:, 1] += (torch.arange(len(sort_order), dtype=torch.long) * tgt_sz) + if left_pad_source: + offsets[:, 0] += (src_sz - src_lengths) + if left_pad_target: + offsets[:, 1] += (tgt_sz - tgt_lengths) + + alignments = [ + alignment + offset + for align_idx, offset, src_len, tgt_len in zip(sort_order, offsets, src_lengths, tgt_lengths) + for alignment in [samples[align_idx]['alignment'].view(-1, 2)] + if check_alignment(alignment, src_len, tgt_len) + ] + + if len(alignments) > 0: + alignments = torch.cat(alignments, dim=0) + align_weights = compute_alignment_weights(alignments) + + batch['alignments'] = alignments + batch['align_weights'] = align_weights + return batch @@ -91,6 +140,8 @@ class LanguagePairDataset(FairseqDataset): of source if it's present (default: False). append_eos_to_target (bool, optional): if set, appends eos to end of target if it's absent (default: False). + align_dataset (torch.utils.data.Dataset, optional): dataset + containing alignments. 
""" def __init__( @@ -98,7 +149,9 @@ def __init__( tgt=None, tgt_sizes=None, tgt_dict=None, left_pad_source=True, left_pad_target=False, max_source_positions=1024, max_target_positions=1024, - shuffle=True, input_feeding=True, remove_eos_from_source=False, append_eos_to_target=False, + shuffle=True, input_feeding=True, + remove_eos_from_source=False, append_eos_to_target=False, + align_dataset=None, ): if tgt_dict is not None: assert src_dict.pad() == tgt_dict.pad() @@ -118,6 +171,9 @@ def __init__( self.input_feeding = input_feeding self.remove_eos_from_source = remove_eos_from_source self.append_eos_to_target = append_eos_to_target + self.align_dataset = align_dataset + if self.align_dataset is not None: + assert self.tgt_sizes is not None, "Both source and target needed when alignments are provided" def __getitem__(self, index): tgt_item = self.tgt[index] if self.tgt is not None else None @@ -136,11 +192,14 @@ def __getitem__(self, index): if self.src[index][-1] == eos: src_item = self.src[index][:-1] - return { + example = { 'id': index, 'source': src_item, 'target': tgt_item, } + if self.align_dataset is not None: + example['alignment'] = self.align_dataset[index] + return example def __len__(self): return len(self.src) @@ -212,3 +271,5 @@ def prefetch(self, indices): self.src.prefetch(indices) if self.tgt is not None: self.tgt.prefetch(indices) + if self.align_dataset is not None: + self.align_dataset.prefetch(indices) diff --git a/fairseq/models/fairseq_model.py b/fairseq/models/fairseq_model.py index fc53a7c9d7..674de01310 100644 --- a/fairseq/models/fairseq_model.py +++ b/fairseq/models/fairseq_model.py @@ -222,6 +222,9 @@ def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs): decoder_out = self.decoder(prev_output_tokens, encoder_out=encoder_out, **kwargs) return decoder_out + def forward_decoder(self, prev_output_tokens, **kwargs): + return self.decoder(prev_output_tokens, **kwargs) + def extract_features(self, src_tokens, src_lengths, prev_output_tokens, **kwargs): """ Similar to *forward* but only return features. diff --git a/fairseq/models/transformer.py b/fairseq/models/transformer.py index 910c2eda09..f5f23f1b95 100644 --- a/fairseq/models/transformer.py +++ b/fairseq/models/transformer.py @@ -68,6 +68,7 @@ def hub_models(cls): def __init__(self, encoder, decoder): super().__init__(encoder, decoder) + self.supports_align_args = True @staticmethod def add_args(parser): @@ -195,6 +196,69 @@ def build_decoder(cls, args, tgt_dict, embed_tokens): ) +@register_model('transformer_align') +class TransformerAlignModel(TransformerModel): + """ + See "Jointly Learning to Align and Translate with Transformer + Models" (Garg et al., EMNLP 2019). + """ + + def __init__(self, encoder, decoder, args): + super().__init__(encoder, decoder) + self.alignment_heads = args.alignment_heads + self.alignment_layer = args.alignment_layer + self.full_context_alignment = args.full_context_alignment + + @staticmethod + def add_args(parser): + # fmt: off + super(TransformerAlignModel, TransformerAlignModel).add_args(parser) + parser.add_argument('--alignment-heads', type=int, metavar='D', + help='Number of cross attention heads per layer to supervised with alignments') + parser.add_argument('--alignment-layer', type=int, metavar='D', + help='Layer number which has to be supervised. 
0 corresponding to the bottommost layer.') + parser.add_argument('--full-context-alignment', type=bool, metavar='D', + help='Whether or not alignment is supervised conditioned on the full target context.') + # fmt: on + + @classmethod + def build_model(cls, args, task): + # set any default arguments + transformer_align(args) + + transformer_model = TransformerModel.build_model(args, task) + return TransformerAlignModel(transformer_model.encoder, transformer_model.decoder, args) + + def forward(self, src_tokens, src_lengths, prev_output_tokens): + encoder_out = self.encoder(src_tokens, src_lengths) + return self.forward_decoder(prev_output_tokens, encoder_out) + + def forward_decoder( + self, + prev_output_tokens, + encoder_out=None, + incremental_state=None, + features_only=False, + **extra_args, + ): + attn_args = {'alignment_layer': self.alignment_layer, 'alignment_heads': self.alignment_heads} + decoder_out = self.decoder( + prev_output_tokens, + encoder_out, + **attn_args, + **extra_args, + ) + + if self.full_context_alignment: + attn_args['full_context_alignment'] = self.full_context_alignment + _, alignment_out = self.decoder( + prev_output_tokens, encoder_out, features_only=True, **attn_args, **extra_args, + ) + decoder_out[1]['attn'] = alignment_out['attn'] + + return decoder_out + + class TransformerEncoder(FairseqEncoder): """ Transformer encoder consisting of *args.encoder_layers* layers. Each layer @@ -423,7 +487,14 @@ def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): else: self.layer_norm = None - def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused): + def forward( + self, + prev_output_tokens, + encoder_out=None, + incremental_state=None, + features_only=False, + **extra_args, + ): """ Args: prev_output_tokens (LongTensor): previous decoder outputs of shape @@ -432,25 +503,53 @@ def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None, encoder-side attention incremental_state (dict): dictionary used for storing state during :ref:`Incremental decoding` + features_only (bool, optional): only return features without + applying output layer (default: False). Returns: tuple: - the decoder's output of shape `(batch, tgt_len, vocab)` - a dictionary with any model-specific outputs """ - x, extra = self.extract_features(prev_output_tokens, encoder_out, incremental_state) - x = self.output_layer(x) + x, extra = self.extract_features( + prev_output_tokens, encoder_out, incremental_state, **extra_args, + ) + if not features_only: + x = self.output_layer(x) return x, extra - def extract_features(self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused): + def extract_features( + self, + prev_output_tokens, + encoder_out=None, + incremental_state=None, + full_context_alignment=False, + alignment_layer=None, + alignment_heads=None, + **unused, + ): """ Similar to *forward* but only return features. + Includes several features from "Jointly Learning to Align and + Translate with Transformer Models" (Garg et al., EMNLP 2019). + + Args: + full_context_alignment (bool, optional): don't apply + auto-regressive mask to self-attention (default: False). + alignment_layer (int, optional): return mean alignment over + heads at this layer (default: last layer). + alignment_heads (int, optional): only average alignment over + this many heads (default: all heads). 
+ Returns: tuple: - the decoder's features of shape `(batch, tgt_len, embed_dim)` - a dictionary with any model-specific outputs """ + if alignment_layer is None: + alignment_layer = len(self.layers) - 1 + # embed positions positions = self.embed_positions( prev_output_tokens, @@ -474,15 +573,14 @@ def extract_features(self, prev_output_tokens, encoder_out=None, incremental_sta # B x T x C -> T x B x C x = x.transpose(0, 1) - attn = None - - inner_states = [x] self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) if not self_attn_padding_mask.any() and not self.cross_self_attention: self_attn_padding_mask = None # decoder layers + attn = None + inner_states = [x] for idx, layer in enumerate(self.layers): encoder_state = None if encoder_out is not None: @@ -491,15 +589,32 @@ def extract_features(self, prev_output_tokens, encoder_out=None, incremental_sta else: encoder_state = encoder_out['encoder_out'] - x, attn = layer( + if incremental_state is None and not full_context_alignment: + self_attn_mask = self.buffered_future_mask(x) + else: + self_attn_mask = None + + x, layer_attn = layer( x, encoder_state, encoder_out['encoder_padding_mask'] if encoder_out is not None else None, incremental_state, - self_attn_mask=self.buffered_future_mask(x) if incremental_state is None else None, + self_attn_mask=self_attn_mask, self_attn_padding_mask=self_attn_padding_mask, + need_attn=(idx == alignment_layer), + need_head_weights=(idx == alignment_layer), ) + inner_states.append(x) + if layer_attn is not None and idx == alignment_layer: + attn = layer_attn.float() + + if attn is not None: + if alignment_heads is not None: + attn = attn[:alignment_heads] + + # average probabilities over heads + attn = attn.mean(dim=0) if self.layer_norm: x = self.layer_norm(x) @@ -531,7 +646,12 @@ def max_positions(self): def buffered_future_mask(self, tensor): dim = tensor.size(0) - if not hasattr(self, '_future_mask') or self._future_mask is None or self._future_mask.device != tensor.device or self._future_mask.size(0) < dim: + if ( + not hasattr(self, '_future_mask') + or self._future_mask is None + or self._future_mask.device != tensor.device + or self._future_mask.size(0) < dim + ): self._future_mask = torch.triu(utils.fill_with_neg_inf(tensor.new(dim, dim)), 1) return self._future_mask[:dim, :dim] @@ -668,3 +788,18 @@ def transformer_wmt_en_de_big_t2t(args): args.attention_dropout = getattr(args, 'attention_dropout', 0.1) args.activation_dropout = getattr(args, 'activation_dropout', 0.1) transformer_vaswani_wmt_en_de_big(args) + + +@register_model_architecture('transformer_align', 'transformer_align') +def transformer_align(args): + args.alignment_heads = getattr(args, 'alignment_heads', 1) + args.alignment_layer = getattr(args, 'alignment_layer', 4) + args.full_context_alignment = getattr(args, 'full_context_alignment', False) + base_architecture(args) + + +@register_model_architecture('transformer_align', 'transformer_wmt_en_de_big_align') +def transformer_wmt_en_de_big_align(args): + args.alignment_heads = getattr(args, 'alignment_heads', 1) + args.alignment_layer = getattr(args, 'alignment_layer', 4) + transformer_wmt_en_de_big(args) diff --git a/fairseq/modules/multihead_attention.py b/fairseq/modules/multihead_attention.py index 9aaea82484..96849790f6 100644 --- a/fairseq/modules/multihead_attention.py +++ b/fairseq/modules/multihead_attention.py @@ -90,15 +90,37 @@ def reset_parameters(self): if self.bias_v is not None: nn.init.xavier_normal_(self.bias_v) - def forward(self, query, key, value, 
key_padding_mask=None, incremental_state=None, - need_weights=True, static_kv=False, attn_mask=None, before_softmax=False): + def forward( + self, + query, key, value, + key_padding_mask=None, + incremental_state=None, + need_weights=True, + static_kv=False, + attn_mask=None, + before_softmax=False, + need_head_weights=False, + ): """Input shape: Time x Batch x Channel - Timesteps can be masked by supplying a T x T mask in the - `attn_mask` argument. Padding elements can be excluded from - the key by passing a binary ByteTensor (`key_padding_mask`) with shape: - batch x src_len, where padding elements are indicated by 1s. + Args: + key_padding_mask (ByteTensor, optional): mask to exclude + keys that are pads, of shape `(batch, src_len)`, where + padding elements are indicated by 1s. + need_weights (bool, optional): return the attention weights, + averaged over heads (default: False). + attn_mask (ByteTensor, optional): typically used to + implement causal attention, where the mask prevents the + attention from looking forward in time (default: None). + before_softmax (bool, optional): return the raw attention + weights and values before the attention softmax. + need_head_weights (bool, optional): return the attention + weights for each head. Implies *need_weights*. Default: + return the average attention weights over all heads. """ + if need_head_weights: + need_weights = True + tgt_len, bsz, embed_dim = query.size() assert embed_dim == self.embed_dim assert list(query.size()) == [tgt_len, bsz, embed_dim] @@ -249,12 +271,11 @@ def forward(self, query, key, value, key_padding_mask=None, incremental_state=No if before_softmax: return attn_weights, v - attn_weights = utils.softmax( - attn_weights, dim=-1, onnx_trace=self.onnx_trace, - ).type_as(attn_weights) - attn_weights = F.dropout(attn_weights, p=self.dropout, training=self.training) + attn_weights_float = utils.softmax(attn_weights, dim=-1, onnx_trace=self.onnx_trace) + attn_weights = attn_weights_float.type_as(attn_weights) + attn_probs = F.dropout(attn_weights_float.type_as(attn_weights), p=self.dropout, training=self.training) - attn = torch.bmm(attn_weights, v) + attn = torch.bmm(attn_probs, v) assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim] if (self.onnx_trace and attn.size(1) == 1): # when ONNX tracing a single decoder step (sequence length == 1) @@ -265,9 +286,10 @@ def forward(self, query, key, value, key_padding_mask=None, incremental_state=No attn = self.out_proj(attn) if need_weights: - # average attention weights over heads - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights.sum(dim=1) / self.num_heads + attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0) + if not need_head_weights: + # average attention weights over heads + attn_weights = attn_weights.mean(dim=0) else: attn_weights = None diff --git a/fairseq/modules/transformer_layer.py b/fairseq/modules/transformer_layer.py index 63c6cdf552..a3579fb990 100644 --- a/fairseq/modules/transformer_layer.py +++ b/fairseq/modules/transformer_layer.py @@ -195,16 +195,25 @@ def forward( prev_attn_state=None, self_attn_mask=None, self_attn_padding_mask=None, + need_attn=False, + need_head_weights=False, ): """ Args: x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` - encoder_padding_mask (ByteTensor): binary ByteTensor of shape - `(batch, src_len)` where padding elements are indicated by ``1``. 
+ encoder_padding_mask (ByteTensor, optional): binary + ByteTensor of shape `(batch, src_len)` where padding + elements are indicated by ``1``. + need_attn (bool, optional): return attention weights + need_head_weights (bool, optional): return attention weights + for each head (default: return average over heads). Returns: encoded output of shape `(seq_len, batch, embed_dim)` """ + if need_head_weights: + need_attn = True + residual = x x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True) if prev_self_attn_state is not None: @@ -259,7 +268,8 @@ def forward( key_padding_mask=encoder_padding_mask, incremental_state=incremental_state, static_kv=True, - need_weights=(not self.training and self.need_attn), + need_weights=need_attn or (not self.training and self.need_attn), + need_head_weights=need_head_weights, ) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x diff --git a/fairseq/options.py b/fairseq/options.py index bb1e27aeb7..06a52b62ba 100644 --- a/fairseq/options.py +++ b/fairseq/options.py @@ -224,6 +224,8 @@ def add_preprocess_args(parser): help="comma separated, valid file prefixes") group.add_argument("--testpref", metavar="FP", default=None, help="comma separated, test file prefixes") + group.add_argument("--align-suffix", metavar="FP", default=None, + help="alignment file suffix") group.add_argument("--destdir", metavar="DIR", default="data-bin", help="destination dir") group.add_argument("--thresholdtgt", metavar="N", default=0, type=int, diff --git a/fairseq/sequence_generator.py b/fairseq/sequence_generator.py index 3b100b9615..dd3fb86f7b 100644 --- a/fairseq/sequence_generator.py +++ b/fairseq/sequence_generator.py @@ -7,7 +7,8 @@ import torch -from fairseq import search +from fairseq import search, utils +from fairseq.data import data_utils from fairseq.models import FairseqIncrementalDecoder @@ -81,7 +82,6 @@ def __init__( self.temperature = temperature self.match_source_len = match_source_len self.no_repeat_ngram_size = no_repeat_ngram_size - assert sampling_topk < 0 or sampling, '--sampling-topk requires --sampling' assert sampling_topp < 0 or sampling, '--sampling-topp requires --sampling' assert temperature > 0, '--temperature must be greater than 0' @@ -98,14 +98,7 @@ def __init__( self.search = search.BeamSearch(tgt_dict) @torch.no_grad() - def generate( - self, - models, - sample, - prefix_tokens=None, - bos_token=None, - **kwargs - ): + def generate(self, models, sample, **kwargs): """Generate a batch of translations. Args: @@ -113,8 +106,21 @@ def generate( sample (dict): batch prefix_tokens (torch.LongTensor, optional): force decoder to begin with these tokens + bos_token (int, optional): beginning of sentence token + (default: self.eos) """ model = EnsembleModel(models) + return self._generate(model, sample, **kwargs) + + @torch.no_grad() + def _generate( + self, + model, + sample, + prefix_tokens=None, + bos_token=None, + **kwargs + ): if not self.retain_dropout: model.eval() @@ -155,7 +161,6 @@ def generate( tokens_buf = tokens.clone() tokens[:, 0] = self.eos if bos_token is None else bos_token attn, attn_buf = None, None - nonpad_idxs = None # The blacklist indicates candidates that should be ignored. 
# For example, suppose we're sampling and have already finalized 2/5 @@ -251,17 +256,15 @@ def get_hypo(): if attn_clone is not None: # remove padding tokens from attn scores - hypo_attn = attn_clone[i][nonpad_idxs[sent]] - _, alignment = hypo_attn.max(dim=0) + hypo_attn = attn_clone[i] else: hypo_attn = None - alignment = None return { 'tokens': tokens_clone[i], 'score': score, 'attention': hypo_attn, # src_len x tgt_len - 'alignment': alignment, + 'alignment': None, 'positional_scores': pos_scores[i], } @@ -345,7 +348,6 @@ def replicate_first_beam(tensor, mask): if attn is None: attn = scores.new(bsz * beam_size, src_tokens.size(1), max_len + 2) attn_buf = attn.clone() - nonpad_idxs = src_tokens.ne(self.pad) attn[:, :, step + 1].copy_(avg_attn_scores) scores = scores.type_as(lprobs) @@ -512,7 +514,6 @@ def calculate_banned_tokens(bbsz_idx): # sort by score descending for sent in range(len(finalized)): finalized[sent] = sorted(finalized[sent], key=lambda r: r['score'], reverse=True) - return finalized @@ -577,9 +578,11 @@ def _decode_one( temperature=1., ): if self.incremental_states is not None: - decoder_out = list(model.decoder(tokens, encoder_out, incremental_state=self.incremental_states[model])) + decoder_out = list(model.forward_decoder( + tokens, encoder_out=encoder_out, incremental_state=self.incremental_states[model], + )) else: - decoder_out = list(model.decoder(tokens, encoder_out)) + decoder_out = list(model.forward_decoder(tokens, encoder_out=encoder_out)) decoder_out[0] = decoder_out[0][:, -1:, :] if temperature != 1.: decoder_out[0].div_(temperature) @@ -605,3 +608,104 @@ def reorder_incremental_state(self, new_order): return for model in self.models: model.decoder.reorder_incremental_state(self.incremental_states[model], new_order) + + +class SequenceGeneratorWithAlignment(SequenceGenerator): + + def __init__(self, tgt_dict, left_pad_target=False, **kwargs): + """Generates translations of a given source sentence. + + Produces alignments following "Jointly Learning to Align and + Translate with Transformer Models" (Garg et al., EMNLP 2019). + + Args: + left_pad_target (bool, optional): Whether or not the + hypothesis should be left padded or not when they are + teacher forced for generating alignments. + """ + super().__init__(tgt_dict, **kwargs) + self.left_pad_target = left_pad_target + + @torch.no_grad() + def generate(self, models, sample, **kwargs): + model = EnsembleModelWithAlignment(models) + finalized = super()._generate(model, sample, **kwargs) + + src_tokens = sample['net_input']['src_tokens'] + bsz = src_tokens.shape[0] + beam_size = self.beam_size + src_tokens, src_lengths, prev_output_tokens, tgt_tokens = \ + self._prepare_batch_for_alignment(sample, finalized) + if any(getattr(m, 'full_context_alignment', False) for m in model.models): + attn = model.forward_align(src_tokens, src_lengths, prev_output_tokens) + else: + attn = [ + finalized[i // beam_size][i % beam_size]['attention'].transpose(1, 0) + for i in range(bsz * beam_size) + ] + + # Process the attn matrix to extract hard alignments. 
+ for i in range(bsz * beam_size): + alignment = utils.extract_hard_alignment(attn[i], src_tokens[i], tgt_tokens[i], self.pad, self.eos) + finalized[i // beam_size][i % beam_size]['alignment'] = alignment + return finalized + + def _prepare_batch_for_alignment(self, sample, hypothesis): + src_tokens = sample['net_input']['src_tokens'] + bsz = src_tokens.shape[0] + src_tokens = src_tokens[:, None, :].expand(-1, self.beam_size, -1).contiguous().view(bsz * self.beam_size, -1) + src_lengths = sample['net_input']['src_lengths'] + src_lengths = src_lengths[:, None].expand(-1, self.beam_size).contiguous().view(bsz * self.beam_size) + prev_output_tokens = data_utils.collate_tokens( + [beam['tokens'] for example in hypothesis for beam in example], + self.pad, self.eos, self.left_pad_target, move_eos_to_beginning=True, + ) + tgt_tokens = data_utils.collate_tokens( + [beam['tokens'] for example in hypothesis for beam in example], + self.pad, self.eos, self.left_pad_target, move_eos_to_beginning=False, + ) + return src_tokens, src_lengths, prev_output_tokens, tgt_tokens + + +class EnsembleModelWithAlignment(EnsembleModel): + """A wrapper around an ensemble of models.""" + + def __init__(self, models): + super().__init__(models) + + def forward_align(self, src_tokens, src_lengths, prev_output_tokens): + avg_attn = None + for model in self.models: + decoder_out = model(src_tokens, src_lengths, prev_output_tokens) + attn = decoder_out[1]['attn'] + if avg_attn is None: + avg_attn = attn + else: + avg_attn.add_(attn) + if len(self.models) > 1: + avg_attn.div_(len(self.models)) + return avg_attn + + def _decode_one( + self, tokens, model, encoder_out, incremental_states, log_probs, + temperature=1., + ): + if self.incremental_states is not None: + decoder_out = list(model.forward_decoder( + tokens, + encoder_out=encoder_out, + incremental_state=self.incremental_states[model], + )) + else: + decoder_out = list(model.forward_decoder(tokens, encoder_out=encoder_out)) + decoder_out[0] = decoder_out[0][:, -1:, :] + if temperature != 1.: + decoder_out[0].div_(temperature) + attn = decoder_out[1] + if type(attn) is dict: + attn = attn.get('attn', None) + if attn is not None: + attn = attn[:, -1, :] + probs = model.get_normalized_probs(decoder_out, log_probs=log_probs) + probs = probs[:, -1, :] + return probs, attn diff --git a/fairseq/sequence_scorer.py b/fairseq/sequence_scorer.py index d125422340..75ff4cf051 100644 --- a/fairseq/sequence_scorer.py +++ b/fairseq/sequence_scorer.py @@ -14,6 +14,7 @@ class SequenceScorer(object): def __init__(self, tgt_dict, softmax_batch=None): self.pad = tgt_dict.pad() + self.eos = tgt_dict.eos() self.softmax_batch = softmax_batch or sys.maxsize assert self.softmax_batch > 0 @@ -44,6 +45,7 @@ def gather_target_probs(probs, target): ) return probs + orig_target = sample['target'] # compute scores for each model in the ensemble @@ -53,6 +55,8 @@ def gather_target_probs(probs, target): model.eval() decoder_out = model.forward(**net_input) attn = decoder_out[1] + if type(attn) is dict: + attn = attn.get('attn', None) batched = batch_for_softmax(decoder_out, orig_target) probs, idx = None, 0 @@ -100,8 +104,9 @@ def gather_target_probs(probs, target): avg_probs_i = avg_probs[i][start_idxs[i]:start_idxs[i] + tgt_len] score_i = avg_probs_i.sum() / tgt_len if avg_attn is not None: - avg_attn_i = avg_attn[i, start_idxs[i]:] - _, alignment = avg_attn_i.max(dim=0) + avg_attn_i = avg_attn[i] + alignment = utils.extract_hard_alignment(avg_attn_i, sample['net_input']['src_tokens'][i], + 
sample['target'][i], self.pad, self.eos) else: avg_attn_i = alignment = None hypos.append([{ diff --git a/fairseq/tasks/fairseq_task.py b/fairseq/tasks/fairseq_task.py index ba5695785d..538532b20e 100644 --- a/fairseq/tasks/fairseq_task.py +++ b/fairseq/tasks/fairseq_task.py @@ -198,8 +198,12 @@ def build_generator(self, args): from fairseq.sequence_scorer import SequenceScorer return SequenceScorer(self.target_dictionary) else: - from fairseq.sequence_generator import SequenceGenerator - return SequenceGenerator( + from fairseq.sequence_generator import SequenceGenerator, SequenceGeneratorWithAlignment + if getattr(args, 'print_alignment', False): + seq_gen_cls = SequenceGeneratorWithAlignment + else: + seq_gen_cls = SequenceGenerator + return seq_gen_cls( self.target_dictionary, beam_size=getattr(args, 'beam', 5), max_len_a=getattr(args, 'max_len_a', 0), diff --git a/fairseq/tasks/translation.py b/fairseq/tasks/translation.py index f3d60403ba..353e640bf6 100644 --- a/fairseq/tasks/translation.py +++ b/fairseq/tasks/translation.py @@ -24,7 +24,7 @@ def load_langpair_dataset( tgt, tgt_dict, combine, dataset_impl, upsample_primary, left_pad_source, left_pad_target, max_source_positions, - max_target_positions, prepend_bos=False, + max_target_positions, prepend_bos=False, load_alignments=False, ): def split_exists(split, src, tgt, lang, data_path): filename = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang)) @@ -74,6 +74,12 @@ def split_exists(split, src, tgt, lang, data_path): src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) + align_dataset = None + if load_alignments: + align_path = os.path.join(data_path, '{}.align.{}-{}'.format(split, src, tgt)) + if indexed_dataset.dataset_exists(align_path, impl=dataset_impl): + align_dataset = data_utils.load_indexed_dataset(align_path, None, dataset_impl) + return LanguagePairDataset( src_dataset, src_dataset.sizes, src_dict, tgt_dataset, tgt_dataset.sizes, tgt_dict, @@ -81,6 +87,7 @@ def split_exists(split, src, tgt, lang, data_path): left_pad_target=left_pad_target, max_source_positions=max_source_positions, max_target_positions=max_target_positions, + align_dataset=align_dataset, ) @@ -120,6 +127,8 @@ def add_args(parser): help='load the dataset lazily') parser.add_argument('--raw-text', action='store_true', help='load raw text dataset') + parser.add_argument('--load-alignments', action='store_true', + help='load the binarized alignments') parser.add_argument('--left-pad-source', default='True', type=str, metavar='BOOL', help='pad the source on the left') parser.add_argument('--left-pad-target', default='False', type=str, metavar='BOOL', @@ -193,6 +202,7 @@ def load_dataset(self, split, epoch=0, combine=False, **kwargs): left_pad_target=self.args.left_pad_target, max_source_positions=self.args.max_source_positions, max_target_positions=self.args.max_target_positions, + load_alignments=self.args.load_alignments, ) def build_dataset_for_inference(self, src_tokens, src_lengths): diff --git a/fairseq/utils.py b/fairseq/utils.py index 80ecb6d083..9dd41fbfea 100644 --- a/fairseq/utils.py +++ b/fairseq/utils.py @@ -16,6 +16,7 @@ import torch import torch.nn.functional as F +from itertools import accumulate from fairseq.modules import gelu, gelu_accurate @@ -367,3 +368,47 @@ def set_torch_seed(seed): assert isinstance(seed, int) torch.manual_seed(seed) torch.cuda.manual_seed(seed) + + +def parse_alignment(line): + """ + Parses a single line from the alingment 
file. + + Args: + line (str): String containing the alignment of the format: + - - .. + -. All indices are 0 indexed. + + Returns: + torch.IntTensor: packed alignments of shape (2 * m). + """ + alignments = line.strip().split() + parsed_alignment = torch.IntTensor(2 * len(alignments)) + for idx, alignment in enumerate(alignments): + src_idx, tgt_idx = alignment.split('-') + parsed_alignment[2 * idx] = int(src_idx) + parsed_alignment[2 * idx + 1] = int(tgt_idx) + return parsed_alignment + + +def get_token_to_word_mapping(tokens, exclude_list): + n = len(tokens) + word_start = [int(token not in exclude_list) for token in tokens] + word_idx = list(accumulate(word_start)) + token_to_word = {i: word_idx[i] for i in range(n)} + return token_to_word + + +def extract_hard_alignment(attn, src_sent, tgt_sent, pad, eos): + tgt_valid = ((tgt_sent != pad) & (tgt_sent != eos)).nonzero().squeeze(dim=-1) + src_invalid = ((src_sent == pad) | (src_sent == eos)).nonzero().squeeze(dim=-1) + src_token_to_word = get_token_to_word_mapping(src_sent, [eos, pad]) + tgt_token_to_word = get_token_to_word_mapping(tgt_sent, [eos, pad]) + alignment = [] + if len(tgt_valid) != 0 and len(src_invalid) < len(src_sent): + attn_valid = attn[tgt_valid] + attn_valid[:, src_invalid] = float('-inf') + _, src_indices = attn_valid.max(dim=1) + for tgt_idx, src_idx in zip(tgt_valid, src_indices): + alignment.append((src_token_to_word[src_idx.item()] - 1, tgt_token_to_word[tgt_idx.item()] - 1)) + return alignment diff --git a/generate.py b/generate.py index 6de1a69abd..aba611d4b0 100644 --- a/generate.py +++ b/generate.py @@ -137,7 +137,7 @@ def main(args): hypo_tokens, hypo_str, alignment = utils.post_process_prediction( hypo_tokens=hypo['tokens'].int().cpu(), src_str=src_str, - alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None, + alignment=hypo['alignment'], align_dict=align_dict, tgt_dict=tgt_dict, remove_bpe=args.remove_bpe, @@ -156,7 +156,7 @@ def main(args): if args.print_alignment: print('A-{}\t{}'.format( sample_id, - ' '.join(map(lambda x: str(utils.item(x)), alignment)) + ' '.join(['{}-{}'.format(src_idx, tgt_idx) for src_idx, tgt_idx in alignment]) )) if args.print_step: @@ -180,6 +180,7 @@ def main(args): num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. 
/ gen_timer.avg)) if has_target: print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam, scorer.result_string())) + return scorer diff --git a/interactive.py b/interactive.py index d9d547a974..36e2bd0ca9 100644 --- a/interactive.py +++ b/interactive.py @@ -162,7 +162,7 @@ def decode_fn(x): hypo_tokens, hypo_str, alignment = utils.post_process_prediction( hypo_tokens=hypo['tokens'].int().cpu(), src_str=src_str, - alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None, + alignment=hypo['alignment'], align_dict=align_dict, tgt_dict=tgt_dict, remove_bpe=args.remove_bpe, @@ -174,9 +174,10 @@ def decode_fn(x): ' '.join(map(lambda x: '{:.4f}'.format(x), hypo['positional_scores'].tolist())) )) if args.print_alignment: + alignment_str = " ".join(["{}-{}".format(src, tgt) for src, tgt in alignment]) print('A-{}\t{}'.format( id, - ' '.join(map(lambda x: str(utils.item(x)), alignment)) + alignment_str )) # update running id counter diff --git a/preprocess.py b/preprocess.py index a157feeb68..538ff2b006 100644 --- a/preprocess.py +++ b/preprocess.py @@ -157,6 +157,60 @@ def merge_result(worker_result): ) ) + def make_binary_alignment_dataset(input_prefix, output_prefix, num_workers): + nseq = [0] + + def merge_result(worker_result): + nseq[0] += worker_result['nseq'] + + input_file = input_prefix + offsets = Binarizer.find_offsets(input_file, num_workers) + pool = None + if num_workers > 1: + pool = Pool(processes=num_workers - 1) + for worker_id in range(1, num_workers): + prefix = "{}{}".format(output_prefix, worker_id) + pool.apply_async( + binarize_alignments, + ( + args, + input_file, + utils.parse_alignment, + prefix, + offsets[worker_id], + offsets[worker_id + 1] + ), + callback=merge_result + ) + pool.close() + + ds = indexed_dataset.make_builder(dataset_dest_file(args, output_prefix, None, "bin"), + impl=args.dataset_impl) + + merge_result( + Binarizer.binarize_alignments( + input_file, utils.parse_alignment, lambda t: ds.add_item(t), + offset=0, end=offsets[1] + ) + ) + if num_workers > 1: + pool.join() + for worker_id in range(1, num_workers): + prefix = "{}{}".format(output_prefix, worker_id) + temp_file_path = dataset_dest_prefix(args, prefix, None) + ds.merge_file_(temp_file_path) + os.remove(indexed_dataset.data_file_path(temp_file_path)) + os.remove(indexed_dataset.index_file_path(temp_file_path)) + + ds.finalize(dataset_dest_file(args, output_prefix, None, "idx")) + + print( + "| [alignments] {}: parsed {} alignments".format( + input_file, + nseq[0] + ) + ) + def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1): if args.dataset_impl == "raw": # Copy original text file to destination folder @@ -180,9 +234,19 @@ def make_all(lang, vocab): outprefix = "test{}".format(k) if k > 0 else "test" make_dataset(vocab, testpref, outprefix, lang, num_workers=args.workers) + def make_all_alignments(): + if args.trainpref and os.path.exists(args.trainpref + "." + args.align_suffix): + make_binary_alignment_dataset(args.trainpref + "." + args.align_suffix, "train.align", num_workers=args.workers) + if args.validpref and os.path.exists(args.validpref + "." + args.align_suffix): + make_binary_alignment_dataset(args.validpref + "." + args.align_suffix, "valid.align", num_workers=args.workers) + if args.testpref and os.path.exists(args.testpref + "." + args.align_suffix): + make_binary_alignment_dataset(args.testpref + "." 
+ args.align_suffix, "test.align", num_workers=args.workers) + make_all(args.source_lang, src_dict) if target: make_all(args.target_lang, tgt_dict) + if args.align_suffix: + make_all_alignments() print("| Wrote preprocessed data to {}".format(args.destdir)) @@ -242,11 +306,28 @@ def consumer(tensor): return res +def binarize_alignments(args, filename, parse_alignment, output_prefix, offset, end): + ds = indexed_dataset.make_builder(dataset_dest_file(args, output_prefix, None, "bin"), + impl=args.dataset_impl, vocab_size=None) + + def consumer(tensor): + ds.add_item(tensor) + + res = Binarizer.binarize_alignments(filename, parse_alignment, consumer, offset=offset, + end=end) + ds.finalize(dataset_dest_file(args, output_prefix, None, "idx")) + return res + + def dataset_dest_prefix(args, output_prefix, lang): base = "{}/{}".format(args.destdir, output_prefix) - lang_part = ( - ".{}-{}.{}".format(args.source_lang, args.target_lang, lang) if lang is not None else "" - ) + if lang is not None: + lang_part = ".{}-{}.{}".format(args.source_lang, args.target_lang, lang) + elif args.only_source: + lang_part = "" + else: + lang_part = ".{}-{}".format(args.source_lang, args.target_lang) + return "{}{}".format(base, lang_part) diff --git a/tests/test_binaries.py b/tests/test_binaries.py index f77806bd6a..113901ab06 100644 --- a/tests/test_binaries.py +++ b/tests/test_binaries.py @@ -266,6 +266,27 @@ def test_mixture_of_experts(self): '--gen-expert', '0' ]) + def test_alignment(self): + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory('test_alignment') as data_dir: + create_dummy_data(data_dir, alignment=True) + preprocess_translation_data(data_dir, ['--align-suffix', 'align']) + train_translation_model( + data_dir, + 'transformer_align', + [ + '--encoder-layers', '2', + '--decoder-layers', '2', + '--encoder-embed-dim', '8', + '--decoder-embed-dim', '8', + '--load-alignments', + '--alignment-layer', '1', + '--criterion', 'label_smoothed_cross_entropy_with_alignment' + ], + run_validation=True, + ) + generate_main(data_dir) + class TestStories(unittest.TestCase): @@ -484,7 +505,7 @@ def test_optimizers(self): generate_main(data_dir) -def create_dummy_data(data_dir, num_examples=1000, maxlen=20): +def create_dummy_data(data_dir, num_examples=1000, maxlen=20, alignment=False): def _create_dummy_data(filename): data = torch.rand(num_examples * maxlen) @@ -497,6 +518,20 @@ def _create_dummy_data(filename): print(ex_str, file=h) offset += ex_len + def _create_dummy_alignment_data(filename_src, filename_tgt, filename): + with open(os.path.join(data_dir, filename_src), 'r') as src_f, \ + open(os.path.join(data_dir, filename_tgt), 'r') as tgt_f, \ + open(os.path.join(data_dir, filename), 'w') as h: + for src, tgt in zip(src_f, tgt_f): + src_len = len(src.split()) + tgt_len = len(tgt.split()) + avg_len = (src_len + tgt_len) // 2 + num_alignments = random.randint(avg_len // 2, 2 * avg_len) + src_indices = torch.floor(torch.rand(num_alignments) * src_len).int() + tgt_indices = torch.floor(torch.rand(num_alignments) * tgt_len).int() + ex_str = ' '.join(["{}-{}".format(src, tgt) for src, tgt in zip(src_indices, tgt_indices)]) + print(ex_str, file=h) + _create_dummy_data('train.in') _create_dummy_data('train.out') _create_dummy_data('valid.in') @@ -504,6 +539,10 @@ def _create_dummy_data(filename): _create_dummy_data('test.in') _create_dummy_data('test.out') + if alignment: + _create_dummy_alignment_data('train.in', 'train.out', 'train.align') + _create_dummy_alignment_data('valid.in', 
'valid.out', 'valid.align') + _create_dummy_alignment_data('test.in', 'test.out', 'test.align') def preprocess_translation_data(data_dir, extra_flags=None): preprocess_parser = options.get_preprocessing_parser() From 58e43cb3ff18f1f47fd62926f00c70cb5920a66f Mon Sep 17 00:00:00 2001 From: Chenyang Yu Date: Tue, 1 Oct 2019 11:12:06 -0700 Subject: [PATCH 165/213] extract FP16OptimizerMixin for share the same logic in PyText (#1180) Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1180 Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/874 extract FP16OptimizerMixin for share the same logic in PyText Reviewed By: hudeven Differential Revision: D17594102 fbshipit-source-id: 8625a4e4f3e09cbaba6ae92599c1121b86ed4e78 --- fairseq/optim/fp16_optimizer.py | 114 ++++++++++++++++++-------------- 1 file changed, 63 insertions(+), 51 deletions(-) diff --git a/fairseq/optim/fp16_optimizer.py b/fairseq/optim/fp16_optimizer.py index 194e0f4f44..192c78c1f4 100644 --- a/fairseq/optim/fp16_optimizer.py +++ b/fairseq/optim/fp16_optimizer.py @@ -54,41 +54,14 @@ def has_overflow(grad_norm): return False -class FP16Optimizer(optim.FairseqOptimizer): - """ - Wrap an *optimizer* to support FP16 (mixed precision) training. - """ - - def __init__(self, args, params, fp32_optimizer, fp32_params): - super().__init__(args) - self.fp16_params = params - self.fp32_optimizer = fp32_optimizer - self.fp32_params = fp32_params - - if getattr(args, 'fp16_scale_window', None) is None: - if len(args.update_freq) > 1: - raise ValueError( - '--fp16-scale-window must be given explicitly when using a ' - 'custom --update-freq schedule' - ) - scale_window = 2**14 / args.distributed_world_size / args.update_freq[0] - else: - scale_window = args.fp16_scale_window +class _FP16OptimizerMixin(object): - self.scaler = DynamicLossScaler( - init_scale=args.fp16_init_scale, - scale_window=scale_window, - tolerance=args.fp16_scale_tolerance, - threshold=args.threshold_loss_scale, - ) + def __init__(self, *args, **kwargs): + # forward __init__ call to the next class in mro(method resolution order) + super().__init__(*args, **kwargs) @classmethod - def build_optimizer(cls, args, params): - """ - Args: - args (argparse.Namespace): fairseq args - params (iterable): iterable of parameters to optimize - """ + def build_fp32_params(cls, params): # create FP32 copy of parameters and grads total_param_size = sum(p.data.numel() for p in params) fp32_params = params[0].new(0).float().new(total_param_size) @@ -99,23 +72,7 @@ def build_optimizer(cls, args, params): offset += numel fp32_params = torch.nn.Parameter(fp32_params) fp32_params.grad = fp32_params.data.new(total_param_size) - - fp32_optimizer = optim.build_optimizer(args, [fp32_params]) - return cls(args, params, fp32_optimizer, fp32_params) - - @property - def optimizer(self): - return self.fp32_optimizer.optimizer - - @property - def optimizer_config(self): - return self.fp32_optimizer.optimizer_config - - def get_lr(self): - return self.fp32_optimizer.get_lr() - - def set_lr(self, lr): - self.fp32_optimizer.set_lr(lr) + return fp32_params def state_dict(self): """Return the optimizer's state dict.""" @@ -179,14 +136,14 @@ def clip_grad_norm(self, max_norm): overflow = DynamicLossScaler.has_overflow(grad_norm) self.scaler.update_scale(overflow) if overflow: - if self.scaler.loss_scale <= self.args.min_loss_scale: + if self.scaler.loss_scale <= self.min_loss_scale: # Use FloatingPointError as an uncommon error that parent # functions can safely catch to 
stop training. raise FloatingPointError(( 'Minimum loss scale reached ({}). Your loss is probably exploding. ' 'Try lowering the learning rate, using gradient clipping or ' 'increasing the batch size.' - ).format(self.args.min_loss_scale)) + ).format(self.min_loss_scale)) raise OverflowError('setting loss scale to: ' + str(self.scaler.loss_scale)) return grad_norm @@ -211,6 +168,61 @@ def zero_grad(self): self._needs_sync = False +class FP16Optimizer(_FP16OptimizerMixin, optim.FairseqOptimizer): + """ + Wrap an *optimizer* to support FP16 (mixed precision) training. + """ + + def __init__(self, args, params, fp32_optimizer, fp32_params): + super().__init__(args) + self.fp16_params = params + self.fp32_optimizer = fp32_optimizer + self.fp32_params = fp32_params + + if getattr(args, 'fp16_scale_window', None) is None: + if len(args.update_freq) > 1: + raise ValueError( + '--fp16-scale-window must be given explicitly when using a ' + 'custom --update-freq schedule' + ) + scale_window = 2**14 / args.distributed_world_size / args.update_freq[0] + else: + scale_window = args.fp16_scale_window + + self.scaler = DynamicLossScaler( + init_scale=args.fp16_init_scale, + scale_window=scale_window, + tolerance=args.fp16_scale_tolerance, + threshold=args.threshold_loss_scale, + ) + self.min_loss_scale = self.args.min_loss_scale + + @classmethod + def build_optimizer(cls, args, params): + """ + Args: + args (argparse.Namespace): fairseq args + params (iterable): iterable of parameters to optimize + """ + fp32_params = cls.build_fp32_params(params) + fp32_optimizer = optim.build_optimizer(args, [fp32_params]) + return cls(args, params, fp32_optimizer, fp32_params) + + @property + def optimizer(self): + return self.fp32_optimizer.optimizer + + @property + def optimizer_config(self): + return self.fp32_optimizer.optimizer_config + + def get_lr(self): + return self.fp32_optimizer.get_lr() + + def set_lr(self, lr): + self.fp32_optimizer.set_lr(lr) + + class MemoryEfficientFP16Optimizer(optim.FairseqOptimizer): """ Wrap an *optimizer* to support FP16 (mixed precision) training. From de348d1f0a6862b50e2f9ca6ba821d6fd26d5a59 Mon Sep 17 00:00:00 2001 From: Debojeet Chatterjee Date: Fri, 4 Oct 2019 09:59:33 -0700 Subject: [PATCH 166/213] Native Torchscript Wordpiece Tokenizer Op for BERTSquadQA, Torchscriptify BertSQUADQAModel (#879) Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/879 Pull Request resolved: https://github.com/facebookresearch/pytext/pull/1023 Pull Request resolved: https://github.com/pytorch/fairseq/pull/1211 Added a new native op that does wordpiece tokenization while additionally returning token start and end indices in the raw text as required by BertSquadQA. Includes Unit Tests for the native op and also to check its parity with the PyText Wordpiece Tokenizer. Also combined is a torchscript implementation of the Bert SQUAD QA Model. There are scripts for evaluation and testing of the torchscript code as well. 
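To make the tokenizer change concrete, here is a rough pure-Python sketch of greedy longest-match-first wordpiece tokenization that also reports character start/end offsets into the raw text. The vocabulary, the `##` continuation prefix and the `[UNK]` fallback are assumed BERT-style conventions; the native op referenced in this commit is a separate C++/TorchScript implementation and is not reproduced here.

```python
# Hypothetical sketch of wordpiece tokenization with character offsets (not the native op itself).
from typing import Iterator, List, Set, Tuple

def _words_with_offsets(text: str) -> Iterator[Tuple[int, str]]:
    # whitespace tokenization that remembers where each word starts in the raw text
    idx = 0
    for word in text.split():
        idx = text.index(word, idx)
        yield idx, word
        idx += len(word)

def wordpiece_with_offsets(text: str, vocab: Set[str], unk: str = "[UNK]") -> List[Tuple[str, int, int]]:
    pieces: List[Tuple[str, int, int]] = []
    for word_start, word in _words_with_offsets(text):
        start, word_pieces = 0, []
        while start < len(word):
            # greedy longest-match-first: shrink the candidate until it is in the vocab
            end, match = len(word), None
            while start < end:
                candidate = word[start:end] if start == 0 else "##" + word[start:end]
                if candidate in vocab:
                    match = candidate
                    break
                end -= 1
            if match is None:
                # no subword matched: fall back to [UNK] covering the whole word
                word_pieces = [(unk, word_start, word_start + len(word))]
                break
            word_pieces.append((match, word_start + start, word_start + end))
            start = end
        pieces.extend(word_pieces)
    return pieces
```

For example, with `vocab = {"question", "ans", "##wer"}` the text `"question answer"` yields `[("question", 0, 8), ("ans", 9, 12), ("##wer", 12, 15)]`, which is the kind of span information a SQuAD-style QA model needs to map predicted token spans back to the original passage.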
Reviewed By: borguz, hikushalhere Differential Revision: D17455985 fbshipit-source-id: c2617c7ecbce0f733b31d04558da965d0b62637b --- fairseq/modules/learned_positional_embedding.py | 2 +- fairseq/modules/multihead_attention.py | 15 ++++----------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/fairseq/modules/learned_positional_embedding.py b/fairseq/modules/learned_positional_embedding.py index e52b8d4715..e8e4a6fb4d 100644 --- a/fairseq/modules/learned_positional_embedding.py +++ b/fairseq/modules/learned_positional_embedding.py @@ -38,7 +38,7 @@ def forward(self, input, incremental_state=None, positions=None): positions = input.data.new(1, 1).fill_(int(self.padding_idx + input.size(1))) else: positions = utils.make_positions( - input.data, self.padding_idx, onnx_trace=self.onnx_trace, + input, self.padding_idx, onnx_trace=self.onnx_trace, ) return super().forward(positions) diff --git a/fairseq/modules/multihead_attention.py b/fairseq/modules/multihead_attention.py index 96849790f6..0ff05d16db 100644 --- a/fairseq/modules/multihead_attention.py +++ b/fairseq/modules/multihead_attention.py @@ -255,17 +255,10 @@ def forward( if key_padding_mask is not None: # don't attend to padding symbols attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - if self.onnx_trace: - attn_weights = torch.where( - key_padding_mask.unsqueeze(1).unsqueeze(2), - torch.Tensor([float("-Inf")]), - attn_weights.float() - ).type_as(attn_weights) - else: - attn_weights = attn_weights.masked_fill( - key_padding_mask.unsqueeze(1).unsqueeze(2), - float('-inf'), - ) + attn_weights = attn_weights.masked_fill( + key_padding_mask.unsqueeze(1).unsqueeze(2), + float('-inf'), + ) attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) if before_softmax: From 315c463d4546037bb6698fcd504f647a03d795cc Mon Sep 17 00:00:00 2001 From: Jerry Ma Date: Fri, 4 Oct 2019 13:35:25 -0700 Subject: [PATCH 167/213] Add periodic CUDA cache cleanup (#882) Summary: This adds a periodic call to `torch.cuda.empty_cache()` in order to mitigate memory fragmentation in the PyTorch CUDA cached allocator that can cause OOMs on models approaching GPU memory limit. By default, this will occur every 64 updates. Performance considerations: - I've benchmarked this on a reasonably large model with memory footprint 16 GB, and the overhead with the default setting is <0.2%. With `update-freq > 1`, the cost is mitigated even further. - This behavior can be disabled with a value of zero. 
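To make the cadence of the new flag concrete, here is the trigger condition from the trainer change below, pulled out into a standalone snippet; the helper name is made up and the surrounding trainer plumbing is omitted.

```python
import torch

def should_empty_cache(num_updates, empty_cache_freq):
    # With empty_cache_freq=64 this is true at updates 1, 65, 129, ...:
    # once shortly after training starts and then every 64 updates.
    # A value of 0 disables the periodic cleanup entirely.
    return (
        empty_cache_freq > 0
        and ((num_updates + empty_cache_freq - 1) % empty_cache_freq) == 0
    )

assert [n for n in range(130) if should_empty_cache(n, 64)] == [1, 65, 129]

# The guarded call itself is simply:
if torch.cuda.is_available():
    torch.cuda.empty_cache()
```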
Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/882 Differential Revision: D17742386 Pulled By: jma127 fbshipit-source-id: 68d8f93f798d6818b5efc3d67d43b52dfb8b2865 --- fairseq/options.py | 2 ++ fairseq/trainer.py | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/fairseq/options.py b/fairseq/options.py index 06a52b62ba..c33e1ac8e9 100644 --- a/fairseq/options.py +++ b/fairseq/options.py @@ -193,6 +193,8 @@ def get_parser(desc, default_task='translation'): help='threshold FP16 loss scale from below') parser.add_argument('--user-dir', default=None, help='path to a python module containing custom extensions (tasks and/or architectures)') + parser.add_argument('--empty-cache-freq', default=0, type=int, + help='how often to clear the PyTorch CUDA cache (0 to disable)') from fairseq.registry import REGISTRIES for registry_name, REGISTRY in REGISTRIES.items(): diff --git a/fairseq/trainer.py b/fairseq/trainer.py index 8e911a2174..03a1333ff1 100644 --- a/fairseq/trainer.py +++ b/fairseq/trainer.py @@ -426,6 +426,14 @@ def maybe_no_sync(): if 'nll_loss' in logging_output: self.meters['train_nll_loss'].update(logging_output.get('nll_loss', 0), ntokens) + + # clear CUDA cache to reduce memory fragmentation + if (self.args.empty_cache_freq > 0 and + ((self.get_num_updates() + self.args.empty_cache_freq - 1) % + self.args.empty_cache_freq) == 0 and + torch.cuda.is_available() and + not self.args.cpu): + torch.cuda.empty_cache() except OverflowError as e: print('| WARNING: overflow detected, ' + str(e)) self.zero_grad() From 4cb895b6f6b3e16ceeefb579432bddb1a73c1e39 Mon Sep 17 00:00:00 2001 From: alexeib Date: Fri, 4 Oct 2019 17:09:27 -0700 Subject: [PATCH 168/213] add pre-trained wav2vec model Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/884 Differential Revision: D17774515 Pulled By: alexeib fbshipit-source-id: d1ffe8ab723fa284c69b067bbd43d699eaa2f02f --- README.md | 1 + examples/wav2vec/README.md | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/README.md b/README.md index e05af20c6c..2d6705627a 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,7 @@ as well as example training and evaluation commands. - [Translation](examples/translation/README.md): convolutional and transformer models are available - [Language Modeling](examples/language_model/README.md): convolutional and transformer models are available +- [wav2vec](examples/wav2vec/README.md): wav2vec large model is available We also have more detailed READMEs to reproduce results from specific papers: - [Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019)](examples/joint_alignment_translation/README.md ) diff --git a/examples/wav2vec/README.md b/examples/wav2vec/README.md index 325e25420f..689d6494b7 100644 --- a/examples/wav2vec/README.md +++ b/examples/wav2vec/README.md @@ -2,6 +2,27 @@ Example to train a wav2vec model as described in [wav2vec: Unsupervised Pre-training for Speech Recognition (Schneider et al., 2019)](https://arxiv.org/abs/1904.05862). +## Pre-trained models + +Description | Parameters | Dataset | Model +---|---:|---|--- +Wav2Vec large
([(Schneider et al., 2019)](https://arxiv.org/abs/1904.05862)) | 32.5M | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_large.pt) + +#### Example usage: +```python +import torch +from fairseq.models.wav2vec import Wav2VecModel + +cp = torch.load('/path/to/wav2vec.pt') +model = Wav2VecModel.build_model(cp['args'], task=None) +model.load_state_dict(cp['model']) +model.eval() + +wav_input_16khz = torch.randn(1,10000) +z = model.feature_extractor(wav_input_16khz) +c = model.feature_aggregator(z) +``` + ## Training a new model with the CLI tools Given a directory containing wav files to be used for pretraining (we recommend splitting each file into separate file 10 to 30 seconds in length) From 6f58e15e240007bb58fa9364d9bd35295548f1d7 Mon Sep 17 00:00:00 2001 From: Nayan Singhal Date: Mon, 7 Oct 2019 11:13:34 -0700 Subject: [PATCH 169/213] Setting Global sync to 50 in BMUF Summary: In all our final settings, we are using global_sync = 50 and we get comparable results with DDP and caffe2. Setting the default global-sync-iter = 50 and users can just define --use-bmuf to enable it for training. Reviewed By: skritika Differential Revision: D17765094 fbshipit-source-id: 369591eeff266d757f89e1fc8dda01711146fdbc --- fairseq/optim/bmuf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fairseq/optim/bmuf.py b/fairseq/optim/bmuf.py index c4da08d89e..9edd27dc73 100644 --- a/fairseq/optim/bmuf.py +++ b/fairseq/optim/bmuf.py @@ -47,7 +47,7 @@ def add_args(parser): ) parser.add_argument( "--global-sync-iter", - default=10, + default=50, type=int, help="Iteration for syncing global model", ) From c2165224d198450a3b4329ae099a772aa65d51c5 Mon Sep 17 00:00:00 2001 From: Changhan Wang Date: Tue, 8 Oct 2019 14:08:05 -0700 Subject: [PATCH 170/213] fix max lengths in Levenshtein Tramsformer Summary: Fix the max length calculation in Levenshtein Transformer Reviewed By: jhcross Differential Revision: D17672946 fbshipit-source-id: e5efbe7e56cf879d3e822864e4398f99f45b04d4 --- fairseq/models/levenshtein_transformer.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fairseq/models/levenshtein_transformer.py b/fairseq/models/levenshtein_transformer.py index 2e17b790ec..9f016dbb4a 100644 --- a/fairseq/models/levenshtein_transformer.py +++ b/fairseq/models/levenshtein_transformer.py @@ -323,12 +323,16 @@ def forward_decoder( output_scores = decoder_out["output_scores"] attn = decoder_out["attn"] + bsz = output_tokens.size(0) if max_ratio is None: - max_lens = output_tokens.new(output_tokens.size(0)).fill_(255) + max_lens = output_tokens.new().fill_(255) else: - max_lens = ( - (~encoder_out["encoder_padding_mask"]).sum(1) * max_ratio - ).clamp(min=10) + if encoder_out["encoder_padding_mask"] is None: + max_src_len = encoder_out["encoder_out"].size(1) + src_lens = encoder_out["encoder_out"].new(bsz).fill_(max_src_len) + else: + src_lens = (~encoder_out["encoder_padding_mask"]).sum(1) + max_lens = (src_lens * max_ratio).clamp(min=10).long() # delete words # do not delete tokens if it is @@ -364,7 +368,7 @@ def forward_decoder( mask_ins_score[:, :, 0] -= eos_penalty mask_ins_pred = mask_ins_score.max(-1)[1] mask_ins_pred = torch.min( - mask_ins_pred, max_lens[:, None].expand_as(mask_ins_pred) + mask_ins_pred, max_lens[can_ins_mask, None].expand_as(mask_ins_pred) ) _tokens, _scores = _apply_ins_masks( From 34e79c58d3cd71c8cd3e5604b4faa8df964a4763 Mon Sep 17 00:00:00 2001 From: Jungo Kasai Date: Tue, 8 Oct 2019 
14:55:25 -0700 Subject: [PATCH 171/213] ensemble levts Summary: Add ensemble wrappers to the levenshtein NAT. Levenshtein Final softmax ensemble over the pipeline of three steps: deletion, placeholder insertion, and word selection. 1. Deletion 2. Placeholder Insertion 3. Word Selection Each step involves scoring, averaging the scores over the ensemble, and then make hard decisions with argmax. Then next step follows. We cannot do the three steps in parallel by design. Reviewed By: kahne Differential Revision: D17723202 fbshipit-source-id: 05f7a4fcd922a972cc4796ca397e8220f0b4d53e --- fairseq/iterative_refinement_generator.py | 13 +- fairseq/models/nonautoregressive_ensembles.py | 209 ++++++++++++++++++ 2 files changed, 219 insertions(+), 3 deletions(-) create mode 100644 fairseq/models/nonautoregressive_ensembles.py diff --git a/fairseq/iterative_refinement_generator.py b/fairseq/iterative_refinement_generator.py index aee4884187..eeb6241039 100644 --- a/fairseq/iterative_refinement_generator.py +++ b/fairseq/iterative_refinement_generator.py @@ -6,6 +6,8 @@ import torch from fairseq.models.model_utils import skip_tensors as _skip +from fairseq.models.nonautoregressive_ensembles import EnsembleLevT +from fairseq.models.levenshtein_transformer import LevenshteinTransformerModel class IterativeRefinementGenerator(object): @@ -44,9 +46,14 @@ def __init__(self, @torch.no_grad() def generate(self, models, sample, prefix_tokens=None): - # TODO: model ensemble - assert len(models) == 1, 'only support single model' - model = models[0] + if len(models) == 1: + # Keep this for other NAT models for which we have yet to implement ensemble wrappers. Later delete this. + model = models[0] + elif isinstance(models[0], LevenshteinTransformerModel): + model = EnsembleLevT(models) + else: + raise NotImplementedError + if not self.retain_dropout: model.eval() diff --git a/fairseq/models/nonautoregressive_ensembles.py b/fairseq/models/nonautoregressive_ensembles.py new file mode 100644 index 0000000000..12b9856931 --- /dev/null +++ b/fairseq/models/nonautoregressive_ensembles.py @@ -0,0 +1,209 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import torch +import torch.nn.functional as F +import math +from fairseq.models.model_utils import fill_tensors as _fill, skip_tensors as _skip +from fairseq.models.levenshtein_transformer import _apply_del_words, _apply_ins_masks, _apply_ins_words + + +class BasicEnsembleModel(torch.nn.Module): + """A wrapper around an ensemble of models.""" + + def __init__(self, models): + super().__init__() + self.models = torch.nn.ModuleList(models) + self.bos = self.models[0].decoder.dictionary.bos() + self.eos = self.models[0].decoder.dictionary.eos() + self.pad = self.models[0].decoder.dictionary.pad() + self.unk = self.models[0].decoder.dictionary.unk() + + def has_encoder(self): + return hasattr(self.models[0], 'encoder') + + def max_decoder_positions(self): + return min(m.max_decoder_positions() for m in self.models) + + @torch.no_grad() + def forward_encoder(self, encoder_input): + if not self.has_encoder(): + return None + return [model.forward_encoder(encoder_input) for model in self.models] + + @torch.no_grad() + def forward_decoder(self, *inputs): + raise NotImplementedError + + def initialize_output_tokens(self, *inputs): + raise NotImplementedError + + +class EnsembleLevT(BasicEnsembleModel): + """A wrapper around an ensemble of models.""" + + def __init__(self, models): + super().__init__(models) + + @torch.no_grad() + def forward_decoder(self, decoder_out, encoder_outs, eos_penalty=0.0, max_ratio=None, **kwargs): + # LevT ensembling + # A pipeline of three steps: deletion, placeholder, and word insertion. + # We need to average scores in each step in a pipeline way because of dependence. + # deletion + output_tokens = decoder_out["output_tokens"] + output_scores = decoder_out["output_scores"] + attn = decoder_out["attn"] + + bsz = output_tokens.size(0) + if max_ratio is None: + max_lens = output_tokens.new().fill_(255) + else: + if encoder_outs[0]["encoder_padding_mask"] is None: + src_lens = encoder_outs[0]["encoder_out"].new(bsz).fill_(encoder_outs[0]["encoder_out"].size(1)) + else: + src_lens = (~encoder_outs[0]["encoder_padding_mask"]).sum(1) + max_lens = (src_lens * max_ratio).clamp(min=10).long() + + # delete words + # do not delete tokens if it is + can_del_word = output_tokens.ne(self.pad).sum(1) > 2 + if can_del_word.sum() != 0: # we cannot delete, skip + output_tokens, output_scores, attn = self.forward_word_del( + encoder_outs, + output_tokens, + output_scores, + attn, + can_del_word, + ) + + # insert placeholders + can_ins_mask = output_tokens.ne(self.pad).sum(1) < max_lens + if can_ins_mask.sum() != 0: + output_tokens, output_scores = self.forward_mask_ins( + encoder_outs, + output_tokens, + output_scores, + can_ins_mask, + eos_penalty, + max_lens, + ) + + # insert words + can_ins_word = output_tokens.eq(self.unk).sum(1) > 0 + if can_ins_word.sum() != 0: + output_tokens, output_scores, attn = self.forward_word_ins( + encoder_outs, + output_tokens, + output_scores, + attn, + can_ins_word, + ) + + # delete some unnecessary paddings + cut_off = output_tokens.ne(self.pad).sum(1).max() + output_tokens = output_tokens[:, :cut_off] + output_scores = output_scores[:, :cut_off] + attn = None if attn is None else attn[:, :cut_off, :] + return { + "output_tokens": output_tokens, + "output_scores": output_scores, + "attn": attn, + } + + def forward_word_del(self, encoder_outs, output_tokens, output_scores, attn, can_del_word): + word_del_score_avg = [] + word_del_attn_avg = [] + for model, encoder_out in zip(self.models, encoder_outs): + word_del_out, word_del_attn = 
model.decoder.forward_word_del( + _skip(output_tokens, can_del_word), + _skip(encoder_out, can_del_word), + ) + word_del_score = F.log_softmax(word_del_out, 2) + word_del_score_avg.append(word_del_score) + word_del_attn_avg.append(word_del_attn) + word_del_score_avg = torch.logsumexp(torch.stack(word_del_score_avg, dim=0), dim=0) - math.log(len(self.models)) + word_del_pred = word_del_score_avg.max(-1)[1].bool() + if word_del_attn_avg[0] is not None: + word_del_attn_avg = torch.stack(word_del_attn_avg, dim=0)/len(self.models) + else: + word_del_attn_avg = None + + _tokens, _scores, _attn = _apply_del_words( + output_tokens[can_del_word], + output_scores[can_del_word], + word_del_attn_avg, + word_del_pred, + self.pad, + self.bos, + self.eos, + ) + output_tokens = _fill(output_tokens, can_del_word, _tokens, self.pad) + output_scores = _fill(output_scores, can_del_word, _scores, 0) + attn = _fill(attn, can_del_word, _attn, 0.) + return output_tokens, output_scores, attn + + def forward_mask_ins(self, encoder_outs, output_tokens, output_scores, can_ins_mask, eos_penalty, max_lens): + mask_ins_score_avg = [] + for model, encoder_out in zip(self.models, encoder_outs): + mask_ins_out, _ = model.decoder.forward_mask_ins( + _skip(output_tokens, can_ins_mask), + _skip(encoder_out, can_ins_mask), + ) + mask_ins_score = F.log_softmax(mask_ins_out, 2) + if eos_penalty > 0.0: + mask_ins_score[:, :, 0] -= eos_penalty + mask_ins_score_avg.append(mask_ins_score) + mask_ins_score_avg = torch.logsumexp(torch.stack(mask_ins_score_avg, dim=0), dim=0) - math.log(len(self.models)) + mask_ins_pred = mask_ins_score_avg.max(-1)[1] + mask_ins_pred = torch.min( + mask_ins_pred, max_lens[can_ins_mask, None].expand_as(mask_ins_pred) + ) + _tokens, _scores = _apply_ins_masks( + output_tokens[can_ins_mask], + output_scores[can_ins_mask], + mask_ins_pred, + self.pad, + self.unk, + self.eos, + ) + output_tokens = _fill(output_tokens, can_ins_mask, _tokens, self.pad) + output_scores = _fill(output_scores, can_ins_mask, _scores, 0) + return output_tokens, output_scores + + def forward_word_ins(self, encoder_outs, output_tokens, output_scores, attn, can_ins_word): + word_ins_score_avg = [] + word_ins_attn_avg = [] + for model, encoder_out in zip(self.models, encoder_outs): + word_ins_out, word_ins_attn = model.decoder.forward_word_ins( + _skip(output_tokens, can_ins_word), + _skip(encoder_out, can_ins_word), + ) + word_ins_score = F.log_softmax(word_ins_out, 2) + word_ins_score_avg.append(word_ins_score) + word_ins_attn_avg.append(word_ins_attn) + word_ins_score_avg = torch.logsumexp(torch.stack(word_ins_score_avg, dim=0), dim=0) - math.log(len(self.models)) + if word_ins_attn_avg[0] is not None: + word_ins_attn_avg = torch.stack(word_ins_attn_avg, dim=0)/len(self.models) + else: + word_ins_attn_avg = None + word_ins_score_max, word_ins_pred = word_ins_score_avg.max(-1) + + _tokens, _scores = _apply_ins_words( + output_tokens[can_ins_word], + output_scores[can_ins_word], + word_ins_pred, + word_ins_score_max, + self.unk, + ) + + output_tokens = _fill(output_tokens, can_ins_word, _tokens, self.pad) + output_scores = _fill(output_scores, can_ins_word, _scores, 0) + attn = _fill(attn, can_ins_word, word_ins_attn, 0.) + return output_tokens, output_scores, attn + + def initialize_output_tokens(self, encoder_outs, src_tokens): + # LevT doesn't do length prediction. 
+ return self.models[0].initialize_output_tokens(encoder_outs[0], src_tokens) From 63b6b3f411fd037d97f452df0417171ba5aa4f5d Mon Sep 17 00:00:00 2001 From: Jerry Ma Date: Tue, 8 Oct 2019 16:34:51 -0700 Subject: [PATCH 172/213] Add printing of PyTorch memory summary on OOM (#885) Summary: PyTorch now has more comprehensive memory instrumentation, added in https://github.com/pytorch/pytorch/pull/27361 . This PR makes fairseq print a summary table of the memory state when an OOM occurs. Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/885 Differential Revision: D17820445 Pulled By: jma127 fbshipit-source-id: 1887417c7648d703f78e1cff9f2a5b89901f49d0 --- fairseq/trainer.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fairseq/trainer.py b/fairseq/trainer.py index 03a1333ff1..0f293d0251 100644 --- a/fairseq/trainer.py +++ b/fairseq/trainer.py @@ -313,10 +313,16 @@ def maybe_no_sync(): + '\n Skipping batch' ) # TODO: print should really go to logger, this print goes - # to stdout, which is buffered, which in many case is not - # printed out if another exception happens - # print(msg) + # to stderr, which is buffered, which in many cases is not + # printed out if another exception happens. + # NB(jerry): added a flush to mitigate this print(msg, file=sys.stderr) + if torch.cuda.is_available() and hasattr(torch.cuda, "memory_summary"): + for device_idx in range(torch.cuda.device_count()): + print(torch.cuda.memory_summary(device=torch.cuda.device(device_idx)), + file=sys.stderr) + sys.stderr.flush() + if raise_oom: raise ValueError(msg) ooms += 1 From b6e001f634a13ee216d4b496a2e39031e22ddecb Mon Sep 17 00:00:00 2001 From: Alex Xiao Date: Wed, 9 Oct 2019 14:51:16 -0700 Subject: [PATCH 173/213] Fix data loading memory issue in pyspeech Summary: We currently shard data when creating the batch iterator. This means we first load all indicese/frame lengths/handles into memory, and then do the sharding. This makes it impossible to train on large datasets with a high amount of workers because each worker will need to load the entire dataset into memory. For training on a million hours of data (i.e. semi-supervised or unsupervised approaches) this data loading just makes it flat out impossible to use 8 GPU's. 3 changes: 1. This diff modifies the data loading such that we do the sharding while we read the handles file, rather than later. This modification is done on a task-by-task basis, since the task specifies how the data is loaded. I've tried to make the code compatible with both sharding during handle loading and sharding during batch iteration. I've currently only done the sharding during handle loading for the aligned_training task. 2. To support data sharding at data loading time and the requirement that all shards must have exactly the same # of batches, I've added a method to do this synchronization where all shards with too many batches would just truncate the extra ones, similar to what we already do. 2. In fairspeq/train.py, we are actually loading the training dataset and batch iterator twice, once in train.py and once when loading the checkpoint (which we always do regardless if there is a checkpoint). This means double the loading time which can be painful for very large files. I've removed the extraneous loading in this diff as well. 
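A minimal sketch of the two ideas from the summary above: sharding while the handles file is read, and capping every shard at a common number of batches. The file layout and helper names are invented for illustration and are not the pyspeech APIs; the capping step corresponds to the `CountingIterator.take` method added in the diff below.

```python
import itertools

def load_shard(handles_path, shard_id, num_shards):
    # Shard at load time: worker `shard_id` keeps only every `num_shards`-th
    # entry, so no single worker materialises the full dataset index.
    with open(handles_path) as f:
        return [line.rstrip("\n")
                for i, line in enumerate(f)
                if i % num_shards == shard_id]

def equalize_num_batches(batch_lists):
    # Every shard must yield the same number of batches per epoch, so each is
    # truncated to the common minimum (in real distributed training that
    # minimum would be agreed on across workers, e.g. with an all_reduce).
    n = min(len(batches) for batches in batch_lists)
    return [list(itertools.islice(iter(batches), n)) for batches in batch_lists]
```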
Reviewed By: yqwangustc Differential Revision: D17750715 fbshipit-source-id: 0e6e3d363525fa5661f1c784303390ea13f46377 --- fairseq/checkpoint_utils.py | 6 +++--- fairseq/data/iterators.py | 8 ++++++++ fairseq/trainer.py | 9 +++++++-- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py index 6b83bf07b2..5ef45b8463 100644 --- a/fairseq/checkpoint_utils.py +++ b/fairseq/checkpoint_utils.py @@ -90,7 +90,7 @@ def is_better(a, b): os.remove(old_chk) -def load_checkpoint(args, trainer): +def load_checkpoint(args, trainer, data_selector=None): """Load a checkpoint and restore the training iterator.""" # only one worker should attempt to create the required dir if args.distributed_rank == 0: @@ -120,10 +120,10 @@ def load_checkpoint(args, trainer): if extra_state is not None and not args.reset_dataloader: # restore iterator from checkpoint itr_state = extra_state['train_iterator'] - epoch_itr = trainer.get_train_iterator(epoch=itr_state['epoch'], load_dataset=True) + epoch_itr = trainer.get_train_iterator(epoch=itr_state['epoch'], load_dataset=True, data_selector=data_selector) epoch_itr.load_state_dict(itr_state) else: - epoch_itr = trainer.get_train_iterator(epoch=0, load_dataset=True) + epoch_itr = trainer.get_train_iterator(epoch=0, load_dataset=True, data_selector=data_selector) trainer.lr_step(epoch_itr.epoch) diff --git a/fairseq/data/iterators.py b/fairseq/data/iterators.py index 7bae6ab355..83760b3615 100644 --- a/fairseq/data/iterators.py +++ b/fairseq/data/iterators.py @@ -34,6 +34,8 @@ def __len__(self): def __iter__(self): for x in self.iterable: + if self.count >= self.len: + return self.count += 1 yield x @@ -49,6 +51,12 @@ def skip(self, num_to_skip): next(itertools.islice(self.itr, num_to_skip, num_to_skip), None) return self + def take(self, n): + """ + Truncates the iterator to n elements at most. 
+ """ + self.len = min(self.len, n) + class EpochBatchIterating(object): def __len__(self) -> int: diff --git a/fairseq/trainer.py b/fairseq/trainer.py index 0f293d0251..35f9c1d759 100644 --- a/fairseq/trainer.py +++ b/fairseq/trainer.py @@ -225,11 +225,16 @@ def load_checkpoint( return extra_state - def get_train_iterator(self, epoch, combine=True, load_dataset=True): + def get_train_iterator(self, epoch, combine=True, load_dataset=True, data_selector=None): """Return an EpochBatchIterator over the training set for a given epoch.""" if load_dataset: print('| loading train data for epoch {}'.format(epoch)) - self.task.load_dataset(self.args.train_subset, epoch=epoch, combine=combine) + self.task.load_dataset( + self.args.train_subset, + epoch=epoch, + combine=combine, + data_selector=data_selector, + ) return self.task.get_batch_iterator( dataset=self.task.dataset(self.args.train_subset), max_tokens=self.args.max_tokens, From 33646ac9b7b720444416e2c18d1120d03b37e156 Mon Sep 17 00:00:00 2001 From: Jeff Cai Date: Wed, 9 Oct 2019 18:14:39 -0700 Subject: [PATCH 174/213] wav2letter integration Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/846 Reviewed By: jcai1 Differential Revision: D17845996 Pulled By: okhonko fbshipit-source-id: 3826fd9a4418496916bf1835c319dd85c89945cc --- examples/speech_recognition/README.md | 74 ++++++++ .../speech_recognition/criterions/ASG_loss.py | 154 ++++++++++++++++ .../speech_recognition/criterions/__init__.py | 16 +- examples/speech_recognition/data/replabels.py | 70 +++++++ examples/speech_recognition/infer.py | 87 +++++---- .../models/w2l_conv_glu_enc.py | 174 ++++++++++++++++++ .../tasks/speech_recognition.py | 34 +++- examples/speech_recognition/w2l_decoder.py | 161 ++++++++++++++++ 8 files changed, 728 insertions(+), 42 deletions(-) create mode 100644 examples/speech_recognition/criterions/ASG_loss.py create mode 100644 examples/speech_recognition/data/replabels.py create mode 100644 examples/speech_recognition/models/w2l_conv_glu_enc.py create mode 100644 examples/speech_recognition/w2l_decoder.py diff --git a/examples/speech_recognition/README.md b/examples/speech_recognition/README.md index 36363b0376..d27edab7e0 100644 --- a/examples/speech_recognition/README.md +++ b/examples/speech_recognition/README.md @@ -7,6 +7,7 @@ On top of main fairseq dependencies there are couple more additional requirement 1) Please follow the instructions to install [torchaudio](https://github.com/pytorch/audio). This is required to compute audio fbank features. 2) [Sclite](http://www1.icsi.berkeley.edu/Speech/docs/sctk-1.2/sclite.htm#sclite_name_0) is used to measure WER. Sclite can be downloaded and installed from source from sctk package [here](http://www.openslr.org/4/). Training and inference doesn't require Sclite dependency. +3) [sentencepiece](https://github.com/google/sentencepiece) is required in order to create dataset with word-piece targets. ## Preparing librispeech data ``` @@ -30,3 +31,76 @@ python examples/speech_recognition/infer.py $DIR_FOR_PREPROCESSED_DATA --task sp sclite -r ${RES_DIR}/ref.word-checkpoint_last.pt-${SET}.txt -h ${RES_DIR}/hypo.word-checkpoint_last.pt-${SET}.txt -i rm -o all stdout > $RES_REPORT ``` `Sum/Avg` row from first table of the report has WER + +## Using wav2letter components +[wav2letter](https://github.com/facebookresearch/wav2letter) now has integration with fairseq. 
Currently this includes: + +* AutoSegmentationCriterion (ASG) +* wav2letter-style Conv/GLU model +* wav2letter's beam search decoder + +To use these, follow the instructions at the bottom of [this page](https://github.com/facebookresearch/wav2letter/blob/master/docs/installation.md) to install python bindings. Please note that python bindings are for a *subset* of wav2letter and don't require its full dependencies (notably, `flashlight` and `ArrayFire` are *not* required). + +To quickly summarize the instructions: first, install [CUDA](https://developer.nvidia.com/cuda-downloads). Then follow these steps: +``` +# additional prerequisites - use equivalents for your distro +sudo apt-get install build-essential cmake libatlas-base-dev libfftw3-dev liblzma-dev libbz2-dev libzstd-dev +# install KenLM from source +git clone https://github.com/kpu/kenlm.git +cd kenlm +mkdir -p build && cd build +cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_POSITION_INDEPENDENT_CODE=ON +make -j16 +cd .. +export KENLM_ROOT_DIR=$(pwd) +cd .. +# install wav2letter python bindings +git clone https://github.com/facebookresearch/wav2letter.git +cd wav2letter/bindings/python +# make sure your python environment is active at this point +pip install torch packaging +pip install -e . +# try some examples to verify installation succeeded +python ./examples/criterion_example.py +python ./examples/decoder_example.py ../../src/decoder/test +python ./examples/feature_example.py ../../src/feature/test/data +``` + +## Training librispeech data (wav2letter style, Conv/GLU + ASG loss) +Training command: +``` +python train.py $DIR_FOR_PREPROCESSED_DATA --save-dir $MODEL_PATH --max-epoch 100 --task speech_recognition --arch w2l_conv_glu_enc --batch-size 4 --optimizer sgd --lr 0.3,0.8 --momentum 0.8 --clip-norm 0.2 --max-tokens 50000 --log-format json --log-interval 100 --num-workers 0 --sentence-avg --criterion asg_loss --asg-transitions-init 5 --max-replabel 2 --linseg-updates 8789 --user-dir examples/speech_recognition +``` + +Note that ASG loss currently doesn't do well with word-pieces. You should prepare a dataset with character targets by setting `nbpe=31` in `prepare-librispeech.sh`. + +## Inference for librispeech (wav2letter decoder, n-gram LM) +Inference command: +``` +python examples/speech_recognition/infer.py $DIR_FOR_PREPROCESSED_DATA --task speech_recognition --seed 1 --nbest 1 --path $MODEL_PATH/checkpoint_last.pt --gen-subset $SET --results-path $RES_DIR --w2l-decoder kenlm --kenlm-model $KENLM_MODEL_PATH --lexicon $LEXICON_PATH --beam 200 --beam-threshold 15 --lm-weight 1.5 --word-score 1.5 --sil-weight -0.3 --criterion asg_loss --max-replabel 2 --user-dir examples/speech_recognition +``` + +`$KENLM_MODEL_PATH` should be a standard n-gram language model file. `$LEXICON_PATH` should be a wav2letter-style lexicon (list of known words and their spellings). For ASG inference, a lexicon line should look like this (note the repetition labels): +``` +doorbell D O 1 R B E L 1 ▁ +``` +For CTC inference with word-pieces, repetition labels are not used and the lexicon should have most common spellings for each word (one can use sentencepiece's `NBestEncodeAsPieces` for this): +``` +doorbell ▁DOOR BE LL +doorbell ▁DOOR B E LL +doorbell ▁DO OR BE LL +doorbell ▁DOOR B EL L +doorbell ▁DOOR BE L L +doorbell ▁DO OR B E LL +doorbell ▁DOOR B E L L +doorbell ▁DO OR B EL L +doorbell ▁DO O R BE LL +doorbell ▁DO OR BE L L +``` +Lowercase vs. uppercase matters: the *word* should match the case of the n-gram language model (i.e. 
`$KENLM_MODEL_PATH`), while the *spelling* should match the case of the token dictionary (i.e. `$DIR_FOR_PREPROCESSED_DATA/dict.txt`). + +## Inference for librispeech (wav2letter decoder, viterbi only) +Inference command: +``` +python examples/speech_recognition/infer.py $DIR_FOR_PREPROCESSED_DATA --task speech_recognition --seed 1 --nbest 1 --path $MODEL_PATH/checkpoint_last.pt --gen-subset $SET --results-path $RES_DIR --w2l-decoder viterbi --criterion asg_loss --max-replabel 2 --user-dir examples/speech_recognition +``` diff --git a/examples/speech_recognition/criterions/ASG_loss.py b/examples/speech_recognition/criterions/ASG_loss.py new file mode 100644 index 0000000000..29c8a3d78e --- /dev/null +++ b/examples/speech_recognition/criterions/ASG_loss.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math + +import numpy as np +import torch +from fairseq import utils +from fairseq.criterions import FairseqCriterion, register_criterion +from examples.speech_recognition.data.replabels import pack_replabels + +from wav2letter.criterion import ASGLoss, CriterionScaleMode + + +@register_criterion("asg_loss") +class ASGCriterion(FairseqCriterion): + @staticmethod + def add_args(parser): + group = parser.add_argument_group("ASG Loss") + group.add_argument( + "--asg-transitions-init", + help="initial diagonal value of transition matrix", + type=float, + default=0.0, + ) + group.add_argument( + "--max-replabel", help="maximum # of replabels", type=int, default=2 + ) + group.add_argument( + "--linseg-updates", + help="# of training updates to use LinSeg initialization", + type=int, + default=0, + ) + group.add_argument( + "--hide-linseg-messages", + help="hide messages about LinSeg initialization", + action="store_true", + ) + + def __init__(self, args, task): + super().__init__(args, task) + self.tgt_dict = task.target_dictionary + self.eos = self.tgt_dict.eos() + self.silence = ( + self.tgt_dict.index(args.silence_token) + if args.silence_token in self.tgt_dict + else None + ) + self.max_replabel = args.max_replabel + + num_labels = len(self.tgt_dict) + self.asg = ASGLoss(num_labels, scale_mode=CriterionScaleMode.TARGET_SZ_SQRT) + self.asg.trans = torch.nn.Parameter( + args.asg_transitions_init * torch.eye(num_labels), requires_grad=True + ) + + self.linseg_progress = torch.nn.Parameter( + torch.tensor([0], dtype=torch.int), requires_grad=False + ) + self.linseg_maximum = args.linseg_updates + self.linseg_message_state = "none" if args.hide_linseg_messages else "start" + + def linseg_step(self): + if not self.training: + return False + if self.linseg_progress.item() < self.linseg_maximum: + if self.linseg_message_state == "start": + print("| using LinSeg to initialize ASG") + self.linseg_message_state = "finish" + self.linseg_progress.add_(1) + return True + elif self.linseg_message_state == "finish": + print("| finished LinSeg initialization") + self.linseg_message_state = "none" + return False + + def replace_eos_with_silence(self, tgt): + if tgt[-1] != self.eos: + return tgt + elif self.silence is None or (len(tgt) > 1 and tgt[-2] == self.silence): + return tgt[:-1] + else: + return tgt[:-1] + [self.silence] + + def forward(self, model, sample, reduce=True): + """Compute the loss for the given sample. 
+ + Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + + net_output = model(**sample["net_input"]) + emissions = net_output["encoder_out"].transpose(0, 1).contiguous() + B = emissions.size(0) + T = emissions.size(1) + device = emissions.device + + target = torch.IntTensor(B, T) + target_size = torch.IntTensor(B) + using_linseg = self.linseg_step() + + for b in range(B): + initial_target_size = sample["target_lengths"][b].item() + if initial_target_size == 0: + raise ValueError("target size cannot be zero") + + tgt = sample["target"][b, :initial_target_size].tolist() + tgt = self.replace_eos_with_silence(tgt) + tgt = pack_replabels(tgt, self.tgt_dict, self.max_replabel) + tgt = tgt[:T] + + if using_linseg: + tgt = [tgt[t * len(tgt) // T] for t in range(T)] + + target[b][: len(tgt)] = torch.IntTensor(tgt) + target_size[b] = len(tgt) + + loss = self.asg.forward(emissions, target.to(device), target_size.to(device)) + + if reduce: + loss = torch.sum(loss) + + sample_size = ( + sample["target"].size(0) if self.args.sentence_avg else sample["ntokens"] + ) + logging_output = { + "loss": utils.item(loss.data) if reduce else loss.data, + "ntokens": sample["ntokens"], + "nsentences": sample["target"].size(0), + "sample_size": sample_size, + } + return loss, sample_size, logging_output + + @staticmethod + def aggregate_logging_outputs(logging_outputs): + """Aggregate logging outputs from data parallel training.""" + loss_sum = sum(log.get("loss", 0) for log in logging_outputs) + ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) + nsentences = sum(log.get("nsentences", 0) for log in logging_outputs) + sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) + agg_output = { + "loss": loss_sum / nsentences, + "ntokens": ntokens, + "nsentences": nsentences, + "sample_size": sample_size, + } + return agg_output diff --git a/examples/speech_recognition/criterions/__init__.py b/examples/speech_recognition/criterions/__init__.py index 5ba9fc1601..e3a348afa4 100644 --- a/examples/speech_recognition/criterions/__init__.py +++ b/examples/speech_recognition/criterions/__init__.py @@ -1,7 +1,17 @@ import importlib import os + +# ASG loss requires wav2letter +blacklist = set() +try: + import wav2letter +except ImportError: + blacklist.add("ASG_loss.py") + for file in os.listdir(os.path.dirname(__file__)): - if file.endswith('.py') and not file.startswith('_'): - criterion_name = file[:file.find('.py')] - importlib.import_module('examples.speech_recognition.criterions.' + criterion_name) + if file.endswith(".py") and not file.startswith("_") and file not in blacklist: + criterion_name = file[: file.find(".py")] + importlib.import_module( + "examples.speech_recognition.criterions." + criterion_name + ) diff --git a/examples/speech_recognition/data/replabels.py b/examples/speech_recognition/data/replabels.py new file mode 100644 index 0000000000..d76bda7aef --- /dev/null +++ b/examples/speech_recognition/data/replabels.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +""" +Replabel transforms for use with wav2letter's ASG criterion. +""" + + +def replabel_symbol(i): + """ + Replabel symbols used in wav2letter, currently just "1", "2", ... 
+ This prevents training with numeral tokens, so this might change in the future + """ + return str(i) + + +def pack_replabels(tokens, dictionary, max_reps): + """ + Pack a token sequence so that repeated symbols are replaced by replabels + """ + if len(tokens) == 0 or max_reps <= 0: + return tokens + + replabel_value_to_idx = [0] * (max_reps + 1) + for i in range(1, max_reps + 1): + replabel_value_to_idx[i] = dictionary.index(replabel_symbol(i)) + + result = [] + prev_token = -1 + num_reps = 0 + for token in tokens: + if token == prev_token and num_reps < max_reps: + num_reps += 1 + else: + if num_reps > 0: + result.append(replabel_value_to_idx[num_reps]) + num_reps = 0 + result.append(token) + prev_token = token + if num_reps > 0: + result.append(replabel_value_to_idx[num_reps]) + return result + + +def unpack_replabels(tokens, dictionary, max_reps): + """ + Unpack a token sequence so that replabels are replaced by repeated symbols + """ + if len(tokens) == 0 or max_reps <= 0: + return tokens + + replabel_idx_to_value = {} + for i in range(1, max_reps + 1): + replabel_idx_to_value[dictionary.index(replabel_symbol(i))] = i + + result = [] + prev_token = -1 + for token in tokens: + try: + for _ in range(replabel_idx_to_value[token]): + result.append(prev_token) + prev_token = -1 + except KeyError: + result.append(token) + prev_token = token + return result diff --git a/examples/speech_recognition/infer.py b/examples/speech_recognition/infer.py index ce5f4f7654..909400b589 100644 --- a/examples/speech_recognition/infer.py +++ b/examples/speech_recognition/infer.py @@ -9,11 +9,12 @@ """ import logging +import math import os import sentencepiece as spm import torch -from fairseq import options, progress_bar, utils, tasks +from fairseq import checkpoint_utils, options, progress_bar, utils, tasks from fairseq.meters import StopwatchMeter, TimeMeter from fairseq.utils import import_user_module @@ -23,8 +24,6 @@ def add_asr_eval_argument(parser): - parser.add_argument("--ctc", action="store_true", help="decode a ctc model") - parser.add_argument("--rnnt", default=False, help="decode a rnnt model") parser.add_argument("--kspmodel", default=None, help="sentence piece model") parser.add_argument( "--wfstlm", default=None, help="wfstlm on dictonary output units" @@ -36,14 +35,24 @@ def add_asr_eval_argument(parser): output units", ) parser.add_argument( + "--lm-weight", "--lm_weight", + type=float, default=0.2, - help="weight for wfstlm while interpolating\ -with neural score", + help="weight for lm while interpolating with neural score", ) parser.add_argument( "--rnnt_len_penalty", default=-0.5, help="rnnt length penalty on word level" ) + parser.add_argument( + "--w2l-decoder", choices=["viterbi", "kenlm"], help="use a w2l decoder" + ) + parser.add_argument("--lexicon", help="lexicon for w2l decoder") + parser.add_argument("--kenlm-model", help="kenlm model for w2l decoder") + parser.add_argument("--beam-threshold", type=float, default=25.0) + parser.add_argument("--word-score", type=float, default=1.0) + parser.add_argument("--unk-weight", type=float, default=-math.inf) + parser.add_argument("--sil-weight", type=float, default=0.0) return parser @@ -72,29 +81,21 @@ def get_dataset_itr(args, task): ).next_epoch_itr(shuffle=False) -def process_predictions(args, hypos, sp, tgt_dict, target_tokens, res_files, speaker, id): +def process_predictions( + args, hypos, sp, tgt_dict, target_tokens, res_files, speaker, id +): for hypo in hypos[: min(len(hypos), args.nbest)]: hyp_pieces = 
tgt_dict.string(hypo["tokens"].int().cpu()) hyp_words = sp.DecodePieces(hyp_pieces.split()) print( - "{} ({}-{})".format(hyp_pieces, speaker, id), - file=res_files["hypo.units"], - ) - print( - "{} ({}-{})".format(hyp_words, speaker, id), - file=res_files["hypo.words"], + "{} ({}-{})".format(hyp_pieces, speaker, id), file=res_files["hypo.units"] ) + print("{} ({}-{})".format(hyp_words, speaker, id), file=res_files["hypo.words"]) tgt_pieces = tgt_dict.string(target_tokens) tgt_words = sp.DecodePieces(tgt_pieces.split()) - print( - "{} ({}-{})".format(tgt_pieces, speaker, id), - file=res_files["ref.units"], - ) - print( - "{} ({}-{})".format(tgt_words, speaker, id), - file=res_files["ref.words"], - ) + print("{} ({}-{})".format(tgt_pieces, speaker, id), file=res_files["ref.units"]) + print("{} ({}-{})".format(tgt_words, speaker, id), file=res_files["ref.words"]) # only score top hypothesis if not args.quiet: logger.debug("HYPO:" + hyp_words) @@ -120,6 +121,30 @@ def get_res_file(file_prefix): } +def load_models_and_criterions(filenames, arg_overrides=None, task=None): + models = [] + criterions = [] + for filename in filenames: + if not os.path.exists(filename): + raise IOError("Model file not found: {}".format(filename)) + state = checkpoint_utils.load_checkpoint_to_cpu(filename, arg_overrides) + + args = state["args"] + if task is None: + task = tasks.setup_task(args) + + # build model for ensemble + model = task.build_model(args) + model.load_state_dict(state["model"], strict=True) + models.append(model) + + criterion = task.build_criterion(args) + if "criterion" in state: + criterion.load_state_dict(state["criterion"], strict=True) + criterions.append(criterion) + return models, criterions, args + + def optimize_models(args, use_cuda, models): """Optimize ensemble for generation """ @@ -156,22 +181,22 @@ def main(args): # Set dictionary tgt_dict = task.target_dictionary - if args.ctc or args.rnnt: - tgt_dict.add_symbol("") - if args.ctc: - logger.info("| decoding a ctc model") - if args.rnnt: - logger.info("| decoding a rnnt model") + logger.info("| decoding with criterion {}".format(args.criterion)) # Load ensemble logger.info("| loading model(s) from {}".format(args.path)) - models, _model_args = utils.load_ensemble_for_inference( + models, criterions, _model_args = load_models_and_criterions( args.path.split(":"), - task, - model_arg_overrides=eval(args.model_overrides), # noqa + arg_overrides=eval(args.model_overrides), # noqa + task=task, ) optimize_models(args, use_cuda, models) + # hack to pass transitions to W2lDecoder + if args.criterion == "asg_loss": + trans = criterions[0].asg.trans.data + args.asg_transitions = torch.flatten(trans).tolist() + # Load dataset (possibly sharded) itr = get_dataset_itr(args, task) @@ -185,7 +210,7 @@ def main(args): os.makedirs(args.results_path) sp = spm.SentencePieceProcessor() - sp.Load(os.path.join(args.data, 'spm.model')) + sp.Load(os.path.join(args.data, "spm.model")) res_files = prepare_result_files(args) with progress_bar.build_progress_bar(args, itr) as t: @@ -204,7 +229,7 @@ def main(args): num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos) gen_timer.stop(num_generated_tokens) - for i, sample_id in enumerate(sample['id'].tolist()): + for i, sample_id in enumerate(sample["id"].tolist()): speaker = task.dataset(args.gen_subset).speakers[int(sample_id)] id = task.dataset(args.gen_subset).ids[int(sample_id)] target_tokens = ( diff --git a/examples/speech_recognition/models/w2l_conv_glu_enc.py 
b/examples/speech_recognition/models/w2l_conv_glu_enc.py new file mode 100644 index 0000000000..31cf3401b3 --- /dev/null +++ b/examples/speech_recognition/models/w2l_conv_glu_enc.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from fairseq.models import ( + FairseqEncoder, + FairseqEncoderModel, + register_model, + register_model_architecture, +) + + +default_conv_enc_config = """[ + (400, 13, 170, 0.2), + (440, 14, 0, 0.214), + (484, 15, 0, 0.22898), + (532, 16, 0, 0.2450086), + (584, 17, 0, 0.262159202), + (642, 18, 0, 0.28051034614), + (706, 19, 0, 0.30014607037), + (776, 20, 0, 0.321156295296), + (852, 21, 0, 0.343637235966), + (936, 22, 0, 0.367691842484), + (1028, 23, 0, 0.393430271458), + (1130, 24, 0, 0.42097039046), + (1242, 25, 0, 0.450438317792), + (1366, 26, 0, 0.481969000038), + (1502, 27, 0, 0.51570683004), + (1652, 28, 0, 0.551806308143), + (1816, 29, 0, 0.590432749713), +]""" + + +@register_model("asr_w2l_conv_glu_encoder") +class W2lConvGluEncoderModel(FairseqEncoderModel): + def __init__(self, encoder): + super().__init__(encoder) + + @staticmethod + def add_args(parser): + """Add model-specific arguments to the parser.""" + parser.add_argument( + "--input-feat-per-channel", + type=int, + metavar="N", + help="encoder input dimension per input channel", + ) + parser.add_argument( + "--in-channels", + type=int, + metavar="N", + help="number of encoder input channels", + ) + parser.add_argument( + "--conv-enc-config", + type=str, + metavar="EXPR", + help=""" + an array of tuples each containing the configuration of one conv layer + [(out_channels, kernel_size, padding, dropout), ...] 
+ """, + ) + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + conv_enc_config = getattr(args, "conv_enc_config", default_conv_enc_config) + encoder = W2lConvGluEncoder( + vocab_size=len(task.target_dictionary), + input_feat_per_channel=args.input_feat_per_channel, + in_channels=args.in_channels, + conv_enc_config=eval(conv_enc_config), + ) + return cls(encoder) + + def get_normalized_probs(self, net_output, log_probs, sample=None): + lprobs = super().get_normalized_probs(net_output, log_probs, sample) + lprobs.batch_first = False + return lprobs + + +class W2lConvGluEncoder(FairseqEncoder): + def __init__( + self, vocab_size, input_feat_per_channel, in_channels, conv_enc_config + ): + super().__init__(None) + + self.input_dim = input_feat_per_channel + if in_channels != 1: + raise ValueError("only 1 input channel is currently supported") + + self.conv_layers = nn.ModuleList() + self.linear_layers = nn.ModuleList() + self.dropouts = [] + cur_channels = input_feat_per_channel + + for out_channels, kernel_size, padding, dropout in conv_enc_config: + layer = nn.Conv1d(cur_channels, out_channels, kernel_size, padding=padding) + layer.weight.data.mul_(math.sqrt(3)) # match wav2letter init + self.conv_layers.append(nn.utils.weight_norm(layer)) + self.dropouts.append(dropout) + if out_channels % 2 != 0: + raise ValueError("odd # of out_channels is incompatible with GLU") + cur_channels = out_channels // 2 # halved by GLU + + for out_channels in [2 * cur_channels, vocab_size]: + layer = nn.Linear(cur_channels, out_channels) + layer.weight.data.mul_(math.sqrt(3)) + self.linear_layers.append(nn.utils.weight_norm(layer)) + cur_channels = out_channels // 2 + + def forward(self, src_tokens, src_lengths, **kwargs): + + """ + src_tokens: padded tensor (B, T, C * feat) + src_lengths: tensor of original lengths of input utterances (B,) + """ + B, T, _ = src_tokens.size() + x = src_tokens.transpose(1, 2).contiguous() # (B, feat, T) assuming C == 1 + + for layer_idx in range(len(self.conv_layers)): + x = self.conv_layers[layer_idx](x) + x = F.glu(x, dim=1) + x = F.dropout(x, p=self.dropouts[layer_idx], training=self.training) + + x = x.transpose(1, 2).contiguous() # (B, T, 908) + x = self.linear_layers[0](x) + x = F.glu(x, dim=2) + x = F.dropout(x, p=self.dropouts[-1]) + x = self.linear_layers[1](x) + + assert x.size(0) == B + assert x.size(1) == T + + encoder_out = x.transpose(0, 1) # (T, B, vocab_size) + + # need to debug this -- find a simpler/elegant way in pytorch APIs + encoder_padding_mask = ( + torch.arange(T).view(1, T).expand(B, -1).to(x.device) + >= src_lengths.view(B, 1).expand(-1, T) + ).t() # (B x T) -> (T x B) + + return { + "encoder_out": encoder_out, # (T, B, vocab_size) + "encoder_padding_mask": encoder_padding_mask, # (T, B) + } + + def reorder_encoder_out(self, encoder_out, new_order): + encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select( + 1, new_order + ) + encoder_out["encoder_padding_mask"] = encoder_out[ + "encoder_padding_mask" + ].index_select(1, new_order) + return encoder_out + + def max_positions(self): + """Maximum input length supported by the encoder.""" + return (1e6, 1e6) # an arbitrary large number + + +@register_model_architecture("asr_w2l_conv_glu_encoder", "w2l_conv_glu_enc") +def w2l_conv_glu_enc(args): + args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 80) + args.in_channels = getattr(args, "in_channels", 1) + args.conv_enc_config = getattr(args, "conv_enc_config", default_conv_enc_config) 
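One detail worth calling out from the encoder above: the conv layers and the first linear layer are each followed by a GLU, which consumes half of the produced channels as gates, hence the `out_channels // 2` bookkeeping and the `ValueError` on an odd `out_channels`. A quick standalone check (shapes picked arbitrarily):

```python
import torch
import torch.nn.functional as F

x = torch.randn(4, 8, 100)   # (batch, channels, time); channel dim must be even
y = F.glu(x, dim=1)          # first half of channels gated by sigmoid of second half
print(y.shape)               # torch.Size([4, 4, 100]): channel count is halved
```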
diff --git a/examples/speech_recognition/tasks/speech_recognition.py b/examples/speech_recognition/tasks/speech_recognition.py index 8c974aa720..699fa4a290 100644 --- a/examples/speech_recognition/tasks/speech_recognition.py +++ b/examples/speech_recognition/tasks/speech_recognition.py @@ -11,6 +11,7 @@ from fairseq.data import Dictionary from fairseq.tasks import FairseqTask, register_task from examples.speech_recognition.data import AsrDataset +from examples.speech_recognition.data.replabels import replabel_symbol def get_asr_dataset_from_json(data_json_path, tgt_dict): @@ -55,16 +56,12 @@ def get_asr_dataset_from_json(data_json_path, tgt_dict): speakers.append(m.group(1) + "_" + m.group(2)) frame_sizes = [s[1]["input"]["length_ms"] for s in sorted_samples] tgt = [ - torch.LongTensor( - [int(i) for i in s[1]["output"]["tokenid"].split(", ")] - ) + torch.LongTensor([int(i) for i in s[1]["output"]["tokenid"].split(", ")]) for s in sorted_samples ] # append eos tgt = [torch.cat([t, torch.LongTensor([tgt_dict.eos()])]) for t in tgt] - return AsrDataset( - aud_paths, frame_sizes, tgt, tgt_dict, ids, speakers - ) + return AsrDataset(aud_paths, frame_sizes, tgt, tgt_dict, ids, speakers) @register_task("speech_recognition") @@ -77,6 +74,9 @@ class SpeechRecognitionTask(FairseqTask): def add_args(parser): """Add task-specific arguments to the parser.""" parser.add_argument("data", help="path to data directory") + parser.add_argument( + "--silence-token", default="\u2581", help="token for silence (used by w2l)" + ) def __init__(self, args, tgt_dict): super().__init__(args) @@ -90,6 +90,12 @@ def setup_task(cls, args, **kwargs): raise FileNotFoundError("Dict not found: {}".format(dict_path)) tgt_dict = Dictionary.load(dict_path) + if args.criterion == "ctc_loss": + tgt_dict.add_symbol("") + elif args.criterion == "asg_loss": + for i in range(1, args.max_replabel + 1): + tgt_dict.add_symbol(replabel_symbol(i)) + print("| dictionary: {} types".format(len(tgt_dict))) return cls(args, tgt_dict) @@ -100,8 +106,20 @@ def load_dataset(self, split, combine=False, **kwargs): split (str): name of the split (e.g., train, valid, test) """ data_json_path = os.path.join(self.args.data, "{}.json".format(split)) - self.datasets[split] = get_asr_dataset_from_json( - data_json_path, self.tgt_dict) + self.datasets[split] = get_asr_dataset_from_json(data_json_path, self.tgt_dict) + + def build_generator(self, args): + w2l_decoder = getattr(args, "w2l_decoder", None) + if w2l_decoder == "viterbi": + from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder + + return W2lViterbiDecoder(args, self.target_dictionary) + elif w2l_decoder == "kenlm": + from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder + + return W2lKenLMDecoder(args, self.target_dictionary) + else: + return super().build_generator(args) @property def target_dictionary(self): diff --git a/examples/speech_recognition/w2l_decoder.py b/examples/speech_recognition/w2l_decoder.py new file mode 100644 index 0000000000..141d41d6ca --- /dev/null +++ b/examples/speech_recognition/w2l_decoder.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +""" +Wav2letter decoders. 
+""" +import math +import itertools as it +import torch +from fairseq import utils +from examples.speech_recognition.data.replabels import unpack_replabels +from wav2letter.common import create_word_dict, load_words +from wav2letter.criterion import CpuViterbiPath, get_data_ptr_as_bytes +from wav2letter.decoder import ( + CriterionType, + DecoderOptions, + KenLM, + SmearingMode, + Trie, + WordLMDecoder, +) + + +class W2lDecoder(object): + def __init__(self, args, tgt_dict): + self.tgt_dict = tgt_dict + self.vocab_size = len(tgt_dict) + self.nbest = args.nbest + + # criterion-specific init + if args.criterion == "ctc_loss": + self.criterion_type = CriterionType.CTC + self.blank = tgt_dict.index("") + self.asg_transitions = None + elif args.criterion == "asg_loss": + self.criterion_type = CriterionType.ASG + self.blank = -1 + self.asg_transitions = args.asg_transitions + self.max_replabel = args.max_replabel + assert len(self.asg_transitions) == self.vocab_size ** 2 + else: + raise RuntimeError(f"unknown criterion: {args.criterion}") + + def generate(self, models, sample, prefix_tokens=None): + """Generate a batch of inferences.""" + # model.forward normally channels prev_output_tokens into the decoder + # separately, but SequenceGenerator directly calls model.encoder + encoder_input = { + k: v for k, v in sample["net_input"].items() if k != "prev_output_tokens" + } + emissions = self.get_emissions(models, encoder_input) + return self.decode(emissions) + + def get_emissions(self, models, encoder_input): + """Run encoder and normalize emissions""" + encoder_out = models[0].encoder(**encoder_input) + if self.criterion_type == CriterionType.CTC: + emissions = models[0].get_normalized_probs(encoder_out, log_probs=True) + elif self.criterion_type == CriterionType.ASG: + emissions = encoder_out["encoder_out"] + return emissions.transpose(0, 1).float().cpu().contiguous() + + def get_tokens(self, idxs): + """Normalize tokens by handling CTC blank, ASG replabels, etc.""" + idxs = (g[0] for g in it.groupby(idxs)) + idxs = filter(lambda x: x >= 0, idxs) + if self.criterion_type == CriterionType.CTC: + idxs = filter(lambda x: x != self.blank, idxs) + elif self.criterion_type == CriterionType.ASG: + idxs = unpack_replabels(list(idxs), self.tgt_dict, self.max_replabel) + return torch.LongTensor(list(idxs)) + + +class W2lViterbiDecoder(W2lDecoder): + def __init__(self, args, tgt_dict): + super().__init__(args, tgt_dict) + + def decode(self, emissions): + B, T, N = emissions.size() + hypos = [] + if self.asg_transitions is None: + transitions = torch.FloatTensor(N, N).zero_() + else: + transitions = torch.FloatTensor(self.asg_transitions).view(N, N) + viterbi_path = torch.IntTensor(B, T) + workspace = torch.ByteTensor(CpuViterbiPath.get_workspace_size(B, T, N)) + CpuViterbiPath.compute( + B, + T, + N, + get_data_ptr_as_bytes(emissions), + get_data_ptr_as_bytes(transitions), + get_data_ptr_as_bytes(viterbi_path), + get_data_ptr_as_bytes(workspace), + ) + return [ + [{"tokens": self.get_tokens(viterbi_path[b].tolist()), "score": 0}] + for b in range(B) + ] + + +class W2lKenLMDecoder(W2lDecoder): + def __init__(self, args, tgt_dict): + super().__init__(args, tgt_dict) + + self.silence = tgt_dict.index(args.silence_token) + + self.lexicon = load_words(args.lexicon) + self.word_dict = create_word_dict(self.lexicon) + self.unk_word = self.word_dict.get_index("") + + self.lm = KenLM(args.kenlm_model, self.word_dict) + self.trie = Trie(self.vocab_size, self.silence) + + start_state = self.lm.start(False) + for 
word, spellings in self.lexicon.items(): + word_idx = self.word_dict.get_index(word) + _, score = self.lm.score(start_state, word_idx) + for spelling in spellings: + spelling_idxs = [tgt_dict.index(token) for token in spelling] + self.trie.insert(spelling_idxs, word_idx, score) + self.trie.smear(SmearingMode.MAX) + + self.decoder_opts = DecoderOptions( + args.beam, + args.beam_threshold, + args.lm_weight, + args.word_score, + args.unk_weight, + False, + args.sil_weight, + self.criterion_type, + ) + + self.decoder = WordLMDecoder( + self.decoder_opts, + self.trie, + self.lm, + self.silence, + self.blank, + self.unk_word, + self.asg_transitions, + ) + + def decode(self, emissions): + B, T, N = emissions.size() + hypos = [] + for b in range(B): + emissions_ptr = emissions.data_ptr() + 4 * b * emissions.stride(0) + nbest_results = self.decoder.decode(emissions_ptr, T, N)[: self.nbest] + hypos.append( + [ + {"tokens": self.get_tokens(result.tokens), "score": result.score} + for result in nbest_results + ] + ) + return hypos From c4893ca6eaefda846088f27df1fd38471a9c8640 Mon Sep 17 00:00:00 2001 From: Dmytro Okhonko Date: Thu, 10 Oct 2019 11:12:38 -0700 Subject: [PATCH 175/213] Add ctc loss to ASR task (#1233) Summary: Adds CTC loss and corresponding transformer ctc based models. Tested with `CUDA_VISIBLE_DEVICES=0 python train.py $DATA_PATH --save-dir $SAVE_DIR --max-epoch 30 --task speech_recognition --arch vggtransformer_enc_1 --optimizer adadelta --lr 1.0 --adadelta-eps 1e-8 --adadelta-rho 0.95 --clip-norm 10.0 --max-tokens 10000 --log-format json --log-interval 1 --criterion ctc_loss --user-dir examples/speech_recognition/ --validate-interval=10` Pull Request resolved: https://github.com/pytorch/fairseq/pull/1233 Reviewed By: jcai1 Differential Revision: D17856824 Pulled By: okhonko fbshipit-source-id: f3eac64d3fdd0c37cf8c539dd360cfb610d8a6ef --- .../speech_recognition/criterions/CTC_loss.py | 194 +++++++++ .../speech_recognition/data/data_utils.py | 36 ++ .../models/vggtransformer.py | 168 ++++++++ .../speech_recognition/utils/wer_utils.py | 381 ++++++++++++++++++ 4 files changed, 779 insertions(+) create mode 100644 examples/speech_recognition/criterions/CTC_loss.py create mode 100644 examples/speech_recognition/utils/wer_utils.py diff --git a/examples/speech_recognition/criterions/CTC_loss.py b/examples/speech_recognition/criterions/CTC_loss.py new file mode 100644 index 0000000000..7d35c937eb --- /dev/null +++ b/examples/speech_recognition/criterions/CTC_loss.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging +import math +from itertools import groupby + +import torch +import torch.nn.functional as F +from fairseq import utils +from fairseq.criterions import FairseqCriterion, register_criterion +from examples.speech_recognition.data.data_utils import encoder_padding_mask_to_lengths +from examples.speech_recognition.utils.wer_utils import Code, EditDistance, Token + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + + +def arr_to_toks(arr): + toks = [] + for a in arr: + toks.append(Token(str(a), 0.0, 0.0)) + return toks + + +def compute_ctc_uer(logprobs, targets, input_lengths, target_lengths, blank_idx): + """ + Computes utterance error rate for CTC outputs + + Args: + logprobs: (Torch.tensor) N, T1, D tensor of log probabilities out + of the encoder + targets: (Torch.tensor) N, T2 tensor of targets + input_lengths: (Torch.tensor) lengths of inputs for each sample + target_lengths: (Torch.tensor) lengths of targets for each sample + blank_idx: (integer) id of blank symbol in target dictionary + + Returns: + batch_errors: (float) errors in the batch + batch_total: (float) total number of valid samples in batch + """ + batch_errors = 0.0 + batch_total = 0.0 + for b in range(logprobs.shape[0]): + predicted = logprobs[b][: input_lengths[b]].argmax(1).tolist() + target = targets[b][: target_lengths[b]].tolist() + # dedup predictions + predicted = [p[0] for p in groupby(predicted)] + # remove blanks + nonblanks = [] + for p in predicted: + if p != blank_idx: + nonblanks.append(p) + predicted = nonblanks + + # compute the alignment based on EditDistance + alignment = EditDistance(False).align( + arr_to_toks(predicted), arr_to_toks(target) + ) + + # compute the number of errors + # note that alignment.codes can also be used for computing + # deletion, insersion and substitution error breakdowns in future + for a in alignment.codes: + if a != Code.match: + batch_errors += 1 + batch_total += len(target) + + return batch_errors, batch_total + + +@register_criterion("ctc_loss") +class CTCCriterion(FairseqCriterion): + def __init__(self, args, task): + super().__init__(args, task) + self.blank_idx = task.target_dictionary.index("") + self.pad_idx = task.target_dictionary.pad() + self.task = task + + @staticmethod + def add_args(parser): + parser.add_argument( + "--use-source-side-sample-size", + action="store_true", + default=False, + help=( + "when compute average loss, using number of source tokens " + + "as denominator. " + + "This argument will be no-op if sentence-avg is used." + ), + ) + + def forward(self, model, sample, reduce=True, log_probs=True): + """Compute the loss for the given sample. + + Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + net_output = model(**sample["net_input"]) + lprobs = model.get_normalized_probs(net_output, log_probs=log_probs) + if not hasattr(lprobs, "batch_first"): + logging.warning( + "ERROR: we need to know whether " + "batch first for the encoder output; " + "you need to set batch_first attribute for the return value of " + "model.get_normalized_probs. Now, we assume this is true, but " + "in the future, we will raise exception instead. 
" + ) + + batch_first = getattr(lprobs, "batch_first", True) + + if not batch_first: + max_seq_len = lprobs.size(0) + bsz = lprobs.size(1) + else: + max_seq_len = lprobs.size(1) + bsz = lprobs.size(0) + device = net_output["encoder_out"].device + + input_lengths = encoder_padding_mask_to_lengths( + net_output["encoder_padding_mask"], max_seq_len, bsz, device + ) + target_lengths = sample["target_lengths"] + targets = sample["target"] + + if batch_first: + # N T D -> T N D (F.ctc_loss expects this) + lprobs = lprobs.transpose(0, 1) + + pad_mask = sample["target"] != self.pad_idx + targets_flat = targets.masked_select(pad_mask) + + loss = F.ctc_loss( + lprobs, + targets_flat, + input_lengths, + target_lengths, + blank=self.blank_idx, + reduction="sum", + zero_infinity=True, + ) + + lprobs = lprobs.transpose(0, 1) # T N D -> N T D + errors, total = compute_ctc_uer( + lprobs, targets, input_lengths, target_lengths, self.blank_idx + ) + + if self.args.sentence_avg: + sample_size = sample["target"].size(0) + else: + if self.args.use_source_side_sample_size: + sample_size = torch.sum(input_lengths).item() + else: + sample_size = sample["ntokens"] + + logging_output = { + "loss": utils.item(loss.data) if reduce else loss.data, + "ntokens": sample["ntokens"], + "nsentences": sample["target"].size(0), + "sample_size": sample_size, + "errors": errors, + "total": total, + "nframes": torch.sum(sample["net_input"]["src_lengths"]).item(), + } + return loss, sample_size, logging_output + + @staticmethod + def aggregate_logging_outputs(logging_outputs): + """Aggregate logging outputs from data parallel training.""" + loss_sum = sum(log.get("loss", 0) for log in logging_outputs) + ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) + nsentences = sum(log.get("nsentences", 0) for log in logging_outputs) + sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) + errors = sum(log.get("errors", 0) for log in logging_outputs) + total = sum(log.get("total", 0) for log in logging_outputs) + nframes = sum(log.get("nframes", 0) for log in logging_outputs) + agg_output = { + "loss": loss_sum / sample_size / math.log(2), + "ntokens": ntokens, + "nsentences": nsentences, + "nframes": nframes, + "sample_size": sample_size, + "acc": 100.0 - min(errors * 100.0 / total, 100.0), + } + if sample_size != ntokens: + agg_output["nll_loss"] = loss_sum / ntokens / math.log(2) + return agg_output diff --git a/examples/speech_recognition/data/data_utils.py b/examples/speech_recognition/data/data_utils.py index 5380461651..03c41f47d9 100644 --- a/examples/speech_recognition/data/data_utils.py +++ b/examples/speech_recognition/data/data_utils.py @@ -58,3 +58,39 @@ def lengths_to_encoder_padding_mask(lengths, batch_first=False): return encoder_padding_mask.t(), max_lengths else: return encoder_padding_mask, max_lengths + + +def encoder_padding_mask_to_lengths( + encoder_padding_mask, max_lengths, batch_size, device +): + """ + convert encoder_padding_mask (2-D binary tensor) to a 1-D tensor + + Conventionally, encoder output contains a encoder_padding_mask, which is + a 2-D mask in a shape (T, B), whose (t, b) element indicate whether + encoder_out[t, b] is a valid output (=0) or not (=1). 
Occasionally, we + need to convert this mask tensor to a 1-D tensor in shape (B, ), where + [b] denotes the valid length of b-th sequence + + Args: + encoder_padding_mask: a (T, B)-shaped binary tensor or None; if None, + indicating all are valid + Return: + seq_lengths: a (B,)-shaped tensor, where its (b, )-th element is the + number of valid elements of b-th sequence + + max_lengths: maximum length of all sequence, if encoder_padding_mask is + not None, max_lengths must equal to encoder_padding_mask.size(0) + + batch_size: batch size; if encoder_padding_mask is + not None, max_lengths must equal to encoder_padding_mask.size(1) + + device: which device to put the result on + """ + if encoder_padding_mask is None: + return torch.Tensor([max_lengths] * batch_size).to(torch.int32).to(device) + + assert encoder_padding_mask.size(0) == max_lengths, "max_lengths does not match" + assert encoder_padding_mask.size(1) == batch_size, "batch_size does not match" + + return max_lengths - torch.sum(encoder_padding_mask, dim=0) diff --git a/examples/speech_recognition/models/vggtransformer.py b/examples/speech_recognition/models/vggtransformer.py index 3a078ec6ff..a39300fc6e 100644 --- a/examples/speech_recognition/models/vggtransformer.py +++ b/examples/speech_recognition/models/vggtransformer.py @@ -12,6 +12,7 @@ from fairseq import utils from fairseq.models import ( FairseqEncoder, + FairseqEncoderModel, FairseqIncrementalDecoder, FairseqEncoderDecoderModel, register_model, @@ -709,6 +710,141 @@ def _transpose_if_inference(self, x, incremental_state): x = x.transpose(0, 1) return x +@register_model("asr_vggtransformer_encoder") +class VGGTransformerEncoderModel(FairseqEncoderModel): + def __init__(self, encoder): + super().__init__(encoder) + + @staticmethod + def add_args(parser): + """Add model-specific arguments to the parser.""" + parser.add_argument( + "--input-feat-per-channel", + type=int, + metavar="N", + help="encoder input dimension per input channel", + ) + parser.add_argument( + "--vggblock-enc-config", + type=str, + metavar="EXPR", + help=""" + an array of tuples each containing the configuration of one vggblock + [(out_channels, conv_kernel_size, pooling_kernel_size,num_conv_layers), ...] 
+ """, + ) + parser.add_argument( + "--transformer-enc-config", + type=str, + metavar="EXPR", + help=""" + a tuple containing the configuration of the Transformer layers + configurations: + [(input_dim, + num_heads, + ffn_dim, + normalize_before, + dropout, + attention_dropout, + relu_dropout), ]""", + ) + parser.add_argument( + "--enc-output-dim", + type=int, + metavar="N", + help="encoder output dimension, projecting the LSTM output", + ) + parser.add_argument( + "--in-channels", + type=int, + metavar="N", + help="number of encoder input channels", + ) + parser.add_argument( + "--transformer-context", + type=str, + metavar="EXPR", + help=""" + either None or a tuple of two ints, indicating left/right context a + transformer can have access to""", + ) + parser.add_argument( + "--transformer-sampling", + type=str, + metavar="EXPR", + help=""" + either None or a tuple of ints, indicating sampling factor in each layer""", + ) + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + base_architecture_enconly(args) + encoder = VGGTransformerEncoderOnly( + vocab_size=len(task.target_dictionary), + input_feat_per_channel=args.input_feat_per_channel, + vggblock_config=eval(args.vggblock_enc_config), + transformer_config=eval(args.transformer_enc_config), + encoder_output_dim=args.enc_output_dim, + in_channels=args.in_channels, + transformer_context=eval(args.transformer_context), + transformer_sampling=eval(args.transformer_sampling), + ) + return cls(encoder) + + def get_normalized_probs(self, net_output, log_probs, sample=None): + # net_output['encoder_out'] is a (T, B, D) tensor + lprobs = super().get_normalized_probs(net_output, log_probs, sample) + # lprobs is a (T, B, D) tensor + # we need to transoose to get (B, T, D) tensor + lprobs = lprobs.transpose(0, 1).contiguous() + lprobs.batch_first = True + return lprobs + + +class VGGTransformerEncoderOnly(VGGTransformerEncoder): + def __init__( + self, + vocab_size, + input_feat_per_channel, + vggblock_config=DEFAULT_ENC_VGGBLOCK_CONFIG, + transformer_config=DEFAULT_ENC_TRANSFORMER_CONFIG, + encoder_output_dim=512, + in_channels=1, + transformer_context=None, + transformer_sampling=None, + ): + super().__init__( + input_feat_per_channel=input_feat_per_channel, + vggblock_config=vggblock_config, + transformer_config=transformer_config, + encoder_output_dim=encoder_output_dim, + in_channels=in_channels, + transformer_context=transformer_context, + transformer_sampling=transformer_sampling, + ) + self.fc_out = Linear(self.encoder_output_dim, vocab_size) + + def forward(self, src_tokens, src_lengths, **kwargs): + """ + src_tokens: padded tensor (B, T, C * feat) + src_lengths: tensor of original lengths of input utterances (B,) + """ + + enc_out = super().forward(src_tokens, src_lengths) + x = self.fc_out(enc_out["encoder_out"]) + # x = F.log_softmax(x, dim=-1) + # Note: no need this line, because model.get_normalized_prob will call + # log_softmax + return { + "encoder_out": x, # (T, B, C) + "encoder_padding_mask": enc_out["encoder_padding_mask"], # (T, B) + } + + def max_positions(self): + """Maximum input length supported by the encoder.""" + return (1e6, 1e6) # an arbitrary large number + def Embedding(num_embeddings, embedding_dim, padding_idx): m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) @@ -836,3 +972,35 @@ def vggtransformer_base(args): # - FC: 512*5000 = 256K (assuming vocab size 5K) # In total: # ~65 M + + +# CTC models +def base_architecture_enconly(args): + 
args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 40) + args.vggblock_enc_config = getattr( + args, "vggblock_enc_config", "[(32, 3, 2, 2, True)] * 2" + ) + args.transformer_enc_config = getattr( + args, "transformer_enc_config", "((256, 4, 1024, True, 0.2, 0.2, 0.2),) * 2" + ) + args.enc_output_dim = getattr(args, "enc_output_dim", 512) + args.in_channels = getattr(args, "in_channels", 1) + args.transformer_context = getattr(args, "transformer_context", "None") + args.transformer_sampling = getattr(args, "transformer_sampling", "None") + + +@register_model_architecture("asr_vggtransformer_encoder", "vggtransformer_enc_1") +def vggtransformer_enc_1(args): + # vggtransformer_1 is the same as vggtransformer_enc_big, except the number + # of layers is increased to 16 + # keep it here for backward compatiablity purpose + args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 80) + args.vggblock_enc_config = getattr( + args, "vggblock_enc_config", "[(64, 3, 2, 2, True), (128, 3, 2, 2, True)]" + ) + args.transformer_enc_config = getattr( + args, + "transformer_enc_config", + "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 16", + ) + args.enc_output_dim = getattr(args, "enc_output_dim", 1024) diff --git a/examples/speech_recognition/utils/wer_utils.py b/examples/speech_recognition/utils/wer_utils.py new file mode 100644 index 0000000000..cf6f3d09ba --- /dev/null +++ b/examples/speech_recognition/utils/wer_utils.py @@ -0,0 +1,381 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from __future__ import absolute_import, division, print_function, unicode_literals + +import re +from collections import deque +from enum import Enum + +import numpy as np + + +""" + Utility modules for computation of Word Error Rate, + Alignments, as well as more granular metrics like + deletion, insersion and substitutions. 
+""" + + +class Code(Enum): + match = 1 + substitution = 2 + insertion = 3 + deletion = 4 + + +class Token(object): + def __init__(self, lbl="", st=np.nan, en=np.nan): + if np.isnan(st): + self.label, self.start, self.end = "", 0.0, 0.0 + else: + self.label, self.start, self.end = lbl, st, en + + +class AlignmentResult(object): + def __init__(self, refs, hyps, codes, score): + self.refs = refs # std::deque + self.hyps = hyps # std::deque + self.codes = codes # std::deque + self.score = score # float + + +def coordinate_to_offset(row, col, ncols): + return int(row * ncols + col) + + +def offset_to_row(offset, ncols): + return int(offset / ncols) + + +def offset_to_col(offset, ncols): + return int(offset % ncols) + + +def trimWhitespace(str): + return re.sub(" +", " ", re.sub(" *$", "", re.sub("^ *", "", str))) + + +def str2toks(str): + pieces = trimWhitespace(str).split(" ") + toks = [] + for p in pieces: + toks.append(Token(p, 0.0, 0.0)) + return toks + + +class EditDistance(object): + def __init__(self, time_mediated): + self.time_mediated_ = time_mediated + self.scores_ = np.nan # Eigen::Matrix + self.backtraces_ = ( + np.nan + ) # Eigen::Matrix backtraces_; + self.confusion_pairs_ = {} + + def cost(self, ref, hyp, code): + if self.time_mediated_: + if code == Code.match: + return abs(ref.start - hyp.start) + abs(ref.end - hyp.end) + elif code == Code.insertion: + return hyp.end - hyp.start + elif code == Code.deletion: + return ref.end - ref.start + else: # substitution + return abs(ref.start - hyp.start) + abs(ref.end - hyp.end) + 0.1 + else: + if code == Code.match: + return 0 + elif code == Code.insertion or code == Code.deletion: + return 3 + else: # substitution + return 4 + + def get_result(self, refs, hyps): + res = AlignmentResult(refs=deque(), hyps=deque(), codes=deque(), score=np.nan) + + num_rows, num_cols = self.scores_.shape + res.score = self.scores_[num_rows - 1, num_cols - 1] + + curr_offset = coordinate_to_offset(num_rows - 1, num_cols - 1, num_cols) + + while curr_offset != 0: + curr_row = offset_to_row(curr_offset, num_cols) + curr_col = offset_to_col(curr_offset, num_cols) + + prev_offset = self.backtraces_[curr_row, curr_col] + + prev_row = offset_to_row(prev_offset, num_cols) + prev_col = offset_to_col(prev_offset, num_cols) + + res.refs.appendleft(curr_row - 1) # Note: this was .push_front() in C++ + res.hyps.appendleft(curr_col - 1) + if curr_row - 1 == prev_row and curr_col == prev_col: + res.codes.appendleft(Code.deletion) + elif curr_row == prev_row and curr_col - 1 == prev_col: + res.codes.appendleft(Code.insertion) + else: + # assert(curr_row - 1 == prev_row and curr_col - 1 == prev_col) + ref_str = refs[res.refs[0]].label + hyp_str = hyps[res.hyps[0]].label + + if ref_str == hyp_str: + res.codes.appendleft(Code.match) + else: + res.codes.appendleft(Code.substitution) + + confusion_pair = "%s -> %s" % (ref_str, hyp_str) + if confusion_pair not in self.confusion_pairs_: + self.confusion_pairs_[confusion_pair] = 1 + else: + self.confusion_pairs_[confusion_pair] += 1 + + curr_offset = prev_offset + + return res + + def align(self, refs, hyps): + if len(refs) == 0 and len(hyps) == 0: + return np.nan + + # NOTE: we're not resetting the values in these matrices because every value + # will be overridden in the loop below. If this assumption doesn't hold, + # be sure to set all entries in self.scores_ and self.backtraces_ to 0. 
+ self.scores_ = np.zeros((len(refs) + 1, len(hyps) + 1)) + self.backtraces_ = np.zeros((len(refs) + 1, len(hyps) + 1)) + + num_rows, num_cols = self.scores_.shape + + for i in range(num_rows): + for j in range(num_cols): + if i == 0 and j == 0: + self.scores_[i, j] = 0.0 + self.backtraces_[i, j] = 0 + continue + + if i == 0: + self.scores_[i, j] = self.scores_[i, j - 1] + self.cost( + None, hyps[j - 1], Code.insertion + ) + self.backtraces_[i, j] = coordinate_to_offset(i, j - 1, num_cols) + continue + + if j == 0: + self.scores_[i, j] = self.scores_[i - 1, j] + self.cost( + refs[i - 1], None, Code.deletion + ) + self.backtraces_[i, j] = coordinate_to_offset(i - 1, j, num_cols) + continue + + # Below here both i and j are greater than 0 + ref = refs[i - 1] + hyp = hyps[j - 1] + best_score = self.scores_[i - 1, j - 1] + ( + self.cost(ref, hyp, Code.match) + if (ref.label == hyp.label) + else self.cost(ref, hyp, Code.substitution) + ) + + prev_row = i - 1 + prev_col = j - 1 + ins = self.scores_[i, j - 1] + self.cost(None, hyp, Code.insertion) + if ins < best_score: + best_score = ins + prev_row = i + prev_col = j - 1 + + delt = self.scores_[i - 1, j] + self.cost(ref, None, Code.deletion) + if delt < best_score: + best_score = delt + prev_row = i - 1 + prev_col = j + + self.scores_[i, j] = best_score + self.backtraces_[i, j] = coordinate_to_offset( + prev_row, prev_col, num_cols + ) + + return self.get_result(refs, hyps) + + +class WERTransformer(object): + def __init__(self, hyp_str, ref_str, verbose=True): + self.ed_ = EditDistance(False) + self.id2oracle_errs_ = {} + self.utts_ = 0 + self.words_ = 0 + self.insertions_ = 0 + self.deletions_ = 0 + self.substitutions_ = 0 + + self.process(["dummy_str", hyp_str, ref_str]) + + if verbose: + print("'%s' vs '%s'" % (hyp_str, ref_str)) + self.report_result() + + def process(self, input): # std::vector&& input + if len(input) < 3: + print( + "Input must be of the form ... 
, got ", + len(input), + " inputs:", + ) + return None + + # Align + # std::vector hyps; + # std::vector refs; + + hyps = str2toks(input[-2]) + refs = str2toks(input[-1]) + + alignment = self.ed_.align(refs, hyps) + if alignment is None: + print("Alignment is null") + return np.nan + + # Tally errors + ins = 0 + dels = 0 + subs = 0 + for code in alignment.codes: + if code == Code.substitution: + subs += 1 + elif code == Code.insertion: + ins += 1 + elif code == Code.deletion: + dels += 1 + + # Output + row = input + row.append(str(len(refs))) + row.append(str(ins)) + row.append(str(dels)) + row.append(str(subs)) + # print(row) + + # Accumulate + kIdIndex = 0 + kNBestSep = "/" + + pieces = input[kIdIndex].split(kNBestSep) + + if len(pieces) == 0: + print( + "Error splitting ", + input[kIdIndex], + " on '", + kNBestSep, + "', got empty list", + ) + return np.nan + + id = pieces[0] + if id not in self.id2oracle_errs_: + self.utts_ += 1 + self.words_ += len(refs) + self.insertions_ += ins + self.deletions_ += dels + self.substitutions_ += subs + self.id2oracle_errs_[id] = [ins, dels, subs] + else: + curr_err = ins + dels + subs + prev_err = np.sum(self.id2oracle_errs_[id]) + if curr_err < prev_err: + self.id2oracle_errs_[id] = [ins, dels, subs] + + return 0 + + def report_result(self): + # print("---------- Summary ---------------") + if self.words_ == 0: + print("No words counted") + return + + # 1-best + best_wer = ( + 100.0 + * (self.insertions_ + self.deletions_ + self.substitutions_) + / self.words_ + ) + + print( + "\tWER = %0.2f%% (%i utts, %i words, %0.2f%% ins, " + "%0.2f%% dels, %0.2f%% subs)" + % ( + best_wer, + self.utts_, + self.words_, + 100.0 * self.insertions_ / self.words_, + 100.0 * self.deletions_ / self.words_, + 100.0 * self.substitutions_ / self.words_, + ) + ) + + def wer(self): + if self.words_ == 0: + wer = np.nan + else: + wer = ( + 100.0 + * (self.insertions_ + self.deletions_ + self.substitutions_) + / self.words_ + ) + return wer + + def stats(self): + if self.words_ == 0: + stats = {} + else: + wer = ( + 100.0 + * (self.insertions_ + self.deletions_ + self.substitutions_) + / self.words_ + ) + stats = dict( + { + "wer": wer, + "utts": self.utts_, + "numwords": self.words_, + "ins": self.insertions_, + "dels": self.deletions_, + "subs": self.substitutions_, + "confusion_pairs": self.ed_.confusion_pairs_, + } + ) + return stats + + +def calc_wer(hyp_str, ref_str): + t = WERTransformer(hyp_str, ref_str, verbose=0) + return t.wer() + + +def calc_wer_stats(hyp_str, ref_str): + t = WERTransformer(hyp_str, ref_str, verbose=0) + return t.stats() + + +def get_wer_alignment_codes(hyp_str, ref_str): + """ + INPUT: hypothesis string, reference string + OUTPUT: List of alignment codes (intermediate results from WER computation) + """ + t = WERTransformer(hyp_str, ref_str, verbose=0) + return t.ed_.align(str2toks(ref_str), str2toks(hyp_str)).codes + + +def merge_counts(x, y): + # Merge two hashes which have 'counts' as their values + # This can be used for example to merge confusion pair counts + # conf_pairs = merge_counts(conf_pairs, stats['confusion_pairs']) + for k, v in y.items(): + if k not in x: + x[k] = 0 + x[k] += v + return x From cce92bdd5a53b7bb45524c70d03d3ba13eab5412 Mon Sep 17 00:00:00 2001 From: Jiatao Gu Date: Fri, 11 Oct 2019 07:37:13 -0700 Subject: [PATCH 176/213] add new_arange function + FIX BUGS of returning attn values Summary: Implementation of Levenshtein Transformer paper. Add a new helper function "new_arange" to create arange tensor easily. 
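For illustration only (not part of the diff), a minimal sketch of how the new helper behaves; it creates an arange tensor on the same device as its first argument, using that tensor's size when no explicit size is given:

    import torch
    from fairseq.utils import new_arange  # helper added by this commit

    x = torch.zeros(2, 5)
    new_arange(x)       # tensor([[0, 1, 2, 3, 4],
                        #         [0, 1, 2, 3, 4]])
    new_arange(x, 3)    # tensor([0, 1, 2]) -- an explicit size overrides x.size()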
Fix bugs of returning attn values for NAT models Delete files which are not necessary or experimental. Reviewed By: kahne Differential Revision: D17652009 fbshipit-source-id: 436bbb5d45de2f8067003232de4f2bd51e87719c --- fairseq/models/cmlm_transformer.py | 7 +- fairseq/models/insertion_transformer.py | 15 +- fairseq/models/levenshtein_transformer.py | 30 +-- .../models/nonautoregressive_transformer.py | 249 +----------------- fairseq/utils.py | 10 + 5 files changed, 27 insertions(+), 284 deletions(-) diff --git a/fairseq/models/cmlm_transformer.py b/fairseq/models/cmlm_transformer.py index f76c93fd0f..91a5a48a66 100644 --- a/fairseq/models/cmlm_transformer.py +++ b/fairseq/models/cmlm_transformer.py @@ -10,7 +10,7 @@ arXiv preprint arXiv:1904.09324 (2019). """ -import torch +from fairseq.utils import new_arange from fairseq.models import register_model, register_model_architecture from fairseq.models.nonautoregressive_transformer import NATransformerModel @@ -20,10 +20,7 @@ def _skeptical_unmasking(output_scores, output_masks, p): boundary_len = ( (output_masks.sum(1, keepdim=True).type_as(output_scores) - 2) * p ).long() - skeptical_mask = ( - torch.arange(output_masks.size(1), device=output_masks.device)[None, :] - < boundary_len - ) + skeptical_mask = new_arange(output_masks) < boundary_len return skeptical_mask.scatter(1, sorted_index, skeptical_mask) diff --git a/fairseq/models/insertion_transformer.py b/fairseq/models/insertion_transformer.py index 1296333b5e..f8a7a61dc5 100644 --- a/fairseq/models/insertion_transformer.py +++ b/fairseq/models/insertion_transformer.py @@ -6,7 +6,8 @@ import numpy as np import torch import torch.nn.functional as F - +from fairseq import libnat +from fairseq.utils import new_arange from fairseq.models import register_model, register_model_architecture from fairseq.models.levenshtein_transformer import ( LevenshteinTransformerDecoder, @@ -51,13 +52,6 @@ def compute_score_full(self, L, tau): def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx, vocab_size, tau=None): - try: - from fairseq import libnat - except ImportError as e: - import sys - sys.stderr.write('ERROR: missing libnat. 
run `pip install --editable .`\n') - raise e - B = in_tokens.size(0) T = in_tokens.size(1) V = vocab_size @@ -102,8 +96,7 @@ def _apply_ins_words(in_tokens, in_scores, word_ins_pred, word_ins_scores, paddi word_ins_scores.masked_fill_(padding_masks, 0.0) word_ins_pred.masked_fill_(padding_masks, padding_idx) - in_coords = torch.arange(in_tokens.size(1), device=in_tokens.device) - in_coords = in_coords.unsqueeze(0).repeat(in_tokens.size(0), 1).type_as(in_scores) + in_coords = new_arange(in_tokens).type_as(in_scores) # shift all padding predictions to infinite out_coords = (in_coords[:, 1:] - 0.5).masked_fill( @@ -188,7 +181,7 @@ def forward_decoder( cut_off = output_tokens.ne(self.pad).sum(1).max() output_tokens = output_tokens[:, :cut_off] output_scores = output_scores[:, :cut_off] - return {"output_tokens": output_tokens, "output_scores": output_scores} + return {"output_tokens": output_tokens, "output_scores": output_scores, "attn": None} class InsertionTransformerDecoder(LevenshteinTransformerDecoder): diff --git a/fairseq/models/levenshtein_transformer.py b/fairseq/models/levenshtein_transformer.py index 9f016dbb4a..18a171980c 100644 --- a/fairseq/models/levenshtein_transformer.py +++ b/fairseq/models/levenshtein_transformer.py @@ -5,7 +5,8 @@ import torch import torch.nn.functional as F - +from fairseq import libnat +from fairseq.utils import new_arange from fairseq.models import register_model, register_model_architecture from fairseq.models.model_utils import fill_tensors as _fill, skip_tensors as _skip from fairseq.models.transformer import ( @@ -18,13 +19,6 @@ def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx): - try: - from fairseq import libnat - except ImportError as e: - import sys - sys.stderr.write('ERROR: missing libnat. run `pip install --editable .`\n') - raise e - in_seq_len, out_seq_len = in_tokens.size(1), out_tokens.size(1) with torch.cuda.device_of(in_tokens): @@ -67,13 +61,6 @@ def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx): def _get_del_targets(in_tokens, out_tokens, padding_idx): - try: - from fairseq import libnat - except ImportError as e: - import sys - sys.stderr.write('ERROR: missing libnat. run `pip install --editable .`\n') - raise e - out_seq_len = out_tokens.size(1) with torch.cuda.device_of(in_tokens): @@ -100,13 +87,6 @@ def _get_del_targets(in_tokens, out_tokens, padding_idx): def _get_del_ins_targets(in_tokens, out_tokens, padding_idx): - try: - from fairseq import libnat - except ImportError as e: - import sys - sys.stderr.write('ERROR: missing libnat. 
run `pip install --editable .`\n') - raise e - in_seq_len, out_seq_len = in_tokens.size(1), out_tokens.size(1) with torch.cuda.device_of(in_tokens): @@ -156,7 +136,7 @@ def _apply_ins_masks( out_lengths = in_lengths + mask_ins_pred.sum(1) out_max_len = out_lengths.max() out_masks = ( - torch.arange(out_max_len, device=out_lengths.device)[None, :] + new_arange(out_lengths, out_max_len)[None, :] < out_lengths[:, None] ) @@ -205,9 +185,7 @@ def _apply_del_words( word_del_pred.masked_fill_(bos_eos_masks, 0) reordering = ( - torch.arange(max_len, device=in_tokens.device)[None, :] - .expand_as(in_tokens) - .contiguous() + new_arange(in_tokens) .masked_fill_(word_del_pred, max_len) .sort(1)[1] ) diff --git a/fairseq/models/nonautoregressive_transformer.py b/fairseq/models/nonautoregressive_transformer.py index d45a5b443b..1add0bd480 100644 --- a/fairseq/models/nonautoregressive_transformer.py +++ b/fairseq/models/nonautoregressive_transformer.py @@ -10,11 +10,9 @@ from fairseq.models.transformer import ( Embedding, TransformerDecoder, - TransformerDecoderLayer, TransformerEncoder, TransformerModel, ) -from fairseq.modules import MultiheadAttention from fairseq.modules.transformer_sentence_encoder import init_bert_params @@ -35,45 +33,11 @@ def _argmax(x, dim): return (x == x.max(dim, keepdim=True)[0]).type_as(x) -def _dynamic_programming(tokens, scores): - N, B, T = tokens.size() - cum_scores = scores[:, :, 0].clone() # N x B - cum_choice = tokens.new_zeros(B, T) - - # forward - for t in range(T - 1): - score, choice = cum_scores.max(0) - cum_choice[:, t] = choice - cum_scores[0] = score + scores[0, :, t + 1] - cum_scores[1:] = cum_scores[:-1] + scores[1:, :, t + 1] - - # back-tracking - end_score, end_choice = cum_scores.max(0) - cum_choice[:, T - 1] = end_choice - for t in range(T - 2, -1, -1): - is_start = (cum_choice[:, t + 1] == 0).type_as(cum_choice) - cum_choice[:, t] = (cum_choice[:, t + 1] - 1) * ~is_start + cum_choice[ - :, t - ] * is_start - - # finalize the prediction - tokens = tokens.gather(0, cum_choice.unsqueeze(0)).squeeze(0) - scores = scores.gather(0, cum_choice.unsqueeze(0)).squeeze(0) - return scores, tokens - - -def _beam_search(tokens, scores, W=None): - N, B, T = tokens.size() - - if (W is None) or (W > N): - W = N - - def _uniform_assignment(src_lens, trg_lens): max_trg_len = trg_lens.max() steps = (src_lens.float() - 1) / (trg_lens.float() - 1) # step-size # max_trg_len - index_t = torch.arange(max_trg_len, device=trg_lens.device).float() + index_t = utils.new_arange(trg_lens, max_trg_len).float() index_t = steps[:, None] * index_t[None, :] # batch_size X max_trg_len index_t = torch.round(index_t).long().detach() return index_t @@ -108,16 +72,6 @@ def add_args(parser): parser.add_argument("--length-loss-factor", type=float, help="weights on the length prediction loss") - # n-gram predictor - parser.add_argument( - "--ngram-predictor", - nargs="?", - const=4, - default=1, - type=int, - help="adding an additional n-gram predictor.", - ) - @classmethod def build_decoder(cls, args, tgt_dict, embed_tokens): decoder = NATransformerDecoder(args, tgt_dict, embed_tokens) @@ -173,13 +127,13 @@ def forward_decoder(self, decoder_out, encoder_out, decoding_format=None, **kwar output_tokens.masked_scatter_(output_masks, _tokens[output_masks]) output_scores.masked_scatter_(output_masks, _scores[output_masks]) - return {"output_tokens": output_tokens, "output_scores": output_scores} + return {"output_tokens": output_tokens, "output_scores": output_scores, "attn": None} def 
initialize_output_tokens(self, encoder_out, src_tokens): # length prediction _, length_tgt = self.decoder.forward_length_prediction(encoder_out) max_length = length_tgt.max() - idx_length = torch.arange(max_length, device=src_tokens.device) + idx_length = utils.new_arange(src_tokens, max_length) initial_output_tokens = src_tokens.new_zeros( src_tokens.size(0), max_length @@ -197,6 +151,7 @@ def initialize_output_tokens(self, encoder_out, src_tokens): return { "output_tokens": initial_output_tokens, "output_scores": initial_output_scores, + "attn": None } @@ -218,11 +173,6 @@ def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): self.src_embedding_copy = getattr(args, "src_embedding_copy", False) self.embed_length = Embedding(256, self.encoder_embed_dim, None) - self.ngram_predictor = getattr(args, "ngram_predictor", 1) - self.ngram_layer = ( - None if (self.ngram_predictor == 1) else NgramDecoderLayer(args, True) - ) - def forward( self, prev_output_tokens, @@ -240,25 +190,12 @@ def forward( ) if tgt_tokens is not None: - if self.ngram_layer is None: - word_ins_mask = tgt_tokens.ne(self.padding_idx) - word_ins_tgt = tgt_tokens - else: - context_embeds, context_masks = self.forward_ngram_context(tgt_tokens) - features = self.ngram_layer(features, context_embeds=context_embeds) - word_ins_tgt = tgt_tokens[:, :, None].repeat(1, 1, self.ngram_predictor) - word_ins_mask = word_ins_tgt.ne(self.padding_idx) & context_masks - + word_ins_mask = tgt_tokens.ne(self.padding_idx) + word_ins_tgt = tgt_tokens return self.output_layer(features), word_ins_tgt, word_ins_mask else: - if self.ngram_layer is None: - return F.log_softmax(self.output_layer(features), -1).max(-1) - else: - # inner iterations - return self.forward_ngram_decoding( - features, prev_output_tokens.eq(self.padding_idx), decoding_format - ) + return F.log_softmax(self.output_layer(features), -1).max(-1) def extract_features( self, @@ -336,82 +273,6 @@ def extract_features( return x, {"attn": attn, "inner_states": inner_states} - def forward_ngram_context(self, tgt_tokens): - tgt_embeds = self.forward_embedding(tgt_tokens) - n_contexts = self.ngram_predictor - 1 - - # shifting the embeddings - # context_embeds: N x B x T x C - # context_masks: B x T x N - context_embeds = tgt_embeds.new_zeros(n_contexts, *tgt_embeds.size()) - context_masks = tgt_embeds.new_ones( - *tgt_embeds.size()[:2], self.ngram_predictor - ).bool() - - for k in range(n_contexts): - context_embeds[k, :, k + 1:] = tgt_embeds[:, : -k - 1] - context_masks[:, : k + 1, k + 1] = 0 - - return context_embeds, context_masks - - def forward_ngram_decoding(self, features, padding_mask=None, decoding_format=None): - context_embeds = None - scores, tokens = [], [] - ensemble_score = None - ensemble_index = None - - if decoding_format is None: - decoding_format = "ensemble" - - for k in range(self.ngram_predictor): - ngram_out = self.ngram_layer( - features, context_embeds=context_embeds, incremental=True - ) - ngram_scores = F.log_softmax(self.output_layer(ngram_out), -1) - max_score, max_token = ngram_scores.max(-1) - - if decoding_format == "vote": - ngram_scores = _argmax(ngram_scores, -1) - - if ensemble_score is None: - ensemble_score = ngram_scores - ensemble_index = ensemble_score.new_ones(*ensemble_score.size()[:2]) - else: - ensemble_index[:, k:] = ensemble_index[:, k:] + 1 - ensemble_score = ensemble_score + ngram_scores.masked_fill_( - (ensemble_index < k) - .unsqueeze(2) - .repeat(1, 1, ensemble_score.size(2)), - 0, - ) - max_score[:, :k] = 
float("-inf") - - if decoding_format == "unigram": - break - - scores.append(max_score.masked_fill_(padding_mask, 0)) - tokens.append(max_token.masked_fill_(padding_mask, self.padding_idx)) - - # context_embeds: N x B x T x C - if context_embeds is None: - context_embeds = self.forward_embedding(max_token).unsqueeze(0) - - else: - context_embeds = torch.cat( - [self.forward_embedding(max_token).unsqueeze(0), context_embeds], 0 - ) - - context_embeds[:, :, 1:] = context_embeds[:, :, :-1] - - if decoding_format != "dp": - ensemble_score = ensemble_score / ensemble_index.unsqueeze(2) - return ensemble_score.max(-1) - - else: - tokens = torch.cat([t.unsqueeze(0) for t in tokens], 0) - scores = torch.cat([s.unsqueeze(0) for s in scores], 0) - return _dynamic_programming(tokens, scores) - def forward_embedding(self, prev_output_tokens, states=None): # embed positions positions = ( @@ -489,101 +350,6 @@ def forward_length_prediction(self, encoder_out, tgt_tokens=None): return length_out, length_tgt -class NgramDecoderLayer(TransformerDecoderLayer): - """ - N-gram Decoder Layer: - - This module can be pluged in the last layer of any Non-autoregressive Model's - It provides an alternative way to capture local n-gram information by running the block multiple times. - """ - - def __init__(self, args, no_encoder_attn=False): - super(NgramDecoderLayer, self).__init__(args, no_encoder_attn=no_encoder_attn) - self.self_attn = MultiheadAttention( - embed_dim=self.embed_dim, - num_heads=1, # maybe n-gram does not need too many heads. - dropout=args.attention_dropout, - self_attention=False, - encoder_decoder_attention=True, - ) - - def forward( - self, - x, - encoder_out=None, - encoder_padding_mask=None, - context_embeds=None, - incremental=False, - ): - # x: T x B x C - # context_embeds: N x T x B x C - T, B, C = x.size() - - residual = x - x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True) - x = x.contiguous().view(1, T * B, C).contiguous() - - if context_embeds is not None: - N = context_embeds.size(0) - context_embeds = context_embeds.view(N, T * B, C).contiguous() - - if not incremental: - assert context_embeds is not None, "we need context for training" - # attn_weights: (n_head x T x B) x 1 x N - # v: (n_head x T x B) x N x (dim / n_head) - # -- move the attention computation outside -- - attn_weights, values = self.self_attn( - query=x, key=context_embeds, value=context_embeds, before_softmax=True - ) - - attn_weights = attn_weights.repeat(1, N, 1) - attn_masks = attn_weights.new_ones(N, N).triu_(1).bool() - attn_masks = attn_masks.unsqueeze(0).repeat(attn_weights.size(0), 1, 1) - - attn_weights = attn_weights.masked_fill(attn_masks, float("-inf")) - attn_weights = utils.softmax(attn_weights, dim=-1).type_as(attn_weights) - attn_weights = F.dropout( - attn_weights, p=self.self_attn.dropout, training=self.training - ) - - # (n_head x T x B) x N x (dim / n_head) - attn = torch.bmm(attn_weights, values) - attn = attn.transpose(0, 1).contiguous() - attn = attn.view(N, T * B, C).contiguous() - attn = attn.transpose(1, 0).contiguous() - attn = attn.view(T, B, N, C) - - residual = residual.unsqueeze(2) - x = self.self_attn.out_proj(attn) - x = F.dropout(x, p=self.dropout, training=self.training) - x = torch.cat([residual, residual + x], 2) - - else: - if context_embeds is None: - x = residual - - else: - x, _ = self.self_attn(query=x, key=context_embeds, value=context_embeds) - x = x.view(T, B, C) - x = F.dropout(x, p=self.dropout, training=self.training) - x = residual + x - - x = 
self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True) - - if self.encoder_attn is not None: - raise NotImplementedError - - residual = x - x = self.maybe_layer_norm(self.final_layer_norm, x, before=True) - x = self.activation_fn(self.fc1(x)) - x = F.dropout(x, p=self.activation_dropout, training=self.training) - x = self.fc2(x) - x = F.dropout(x, p=self.dropout, training=self.training) - x = residual + x - x = self.maybe_layer_norm(self.final_layer_norm, x, after=True) - return x - - @register_model_architecture( "nonautoregressive_transformer", "nonautoregressive_transformer" ) @@ -630,7 +396,6 @@ def base_architecture(args): args.pred_length_offset = getattr(args, "pred_length_offset", False) args.length_loss_factor = getattr(args, "length_loss_factor", 0.1) args.src_embedding_copy = getattr(args, "src_embedding_copy", False) - args.ngram_predictor = getattr(args, "ngram_predictor", 1) @register_model_architecture( diff --git a/fairseq/utils.py b/fairseq/utils.py index 9dd41fbfea..79a89d41ec 100644 --- a/fairseq/utils.py +++ b/fairseq/utils.py @@ -412,3 +412,13 @@ def extract_hard_alignment(attn, src_sent, tgt_sent, pad, eos): for tgt_idx, src_idx in zip(tgt_valid, src_indices): alignment.append((src_token_to_word[src_idx.item()] - 1, tgt_token_to_word[tgt_idx.item()] - 1)) return alignment + + +def new_arange(x, *size): + """ + Return a Tensor of `size` filled with a range function on the device of x. + If size is empty, using the size of the variable x. + """ + if len(size) == 0: + size = x.size() + return torch.arange(size[-1], device=x.device).expand(*size).contiguous() From 02b74c58b9de9ccc0f280657533eb1bd757bdb4a Mon Sep 17 00:00:00 2001 From: Jiatao Gu Date: Fri, 11 Oct 2019 11:23:19 -0700 Subject: [PATCH 177/213] fix the random mask function for CMLM model Summary: The original implementation of the random mask is different from what the paper was stated. 
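For intuition only, a minimal hypothetical sketch (not the code in this diff) of the corrected behaviour: each sentence now masks a uniformly random number k >= 1 of its non-special tokens, with the masked positions chosen uniformly at random, rather than masking each token independently against a single per-sentence threshold:

    import torch

    def sketch_random_mask(tokens, special_ids, unk):
        """Hypothetical illustration: replace k >= 1 randomly chosen non-special tokens with unk."""
        cand = [i for i, t in enumerate(tokens.tolist()) if t not in special_ids]
        k = torch.randint(1, len(cand) + 1, (1,)).item()   # k is uniform over {1, ..., len(cand)}
        out = tokens.clone()
        for j in torch.randperm(len(cand))[:k].tolist():
            out[cand[j]] = unk
        return out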
Reviewed By: kahne Differential Revision: D17652564 fbshipit-source-id: 238a9158041b3ff2482ee50ce6151c3f77f0b2c1 --- fairseq/options.py | 2 +- fairseq/tasks/translation_lev.py | 16 +++++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/fairseq/options.py b/fairseq/options.py index c33e1ac8e9..a6fd2cac3c 100644 --- a/fairseq/options.py +++ b/fairseq/options.py @@ -502,7 +502,7 @@ def add_generation_args(parser): group.add_argument('--print-step', action='store_true') # arguments for iterative refinement generator - group.add_argument('---iter-decode-eos-penalty', default=0.0, type=float, metavar='N', + group.add_argument('--iter-decode-eos-penalty', default=0.0, type=float, metavar='N', help='if > 0.0, it penalized early-stopping in decoding.') group.add_argument('--iter-decode-max-iter', default=10, type=int, metavar='N', help='maximum iterations for iterative refinement.') diff --git a/fairseq/tasks/translation_lev.py b/fairseq/tasks/translation_lev.py index 47d6a3ed4a..20da7d3d64 100644 --- a/fairseq/tasks/translation_lev.py +++ b/fairseq/tasks/translation_lev.py @@ -5,6 +5,7 @@ import torch +from fairseq.utils import new_arange from fairseq.tasks import register_task from fairseq.tasks.translation import TranslationTask, load_langpair_dataset @@ -87,14 +88,19 @@ def _random_mask(target_tokens): eos = self.tgt_dict.eos() unk = self.tgt_dict.unk() - target_mask = target_tokens.eq(bos) | target_tokens.eq( - eos) | target_tokens.eq(pad) + target_masks = target_tokens.ne(pad) & \ + target_tokens.ne(bos) & \ + target_tokens.ne(eos) target_score = target_tokens.clone().float().uniform_() - target_score.masked_fill_(target_mask, 1.0) + target_score.masked_fill_(~target_masks, 2.0) + target_length = target_masks.sum(1).float() + target_length = target_length * target_length.clone().uniform_() + target_length = target_length + 1 # make sure to mask at least one token. + _, target_rank = target_score.sort(1) + target_cutoff = new_arange(target_rank) < target_length[:, None].long() prev_target_tokens = target_tokens.masked_fill( - target_score < target_score.new_zeros(target_score.size(0), - 1).uniform_(), unk) + target_cutoff.scatter(1, target_rank, target_cutoff), unk) return prev_target_tokens def _full_mask(target_tokens): From d80ad54f75186adf9b597ef0bcef005c98381b9e Mon Sep 17 00:00:00 2001 From: Sujit Verma Date: Fri, 11 Oct 2019 21:51:50 -0700 Subject: [PATCH 178/213] Added option to save checkpoints using Path Manager. Summary: Added option to save checkpoints using Path Manager. 
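For illustration, a minimal sketch (hypothetical helper, not part of the diff) of the pattern this change applies in checkpoint_utils.py, trainer.py and train.py: prefer fb_pathmgr when it can be imported, otherwise fall back to plain local-filesystem I/O:

    def open_maybe_pathmgr(path, mode="rb"):
        # hypothetical helper mirroring the try/except fallback used in this diff
        try:
            from fairseq.fb_pathmgr import fb_pathmgr
            return fb_pathmgr.open(path, mode)
        except (ModuleNotFoundError, ImportError):
            return open(path, mode)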
Reviewed By: hudeven Differential Revision: D17392754 fbshipit-source-id: 4b8e556ef8455a1548e5a083d779ed809cd785be --- fairseq/checkpoint_utils.py | 29 ++++++++++++++++++++++++----- fairseq/trainer.py | 8 +++++++- train.py | 11 +++++++++++ 3 files changed, 42 insertions(+), 6 deletions(-) diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py index 5ef45b8463..ded8ce32f5 100644 --- a/fairseq/checkpoint_utils.py +++ b/fairseq/checkpoint_utils.py @@ -65,7 +65,11 @@ def is_better(a, b): if len(checkpoints) > 0: trainer.save_checkpoint(checkpoints[0], extra_state) for cp in checkpoints[1:]: - shutil.copyfile(checkpoints[0], cp) + try: + from fairseq.fb_pathmgr import fb_pathmgr + fb_pathmgr.copy(checkpoints[0], cp, True) + except (ModuleNotFoundError, ImportError): + shutil.copyfile(checkpoints[0], cp) write_timer.stop() print('| saved checkpoint {} (epoch {} @ {} updates) (writing took {} seconds)'.format( @@ -132,9 +136,17 @@ def load_checkpoint(args, trainer, data_selector=None): def load_checkpoint_to_cpu(path, arg_overrides=None): """Loads a checkpoint to CPU (with upgrading for backward compatibility).""" - state = torch.load( - path, map_location=lambda s, l: default_restore_location(s, 'cpu'), - ) + try: + from fairseq.fb_pathmgr import fb_pathmgr + with fb_pathmgr.open(path, "rb") as f: + state = torch.load( + f, map_location=lambda s, l: default_restore_location(s, 'cpu'), + ) + except (ModuleNotFoundError, ImportError): + # if path manager not found, continue with local file. + state = torch.load( + path, map_location=lambda s, l: default_restore_location(s, 'cpu'), + ) args = state['args'] if arg_overrides is not None: for arg_name, arg_val in arg_overrides.items(): @@ -244,7 +256,14 @@ def save_state( state_dict['criterion'] = criterion.state_dict() if not args.no_save_optimizer_state: state_dict['last_optimizer_state'] = convert_state_dict_type(optimizer.state_dict()) - torch_persistent_save(state_dict, filename) + + try: + from fairseq.fb_pathmgr import fb_pathmgr + with fb_pathmgr.open(filename, "wb") as f: + torch_persistent_save(state_dict, f) + except (ModuleNotFoundError, ImportError): + # if path manager not found, continue with local file. 
+ torch_persistent_save(state_dict, filename) def _upgrade_state_dict(state): diff --git a/fairseq/trainer.py b/fairseq/trainer.py index 35f9c1d759..03601f69d2 100644 --- a/fairseq/trainer.py +++ b/fairseq/trainer.py @@ -170,7 +170,13 @@ def load_checkpoint( """Load all training state from a checkpoint file.""" extra_state, self._optim_history, last_optim_state = None, [], None - if os.path.exists(filename): + try: + from fairseq.fb_pathmgr import fb_pathmgr + bexists = fb_pathmgr.isfile(filename) + except Exception: + bexists = os.path.exists(filename) + + if bexists: state = checkpoint_utils.load_checkpoint_to_cpu(filename) # load model parameters diff --git a/train.py b/train.py index 3879375fe9..9396358aa9 100644 --- a/train.py +++ b/train.py @@ -19,10 +19,21 @@ from fairseq.trainer import Trainer from fairseq.meters import AverageMeter, StopwatchMeter +fb_pathmgr_registerd = False + def main(args, init_distributed=False): utils.import_user_module(args) + try: + from fairseq.fb_pathmgr import fb_pathmgr + global fb_pathmgr_registerd + if not fb_pathmgr_registerd: + fb_pathmgr.register() + fb_pathmgr_registerd = True + except (ModuleNotFoundError, ImportError): + pass + assert args.max_tokens is not None or args.max_sentences is not None, \ 'Must specify batch size either with --max-tokens or --max-sentences' From e3a40d9d34ec9669c40c2f38a5de962cb0b54e88 Mon Sep 17 00:00:00 2001 From: Changhan Wang Date: Mon, 14 Oct 2019 18:21:35 -0700 Subject: [PATCH 179/213] fix libnat imports Summary: Bring back the changes in D17661768 Reviewed By: ailzhang Differential Revision: D17920299 fbshipit-source-id: be3f93a044a8710c8b475012c39e36a3e6507fad --- fairseq/models/insertion_transformer.py | 8 +++++++- fairseq/models/levenshtein_transformer.py | 22 +++++++++++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/fairseq/models/insertion_transformer.py b/fairseq/models/insertion_transformer.py index f8a7a61dc5..1657bd0b1b 100644 --- a/fairseq/models/insertion_transformer.py +++ b/fairseq/models/insertion_transformer.py @@ -6,7 +6,6 @@ import numpy as np import torch import torch.nn.functional as F -from fairseq import libnat from fairseq.utils import new_arange from fairseq.models import register_model, register_model_architecture from fairseq.models.levenshtein_transformer import ( @@ -52,6 +51,13 @@ def compute_score_full(self, L, tau): def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx, vocab_size, tau=None): + try: + from fairseq import libnat + except ImportError as e: + import sys + sys.stderr.write('ERROR: missing libnat. run `pip install --editable .`\n') + raise e + B = in_tokens.size(0) T = in_tokens.size(1) V = vocab_size diff --git a/fairseq/models/levenshtein_transformer.py b/fairseq/models/levenshtein_transformer.py index 18a171980c..b153f99495 100644 --- a/fairseq/models/levenshtein_transformer.py +++ b/fairseq/models/levenshtein_transformer.py @@ -5,7 +5,6 @@ import torch import torch.nn.functional as F -from fairseq import libnat from fairseq.utils import new_arange from fairseq.models import register_model, register_model_architecture from fairseq.models.model_utils import fill_tensors as _fill, skip_tensors as _skip @@ -19,6 +18,13 @@ def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx): + try: + from fairseq import libnat + except ImportError as e: + import sys + sys.stderr.write('ERROR: missing libnat. 
run `pip install --editable .`\n') + raise e + in_seq_len, out_seq_len = in_tokens.size(1), out_tokens.size(1) with torch.cuda.device_of(in_tokens): @@ -61,6 +67,13 @@ def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx): def _get_del_targets(in_tokens, out_tokens, padding_idx): + try: + from fairseq import libnat + except ImportError as e: + import sys + sys.stderr.write('ERROR: missing libnat. run `pip install --editable .`\n') + raise e + out_seq_len = out_tokens.size(1) with torch.cuda.device_of(in_tokens): @@ -87,6 +100,13 @@ def _get_del_targets(in_tokens, out_tokens, padding_idx): def _get_del_ins_targets(in_tokens, out_tokens, padding_idx): + try: + from fairseq import libnat + except ImportError as e: + import sys + sys.stderr.write('ERROR: missing libnat. run `pip install --editable .`\n') + raise e + in_seq_len, out_seq_len = in_tokens.size(1), out_tokens.size(1) with torch.cuda.device_of(in_tokens): From b5f41f828b0ec9b67fa60aceb0778073d1b368b2 Mon Sep 17 00:00:00 2001 From: Nayan Singhal Date: Tue, 15 Oct 2019 09:58:06 -0700 Subject: [PATCH 180/213] Add Unit test cases for BMUF Summary: This unit test guards the bmuf code. change: 1. distributed_init assumes we are always using cuda device which is not the case if you are using "gloo" backend on CPU machine. Reviewed By: jay-mahadeokar Differential Revision: D17821391 fbshipit-source-id: 28e1bb39f7a4889b1dc6bd636b7c499e55bfc69a --- fairseq/distributed_utils.py | 5 +- tests/test_bmuf.py | 145 +++++++++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+), 1 deletion(-) create mode 100644 tests/test_bmuf.py diff --git a/fairseq/distributed_utils.py b/fairseq/distributed_utils.py index 9cfe20d120..0fff7589de 100644 --- a/fairseq/distributed_utils.py +++ b/fairseq/distributed_utils.py @@ -87,7 +87,10 @@ def distributed_init(args): socket.gethostname(), args.distributed_rank), flush=True) # perform a dummy all-reduce to initialize the NCCL communicator - dist.all_reduce(torch.zeros(1).cuda()) + if torch.cuda.is_available(): + dist.all_reduce(torch.zeros(1).cuda()) + else: + dist.all_reduce(torch.zeros(1)) suppress_output(is_master(args)) diff --git a/tests/test_bmuf.py b/tests/test_bmuf.py new file mode 100644 index 0000000000..0f54afb8bd --- /dev/null +++ b/tests/test_bmuf.py @@ -0,0 +1,145 @@ +import argparse +import random +import unittest +from multiprocessing import Manager + +import torch +import torch.nn as nn +from fairseq import distributed_utils, optim + + +class Model(nn.Module): + def __init__(self, input_size, output_size): + super(Model, self).__init__() + self.fc = nn.Linear(input_size, output_size) + + def forward(self, input): + output = self.fc(input) + return output + + +def setup_model_loss_criterion(args, rank, is_cuda): + """ + setup model, criterion and optimizer based on input args + """ + args.distributed_rank = rank + distributed_utils.distributed_init(args) + torch.manual_seed(1) + model = Model(args.input_size, args.nb_classes) + loss_fn = nn.CrossEntropyLoss() + if is_cuda: + model = model.cuda() + loss_fn = loss_fn.cuda() + + optimizer = optim.sgd.SGD(args, model.parameters()) + optimizer = optim.FairseqBMUF(args, optimizer) + + return model, loss_fn, optimizer + + +def train_step(input, target, model, loss_fn, optimizer): + """Do forward, backward and parameter update.""" + model.train() + output = model(input) + loss = loss_fn(output, target) + optimizer.backward(loss) + optimizer.step() + + +def single_gpu_training(args, rank, iterations, shared_results): + + is_cuda = 
torch.cuda.is_available() + if is_cuda: + torch.cuda.set_device(rank) + + model, loss_fn, optimizer = setup_model_loss_criterion(args, rank, is_cuda) + + for _ in range(iterations): + input = torch.randn(1, args.input_size) + target = torch.empty(args.batch_size, dtype=torch.long).random_(args.nb_classes) + + if is_cuda: + input = input.cuda() + target = target.cuda() + train_step(input, target, model, loss_fn, optimizer) + + results = [] + for param in model.parameters(): + if len(results) == 0: + results = param.flatten().cpu().data + else: + results = torch.cat((results, param.flatten().cpu().data), 0) + + shared_results[rank] = results + + +def setup_args(): + args = argparse.Namespace() + args.global_sync_iter = 20 + args.block_momentum = 0.875 + args.block_lr = 0.5 + args.input_size = 5 + args.nb_classes = 2 + args.batch_size = 1 + args.lr = [1e-3] + args.momentum = 0 + args.weight_decay = 0 + args.warmup_iterations = 0 + args.use_nbm = True + args.average_sync = True + args.global_sync_iter = 1 + args.distributed_backend = "gloo" + + args.distributed_world_size = 2 + port = random.randint(10000, 20000) + args.distributed_init_method = "tcp://localhost:{port}".format(port=port) + args.distributed_init_host = "localhost" + args.distributed_port = port + 1 + args.local_world_size = args.distributed_world_size + return args + + +class TestBMUF(unittest.TestCase): + def bmuf_process(self, args, iterations): + processes = [] + results = Manager().dict() + ctx = torch.multiprocessing.get_context("spawn") + for rank in range(args.distributed_world_size): + p = ctx.Process( + target=single_gpu_training, args=(args, rank, iterations, results) + ) + p.start() + processes.append(p) + + for p in processes: + p.join() + + # Make sure params in both machines are same + assert len(results) == 2 + self.assertAlmostEqual(results[0], results[1]) + + def test_bmuf_sync(self): + # Train model for 1 iteration and do bmuf sync without doing warmup + args = setup_args() + iterations = 1 + self.bmuf_process(args, iterations) + + def test_warmup_sync(self): + # Train model for 20 iteration and do warmup sync without doing bmuf sync + args = setup_args() + args.warmup_iterations = 20 + iterations = 20 + self.bmuf_process(args, iterations) + + def test_warmup_sync_bmuf_sync(self): + # Train model for 25 iteration and do warmup sync after 20 iteration + # and bmuf sync after 25 iteration + args = setup_args() + args.warmup_iterations = 20 + args.global_sync_iter = 5 + iterations = 25 + self.bmuf_process(args, iterations) + + def assertAlmostEqual(self, t1, t2): + self.assertEqual(t1.size(), t2.size(), "size mismatch") + self.assertLess((t1 - t2).abs().max(), 1e-4) From 3dcb5c77165c1a0c33a35a7831182f1aa2e8ad73 Mon Sep 17 00:00:00 2001 From: Changhan Wang Date: Fri, 18 Oct 2019 12:43:13 -0700 Subject: [PATCH 181/213] fix levenshtein transfromer attn Summary: When the `if` statements in the levenshtein transformer decoder forward are removed, `attn` may get inconsistent batch sizes with output tokens. This is a fix. 
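For illustration only, a minimal usage sketch of the expand_2d_or_3d_tensor helper introduced below, which pads a 2-D or 3-D tensor along dim=1 so that attention and score tensors keep shapes consistent with the output tokens:

    import torch
    from fairseq.models.model_utils import expand_2d_or_3d_tensor  # added in this diff

    x = torch.ones(2, 3)                              # e.g. (batch, time) scores
    y = expand_2d_or_3d_tensor(x, 5, padding_idx=0)   # pad dim=1 out to length 5
    print(y.size())                                   # torch.Size([2, 5]); new columns hold padding_idx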
Reviewed By: cndn Differential Revision: D17936411 fbshipit-source-id: a1583f3806dc9f41caeb783c043429e247035803 --- fairseq/models/levenshtein_transformer.py | 8 +++++- fairseq/models/model_utils.py | 32 ++++++++++++++++++----- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/fairseq/models/levenshtein_transformer.py b/fairseq/models/levenshtein_transformer.py index b153f99495..5a5c8172e7 100644 --- a/fairseq/models/levenshtein_transformer.py +++ b/fairseq/models/levenshtein_transformer.py @@ -420,10 +420,16 @@ def initialize_output_tokens(self, encoder_out, src_tokens): initial_output_scores = initial_output_tokens.new_zeros( *initial_output_tokens.size() ).type_as(encoder_out["encoder_out"]) + + initial_attn = None + if getattr(self.decoder.layers[-1], "need_attn", False): + initial_attn = initial_output_tokens.new_zeros( + src_tokens.size(0), 2, src_tokens.size(1) + ) return { "output_tokens": initial_output_tokens, "output_scores": initial_output_scores, - "attn": None, + "attn": initial_attn, } diff --git a/fairseq/models/model_utils.py b/fairseq/models/model_utils.py index 8217731c9e..25b5de4c05 100644 --- a/fairseq/models/model_utils.py +++ b/fairseq/models/model_utils.py @@ -31,25 +31,45 @@ def skip_tensors(x, mask): raise NotImplementedError +def expand_2d_or_3d_tensor(x, trg_dim, padding_idx): + """ + Expand 2D/3D tensor on dim=1 + """ + if x is None: + return None + + assert x.dim() == 2 or x.dim() == 3 + assert trg_dim >= x.size(1), (trg_dim, x.size()) + if trg_dim == x.size(1): + return x + + dims = [x.size(0), trg_dim - x.size(1)] + if x.dim() == 3: + dims.append(x.size(2)) + x = torch.cat([x, x.new_zeros(*dims).fill_(padding_idx)], 1) + + return x + + def fill_tensors(x, mask, y, padding_idx): """ Filling tensor x with y at masked positions (dim=0). 
""" if x is None: - return y + return None + assert x.dim() == y.dim() and mask.size(0) == x.size(0) assert x.dim() == 2 or (x.dim() == 3 and x.size(2) == y.size(2)) + n_selected = mask.sum() + if n_selected == 0: + return x assert n_selected == y.size(0) - if n_selected == x.size(0): return y if x.size(1) < y.size(1): - dims = [x.size(0), y.size(1) - x.size(1)] - if x.dim() == 3: - dims.append(x.size(2)) - x = torch.cat([x, x.new_zeros(*dims).fill_(padding_idx)], 1) + x = expand_2d_or_3d_tensor(x, y.size(1), padding_idx) x[mask] = y elif x.size(1) > y.size(1): x[mask] = padding_idx From c8a7b627527ba2d54d93a8d19ac15414a83e858e Mon Sep 17 00:00:00 2001 From: dikshameghwal Date: Fri, 18 Oct 2019 13:09:00 -0700 Subject: [PATCH 182/213] fixed a bug in preprocess glue dataset dev filename (#1270) Summary: removed redundant quotes in the filename assigned for dev dataset for GLUE tasks Pull Request resolved: https://github.com/pytorch/fairseq/pull/1270 Differential Revision: D18013071 fbshipit-source-id: 35f00162e117c6584dc859f760503ca32dcb706e --- examples/roberta/preprocess_GLUE_tasks.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/roberta/preprocess_GLUE_tasks.sh b/examples/roberta/preprocess_GLUE_tasks.sh index 7623566444..7f215a3b53 100755 --- a/examples/roberta/preprocess_GLUE_tasks.sh +++ b/examples/roberta/preprocess_GLUE_tasks.sh @@ -173,7 +173,7 @@ do fairseq-preprocess \ --only-source \ --trainpref "$TASK_DATA_FOLDER/processed/train.label" \ - --validpref "${DEVPREF//LANG/'label'}" \ + --validpref "${DEVPREF//LANG/label}" \ --destdir "$TASK-bin/label" \ --workers 60; else From b8d024e9b8c20058dd7282f1418ebef00bfb8974 Mon Sep 17 00:00:00 2001 From: Spencer Poff Date: Fri, 18 Oct 2019 14:29:49 -0700 Subject: [PATCH 183/213] add missing function to FairseqLanguageModel Summary: In https://github.com/fairinternal/fairseq-py/pull/877, sequence_generator began calling `model.forward_decoder`, but not all decoder models were given an implementation of that function. Reviewed By: okhonko Differential Revision: D17863751 fbshipit-source-id: ea70b636c9dafcf87f5d5e49631d0c4b7cf14984 --- fairseq/models/fairseq_model.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fairseq/models/fairseq_model.py b/fairseq/models/fairseq_model.py index 674de01310..bd73bd5c23 100644 --- a/fairseq/models/fairseq_model.py +++ b/fairseq/models/fairseq_model.py @@ -369,6 +369,9 @@ def forward(self, src_tokens, **kwargs): """ return self.decoder(src_tokens, **kwargs) + def forward_decoder(self, prev_output_tokens, **kwargs): + return self.decoder(prev_output_tokens, **kwargs) + def extract_features(self, src_tokens, **kwargs): """ Similar to *forward* but only return features. 
From a3c629b5cef188b064c543cbdd0fa9128d1d353e Mon Sep 17 00:00:00 2001 From: Jiatao Gu Date: Sat, 19 Oct 2019 18:01:59 -0700 Subject: [PATCH 184/213] Fix typos on Examples for Nonautoregressive translation Summary: Fix typos in the examples Reviewed By: kahne Differential Revision: D18030097 fbshipit-source-id: 84f0cbafd85e50ffd5033738835373935e3b83d4 --- examples/nonautoregressive_translation/scripts.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/nonautoregressive_translation/scripts.md b/examples/nonautoregressive_translation/scripts.md index 2fda7f6204..9120cdda02 100644 --- a/examples/nonautoregressive_translation/scripts.md +++ b/examples/nonautoregressive_translation/scripts.md @@ -38,7 +38,7 @@ fairseq-train \ --ddp-backend=no_c10d \ --task translation_lev \ --criterion nat_loss \ - --arch nonautoregressive_transformer \ + --arch iterative_nonautoregressive_transformer \ --noise full_mask \ --share-all-embeddings \ --optimizer adam --adam-betas '(0.9,0.98)' \ @@ -81,8 +81,6 @@ fairseq-train \ --dropout 0.3 --weight-decay 0.01 \ --decoder-learned-pos \ --encoder-learned-pos \ - --pred-length-offset \ - --length-loss-factor 0.1 \ --apply-bert-init \ --log-format 'simple' --log-interval 100 \ --fixed-validation-seed 7 \ From 66d24dc2ae5ed2bd631bfeccdb09983a34abc818 Mon Sep 17 00:00:00 2001 From: Jiatao Gu Date: Sun, 20 Oct 2019 12:42:05 -0700 Subject: [PATCH 185/213] Enable separate models for insertion and deletion; Summary: The Diff conatins two fixes: (1) enabling non-shared decoder layers for deletion/insertion (2) adding options to perform sampling instead of argmax when learning the deletion Reviewed By: kahne Differential Revision: D18011220 fbshipit-source-id: c60815fb7bc3a0004c81249504f7a641536ae2d8 --- fairseq/models/levenshtein_transformer.py | 89 ++++++++++++++--------- 1 file changed, 56 insertions(+), 33 deletions(-) diff --git a/fairseq/models/levenshtein_transformer.py b/fairseq/models/levenshtein_transformer.py index 5a5c8172e7..9468c79bd8 100644 --- a/fairseq/models/levenshtein_transformer.py +++ b/fairseq/models/levenshtein_transformer.py @@ -4,6 +4,7 @@ # LICENSE file in the root directory of this source tree. import torch +import torch.nn as nn import torch.nn.functional as F from fairseq.utils import new_arange from fairseq.models import register_model, register_model_architecture @@ -13,6 +14,7 @@ TransformerDecoder, TransformerEncoder, TransformerModel, + TransformerDecoderLayer ) from fairseq.modules.transformer_sentence_encoder import init_bert_params @@ -24,7 +26,6 @@ def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx): import sys sys.stderr.write('ERROR: missing libnat. run `pip install --editable .`\n') raise e - in_seq_len, out_seq_len = in_tokens.size(1), out_tokens.size(1) with torch.cuda.device_of(in_tokens): @@ -73,7 +74,6 @@ def _get_del_targets(in_tokens, out_tokens, padding_idx): import sys sys.stderr.write('ERROR: missing libnat. run `pip install --editable .`\n') raise e - out_seq_len = out_tokens.size(1) with torch.cuda.device_of(in_tokens): @@ -106,7 +106,6 @@ def _get_del_ins_targets(in_tokens, out_tokens, padding_idx): import sys sys.stderr.write('ERROR: missing libnat. 
run `pip install --editable .`\n') raise e - in_seq_len, out_seq_len = in_tokens.size(1), out_tokens.size(1) with torch.cuda.device_of(in_tokens): @@ -247,7 +246,22 @@ def add_args(parser): "--early-exit", default="6,6,6", type=str, - help="number of decoder layers before mask_ins, word_ins and word_del heads", + help="number of decoder layers for del_word, ins_mask, ins_word", + ) + parser.add_argument( + "--no-share-discriminator", + action="store_true", + help="addtional decoder-layers to learn deletion", + ) + parser.add_argument( + "--no-share-maskpredictor", + action="store_true", + help="addtional decoder-layers to learn predicting masks", + ) + parser.add_argument( + "--sampling-for-deletion", + action='store_true', + help='instead of argmax, use sampling to predict the tokens' ) @classmethod @@ -288,7 +302,13 @@ def forward( ) # make online prediction - word_predictions = F.log_softmax(word_ins_out, dim=-1).max(2)[1] + if self.decoder.sampling_for_deletion: + word_predictions = torch.multinomial( + F.softmax(word_ins_out, -1).view(-1, word_ins_out.size(-1)), 1).view( + word_ins_out.size(0), -1) + else: + word_predictions = F.log_softmax(word_ins_out, dim=-1).max(2)[1] + word_predictions.masked_scatter_( ~masked_tgt_masks, tgt_tokens[~masked_tgt_masks] ) @@ -363,7 +383,7 @@ def forward_decoder( ) mask_ins_score = F.log_softmax(mask_ins_out, 2) if eos_penalty > 0.0: - mask_ins_score[:, :, 0] -= eos_penalty + mask_ins_score[:, :, 0] = mask_ins_score[:, :, 0] - eos_penalty mask_ins_pred = mask_ins_score.max(-1)[1] mask_ins_pred = torch.min( mask_ins_pred, max_lens[can_ins_mask, None].expand_as(mask_ins_pred) @@ -442,15 +462,30 @@ def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): self.bos = dictionary.bos() self.unk = dictionary.unk() self.eos = dictionary.eos() - + self.sampling_for_deletion = getattr(args, "sampling_for_deletion", False) self.embed_mask_ins = Embedding(256, self.output_embed_dim * 2, None) self.embed_word_del = Embedding(2, self.output_embed_dim, None) + # del_word, ins_mask, ins_word self.early_exit = [int(i) for i in args.early_exit.split(',')] assert len(self.early_exit) == 3 + # copy layers for mask-predict/deletion + self.layers_msk = None + if getattr(args, "no_share_maskpredictor", False): + self.layers_msk = nn.ModuleList([ + TransformerDecoderLayer(args, no_encoder_attn) + for _ in range(self.early_exit[1]) + ]) + self.layers_del = None + if getattr(args, "no_share_discriminator", False): + self.layers_del = nn.ModuleList([ + TransformerDecoderLayer(args, no_encoder_attn) + for _ in range(self.early_exit[0]) + ]) + def extract_features( - self, prev_output_tokens, encoder_out=None, early_exit=None, **unused + self, prev_output_tokens, encoder_out=None, early_exit=None, layers=None, **unused ): """ Similar to *forward* but only return features. @@ -488,12 +523,9 @@ def extract_features( # decoder layers decoder_padding_mask = prev_output_tokens.eq(self.padding_idx) - for i, layer in enumerate(self.layers): - - # early exit from the decoder. 
- if (early_exit is not None) and (i >= early_exit): - break - + layers = self.layers if layers is None else layers + early_exit = len(layers) if early_exit is None else early_exit + for _, layer in enumerate(layers[: early_exit]): x, attn = layer( x, encoder_out["encoder_out"] if encoder_out is not None else None, @@ -516,36 +548,25 @@ def extract_features( return x, {"attn": attn, "inner_states": inner_states} - def forward_mask_ins(self, prev_output_tokens, encoder_out=None): + def forward_mask_ins(self, prev_output_tokens, encoder_out=None, **unused): features, extra = self.extract_features( - prev_output_tokens, encoder_out=encoder_out, early_exit=self.early_exit[1] + prev_output_tokens, encoder_out=encoder_out, early_exit=self.early_exit[1], layers=self.layers_msk, **unused ) features_cat = torch.cat([features[:, :-1, :], features[:, 1:, :]], 2) return F.linear(features_cat, self.embed_mask_ins.weight), extra['attn'] - def forward_word_ins(self, prev_output_tokens, encoder_out=None): + def forward_word_ins(self, prev_output_tokens, encoder_out=None, **unused): features, extra = self.extract_features( - prev_output_tokens, encoder_out=encoder_out, early_exit=self.early_exit[2] + prev_output_tokens, encoder_out=encoder_out, early_exit=self.early_exit[2], layers=self.layers, **unused ) return self.output_layer(features), extra['attn'] - def forward_word_del(self, prev_output_tokens, encoder_out=None): + def forward_word_del(self, prev_output_tokens, encoder_out=None, **unused): features, extra = self.extract_features( - prev_output_tokens, encoder_out=encoder_out, early_exit=self.early_exit[0] + prev_output_tokens, encoder_out=encoder_out, early_exit=self.early_exit[0], layers=self.layers_del, **unused ) return F.linear(features, self.embed_word_del.weight), extra['attn'] - def forward_word_del_mask_ins(self, prev_output_tokens, encoder_out=None): - # merge the word-deletion and mask insertion into one operation, - assert self.early_exit[0] == self.early_exit[1], "must the same depth." - features, extra = self.extract_features( - prev_output_tokens, encoder_out=encoder_out, early_exit=self.early_exit[2] - ) - features_cat = torch.cat([features[:, :-1, :], features[:, 1:, :]], 2) - f_word_del = F.linear(features, self.embed_word_del.weight) - f_mask_ins = F.linear(features_cat, self.embed_mask_ins.weight) - return f_word_del, f_mask_ins, extra['attn'] - @register_model_architecture("levenshtein_transformer", "levenshtein_transformer") def base_architecture(args): @@ -584,9 +605,11 @@ def base_architecture(args): args.decoder_output_dim = getattr( args, "decoder_output_dim", args.decoder_embed_dim ) + args.sampling_for_deletion = getattr(args, "sampling_for_deletion", False) args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) - - args.early_exit = getattr(args, "early_exit", "(6, 6, 6)") + args.early_exit = getattr(args, "early_exit", "6,6,6") + args.no_share_discriminator = getattr(args, "no_share_discriminator", False) + args.no_share_maskpredictor = getattr(args, "no_share_maskpredictor", False) @register_model_architecture( From 34e6a5e8edc7f297286161d9034e2bcba2c7b8c5 Mon Sep 17 00:00:00 2001 From: Louis MARTIN Date: Mon, 21 Oct 2019 19:57:22 -0700 Subject: [PATCH 186/213] Fix load_dataset signature (#1281) Summary: Fix for https://github.com/pytorch/fairseq/issues/1240 Tested with MaskedLMTask. 
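The fix is to accept extra keyword arguments in the overridden `load_dataset`, so that callers following the base task signature do not raise a `TypeError`. A minimal sketch of the pattern outside fairseq (class names here are placeholders for this example only)::

    class BaseTask:
        def load_dataset(self, split, epoch=0, combine=False, **kwargs):
            raise NotImplementedError

    class ToyMaskedLMTask(BaseTask):
        def load_dataset(self, split, epoch=0, combine=False, **kwargs):
            # keyword arguments added later to the base API are absorbed by
            # **kwargs instead of breaking the override
            print('loading split', split, 'at epoch', epoch)

    ToyMaskedLMTask().load_dataset('train', epoch=1, some_new_flag=True)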
Pull Request resolved: https://github.com/pytorch/fairseq/pull/1281 Differential Revision: D18051472 fbshipit-source-id: 0aeff60c71489655f5e621349f780ba9cd8c027a --- fairseq/tasks/masked_lm.py | 2 +- fairseq/tasks/multilingual_masked_lm.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fairseq/tasks/masked_lm.py b/fairseq/tasks/masked_lm.py index cd677dd0ac..60449d8069 100644 --- a/fairseq/tasks/masked_lm.py +++ b/fairseq/tasks/masked_lm.py @@ -71,7 +71,7 @@ def setup_task(cls, args, **kwargs): print('| dictionary: {} types'.format(len(dictionary))) return cls(args, dictionary) - def load_dataset(self, split, epoch=0, combine=False): + def load_dataset(self, split, epoch=0, combine=False, **kwargs): """Load a given dataset split. Args: diff --git a/fairseq/tasks/multilingual_masked_lm.py b/fairseq/tasks/multilingual_masked_lm.py index 407f83e2eb..cc1e233132 100644 --- a/fairseq/tasks/multilingual_masked_lm.py +++ b/fairseq/tasks/multilingual_masked_lm.py @@ -110,7 +110,7 @@ def _get_sample_prob(self, dataset_lens): smoothed_prob = smoothed_prob / smoothed_prob.sum() return smoothed_prob - def load_dataset(self, split, epoch=0, combine=False): + def load_dataset(self, split, epoch=0, combine=False, **kwargs): """Load a given dataset split. Args: From 2d51e04d93295ab40fb787fabae9f3b53cc1266e Mon Sep 17 00:00:00 2001 From: Louis MARTIN Date: Mon, 21 Oct 2019 19:59:18 -0700 Subject: [PATCH 187/213] Rename "loaded {} batches" to "loaded {} blocks" (#1279) Summary: Very small change. The previous message was misleading, the length of TokenBlocksDataset is a number of "blocks" or "streams" but not the number of batches strictly speaking if I am not mistaken. I use the notion of batch from roberta https://github.com/pytorch/fairseq/blob/master/examples/roberta/README.pretraining.md. It took me some time to understand what was going on, I hope it saves some time for others. Pull Request resolved: https://github.com/pytorch/fairseq/pull/1279 Differential Revision: D18051476 fbshipit-source-id: 71fa35f21b9dbc8d6bde28cd3a487723690aadee --- fairseq/tasks/masked_lm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fairseq/tasks/masked_lm.py b/fairseq/tasks/masked_lm.py index 60449d8069..ad5d19fcae 100644 --- a/fairseq/tasks/masked_lm.py +++ b/fairseq/tasks/masked_lm.py @@ -100,7 +100,7 @@ def load_dataset(self, split, epoch=0, combine=False, **kwargs): eos=self.source_dictionary.eos(), break_mode=self.args.sample_break_mode, ) - print('| loaded {} batches from: {}'.format(len(dataset), split_path)) + print('| loaded {} blocks from: {}'.format(len(dataset), split_path)) # prepend beginning-of-sentence token (, equiv. to [CLS] in BERT) dataset = PrependTokenDataset(dataset, self.source_dictionary.bos()) From e49b302a635f06301cd3eb0afe25fad2742bd4b1 Mon Sep 17 00:00:00 2001 From: Changhan Wang Date: Tue, 22 Oct 2019 13:08:41 -0700 Subject: [PATCH 188/213] fix score Summary: Bugfix for inconsistent scores on the same input sentences. This only affects the displayed scores in `generate.py` and does not affect the model outputs. 
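The displayed score should be the log-probability of the token that was actually emitted, so the prediction and its score have to come from the same `max` over the log-softmax output, as the change below does. A standalone sketch of the invariant::

    import torch
    import torch.nn.functional as F

    logits = torch.randn(2, 5, 100)            # (batch, length, vocab)
    log_probs = F.log_softmax(logits, dim=-1)
    scores, preds = log_probs.max(dim=-1)      # both (batch, length), aligned
    # scores[b, t] equals log_probs[b, t, preds[b, t]] at every position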
Reviewed By: MultiPath Differential Revision: D17799343 fbshipit-source-id: 2b868ac03097a4db27db736e126a61d50958acc5 --- fairseq/models/levenshtein_transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fairseq/models/levenshtein_transformer.py b/fairseq/models/levenshtein_transformer.py index 9468c79bd8..34d0899204 100644 --- a/fairseq/models/levenshtein_transformer.py +++ b/fairseq/models/levenshtein_transformer.py @@ -406,8 +406,8 @@ def forward_decoder( word_ins_out, word_ins_attn = self.decoder.forward_word_ins( _skip(output_tokens, can_ins_word), _skip(encoder_out, can_ins_word) ) - word_ins_score = F.log_softmax(word_ins_out, 2) - word_ins_pred = word_ins_score.max(-1)[1] + + word_ins_score, word_ins_pred = F.log_softmax(word_ins_out, 2).max(-1) _tokens, _scores = _apply_ins_words( output_tokens[can_ins_word], From 8defa9d9a47bf99d07090efd35da78f99fb3e48d Mon Sep 17 00:00:00 2001 From: Yilei Li Date: Tue, 22 Oct 2019 19:46:38 -0700 Subject: [PATCH 189/213] Add warmup support in reduce_on_plateau lr schedule Summary: Enables reduce_on_plateau schedule with optional warmup phase, where we linearly increase the learning rate from some initial learning rate (``--warmup-init-lr``) until the configured learning rate (``--lr``). Thereafter the lr is adjusted according to original reduce_on_plateau scheme During warmup:: lrs = torch.linspace(args.warmup_init_lr, args.lr, args.warmup_updates) lr = lrs[update_num] Reviewed By: yqwangustc Differential Revision: D17779925 fbshipit-source-id: c3bfb3321c76850824fc42df4fac4e5dcf73fbf8 --- .../lr_scheduler/reduce_lr_on_plateau.py | 48 +++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py b/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py index 715c714b65..c2f0671bca 100644 --- a/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py +++ b/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py @@ -10,7 +10,17 @@ @register_lr_scheduler('reduce_lr_on_plateau') class ReduceLROnPlateau(FairseqLRScheduler): - """Decay the LR by a factor every time the validation loss plateaus.""" + """ + Decay the LR by a factor every time the validation loss plateaus. + Also comes with optional warmup phase, where we linearly increase the learning rate + from some initial learning rate (``--warmup-init-lr``) until the configured + learning rate (``--lr``). 
Thereafter the lr is adjusted according to original reduce_on_plateau scheme + + During warmup:: + + lrs = torch.linspace(args.warmup_init_lr, args.lr, args.warmup_updates) + lr = lrs[update_num] + """ def __init__(self, args, optimizer): super().__init__(args, optimizer) @@ -22,6 +32,20 @@ def __init__(self, args, optimizer): self.lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( self.optimizer.optimizer, patience=0, factor=args.lr_shrink, threshold=args.lr_threshold) + warmup_end_lr = args.lr[0] + """if no warm up, sets initial lr to be args.lr[0]""" + if args.warmup_init_lr < 0: + args.warmup_init_lr = 0 if args.warmup_updates > 0 else warmup_end_lr + + """ linearly warmup for the first args.warmup_updates""" + if args.warmup_updates > 0: + self.lr_step = (warmup_end_lr - args.warmup_init_lr) / args.warmup_updates + """ this flag is either set from arg when no warm up, or set by step_update() when warmup finishes""" + self.warmup_end = True if args.warmup_updates <= 0 else False + """ initial learning rate""" + """this self.lr is used only during init and/or warm up period""" + self.lr = args.warmup_init_lr + self.optimizer.set_lr(self.lr) @staticmethod def add_args(parser): @@ -32,6 +56,10 @@ def add_args(parser): parser.add_argument('--lr-threshold', default=1e-4, type=float, metavar='LT', help='Threshold for measuring the new optimum, \ to only focus on significant changes') + parser.add_argument('--warmup-updates', default=0, type=int, metavar='N', + help='warmup the learning rate linearly for the first N updates') + parser.add_argument('--warmup-init-lr', default=-1, type=float, metavar='LR', + help='initial learning rate during warmup phase; default is args.lr') # fmt: on def state_dict(self): @@ -48,9 +76,23 @@ def load_state_dict(self, state_dict): self.lr_scheduler.last_epoch = state_dict['last_epoch'] def step(self, epoch, val_loss=None): - """Update the learning rate at the end of the given epoch.""" - if val_loss is not None: + """Update the learning rate at the end of the given epoch if warmup finishes""" + """ otherwise no update of lr on epoch boundaries""" + if val_loss is not None and self.warmup_end is True: self.lr_scheduler.step(val_loss, epoch) else: self.lr_scheduler.last_epoch = epoch return self.optimizer.get_lr() + + def step_update(self, num_updates): + """Update the learning rate after each update.""" + """ if there is warmup""" + if self.args.warmup_updates > 0: + if num_updates <= self.args.warmup_updates: + self.lr = self.args.warmup_init_lr + num_updates*self.lr_step + self.optimizer.set_lr(self.lr) + else: + if self.warmup_end is False: + self.warmup_end = True + """else do nothing """ + return self.optimizer.get_lr() From 5a2f76ede044b4904af9461e18253f2929cfc5a4 Mon Sep 17 00:00:00 2001 From: Ning Dong Date: Wed, 23 Oct 2019 17:44:55 -0700 Subject: [PATCH 190/213] NAT productionization Summary: NAT productionization diff (1) Integrate NAT model training / Evaluation in LATTE base training workflow. (2) Make NAT tracing compliant. Since it calls into Fairseq transformer, we need to refactor the code and I created a ~copy of it named fb_tracing_transformer. (3) Decoder side C++ code is landed in the diff earlier. 
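In practice, making the refinement loop tracing compliant means rewriting the tensor bookkeeping as TorchScript-friendly functions: explicit type annotations, no `None` values flowing through, and plain tensors or lists instead of dictionaries, as in the `script_skip_tensor*` helpers in the diff below. A rough sketch of the pattern (the function name is only for this example)::

    import torch
    from torch import Tensor

    @torch.jit.script
    def keep_rows(x: Tensor, mask: Tensor) -> Tensor:
        # keep entries selected by a boolean mask; the explicit Tensor
        # annotations are what lets TorchScript compile this
        if x.size(0) == mask.size(0):
            return x[mask]
        return x[:, mask]

    x = torch.arange(12).view(4, 3)
    mask = torch.tensor([True, False, True, False])
    print(keep_rows(x, mask))   # rows 0 and 2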
Reviewed By: xianxl Differential Revision: D17888324 fbshipit-source-id: ef4ef195fddd360da921502adcef82b087e46ce6 --- fairseq/criterions/nat_loss.py | 31 +- fairseq/data/language_pair_dataset.py | 13 + fairseq/iterative_refinement_generator.py | 140 ++-- fairseq/models/levenshtein_transformer.py | 602 ++++++++++-------- fairseq/models/model_utils.py | 135 +++- fairseq/models/nonautoregressive_ensembles.py | 2 +- 6 files changed, 587 insertions(+), 336 deletions(-) diff --git a/fairseq/criterions/nat_loss.py b/fairseq/criterions/nat_loss.py index ccb25298f4..174b1203cc 100644 --- a/fairseq/criterions/nat_loss.py +++ b/fairseq/criterions/nat_loss.py @@ -7,6 +7,7 @@ import torch.nn.functional as F from fairseq import utils +import torch from torch import Tensor from . import FairseqCriterion, register_criterion @@ -44,23 +45,27 @@ def mean_ds(x: Tensor, dim=None) -> Tensor: if dim is None else x.float().mean(dim).type_as(x) ) - if masks is not None: outputs, targets = outputs[masks], targets[masks] - logits = F.log_softmax(outputs, dim=-1) - if targets.dim() == 1: - losses = F.nll_loss(logits, targets, reduction="none") - - else: # soft-labels - losses = F.kl_div(logits, targets, reduction="none") - losses = losses.float().sum(-1).type_as(losses) - - nll_loss = mean_ds(losses) - if label_smoothing > 0: - loss = nll_loss * (1 - label_smoothing) - mean_ds(logits) * label_smoothing - else: + if not masks.any(): + nll_loss = torch.tensor(0) loss = nll_loss + else: + logits = F.log_softmax(outputs, dim=-1) + if targets.dim() == 1: + losses = F.nll_loss(logits, targets.to(logits.device), reduction='none') + + else: # soft-labels + losses = F.kl_div(logits, targets.to(logits.device), reduction='none') + losses = losses.sum(-1) + + nll_loss = mean_ds(losses) + if label_smoothing > 0: + loss = nll_loss * ( + 1 - label_smoothing) - mean_ds(logits) * label_smoothing + else: + loss = nll_loss loss = loss * factor return {"name": name, "loss": loss, "nll_loss": nll_loss, "factor": factor} diff --git a/fairseq/data/language_pair_dataset.py b/fairseq/data/language_pair_dataset.py index 09c7193ab4..48853ba726 100644 --- a/fairseq/data/language_pair_dataset.py +++ b/fairseq/data/language_pair_dataset.py @@ -142,6 +142,8 @@ class LanguagePairDataset(FairseqDataset): target if it's absent (default: False). align_dataset (torch.utils.data.Dataset, optional): dataset containing alignments. + append_bos (bool, optional): if set, appends bos to the beginning of + source/target sentence. 
""" def __init__( @@ -152,6 +154,7 @@ def __init__( shuffle=True, input_feeding=True, remove_eos_from_source=False, append_eos_to_target=False, align_dataset=None, + append_bos=False ): if tgt_dict is not None: assert src_dict.pad() == tgt_dict.pad() @@ -174,6 +177,7 @@ def __init__( self.align_dataset = align_dataset if self.align_dataset is not None: assert self.tgt_sizes is not None, "Both source and target needed when alignments are provided" + self.append_bos = append_bos def __getitem__(self, index): tgt_item = self.tgt[index] if self.tgt is not None else None @@ -187,6 +191,15 @@ def __getitem__(self, index): if self.tgt and self.tgt[index][-1] != eos: tgt_item = torch.cat([self.tgt[index], torch.LongTensor([eos])]) + if self.append_bos: + bos = self.tgt_dict.bos() if self.tgt_dict else self.src_dict.bos() + if self.tgt and self.tgt[index][0] != bos: + tgt_item = torch.cat([torch.LongTensor([bos]), self.tgt[index]]) + + bos = self.src_dict.bos() + if self.src[index][-1] != bos: + src_item = torch.cat([torch.LongTensor([bos]), self.src[index]]) + if self.remove_eos_from_source: eos = self.src_dict.eos() if self.src[index][-1] == eos: diff --git a/fairseq/iterative_refinement_generator.py b/fairseq/iterative_refinement_generator.py index eeb6241039..551c49ffc2 100644 --- a/fairseq/iterative_refinement_generator.py +++ b/fairseq/iterative_refinement_generator.py @@ -4,21 +4,25 @@ # LICENSE file in the root directory of this source tree. import torch - -from fairseq.models.model_utils import skip_tensors as _skip -from fairseq.models.nonautoregressive_ensembles import EnsembleLevT -from fairseq.models.levenshtein_transformer import LevenshteinTransformerModel +from fairseq import utils +from fairseq.models.model_utils import ( + script_skip_tensor_list, + skip_tensors as _skip, +) class IterativeRefinementGenerator(object): - def __init__(self, - tgt_dict, - eos_penalty=0., - max_iter=10, - max_ratio=2, - decoding_format=None, - retain_dropout=False, - adaptive=True): + def __init__( + self, + models, + tgt_dict, + eos_penalty=0.0, + max_iter=10, + max_ratio=2, + decoding_format=None, + retain_dropout=False, + adaptive=True, + ): """ Generates translations based on iterative refinement. @@ -42,34 +46,67 @@ def __init__(self, self.decoding_format = decoding_format self.retain_dropout = retain_dropout self.adaptive = adaptive + self.models = models + + def generate_batched_itr( + self, + data_itr, + maxlen_a=None, + maxlen_b=None, + cuda=False, + timer=None, + prefix_size=0, + ): + """Iterate over a batched dataset and yield individual translations. - @torch.no_grad() - def generate(self, models, sample, prefix_tokens=None): + Args: + maxlen_a/b: generate sequences of maximum length ax + b, + where x is the source sentence length. + cuda: use GPU for generation + timer: StopwatchMeter for timing generations. + """ - if len(models) == 1: - # Keep this for other NAT models for which we have yet to implement ensemble wrappers. Later delete this. 
- model = models[0] - elif isinstance(models[0], LevenshteinTransformerModel): - model = EnsembleLevT(models) - else: - raise NotImplementedError + for sample in data_itr: + if "net_input" not in sample: + continue + if timer is not None: + timer.start() + with torch.no_grad(): + hypos = self.generate( + sample, + prefix_tokens=sample["target"][:, :prefix_size] + if prefix_size > 0 + else None, + ) + if timer is not None: + timer.stop(sample["ntokens"]) + for i, id in enumerate(sample["id"]): + # remove padding + src = utils.strip_pad(sample["net_input"]["src_tokens"][i, :], self.pad) + ref = utils.strip_pad(sample["target"][i, :], self.pad) + yield id, src, ref, hypos[i] + @torch.no_grad() + def generate(self, sample, prefix_tokens=None): + + # TODO: model ensemble + assert len(self.models) == 1, "only support single model" + model = self.models[0] if not self.retain_dropout: model.eval() # TODO: better encoder inputs? - src_tokens = sample['net_input']['src_tokens'] - src_lengths = sample['net_input']['src_lengths'] + src_tokens = sample["net_input"]["src_tokens"] + src_lengths = sample["net_input"]["src_lengths"] bsz, src_len = src_tokens.size() - sent_idxs = torch.arange(bsz, device=src_tokens.device) + sent_idxs = torch.arange(bsz) # encoding encoder_out = model.forward_encoder([src_tokens, src_lengths]) # initialize buffers (very model specific, with length prediction or not) - prev_decoder_out = model.initialize_output_tokens( - encoder_out, src_tokens) - prev_out_tokens = prev_decoder_out['output_tokens'].clone() + prev_decoder_out = model.initialize_output_tokens(encoder_out, src_tokens) + prev_output_tokens = prev_decoder_out[0].clone() finalized = [[] for _ in range(bsz)] @@ -94,23 +131,23 @@ def finalized_hypos(step, prev_out_token, prev_out_score, prev_out_attn): hypo_attn = prev_out_attn[cutoff] alignment = hypo_attn.max(dim=1)[1] return { - 'steps': step, - 'tokens': tokens, - 'positional_scores': scores, - 'score': scores.mean(), - 'hypo_attn': hypo_attn, - 'alignment': alignment, + "steps": step, + "tokens": tokens, + "positional_scores": scores, + "score": scores.mean(), + "hypo_attn": hypo_attn, + "alignment": alignment, } for step in range(self.max_iter + 1): decoder_options = { - 'eos_penalty': self.eos_penalty, - 'max_ratio': self.max_ratio, - 'decoding_format': self.decoding_format + "eos_penalty": self.eos_penalty, + "max_ratio": self.max_ratio, + "decoding_format": self.decoding_format, } - prev_decoder_out['step'] = step - prev_decoder_out['max_step'] = self.max_iter + 1 + prev_decoder_out[3] = step + prev_decoder_out[4] = self.max_iter + 1 decoder_out = model.forward_decoder( prev_decoder_out, encoder_out, **decoder_options @@ -119,24 +156,25 @@ def finalized_hypos(step, prev_out_token, prev_out_score, prev_out_attn): if self.adaptive: # terminate if there is a loop terminated, out_tokens, out_scores, out_attn = is_a_loop( - prev_out_tokens, decoder_out['output_tokens'], - decoder_out['output_scores'], decoder_out['attn']) - decoder_out['output_tokens'] = out_tokens - decoder_out['output_scores'] = out_scores - decoder_out['attn'] = out_attn + prev_output_tokens, decoder_out[0], decoder_out[1], decoder_out[2] + ) + decoder_out[0] = out_tokens + decoder_out[1] = out_scores + decoder_out[2] = out_attn else: - terminated = decoder_out['output_tokens'].new_zeros( - decoder_out['output_tokens'].size(0)).bool() + terminated = decoder_out[0].new_zeros(decoder_out[0].size(0)).bool() if step == self.max_iter: # reach last iteration, terminate terminated.fill_(1) # collect 
finalized sentences finalized_idxs = sent_idxs[terminated] - finalized_tokens = decoder_out['output_tokens'][terminated] - finalized_scores = decoder_out['output_scores'][terminated] - finalized_attn = None if decoder_out['attn'] is None else decoder_out['attn'][terminated] + finalized_tokens = decoder_out[0][terminated] + finalized_scores = decoder_out[1][terminated] + finalized_attn = ( + None if decoder_out[2] is None else decoder_out[2][terminated] + ) for i in range(finalized_idxs.size(0)): finalized[finalized_idxs[i]] = [ @@ -144,7 +182,7 @@ def finalized_hypos(step, prev_out_token, prev_out_score, prev_out_attn): step, finalized_tokens[i], finalized_scores[i], - None if finalized_attn is None else finalized_attn[i] + None if finalized_attn is None else finalized_attn[i], ) ] # check if all terminated @@ -153,9 +191,9 @@ def finalized_hypos(step, prev_out_token, prev_out_score, prev_out_attn): # for next step prev_decoder_out = _skip(decoder_out, ~terminated) - encoder_out = _skip(encoder_out, ~terminated) + encoder_out = script_skip_tensor_list(encoder_out, ~terminated) sent_idxs = _skip(sent_idxs, ~terminated) - prev_out_tokens = prev_decoder_out['output_tokens'].clone() + prev_output_tokens = prev_decoder_out[0].clone() return finalized diff --git a/fairseq/models/levenshtein_transformer.py b/fairseq/models/levenshtein_transformer.py index 34d0899204..cbf9244657 100644 --- a/fairseq/models/levenshtein_transformer.py +++ b/fairseq/models/levenshtein_transformer.py @@ -1,22 +1,35 @@ -# Copyright (c) Facebook, Inc. and its affiliates. +#!/usr/bin/env python3 + +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. # -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + + +from __future__ import absolute_import, division, print_function, unicode_literals + +from typing import Optional import torch import torch.nn as nn import torch.nn.functional as F -from fairseq.utils import new_arange from fairseq.models import register_model, register_model_architecture -from fairseq.models.model_utils import fill_tensors as _fill, skip_tensors as _skip -from fairseq.models.transformer import ( - Embedding, - TransformerDecoder, - TransformerEncoder, - TransformerModel, - TransformerDecoderLayer +from fairseq.models.fb_tracing_transformer import ( + TracingTransformerDecoder, + TracingTransformerEncoder, + TracingTransformerModel, + TransformerDecoderLayer, +) +from fairseq.models.model_utils import ( + fill_tensors as _fill, + script_skip_tensor, + script_skip_tensor_list, ) +from fairseq.models.transformer import Embedding from fairseq.modules.transformer_sentence_encoder import init_bert_params +from torch import Tensor def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx): @@ -24,18 +37,17 @@ def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx): from fairseq import libnat except ImportError as e: import sys - sys.stderr.write('ERROR: missing libnat. run `pip install --editable .`\n') + + sys.stderr.write("ERROR: missing libnat. 
run `pip install --editable .`\n") raise e in_seq_len, out_seq_len = in_tokens.size(1), out_tokens.size(1) - with torch.cuda.device_of(in_tokens): - in_tokens_list = [ - [t for t in s if t != padding_idx] for i, s in enumerate(in_tokens.tolist()) - ] - out_tokens_list = [ - [t for t in s if t != padding_idx] - for i, s in enumerate(out_tokens.tolist()) - ] + in_tokens_list = [ + [t for t in s if t != padding_idx] for i, s in enumerate(in_tokens.tolist()) + ] + out_tokens_list = [ + [t for t in s if t != padding_idx] for i, s in enumerate(out_tokens.tolist()) + ] full_labels = libnat.suggested_ed2_path( in_tokens_list, out_tokens_list, padding_idx @@ -59,9 +71,7 @@ def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx): ] # transform to tensor - masked_tgt_masks = torch.tensor( - masked_tgt_masks, device=out_tokens.device - ).bool() + masked_tgt_masks = torch.tensor(masked_tgt_masks, device=out_tokens.device).bool() mask_ins_targets = torch.tensor(mask_ins_targets, device=in_tokens.device) masked_tgt_tokens = out_tokens.masked_fill(masked_tgt_masks, unk_idx) return masked_tgt_masks, masked_tgt_tokens, mask_ins_targets @@ -72,18 +82,17 @@ def _get_del_targets(in_tokens, out_tokens, padding_idx): from fairseq import libnat except ImportError as e: import sys - sys.stderr.write('ERROR: missing libnat. run `pip install --editable .`\n') + + sys.stderr.write("ERROR: missing libnat. run `pip install --editable .`\n") raise e out_seq_len = out_tokens.size(1) - with torch.cuda.device_of(in_tokens): - in_tokens_list = [ - [t for t in s if t != padding_idx] for i, s in enumerate(in_tokens.tolist()) - ] - out_tokens_list = [ - [t for t in s if t != padding_idx] - for i, s in enumerate(out_tokens.tolist()) - ] + in_tokens_list = [ + [t for t in s if t != padding_idx] for i, s in enumerate(in_tokens.tolist()) + ] + out_tokens_list = [ + [t for t in s if t != padding_idx] for i, s in enumerate(out_tokens.tolist()) + ] full_labels = libnat.suggested_ed2_path( in_tokens_list, out_tokens_list, padding_idx @@ -95,7 +104,7 @@ def _get_del_targets(in_tokens, out_tokens, padding_idx): ] # transform to tensor - word_del_targets = torch.tensor(word_del_targets, device=out_tokens.device) + word_del_targets = torch.tensor(word_del_targets) return word_del_targets @@ -104,18 +113,17 @@ def _get_del_ins_targets(in_tokens, out_tokens, padding_idx): from fairseq import libnat except ImportError as e: import sys - sys.stderr.write('ERROR: missing libnat. run `pip install --editable .`\n') + + sys.stderr.write("ERROR: missing libnat. 
run `pip install --editable .`\n") raise e in_seq_len, out_seq_len = in_tokens.size(1), out_tokens.size(1) - with torch.cuda.device_of(in_tokens): - in_tokens_list = [ - [t for t in s if t != padding_idx] for i, s in enumerate(in_tokens.tolist()) - ] - out_tokens_list = [ - [t for t in s if t != padding_idx] - for i, s in enumerate(out_tokens.tolist()) - ] + in_tokens_list = [ + [t for t in s if t != padding_idx] for i, s in enumerate(in_tokens.tolist()) + ] + out_tokens_list = [ + [t for t in s if t != padding_idx] for i, s in enumerate(out_tokens.tolist()) + ] full_labels = libnat.suggested_ed2_path( in_tokens_list, out_tokens_list, padding_idx @@ -136,96 +144,13 @@ def _get_del_ins_targets(in_tokens, out_tokens, padding_idx): ] # transform to tensor - mask_ins_targets = torch.tensor(mask_ins_targets, device=in_tokens.device) - word_del_targets = torch.tensor(word_del_targets, device=out_tokens.device) + mask_ins_targets = torch.tensor(mask_ins_targets) + word_del_targets = torch.tensor(word_del_targets) return word_del_targets, mask_ins_targets -def _apply_ins_masks( - in_tokens, in_scores, mask_ins_pred, padding_idx, unk_idx, eos_idx -): - - in_masks = in_tokens.ne(padding_idx) - in_lengths = in_masks.sum(1) - - # HACK: hacky way to shift all the paddings to eos first. - in_tokens.masked_fill_(~in_masks, eos_idx) - mask_ins_pred.masked_fill_(~in_masks[:, 1:], 0) - - out_lengths = in_lengths + mask_ins_pred.sum(1) - out_max_len = out_lengths.max() - out_masks = ( - new_arange(out_lengths, out_max_len)[None, :] - < out_lengths[:, None] - ) - - reordering = (mask_ins_pred + in_masks[:, 1:].long()).cumsum(1) - out_tokens = ( - in_tokens.new_zeros(in_tokens.size(0), out_max_len) - .fill_(padding_idx) - .masked_fill_(out_masks, unk_idx) - ) - out_tokens[:, 0] = in_tokens[:, 0] - out_tokens.scatter_(1, reordering, in_tokens[:, 1:]) - - out_scores = None - if in_scores is not None: - in_scores.masked_fill_(~in_masks, 0) - out_scores = in_scores.new_zeros(*out_tokens.size()) - out_scores[:, 0] = in_scores[:, 0] - out_scores.scatter_(1, reordering, in_scores[:, 1:]) - - return out_tokens, out_scores - - -def _apply_ins_words(in_tokens, in_scores, word_ins_pred, word_ins_scores, unk_idx): - word_ins_masks = in_tokens.eq(unk_idx) - out_tokens = in_tokens.masked_scatter(word_ins_masks, word_ins_pred[word_ins_masks]) - - if in_scores is not None: - out_scores = in_scores.masked_scatter( - word_ins_masks, word_ins_scores[word_ins_masks] - ) - else: - out_scores = None - - return out_tokens, out_scores - - -def _apply_del_words( - in_tokens, in_scores, in_attn, word_del_pred, padding_idx, bos_idx, eos_idx -): - # apply deletion to a tensor - in_masks = in_tokens.ne(padding_idx) - bos_eos_masks = in_tokens.eq(bos_idx) | in_tokens.eq(eos_idx) - - max_len = in_tokens.size(1) - word_del_pred.masked_fill_(~in_masks, 1) - word_del_pred.masked_fill_(bos_eos_masks, 0) - - reordering = ( - new_arange(in_tokens) - .masked_fill_(word_del_pred, max_len) - .sort(1)[1] - ) - - out_tokens = in_tokens.masked_fill(word_del_pred, padding_idx).gather(1, reordering) - - out_scores = None - if in_scores is not None: - out_scores = in_scores.masked_fill(word_del_pred, 0).gather(1, reordering) - - out_attn = None - if in_attn is not None: - _mask = word_del_pred[:, :, None].expand_as(in_attn) - _reordering = reordering[:, :, None].expand_as(in_attn) - out_attn = in_attn.masked_fill(_mask, 0.).gather(1, _reordering) - - return out_tokens, out_scores, out_attn - - @register_model("levenshtein_transformer") -class 
LevenshteinTransformerModel(TransformerModel): +class LevenshteinTransformerModel(TracingTransformerModel): def __init__(self, encoder, decoder): super().__init__(encoder, decoder) self.tgt_dict = decoder.dictionary @@ -236,7 +161,7 @@ def __init__(self, encoder, decoder): @staticmethod def add_args(parser): - TransformerModel.add_args(parser) + TracingTransformerModel.add_args(parser) parser.add_argument( "--apply-bert-init", action="store_true", @@ -260,8 +185,17 @@ def add_args(parser): ) parser.add_argument( "--sampling-for-deletion", - action='store_true', - help='instead of argmax, use sampling to predict the tokens' + action="store_true", + help="instead of argmax, use sampling to predict the tokens", + ) + # Added for compatibility + parser.add_argument( + "--decoder-out-embed-dim", + default=None, + type=int, + metavar="N", + help="decoder output embedding dimension (bottleneck layer before" + "output layer if specified.)", ) @classmethod @@ -273,7 +207,7 @@ def build_decoder(cls, args, tgt_dict, embed_tokens): @classmethod def build_encoder(cls, args, src_dict, embed_tokens): - encoder = TransformerEncoder(args, src_dict, embed_tokens) + encoder = TracingTransformerEncoder(args, src_dict, embed_tokens) if getattr(args, "apply_bert_init", False): encoder.apply(init_bert_params) return encoder @@ -304,8 +238,8 @@ def forward( # make online prediction if self.decoder.sampling_for_deletion: word_predictions = torch.multinomial( - F.softmax(word_ins_out, -1).view(-1, word_ins_out.size(-1)), 1).view( - word_ins_out.size(0), -1) + F.softmax(word_ins_out, -1).view(-1, word_ins_out.size(-1)), 1 + ).view(word_ins_out.size(0), -1) else: word_predictions = F.log_softmax(word_ins_out, dim=-1).max(2)[1] @@ -315,9 +249,7 @@ def forward( # generate training labels for deletion word_del_targets = _get_del_targets(word_predictions, tgt_tokens, self.pad) - word_del_out, _ = self.decoder.forward_word_del( - word_predictions, encoder_out) - + word_del_out, _ = self.decoder.forward_word_del(word_predictions, encoder_out) return { "mask_ins_out": mask_ins_out, "mask_ins_tgt": mask_ins_targets, @@ -337,123 +269,246 @@ def forward_decoder( self, decoder_out, encoder_out, eos_penalty=0.0, max_ratio=None, **kwargs ): - output_tokens = decoder_out["output_tokens"] - output_scores = decoder_out["output_scores"] - attn = decoder_out["attn"] + output_tokens = decoder_out[0] + output_scores = decoder_out[1] + attn = decoder_out[2] + + if max_ratio is not None and encoder_out[1] is not None: + max_lengths = ((~encoder_out[1]).sum(1) * max_ratio).clamp(min=10) - bsz = output_tokens.size(0) - if max_ratio is None: - max_lens = output_tokens.new().fill_(255) else: - if encoder_out["encoder_padding_mask"] is None: - max_src_len = encoder_out["encoder_out"].size(1) - src_lens = encoder_out["encoder_out"].new(bsz).fill_(max_src_len) - else: - src_lens = (~encoder_out["encoder_padding_mask"]).sum(1) - max_lens = (src_lens * max_ratio).clamp(min=10).long() - - # delete words - # do not delete tokens if it is + max_lengths = torch.zeros(output_tokens.size(0)).fill_(255) + + @torch.jit.script + def del_word( + output_tokens, + output_scores, + attn: Tensor, + word_del_attn: Optional[Tensor], + word_del_pred, + can_del_word, + pad_idx: int, + bos_idx: int, + eos_idx: int, + ): + # delete words + # do not delete tokens if it is + if can_del_word.sum() != 0: # we cannot delete, skip + in_tokens = output_tokens[can_del_word] + in_scores = output_scores[can_del_word] + # apply deletion to a tensor + in_masks = 
in_tokens.ne(pad_idx) + bos_eos_masks = in_tokens.eq(bos_idx) | in_tokens.eq(eos_idx) + + max_len = in_tokens.size(1) + word_del_pred.masked_fill_(~in_masks, 1) + word_del_pred.masked_fill_(bos_eos_masks, 0) + + reordering = ( + torch.arange(max_len)[None, :] + .expand_as(in_tokens) + .contiguous() + .masked_fill(word_del_pred, max_len) + .sort(1)[1] + ) + + _tokens = in_tokens.masked_fill(word_del_pred, pad_idx).gather( + 1, reordering + ) + + _scores = in_scores.masked_fill(word_del_pred, 0).gather(1, reordering) + if word_del_attn is not None: + _mask = word_del_pred[:, :, None].expand_as(word_del_attn) + _reordering = reordering[:, :, None].expand_as(word_del_attn) + _attn = word_del_attn.masked_fill(_mask, 0.0).gather(1, _reordering) + attn = _fill(attn, can_del_word, _attn, 0) + + output_tokens = _fill(output_tokens, can_del_word, _tokens, pad_idx) + output_scores = _fill(output_scores, can_del_word, _scores, 0) + return output_tokens, output_scores, attn + + @torch.jit.script + def ins_placeholders( + output_tokens, + output_scores, + mask_ins_pred, + can_ins_mask, + pad_idx: int, + unk_idx: int, + eos_idx: int, + ): + # insert placeholders + if can_ins_mask.sum() != 0: + in_tokens = output_tokens[can_ins_mask] + in_scores = output_scores[can_ins_mask] + in_masks = in_tokens.ne(pad_idx) + in_lengths = in_masks.sum(1) + + # HACK: hacky way to shift all the paddings to eos first. + in_tokens.masked_fill_(~in_masks, eos_idx) + mask_ins_pred.masked_fill_(~in_masks[:, 1:], 0) + + out_lengths = in_lengths + mask_ins_pred.sum(1) + out_max_len = out_lengths.max() + out_masks = ( + torch.arange(out_max_len)[None, :].long() < out_lengths[:, None] + ) + + reordering = (mask_ins_pred + in_masks[:, 1:].long()).cumsum(1) + out_tokens = ( + torch.zeros(in_tokens.size()[0], out_max_len) + .fill_(pad_idx) + .masked_fill_(out_masks, unk_idx) + ) + out_tokens = torch.cat([in_tokens[:, :1], out_tokens[:, 1:]], 1) + out_tokens.scatter_(1, reordering, in_tokens[:, 1:].float()) + + if in_scores is not None: + in_scores.masked_fill_(~in_masks, 0) + out_scores = torch.zeros_like(out_tokens).to(in_scores) + out_tokens = torch.cat([in_tokens[:, :1], out_tokens[:, 1:]], 1) + out_scores.scatter_(1, reordering, in_scores[:, 1:]) + else: + out_scores = None + output_tokens = _fill(output_tokens, can_ins_mask, out_tokens, pad_idx) + output_scores = _fill(output_scores, can_ins_mask, out_scores, 0) + return output_tokens, output_scores + + @torch.jit.script + def ins_words( + output_tokens, + output_scores, + attn: Tensor, + word_ins_attn, + word_ins_pred, + word_ins_scores, + can_ins_word, + pad_idx: int, + unk_idx: int, + ): + # insert words + if can_ins_word.sum() != 0: + in_tokens = output_tokens[can_ins_word] + in_scores = output_scores[can_ins_word] + word_ins_masks = in_tokens.eq(unk_idx) + out_tokens = in_tokens.masked_scatter( + word_ins_masks, word_ins_pred[word_ins_masks].float() + ) + + if in_scores is not None: + out_scores = in_scores.masked_scatter( + word_ins_masks, word_ins_scores[word_ins_masks] + ) + else: + out_scores = None + output_tokens = _fill(output_tokens, can_ins_word, out_tokens, pad_idx) + output_scores = _fill(output_scores, can_ins_word, out_scores, 0) + attn = _fill(attn, can_ins_word, word_ins_attn, 0) + return output_tokens, output_scores, attn + can_del_word = output_tokens.ne(self.pad).sum(1) > 2 - if can_del_word.sum() != 0: # we cannot delete, skip - word_del_out, word_del_attn = self.decoder.forward_word_del( - _skip(output_tokens, can_del_word), _skip(encoder_out, 
can_del_word) - ) - word_del_score = F.log_softmax(word_del_out, 2) - word_del_pred = word_del_score.max(-1)[1].bool() - - _tokens, _scores, _attn = _apply_del_words( - output_tokens[can_del_word], - output_scores[can_del_word], - word_del_attn, - word_del_pred, - self.pad, - self.bos, - self.eos, - ) - output_tokens = _fill(output_tokens, can_del_word, _tokens, self.pad) - output_scores = _fill(output_scores, can_del_word, _scores, 0) - attn = _fill(attn, can_del_word, _attn, 0.) - - # insert placeholders - can_ins_mask = output_tokens.ne(self.pad).sum(1) < max_lens - if can_ins_mask.sum() != 0: - mask_ins_out, _ = self.decoder.forward_mask_ins( - _skip(output_tokens, can_ins_mask), _skip(encoder_out, can_ins_mask) - ) - mask_ins_score = F.log_softmax(mask_ins_out, 2) - if eos_penalty > 0.0: - mask_ins_score[:, :, 0] = mask_ins_score[:, :, 0] - eos_penalty - mask_ins_pred = mask_ins_score.max(-1)[1] + word_del_out, word_del_attn = self.decoder.forward_word_del( + script_skip_tensor(output_tokens, can_del_word), + script_skip_tensor_list(list(encoder_out), can_del_word), + ) + word_del_score = F.log_softmax(word_del_out, 2) + word_del_pred = word_del_score.max(-1)[1].bool() + + output_tokens, output_scores, attn = del_word( + output_tokens, + output_scores, + attn, + word_del_attn, + word_del_pred, + can_del_word, + self.pad, + self.bos, + self.eos, + ) + + can_ins_mask = output_tokens.ne(self.pad).sum(1) < max_lengths + mask_ins_out, _ = self.decoder.forward_mask_ins( + script_skip_tensor(output_tokens, can_ins_mask), + script_skip_tensor_list(encoder_out, can_ins_mask), + ) + mask_ins_score = F.log_softmax(mask_ins_out, 2) + if eos_penalty > 0.0: + mask_ins_score[:, :, 0] -= eos_penalty + mask_ins_pred = mask_ins_score.max(-1)[1] + if max_ratio is not None and encoder_out[1] is not None: mask_ins_pred = torch.min( - mask_ins_pred, max_lens[can_ins_mask, None].expand_as(mask_ins_pred) + mask_ins_pred, max_lengths[can_ins_mask, None].expand_as(mask_ins_pred) ) - _tokens, _scores = _apply_ins_masks( - output_tokens[can_ins_mask], - output_scores[can_ins_mask], - mask_ins_pred, - self.pad, - self.unk, - self.eos, - ) - output_tokens = _fill(output_tokens, can_ins_mask, _tokens, self.pad) - output_scores = _fill(output_scores, can_ins_mask, _scores, 0) + output_tokens, output_scores = ins_placeholders( + output_tokens, + output_scores, + mask_ins_pred, + can_ins_mask, + self.pad, + self.unk, + self.eos, + ) - # insert words can_ins_word = output_tokens.eq(self.unk).sum(1) > 0 - if can_ins_word.sum() != 0: - word_ins_out, word_ins_attn = self.decoder.forward_word_ins( - _skip(output_tokens, can_ins_word), _skip(encoder_out, can_ins_word) - ) + word_ins_out, word_ins_attn = self.decoder.forward_word_ins( + script_skip_tensor(output_tokens, can_ins_word), + script_skip_tensor_list(encoder_out, can_ins_word), + ) + word_ins_score = F.log_softmax(word_ins_out, 2) + word_ins_pred = word_ins_score.max(-1)[1] + + output_tokens, output_scores, attn = ins_words( + output_tokens, + output_scores, + attn, + word_ins_attn, + word_ins_pred, + word_ins_score, + can_ins_word, + self.pad, + self.unk, + ) - word_ins_score, word_ins_pred = F.log_softmax(word_ins_out, 2).max(-1) + # delete some unnecessary paddings + cut_off = output_tokens.ne(self.pad).sum(1).max() - _tokens, _scores = _apply_ins_words( - output_tokens[can_ins_word], - output_scores[can_ins_word], - word_ins_pred, - word_ins_score, - self.unk, - ) + @torch.jit.script + def slice_wrap(x, l): + return x[:, :l] - output_tokens = 
_fill(output_tokens, can_ins_word, _tokens, self.pad) - output_scores = _fill(output_scores, can_ins_word, _scores, 0) - attn = _fill(attn, can_ins_word, word_ins_attn, 0.) + @torch.jit.script + def slice_wrap_attn(x, l): + return x if x.size()[0] == 0 else x[:, :l, :] - # delete some unnecessary paddings - cut_off = output_tokens.ne(self.pad).sum(1).max() - output_tokens = output_tokens[:, :cut_off] - output_scores = output_scores[:, :cut_off] - attn = None if attn is None else attn[:, :cut_off, :] - return { - "output_tokens": output_tokens, - "output_scores": output_scores, - "attn": attn, - } + output_tokens = slice_wrap(output_tokens, cut_off) + output_scores = slice_wrap(output_scores, cut_off) + attn = slice_wrap(attn, cut_off) + return [output_tokens, output_scores, attn, 0, 0] def initialize_output_tokens(self, encoder_out, src_tokens): - initial_output_tokens = src_tokens.new_zeros(src_tokens.size(0), 2) - initial_output_tokens[:, 0] = self.bos - initial_output_tokens[:, 1] = self.eos - - initial_output_scores = initial_output_tokens.new_zeros( - *initial_output_tokens.size() - ).type_as(encoder_out["encoder_out"]) - - initial_attn = None - if getattr(self.decoder.layers[-1], "need_attn", False): - initial_attn = initial_output_tokens.new_zeros( - src_tokens.size(0), 2, src_tokens.size(1) + initial_output_tokens = torch.cat( + [ + torch.zeros(src_tokens.size(0), 1).fill_(self.bos), + torch.zeros(src_tokens.size(0), 1).fill_(self.eos), + ], + 1, + ) + + initial_output_scores = torch.zeros_like(initial_output_tokens).to( + encoder_out[0] + ) + + initial_attn = torch.empty([0]) + if getattr(self.decoder.layers[-1], "need_attn", True): + initial_attn = torch.zeros([src_tokens.size(0), 2, src_tokens.size(1)]).to( + initial_output_tokens ) - return { - "output_tokens": initial_output_tokens, - "output_scores": initial_output_scores, - "attn": initial_attn, - } + + return [initial_output_tokens, initial_output_scores, initial_attn, 0, 0] -class LevenshteinTransformerDecoder(TransformerDecoder): +class LevenshteinTransformerDecoder(TracingTransformerDecoder): def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__( args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn @@ -467,25 +522,34 @@ def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): self.embed_word_del = Embedding(2, self.output_embed_dim, None) # del_word, ins_mask, ins_word - self.early_exit = [int(i) for i in args.early_exit.split(',')] + self.early_exit = [int(i) for i in args.early_exit.split(",")] assert len(self.early_exit) == 3 # copy layers for mask-predict/deletion self.layers_msk = None if getattr(args, "no_share_maskpredictor", False): - self.layers_msk = nn.ModuleList([ - TransformerDecoderLayer(args, no_encoder_attn) - for _ in range(self.early_exit[1]) - ]) + self.layers_msk = nn.ModuleList( + [ + TransformerDecoderLayer(args, no_encoder_attn) + for _ in range(self.early_exit[1]) + ] + ) self.layers_del = None if getattr(args, "no_share_discriminator", False): - self.layers_del = nn.ModuleList([ - TransformerDecoderLayer(args, no_encoder_attn) - for _ in range(self.early_exit[0]) - ]) + self.layers_del = nn.ModuleList( + [ + TransformerDecoderLayer(args, no_encoder_attn) + for _ in range(self.early_exit[0]) + ] + ) def extract_features( - self, prev_output_tokens, encoder_out=None, early_exit=None, layers=None, **unused + self, + prev_output_tokens, + encoder_out=None, + early_exit=None, + layers=None, + **unused ): """ Similar to *forward* but 
only return features. @@ -508,7 +572,7 @@ def extract_features( ) # embed tokens and positions - x = self.embed_scale * self.embed_tokens(prev_output_tokens) + x = self.embed_scale * self.embed_tokens(prev_output_tokens.long()) if self.project_in_dim is not None: x = self.project_in_dim(x) @@ -525,13 +589,11 @@ def extract_features( decoder_padding_mask = prev_output_tokens.eq(self.padding_idx) layers = self.layers if layers is None else layers early_exit = len(layers) if early_exit is None else early_exit - for _, layer in enumerate(layers[: early_exit]): + for _, layer in enumerate(layers[:early_exit]): x, attn = layer( x, - encoder_out["encoder_out"] if encoder_out is not None else None, - encoder_out["encoder_padding_mask"] - if encoder_out is not None - else None, + encoder_out[0] if encoder_out is not None else None, + encoder_out[1] if encoder_out is not None else None, self_attn_mask=None, self_attn_padding_mask=decoder_padding_mask, ) @@ -546,26 +608,38 @@ def extract_features( if self.project_out_dim is not None: x = self.project_out_dim(x) - return x, {"attn": attn, "inner_states": inner_states} + return x, attn, inner_states def forward_mask_ins(self, prev_output_tokens, encoder_out=None, **unused): - features, extra = self.extract_features( - prev_output_tokens, encoder_out=encoder_out, early_exit=self.early_exit[1], layers=self.layers_msk, **unused + features, attn, _ = self.extract_features( + prev_output_tokens, + encoder_out=encoder_out, + early_exit=self.early_exit[1], + layers=self.layers_msk, + **unused ) features_cat = torch.cat([features[:, :-1, :], features[:, 1:, :]], 2) - return F.linear(features_cat, self.embed_mask_ins.weight), extra['attn'] + return F.linear(features_cat, self.embed_mask_ins.weight), attn def forward_word_ins(self, prev_output_tokens, encoder_out=None, **unused): - features, extra = self.extract_features( - prev_output_tokens, encoder_out=encoder_out, early_exit=self.early_exit[2], layers=self.layers, **unused + features, attn, _ = self.extract_features( + prev_output_tokens, + encoder_out=encoder_out, + early_exit=self.early_exit[2], + layers=self.layers, + **unused ) - return self.output_layer(features), extra['attn'] + return self.output_layer(features), attn def forward_word_del(self, prev_output_tokens, encoder_out=None, **unused): - features, extra = self.extract_features( - prev_output_tokens, encoder_out=encoder_out, early_exit=self.early_exit[0], layers=self.layers_del, **unused + features, attn, _ = self.extract_features( + prev_output_tokens, + encoder_out=encoder_out, + early_exit=self.early_exit[0], + layers=self.layers_del, + **unused ) - return F.linear(features, self.embed_word_del.weight), extra['attn'] + return F.linear(features, self.embed_word_del.weight), attn @register_model_architecture("levenshtein_transformer", "levenshtein_transformer") @@ -595,7 +669,7 @@ def base_architecture(args): args.share_decoder_input_output_embed = getattr( args, "share_decoder_input_output_embed", False ) - args.share_all_embeddings = getattr(args, "share_all_embeddings", False) + args.share_all_embeddings = getattr(args, "share_all_embeddings", True) args.no_token_positional_embeddings = getattr( args, "no_token_positional_embeddings", False ) diff --git a/fairseq/models/model_utils.py b/fairseq/models/model_utils.py index 25b5de4c05..9831efbd15 100644 --- a/fairseq/models/model_utils.py +++ b/fairseq/models/model_utils.py @@ -3,7 +3,42 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory 
of this source tree. +from typing import Dict, List + import torch +from torch import Tensor + + +@torch.jit.script +def script_skip_tensor_list(x: List[Tensor], mask): + res = [xi[mask] if xi.size(0) == mask.size(0) else xi[:, mask] for xi in x] + outputs = [] + for i, t in enumerate(res): + if t.numel() != 0: + outputs.append(t) + else: + outputs.append(x[i]) + return outputs + + +@torch.jit.script +def script_skip_tensor(x: Tensor, mask): + # None case + if x.size(0) == 0: + return x + res = x[mask] if x.size(0) == mask.size(0) else x[:, mask] + if res.numel() == 0: + return x + else: + return res + + +@torch.jit.script +def script_skip_tensor_dict(x: Dict[str, Tensor], mask): + outputs = {} + for s, t in x.items(): + outputs[s] = t[mask] if t.size(0) == mask.size(0) else t[:, mask] + return outputs def skip_tensors(x, mask): @@ -31,7 +66,8 @@ def skip_tensors(x, mask): raise NotImplementedError -def expand_2d_or_3d_tensor(x, trg_dim, padding_idx): +@torch.jit.script +def expand_2d_or_3d_tensor(x, trg_dim: int, padding_idx: int): """ Expand 2D/3D tensor on dim=1 """ @@ -46,18 +82,18 @@ def expand_2d_or_3d_tensor(x, trg_dim, padding_idx): dims = [x.size(0), trg_dim - x.size(1)] if x.dim() == 3: dims.append(x.size(2)) - x = torch.cat([x, x.new_zeros(*dims).fill_(padding_idx)], 1) + x = torch.cat([x, torch.zeros(dims).to(x).fill_(padding_idx)], 1) return x -def fill_tensors(x, mask, y, padding_idx): +@torch.jit.script +def fill_tensors(x, mask, y, padding_idx: int): """ Filling tensor x with y at masked positions (dim=0). """ - if x is None: - return None - + if x is None or x.size()[0] == 0: + return torch.empty([0]) assert x.dim() == y.dim() and mask.size(0) == x.size(0) assert x.dim() == 2 or (x.dim() == 3 and x.size(2) == y.size(2)) @@ -72,7 +108,7 @@ def fill_tensors(x, mask, y, padding_idx): x = expand_2d_or_3d_tensor(x, y.size(1), padding_idx) x[mask] = y elif x.size(1) > y.size(1): - x[mask] = padding_idx + x[mask] = torch.tensor(padding_idx) if x.dim() == 2: x[mask, :y.size(1)] = y else: @@ -80,3 +116,88 @@ def fill_tensors(x, mask, y, padding_idx): else: x[mask] = y return x + + +def _apply_ins_masks( + in_tokens, in_scores, mask_ins_pred, padding_idx, unk_idx, eos_idx +): + + in_masks = in_tokens.ne(padding_idx) + in_lengths = in_masks.sum(1) + + # HACK: hacky way to shift all the paddings to eos first. 
+ in_tokens.masked_fill_(~in_masks, eos_idx) + mask_ins_pred.masked_fill_(~in_masks[:, 1:], 0) + + out_lengths = in_lengths + mask_ins_pred.sum(1) + out_max_len = out_lengths.max() + out_masks = ( + torch.arange(out_max_len, device=out_lengths.device)[None, :] + < out_lengths[:, None] + ) + + reordering = (mask_ins_pred + in_masks[:, 1:].long()).cumsum(1) + out_tokens = ( + in_tokens.new_zeros(in_tokens.size(0), out_max_len) + .fill_(padding_idx) + .masked_fill_(out_masks, unk_idx) + ) + out_tokens[:, 0] = in_tokens[:, 0] + out_tokens.scatter_(1, reordering, in_tokens[:, 1:]) + + out_scores = None + if in_scores is not None: + in_scores.masked_fill_(~in_masks, 0) + out_scores = in_scores.new_zeros(*out_tokens.size()) + out_scores[:, 0] = in_scores[:, 0] + out_scores.scatter_(1, reordering, in_scores[:, 1:]) + + return out_tokens, out_scores + + +def _apply_ins_words(in_tokens, in_scores, word_ins_pred, word_ins_scores, unk_idx): + word_ins_masks = in_tokens.eq(unk_idx) + out_tokens = in_tokens.masked_scatter(word_ins_masks, word_ins_pred[word_ins_masks]) + + if in_scores is not None: + out_scores = in_scores.masked_scatter( + word_ins_masks, word_ins_scores[word_ins_masks] + ) + else: + out_scores = None + + return out_tokens, out_scores + + +def _apply_del_words( + in_tokens, in_scores, in_attn, word_del_pred, padding_idx, bos_idx, eos_idx +): + # apply deletion to a tensor + in_masks = in_tokens.ne(padding_idx) + bos_eos_masks = in_tokens.eq(bos_idx) | in_tokens.eq(eos_idx) + + max_len = in_tokens.size(1) + word_del_pred.masked_fill_(~in_masks, 1) + word_del_pred.masked_fill_(bos_eos_masks, 0) + + reordering = ( + torch.arange(max_len, device=in_tokens.device)[None, :] + .expand_as(in_tokens) + .contiguous() + .masked_fill_(word_del_pred, max_len) + .sort(1)[1] + ) + + out_tokens = in_tokens.masked_fill(word_del_pred, padding_idx).gather(1, reordering) + + out_scores = None + if in_scores is not None: + out_scores = in_scores.masked_fill(word_del_pred, 0).gather(1, reordering) + + out_attn = None + if in_attn is not None: + _mask = word_del_pred[:, :, None].expand_as(in_attn) + _reordering = reordering[:, :, None].expand_as(in_attn) + out_attn = in_attn.masked_fill(_mask, 0.).gather(1, _reordering) + + return out_tokens, out_scores, out_attn diff --git a/fairseq/models/nonautoregressive_ensembles.py b/fairseq/models/nonautoregressive_ensembles.py index 12b9856931..01680b86cd 100644 --- a/fairseq/models/nonautoregressive_ensembles.py +++ b/fairseq/models/nonautoregressive_ensembles.py @@ -7,7 +7,7 @@ import torch.nn.functional as F import math from fairseq.models.model_utils import fill_tensors as _fill, skip_tensors as _skip -from fairseq.models.levenshtein_transformer import _apply_del_words, _apply_ins_masks, _apply_ins_words +from fairseq.models.model_utils import _apply_del_words, _apply_ins_masks, _apply_ins_words class BasicEnsembleModel(torch.nn.Module): From 39faa0a419a8051837ee26c433c8ba863a2b51f3 Mon Sep 17 00:00:00 2001 From: Jerry Ma Date: Wed, 23 Oct 2019 20:15:03 -0700 Subject: [PATCH 191/213] Reset both WPS and UPS on first minibatch (#891) Summary: Makes more sense to reset either both meters or neither of them. 
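For illustration only (this is not part of the patch; the actual train.py change follows below), a minimal sketch of why the two throughput meters should share the same measurement window. The `Meter` class here is a stand-in invented for the example, not fairseq's meter implementation:

```
import time

class Meter:
    """Tiny stand-in for a throughput meter (illustration only)."""
    def __init__(self):
        self.reset()
    def reset(self):
        self.count, self.start = 0, time.time()
    def update(self, n=1):
        self.count += n
    @property
    def rate(self):
        return self.count / max(time.time() - self.start, 1e-6)

wps, ups = Meter(), Meter()  # words/sec and updates/sec
for i, num_words in enumerate([4000, 9200, 9100, 9300]):
    wps.update(num_words)
    ups.update(1)
    if i == 0:
        # The first mini-batch carries warmup cost; resetting only `wps`
        # would leave the two meters covering different time windows, so
        # both are reset together.
        wps.reset()
        ups.reset()
print(wps.rate, ups.rate)
```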
Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/891 Differential Revision: D18109027 Pulled By: jma127 fbshipit-source-id: f63baed9a6b928a6f591a76e69ef6e9c524e4398 --- train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/train.py b/train.py index 9396358aa9..d287e2513d 100644 --- a/train.py +++ b/train.py @@ -146,9 +146,10 @@ def train(args, trainer, task, epoch_itr): stats[k] = extra_meters[k].avg progress.log(stats, tag='train', step=stats['num_updates']) - # ignore the first mini-batch in words-per-second calculation + # ignore the first mini-batch in words-per-second and updates-per-second calculation if i == 0: trainer.get_meter('wps').reset() + trainer.get_meter('ups').reset() num_updates = trainer.get_num_updates() if ( From d0358bb38e4a40d8faaa155900ef7859c9b867b5 Mon Sep 17 00:00:00 2001 From: Jerry Ma Date: Wed, 23 Oct 2019 20:50:42 -0700 Subject: [PATCH 192/213] fix inconsistency w/ recent pytorch cuda device logic Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/892 Differential Revision: D18109685 Pulled By: jma127 fbshipit-source-id: f96e1080a5577b8ee0748dfdd956bf72bed47474 --- fairseq/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fairseq/trainer.py b/fairseq/trainer.py index 03601f69d2..545357ebef 100644 --- a/fairseq/trainer.py +++ b/fairseq/trainer.py @@ -330,7 +330,7 @@ def maybe_no_sync(): print(msg, file=sys.stderr) if torch.cuda.is_available() and hasattr(torch.cuda, "memory_summary"): for device_idx in range(torch.cuda.device_count()): - print(torch.cuda.memory_summary(device=torch.cuda.device(device_idx)), + print(torch.cuda.memory_summary(device=device_idx), file=sys.stderr) sys.stderr.flush() From 5b086a0c17a16ed84285ce78017637d47fa50caa Mon Sep 17 00:00:00 2001 From: Ning Dong Date: Thu, 24 Oct 2019 09:54:33 -0700 Subject: [PATCH 193/213] OSS tracing compliant transformer to unbreak master (#1299) Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1299 LevT calls into tracing compliant transformer we didn't plan to OSS earlier. This is a workaround to unbreak the master. Will revisit and simplify the code later. Reviewed By: pipibjc Differential Revision: D18110339 fbshipit-source-id: 3bb51c56c2c20f45db1d5786d030b374b412eab1 --- fairseq/models/levenshtein_transformer.py | 2 +- .../models/tracing_compliant_transformer.py | 625 ++++++++++++++++++ 2 files changed, 626 insertions(+), 1 deletion(-) create mode 100644 fairseq/models/tracing_compliant_transformer.py diff --git a/fairseq/models/levenshtein_transformer.py b/fairseq/models/levenshtein_transformer.py index cbf9244657..7592bb99da 100644 --- a/fairseq/models/levenshtein_transformer.py +++ b/fairseq/models/levenshtein_transformer.py @@ -16,7 +16,7 @@ import torch.nn as nn import torch.nn.functional as F from fairseq.models import register_model, register_model_architecture -from fairseq.models.fb_tracing_transformer import ( +from fairseq.models.tracing_compliant_transformer import ( TracingTransformerDecoder, TracingTransformerEncoder, TracingTransformerModel, diff --git a/fairseq/models/tracing_compliant_transformer.py b/fairseq/models/tracing_compliant_transformer.py new file mode 100644 index 0000000000..ca3d807bed --- /dev/null +++ b/fairseq/models/tracing_compliant_transformer.py @@ -0,0 +1,625 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from fairseq import options, utils +from fairseq.models import ( + FairseqEncoder, + FairseqIncrementalDecoder, + FairseqEncoderDecoderModel, + register_model, + register_model_architecture, +) +from fairseq.models.transformer import Embedding, Linear, base_architecture +from fairseq.modules import ( + AdaptiveSoftmax, + LayerNorm, + PositionalEmbedding, + SinusoidalPositionalEmbedding, + TransformerDecoderLayer, + TransformerEncoderLayer, +) + +DEFAULT_MAX_SOURCE_POSITIONS = 1024 +DEFAULT_MAX_TARGET_POSITIONS = 1024 + + +@register_model('tracing_transformer') +class TracingTransformerModel(FairseqEncoderDecoderModel): + """ + Transformer model from `"Attention Is All You Need" (Vaswani, et al, 2017) + `_. + + Args: + encoder (TransformerEncoder): the encoder + decoder (TransformerDecoder): the decoder + + The Transformer model provides the following named architectures and + command-line arguments: + + .. argparse:: + :ref: fairseq.models.transformer_parser + :prog: + """ + + @classmethod + def hub_models(cls): + # fmt: off + return { + 'transformer.wmt14.en-fr': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-fr.joined-dict.transformer.tar.bz2', + 'transformer.wmt16.en-de': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt16.en-de.joined-dict.transformer.tar.bz2', + 'transformer.wmt18.en-de': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt18.en-de.ensemble.tar.gz', + 'transformer.wmt19.en-de': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz', + 'transformer.wmt19.en-ru': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz', + 'transformer.wmt19.de-en': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz', + 'transformer.wmt19.ru-en': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz', + 'transformer.wmt19.en-de.single_model': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.single_model.tar.gz', + 'transformer.wmt19.en-ru.single_model': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.single_model.tar.gz', + 'transformer.wmt19.de-en.single_model': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.single_model.tar.gz', + 'transformer.wmt19.ru-en.single_model': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.single_model.tar.gz', + } + # fmt: on + + def __init__(self, encoder, decoder): + super().__init__(encoder, decoder) + self.supports_align_args = True + + @staticmethod + def add_args(parser): + """Add model-specific arguments to the parser.""" + # fmt: off + parser.add_argument('--activation-fn', + choices=utils.get_available_activation_fns(), + help='activation function to use') + parser.add_argument('--dropout', type=float, metavar='D', + help='dropout probability') + parser.add_argument('--attention-dropout', type=float, metavar='D', + help='dropout probability for attention weights') + parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D', + help='dropout probability after activation in FFN.') + parser.add_argument('--encoder-embed-path', type=str, metavar='STR', + help='path to pre-trained encoder embedding') + parser.add_argument('--encoder-embed-dim', type=int, metavar='N', + help='encoder embedding dimension') + parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N', + help='encoder embedding dimension for FFN') + 
parser.add_argument('--encoder-layers', type=int, metavar='N', + help='num encoder layers') + parser.add_argument('--encoder-attention-heads', type=int, metavar='N', + help='num encoder attention heads') + parser.add_argument('--encoder-normalize-before', action='store_true', + help='apply layernorm before each encoder block') + parser.add_argument('--encoder-learned-pos', action='store_true', + help='use learned positional embeddings in the encoder') + parser.add_argument('--decoder-embed-path', type=str, metavar='STR', + help='path to pre-trained decoder embedding') + parser.add_argument('--decoder-embed-dim', type=int, metavar='N', + help='decoder embedding dimension') + parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', + help='decoder embedding dimension for FFN') + parser.add_argument('--decoder-layers', type=int, metavar='N', + help='num decoder layers') + parser.add_argument('--decoder-attention-heads', type=int, metavar='N', + help='num decoder attention heads') + parser.add_argument('--decoder-learned-pos', action='store_true', + help='use learned positional embeddings in the decoder') + parser.add_argument('--decoder-normalize-before', action='store_true', + help='apply layernorm before each decoder block') + parser.add_argument('--share-decoder-input-output-embed', action='store_true', + help='share decoder input and output embeddings') + parser.add_argument('--share-all-embeddings', action='store_true', + help='share encoder, decoder and output embeddings' + ' (requires shared dictionary and embed dim)') + parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true', + help='if set, disables positional embeddings (outside self attention)') + parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR', + help='comma separated list of adaptive softmax cutoff points. 
' + 'Must be used with adaptive_loss criterion'), + parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D', + help='sets adaptive softmax dropout for the tail projections') + # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019) + parser.add_argument('--no-cross-attention', default=False, action='store_true', + help='do not perform cross-attention') + parser.add_argument('--cross-self-attention', default=False, action='store_true', + help='perform cross+self-attention') + parser.add_argument('--layer-wise-attention', default=False, action='store_true', + help='perform layer-wise attention (cross-attention or cross+self-attention)') + # fmt: on + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + + # make sure all arguments are present in older models + base_architecture(args) + + if not hasattr(args, 'max_source_positions'): + args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS + if not hasattr(args, 'max_target_positions'): + args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS + + src_dict, tgt_dict = task.source_dictionary, task.target_dictionary + + def build_embedding(dictionary, embed_dim, path=None): + num_embeddings = len(dictionary) + padding_idx = dictionary.pad() + emb = Embedding(num_embeddings, embed_dim, padding_idx) + # if provided, load from preloaded dictionaries + if path: + embed_dict = utils.parse_embedding(path) + utils.load_embedding(embed_dict, dictionary, emb) + return emb + + if args.share_all_embeddings: + if src_dict != tgt_dict: + raise ValueError('--share-all-embeddings requires a joined dictionary') + if args.encoder_embed_dim != args.decoder_embed_dim: + raise ValueError( + '--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim') + if args.decoder_embed_path and ( + args.decoder_embed_path != args.encoder_embed_path): + raise ValueError('--share-all-embeddings not compatible with --decoder-embed-path') + encoder_embed_tokens = build_embedding( + src_dict, args.encoder_embed_dim, args.encoder_embed_path + ) + decoder_embed_tokens = encoder_embed_tokens + args.share_decoder_input_output_embed = True + else: + encoder_embed_tokens = build_embedding( + src_dict, args.encoder_embed_dim, args.encoder_embed_path + ) + decoder_embed_tokens = build_embedding( + tgt_dict, args.decoder_embed_dim, args.decoder_embed_path + ) + + encoder = cls.build_encoder(args, src_dict, encoder_embed_tokens) + decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens) + return cls(encoder, decoder) + + @classmethod + def build_encoder(cls, args, src_dict, embed_tokens): + return TracingTransformerEncoder(args, src_dict, embed_tokens) + + @classmethod + def build_decoder(cls, args, tgt_dict, embed_tokens): + return TracingTransformerDecoder( + args, + tgt_dict, + embed_tokens, + no_encoder_attn=getattr(args, 'no_cross_attention', False), + ) + + +class TracingTransformerEncoder(FairseqEncoder): + """ + Transformer encoder consisting of *args.encoder_layers* layers. Each layer + is a :class:`TransformerEncoderLayer`. 
+ + Args: + args (argparse.Namespace): parsed command-line arguments + dictionary (~fairseq.data.Dictionary): encoding dictionary + embed_tokens (torch.nn.Embedding): input embedding + """ + + def __init__(self, args, dictionary, embed_tokens): + super().__init__(dictionary) + self.register_buffer('version', torch.Tensor([3])) + + self.dropout = args.dropout + + embed_dim = embed_tokens.embedding_dim + self.padding_idx = embed_tokens.padding_idx + self.max_source_positions = args.max_source_positions + + self.embed_tokens = embed_tokens + self.embed_scale = math.sqrt(embed_dim) + self.embed_positions = PositionalEmbedding( + args.max_source_positions, embed_dim, self.padding_idx, + learned=args.encoder_learned_pos, + ) if not args.no_token_positional_embeddings else None + + self.layer_wise_attention = getattr(args, 'layer_wise_attention', False) + + self.layers = nn.ModuleList([]) + self.layers.extend([ + TransformerEncoderLayer(args) + for i in range(args.encoder_layers) + ]) + + if args.encoder_normalize_before: + self.layer_norm = LayerNorm(embed_dim) + else: + self.layer_norm = None + + def forward_embedding(self, src_tokens): + # embed tokens and positions + embed = self.embed_scale * self.embed_tokens(src_tokens) + if self.embed_positions is not None: + x = embed + self.embed_positions(src_tokens) + x = F.dropout(x, p=self.dropout, training=self.training) + return x, embed + + def forward(self, src_tokens, src_lengths, cls_input=None, return_all_hiddens=False): + """ + Args: + src_tokens (LongTensor): tokens in the source language of shape + `(batch, src_len)` + src_lengths (torch.LongTensor): lengths of each source sentence of + shape `(batch)` + return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). + + Returns: + dict: + - **encoder_out** (Tensor): the last encoder layer's output of + shape `(src_len, batch, embed_dim)` + - **encoder_padding_mask** (ByteTensor): the positions of + padding elements of shape `(batch, src_len)` + - **encoder_states** (List[Tensor]): all intermediate + hidden states of shape `(src_len, batch, embed_dim)`. + Only populated if *return_all_hiddens* is True. + """ + if self.layer_wise_attention: + return_all_hiddens = True + + x, encoder_embedding = self.forward_embedding(src_tokens) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + # compute padding mask + encoder_padding_mask = src_tokens.eq(self.padding_idx) + + encoder_states = [] if return_all_hiddens else None + + # encoder layers + for layer in self.layers: + x = layer(x, encoder_padding_mask) + if return_all_hiddens: + encoder_states.append(x) + + if self.layer_norm: + x = self.layer_norm(x) + if return_all_hiddens: + encoder_states[-1] = x + if encoder_states is not None: + return x, encoder_padding_mask, encoder_embedding, encoder_states + else: + return x, encoder_padding_mask, encoder_embedding + + def reorder_encoder_out(self, encoder_out, new_order): + """ + Reorder encoder output according to *new_order*. 
+ + Args: + encoder_out: output from the ``forward()`` method + new_order (LongTensor): desired order + + Returns: + *encoder_out* rearranged according to *new_order* + """ + # 0: encoder_out + # 1: encoder_padding_mask + # 2: encoder_states + if encoder_out[0] is not None: + encoder_out[0] = \ + encoder_out[0].index_select(1, new_order) + if encoder_out[1] is not None: + encoder_out[1] = \ + encoder_out[1].index_select(0, new_order) + if len(encoder_out) == 3 and encoder_out[2] is not None: + for idx, state in enumerate(encoder_out[2]): + encoder_out[2][idx] = state.index_select(1, new_order) + return encoder_out + + def max_positions(self): + """Maximum input length supported by the encoder.""" + if self.embed_positions is None: + return self.max_source_positions + return min(self.max_source_positions, self.embed_positions.max_positions()) + + def buffered_future_mask(self, tensor): + dim = tensor.size(0) + if not hasattr(self, '_future_mask') or self._future_mask is None or self._future_mask.device != tensor.device: + self._future_mask = torch.triu(utils.fill_with_neg_inf(tensor.new(dim, dim)), 1) + if self._future_mask.size(0) < dim: + self._future_mask = torch.triu(utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1) + return self._future_mask[:dim, :dim] + + def upgrade_state_dict_named(self, state_dict, name): + """Upgrade a (possibly old) state dict for new versions of fairseq.""" + if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): + weights_key = '{}.embed_positions.weights'.format(name) + if weights_key in state_dict: + del state_dict[weights_key] + state_dict['{}.embed_positions._float_tensor'.format(name)] = torch.FloatTensor(1) + for i in range(len(self.layers)): + # update layer norms + self.layers[i].upgrade_state_dict_named(state_dict, "{}.layers.{}".format(name, i)) + + version_key = '{}.version'.format(name) + if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) < 2: + # earlier checkpoints did not normalize after the stack of layers + self.layer_norm = None + self.normalize = False + state_dict[version_key] = torch.Tensor([1]) + return state_dict + + +class TracingTransformerDecoder(FairseqIncrementalDecoder): + """ + Transformer decoder consisting of *args.decoder_layers* layers. Each layer + is a :class:`TransformerDecoderLayer`. + + Args: + args (argparse.Namespace): parsed command-line arguments + dictionary (~fairseq.data.Dictionary): decoding dictionary + embed_tokens (torch.nn.Embedding): output embedding + no_encoder_attn (bool, optional): whether to attend to encoder outputs + (default: False). 
+ """ + + def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): + super().__init__(dictionary) + self.register_buffer('version', torch.Tensor([3])) + + self.dropout = args.dropout + self.share_input_output_embed = args.share_decoder_input_output_embed + + input_embed_dim = embed_tokens.embedding_dim + embed_dim = args.decoder_embed_dim + self.output_embed_dim = args.decoder_output_dim + + self.padding_idx = embed_tokens.padding_idx + self.max_target_positions = args.max_target_positions + + self.embed_tokens = embed_tokens + self.embed_scale = math.sqrt(embed_dim) # todo: try with input_embed_dim + + self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None + + self.embed_positions = PositionalEmbedding( + args.max_target_positions, embed_dim, self.padding_idx, + learned=args.decoder_learned_pos, + ) if not args.no_token_positional_embeddings else None + + self.cross_self_attention = getattr(args, 'cross_self_attention', False) + self.layer_wise_attention = getattr(args, 'layer_wise_attention', False) + + self.layers = nn.ModuleList([]) + self.layers.extend([ + TransformerDecoderLayer(args, no_encoder_attn) + for _ in range(args.decoder_layers) + ]) + + self.adaptive_softmax = None + + self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \ + if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None + + if args.adaptive_softmax_cutoff is not None: + self.adaptive_softmax = AdaptiveSoftmax( + len(dictionary), + self.output_embed_dim, + options.eval_str_list(args.adaptive_softmax_cutoff, type=int), + dropout=args.adaptive_softmax_dropout, + adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, + factor=args.adaptive_softmax_factor, + tie_proj=args.tie_adaptive_proj, + ) + elif not self.share_input_output_embed: + self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), self.output_embed_dim)) + nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5) + + if args.decoder_normalize_before and not getattr(args, 'no_decoder_final_norm', False): + self.layer_norm = LayerNorm(embed_dim) + else: + self.layer_norm = None + + def forward( + self, + prev_output_tokens, + encoder_out=None, + incremental_state=None, + features_only=False, + **extra_args, + ): + """ + Args: + prev_output_tokens (LongTensor): previous decoder outputs of shape + `(batch, tgt_len)`, for teacher forcing + encoder_out (Tensor, optional): output from the encoder, used for + encoder-side attention + incremental_state (dict): dictionary used for storing state during + :ref:`Incremental decoding` + features_only (bool, optional): only return features without + applying output layer (default: False). + + Returns: + tuple: + - the decoder's output of shape `(batch, tgt_len, vocab)` + - a dictionary with any model-specific outputs + """ + x, extra = self.extract_features( + prev_output_tokens, encoder_out, incremental_state, **extra_args, + ) + if not features_only: + x = self.output_layer(x) + return x, extra + + def extract_features( + self, + prev_output_tokens, + encoder_out=None, + incremental_state=None, + full_context_alignment=False, + alignment_layer=None, + alignment_heads=None, + **unused, + ): + """ + Similar to *forward* but only return features. + + Includes several features from "Jointly Learning to Align and + Translate with Transformer Models" (Garg et al., EMNLP 2019). 
+ + Args: + full_context_alignment (bool, optional): don't apply + auto-regressive mask to self-attention (default: False). + alignment_layer (int, optional): return mean alignment over + heads at this layer (default: last layer). + alignment_heads (int, optional): only average alignment over + this many heads (default: all heads). + + Returns: + tuple: + - the decoder's features of shape `(batch, tgt_len, embed_dim)` + - a dictionary with any model-specific outputs + """ + if alignment_layer is None: + alignment_layer = len(self.layers) - 1 + + # embed positions + positions = self.embed_positions( + prev_output_tokens, + incremental_state=incremental_state, + ) if self.embed_positions is not None else None + + if incremental_state is not None: + prev_output_tokens = prev_output_tokens[:, -1:] + if positions is not None: + positions = positions[:, -1:] + + # embed tokens and positions + x = self.embed_scale * self.embed_tokens(prev_output_tokens) + + if self.project_in_dim is not None: + x = self.project_in_dim(x) + + if positions is not None: + x += positions + x = F.dropout(x, p=self.dropout, training=self.training) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) + if not self_attn_padding_mask.any() and not self.cross_self_attention: + self_attn_padding_mask = None + + # decoder layers + attn = None + inner_states = [x] + for idx, layer in enumerate(self.layers): + encoder_state = None + if encoder_out is not None: + if self.layer_wise_attention: + encoder_state = encoder_out[3][idx] + else: + encoder_state = encoder_out[0] + + if incremental_state is None and not full_context_alignment: + self_attn_mask = self.buffered_future_mask(x) + else: + self_attn_mask = None + + x, layer_attn = layer( + x, + encoder_state + if encoder_state is not None else None, + encoder_out[1] + if encoder_out is not None else None, + incremental_state, + self_attn_mask=self_attn_mask, + self_attn_padding_mask=self_attn_padding_mask, + need_attn=(idx == alignment_layer), + need_head_weights=(idx == alignment_layer), + ) + + inner_states.append(x) + if layer_attn is not None and idx == alignment_layer: + attn = layer_attn.float() + + if attn is not None: + if alignment_heads is not None: + attn = attn[:alignment_heads] + + # average probabilities over heads + attn = attn.mean(dim=0) + + if self.layer_norm: + x = self.layer_norm(x) + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + if self.project_out_dim is not None: + x = self.project_out_dim(x) + + return x, {'attn': attn, 'inner_states': inner_states} + + def output_layer(self, features, **kwargs): + """Project features to the vocabulary size.""" + if self.adaptive_softmax is None: + # project back to size of vocabulary + if self.share_input_output_embed: + return F.linear(features, self.embed_tokens.weight) + else: + return F.linear(features, self.embed_out) + else: + return features + + def max_positions(self): + """Maximum output length supported by the decoder.""" + if self.embed_positions is None: + return self.max_target_positions + return min(self.max_target_positions, self.embed_positions.max_positions()) + + def buffered_future_mask(self, tensor): + dim = tensor.size(0) + if ( + not hasattr(self, '_future_mask') + or self._future_mask is None + or self._future_mask.device != tensor.device + or self._future_mask.size(0) < dim + ): + self._future_mask = torch.triu(utils.fill_with_neg_inf(tensor.new(dim, dim)), 1) + return self._future_mask[:dim, :dim] + + def 
upgrade_state_dict_named(self, state_dict, name): + """Upgrade a (possibly old) state dict for new versions of fairseq.""" + if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): + weights_key = '{}.embed_positions.weights'.format(name) + if weights_key in state_dict: + del state_dict[weights_key] + state_dict['{}.embed_positions._float_tensor'.format(name)] = torch.FloatTensor(1) + + for i in range(len(self.layers)): + # update layer norms + layer_norm_map = { + '0': 'self_attn_layer_norm', + '1': 'encoder_attn_layer_norm', + '2': 'final_layer_norm' + } + for old, new in layer_norm_map.items(): + for m in ('weight', 'bias'): + k = '{}.layers.{}.layer_norms.{}.{}'.format(name, i, old, m) + if k in state_dict: + state_dict['{}.layers.{}.{}.{}'.format(name, i, new, m)] = state_dict[k] + del state_dict[k] + + version_key = '{}.version'.format(name) + if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) <= 2: + # earlier checkpoints did not normalize after the stack of layers + self.layer_norm = None + self.normalize = False + state_dict[version_key] = torch.Tensor([1]) + + return state_dict From fdf4c3e9002ec1ee01a281779a095512ada90e40 Mon Sep 17 00:00:00 2001 From: Halil Akin Date: Fri, 25 Oct 2019 09:02:19 -0700 Subject: [PATCH 194/213] Simplify fairseq multihead attention (#888) Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/888 We want to simplify multihead attention and get rid of the dynamic in_proj_weight logic. Sending the diff early for feedback, will have further changes as I try to fix breaking tests Reviewed By: edunov Differential Revision: D17912661 fbshipit-source-id: 0e6319fc694d8ec5187d1c2fefe5839d9d522186 --- fairseq/modules/multihead_attention.py | 124 ++++++++++++------------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/fairseq/modules/multihead_attention.py b/fairseq/modules/multihead_attention.py index 0ff05d16db..cb3ae95d51 100644 --- a/fairseq/modules/multihead_attention.py +++ b/fairseq/modules/multihead_attention.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+import math import torch from torch import nn from torch.nn import Parameter @@ -38,12 +39,9 @@ def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias= assert not self.self_attention or self.qkv_same_dim, 'Self-attention requires query, key and ' \ 'value to be of the same size' - if self.qkv_same_dim: - self.in_proj_weight = Parameter(torch.Tensor(3 * embed_dim, embed_dim)) - else: - self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim)) - self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim)) - self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim)) + self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim)) + self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim)) + self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim)) if bias: self.in_proj_bias = Parameter(torch.Tensor(3 * embed_dim)) @@ -70,12 +68,19 @@ def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias= else: self.enable_torch_version = False + @property + def in_proj_weight(self): + # TODO: Remove this backward compatibility code (in_proj_weight) + return torch.cat((self.q_proj_weight, self.k_proj_weight, self.v_proj_weight)) + def prepare_for_onnx_export_(self): self.onnx_trace = True def reset_parameters(self): if self.qkv_same_dim: - nn.init.xavier_uniform_(self.in_proj_weight) + nn.init.xavier_uniform_(self.k_proj_weight, gain=1/math.sqrt(2)) + nn.init.xavier_uniform_(self.v_proj_weight, gain=1/math.sqrt(2)) + nn.init.xavier_uniform_(self.q_proj_weight, gain=1/math.sqrt(2)) else: nn.init.xavier_uniform_(self.k_proj_weight) nn.init.xavier_uniform_(self.v_proj_weight) @@ -126,27 +131,17 @@ def forward( assert list(query.size()) == [tgt_len, bsz, embed_dim] if self.enable_torch_version and not self.onnx_trace and incremental_state is None and not static_kv: - if self.qkv_same_dim: - return F.multi_head_attention_forward(query, key, value, - self.embed_dim, self.num_heads, - self.in_proj_weight, - self.in_proj_bias, self.bias_k, self.bias_v, - self.add_zero_attn, self.dropout, - self.out_proj.weight, self.out_proj.bias, - self.training, key_padding_mask, need_weights, - attn_mask) - else: - return F.multi_head_attention_forward(query, key, value, - self.embed_dim, self.num_heads, - torch.empty([0]), - self.in_proj_bias, self.bias_k, self.bias_v, - self.add_zero_attn, self.dropout, - self.out_proj.weight, self.out_proj.bias, - self.training, key_padding_mask, need_weights, - attn_mask, use_separate_proj_weight=True, - q_proj_weight=self.q_proj_weight, - k_proj_weight=self.k_proj_weight, - v_proj_weight=self.v_proj_weight) + return F.multi_head_attention_forward(query, key, value, + self.embed_dim, self.num_heads, + torch.empty([0]), + self.in_proj_bias, self.bias_k, self.bias_v, + self.add_zero_attn, self.dropout, + self.out_proj.weight, self.out_proj.bias, + self.training, key_padding_mask, need_weights, + attn_mask, use_separate_proj_weight=True, + q_proj_weight=self.q_proj_weight, + k_proj_weight=self.k_proj_weight, + v_proj_weight=self.v_proj_weight) if incremental_state is not None: saved_state = self._get_input_buffer(incremental_state) @@ -160,8 +155,9 @@ def forward( saved_state = None if self.self_attention: - # self-attention - q, k, v = self.in_proj_qkv(query) + q = self.in_proj_q(query) + k = self.in_proj_k(query) + v = self.in_proj_v(query) elif self.encoder_decoder_attention: # encoder-decoder attention q = self.in_proj_q(query) @@ -288,45 +284,25 @@ def forward( return attn, attn_weights - def 
in_proj_qkv(self, query): - return self._in_proj(query).chunk(3, dim=-1) - def in_proj_q(self, query): - if self.qkv_same_dim: - return self._in_proj(query, end=self.embed_dim) - else: - bias = self.in_proj_bias - if bias is not None: - bias = bias[:self.embed_dim] - return F.linear(query, self.q_proj_weight, bias) + bias = self.in_proj_bias + if bias is not None: + bias = bias[:self.embed_dim] + return F.linear(query, self.q_proj_weight, bias) def in_proj_k(self, key): - if self.qkv_same_dim: - return self._in_proj(key, start=self.embed_dim, end=2 * self.embed_dim) - else: - weight = self.k_proj_weight - bias = self.in_proj_bias - if bias is not None: - bias = bias[self.embed_dim:2 * self.embed_dim] - return F.linear(key, weight, bias) + weight = self.k_proj_weight + bias = self.in_proj_bias + if bias is not None: + bias = bias[self.embed_dim:2 * self.embed_dim] + return F.linear(key, weight, bias) def in_proj_v(self, value): - if self.qkv_same_dim: - return self._in_proj(value, start=2 * self.embed_dim) - else: - weight = self.v_proj_weight - bias = self.in_proj_bias - if bias is not None: - bias = bias[2 * self.embed_dim:] - return F.linear(value, weight, bias) - - def _in_proj(self, input, start=0, end=None): - weight = self.in_proj_weight + weight = self.v_proj_weight bias = self.in_proj_bias - weight = weight[start:end, :] if bias is not None: - bias = bias[start:end] - return F.linear(input, weight, bias) + bias = bias[2 * self.embed_dim:] + return F.linear(value, weight, bias) def reorder_incremental_state(self, incremental_state, new_order): """Reorder buffered internal state (for incremental generation).""" @@ -354,3 +330,27 @@ def _set_input_buffer(self, incremental_state, buffer): def apply_sparse_mask(self, attn_weights, tgt_len, src_len, bsz): return attn_weights + + def upgrade_state_dict_named(self, state_dict, name): + # TODO: Remove this backward compatibility code (in_proj_weight) + # here, we convert in_proj_weight to individual q,k,v weights + prefix = name + '.' if name != '' else '' + items_to_add = {} + keys_to_remove = [] + for k in state_dict.keys(): + if k.endswith(prefix + 'in_proj_weight'): + # in_proj_weight used to be q + k + v with same dimensions + dim = int(state_dict[k].shape[0] / 3) + items_to_add[prefix + 'q_proj_weight'] = state_dict[k][:dim] + items_to_add[prefix + 'k_proj_weight'] = state_dict[k][dim:2*dim] + items_to_add[prefix + 'v_proj_weight'] = state_dict[k][2*dim:] + + keys_to_remove.append(k) + + for k in keys_to_remove: + del state_dict[k] + + for key, value in items_to_add.items(): + state_dict[key] = value + + return state_dict From c07362c675975ad7eb70afc941c6fee705c21642 Mon Sep 17 00:00:00 2001 From: Halil Akin Date: Fri, 25 Oct 2019 09:02:19 -0700 Subject: [PATCH 195/213] Convert matmuls to quantizable nn.Linear modules (#1304) Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1304 Pull Request resolved: https://github.com/pytorch/translate/pull/657 Pull Request resolved: https://github.com/facebookresearch/pytext/pull/1065 Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/889 We are converting matmuls to quantizable nn.Linear modules in this diff. First let's test profile after the diff to see how low level operations are changing. 
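As background (a sketch rather than the fairseq code), the motivation is that PyTorch's dynamic quantization only swaps out `nn.Linear` modules, so projections expressed as raw `Parameter`s plus `F.linear`/`matmul` calls are invisible to it. The module below and its names are illustrative only:

```
import torch
import torch.nn as nn

class Projections(nn.Module):
    """Q/K/V projections written as nn.Linear modules (illustrative)."""
    def __init__(self, embed_dim):
        super().__init__()
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        return self.q_proj(x), self.k_proj(x), self.v_proj(x)

model = Projections(embed_dim=16)
# Dynamic quantization replaces every nn.Linear with a quantized kernel;
# bare torch.matmul / F.linear calls would be left untouched.
qmodel = torch.quantization.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)
print(qmodel)
```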
Reviewed By: jmp84, edunov, lly-zero-one, jhcross Differential Revision: D17964796 fbshipit-source-id: 3ddd3ff81fa1ea5864dded98e993f4fe3b71fe5e --- fairseq/modules/multihead_attention.py | 95 ++++++++++++-------------- 1 file changed, 42 insertions(+), 53 deletions(-) diff --git a/fairseq/modules/multihead_attention.py b/fairseq/modules/multihead_attention.py index cb3ae95d51..5b92662582 100644 --- a/fairseq/modules/multihead_attention.py +++ b/fairseq/modules/multihead_attention.py @@ -39,14 +39,9 @@ def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias= assert not self.self_attention or self.qkv_same_dim, 'Self-attention requires query, key and ' \ 'value to be of the same size' - self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim)) - self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim)) - self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim)) - - if bias: - self.in_proj_bias = Parameter(torch.Tensor(3 * embed_dim)) - else: - self.register_parameter('in_proj_bias', None) + self.k_proj = nn.Linear(self.kdim, embed_dim, bias=bias) + self.v_proj = nn.Linear(self.vdim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) @@ -71,25 +66,30 @@ def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias= @property def in_proj_weight(self): # TODO: Remove this backward compatibility code (in_proj_weight) - return torch.cat((self.q_proj_weight, self.k_proj_weight, self.v_proj_weight)) + return torch.cat((self.q_proj.weight, self.k_proj.weight, self.v_proj.weight)) + + @property + def in_proj_bias(self): + # TODO: Remove this backward compatibility code (in_proj_bias) + return torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)) def prepare_for_onnx_export_(self): self.onnx_trace = True def reset_parameters(self): if self.qkv_same_dim: - nn.init.xavier_uniform_(self.k_proj_weight, gain=1/math.sqrt(2)) - nn.init.xavier_uniform_(self.v_proj_weight, gain=1/math.sqrt(2)) - nn.init.xavier_uniform_(self.q_proj_weight, gain=1/math.sqrt(2)) + # Empirically observed the convergence to be much better with + # the scaled initialization + nn.init.xavier_uniform_(self.k_proj.weight, gain=1/math.sqrt(2)) + nn.init.xavier_uniform_(self.v_proj.weight, gain=1/math.sqrt(2)) + nn.init.xavier_uniform_(self.q_proj.weight, gain=1/math.sqrt(2)) else: - nn.init.xavier_uniform_(self.k_proj_weight) - nn.init.xavier_uniform_(self.v_proj_weight) - nn.init.xavier_uniform_(self.q_proj_weight) + nn.init.xavier_uniform_(self.k_proj.weight) + nn.init.xavier_uniform_(self.v_proj.weight) + nn.init.xavier_uniform_(self.q_proj.weight) nn.init.xavier_uniform_(self.out_proj.weight) - if self.in_proj_bias is not None: - nn.init.constant_(self.in_proj_bias, 0.) - nn.init.constant_(self.out_proj.bias, 0.) + nn.init.constant_(self.out_proj.bias, 0.) 
if self.bias_k is not None: nn.init.xavier_normal_(self.bias_k) if self.bias_v is not None: @@ -139,9 +139,9 @@ def forward( self.out_proj.weight, self.out_proj.bias, self.training, key_padding_mask, need_weights, attn_mask, use_separate_proj_weight=True, - q_proj_weight=self.q_proj_weight, - k_proj_weight=self.k_proj_weight, - v_proj_weight=self.v_proj_weight) + q_proj_weight=self.q_proj.weight, + k_proj_weight=self.k_proj.weight, + v_proj_weight=self.v_proj.weight) if incremental_state is not None: saved_state = self._get_input_buffer(incremental_state) @@ -155,23 +155,23 @@ def forward( saved_state = None if self.self_attention: - q = self.in_proj_q(query) - k = self.in_proj_k(query) - v = self.in_proj_v(query) + q = self.q_proj(query) + k = self.k_proj(query) + v = self.v_proj(query) elif self.encoder_decoder_attention: # encoder-decoder attention - q = self.in_proj_q(query) + q = self.q_proj(query) if key is None: assert value is None k = v = None else: - k = self.in_proj_k(key) - v = self.in_proj_v(key) + k = self.k_proj(key) + v = self.v_proj(key) else: - q = self.in_proj_q(query) - k = self.in_proj_k(key) - v = self.in_proj_v(value) + q = self.q_proj(query) + k = self.k_proj(key) + v = self.v_proj(value) q *= self.scaling if self.bias_k is not None: @@ -284,26 +284,6 @@ def forward( return attn, attn_weights - def in_proj_q(self, query): - bias = self.in_proj_bias - if bias is not None: - bias = bias[:self.embed_dim] - return F.linear(query, self.q_proj_weight, bias) - - def in_proj_k(self, key): - weight = self.k_proj_weight - bias = self.in_proj_bias - if bias is not None: - bias = bias[self.embed_dim:2 * self.embed_dim] - return F.linear(key, weight, bias) - - def in_proj_v(self, value): - weight = self.v_proj_weight - bias = self.in_proj_bias - if bias is not None: - bias = bias[2 * self.embed_dim:] - return F.linear(value, weight, bias) - def reorder_incremental_state(self, incremental_state, new_order): """Reorder buffered internal state (for incremental generation).""" input_buffer = self._get_input_buffer(incremental_state) @@ -341,12 +321,21 @@ def upgrade_state_dict_named(self, state_dict, name): if k.endswith(prefix + 'in_proj_weight'): # in_proj_weight used to be q + k + v with same dimensions dim = int(state_dict[k].shape[0] / 3) - items_to_add[prefix + 'q_proj_weight'] = state_dict[k][:dim] - items_to_add[prefix + 'k_proj_weight'] = state_dict[k][dim:2*dim] - items_to_add[prefix + 'v_proj_weight'] = state_dict[k][2*dim:] + items_to_add[prefix + 'q_proj.weight'] = state_dict[k][:dim] + items_to_add[prefix + 'k_proj.weight'] = state_dict[k][dim:2*dim] + items_to_add[prefix + 'v_proj.weight'] = state_dict[k][2*dim:] keys_to_remove.append(k) + k_bias = prefix + 'in_proj_bias' + if k_bias in state_dict.keys(): + dim = int(state_dict[k].shape[0] / 3) + items_to_add[prefix + 'q_proj.bias'] = state_dict[k_bias][:dim] + items_to_add[prefix + 'k_proj.bias'] = state_dict[k_bias][dim:2*dim] + items_to_add[prefix + 'v_proj.bias'] = state_dict[k_bias][2*dim:] + + keys_to_remove.append(prefix + 'in_proj_bias') + for k in keys_to_remove: del state_dict[k] From eb68afca0208a040d4e91eceae86f5f22ca24b04 Mon Sep 17 00:00:00 2001 From: Xian Li Date: Fri, 25 Oct 2019 17:19:52 -0700 Subject: [PATCH 196/213] fix a type mismatch in NAT quantization run Summary: Fix a type mismatch which was found after patching NAT on top of quantization. Ning suggested this fix. Need to further understand: why this only appears after patching quantization diff? 
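A minimal repro of the kind of mismatch being fixed (illustrative; the exact error text, and whether eager mode tolerates it, depend on the PyTorch version): writing an integer scalar tensor into a floating-point tensor through a boolean mask trips the dtype check on `index_put_`, and `type_as` casts the source to the destination's dtype first.

```
import torch

x = torch.zeros(2, 3)                  # float tensor, e.g. output scores
mask = torch.tensor([True, False])
padding_idx = 1

# x[mask] = torch.tensor(padding_idx)           # Long source into Float dest
x[mask] = torch.tensor(padding_idx).type_as(x)  # cast to x's dtype first
print(x)
```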
Reviewed By: kahne, jhcross Differential Revision: D18147726 fbshipit-source-id: a51becc9ad58a637a0180074eaa2b46990ab9f84 --- fairseq/models/model_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fairseq/models/model_utils.py b/fairseq/models/model_utils.py index 9831efbd15..432f81ea3d 100644 --- a/fairseq/models/model_utils.py +++ b/fairseq/models/model_utils.py @@ -108,7 +108,7 @@ def fill_tensors(x, mask, y, padding_idx: int): x = expand_2d_or_3d_tensor(x, y.size(1), padding_idx) x[mask] = y elif x.size(1) > y.size(1): - x[mask] = torch.tensor(padding_idx) + x[mask] = torch.tensor(padding_idx).type_as(x) if x.dim() == 2: x[mask, :y.size(1)] = y else: From dabbef467692ef4ffb7de8a01235876bd7320a93 Mon Sep 17 00:00:00 2001 From: Angela Fan Date: Sun, 27 Oct 2019 12:09:29 -0700 Subject: [PATCH 197/213] adding layerdrop code for training, pruning, and readme (#890) Summary: TEST 1: EVALUATION TIME WORKS checked achieves correct model perplexity: 18.68 TEST 2: TRAINING NEW MODEL WORKS checked without layerdrop: --decoder-layerdrop 0 OR no flag at all | epoch 001: 10 / 11201 loss=27.469, nll_loss=27.469, ppl=185799477.36, wps=1764, ups=0, wpb=9216.000, bsz=3.000, num_updates=7, lr=0.0004376, gnorm=25.471, clip=1.000, oom=0.000, loss_scale=8.000, wall=37, train_wall=30 | epoch 001: 20 / 11201 loss=27.443, nll_loss=27.443, ppl=182500427.22, wps=2449, ups=0, wpb=9216.000, bsz=3.000, num_updates=17, lr=0.0010626, gnorm=25.273, clip=1.000, oom=0.000, loss_scale=8.000, wall=64, train_wall=57 | epoch 001: 30 / 11201 loss=27.404, nll_loss=27.404, ppl=177612215.78, wps=2720, ups=0, wpb=9216.000, bsz=3.000, num_updates=27, lr=0.0016876, gnorm=25.136, clip=1.000, oom=0.000, loss_scale=8.000, wall=91, train_wall=84 | epoch 001: 40 / 11201 loss=27.009, nll_loss=27.009, ppl=135079983.00, wps=2865, ups=0, wpb=9216.000, bsz=3.000, num_updates=37, lr=0.0023126, gnorm=24.311, clip=1.000, oom=0.000, loss_scale=8.000, wall=119, train_wall=112 | epoch 001: 50 / 11201 loss=26.418, nll_loss=26.418, ppl=89680259.41, wps=2952, ups=0, wpb=9216.000, bsz=3.000, num_updates=47, lr=0.0029376, gnorm=22.775, clip=1.000, oom=0.000, loss_scale=8.000, wall=147, train_wall=140 with layerdrop (regularization effect should be seen in PPL): --decoder-layerdrop 0.2 | epoch 001: 10 / 11201 loss=25.186, nll_loss=25.186, ppl=38182937.27, wps=2428, ups=0, wpb=9216.000, bsz=3.000, num_updates=8, lr=0.0005001, gnorm=17.082, clip=1.000, oom=0.000, loss_scale=16.000, wall=30, train_wall=24 | epoch 001: 20 / 11201 loss=25.270, nll_loss=25.270, ppl=40451933.50, wps=3173, ups=0, wpb=9216.000, bsz=3.000, num_updates=18, lr=0.0011251, gnorm=17.162, clip=1.000, oom=0.000, loss_scale=16.000, wall=52, train_wall=45 | epoch 001: 30 / 11201 loss=25.349, nll_loss=25.349, ppl=42752256.68, wps=3454, ups=0, wpb=9216.000, bsz=3.000, num_updates=28, lr=0.0017501, gnorm=17.370, clip=1.000, oom=0.000, loss_scale=16.000, wall=75, train_wall=68 | epoch 001: 40 / 11201 loss=25.115, nll_loss=25.115, ppl=36343806.30, wps=3619, ups=0, wpb=9216.000, bsz=3.000, num_updates=38, lr=0.0023751, gnorm=16.945, clip=1.000, oom=0.000, loss_scale=16.000, wall=97, train_wall=90 | epoch 001: 50 / 11201 loss=24.804, nll_loss=24.804, ppl=29284345.78, wps=3716, ups=0, wpb=9216.000, bsz=3.000, num_updates=48, lr=0.0030001, gnorm=16.406, clip=1.000, oom=0.000, loss_scale=16.000, wall=119, train_wall=112 TEST 3: PICKING UP TRAINING FROM EXISTING MODEL checked | loaded checkpoint /checkpoint/angelafan/structured_0.1_block_8_sd02/checkpoint_last.pt (epoch 272 
@ 381066 updates) | loading train data for epoch 272 | loaded 1801350 examples from: /private/home/angelafan/lm_work/fairseq-py/data-bin/wikitext-103/train TEST 4: EVALUATING EXISTING BERT MODEL REPROS RESULTS | [input] dictionary: 50265 types | [label] dictionary: 9 types | Accuracy: 0.9231651376146789 achieves correct accuracy on SST2 for this model TEST 5: TRAINING NEW BERT MODEL WORKS checked and works TEST 6: NMT without layerdrop --encoder-layerdrop 0 --decoder-layerdrop 0 OR combinations of flag specified and not specified | epoch 001: 10 / 92203 loss=15.820, nll_loss=15.830, ppl=58267.93, wps=4902, ups=0, wpb=1477.818, bsz=51.636, num_updates=11, lr=1.47473e-06, gnorm=7.207, clip=0.000, oom=0.000, loss_scale=128.000, wall=60, train_wall=3 | epoch 001: 20 / 92203 loss=15.523, nll_loss=15.501, ppl=46359.29, wps=5037, ups=0, wpb=1496.476, bsz=45.333, num_updates=21, lr=2.72448e-06, gnorm=6.869, clip=0.000, oom=0.000, loss_scale=128.000, wall=63, train_wall=6 | epoch 001: 30 / 92203 loss=15.185, nll_loss=15.123, ppl=35695.79, wps=5085, ups=0, wpb=1519.355, bsz=44.645, num_updates=31, lr=3.97423e-06, gnorm=6.186, clip=0.000, oom=0.000, loss_scale=128.000, wall=66, train_wall=9 | epoch 001: 40 / 92203 loss=14.940, nll_loss=14.849, ppl=29505.60, wps=5116, ups=1, wpb=1521.244, bsz=42.927, num_updates=41, lr=5.22398e-06, gnorm=5.610, clip=0.000, oom=0.000, loss_scale=128.000, wall=69, train_wall=12 | epoch 001: 50 / 92203 loss=14.745, nll_loss=14.630, ppl=25346.87, wps=5070, ups=1, wpb=1507.961, bsz=41.725, num_updates=51, lr=6.47373e-06, gnorm=5.104, clip=0.000, oom=0.000, loss_scale=128.000, wall=71, train_wall=15 with layerdrop (regularization effect should be seen in PPL) A) works with --encoder-layerdrop 0.2 --decoder-layerdrop 0.2 B) works with different settings --encoder-layerdrop 0.3 --decoder-layerdrop 0.5 C) works with one on and one off --encoder-layerdrop 0.2 --decoder-layerdrop 0 | epoch 001: 10 / 92203 loss=15.817, nll_loss=15.828, ppl=58158.54, wps=5355, ups=0, wpb=1477.818, bsz=51.636, num_updates=11, lr=1.47473e-06, gnorm=6.959, clip=0.000, oom=0.000, loss_scale=128.000, wall=59, train_wall=3 | epoch 001: 20 / 92203 loss=15.650, nll_loss=15.641, ppl=51111.63, wps=5515, ups=0, wpb=1496.476, bsz=45.333, num_updates=21, lr=2.72448e-06, gnorm=6.825, clip=0.000, oom=0.000, loss_scale=128.000, wall=61, train_wall=6 | epoch 001: 30 / 92203 loss=15.440, nll_loss=15.408, ppl=43491.58, wps=5602, ups=0, wpb=1519.355, bsz=44.645, num_updates=31, lr=3.97423e-06, gnorm=6.576, clip=0.000, oom=0.000, loss_scale=128.000, wall=64, train_wall=8 | epoch 001: 40 / 92203 loss=15.247, nll_loss=15.193, ppl=37457.14, wps=5676, ups=1, wpb=1521.244, bsz=42.927, num_updates=41, lr=5.22398e-06, gnorm=6.124, clip=0.000, oom=0.000, loss_scale=128.000, wall=67, train_wall=11 | epoch 001: 50 / 92203 loss=15.055, nll_loss=14.977, ppl=32259.92, wps=5598, ups=1, wpb=1507.961, bsz=41.725, num_updates=51, lr=6.47373e-06, gnorm=5.661, clip=0.000, oom=0.000, loss_scale=128.000, wall=69, train_wall=14 TEST 7: PRUNING TESTCASES A) after adding the pruning flags, model can evaluate as a full model checked, reaches correct PPL num. model params: 246933504 | Evaluated 217646 tokens in 196.3s (1108.99 tokens/s) | Loss: 2.9275, Perplexity: 18.68 B) after adding pruning flags, model can be pruned. this works with multiple flag settings checked three cases: num. model params: 146163712 | Evaluated 217646 tokens in 106.0s (2054.07 tokens/s) | Loss: 3.0932, Perplexity: 22.05 num. 
model params: 209144832 | Evaluated 217646 tokens in 162.8s (1336.99 tokens/s) | Loss: 2.9526, Perplexity: 19.16 C) model can pick up training if you want to finetune the pruned model checked: | loading train data for epoch 272 | loaded 1801350 examples from: /private/home/angelafan/lm_work/fairseq-py/data-bin/wikitext-103/train | WARNING: overflow detected, setting loss scale to: 64.0 | WARNING: overflow detected, setting loss scale to: 32.0 | epoch 272: 1500 / 5601 loss=5.015, nll_loss=5.015, ppl=32.33, wps=11598, ups=1, wpb=18432.000, bsz=6.000, num_updates=98, lr=0.0061251, gnorm=0.613, clip=1.000, oom=0.000, loss_scale=32.000, wall=156, train_wall=252396 D) works with BERT checked: without specifying any flags, reproduces the correct standard accuracy with flags, produces the correct pruned accuracy | [input] dictionary: 50265 types | [label] dictionary: 9 types | Accuracy: 0.9231651376146789 | [input] dictionary: 50265 types | [label] dictionary: 9 types | Pruning model to specified layer configuration - this works best if the model was trained with LayerDrop | Accuracy: 0.9220183486238532 Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/890 Reviewed By: edunov Differential Revision: D18094657 Pulled By: huihuifan fbshipit-source-id: 2bbaa2ff0039e906782694fc2038b8c17a8693e7 --- examples/layerdrop/README.md | 66 +++++++++++++++++++ fairseq/checkpoint_utils.py | 66 ++++++++++++++++++- fairseq/models/fairseq_model.py | 6 +- fairseq/models/roberta/model.py | 15 +++++ fairseq/models/transformer.py | 57 +++++++++++----- fairseq/models/transformer_lm.py | 8 +++ .../modules/transformer_sentence_encoder.py | 13 +++- fairseq/trainer.py | 2 +- 8 files changed, 209 insertions(+), 24 deletions(-) create mode 100644 examples/layerdrop/README.md diff --git a/examples/layerdrop/README.md b/examples/layerdrop/README.md new file mode 100644 index 0000000000..82ec4b6d53 --- /dev/null +++ b/examples/layerdrop/README.md @@ -0,0 +1,66 @@ +# Reducing Transformer Depth on Demand with Structured Dropout (Fan et al., 2019) +This page contains information for how to train models with LayerDrop. + +Looking for pretrained models? They will be added shortly. + +Looking for code for other forms of Structured Dropout? It will be added shortly. + +## Citation: +```bibtex +@article{fan2019reducing, + title={Reducing Transformer Depth on Demand with Structured Dropout}, + author={Fan, Angela and Grave, Edouard and Joulin, Armand}, + journal={arXiv preprint arXiv:1909.11556}, + year={2019} +} +``` + +## Example usage + +To train a model with LayerDrop, add the following flags. We recommend 0.2, a value that worked well in our experiments. For Language Models that are decoder-only, you need only the decoder flag. For RoBERTa, an encoder, you need only the encoder flag. The encoder and decoder LayerDrop values can be set differently. +``` +--encoder-layerdrop 0.2 --decoder-layerdrop 0.2 +``` + +To prune a model that has been trained with LayerDrop, add the following flags followed by a comma separated list of which layers you would like to keep. +``` +--encoder-layers-to-keep 0,2,4,6,8,10,12,14 --decoder-layers-to-keep 0,2,4,6,8,10,12,14 +``` +Setting these flags should print a message such as: +``` +| Pruning model to specified layer configuration +``` +You should also see a smaller number of parameters in the model, for example the 16-Layer Transformer Language Model prints: +``` +num. model params: 246933504 +``` +while a model pruned to 8 Layers prints: +``` +num. 
model params: 146163712 +``` + +If you would like to pick up training with a model that has been pruned, simply adding these flags is sufficient. If you would like to use a script that only does evaluation (no training), you may need to pass an override command. A specific example would be for language modeling: +``` +python eval_lm.py /path/to/wikitext-103 --path '/path/to/model/checkpoint' --model-overrides "{'decoder_layers_to_keep':'0,2,4,6,8,10,12,14'}" +``` +This model override command overrides the training parameters and updates the model arguments so that the pruned model is run instead of the full model. + + +Looking to reproduce the results in the paper? + +1. For Translation on WMT en-de, we followed this setting [here](https://github.com/pytorch/fairseq/blob/master/examples/scaling_nmt/README.md) +2. To train RoBERTa, we followed this setting [here](https://github.com/pytorch/fairseq/tree/master/examples/roberta) +3. To train Language Models on Wikitext-103, we followed this setting [here](https://github.com/pytorch/fairseq/tree/master/examples/language_model) + + +## Tips + +1. If you would like to train large models with better performance, LayerDrop should be set to a smaller value such as 0.1 or 0.2. Too much LayerDrop will mean the model has too much regularization, so may not reach the best performance. Since LayerDrop adds regularization, you may achieve the best performance by slightly reducing the amount of standard dropout (for example, reduce by 0.1). + +2. If you would like to train large models to be pruned and made smaller, LayerDrop should be set to a larger value such as 0.5 if you want to prune very aggressively (such as removing half the network or more). If you would like to prune fewer layers away, LayerDrop can be set to a smaller value such as 0.2. + +3. When pruning layers at inference time, it is best to spread out the layers remaining so they are evenly spaced throughout the network. For example, if you want to remove 50% of the network, keeping every other layer is good. + +## Having an issue or have a question? + +Please open an issue in this repository with the details of your question. Thanks! diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py index ded8ce32f5..abf1bcc65f 100644 --- a/fairseq/checkpoint_utils.py +++ b/fairseq/checkpoint_utils.py @@ -183,7 +183,7 @@ def load_model_ensemble_and_task(filenames, arg_overrides=None, task=None): # build model for ensemble model = task.build_model(args) - model.load_state_dict(state['model'], strict=True) + model.load_state_dict(state['model'], strict=True, args=args) ensemble.append(model) return ensemble, args, task @@ -334,6 +334,70 @@ def _upgrade_state_dict(state): return state +def prune_state_dict(state_dict, args): + """Prune the given state_dict if desired for LayerDrop + (https://arxiv.org/abs/1909.11556). + + Training with LayerDrop allows models to be robust to pruning at inference + time. This function prunes state_dict to allow smaller models to be loaded + from a larger model and re-maps the existing state_dict for this to occur. + + It's called by functions that load models from checkpoints and does not + need to be called directly. + """ + if not args: + # args should not be none, but don't crash if it is. 
+ return state_dict + + encoder_layers_to_keep = args.encoder_layers_to_keep if "encoder_layers_to_keep" in vars(args) else None + decoder_layers_to_keep = args.decoder_layers_to_keep if "decoder_layers_to_keep" in vars(args) else None + + if not encoder_layers_to_keep and not decoder_layers_to_keep: + return state_dict + + # apply pruning + print("| Pruning model to specified layer configuration - this works best if the model was trained with LayerDrop") + + def create_pruning_pass(layers_to_keep, layer_name): + keep_layers = sorted([int(layer_string) for layer_string in layers_to_keep.split(",")]) + mapping_dict = {} + for i in range(len(keep_layers)): + mapping_dict[str(keep_layers[i])] = str(i) + + regex = re.compile("^{layer}.*\.layers\.(\d+)".format(layer=layer_name)) + return { + "substitution_regex": regex, + "mapping_dict": mapping_dict + } + + pruning_passes = [] + if encoder_layers_to_keep: + pruning_passes.append(create_pruning_pass(encoder_layers_to_keep, "encoder")) + if decoder_layers_to_keep: + pruning_passes.append(create_pruning_pass(decoder_layers_to_keep, "decoder")) + + new_state_dict = {} + for layer_name in state_dict.keys(): + match = re.search("\.layers\.(\d+)\.", layer_name) + # if layer has no number in it, it is a supporting layer, such as an + # embedding + if not match: + new_state_dict[layer_name] = state_dict[layer_name] + continue + + # otherwise, layer should be pruned. + original_layer_number = match.group(1) + # figure out which mapping dict to replace from + for pruning_pass in pruning_passes: + if original_layer_number in pruning_pass["mapping_dict"] and pruning_pass["substitution_regex"].search(layer_name): + new_layer_number = pruning_pass["mapping_dict"][original_layer_number] + substitution_match = pruning_pass["substitution_regex"].search(layer_name) + new_state_key = layer_name[:substitution_match.start(1)] + new_layer_number + layer_name[substitution_match.end(1):] + new_state_dict[new_state_key] = state_dict[layer_name] + + return new_state_dict + + def load_pretrained_component_from_model( component: Union[FairseqEncoder, FairseqDecoder], checkpoint: str ): diff --git a/fairseq/models/fairseq_model.py b/fairseq/models/fairseq_model.py index bd73bd5c23..2d9e942d3b 100644 --- a/fairseq/models/fairseq_model.py +++ b/fairseq/models/fairseq_model.py @@ -13,6 +13,7 @@ import torch.nn.functional as F from fairseq import utils +from fairseq.checkpoint_utils import prune_state_dict from fairseq.data import Dictionary from fairseq.models import FairseqDecoder, FairseqEncoder @@ -58,7 +59,7 @@ def max_positions(self): """Maximum length supported by the model.""" return None - def load_state_dict(self, state_dict, strict=True): + def load_state_dict(self, state_dict, strict=True, args=None): """Copies parameters and buffers from *state_dict* into this module and its descendants. @@ -66,7 +67,8 @@ def load_state_dict(self, state_dict, strict=True): this additionally "upgrades" *state_dicts* from old checkpoints. 
""" self.upgrade_state_dict(state_dict) - return super().load_state_dict(state_dict, strict) + new_state_dict = prune_state_dict(state_dict, args) + return super().load_state_dict(new_state_dict, strict) def upgrade_state_dict(self, state_dict): """Upgrade old state dicts to work with newer code.""" diff --git a/fairseq/models/roberta/model.py b/fairseq/models/roberta/model.py index ac94a04845..1c4c243e7e 100644 --- a/fairseq/models/roberta/model.py +++ b/fairseq/models/roberta/model.py @@ -78,6 +78,11 @@ def add_args(parser): help='number of positional embeddings to learn') parser.add_argument('--load-checkpoint-heads', action='store_true', help='(re-)register and load heads when loading checkpoints') + # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) + parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0, + help='LayerDrop probability for encoder') + parser.add_argument('--encoder-layers-to-keep', default=None, + help='which layers to *keep* when pruning as a comma-separated list') @classmethod def build_model(cls, args, task): @@ -245,6 +250,15 @@ class RobertaEncoder(FairseqDecoder): def __init__(self, args, dictionary): super().__init__(dictionary) self.args = args + + # RoBERTa is a sentence encoder model, so users will intuitively trim + # encoder layers. However, the implementation uses the fairseq decoder, + # so we fix here. + if args.encoder_layers_to_keep: + args.encoder_layers = len(args.encoder_layers_to_keep.split(",")) + args.decoder_layers_to_keep = args.encoder_layers_to_keep + args.encoder_layers_to_keep = None + self.sentence_encoder = TransformerSentenceEncoder( padding_idx=dictionary.pad(), vocab_size=len(dictionary), @@ -255,6 +269,7 @@ def __init__(self, args, dictionary): dropout=args.dropout, attention_dropout=args.attention_dropout, activation_dropout=args.activation_dropout, + layerdrop=args.encoder_layerdrop, max_seq_len=args.max_positions, num_segments=0, encoder_normalize_before=True, diff --git a/fairseq/models/transformer.py b/fairseq/models/transformer.py index f5f23f1b95..573c41373b 100644 --- a/fairseq/models/transformer.py +++ b/fairseq/models/transformer.py @@ -25,6 +25,7 @@ TransformerDecoderLayer, TransformerEncoderLayer, ) +import random DEFAULT_MAX_SOURCE_POSITIONS = 1024 DEFAULT_MAX_TARGET_POSITIONS = 1024 @@ -130,6 +131,15 @@ def add_args(parser): help='perform cross+self-attention') parser.add_argument('--layer-wise-attention', default=False, action='store_true', help='perform layer-wise attention (cross-attention or cross+self-attention)') + # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) + parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0, + help='LayerDrop probability for encoder') + parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0, + help='LayerDrop probability for decoder') + parser.add_argument('--encoder-layers-to-keep', default=None, + help='which layers to *keep* when pruning as a comma-separated list') + parser.add_argument('--decoder-layers-to-keep', default=None, + help='which layers to *keep* when pruning as a comma-separated list') # fmt: on @classmethod @@ -139,6 +149,11 @@ def build_model(cls, args, task): # make sure all arguments are present in older models base_architecture(args) + if args.encoder_layers_to_keep: + args.encoder_layers = len(args.encoder_layers_to_keep.split(",")) + if args.decoder_layers_to_keep: + args.decoder_layers = 
len(args.decoder_layers_to_keep.split(",")) + if not hasattr(args, 'max_source_positions'): args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS if not hasattr(args, 'max_target_positions'): @@ -275,6 +290,7 @@ def __init__(self, args, dictionary, embed_tokens): self.register_buffer('version', torch.Tensor([3])) self.dropout = args.dropout + self.encoder_layerdrop = args.encoder_layerdrop embed_dim = embed_tokens.embedding_dim self.padding_idx = embed_tokens.padding_idx @@ -300,6 +316,7 @@ def __init__(self, args, dictionary, embed_tokens): else: self.layer_norm = None + def forward_embedding(self, src_tokens): # embed tokens and positions embed = self.embed_scale * self.embed_tokens(src_tokens) @@ -345,9 +362,12 @@ def forward(self, src_tokens, src_lengths, cls_input=None, return_all_hiddens=Fa # encoder layers for layer in self.layers: - x = layer(x, encoder_padding_mask) - if return_all_hiddens: - encoder_states.append(x) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if not self.training or (dropout_probability > self.encoder_layerdrop): + x = layer(x, encoder_padding_mask) + if return_all_hiddens: + encoder_states.append(x) if self.layer_norm: x = self.layer_norm(x) @@ -435,6 +455,7 @@ def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): self.register_buffer('version', torch.Tensor([3])) self.dropout = args.dropout + self.decoder_layerdrop = args.decoder_layerdrop self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim @@ -594,20 +615,22 @@ def extract_features( else: self_attn_mask = None - x, layer_attn = layer( - x, - encoder_state, - encoder_out['encoder_padding_mask'] if encoder_out is not None else None, - incremental_state, - self_attn_mask=self_attn_mask, - self_attn_padding_mask=self_attn_padding_mask, - need_attn=(idx == alignment_layer), - need_head_weights=(idx == alignment_layer), - ) - - inner_states.append(x) - if layer_attn is not None and idx == alignment_layer: - attn = layer_attn.float() + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if not self.training or (dropout_probability > self.decoder_layerdrop): + x, layer_attn = layer( + x, + encoder_state, + encoder_out['encoder_padding_mask'] if encoder_out is not None else None, + incremental_state, + self_attn_mask=self_attn_mask, + self_attn_padding_mask=self_attn_padding_mask, + need_attn=(idx == alignment_layer), + need_head_weights=(idx == alignment_layer), + ) + inner_states.append(x) + if layer_attn is not None and idx == alignment_layer: + attn = layer_attn.float() if attn is not None: if alignment_heads is not None: diff --git a/fairseq/models/transformer_lm.py b/fairseq/models/transformer_lm.py index 87c7719209..f04dd36032 100644 --- a/fairseq/models/transformer_lm.py +++ b/fairseq/models/transformer_lm.py @@ -98,6 +98,11 @@ def add_args(parser): help='if set, ties the projection weights of adaptive softmax and adaptive input') parser.add_argument('--decoder-learned-pos', action='store_true', help='use learned positional embeddings in the decoder') + # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) + parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0, + help='LayerDrop probability for decoder') + parser.add_argument('--decoder-layers-to-keep', default=None, + help='which layers to *keep* when pruning as a 
comma-separated list') # fmt: on @classmethod @@ -107,6 +112,9 @@ def build_model(cls, args, task): # make sure all arguments are present in older models base_lm_architecture(args) + if args.decoder_layers_to_keep: + args.decoder_layers = len(args.decoder_layers_to_keep.split(",")) + if getattr(args, 'max_target_positions', None) is None: args.max_target_positions = getattr(args, 'tokens_per_sample', DEFAULT_MAX_TARGET_POSITIONS) diff --git a/fairseq/modules/transformer_sentence_encoder.py b/fairseq/modules/transformer_sentence_encoder.py index 9be7ab3080..f7e3973080 100644 --- a/fairseq/modules/transformer_sentence_encoder.py +++ b/fairseq/modules/transformer_sentence_encoder.py @@ -14,6 +14,7 @@ PositionalEmbedding, TransformerSentenceEncoderLayer, ) +import random def init_bert_params(module): @@ -77,6 +78,7 @@ def __init__( dropout: float = 0.1, attention_dropout: float = 0.1, activation_dropout: float = 0.1, + layerdrop : float = 0.0, max_seq_len: int = 256, num_segments: int = 2, use_position_embeddings: bool = True, @@ -97,6 +99,7 @@ def __init__( self.padding_idx = padding_idx self.vocab_size = vocab_size self.dropout = dropout + self.layerdrop = layerdrop self.max_seq_len = max_seq_len self.embedding_dim = embedding_dim self.num_segments = num_segments @@ -208,9 +211,13 @@ def forward( inner_states.append(x) for layer in self.layers: - x, _ = layer(x, self_attn_padding_mask=padding_mask) - if not last_state_only: - inner_states.append(x) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if not self.training or (dropout_probability > self.layerdrop): + x, _ = layer(x, self_attn_padding_mask=padding_mask) + if not last_state_only: + inner_states.append(x) + # T x B x C -> B x T x C x = x.transpose(0, 1) diff --git a/fairseq/trainer.py b/fairseq/trainer.py index 545357ebef..5de30e2246 100644 --- a/fairseq/trainer.py +++ b/fairseq/trainer.py @@ -181,7 +181,7 @@ def load_checkpoint( # load model parameters try: - self.get_model().load_state_dict(state['model'], strict=True) + self.get_model().load_state_dict(state['model'], strict=True, args=self.args) if utils.has_parameters(self.get_criterion()): self.get_criterion().load_state_dict(state['criterion'], strict=True) except Exception: From 50cf3bb596abba70a0770ad2308fed0f4d32a002 Mon Sep 17 00:00:00 2001 From: Ning Dong Date: Sun, 27 Oct 2019 22:30:43 -0700 Subject: [PATCH 198/213] Fix LevT generator interface Summary: Revert the interface change for iterative_refinement_generator Reviewed By: kahne Differential Revision: D18165103 fbshipit-source-id: 075c276746eb90d7c359b6ad92e1ef25e8452bcc --- fairseq/iterative_refinement_generator.py | 24 +++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/fairseq/iterative_refinement_generator.py b/fairseq/iterative_refinement_generator.py index 551c49ffc2..885e7c81b4 100644 --- a/fairseq/iterative_refinement_generator.py +++ b/fairseq/iterative_refinement_generator.py @@ -5,17 +5,16 @@ import torch from fairseq import utils -from fairseq.models.model_utils import ( - script_skip_tensor_list, - skip_tensors as _skip, -) +from fairseq.models.levenshtein_transformer import LevenshteinTransformerModel +from fairseq.models.model_utils import script_skip_tensor_list, skip_tensors as _skip +from fairseq.models.nonautoregressive_ensembles import EnsembleLevT class IterativeRefinementGenerator(object): def __init__( self, - models, tgt_dict, + models=None, eos_penalty=0.0, max_iter=10, max_ratio=2, @@ 
-73,6 +72,7 @@ def generate_batched_itr( timer.start() with torch.no_grad(): hypos = self.generate( + self.models, sample, prefix_tokens=sample["target"][:, :prefix_size] if prefix_size > 0 @@ -87,11 +87,15 @@ def generate_batched_itr( yield id, src, ref, hypos[i] @torch.no_grad() - def generate(self, sample, prefix_tokens=None): - - # TODO: model ensemble - assert len(self.models) == 1, "only support single model" - model = self.models[0] + def generate(self, models, sample, prefix_tokens=None): + + if len(models) == 1: + # Keep this for other NAT models for which we have yet to implement ensemble wrappers. Later delete this. + model = models[0] + elif isinstance(models[0], LevenshteinTransformerModel): + model = EnsembleLevT(models) + else: + raise NotImplementedError if not self.retain_dropout: model.eval() From 856d8b8262e47a310b71e276c1e8e089d840f206 Mon Sep 17 00:00:00 2001 From: Xian Li Date: Wed, 30 Oct 2019 12:54:21 -0700 Subject: [PATCH 199/213] layer drop Summary: This diff enables layer drop in transformer decoder in production training pipeline (ptt_transformer). It builds on top of the fairseq implementation D18094657 added by Angela Fan, and added additional logic to handle corresponding dropping layers at test time in exported model. Reviewed By: jhcross Differential Revision: D18165586 fbshipit-source-id: 373ac00268a25fa9e412edcb483becdfe792d992 --- fairseq/checkpoint_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py index abf1bcc65f..10de955fad 100644 --- a/fairseq/checkpoint_utils.py +++ b/fairseq/checkpoint_utils.py @@ -345,7 +345,7 @@ def prune_state_dict(state_dict, args): It's called by functions that load models from checkpoints and does not need to be called directly. """ - if not args: + if not args or args.arch == "ptt_transformer": # args should not be none, but don't crash if it is. 
return state_dict From f30fc7d71c93ecc28115b9eca2c2d680bd061d09 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Thu, 31 Oct 2019 10:56:58 -0700 Subject: [PATCH 200/213] Fix MultiheadAttention and torch hub Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/895 Reviewed By: akinh Differential Revision: D18246479 Pulled By: myleott fbshipit-source-id: a610f1e4943619d32a523601a572fb09cdc5638d --- fairseq/models/roberta/hub_interface.py | 2 +- fairseq/models/roberta/model.py | 2 ++ fairseq/models/wav2vec.py | 2 +- fairseq/modules/multihead_attention.py | 9 --------- fairseq/modules/transformer_sentence_encoder.py | 4 +++- 5 files changed, 7 insertions(+), 12 deletions(-) diff --git a/fairseq/models/roberta/hub_interface.py b/fairseq/models/roberta/hub_interface.py index 216b6fd90f..279aba52dd 100644 --- a/fairseq/models/roberta/hub_interface.py +++ b/fairseq/models/roberta/hub_interface.py @@ -101,7 +101,7 @@ def register_classification_head( ) def predict(self, head: str, tokens: torch.LongTensor, return_logits: bool = False): - features = self.extract_features(tokens) + features = self.extract_features(tokens.to(device=self.device)) logits = self.model.classification_heads[head](features) if return_logits: return logits diff --git a/fairseq/models/roberta/model.py b/fairseq/models/roberta/model.py index 1c4c243e7e..a6ff42d3e4 100644 --- a/fairseq/models/roberta/model.py +++ b/fairseq/models/roberta/model.py @@ -146,6 +146,8 @@ def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_na return RobertaHubInterface(x['args'], x['task'], x['models'][0]) def upgrade_state_dict_named(self, state_dict, name): + super().upgrade_state_dict_named(state_dict, name) + prefix = name + '.' if name != '' else '' current_head_names = [] if not hasattr(self, 'classification_heads') else \ self.classification_heads.keys() diff --git a/fairseq/models/wav2vec.py b/fairseq/models/wav2vec.py index 62807764ef..eb5bd0d00a 100644 --- a/fairseq/models/wav2vec.py +++ b/fairseq/models/wav2vec.py @@ -187,7 +187,7 @@ def forward(self, source): return result def upgrade_state_dict_named(self, state_dict, name): - return state_dict + super().upgrade_state_dict_named(state_dict, name) def max_positions(self): """Maximum length supported by the model.""" diff --git a/fairseq/modules/multihead_attention.py b/fairseq/modules/multihead_attention.py index 5b92662582..0a1f449169 100644 --- a/fairseq/modules/multihead_attention.py +++ b/fairseq/modules/multihead_attention.py @@ -63,11 +63,6 @@ def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias= else: self.enable_torch_version = False - @property - def in_proj_weight(self): - # TODO: Remove this backward compatibility code (in_proj_weight) - return torch.cat((self.q_proj.weight, self.k_proj.weight, self.v_proj.weight)) - @property def in_proj_bias(self): # TODO: Remove this backward compatibility code (in_proj_bias) @@ -312,8 +307,6 @@ def apply_sparse_mask(self, attn_weights, tgt_len, src_len, bsz): return attn_weights def upgrade_state_dict_named(self, state_dict, name): - # TODO: Remove this backward compatibility code (in_proj_weight) - # here, we convert in_proj_weight to individual q,k,v weights prefix = name + '.' 
if name != '' else '' items_to_add = {} keys_to_remove = [] @@ -341,5 +334,3 @@ def upgrade_state_dict_named(self, state_dict, name): for key, value in items_to_add.items(): state_dict[key] = value - - return state_dict diff --git a/fairseq/modules/transformer_sentence_encoder.py b/fairseq/modules/transformer_sentence_encoder.py index f7e3973080..862f27603a 100644 --- a/fairseq/modules/transformer_sentence_encoder.py +++ b/fairseq/modules/transformer_sentence_encoder.py @@ -40,7 +40,9 @@ def init_bert_params(module): if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() if isinstance(module, MultiheadAttention): - module.in_proj_weight.data.normal_(mean=0.0, std=0.02) + module.q_proj.weight.data.normal_(mean=0.0, std=0.02) + module.k_proj.weight.data.normal_(mean=0.0, std=0.02) + module.v_proj.weight.data.normal_(mean=0.0, std=0.02) class TransformerSentenceEncoder(nn.Module): From 99c524c5463b65c3e9c49a9ecbd53c39d0573c86 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Thu, 31 Oct 2019 12:53:48 -0700 Subject: [PATCH 201/213] Fix fairspeq unit test Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/897 Differential Revision: D18250587 Pulled By: myleott fbshipit-source-id: b9cef376bc014b68766229aab7b6e454480757d3 --- fairseq/modules/multihead_attention.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fairseq/modules/multihead_attention.py b/fairseq/modules/multihead_attention.py index 0a1f449169..900060b0f0 100644 --- a/fairseq/modules/multihead_attention.py +++ b/fairseq/modules/multihead_attention.py @@ -63,9 +63,12 @@ def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias= else: self.enable_torch_version = False + @property + def in_proj_weight(self): + return torch.cat((self.q_proj.weight, self.k_proj.weight, self.v_proj.weight)) + @property def in_proj_bias(self): - # TODO: Remove this backward compatibility code (in_proj_bias) return torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)) def prepare_for_onnx_export_(self): From 4c6b689eebe66a53717dacf28cba7a11b6ffa64f Mon Sep 17 00:00:00 2001 From: Halil Akin Date: Fri, 1 Nov 2019 09:38:27 -0700 Subject: [PATCH 202/213] Remove in_proj_weight/in_proj_bias in multihead attention and fix the failing tests instead (#898) Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/898 Pull Request resolved: https://github.com/pytorch/fairseq/pull/1333 Pull Request resolved: https://github.com/fairinternal/fairspeq/pull/11 This in_proj_weight and in_proj_bias properties are not the right way of providing backward compatibility, and it's causing other incompatibilities with the new Dynamic Quantization API. So, let's remove this, and properly fix the failing tests. 
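As a rough, self-contained sketch (not part of this patch), the fused-bias behaviour can be recovered on demand from the separate projections; this is what the hunk below does when it falls back to `F.multi_head_attention_forward`. The `embed_dim` value and module names here are purely illustrative.

```python
import torch
import torch.nn as nn

# Illustrative stand-ins for the three separate projections that fairseq's
# MultiheadAttention keeps after this change; embed_dim is arbitrary here.
embed_dim = 8
q_proj = nn.Linear(embed_dim, embed_dim)
k_proj = nn.Linear(embed_dim, embed_dim)
v_proj = nn.Linear(embed_dim, embed_dim)

# Rather than exposing a fused `in_proj_bias` property for backward
# compatibility, concatenate the per-projection biases only at the point
# where a fused tensor is actually needed (e.g. the fused PyTorch kernel).
fused_bias = torch.cat((q_proj.bias, k_proj.bias, v_proj.bias))
assert fused_bias.shape == (3 * embed_dim,)
```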
Reviewed By: myleott Differential Revision: D18264129 fbshipit-source-id: fc1838657a60d914ca83c4e0f6add5ed8206ac54 --- fairseq/modules/multihead_attention.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/fairseq/modules/multihead_attention.py b/fairseq/modules/multihead_attention.py index 900060b0f0..39ffe2ccc5 100644 --- a/fairseq/modules/multihead_attention.py +++ b/fairseq/modules/multihead_attention.py @@ -63,14 +63,6 @@ def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias= else: self.enable_torch_version = False - @property - def in_proj_weight(self): - return torch.cat((self.q_proj.weight, self.k_proj.weight, self.v_proj.weight)) - - @property - def in_proj_bias(self): - return torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)) - def prepare_for_onnx_export_(self): self.onnx_trace = True @@ -132,7 +124,8 @@ def forward( return F.multi_head_attention_forward(query, key, value, self.embed_dim, self.num_heads, torch.empty([0]), - self.in_proj_bias, self.bias_k, self.bias_v, + torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)), + self.bias_k, self.bias_v, self.add_zero_attn, self.dropout, self.out_proj.weight, self.out_proj.bias, self.training, key_padding_mask, need_weights, From 828c1ca7522278aa6c1eaf91fe0425b8f40dd832 Mon Sep 17 00:00:00 2001 From: Chau Tran Date: Fri, 1 Nov 2019 12:31:58 -0700 Subject: [PATCH 203/213] Fix BPE for dual learning Summary: Fix integration test Reviewed By: xianxl Differential Revision: D18040440 fbshipit-source-id: 98c8ab7970d081f17deb54c69aa35669de12d767 --- fairseq/data/data_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fairseq/data/data_utils.py b/fairseq/data/data_utils.py index 66b880fa35..b26972c4ad 100644 --- a/fairseq/data/data_utils.py +++ b/fairseq/data/data_utils.py @@ -234,6 +234,8 @@ def batch_by_size( def process_bpe_symbol(sentence: str, bpe_symbol: str): if bpe_symbol == 'sentencepiece': sentence = sentence.replace(' ', '').replace('\u2581', ' ').strip() + elif bpe_symbol == '_EOW': + sentence = sentence.replace(' ', '').replace('_EOW', ' ').strip() elif bpe_symbol is not None: sentence = (sentence + ' ').replace(bpe_symbol, '').rstrip() return sentence From a0f75996b1e25c97149bcbc6aad7eed5601daab0 Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sat, 2 Nov 2019 16:51:32 -0700 Subject: [PATCH 204/213] Fix building of docs Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/1340 Differential Revision: D18289455 Pulled By: myleott fbshipit-source-id: a1c8163a35273b6c646d300142701e8a317d7378 --- docs/conf.py | 2 +- examples/backtranslation/README.md | 9 ++++-- examples/language_model/README.md | 5 +++ examples/translation/README.md | 5 +++ examples/wmt19/README.md | 13 ++++++++ setup.py | 49 +++++++++++++++++++++++------- 6 files changed, 69 insertions(+), 14 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index d6ee5c4ebf..11358ca2ed 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -50,7 +50,7 @@ # General information about the project. 
project = 'fairseq' -copyright = '2018, Facebook AI Research (FAIR)' +copyright = '2019, Facebook AI Research (FAIR)' author = 'Facebook AI Research (FAIR)' github_doc_root = 'https://github.com/pytorch/fairseq/tree/master/docs/' diff --git a/examples/backtranslation/README.md b/examples/backtranslation/README.md index a834214adf..bc32675de7 100644 --- a/examples/backtranslation/README.md +++ b/examples/backtranslation/README.md @@ -8,9 +8,14 @@ Model | Description | Dataset | Download ---|---|---|--- `transformer.wmt18.en-de` | Transformer
([Edunov et al., 2018](https://arxiv.org/abs/1808.09381))
WMT'18 winner | [WMT'18 English-German](http://www.statmt.org/wmt18/translation-task.html) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt18.en-de.ensemble.tar.gz)
See NOTE in the archive -## Example usage +## Example usage (torch.hub) -Interactive generation from the full ensemble via PyTorch Hub: +We require a few additional Python dependencies for preprocessing: +```bash +pip install subword_nmt sacremoses +``` + +Then to generate translations from the full model ensemble: ```python import torch diff --git a/examples/language_model/README.md b/examples/language_model/README.md index 8c7da50f38..f10eb4cb23 100644 --- a/examples/language_model/README.md +++ b/examples/language_model/README.md @@ -12,6 +12,11 @@ Model | Description | Dataset | Download ## Example usage +We require a few additional Python dependencies for preprocessing: +```bash +pip install fastBPE sacremoses +``` + To sample from a language model using PyTorch Hub: ```python import torch diff --git a/examples/translation/README.md b/examples/translation/README.md index 9807a13e9d..37c44690c1 100644 --- a/examples/translation/README.md +++ b/examples/translation/README.md @@ -20,6 +20,11 @@ Model | Description | Dataset | Download ## Example usage (torch.hub) +We require a few additional Python dependencies for preprocessing: +```bash +pip install sacremoses subword_nmt +``` + Interactive translation via PyTorch Hub: ```python import torch diff --git a/examples/wmt19/README.md b/examples/wmt19/README.md index 6eb7818925..34623575d3 100644 --- a/examples/wmt19/README.md +++ b/examples/wmt19/README.md @@ -16,6 +16,15 @@ Model | Description | Download ## Example usage (torch.hub) +#### Requirements + +We require a few additional Python dependencies for preprocessing: +```bash +pip install fastBPE sacremoses +``` + +#### Translation + ```python import torch @@ -38,7 +47,11 @@ en2ru.translate("Machine learning is great!") # 'Машинное обучен ru2en = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.ru-en', checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt', tokenizer='moses', bpe='fastbpe') ru2en.translate("Машинное обучение - это здорово!") # 'Machine learning is great!' +``` + +#### Language Modeling +```python # Sample from the English LM en_lm = torch.hub.load('pytorch.fairseq', 'transformer_lm.wmt19.en', tokenizer='moses', bpe='fastbpe') en_lm.sample("Machine learning is") # 'Machine learning is the future of computing, says Microsoft boss Satya Nadella ...' diff --git a/setup.py b/setup.py index 33849f8105..06d21e2658 100644 --- a/setup.py +++ b/setup.py @@ -4,13 +4,13 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+import os from setuptools import setup, find_packages, Extension -from torch.utils import cpp_extension import sys -if sys.version_info < (3,): - sys.exit('Sorry, Python3 is required for fairseq.') +if sys.version_info < (3, 5): + sys.exit('Sorry, Python >=3.5 is required for fairseq.') with open('README.md') as f: @@ -61,15 +61,42 @@ def include_dirs(self, dirs): language='c++', extra_compile_args=extra_compile_args, ), - cpp_extension.CppExtension( - 'fairseq.libnat', - sources=[ - 'fairseq/clib/libnat/edit_dist.cpp', - ], - ) ] +cmdclass = {} + + +try: + # torch is not available when generating docs + from torch.utils import cpp_extension + extensions.extend([ + cpp_extension.CppExtension( + 'fairseq.libnat', + sources=[ + 'fairseq/clib/libnat/edit_dist.cpp', + ], + ), + ]) + cmdclass['build_ext'] = cpp_extension.BuildExtension +except ImportError: + pass + + +if 'READTHEDOCS' in os.environ: + # don't build extensions when generating docs + extensions = [] + if 'build_ext' in cmdclass: + del cmdclass['build_ext'] + + # use CPU build of PyTorch + dependency_links = [ + 'https://download.pytorch.org/whl/cpu/torch-1.3.0%2Bcpu-cp36-cp36m-linux_x86_64.whl' + ] +else: + dependency_links = [] + + setup( name='fairseq', version='0.8.0', @@ -92,13 +119,13 @@ def include_dirs(self, dirs): install_requires=[ 'cffi', 'cython', - 'fastBPE', 'numpy', 'regex', 'sacrebleu', 'torch', 'tqdm', ], + dependency_links=dependency_links, packages=find_packages(exclude=['scripts', 'tests']), ext_modules=extensions, test_suite='tests', @@ -113,6 +140,6 @@ def include_dirs(self, dirs): 'fairseq-validate = fairseq_cli.validate:cli_main', ], }, - cmdclass={'build_ext': cpp_extension.BuildExtension}, + cmdclass=cmdclass, zip_safe=False, ) From fd7dcacff7692e6b72166c6b5a2019bfdcda04e6 Mon Sep 17 00:00:00 2001 From: Taylan Bilal Date: Fri, 8 Nov 2019 17:21:41 +0000 Subject: [PATCH 205/213] option to suppress loss report --- train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/train.py b/train.py index 68265d5f55..a5f07b7986 100644 --- a/train.py +++ b/train.py @@ -367,6 +367,7 @@ def train_loop_fn(device, trainer, loader, last_batch_index): flush=True, ) log_output = trainer.train_step(samples) + log_output=None if args.suppress_loss_report else log_output xm.optimizer_step(trainer.optimizer) tracker.add(sum(sample['nsentences'] for sample in samples)) return tracker @@ -575,6 +576,7 @@ def get_args(): parser.add_argument('--num_cores', type=int, default=8) parser.add_argument('--metrics_debug', action='store_true') parser.add_argument('--use_gpu', action='store_true') + parser.add_argument('--suppress_loss_report', action='store_true') parser.add_argument('--target_train_loss', type=float, default=None) parser.add_argument('--target_valid_loss', type=float, default=None) args = options.parse_args_and_arch(parser) From 7a23b9341ce39ad8bf104724a06a38eefa7e45d2 Mon Sep 17 00:00:00 2001 From: Taylan Bilal Date: Thu, 13 Jun 2019 23:23:22 +0000 Subject: [PATCH 206/213] Making tpu training work optimizer fix progress bar comment out temporarily some changes to train_tpu int mask instead of float pfpfpfpf fix printing device index per loop bkpt to investigate resize_ call attempting to init buffer size to 2*dim bkpt better print do not drop records when computing loss Changes that reduce graph compiles. * Loss function replaced with an equivalent logic that doesn't resize tensors. 
* cli args changed to guarantee consistency * collate_tokens function in fairseq/data/data_utils.py overwritten to guarantee consistency undoing some changes made while debugging progress_bar implements len some irrelevant changes to train_tpu.py new xla changes bug fix in enable_torch_version removing the last batch that is of diferent size from the iterator delete optimizer step in fairseq s trainer Added `self.xla` flag that controls if Trainer includes optimizer step + Tried to include more explanation why skip optimizer step this time deleted obsolete file add norm clipping count back in (#4) remove grad norm clip count (#5) Change masked_fill_ input in loss in order to accomodate necessary pytorch changes (#6) Adding tpu capabilities to train.py (#8) * Adding tpu capabilities to train.py * flush when printing for better user experience * separated cli_main into parse_args, maingpu and maintpu deleted unused line in datautils.py Enumerate the loader in training and validation (#9) * Adding tpu capabilities to train.py * flush when printing for better user experience * separated cli_main into parse_args, maingpu and maintpu deleted unused line in datautils.py * Enumerate the loader * enumerate the loader Add option to assert on training and/or validation loss (#10) * Add option to assert on training and/or validation loss * applied suggestion None loss should be filled to inf (#11) Enabling multiprocessing for fairseq training. (#12) * initial commit for multiprocess api * indentation fixes and import fix * no need to softlink, fix save/load * Remove the hacks to only save from master ordinal as xm.save takes care of that * fix indentation; 3 -> 4 spaces * Moved xu.eprints after spawn and dropping last batches better trainers->trainer (#13) fix bug in assert_on_losses Replace usage of unsqueeze with transpose + broadcasting (#15) remove attn mask + loss rewrite + save per host + format suppress loss report allow usage of batch_by_size in translation. attn_weights masked fill in place Clean up the log output suppressing a bit Revert multihead attn's in_proj code changes non-rebased tpu branch is about 10% faster on TPUs compared to the rebased branch. The regression is inside multihead attn's in_proj mechanism. Reverting the relevant changes to preserve performance. 
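To make the "guarantee consistency" changes above concrete, here is a minimal sketch of the fixed-shape bucketing idea, simplified from the `batch_by_size_tpu` helper this patch adds to `fairseq/data/data_utils.py`; the helper name `batch_by_fixed_shapes` below is illustrative, not from the patch.

```python
def batch_by_fixed_shapes(indices, num_tokens_fn, input_shapes):
    """Greedily bucket sample indices into fixed (batch_size, padlen) shapes.

    `input_shapes` is assumed to be sorted by increasing padlen with
    decreasing batch size, e.g. [(64, 32), (32, 64), (16, 128)].
    """
    buckets = [[] for _ in input_shapes]
    for idx in indices:
        sample_len = num_tokens_fn(idx)
        for j, (batch_size, padlen) in enumerate(input_shapes):
            if padlen < sample_len:
                continue  # sample too long for this bucket; try a larger padlen
            buckets[j].append(idx)
            if len(buckets[j]) == batch_size:
                yield buckets[j]  # emit a full, fixed-shape batch
                buckets[j] = []
            break  # each sample goes into exactly one bucket
```

On the collate side, every batch drawn from bucket *j* is then padded to exactly `padlen` tokens (see the `get_pad_size`/`collate_tokens` changes in the diff below), so XLA only ever sees `len(input_shapes)` distinct input shapes and avoids recompiling for every new sequence length.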
Pass correct args to the new get_valid_stats function Send meters to device in order not to fail training when resuming dfrom chkpt --- fairseq/checkpoint_utils.py | 24 +- .../label_smoothed_cross_entropy.py | 8 +- fairseq/data/data_utils.py | 37 +- fairseq/data/language_pair_dataset.py | 12 +- fairseq/models/transformer.py | 8 +- fairseq/modules/multihead_attention.py | 96 +++-- fairseq/tasks/fairseq_task.py | 15 +- fairseq/tasks/translation.py | 3 + fairseq/trainer.py | 23 +- fairseq/utils.py | 7 +- train.py | 382 ++++++++++++++++-- 11 files changed, 530 insertions(+), 85 deletions(-) diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py index 10de955fad..a582acdc2a 100644 --- a/fairseq/checkpoint_utils.py +++ b/fairseq/checkpoint_utils.py @@ -17,6 +17,8 @@ from fairseq.models import FairseqEncoder, FairseqDecoder +import torch_xla.core.xla_model as xm + def save_checkpoint(args, trainer, epoch_itr, val_loss): from fairseq import distributed_utils, meters @@ -62,15 +64,17 @@ def is_better(a, b): extra_state.update({'best': save_checkpoint.best}) checkpoints = [os.path.join(args.save_dir, fn) for fn, cond in checkpoint_conds.items() if cond] + if len(checkpoints) > 0: trainer.save_checkpoint(checkpoints[0], extra_state) for cp in checkpoints[1:]: try: from fairseq.fb_pathmgr import fb_pathmgr - fb_pathmgr.copy(checkpoints[0], cp, True) + if getattr(args, 'use_gpu', True) or xm.is_master_ordinal(): + fb_pathmgr.copy(checkpoints[0], cp, True) except (ModuleNotFoundError, ImportError): - shutil.copyfile(checkpoints[0], cp) - + if getattr(args, 'use_gpu', True) or xm.is_master_ordinal(): + shutil.copyfile(checkpoints[0], cp) write_timer.stop() print('| saved checkpoint {} (epoch {} @ {} updates) (writing took {} seconds)'.format( checkpoints[0], epoch, updates, write_timer.sum)) @@ -97,7 +101,7 @@ def is_better(a, b): def load_checkpoint(args, trainer, data_selector=None): """Load a checkpoint and restore the training iterator.""" # only one worker should attempt to create the required dir - if args.distributed_rank == 0: + if args.distributed_rank == 0 or xm.is_master_ordinal(): os.makedirs(args.save_dir, exist_ok=True) if args.restore_file == 'checkpoint_last.pt': @@ -210,7 +214,8 @@ def checkpoint_paths(path, pattern=r'checkpoint(\d+)\.pt'): def torch_persistent_save(*args, **kwargs): for i in range(3): try: - return torch.save(*args, **kwargs) + save_func = xm.save if kwargs.pop('xla', False) else torch.save + return save_func(*args, **kwargs) except Exception: if i == 2: logging.error(traceback.format_exc()) @@ -256,14 +261,17 @@ def save_state( state_dict['criterion'] = criterion.state_dict() if not args.no_save_optimizer_state: state_dict['last_optimizer_state'] = convert_state_dict_type(optimizer.state_dict()) - try: from fairseq.fb_pathmgr import fb_pathmgr with fb_pathmgr.open(filename, "wb") as f: - torch_persistent_save(state_dict, f) + torch_persistent_save( + state_dict, f, xla=not getattr(args, 'use_gpu', True) + ) except (ModuleNotFoundError, ImportError): # if path manager not found, continue with local file. 
- torch_persistent_save(state_dict, filename) + torch_persistent_save( + state_dict, filename, xla=not getattr(args, 'use_gpu', True) + ) def _upgrade_state_dict(state): diff --git a/fairseq/criterions/label_smoothed_cross_entropy.py b/fairseq/criterions/label_smoothed_cross_entropy.py index 6687718725..acef05d1b6 100644 --- a/fairseq/criterions/label_smoothed_cross_entropy.py +++ b/fairseq/criterions/label_smoothed_cross_entropy.py @@ -17,8 +17,8 @@ def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=None, reduce=T smooth_loss = -lprobs.sum(dim=-1, keepdim=True) if ignore_index is not None: non_pad_mask = target.ne(ignore_index) - nll_loss = nll_loss[non_pad_mask] - smooth_loss = smooth_loss[non_pad_mask] + nll_loss.masked_fill_(~non_pad_mask, 0.0) + smooth_loss.masked_fill_(~non_pad_mask, 0.0) else: nll_loss = nll_loss.squeeze(-1) smooth_loss = smooth_loss.squeeze(-1) @@ -57,8 +57,8 @@ def forward(self, model, sample, reduce=True): loss, nll_loss = self.compute_loss(model, net_output, sample, reduce=reduce) sample_size = sample['target'].size(0) if self.args.sentence_avg else sample['ntokens'] logging_output = { - 'loss': utils.item(loss.data) if reduce else loss.data, - 'nll_loss': utils.item(nll_loss.data) if reduce else nll_loss.data, + 'loss': loss.data, + 'nll_loss': nll_loss.data, 'ntokens': sample['ntokens'], 'nsentences': sample['target'].size(0), 'sample_size': sample_size, diff --git a/fairseq/data/data_utils.py b/fairseq/data/data_utils.py index b26972c4ad..790a1cf5c4 100644 --- a/fairseq/data/data_utils.py +++ b/fairseq/data/data_utils.py @@ -26,9 +26,25 @@ def infer_language_pair(path): return src, dst -def collate_tokens(values, pad_idx, eos_idx=None, left_pad=False, move_eos_to_beginning=False): +def get_pad_size(values, input_shapes): + if input_shapes is None: + return max(v.size(0) for v in values) + for batch_size, padlen in input_shapes: + if len(values) == batch_size: + return padlen + else: + raise IndexError( + 'Encountered values with invalid length {}, input shapes were {}' + .format(len(values), input_shapes) + ) + + +def collate_tokens( + values, pad_idx, eos_idx=None, left_pad=False, + move_eos_to_beginning=False, input_shapes=None, +): """Convert a list of 1d tensors into a padded 2d tensor.""" - size = max(v.size(0) for v in values) + size = get_pad_size(values, input_shapes) res = values[0].new(len(values), size).fill_(pad_idx) def copy_tensor(src, dst): @@ -227,10 +243,25 @@ def batch_by_size( if isinstance(indices, types.GeneratorType): indices = np.fromiter(indices, dtype=np.int64, count=-1) - return batch_by_size_fast(indices, num_tokens_fn, max_tokens, max_sentences, bsz_mult) +def batch_by_size_tpu( + indices, num_tokens_fn, input_shapes +): + batches = [[] for _ in input_shapes] + for idx in indices: + sample_len = num_tokens_fn(idx) + for j, (batch_size, padlen) in enumerate(input_shapes): + if padlen < sample_len: + continue + batches[j].append(idx) + if len(batches[j]) == batch_size: + yield batches[j] + batches[j] = [] + break + + def process_bpe_symbol(sentence: str, bpe_symbol: str): if bpe_symbol == 'sentencepiece': sentence = sentence.replace(' ', '').replace('\u2581', ' ').strip() diff --git a/fairseq/data/language_pair_dataset.py b/fairseq/data/language_pair_dataset.py index 48853ba726..66b3524f42 100644 --- a/fairseq/data/language_pair_dataset.py +++ b/fairseq/data/language_pair_dataset.py @@ -11,15 +11,15 @@ def collate( samples, pad_idx, eos_idx, left_pad_source=True, left_pad_target=False, - input_feeding=True, + 
input_feeding=True, input_shapes=None, ): if len(samples) == 0: return {} def merge(key, left_pad, move_eos_to_beginning=False): return data_utils.collate_tokens( - [s[key] for s in samples], - pad_idx, eos_idx, left_pad, move_eos_to_beginning, + [s[key] for s in samples], pad_idx, + eos_idx,left_pad, move_eos_to_beginning, input_shapes, ) def check_alignment(alignment, src_len, tgt_len): @@ -154,7 +154,8 @@ def __init__( shuffle=True, input_feeding=True, remove_eos_from_source=False, append_eos_to_target=False, align_dataset=None, - append_bos=False + append_bos=False, + input_shapes=None, ): if tgt_dict is not None: assert src_dict.pad() == tgt_dict.pad() @@ -178,6 +179,7 @@ def __init__( if self.align_dataset is not None: assert self.tgt_sizes is not None, "Both source and target needed when alignments are provided" self.append_bos = append_bos + self.input_shapes = input_shapes def __getitem__(self, index): tgt_item = self.tgt[index] if self.tgt is not None else None @@ -249,7 +251,7 @@ def collater(self, samples): return collate( samples, pad_idx=self.src_dict.pad(), eos_idx=self.src_dict.eos(), left_pad_source=self.left_pad_source, left_pad_target=self.left_pad_target, - input_feeding=self.input_feeding, + input_feeding=self.input_feeding, input_shapes=self.input_shapes, ) def num_tokens(self, index): diff --git a/fairseq/models/transformer.py b/fairseq/models/transformer.py index 573c41373b..bb30db524c 100644 --- a/fairseq/models/transformer.py +++ b/fairseq/models/transformer.py @@ -355,8 +355,8 @@ def forward(self, src_tokens, src_lengths, cls_input=None, return_all_hiddens=Fa # compute padding mask encoder_padding_mask = src_tokens.eq(self.padding_idx) - if not encoder_padding_mask.any(): - encoder_padding_mask = None + #if not encoder_padding_mask.any(): + # encoder_padding_mask = None encoder_states = [] if return_all_hiddens else None @@ -596,8 +596,8 @@ def extract_features( x = x.transpose(0, 1) self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) - if not self_attn_padding_mask.any() and not self.cross_self_attention: - self_attn_padding_mask = None + # if not self_attn_padding_mask.any() and not self.cross_self_attention: + # self_attn_padding_mask = None # decoder layers attn = None diff --git a/fairseq/modules/multihead_attention.py b/fairseq/modules/multihead_attention.py index 39ffe2ccc5..95787b1fda 100644 --- a/fairseq/modules/multihead_attention.py +++ b/fairseq/modules/multihead_attention.py @@ -39,9 +39,17 @@ def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias= assert not self.self_attention or self.qkv_same_dim, 'Self-attention requires query, key and ' \ 'value to be of the same size' - self.k_proj = nn.Linear(self.kdim, embed_dim, bias=bias) - self.v_proj = nn.Linear(self.vdim, embed_dim, bias=bias) - self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + if self.qkv_same_dim: + self.in_proj_weight = Parameter(torch.Tensor(3 * embed_dim, embed_dim)) + else: + self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim)) + self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim)) + self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim)) + + if bias: + self.in_proj_bias = Parameter(torch.Tensor(3 * embed_dim)) + else: + self.register_parameter('in_proj_bias', None) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) @@ -57,11 +65,12 @@ def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias= self.onnx_trace = False + # XXX: (taylanbil) try F.multi... 
self.enable_torch_version = False - if hasattr(F, "multi_head_attention_forward"): - self.enable_torch_version = True - else: - self.enable_torch_version = False + # if hasattr(F, "multi_head_attention_forward"): + # self.enable_torch_version = True + # else: + # self.enable_torch_version = False def prepare_for_onnx_export_(self): self.onnx_trace = True @@ -70,15 +79,15 @@ def reset_parameters(self): if self.qkv_same_dim: # Empirically observed the convergence to be much better with # the scaled initialization - nn.init.xavier_uniform_(self.k_proj.weight, gain=1/math.sqrt(2)) - nn.init.xavier_uniform_(self.v_proj.weight, gain=1/math.sqrt(2)) - nn.init.xavier_uniform_(self.q_proj.weight, gain=1/math.sqrt(2)) + nn.init.xavier_uniform_(self.in_proj_weight, gain=1/math.sqrt(2)) else: - nn.init.xavier_uniform_(self.k_proj.weight) - nn.init.xavier_uniform_(self.v_proj.weight) - nn.init.xavier_uniform_(self.q_proj.weight) + nn.init.xavier_uniform_(self.k_proj_weight) + nn.init.xavier_uniform_(self.v_proj_weight) + nn.init.xavier_uniform_(self.q_proj_weight) nn.init.xavier_uniform_(self.out_proj.weight) + if self.in_proj_bias is not None: + nn.init.constant_(self.in_proj_bias, 0.) nn.init.constant_(self.out_proj.bias, 0.) if self.bias_k is not None: nn.init.xavier_normal_(self.bias_k) @@ -146,23 +155,19 @@ def forward( saved_state = None if self.self_attention: - q = self.q_proj(query) - k = self.k_proj(query) - v = self.v_proj(query) + q, k, v = self.in_proj_qkv(query) elif self.encoder_decoder_attention: # encoder-decoder attention - q = self.q_proj(query) + q = self.in_proj_q(query) if key is None: assert value is None k = v = None else: - k = self.k_proj(key) - v = self.v_proj(key) + k = self.in_proj_k(key) + v = self.in_proj_v(key) else: - q = self.q_proj(query) - k = self.k_proj(key) - v = self.v_proj(value) + raise q *= self.scaling if self.bias_k is not None: @@ -242,10 +247,9 @@ def forward( if key_padding_mask is not None: # don't attend to padding symbols attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights.masked_fill( - key_padding_mask.unsqueeze(1).unsqueeze(2), - float('-inf'), - ) + attn_weights = attn_weights.transpose(0, 2) + attn_weights.masked_fill_(key_padding_mask, float('-inf')) + attn_weights = attn_weights.transpose(0, 2) attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) if before_softmax: @@ -330,3 +334,43 @@ def upgrade_state_dict_named(self, state_dict, name): for key, value in items_to_add.items(): state_dict[key] = value + + def in_proj_qkv(self, query): + return self._in_proj(query).chunk(3, dim=-1) + + def in_proj_q(self, query): + if self.qkv_same_dim: + return self._in_proj(query, end=self.embed_dim) + else: + bias = self.in_proj_bias + if bias is not None: + bias = bias[:self.embed_dim] + return F.linear(query, self.q_proj_weight, bias) + + def in_proj_k(self, key): + if self.qkv_same_dim: + return self._in_proj(key, start=self.embed_dim, end=2 * self.embed_dim) + else: + weight = self.k_proj_weight + bias = self.in_proj_bias + if bias is not None: + bias = bias[self.embed_dim:2 * self.embed_dim] + return F.linear(key, weight, bias) + + def in_proj_v(self, value): + if self.qkv_same_dim: + return self._in_proj(value, start=2 * self.embed_dim) + else: + weight = self.v_proj_weight + bias = self.in_proj_bias + if bias is not None: + bias = bias[2 * self.embed_dim:] + return F.linear(value, weight, bias) + + def _in_proj(self, input, start=0, end=None): + weight = self.in_proj_weight + bias = 
self.in_proj_bias + weight = weight[start:end, :] + if bias is not None: + bias = bias[start:end] + return F.linear(input, weight, bias) diff --git a/fairseq/tasks/fairseq_task.py b/fairseq/tasks/fairseq_task.py index 538532b20e..2e524f875b 100644 --- a/fairseq/tasks/fairseq_task.py +++ b/fairseq/tasks/fairseq_task.py @@ -146,10 +146,17 @@ def get_batch_iterator( ) # create mini-batches with given size constraints - batch_sampler = data_utils.batch_by_size( - indices, dataset.num_tokens, max_tokens=max_tokens, max_sentences=max_sentences, - required_batch_size_multiple=required_batch_size_multiple, - ) + if getattr(self.args, 'use_gpu', True): + batch_sampler = data_utils.batch_by_size( + indices, dataset.num_tokens, max_tokens=max_tokens, + max_sentences=max_sentences, + required_batch_size_multiple=required_batch_size_multiple, + ) + else: + batch_sampler = data_utils.batch_by_size_tpu( + indices, dataset.num_tokens, + getattr(self.args, 'input_shapes', None) + ) # return a reusable, sharded iterator epoch_iter = iterators.EpochBatchIterator( diff --git a/fairseq/tasks/translation.py b/fairseq/tasks/translation.py index 353e640bf6..aa29634f44 100644 --- a/fairseq/tasks/translation.py +++ b/fairseq/tasks/translation.py @@ -25,6 +25,7 @@ def load_langpair_dataset( combine, dataset_impl, upsample_primary, left_pad_source, left_pad_target, max_source_positions, max_target_positions, prepend_bos=False, load_alignments=False, + input_shapes=None, ): def split_exists(split, src, tgt, lang, data_path): filename = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang)) @@ -88,6 +89,7 @@ def split_exists(split, src, tgt, lang, data_path): max_source_positions=max_source_positions, max_target_positions=max_target_positions, align_dataset=align_dataset, + input_shapes=input_shapes, ) @@ -203,6 +205,7 @@ def load_dataset(self, split, epoch=0, combine=False, **kwargs): max_source_positions=self.args.max_source_positions, max_target_positions=self.args.max_target_positions, load_alignments=self.args.load_alignments, + input_shapes=getattr(self.args, 'input_shapes', None), ) def build_dataset_for_inference(self, src_tokens, src_lengths): diff --git a/fairseq/trainer.py b/fairseq/trainer.py index 5de30e2246..f2a8b10c4f 100644 --- a/fairseq/trainer.py +++ b/fairseq/trainer.py @@ -31,7 +31,7 @@ class Trainer(object): communication of the gradients across workers. """ - def __init__(self, args, task, model, criterion, dummy_batch=None, oom_batch=None): + def __init__(self, args, task, model, criterion, dummy_batch=None, oom_batch=None, xla=False): self.args = args self.task = task @@ -63,6 +63,7 @@ def __init__(self, args, task, model, criterion, dummy_batch=None, oom_batch=Non self.fast_stat_sync = args.fast_stat_sync self.init_meters(args) + self.xla = xla def init_meters(self, args): self.meters = OrderedDict() @@ -419,7 +420,12 @@ def maybe_no_sync(): self._prev_grad_norm = grad_norm # take an optimization step - self.optimizer.step() + + # xla takes care of optimization step using + # torch_xla.xla_model.optimizer_step + # so skip optimization step here in that case + if not self.xla: + self.optimizer.step() self.set_num_updates(self.get_num_updates() + 1) # task specific update per step @@ -433,8 +439,11 @@ def maybe_no_sync(): self.meters['wpb'].update(ntokens) self.meters['bsz'].update(nsentences) self.meters['gnorm'].update(grad_norm) + # the comparison below introduces too many .item() calls and slows + # down tpu self.meters['clip'].update( - 1. 
if grad_norm > self.args.clip_norm and self.args.clip_norm > 0 else 0. + 0. + #1. if grad_norm > self.args.clip_norm and self.args.clip_norm > 0 else 0. ) self.meters['train_loss'].update(logging_output.get('loss', 0), sample_size) if 'train_acc' in self.meters: @@ -576,6 +585,14 @@ def get_meter(self, name): return None return self.meters[name] + def meters_to_device(self, device): + """Send meters' values to given device. Useful for TPU's.""" + for meter in self.meters.values(): + for key, val in vars(meter).items(): + if isinstance(val, torch.Tensor): + newval = val.to(device=torch.device(device)) + setattr(meter, key, newval) + def get_num_updates(self): """Get the number of parameters updates.""" return self._num_updates diff --git a/fairseq/utils.py b/fairseq/utils.py index 79a89d41ec..8981ae5158 100644 --- a/fairseq/utils.py +++ b/fairseq/utils.py @@ -176,9 +176,10 @@ def make_positions(tensor, padding_idx, onnx_trace=False): # prefers ints, cumsum defaults to output longs, and ONNX doesn't know # how to handle the dtype kwarg in cumsum. mask = tensor.ne(padding_idx).int() - return ( - torch.cumsum(mask, dim=1).type_as(mask) * mask - ).long() + padding_idx + #return ( + # torch.cumsum(mask, dim=1).type_as(mask) * mask + #).long() + padding_idx + return (torch.cumsum(mask, dim=1) * mask).long() + padding_idx def strip_pad(tensor, pad): diff --git a/train.py b/train.py index d287e2513d..7e35ae8082 100644 --- a/train.py +++ b/train.py @@ -9,12 +9,23 @@ import collections import math +import sys import random +from datetime import datetime import numpy as np import torch - -from fairseq import checkpoint_utils, distributed_utils, options, progress_bar, tasks, utils +import torch_xla +import torch_xla.debug.metrics as met +import torch_xla.distributed.data_parallel as dp +import torch_xla.distributed.parallel_loader as pl +import torch_xla.utils.utils as xu +import torch_xla.core.xla_model as xm +import torch_xla.distributed.xla_multiprocessing as xmp + +from fairseq import ( + checkpoint_utils, distributed_utils, options, progress_bar, tasks, utils +) from fairseq.data import iterators from fairseq.trainer import Trainer from fairseq.meters import AverageMeter, StopwatchMeter @@ -22,6 +33,32 @@ fb_pathmgr_registerd = False +def initialize_loader_for_epoch(args, epoch_itr, prefix='training'): + # Update parameters every N batches + if epoch_itr.epoch <= len(args.update_freq): + update_freq = args.update_freq[epoch_itr.epoch - 1] + else: + update_freq = args.update_freq[-1] + + # Initialize data iterator + itr = epoch_itr.next_epoch_itr( + fix_batches_to_gpus=False, shuffle=(epoch_itr.epoch >= args.curriculum)) + itr = iterators.GroupedIterator(itr, update_freq) + progress = progress_bar.build_progress_bar( + args, itr, epoch_itr.epoch, prefix=prefix, no_progress_bar='simple') + return progress + + +def print_model_criterion(model, criterion, args): + print(model) + print('| model {}, criterion {}'.format(args.arch, + criterion.__class__.__name__)) + print('| num. model params: {} (num. trained: {})'.format( + sum(p.numel() for p in model.parameters()), + sum(p.numel() for p in model.parameters() if p.requires_grad), + )) + + def main(args, init_distributed=False): utils.import_user_module(args) @@ -61,12 +98,7 @@ def main(args, init_distributed=False): # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) - print(model) - print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) - print('| num. model params: {} (num. 
trained: {})'.format( - sum(p.numel() for p in model.parameters()), - sum(p.numel() for p in model.parameters() if p.requires_grad), - )) + print_model_criterion(model, criterion, args) # Build trainer trainer = Trainer(args, task, model, criterion) @@ -112,20 +144,7 @@ def main(args, init_distributed=False): def train(args, trainer, task, epoch_itr): """Train the model for one epoch.""" - # Update parameters every N batches - update_freq = args.update_freq[epoch_itr.epoch - 1] \ - if epoch_itr.epoch <= len(args.update_freq) else args.update_freq[-1] - - # Initialize data iterator - itr = epoch_itr.next_epoch_itr( - fix_batches_to_gpus=args.fix_batches_to_gpus, - shuffle=(epoch_itr.epoch >= args.curriculum), - ) - itr = iterators.GroupedIterator(itr, update_freq) - progress = progress_bar.build_progress_bar( - args, itr, epoch_itr.epoch, no_progress_bar='simple', - ) - + progress = initialize_loader_for_epoch(args, epoch_itr) extra_meters = collections.defaultdict(lambda: AverageMeter()) valid_subsets = args.valid_subset.split(',') max_update = args.max_update or math.inf @@ -301,11 +320,248 @@ def distributed_main(i, args, start_rank=0): args.distributed_rank = start_rank + i main(args, init_distributed=True) +def parse_input_shapes(input_shapes_arg): + input_shapes = ( + shape.replace('*', 'x').split('x') for shape in input_shapes_arg) + input_shapes = [list(map(int, shape)) for shape in input_shapes] + if len(input_shapes) == 1: + return input_shapes + input_shapes.sort(key=lambda shape: shape[1]) + errmsg = ( + 'Invalid --input_shapes. Batch sizes (dimension 1) need to increase as ' + 'num_tokens (dimension 2) decrease. e.g. 16x128 32x64 64x32' + ) + assert all( + shape1[0] > shape2[0] + for shape1, shape2 in zip(input_shapes, input_shapes[1:])), errmsg + return input_shapes -def cli_main(): - parser = options.get_training_parser() - args = options.parse_args_and_arch(parser) +def main_tpu(args): + + def now(): + return datetime.now().strftime('%H:%M:%S') + + def log_step(step_type, device, step, log_output=None, tracker=None): + msg = '{}/ {}, device {}, step {}'.format( + step_type, now(), device, step + ) + if tracker: + rates = tracker.rate(), tracker.global_rate() + msg += ', Rate={:.2f}, GlobalRate={:.2f}'.format(*rates) + if log_output: + msg += ', loss={:.4f}, nll_loss={:.4f}'.format( + log_output['loss'].item(), log_output['nll_loss'].item() + ) + return msg + + def prepare_task(args, xla_device): + # Setup task, e.g., translation, language modeling, etc. + task = tasks.setup_task(args) + + # Load valid dataset (we load training data below, based on the latest checkpoint) + for valid_sub_split in args.valid_subset.split(','): + task.load_dataset(valid_sub_split, combine=True, epoch=0) + + # Build models and criteria to print some metadata + torch.manual_seed(args.seed) + model, criterion = task.build_model(args), task.build_criterion(args) + xm.master_print(model) + xm.master_print('| model {}, criterion {}'.format( + args.arch, criterion.__class__.__name__)) + xm.master_print('| num. model params: {} (num. 
trained: {})'.format( + sum(p.numel() for p in model.parameters()), + sum(p.numel() for p in model.parameters() if p.requires_grad))) + model = model.to(xla_device) + trainer = Trainer(args, task, model, criterion, xla=True) + lr = trainer.get_lr() + + # Load the latest checkpoint if one is available and restore the + # corresponding train iterator + # set distributed args here to shard data + trainer.args.distributed_rank = xm.get_ordinal() + trainer.args.distributed_world_size = xm.xrt_world_size() + extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer) + trainer.args.distributed_rank = 0 + trainer.args.distributed_world_size = 1 + trainer.meters_to_device(xla_device) + valid_subsets = args.valid_subset.split(',') + ordinal = xm.get_ordinal(defval=-1) + device_str = ( + str(xla_device) if ordinal < 0 else + '{}/{}'.format(xla_device, ordinal) + ) + return task, trainer, model, epoch_itr, lr, valid_subsets, device_str + + def train_loop_fn(device, trainer, loader, last_batch_index): + stats, log_output, tracker = None, None, xm.RateTracker() + for i, samples in enumerate(loader): + if i == last_batch_index: + # last batches are incomplete + break + if (i == last_batch_index - 1) or not (i % args.log_steps): + print( + log_step( + 'training', device, i, + log_output=log_output, tracker=tracker, + ), + flush=True, + ) + log_output = trainer.train_step(samples) + log_output = None if args.suppress_loss_report else log_output + xm.optimizer_step(trainer.optimizer) + tracker.add(sum(sample['nsentences'] for sample in samples)) + return tracker + + def valid_loop_fn(args, device, trainer, loader, last_batch_index): + # reset validation loss meters + for k in ['valid_loss', 'valid_nll_loss']: + meter = trainer.get_meter(k) + if meter is not None: + meter.reset() + extra_meters = collections.defaultdict(lambda: AverageMeter()) + for i, sample in enumerate(loader): + if i == last_batch_index: + # last batches are of different size, will cause recompilations + break + if not (i % args.log_steps): + print(log_step('validation', device, i, tracker=None)) + log_output = trainer.valid_step(sample) + for k, v in log_output.items(): + if k in ['loss', 'nll_loss', 'ntokens', 'nsentences', 'sample_size']: + continue + extra_meters[k].update(v) + stats = get_valid_stats(trainer, args) + for k, meter in extra_meters.items(): + stats[k] = meter.avg + return stats + + def validate_subset(args, device, trainer, task, epoch_itr, subset): + xm.master_print('Validating the subset "{}"'.format(subset)) + # Initialize data iterator + # XXX: we're not sharding the validation set + itr = task.get_batch_iterator( + dataset=task.dataset(subset), + max_tokens=args.max_tokens, + max_sentences=args.max_sentences_valid, + max_positions=utils.resolve_max_positions( + task.max_positions(), + list(trainer.get_model().max_positions()), + ), + ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, + required_batch_size_multiple=args.required_batch_size_multiple, + seed=args.seed, + num_workers=args.num_workers + ).next_epoch_itr(shuffle=False) + progress = progress_bar.build_progress_bar( + args, itr, epoch_itr.epoch, + prefix='valid on {} \'{}\' subset'.format(device, subset), + no_progress_bar='simple' + ) + para_loader = pl.ParallelLoader(progress, [device]) + stats = valid_loop_fn( + args, device, trainer, para_loader.per_device_loader(device), + len(progress) - 1 + ) + progress.print(stats, tag=subset, step=trainer.get_num_updates()) + return stats['loss'].avg + + def validate_subsets(args, 
device, trainer, task, epoch_itr, subsets): + valid_losses = { + subset: validate_subset( + args, device, trainer, task, epoch_itr, subset + ) + for subset in subsets + } + return valid_losses + + def keep_training(lr, epoch_itr, trainer): + # Train until the learning rate gets too small + max_epoch = args.max_epoch or math.inf + max_update = args.max_update or math.inf + lr, n_updates = trainer.get_lr(), trainer.get_num_updates() + return ((lr > args.min_lr) and (epoch_itr.epoch < max_epoch) and + (n_updates < max_update)) + + if xu.getenv_as('XLA_USE_BF16', bool, False): + xm.master_print( + 'WARNING: bfloat16 is enabled. Note that fairseq meters such as ' + 'loss will accumulate the numerator, and increment the denominator.' + ' Due to lack of precision in higher numbers in bfloat16, these ' + 'meters will report invalid values after a while.', + fd=sys.stderr + ) + + xm.master_print('Args', fd=sys.stderr) + for key, val in args.__dict__.items(): + xm.master_print('\t{} {}'.format(key, val), fd=sys.stderr) + # `xla_device` is `torch.device` and `device` is `str` + xla_device = xm.xla_device() + task, trainer, model, epoch_itr, lr, valid_subsets, device = prepare_task( + args, xla_device) + + train_meter = StopwatchMeter() + train_meter.start() + while keep_training(lr, epoch_itr, trainer): + # TRAINING + xm.master_print('Epoch {} begin {}'.format(epoch_itr.epoch + 1, now())) + progress = initialize_loader_for_epoch( + args, epoch_itr, prefix='training on {}'.format(device), + ) + para_loader = pl.ParallelLoader(progress, [device]) + tracker = train_loop_fn( + device, trainer, para_loader.per_device_loader(device), + len(progress) - 1 + ) + stats = get_training_stats(trainer) + progress.print(stats, tag=device) + print( + 'Device {} Epoch {} Tracker Rate={:.2f}, GlobalRate={:.2f}'.format( + device, epoch_itr.epoch, tracker.rate(), tracker.global_rate() + ) + ) + xm.master_print('Epoch {} end {}'.format(epoch_itr.epoch, now())) + if args.metrics_debug: + xm.master_print(met.metrics_report()) + + # VALIDATION + if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0: + valid_losses = validate_subsets( + args, device, trainer, task, epoch_itr, valid_subsets + ) + + # only use average first validation loss from the first device + # to update the learning rate + vloss = valid_losses[valid_subsets[0]].item() + xm.master_print('old learning rate: {}'.format(lr)) + lr = trainer.lr_step(epoch_itr.epoch, vloss) + xm.master_print('new learning rate: {}'.format(lr)) + else: + vloss = None + + # save checkpoint + if epoch_itr.epoch % args.save_interval == 0: + checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, vloss) + + if args.metrics_debug: + xm.master_print(met.metrics_report()) + + train_meter.stop() + xm.master_print('| done training in {:.1f} seconds'.format(train_meter.sum)) + assert_on_losses(args, trainer) + + +def assert_on_losses(args, trainer): + if xu.getenv_as('XLA_USE_BF16', bool, False): + # XXX: loss values are meaningless in this case due to precision in bf16 + return + valid_loss = args.target_valid_loss or math.inf + train_loss = args.target_train_loss or math.inf + assert valid_loss > trainer.meters['valid_loss'].avg.item() + assert train_loss > trainer.meters['train_loss'].avg.item() + + +def cli_main_gpu(args): if args.distributed_init_method is None: distributed_utils.infer_init_method(args) @@ -339,5 +595,81 @@ def cli_main(): main(args) +def get_args(): + parser = options.get_training_parser() + # TPU: need to control certain flags here. 
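As a point of reference for the training loop wired up above, the underlying torch_xla pattern (one spawned process per core, a per-device ParallelLoader, and xm.optimizer_step performing the gradient all-reduce before the optimizer step) looks roughly like the minimal sketch below. It is hypothetical and independent of fairseq: ToyModel and make_loader are placeholders, not symbols from this patch.

    import torch
    import torch_xla.core.xla_model as xm
    import torch_xla.distributed.parallel_loader as pl
    import torch_xla.distributed.xla_multiprocessing as xmp

    def _toy_mp_fn(index, flags):
        device = xm.xla_device()                      # one XLA device per process
        model = ToyModel().to(device)                 # placeholder model
        optimizer = torch.optim.SGD(model.parameters(), lr=flags['lr'])
        loader = make_loader(flags)                   # placeholder DataLoader
        para_loader = pl.ParallelLoader(loader, [device])
        for step, (x, y) in enumerate(para_loader.per_device_loader(device)):
            optimizer.zero_grad()
            loss = torch.nn.functional.cross_entropy(model(x), y)
            loss.backward()
            xm.optimizer_step(optimizer)              # all-reduce grads, then step

    if __name__ == '__main__':
        xmp.spawn(_toy_mp_fn, args=({'lr': 0.1},), nprocs=8)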
+ # e.g. parallelization needs to be suppressed and deferred to torch_xla flags + # e.g. input tensor shapes need to be controlled via --input_shapes + parser.add_argument( + '--input_shapes', + nargs='*', + default=None, + help=( + 'This is used to specify batches and pad lengths. Ex: ' + '`--input_shapes 256x32 512x16` will produce batches w/ 256 ' + 'sentences padded to length 32, or 512 sentences padded to length ' + '16. Including too many input shapes will cause graph recompiles and' + ' degrade performance. On the other extreme, including 1 shape may ' + 'waste a ton of flops, since batches may contain a lot of pad ' + 'indices on average. Note that the max pad length in this arg will ' + 'be used as `--max-source-positions`')) + parser.add_argument('--log_steps', type=int, default=20) + parser.add_argument('--num_cores', type=int, default=8) + parser.add_argument('--metrics_debug', action='store_true') + parser.add_argument('--use_gpu', action='store_true') + parser.add_argument('--target_train_loss', type=float, default=None) + parser.add_argument('--target_valid_loss', type=float, default=None) + parser.add_argument('--suppress_loss_report', action='store_true') + args = options.parse_args_and_arch(parser) + return args + + +def adjust_args_tpu(args): + if args.fp16: + raise RuntimeError( + '--fp16 was provided, this is controlled by env var XLA_USE_BF16') + print('suppressing distributed_init args for GPU', file=sys.stderr) + args.distributed_rank = 0 + args.distributed_world_size = 1 + args.distributed_init_method = None + if args.input_shapes is None: + raise RuntimeError( + 'Please specify batches and pad lengths using ' + '--input_shapes. Ex: `--input_shapes 256x32 512x16` .' + 'Please refer to the description of the --input_shape' + ' arg in --help' + ) + gpu_input_shape_args = ['max_sentences', 'max_sentences_valid', 'max_tokens'] + nonnull_gpu_input_shape_args = [ + arg for arg in gpu_input_shape_args if getattr(args, arg) is not None + ] + if nonnull_gpu_input_shape_args: + errmsg = ( + 'On TPUs, please control input shapes ' + 'using `--input_shapes`. Any non-null arg in {} will trigger' + ' this error.' + ).format(gpu_input_shape_args) + raise RuntimeError(errmsg) + + args.input_shapes = parse_input_shapes(args.input_shapes) + # XXX (taylanbil): do we ever have more than 2 dimensions in fairseq? 
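For a concrete, hypothetical illustration of what this flag produces (the shape values below are arbitrary):

    # Each entry is '<batch_size>x<num_tokens>'; '*' is accepted as a separator too.
    shapes = parse_input_shapes(['128x128', '256*64', '512x32'])
    # Result, sorted by sequence length (shortest first):
    #   [[512, 32], [256, 64], [128, 128]]
    # Batch sizes must strictly decrease as sequence lengths increase; otherwise
    # the assertion in parse_input_shapes raises with the error message shown above.
    # The longest padded length, shapes[-1][1], then serves as
    # --max-source-positions, which is what the assignment just below does.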
+ args.max_source_positions = args.input_shapes[-1][1] + return args + + +def cli_main(): + args = get_args() + if args.use_gpu: + return cli_main_gpu(args) + # From here on out we are in TPU context + args = adjust_args_tpu(args) + xmp.spawn(_mp_fn, args=(args,), nprocs=args.num_cores) + + +def _mp_fn(index, args): + torch.set_default_tensor_type('torch.FloatTensor') + main_tpu(args) + + if __name__ == '__main__': cli_main() From f17ad03c547c95e5f0db0f2a4a9ecef7ed98faa8 Mon Sep 17 00:00:00 2001 From: Taylan Bilal Date: Thu, 14 Nov 2019 19:15:39 +0000 Subject: [PATCH 207/213] send meters to device --- train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/train.py b/train.py index a5f07b7986..b4e13453cb 100644 --- a/train.py +++ b/train.py @@ -345,6 +345,7 @@ def prepare_task(args, xla_device): extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer) trainer.args.distributed_rank = 0 trainer.args.distributed_world_size = 1 + trainer.meters_to_device(xla_device) valid_subsets = args.valid_subset.split(',') ordinal = xm.get_ordinal(defval=-1) device_str = ( From 734b14f0b2cc58282e57d568365cbfa447950b64 Mon Sep 17 00:00:00 2001 From: Taylan Bilal Date: Sat, 16 Nov 2019 00:08:41 +0000 Subject: [PATCH 208/213] Revert inplace masked_fill_s so convergence occurs --- fairseq/criterions/label_smoothed_cross_entropy.py | 4 ++-- fairseq/modules/multihead_attention.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fairseq/criterions/label_smoothed_cross_entropy.py b/fairseq/criterions/label_smoothed_cross_entropy.py index acef05d1b6..a9d7828dcd 100644 --- a/fairseq/criterions/label_smoothed_cross_entropy.py +++ b/fairseq/criterions/label_smoothed_cross_entropy.py @@ -17,8 +17,8 @@ def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=None, reduce=T smooth_loss = -lprobs.sum(dim=-1, keepdim=True) if ignore_index is not None: non_pad_mask = target.ne(ignore_index) - nll_loss.masked_fill_(~non_pad_mask, 0.0) - smooth_loss.masked_fill_(~non_pad_mask, 0.0) + nll_loss = nll_loss.masked_fill_(~non_pad_mask, 0.0) + smooth_loss = smooth_loss.masked_fill_(~non_pad_mask, 0.0) else: nll_loss = nll_loss.squeeze(-1) smooth_loss = smooth_loss.squeeze(-1) diff --git a/fairseq/modules/multihead_attention.py b/fairseq/modules/multihead_attention.py index 95787b1fda..656dcba244 100644 --- a/fairseq/modules/multihead_attention.py +++ b/fairseq/modules/multihead_attention.py @@ -248,7 +248,7 @@ def forward( # don't attend to padding symbols attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) attn_weights = attn_weights.transpose(0, 2) - attn_weights.masked_fill_(key_padding_mask, float('-inf')) + attn_weights = attn_weights.masked_fill(key_padding_mask, float('-inf')) attn_weights = attn_weights.transpose(0, 2) attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) From 043b6a968daa76aa5727d295361ee40ece44a19a Mon Sep 17 00:00:00 2001 From: Taylan Bilal Date: Sat, 16 Nov 2019 01:27:39 +0000 Subject: [PATCH 209/213] git wtf --- fairseq/checkpoint_utils.py | 8 -------- fairseq/data/data_utils.py | 16 ---------------- train.py | 27 --------------------------- 3 files changed, 51 deletions(-) diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py index 31ed11ebd1..a582acdc2a 100644 --- a/fairseq/checkpoint_utils.py +++ b/fairseq/checkpoint_utils.py @@ -109,14 +109,6 @@ def load_checkpoint(args, trainer, data_selector=None): else: checkpoint_path = args.restore_file - -def load_checkpoint(args, trainer): - """Load a 
checkpoint and restore the training iterator.""" - # only one worker should attempt to create the required dir - if args.distributed_rank == 0 or xm.is_master_ordinal(): - os.makedirs(args.save_dir, exist_ok=True) - - checkpoint_path = get_checkpoint_path(args) extra_state = trainer.load_checkpoint( checkpoint_path, args.reset_optimizer, diff --git a/fairseq/data/data_utils.py b/fairseq/data/data_utils.py index 26ec7b3fdb..790a1cf5c4 100644 --- a/fairseq/data/data_utils.py +++ b/fairseq/data/data_utils.py @@ -262,22 +262,6 @@ def batch_by_size_tpu( break -def batch_by_size_tpu( - indices, num_tokens_fn, input_shapes -): - batches = [[] for _ in input_shapes] - for idx in indices: - sample_len = num_tokens_fn(idx) - for j, (batch_size, padlen) in enumerate(input_shapes): - if padlen < sample_len: - continue - batches[j].append(idx) - if len(batches[j]) == batch_size: - yield batches[j] - batches[j] = [] - break - - def process_bpe_symbol(sentence: str, bpe_symbol: str): if bpe_symbol == 'sentencepiece': sentence = sentence.replace(' ', '').replace('\u2581', ' ').strip() diff --git a/train.py b/train.py index 097baddf17..7e35ae8082 100644 --- a/train.py +++ b/train.py @@ -10,7 +10,6 @@ import collections import math import sys -import os import random from datetime import datetime @@ -34,32 +33,6 @@ fb_pathmgr_registerd = False -def initialize_loader_for_epoch(args, epoch_itr, prefix='training'): - # Update parameters every N batches - if epoch_itr.epoch <= len(args.update_freq): - update_freq = args.update_freq[epoch_itr.epoch - 1] - else: - update_freq = args.update_freq[-1] - - # Initialize data iterator - itr = epoch_itr.next_epoch_itr( - fix_batches_to_gpus=False, shuffle=(epoch_itr.epoch >= args.curriculum)) - itr = iterators.GroupedIterator(itr, update_freq) - progress = progress_bar.build_progress_bar( - args, itr, epoch_itr.epoch, prefix=prefix, no_progress_bar='simple') - return progress - - -def print_model_criterion(model, criterion, args): - print(model) - print('| model {}, criterion {}'.format(args.arch, - criterion.__class__.__name__)) - print('| num. model params: {} (num. 
trained: {})'.format( - sum(p.numel() for p in model.parameters()), - sum(p.numel() for p in model.parameters() if p.requires_grad), - )) - - def initialize_loader_for_epoch(args, epoch_itr, prefix='training'): # Update parameters every N batches if epoch_itr.epoch <= len(args.update_freq): From 12aaf5498be993f7713dc8b19818b1dad59b021f Mon Sep 17 00:00:00 2001 From: Taylan Bilal Date: Mon, 18 Nov 2019 17:56:31 +0000 Subject: [PATCH 210/213] Clean up comments, unused imports, and reuse var in checkpoint saving --- fairseq/checkpoint_utils.py | 5 +++-- fairseq/models/transformer.py | 4 ---- fairseq/modules/multihead_attention.py | 5 ----- train.py | 5 +---- 4 files changed, 4 insertions(+), 15 deletions(-) diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py index a582acdc2a..aa8160f231 100644 --- a/fairseq/checkpoint_utils.py +++ b/fairseq/checkpoint_utils.py @@ -67,13 +67,14 @@ def is_better(a, b): if len(checkpoints) > 0: trainer.save_checkpoint(checkpoints[0], extra_state) + do_copy = getattr(args, 'use_gpu', True) or xm.is_master_ordinal() for cp in checkpoints[1:]: try: from fairseq.fb_pathmgr import fb_pathmgr - if getattr(args, 'use_gpu', True) or xm.is_master_ordinal(): + if do_copy: fb_pathmgr.copy(checkpoints[0], cp, True) except (ModuleNotFoundError, ImportError): - if getattr(args, 'use_gpu', True) or xm.is_master_ordinal(): + if do_copy: shutil.copyfile(checkpoints[0], cp) write_timer.stop() print('| saved checkpoint {} (epoch {} @ {} updates) (writing took {} seconds)'.format( diff --git a/fairseq/models/transformer.py b/fairseq/models/transformer.py index bb30db524c..bc7c0432a0 100644 --- a/fairseq/models/transformer.py +++ b/fairseq/models/transformer.py @@ -355,8 +355,6 @@ def forward(self, src_tokens, src_lengths, cls_input=None, return_all_hiddens=Fa # compute padding mask encoder_padding_mask = src_tokens.eq(self.padding_idx) - #if not encoder_padding_mask.any(): - # encoder_padding_mask = None encoder_states = [] if return_all_hiddens else None @@ -596,8 +594,6 @@ def extract_features( x = x.transpose(0, 1) self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) - # if not self_attn_padding_mask.any() and not self.cross_self_attention: - # self_attn_padding_mask = None # decoder layers attn = None diff --git a/fairseq/modules/multihead_attention.py b/fairseq/modules/multihead_attention.py index 656dcba244..eefe12cee2 100644 --- a/fairseq/modules/multihead_attention.py +++ b/fairseq/modules/multihead_attention.py @@ -65,12 +65,7 @@ def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias= self.onnx_trace = False - # XXX: (taylanbil) try F.multi... 
self.enable_torch_version = False - # if hasattr(F, "multi_head_attention_forward"): - # self.enable_torch_version = True - # else: - # self.enable_torch_version = False def prepare_for_onnx_export_(self): self.onnx_trace = True diff --git a/train.py b/train.py index 7e35ae8082..482813db83 100644 --- a/train.py +++ b/train.py @@ -17,7 +17,6 @@ import torch import torch_xla import torch_xla.debug.metrics as met -import torch_xla.distributed.data_parallel as dp import torch_xla.distributed.parallel_loader as pl import torch_xla.utils.utils as xu import torch_xla.core.xla_model as xm @@ -439,7 +438,7 @@ def valid_loop_fn(args, device, trainer, loader, last_batch_index): def validate_subset(args, device, trainer, task, epoch_itr, subset): xm.master_print('Validating the subset "{}"'.format(subset)) # Initialize data iterator - # XXX: we're not sharding the validation set + # we're not sharding the validation set itr = task.get_batch_iterator( dataset=task.dataset(subset), max_tokens=args.max_tokens, @@ -553,7 +552,6 @@ def keep_training(lr, epoch_itr, trainer): def assert_on_losses(args, trainer): if xu.getenv_as('XLA_USE_BF16', bool, False): - # XXX: loss values are meaningless in this case due to precision in bf16 return valid_loss = args.target_valid_loss or math.inf train_loss = args.target_train_loss or math.inf @@ -652,7 +650,6 @@ def adjust_args_tpu(args): raise RuntimeError(errmsg) args.input_shapes = parse_input_shapes(args.input_shapes) - # XXX (taylanbil): do we ever have more than 2 dimensions in fairseq? args.max_source_positions = args.input_shapes[-1][1] return args From 8de1826ca046ac3eeff98b07fa2821fae7eff546 Mon Sep 17 00:00:00 2001 From: Taylan Bilal Date: Mon, 18 Nov 2019 21:56:10 +0000 Subject: [PATCH 211/213] Added comments to various places of tpu related code change, and fixed the multihead attention switch case --- fairseq/checkpoint_utils.py | 3 +++ fairseq/criterions/label_smoothed_cross_entropy.py | 5 +++++ fairseq/data/data_utils.py | 9 +++++++++ fairseq/models/transformer.py | 4 ++++ fairseq/modules/multihead_attention.py | 10 +++++++++- 5 files changed, 30 insertions(+), 1 deletion(-) diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py index aa8160f231..a6a61871f5 100644 --- a/fairseq/checkpoint_utils.py +++ b/fairseq/checkpoint_utils.py @@ -68,6 +68,7 @@ def is_better(a, b): if len(checkpoints) > 0: trainer.save_checkpoint(checkpoints[0], extra_state) do_copy = getattr(args, 'use_gpu', True) or xm.is_master_ordinal() + # tpu-comment: copy the saved checkpoint if master ordinal only for cp in checkpoints[1:]: try: from fairseq.fb_pathmgr import fb_pathmgr @@ -102,6 +103,8 @@ def is_better(a, b): def load_checkpoint(args, trainer, data_selector=None): """Load a checkpoint and restore the training iterator.""" # only one worker should attempt to create the required dir + # tpu-comment: master ordinal check is required as distributed rank is + # zero for 8 devices. 
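The ordinal checks documented here follow a general torch_xla convention: with one process per TPU core, host-level side effects such as creating the save directory, copying checkpoint files, or printing should be performed by the master ordinal only. A minimal, hypothetical sketch of that pattern (save_extra_copies and its arguments are illustrative, not fairseq APIs):

    import os
    import shutil
    import torch_xla.core.xla_model as xm

    def save_extra_copies(save_dir, checkpoints):
        # Only one process per host should touch the filesystem; this assumes
        # checkpoints[0] has already been written by the trainer.
        if xm.is_master_ordinal():
            os.makedirs(save_dir, exist_ok=True)
            for cp in checkpoints[1:]:
                shutil.copyfile(checkpoints[0], cp)
        xm.master_print('| copied {} checkpoint(s)'.format(max(len(checkpoints) - 1, 0)))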
if args.distributed_rank == 0 or xm.is_master_ordinal(): os.makedirs(args.save_dir, exist_ok=True) diff --git a/fairseq/criterions/label_smoothed_cross_entropy.py b/fairseq/criterions/label_smoothed_cross_entropy.py index a9d7828dcd..ebba2e3bf3 100644 --- a/fairseq/criterions/label_smoothed_cross_entropy.py +++ b/fairseq/criterions/label_smoothed_cross_entropy.py @@ -17,6 +17,8 @@ def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=None, reduce=T smooth_loss = -lprobs.sum(dim=-1, keepdim=True) if ignore_index is not None: non_pad_mask = target.ne(ignore_index) + # tpu-comment: masked_selecting using non-pad-mask causes compilations + # hence, we fill w/ 0s and sum nll_loss = nll_loss.masked_fill_(~non_pad_mask, 0.0) smooth_loss = smooth_loss.masked_fill_(~non_pad_mask, 0.0) else: @@ -57,6 +59,9 @@ def forward(self, model, sample, reduce=True): loss, nll_loss = self.compute_loss(model, net_output, sample, reduce=reduce) sample_size = sample['target'].size(0) if self.args.sentence_avg else sample['ntokens'] logging_output = { + # tpu-comment: removing the item() calls here since it adds 2 + # aten::_local_scalar_dense's that slow the training down. + # the returned loss values are scalar tensors if `reduce` 'loss': loss.data, 'nll_loss': nll_loss.data, 'ntokens': sample['ntokens'], diff --git a/fairseq/data/data_utils.py b/fairseq/data/data_utils.py index 790a1cf5c4..2eaeedbcbf 100644 --- a/fairseq/data/data_utils.py +++ b/fairseq/data/data_utils.py @@ -249,6 +249,15 @@ def batch_by_size( def batch_by_size_tpu( indices, num_tokens_fn, input_shapes ): + """ + tpu-comment: varying input shapes cause compilations and slow TPU training. + There is a trade-off between + * allow varying input shapes and lose time to compilations + * fix input shapes by padding and lose time by wasting flops + + It is generally up to experimentation to determine the optimal input_shapes + parameter that results in the best performance. + """ batches = [[] for _ in input_shapes] for idx in indices: sample_len = num_tokens_fn(idx) diff --git a/fairseq/models/transformer.py b/fairseq/models/transformer.py index bc7c0432a0..1a6354772b 100644 --- a/fairseq/models/transformer.py +++ b/fairseq/models/transformer.py @@ -355,6 +355,10 @@ def forward(self, src_tokens, src_lengths, cls_input=None, return_all_hiddens=Fa # compute padding mask encoder_padding_mask = src_tokens.eq(self.padding_idx) + # tpu-comment: the code snippet + # `if not encoder_padding_mask.any(): encoder_padding_mask = None` + # causes .item() calls for tpu, so it's not worth having it in here. + encoder_states = [] if return_all_hiddens else None diff --git a/fairseq/modules/multihead_attention.py b/fairseq/modules/multihead_attention.py index eefe12cee2..8af1d6c93e 100644 --- a/fairseq/modules/multihead_attention.py +++ b/fairseq/modules/multihead_attention.py @@ -39,6 +39,10 @@ def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias= assert not self.self_attention or self.qkv_same_dim, 'Self-attention requires query, key and ' \ 'value to be of the same size' + # tpu-comment: The following `in_proj` code is from an older upstream + # branch in fairseq. 
The newer version where we don't handle + # `selg.qkv_same_dim` case separately causes a ~10% regression in + # performance if self.qkv_same_dim: self.in_proj_weight = Parameter(torch.Tensor(3 * embed_dim, embed_dim)) else: @@ -65,6 +69,7 @@ def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias= self.onnx_trace = False + # tpu-comment: torch version of multihead attention is slower on TPUs. self.enable_torch_version = False def prepare_for_onnx_export_(self): @@ -162,7 +167,10 @@ def forward( v = self.in_proj_v(key) else: - raise + q = self.q_proj(query) + k = self.k_proj(key) + v = self.v_proj(value) + q *= self.scaling if self.bias_k is not None: From 5120a2bb300373deda6c1252998efcd2704b3ff2 Mon Sep 17 00:00:00 2001 From: Taylan Bilal Date: Mon, 18 Nov 2019 23:30:28 +0000 Subject: [PATCH 212/213] Added comments to various places of tpu related code change, and fixed the multihead attention switch case --- fairseq/modules/multihead_attention.py | 4 ++++ fairseq/tasks/fairseq_task.py | 2 ++ fairseq/trainer.py | 15 ++++++++++----- train.py | 7 ++++++- 4 files changed, 22 insertions(+), 6 deletions(-) diff --git a/fairseq/modules/multihead_attention.py b/fairseq/modules/multihead_attention.py index 8af1d6c93e..fff4fdcded 100644 --- a/fairseq/modules/multihead_attention.py +++ b/fairseq/modules/multihead_attention.py @@ -249,6 +249,10 @@ def forward( if key_padding_mask is not None: # don't attend to padding symbols + # tpu-comment: The following masked_fill replaces the upstream code + # with a mathematically equivalent operation where we rely on + # transposing and pytorch's broadcasting. Result is a ~35% lift in + # performance attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) attn_weights = attn_weights.transpose(0, 2) attn_weights = attn_weights.masked_fill(key_padding_mask, float('-inf')) diff --git a/fairseq/tasks/fairseq_task.py b/fairseq/tasks/fairseq_task.py index 2e524f875b..c64bedd59d 100644 --- a/fairseq/tasks/fairseq_task.py +++ b/fairseq/tasks/fairseq_task.py @@ -153,6 +153,8 @@ def get_batch_iterator( required_batch_size_multiple=required_batch_size_multiple, ) else: + # tpu-comment: TPUs suffer from liberally varying input shapes. + # here, we create batches by limiting the input shape variability batch_sampler = data_utils.batch_by_size_tpu( indices, dataset.num_tokens, getattr(self.args, 'input_shapes', None) diff --git a/fairseq/trainer.py b/fairseq/trainer.py index f2a8b10c4f..b4656c2ae8 100644 --- a/fairseq/trainer.py +++ b/fairseq/trainer.py @@ -421,8 +421,8 @@ def maybe_no_sync(): # take an optimization step - # xla takes care of optimization step using - # torch_xla.xla_model.optimizer_step + # tpu-comment: xla takes care of optimization step using + # `torch_xla.core.xla_model.optimizer_step` # so skip optimization step here in that case if not self.xla: self.optimizer.step() @@ -439,8 +439,8 @@ def maybe_no_sync(): self.meters['wpb'].update(ntokens) self.meters['bsz'].update(nsentences) self.meters['gnorm'].update(grad_norm) - # the comparison below introduces too many .item() calls and slows - # down tpu + # tpu-comment: the comparison below introduces too many .item() + # calls and slows down tpu self.meters['clip'].update( 0. #1. if grad_norm > self.args.clip_norm and self.args.clip_norm > 0 else 0. @@ -586,7 +586,12 @@ def get_meter(self, name): return self.meters[name] def meters_to_device(self, device): - """Send meters' values to given device. 
Useful for TPU's.""" + """ + tpu-comment: Send meters' values to given device. + Due to the need of reducing .item() calls, meters values sometimes + live on the device. When loading a checkpoint, this requires sending + those values to device. + """ for meter in self.meters.values(): for key, val in vars(meter).items(): if isinstance(val, torch.Tensor): diff --git a/train.py b/train.py index 482813db83..d08df1132c 100644 --- a/train.py +++ b/train.py @@ -15,6 +15,7 @@ import numpy as np import torch + import torch_xla import torch_xla.debug.metrics as met import torch_xla.distributed.parallel_loader as pl @@ -377,7 +378,8 @@ def prepare_task(args, xla_device): # Load the latest checkpoint if one is available and restore the # corresponding train iterator - # set distributed args here to shard data + # we overwrite distributed args here to shard data using torch_xla's + # distributed training. trainer.args.distributed_rank = xm.get_ordinal() trainer.args.distributed_world_size = xm.xrt_world_size() extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer) @@ -393,6 +395,9 @@ def prepare_task(args, xla_device): return task, trainer, model, epoch_itr, lr, valid_subsets, device_str def train_loop_fn(device, trainer, loader, last_batch_index): + """ + This is the main training loop. It trains for 1 epoch. + """ stats, log_output, tracker = None, None, xm.RateTracker() for i, samples in enumerate(loader): if i == last_batch_index: From bbfeec92c91d6252c6e46a806fbd496a610e77f7 Mon Sep 17 00:00:00 2001 From: Taylan Bilal Date: Mon, 18 Nov 2019 23:40:10 +0000 Subject: [PATCH 213/213] More documentation for sequence padding --- fairseq/data/data_utils.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fairseq/data/data_utils.py b/fairseq/data/data_utils.py index 2eaeedbcbf..0058d490df 100644 --- a/fairseq/data/data_utils.py +++ b/fairseq/data/data_utils.py @@ -27,6 +27,19 @@ def infer_language_pair(path): def get_pad_size(values, input_shapes): + """ + Returns the pad size. + + On GPUs, pad to the max sequence length of a given input + On TPUs, that would cause a lot of compilations and slow training. + Thus, we pad to the "next sequence length as specified in the + `input_shapes` argument. + We assume `input_shapes` is an array of the form: + [[batchsize0, seqlen0], [batchsize1, seqlen1], ...] + sorted from shortest to longest sequence lengths, and unique in batch_sizes + + e.g. [[512, 32], [256, 64], [128, 128]] + """ if input_shapes is None: return max(v.size(0) for v in values) for batch_size, padlen in input_shapes:
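As a hypothetical illustration of the bucketed padding this docstring describes (values chosen to match the example shapes above):

    # With input_shapes = [[512, 32], [256, 64], [128, 128]]:
    #   a batch whose longest sequence has 20 tokens is padded to length 32
    #   a batch whose longest sequence has 40 tokens is padded to length 64
    #   a batch whose longest sequence has 100 tokens is padded to length 128
    # On GPUs (input_shapes is None) the pad size is simply the length of the
    # longest sequence in the batch, so 40 stays 40.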