From c9fb0c91bdfda8190b421a30745b0f337ea31750 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Fri, 12 Jun 2020 01:58:51 +0000 Subject: [PATCH 01/30] remove transformer layer ID from the top module --- deepspeed/pt/deepspeed_cuda.py | 7 +++++-- docs/_tutorials/bert-pretraining.md | 4 ++-- docs/_tutorials/transformer_kernel.md | 4 ++-- tests/unit/test_cuda_backward.py | 5 ++--- tests/unit/test_cuda_forward.py | 5 ++--- 5 files changed, 13 insertions(+), 12 deletions(-) diff --git a/deepspeed/pt/deepspeed_cuda.py b/deepspeed/pt/deepspeed_cuda.py index 3b86f06cc383..dcef515a4ba5 100755 --- a/deepspeed/pt/deepspeed_cuda.py +++ b/deepspeed/pt/deepspeed_cuda.py @@ -403,11 +403,14 @@ class DeepSpeedTransformerLayer(nn.Module): initial_biases: Optional: Only used for unit test """ - def __init__(self, layer_id, config, initial_weights=None, initial_biases=None): + layer_id = 0 + + def __init__(self, config, initial_weights=None, initial_biases=None): super(DeepSpeedTransformerLayer, self).__init__() self.config = config - self.config.layer_id = layer_id + self.config.layer_id = DeepSpeedTransformerLayer.layer_id + DeepSpeedTransformerLayer.layer_id = DeepSpeedTransformerLayer.layer_id + 1 print("DeepSpeed Transformer config is ", self.config.__dict__) diff --git a/docs/_tutorials/bert-pretraining.md b/docs/_tutorials/bert-pretraining.md index 9b15322a0f45..e23a80045429 100755 --- a/docs/_tutorials/bert-pretraining.md +++ b/docs/_tutorials/bert-pretraining.md @@ -284,10 +284,10 @@ transformer layers using DeepSpeed transformer kernel as below. gelu_checkpoint=args.gelu_checkpoint, stochastic_mode=True) - self.layer = nn.ModuleList([copy.deepcopy(DeepSpeedTransformerLayer(i, cuda_config)) for i in range(config.num_hidden_layers)]) + layer = DeepSpeedTransformerLayer(cuda_config) else: layer = BertLayer(config) - self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) + self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) ``` All configuration settings come from the DeepSpeed configuration file and command arguments and thus we must pass the `args` variable to here in this model. 
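The change above removes the explicit layer index from the constructor: each `DeepSpeedTransformerLayer` now numbers itself through the class-level `layer_id` counter, which is copied into `config.layer_id` and then incremented on every instantiation. A plain-Python stand-in of that pattern (illustrative only, not the DeepSpeed class; the `CountedLayer` name and dict-based config are invented for the sketch):

```python
class CountedLayer:
    # Class-level counter, mirroring DeepSpeedTransformerLayer.layer_id above.
    layer_id = 0

    def __init__(self, config):
        self.config = dict(config)
        # Each newly constructed layer claims the current index, then advances it.
        self.config["layer_id"] = CountedLayer.layer_id
        CountedLayer.layer_id += 1


layers = [CountedLayer({"hidden_size": 1024}) for _ in range(3)]
print([layer.config["layer_id"] for layer in layers])  # prints [0, 1, 2]
```

Because the counter advances only at construction time, `copy.deepcopy` of an already-built layer reuses the id it was assigned rather than drawing a new one.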
diff --git a/docs/_tutorials/transformer_kernel.md b/docs/_tutorials/transformer_kernel.md index ce5955e0fe6f..4fdce21095c3 100755 --- a/docs/_tutorials/transformer_kernel.md +++ b/docs/_tutorials/transformer_kernel.md @@ -40,8 +40,8 @@ config = DeepSpeedTransformerConfig(batch_size = 64, normalize_invertible=False, gelu_checkpoint=False) self.layer = nn.ModuleList([ - copy.deepcopy(DeepSpeedTransformerLayer(i, cuda_config)) - for i in range(config.num_hidden_layers) + copy.deepcopy(DeepSpeedTransformerLayer(cuda_config)) + for _ in range(config.num_hidden_layers) ]) ``` ### Transformer kernel Parameters diff --git a/tests/unit/test_cuda_backward.py b/tests/unit/test_cuda_backward.py index bf0e5955d62c..95c220e71650 100755 --- a/tests/unit/test_cuda_backward.py +++ b/tests/unit/test_cuda_backward.py @@ -79,11 +79,10 @@ def __init__(self, config, weights, biases): super(DSEncoder, self).__init__() self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) self.layer = nn.ModuleList([ - copy.deepcopy(DeepSpeedTransformerLayer(i, - config, + copy.deepcopy(DeepSpeedTransformerLayer(config, weights, biases)) - for i in range(config.num_hidden_layers) + for _ in range(config.num_hidden_layers) ]) self.grads = [] self.pre_or_post = config.pre_layer_norm diff --git a/tests/unit/test_cuda_forward.py b/tests/unit/test_cuda_forward.py index 4e995a34448f..c359b7f6e36a 100755 --- a/tests/unit/test_cuda_forward.py +++ b/tests/unit/test_cuda_forward.py @@ -44,11 +44,10 @@ def __init__(self, config, weights, biases): super(DSEncoder, self).__init__() self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) self.layer = nn.ModuleList([ - copy.deepcopy(DeepSpeedTransformerLayer(i, - config, + copy.deepcopy(DeepSpeedTransformerLayer(config, weights, biases)) - for i in range(config.num_hidden_layers) + for _ in range(config.num_hidden_layers) ]) self.grads = [] self.pre_or_post = config.pre_layer_norm From 6254e461d68b0bc2ddabd663cdbb29349a89e4f5 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Fri, 12 Jun 2020 04:44:43 +0000 Subject: [PATCH 02/30] updating docstring --- deepspeed/pt/deepspeed_cuda.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deepspeed/pt/deepspeed_cuda.py b/deepspeed/pt/deepspeed_cuda.py index dcef515a4ba5..1fe08d0a24e8 100755 --- a/deepspeed/pt/deepspeed_cuda.py +++ b/deepspeed/pt/deepspeed_cuda.py @@ -393,10 +393,10 @@ def backward(ctx, grad_output): class DeepSpeedTransformerLayer(nn.Module): """Initialize the DeepSpeed Transformer Layer. + Static variable: + layer_id: The layer-index counter starting from 0 and incrementing by 1 every time a layer object is instantiated, + e.g. if a model has 24 transformer layers, layer_id goes from 0 to 23. Arguments: - layer_id: The layer index starting from 0, e.g. 
if model has 24 transformer layers, - layer_id will be 0,1,2...23 when each layer object is instantiated - config: An object of DeepSpeedTransformerConfig initial_weights: Optional: Only used for unit test From 3eab150b909fdfa02607ec716cf3049554d5c22d Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Fri, 12 Jun 2020 07:41:43 +0000 Subject: [PATCH 03/30] add inject --- deepspeed/pt/inject.py | 79 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 deepspeed/pt/inject.py diff --git a/deepspeed/pt/inject.py b/deepspeed/pt/inject.py new file mode 100644 index 000000000000..5e5d433573c0 --- /dev/null +++ b/deepspeed/pt/inject.py @@ -0,0 +1,79 @@ +import copy +import torch +from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig, DeepSpeedConfig + + +def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, seed): + for name, child in model.named_children(): + if isinstance(child, layer_obj): + print('REPLACING BertLayer') + + cuda_config = DeepSpeedTransformerConfig( + batch_size=micro_batch_size, + max_seq_length=max_seq_length, + hidden_size=config.hidden_size, + heads=config.num_attention_heads, + attn_dropout_ratio=config.attention_probs_dropout_prob, + hidden_dropout_ratio=config.hidden_dropout_prob, + num_hidden_layers=config.num_hidden_layers, + initializer_range=config.initializer_range, + seed=seed, + fp16=True, + pre_layer_norm=True) + + new_module = DeepSpeedTransformerLayer(cuda_config) + + #TODO: copy relevant state from child -> new module + + setattr(model, name, copy.deepcopy(new_module)) + + else: + module_inject(layer_obj, + child, + config, + micro_batch_size, + max_seq_length, + seed) + + return model + + +def test_hi(): + from turing.nvidia_modelingpreln import BertConfig as BertConfigPreLN + from turing.nvidia_modelingpreln import BertForQuestionAnswering as BertForQuestionAnsweringPreLN + from turing.nvidia_modelingpreln import BertLayer + bert_model_config = { + "vocab_size_or_config_json_file": 119547, + "hidden_size": 1024, + "num_hidden_layers": 1, + "num_attention_heads": 16, + "intermediate_size": 4096, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02 + } + bert_config = BertConfigPreLN(**bert_model_config) + base_model = BertForQuestionAnsweringPreLN(bert_config, args=None) + + #base_model = LinearStack() + + test_model = copy.deepcopy(base_model) + test_model = module_inject(BertLayer, test_model, bert_config, 4, 384, 1234) + + print('BASE', base_model) + print('TEST', test_model) + + #base_model.eval() + #test_model.eval() + + #test_input = torch.rand(1, base_model.input_dim) + + #base_output = base_model(test_input) + #test_output = test_model(test_input) + # + #assert torch.allclose(base_output, test_output, atol=3e-8) From f6baecbf60146fc6247e71f6a7b5f1285da49210 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Wed, 17 Jun 2020 06:24:12 +0000 Subject: [PATCH 04/30] update inject PoC --- deepspeed/__init__.py | 1 + deepspeed/pt/deepspeed_cuda.py | 19 +++++++++++++++---- deepspeed/pt/inject.py | 24 +++++++++++++++++++++++- 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index b1970ac4ebbe..98fbb15bd9ad 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -8,6 +8,7 @@ from deepspeed.pt.log_utils import logger from 
deepspeed.pt.deepspeed_cuda import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig from deepspeed.pt.deepspeed_config import DeepSpeedConfig +from deepspeed.pt.inject import module_inject import deepspeed.pt.deepspeed_checkpointing as checkpointing diff --git a/deepspeed/pt/deepspeed_cuda.py b/deepspeed/pt/deepspeed_cuda.py index 1fe08d0a24e8..1d318f462325 100755 --- a/deepspeed/pt/deepspeed_cuda.py +++ b/deepspeed/pt/deepspeed_cuda.py @@ -394,7 +394,7 @@ class DeepSpeedTransformerLayer(nn.Module): """Initialize the DeepSpeed Transformer Layer. Static variable: - layer_id: The layer-index counter starting from 0 and incrementing by 1 every time a layer object is instantiated, + layer_id: The layer-index counter starting from 0 and incrementing by 1 every time a layer object is instantiated, e.g. if a model has 24 transformer layers, layer_id goes from 0 to 23. Arguments: config: An object of DeepSpeedTransformerConfig @@ -500,11 +500,22 @@ def init_transformer_weights(self, adjust_init_range=False): self.norm_w.data.fill_(1.0) self.norm_b.data.zero_() - def forward(self, input, input_mask, grads=None): + #def forward(self, input, input_mask, grads=None): + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=False, + ): self.config.training = self.training self.config.is_grad_enabled = torch.is_grad_enabled() - return DeepSpeedTransformerFunction.apply(input, - input_mask, + # disable grad testing for now + grads = None + return DeepSpeedTransformerFunction.apply(hidden_states, + attention_mask, self, grads, self.config.layer_id, diff --git a/deepspeed/pt/inject.py b/deepspeed/pt/inject.py index 5e5d433573c0..09dc3e0b65fc 100644 --- a/deepspeed/pt/inject.py +++ b/deepspeed/pt/inject.py @@ -23,7 +23,29 @@ def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, se new_module = DeepSpeedTransformerLayer(cuda_config) - #TODO: copy relevant state from child -> new module + # copy relevant state from child -> new module + qw = child.attention.self.query.weight + qb = child.attention.self.query.bias + kw = child.attention.self.key.weight + kb = child.attention.self.key.bias + vw = child.attention.self.value.weight + vb = child.attention.self.value.bias + + qkvw = torch.cat((qw, kw, vw), 0) + qkvb = torch.cat((qb, kb, vb), 0) + + new_module.attn_qkvw.data = qkvw + new_module.attn_qkvb.data = qkvb + new_module.attn_ow.data = child.attention.output.dense.weight + new_module.attn_ob.data = child.attention.output.dense.bias + new_module.attn_nw.data = child.attention.output.LayerNorm.weight + new_module.attn_nb.data = child.attention.output.LayerNorm.bias + new_module.inter_w.data = child.intermediate.dense.weight + new_module.inter_b.data = child.intermediate.dense.bias + new_module.output_w.data = child.output.dense.weight + new_module.output_b.data = child.output.dense.bias + new_module.norm_w.data = child.output.LayerNorm.weight + new_module.norm_b.data = child.output.LayerNorm.bias setattr(model, name, copy.deepcopy(new_module)) From b5acaca478819826f1efb19a174fefbdd50b0f40 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Thu, 18 Jun 2020 16:16:08 +0000 Subject: [PATCH 05/30] fix the preln injection --- deepspeed/pt/inject.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) mode change 100644 => 100755 deepspeed/pt/inject.py diff --git a/deepspeed/pt/inject.py b/deepspeed/pt/inject.py old mode 100644 new mode 100755 index 
09dc3e0b65fc..c726492f3a24 --- a/deepspeed/pt/inject.py +++ b/deepspeed/pt/inject.py @@ -3,7 +3,7 @@ from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig, DeepSpeedConfig -def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, seed): +def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, seed, preln, fp16 = True): for name, child in model.named_children(): if isinstance(child, layer_obj): print('REPLACING BertLayer') @@ -18,8 +18,8 @@ def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, se num_hidden_layers=config.num_hidden_layers, initializer_range=config.initializer_range, seed=seed, - fp16=True, - pre_layer_norm=True) + fp16=fp16, + pre_layer_norm=preln) new_module = DeepSpeedTransformerLayer(cuda_config) @@ -38,14 +38,26 @@ def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, se new_module.attn_qkvb.data = qkvb new_module.attn_ow.data = child.attention.output.dense.weight new_module.attn_ob.data = child.attention.output.dense.bias - new_module.attn_nw.data = child.attention.output.LayerNorm.weight - new_module.attn_nb.data = child.attention.output.LayerNorm.bias - new_module.inter_w.data = child.intermediate.dense.weight - new_module.inter_b.data = child.intermediate.dense.bias + if preln: + attention_layerNorm = child.PostAttentionLayerNorm + else: + attention_layerNorm = child.attention.output.LayerNorm + new_module.attn_nw.data = attention_layerNorm.weight + new_module.attn_nb.data = attention_layerNorm.bias + if preln: + intermediate_FF = child.intermediate.dense_act + else: + intermediate_FF = child.intermediate.dense + new_module.inter_w.data = intermediate_FF.weight + new_module.inter_b.data = intermediate_FF.bias new_module.output_w.data = child.output.dense.weight new_module.output_b.data = child.output.dense.bias - new_module.norm_w.data = child.output.LayerNorm.weight - new_module.norm_b.data = child.output.LayerNorm.bias + if preln: + transformer_LayerNorm = child.PreAttentionLayerNorm + else: + transformer_LayerNorm = child.output.LayerNorm + new_module.norm_w.data = transformer_LayerNorm.weight + new_module.norm_b.data = transformer_LayerNorm.bias setattr(model, name, copy.deepcopy(new_module)) @@ -55,7 +67,7 @@ def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, se config, micro_batch_size, max_seq_length, - seed) + seed, preln, fp16) return model From b3f99b73645f47ef5dfb616d9448bb1badd8e672 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Thu, 18 Jun 2020 16:17:03 +0000 Subject: [PATCH 06/30] fix the preln injection --- deepspeed/pt/inject.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/deepspeed/pt/inject.py b/deepspeed/pt/inject.py index c726492f3a24..ab8469113ce1 100755 --- a/deepspeed/pt/inject.py +++ b/deepspeed/pt/inject.py @@ -3,7 +3,14 @@ from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig, DeepSpeedConfig -def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, seed, preln, fp16 = True): +def module_inject(layer_obj, + model, + config, + micro_batch_size, + max_seq_length, + seed, + preln, + fp16=True): for name, child in model.named_children(): if isinstance(child, layer_obj): print('REPLACING BertLayer') @@ -53,7 +60,7 @@ def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, se new_module.output_w.data = child.output.dense.weight new_module.output_b.data = child.output.dense.bias if preln: - 
transformer_LayerNorm = child.PreAttentionLayerNorm + transformer_LayerNorm = child.PreAttentionLayerNorm else: transformer_LayerNorm = child.output.LayerNorm new_module.norm_w.data = transformer_LayerNorm.weight @@ -67,7 +74,9 @@ def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, se config, micro_batch_size, max_seq_length, - seed, preln, fp16) + seed, + preln, + fp16) return model From 04a4d35858a757ef15ee8a18b6b7a56902bc700a Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Fri, 19 Jun 2020 15:42:16 +0000 Subject: [PATCH 07/30] backward-test fixed --- deepspeed/pt/deepspeed_cuda.py | 3 +-- tests/unit/test_cuda_backward.py | 2 +- tests/unit/test_cuda_forward.py | 10 +--------- 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/deepspeed/pt/deepspeed_cuda.py b/deepspeed/pt/deepspeed_cuda.py index 1d318f462325..67870ea22b60 100755 --- a/deepspeed/pt/deepspeed_cuda.py +++ b/deepspeed/pt/deepspeed_cuda.py @@ -509,11 +509,10 @@ def forward( encoder_hidden_states=None, encoder_attention_mask=None, output_attentions=False, + grads=None ): self.config.training = self.training self.config.is_grad_enabled = torch.is_grad_enabled() - # disable grad testing for now - grads = None return DeepSpeedTransformerFunction.apply(hidden_states, attention_mask, self, diff --git a/tests/unit/test_cuda_backward.py b/tests/unit/test_cuda_backward.py index 95c220e71650..07b0ff52ffa4 100755 --- a/tests/unit/test_cuda_backward.py +++ b/tests/unit/test_cuda_backward.py @@ -117,7 +117,7 @@ def custom_forward(*inputs): # decoder layers else: for i, layer_module in enumerate(self.layer): - hidden_states = layer_module(hidden_states, attention_mask, self.grads) + hidden_states = layer_module(hidden_states, attention_mask, grads=self.grads) hidden_states.register_hook( lambda x, self=self: self.grads.append([x, diff --git a/tests/unit/test_cuda_forward.py b/tests/unit/test_cuda_forward.py index c359b7f6e36a..584a2c1b8471 100755 --- a/tests/unit/test_cuda_forward.py +++ b/tests/unit/test_cuda_forward.py @@ -49,7 +49,6 @@ def __init__(self, config, weights, biases): biases)) for _ in range(config.num_hidden_layers) ]) - self.grads = [] self.pre_or_post = config.pre_layer_norm def forward(self, @@ -83,11 +82,6 @@ def custom_forward(*inputs): else: for i, layer_module in enumerate(self.layer): hidden_states = layer_module(hidden_states, attention_mask) - hidden_states.register_hook( - lambda x, - i=i, - self=self: self.grads.append([x, - "hidden_state"])) if output_all_encoded_layers: all_encoder_layers.append(hidden_states) @@ -98,8 +92,6 @@ def custom_forward(*inputs): all_encoder_layers.append(hidden_states) return all_encoder_layers - def get_grads(self): - return self.grads def create_models(ds_config): @@ -199,7 +191,7 @@ def run_forward(ds_config, atol=1e-2, verbose=False, test_bsz=None): output_all_encoded_layers=False, checkpoint_activations=False) - # check grads + # check forward evaluation check_equal(base_results, ds_results, atol=atol, verbose=verbose) From fca500f529b3f6cd62f5f9b40d6da0173004eec3 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Fri, 19 Jun 2020 15:43:34 +0000 Subject: [PATCH 08/30] backward-test fixed --- deepspeed/pt/deepspeed_cuda.py | 18 ++++++++---------- tests/unit/test_cuda_backward.py | 4 +++- tests/unit/test_cuda_forward.py | 1 - 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/deepspeed/pt/deepspeed_cuda.py b/deepspeed/pt/deepspeed_cuda.py index 67870ea22b60..5156e88d01e1 100755 --- a/deepspeed/pt/deepspeed_cuda.py +++ 
b/deepspeed/pt/deepspeed_cuda.py @@ -501,16 +501,14 @@ def init_transformer_weights(self, adjust_init_range=False): self.norm_b.data.zero_() #def forward(self, input, input_mask, grads=None): - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - output_attentions=False, - grads=None - ): + def forward(self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=False, + grads=None): self.config.training = self.training self.config.is_grad_enabled = torch.is_grad_enabled() return DeepSpeedTransformerFunction.apply(hidden_states, diff --git a/tests/unit/test_cuda_backward.py b/tests/unit/test_cuda_backward.py index 07b0ff52ffa4..7b9928477bca 100755 --- a/tests/unit/test_cuda_backward.py +++ b/tests/unit/test_cuda_backward.py @@ -117,7 +117,9 @@ def custom_forward(*inputs): # decoder layers else: for i, layer_module in enumerate(self.layer): - hidden_states = layer_module(hidden_states, attention_mask, grads=self.grads) + hidden_states = layer_module(hidden_states, + attention_mask, + grads=self.grads) hidden_states.register_hook( lambda x, self=self: self.grads.append([x, diff --git a/tests/unit/test_cuda_forward.py b/tests/unit/test_cuda_forward.py index 584a2c1b8471..4ce89304ed94 100755 --- a/tests/unit/test_cuda_forward.py +++ b/tests/unit/test_cuda_forward.py @@ -93,7 +93,6 @@ def custom_forward(*inputs): return all_encoder_layers - def create_models(ds_config): bert_config = BertConfig(vocab_size_or_config_json_file=119547, hidden_size=ds_config.hidden_size, From f3ff21e7ca3263da064fd66a28477e23f920196d Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Thu, 25 Jun 2020 17:58:56 +0000 Subject: [PATCH 09/30] update with replace module style --- deepspeed/__init__.py | 2 +- deepspeed/pt/deepspeed_cuda.py | 31 ++++++--- deepspeed/pt/inject.py | 122 --------------------------------- deepspeed/pt/replace_module.py | 120 ++++++++++++++++++++++++++++++++ 4 files changed, 142 insertions(+), 133 deletions(-) delete mode 100755 deepspeed/pt/inject.py create mode 100755 deepspeed/pt/replace_module.py diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index 98fbb15bd9ad..979afdf0f64f 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -8,7 +8,7 @@ from deepspeed.pt.log_utils import logger from deepspeed.pt.deepspeed_cuda import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig from deepspeed.pt.deepspeed_config import DeepSpeedConfig -from deepspeed.pt.inject import module_inject +from deepspeed.pt.replace_module import replace_transformer_layer import deepspeed.pt.deepspeed_checkpointing as checkpointing diff --git a/deepspeed/pt/deepspeed_cuda.py b/deepspeed/pt/deepspeed_cuda.py index 5156e88d01e1..d0db30903f76 100755 --- a/deepspeed/pt/deepspeed_cuda.py +++ b/deepspeed/pt/deepspeed_cuda.py @@ -77,6 +77,9 @@ class DeepSpeedTransformerConfig(TransformerConfig): that by enabling it, the pretraining tasks such as BERT are not affected and can obtain a high accuracy level. On the other hand, for the downstream tasks, such as fine-tuning, we recommend to turn it off in order to be able to reproduce the same result through the regular kernel execution. + + huggingface: Enbale if using the HuggingFace interface style for sending out the forward results. 
+ """ def __init__(self, batch_size=-1, @@ -95,7 +98,8 @@ def __init__(self, gelu_checkpoint=False, adjust_init_range=True, attn_dropout_checkpoint=False, - stochastic_mode=False): + stochastic_mode=False, + huggingface=False): super(DeepSpeedTransformerConfig, self).__init__(batch_size, max_seq_length, @@ -117,6 +121,7 @@ def __init__(self, self.is_grad_enabled = True self.attn_dropout_checkpoint = attn_dropout_checkpoint self.stochastic_mode = stochastic_mode + self.huggingface = huggingface @classmethod def from_dict(cls, json_object): @@ -279,7 +284,10 @@ def forward(ctx, ctx.attn_output_dropout_mask = attn_output_dropout_mask ctx.layer_output_dropout_mask = layer_output_dropout_mask - return output + if config.huggingface: + return (output, ) # outputs -> (output) : outputs[0] = output + else: + return output @staticmethod def backward(ctx, grad_output): @@ -501,16 +509,19 @@ def init_transformer_weights(self, adjust_init_range=False): self.norm_b.data.zero_() #def forward(self, input, input_mask, grads=None): - def forward(self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - output_attentions=False, - grads=None): + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=False, + ): self.config.training = self.training self.config.is_grad_enabled = torch.is_grad_enabled() + # disable grad testing for now + grads = None return DeepSpeedTransformerFunction.apply(hidden_states, attention_mask, self, diff --git a/deepspeed/pt/inject.py b/deepspeed/pt/inject.py deleted file mode 100755 index ab8469113ce1..000000000000 --- a/deepspeed/pt/inject.py +++ /dev/null @@ -1,122 +0,0 @@ -import copy -import torch -from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig, DeepSpeedConfig - - -def module_inject(layer_obj, - model, - config, - micro_batch_size, - max_seq_length, - seed, - preln, - fp16=True): - for name, child in model.named_children(): - if isinstance(child, layer_obj): - print('REPLACING BertLayer') - - cuda_config = DeepSpeedTransformerConfig( - batch_size=micro_batch_size, - max_seq_length=max_seq_length, - hidden_size=config.hidden_size, - heads=config.num_attention_heads, - attn_dropout_ratio=config.attention_probs_dropout_prob, - hidden_dropout_ratio=config.hidden_dropout_prob, - num_hidden_layers=config.num_hidden_layers, - initializer_range=config.initializer_range, - seed=seed, - fp16=fp16, - pre_layer_norm=preln) - - new_module = DeepSpeedTransformerLayer(cuda_config) - - # copy relevant state from child -> new module - qw = child.attention.self.query.weight - qb = child.attention.self.query.bias - kw = child.attention.self.key.weight - kb = child.attention.self.key.bias - vw = child.attention.self.value.weight - vb = child.attention.self.value.bias - - qkvw = torch.cat((qw, kw, vw), 0) - qkvb = torch.cat((qb, kb, vb), 0) - - new_module.attn_qkvw.data = qkvw - new_module.attn_qkvb.data = qkvb - new_module.attn_ow.data = child.attention.output.dense.weight - new_module.attn_ob.data = child.attention.output.dense.bias - if preln: - attention_layerNorm = child.PostAttentionLayerNorm - else: - attention_layerNorm = child.attention.output.LayerNorm - new_module.attn_nw.data = attention_layerNorm.weight - new_module.attn_nb.data = attention_layerNorm.bias - if preln: - intermediate_FF = child.intermediate.dense_act - else: - intermediate_FF = child.intermediate.dense - 
new_module.inter_w.data = intermediate_FF.weight - new_module.inter_b.data = intermediate_FF.bias - new_module.output_w.data = child.output.dense.weight - new_module.output_b.data = child.output.dense.bias - if preln: - transformer_LayerNorm = child.PreAttentionLayerNorm - else: - transformer_LayerNorm = child.output.LayerNorm - new_module.norm_w.data = transformer_LayerNorm.weight - new_module.norm_b.data = transformer_LayerNorm.bias - - setattr(model, name, copy.deepcopy(new_module)) - - else: - module_inject(layer_obj, - child, - config, - micro_batch_size, - max_seq_length, - seed, - preln, - fp16) - - return model - - -def test_hi(): - from turing.nvidia_modelingpreln import BertConfig as BertConfigPreLN - from turing.nvidia_modelingpreln import BertForQuestionAnswering as BertForQuestionAnsweringPreLN - from turing.nvidia_modelingpreln import BertLayer - bert_model_config = { - "vocab_size_or_config_json_file": 119547, - "hidden_size": 1024, - "num_hidden_layers": 1, - "num_attention_heads": 16, - "intermediate_size": 4096, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02 - } - bert_config = BertConfigPreLN(**bert_model_config) - base_model = BertForQuestionAnsweringPreLN(bert_config, args=None) - - #base_model = LinearStack() - - test_model = copy.deepcopy(base_model) - test_model = module_inject(BertLayer, test_model, bert_config, 4, 384, 1234) - - print('BASE', base_model) - print('TEST', test_model) - - #base_model.eval() - #test_model.eval() - - #test_input = torch.rand(1, base_model.input_dim) - - #base_output = base_model(test_input) - #test_output = test_model(test_input) - # - #assert torch.allclose(base_output, test_output, atol=3e-8) diff --git a/deepspeed/pt/replace_module.py b/deepspeed/pt/replace_module.py new file mode 100755 index 000000000000..9471e5bedfbb --- /dev/null +++ b/deepspeed/pt/replace_module.py @@ -0,0 +1,120 @@ +import copy +import torch +import deepspeed + + +def replace_transformer_layer(orig_layer_impl, + model, + micro_batch_size, + bert_config, + seed, + max_seq_length, + preln=False, + fp16=True, + huggingface=False): + """ Replace bert-style transformer layers with DeepSpeed's transformer layer + Arguments: + orig_layer_impl (torch.nn.Module): the original transformer layer implementation to look for, + e.g., transformers.modeling_bert.BertLayer. + model (torch.nn.Module): user's nn.module representing their model + micro_batch_size (int): micro batch size per gpu used during training/eval + bert_config (dict): model config containing hidden size, attention heads, etc. + seed (int): random seed value + max_seq_length (int): max sequence length for training + preln (bool): does the original layer implementation do pre or post layer norm? 
+ fp16 (bool): fp16 or fp32 + huggingface (bool): huggingface implementation is unique (supports both encoder/decoder modes) + + Returns: + Updated nn.module with replaced transformer layers + """ + def replace_fn(child): + transformer_config = deepspeed.DeepSpeedTransformerConfig( + batch_size=micro_batch_size, + max_seq_length=max_seq_length, + hidden_size=bert_config.hidden_size, + heads=bert_config.num_attention_heads, + attn_dropout_ratio=bert_config.attention_probs_dropout_prob, + hidden_dropout_ratio=bert_config.hidden_dropout_prob, + num_hidden_layers=bert_config.num_hidden_layers, + initializer_range=bert_config.initializer_range, + seed=seed, + fp16=fp16, + pre_layer_norm=preln, + huggingface=huggingface) + new_module = deepspeed.DeepSpeedTransformerLayer(transformer_config) + + # copy relevant state from child -> new module + qw = child.attention.self.query.weight + qb = child.attention.self.query.bias + kw = child.attention.self.key.weight + kb = child.attention.self.key.bias + vw = child.attention.self.value.weight + vb = child.attention.self.value.bias + + qkvw = torch.cat((qw, kw, vw), 0) + qkvb = torch.cat((qb, kb, vb), 0) + + new_module.attn_qkvw.data = qkvw + new_module.attn_qkvb.data = qkvb + new_module.attn_ow.data = child.attention.output.dense.weight + new_module.attn_ob.data = child.attention.output.dense.bias + if preln: + attention_layernorm = child.PostAttentionLayerNorm + else: + attention_layernorm = child.attention.output.LayerNorm + new_module.attn_nw.data = attention_layernorm.weight + new_module.attn_nb.data = attention_layernorm.bias + if preln: + intermediate_ff = child.intermediate.dense_act + else: + intermediate_ff = child.intermediate.dense + new_module.inter_w.data = intermediate_ff.weight + new_module.inter_b.data = intermediate_ff.bias + new_module.output_w.data = child.output.dense.weight + new_module.output_b.data = child.output.dense.bias + if preln: + transformer_layernorm = child.PreAttentionLayerNorm + else: + transformer_layernorm = child.output.LayerNorm + new_module.norm_w.data = transformer_layernorm.weight + new_module.norm_b.data = transformer_layernorm.bias + return new_module + + return replace_module(model=model, orig_class=orig_layer_impl, replace_fn=replace_fn) + + +def replace_module(model, orig_class, replace_fn): + """ Scan the model for instances of ``orig_clas:`` to replace using ``replace_fn``. + Arguments: + model (torch.nn.Module): the model to augment + orig_class (torch.nn.Module): the module to search for + replace_fn (method): a method to convert instances of ``orig_class`` to the + desired type and return a new instance. + + Returns: + A modified ``model``. + """ + policy = {orig_class: replace_fn} + return _replace_module(model, policy) + + +def _replace_module(model, policies): + """ Traverse model's children recursively and apply any transformations in ``policies``. + Arguments: + model (torch.nn.Module): model to augment + policies (dict): Mapping of source class to replacement function. + + Returns: + Modified ``model``. 
+ """ + for name, child in model.named_children(): + if child.__class__ in policies: + orig = repr(child) + setattr(model, name, policies[child.__class__](child)) + new = getattr(model, name) + print(f'{orig} -> {new}') + else: + _replace_module(child, policies) + + return model From c208bdfa67163760b1b926f5fb22355ebd0475e0 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Thu, 16 Jul 2020 22:10:55 +0000 Subject: [PATCH 10/30] add function to revert from ds kernel -> orig layer --- deepspeed/pt/replace_module.py | 68 ++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/deepspeed/pt/replace_module.py b/deepspeed/pt/replace_module.py index 9471e5bedfbb..3714c4cad559 100755 --- a/deepspeed/pt/replace_module.py +++ b/deepspeed/pt/replace_module.py @@ -84,6 +84,74 @@ def replace_fn(child): return replace_module(model=model, orig_class=orig_layer_impl, replace_fn=replace_fn) +def revert_transformer_layer(orig_layer_impl, + model, + micro_batch_size, + bert_config, + seed, + max_seq_length, + preln=False, + fp16=True, + huggingface=False): + """ Revert DeepSpeed's transformer layer back to original bert-style transformer layer + Arguments: + orig_layer_impl (torch.nn.Module): the original transformer layer implementation that was replaced, + e.g., transformers.modeling_bert.BertLayer. + model (torch.nn.Module): user's nn.module representing their model + micro_batch_size (int): micro batch size per gpu used during training/eval + bert_config (dict): model config containing hidden size, attention heads, etc. + seed (int): random seed value + max_seq_length (int): max sequence length for training + preln (bool): does the original layer implementation do pre or post layer norm? + fp16 (bool): fp16 or fp32 + huggingface (bool): huggingface implementation is unique (supports both encoder/decoder modes) + + Returns: + Updated nn.module with original bert-style transformer layers + """ + def replace_fn(child): + #from turing.nvidia_modelingpreln import BertLayer + orig_module = orig_layer_impl(bert_config) + + # copy relevant state from child -> new module + qw = child.attention.self.query.weight + qb = child.attention.self.query.bias + kw = child.attention.self.key.weight + kb = child.attention.self.key.bias + vw = child.attention.self.value.weight + vb = child.attention.self.value.bias + + qkvw = torch.cat((qw, kw, vw), 0) + qkvb = torch.cat((qb, kb, vb), 0) + + new_module.attn_qkvw.data = qkvw + new_module.attn_qkvb.data = qkvb + new_module.attn_ow.data = child.attention.output.dense.weight + new_module.attn_ob.data = child.attention.output.dense.bias + if preln: + attention_layernorm = child.PostAttentionLayerNorm + else: + attention_layernorm = child.attention.output.LayerNorm + new_module.attn_nw.data = attention_layernorm.weight + new_module.attn_nb.data = attention_layernorm.bias + if preln: + intermediate_ff = child.intermediate.dense_act + else: + intermediate_ff = child.intermediate.dense + new_module.inter_w.data = intermediate_ff.weight + new_module.inter_b.data = intermediate_ff.bias + new_module.output_w.data = child.output.dense.weight + new_module.output_b.data = child.output.dense.bias + if preln: + transformer_layernorm = child.PreAttentionLayerNorm + else: + transformer_layernorm = child.output.LayerNorm + new_module.norm_w.data = transformer_layernorm.weight + new_module.norm_b.data = transformer_layernorm.bias + return new_module + + return replace_module(model=model, orig_class=orig_layer_impl, replace_fn=replace_fn) + def replace_module(model, 
orig_class, replace_fn): """ Scan the model for instances of ``orig_clas:`` to replace using ``replace_fn``. Arguments: From c2785623eb2237150dd05af88ce5d3649bd21eb2 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Fri, 17 Jul 2020 15:42:33 +0000 Subject: [PATCH 11/30] add code from Elton to do ds kernel -> orig layer conversion --- deepspeed/pt/replace_module.py | 69 ++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 29 deletions(-) diff --git a/deepspeed/pt/replace_module.py b/deepspeed/pt/replace_module.py index 3714c4cad559..ea6f73db0487 100755 --- a/deepspeed/pt/replace_module.py +++ b/deepspeed/pt/replace_module.py @@ -113,44 +113,55 @@ def replace_fn(child): #from turing.nvidia_modelingpreln import BertLayer orig_module = orig_layer_impl(bert_config) - # copy relevant state from child -> new module - qw = child.attention.self.query.weight - qb = child.attention.self.query.bias - kw = child.attention.self.key.weight - kb = child.attention.self.key.bias - vw = child.attention.self.value.weight - vb = child.attention.self.value.bias + # copy relevant state from child -> original module + qkvw = child.attn_qkvw.data + qkvb = child.attn_qkvb.data - qkvw = torch.cat((qw, kw, vw), 0) - qkvb = torch.cat((qb, kb, vb), 0) + qw, kw, vw = torch.chunk(qkvw, 3, axis=0) + qb, kb, vb = torch.chunk(qkvb, 3, axis=0) - new_module.attn_qkvw.data = qkvw - new_module.attn_qkvb.data = qkvb - new_module.attn_ow.data = child.attention.output.dense.weight - new_module.attn_ob.data = child.attention.output.dense.bias + orig_module.attention.self.query.weight = qw + orig_module.attention.self.query.bias = qb + orig_module.attention.self.key.weight = kw + orig_module.attention.self.key.bias = kb + orig_module.attention.self.value.weight = vw + orig_module.attention.self.value.bias = vb + + orig_module.attention.output.dense.weight = child.attn_ow.data + orig_module.attention.output.dense.bias = child.attn_ob.data + + attn_ln_w = child.attn_nw.data + attn_ln_b = child.attn_nb.data if preln: - attention_layernorm = child.PostAttentionLayerNorm + orig_module.PostAttentionLayerNorm.weight = attn_ln_w + orig_module.PostAttentionLayerNorm.bias = attn_ln_b else: - attention_layernorm = child.attention.output.LayerNorm - new_module.attn_nw.data = attention_layernorm.weight - new_module.attn_nb.data = attention_layernorm.bias + orig_module.attention.output.LayerNorm.weight = attn_ln_w + orig_module.attention.output.LayerNorm.bias = attn_ln_b + + inter_ff_w = child.inter_w.data + inter_ff_b = child.inter_b.data if preln: - intermediate_ff = child.intermediate.dense_act + orig_module.intermediate.dense_act.weight = inter_ff_w + orig_module.intermediate.dense_act.bias = inter_ff_b else: - intermediate_ff = child.intermediate.dense - new_module.inter_w.data = intermediate_ff.weight - new_module.inter_b.data = intermediate_ff.bias - new_module.output_w.data = child.output.dense.weight - new_module.output_b.data = child.output.dense.bias + orig_module.intermediate.dense.weight = inter_ff_w + orig_module.intermediate.dense.bias = inter_ff_b + + orig_module.output.dense.weight = child.output_w.data + orig_module.output.dense.bias = child.output_b.data + + transformer_ln_w = child.norm_w.data + transformer_ln_b = child.norm_b.data if preln: - transformer_layernorm = child.PreAttentionLayerNorm + orig_module.PreAttentionLayerNorm.weight = transformer_ln_w + orig_module.PreAttentionLayerNorm.bias = transformer_ln_b else: - transformer_layernorm = child.output.LayerNorm - new_module.norm_w.data = 
transformer_layernorm.weight - new_module.norm_b.data = transformer_layernorm.bias - return new_module + orig_module.output.LayerNorm.weight = transformer_ln_w + orig_module.output.LayerNorm.bias = transformer_ln_b + return orig_module - return replace_module(model=model, orig_class=orig_layer_impl, replace_fn=replace_fn) + return replace_module(model=model, orig_class=deepspeed.DeepSpeedTransformerLayer, replace_fn=replace_fn) def replace_module(model, orig_class, replace_fn): """ Scan the model for instances of ``orig_clas:`` to replace using ``replace_fn``. From 68d8c13c0a7af9ffc6779f9fb2afbb81bd04ed5d Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Fri, 17 Jul 2020 15:54:22 +0000 Subject: [PATCH 12/30] formatting --- deepspeed/pt/replace_module.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/deepspeed/pt/replace_module.py b/deepspeed/pt/replace_module.py index ea6f73db0487..90523600bb7e 100755 --- a/deepspeed/pt/replace_module.py +++ b/deepspeed/pt/replace_module.py @@ -161,7 +161,10 @@ def replace_fn(child): orig_module.output.LayerNorm.bias = transformer_ln_b return orig_module - return replace_module(model=model, orig_class=deepspeed.DeepSpeedTransformerLayer, replace_fn=replace_fn) + return replace_module(model=model, + orig_class=deepspeed.DeepSpeedTransformerLayer, + replace_fn=replace_fn) + def replace_module(model, orig_class, replace_fn): """ Scan the model for instances of ``orig_clas:`` to replace using ``replace_fn``. From 31615652f4c0fc8cf5aba1ffc93260f21ee6b60d Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Wed, 22 Jul 2020 22:43:33 +0000 Subject: [PATCH 13/30] update replace to fix runtime errors --- deepspeed/__init__.py | 2 +- deepspeed/pt/replace_module.py | 49 +++++++++++++--------------------- 2 files changed, 19 insertions(+), 32 deletions(-) diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index 979afdf0f64f..bd8f1d0fb13a 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -8,7 +8,7 @@ from deepspeed.pt.log_utils import logger from deepspeed.pt.deepspeed_cuda import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig from deepspeed.pt.deepspeed_config import DeepSpeedConfig -from deepspeed.pt.replace_module import replace_transformer_layer +from deepspeed.pt.replace_module import replace_transformer_layer, revert_transformer_layer import deepspeed.pt.deepspeed_checkpointing as checkpointing diff --git a/deepspeed/pt/replace_module.py b/deepspeed/pt/replace_module.py index 90523600bb7e..023c60aa9eea 100755 --- a/deepspeed/pt/replace_module.py +++ b/deepspeed/pt/replace_module.py @@ -84,27 +84,13 @@ def replace_fn(child): return replace_module(model=model, orig_class=orig_layer_impl, replace_fn=replace_fn) -def revert_transformer_layer(orig_layer_impl, - model, - micro_batch_size, - bert_config, - seed, - max_seq_length, - preln=False, - fp16=True, - huggingface=False): +def revert_transformer_layer(orig_layer_impl, model, bert_config, preln=False): """ Revert DeepSpeed's transformer layer back to original bert-style transformer layer Arguments: orig_layer_impl (torch.nn.Module): the original transformer layer implementation that was replaced, e.g., transformers.modeling_bert.BertLayer. model (torch.nn.Module): user's nn.module representing their model - micro_batch_size (int): micro batch size per gpu used during training/eval bert_config (dict): model config containing hidden size, attention heads, etc. 
- seed (int): random seed value - max_seq_length (int): max sequence length for training - preln (bool): does the original layer implementation do pre or post layer norm? - fp16 (bool): fp16 or fp32 - huggingface (bool): huggingface implementation is unique (supports both encoder/decoder modes) Returns: Updated nn.module with original bert-style transformer layers @@ -120,18 +106,19 @@ def replace_fn(child): qw, kw, vw = torch.chunk(qkvw, 3, axis=0) qb, kb, vb = torch.chunk(qkvb, 3, axis=0) - orig_module.attention.self.query.weight = qw - orig_module.attention.self.query.bias = qb - orig_module.attention.self.key.weight = kw - orig_module.attention.self.key.bias = kb - orig_module.attention.self.value.weight = vw - orig_module.attention.self.value.bias = vb + orig_module.attention.self.query.weight = torch.nn.Parameter(qw) + orig_module.attention.self.query.bias = torch.nn.Parameter(qb) + orig_module.attention.self.key.weight = torch.nn.Parameter(kw) + orig_module.attention.self.key.bias = torch.nn.Parameter(kb) + orig_module.attention.self.value.weight = torch.nn.Parameter(vw) + orig_module.attention.self.value.bias = torch.nn.Parameter(vb) - orig_module.attention.output.dense.weight = child.attn_ow.data - orig_module.attention.output.dense.bias = child.attn_ob.data + orig_module.attention.output.dense.weight = torch.nn.Parameter( + child.attn_ow.data) + orig_module.attention.output.dense.bias = torch.nn.Parameter(child.attn_ob.data) - attn_ln_w = child.attn_nw.data - attn_ln_b = child.attn_nb.data + attn_ln_w = torch.nn.Parameter(child.attn_nw.data) + attn_ln_b = torch.nn.Parameter(child.attn_nb.data) if preln: orig_module.PostAttentionLayerNorm.weight = attn_ln_w orig_module.PostAttentionLayerNorm.bias = attn_ln_b @@ -139,8 +126,8 @@ def replace_fn(child): orig_module.attention.output.LayerNorm.weight = attn_ln_w orig_module.attention.output.LayerNorm.bias = attn_ln_b - inter_ff_w = child.inter_w.data - inter_ff_b = child.inter_b.data + inter_ff_w = torch.nn.Parameter(child.inter_w.data) + inter_ff_b = torch.nn.Parameter(child.inter_b.data) if preln: orig_module.intermediate.dense_act.weight = inter_ff_w orig_module.intermediate.dense_act.bias = inter_ff_b @@ -148,11 +135,11 @@ def replace_fn(child): orig_module.intermediate.dense.weight = inter_ff_w orig_module.intermediate.dense.bias = inter_ff_b - orig_module.output.dense.weight = child.output_w.data - orig_module.output.dense.bias = child.output_b.data + orig_module.output.dense.weight = torch.nn.Parameter(child.output_w.data) + orig_module.output.dense.bias = torch.nn.Parameter(child.output_b.data) - transformer_ln_w = child.norm_w.data - transformer_ln_b = child.norm_b.data + transformer_ln_w = torch.nn.Parameter(child.norm_w.data) + transformer_ln_b = torch.nn.Parameter(child.norm_b.data) if preln: orig_module.PreAttentionLayerNorm.weight = transformer_ln_w orig_module.PreAttentionLayerNorm.bias = transformer_ln_b From 798e6d334db49f4eb03d10e7c0808865b7ddb230 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Wed, 29 Jul 2020 00:30:24 +0000 Subject: [PATCH 14/30] remove pillow --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ac94d73c25a8..738cfe422440 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ torch>=1.2 torchvision>=0.4.0 -pillow==6.2.2 tqdm psutil tensorboardX==1.8 From 66f590df76e93a87114b3fb6e68b32bb6ec59798 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Fri, 12 Jun 2020 01:58:51 +0000 Subject: [PATCH 15/30] remove transformer layer ID 
from the top module --- deepspeed/ops/transformer/transformer.py | 7 +++++-- docs/_tutorials/bert-pretraining.md | 4 ++-- docs/_tutorials/transformer_kernel.md | 4 ++-- tests/unit/test_cuda_backward.py | 5 ++--- tests/unit/test_cuda_forward.py | 15 +++------------ 5 files changed, 14 insertions(+), 21 deletions(-) diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py index a91e5ce6f08b..c29e07bbb31e 100755 --- a/deepspeed/ops/transformer/transformer.py +++ b/deepspeed/ops/transformer/transformer.py @@ -431,11 +431,14 @@ class DeepSpeedTransformerLayer(nn.Module): initial_biases: Optional: Only used for unit test """ - def __init__(self, layer_id, config, initial_weights=None, initial_biases=None): + layer_id = 0 + + def __init__(self, config, initial_weights=None, initial_biases=None): super(DeepSpeedTransformerLayer, self).__init__() self.config = config - self.config.layer_id = layer_id + self.config.layer_id = DeepSpeedTransformerLayer.layer_id + DeepSpeedTransformerLayer.layer_id = DeepSpeedTransformerLayer.layer_id + 1 print("DeepSpeed Transformer config is ", self.config.__dict__) diff --git a/docs/_tutorials/bert-pretraining.md b/docs/_tutorials/bert-pretraining.md index 03462e893b07..0791fb3308fe 100755 --- a/docs/_tutorials/bert-pretraining.md +++ b/docs/_tutorials/bert-pretraining.md @@ -284,10 +284,10 @@ transformer layers using DeepSpeed transformer kernel as below. gelu_checkpoint=args.gelu_checkpoint, stochastic_mode=True) - self.layer = nn.ModuleList([copy.deepcopy(DeepSpeedTransformerLayer(i, cuda_config)) for i in range(config.num_hidden_layers)]) + layer = DeepSpeedTransformerLayer(cuda_config) else: layer = BertLayer(config) - self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) + self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) ``` All configuration settings come from the DeepSpeed configuration file and command arguments and thus we must pass the `args` variable to here in this model. 
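The kernel-injection path introduced in the earlier patches (`replace_module` and `_replace_module` in `deepspeed/pt/replace_module.py`) reduces to a recursive, policy-driven swap over `named_children()`: when a child's class matches a policy entry, the replacement built by that policy is re-attached under the same attribute name. A minimal stand-in of that traversal, not DeepSpeed's implementation (the `swap_modules` name and the toy ReLU-to-GELU policy are invented for illustration):

```python
import torch.nn as nn


def swap_modules(model, policies):
    # Walk the module tree; when a child's class has a policy, build the
    # replacement from the old child and re-attach it under the same name.
    for name, child in model.named_children():
        if child.__class__ in policies:
            setattr(model, name, policies[child.__class__](child))
        else:
            swap_modules(child, policies)
    return model


net = nn.Sequential(nn.Linear(8, 8),
                    nn.ReLU(),
                    nn.Sequential(nn.Linear(8, 4), nn.ReLU()))
swap_modules(net, {nn.ReLU: lambda old: nn.GELU()})
print(net)  # both ReLU modules, including the nested one, are now GELU
```

Recursing only when no policy matches keeps nested containers reachable while leaving already-replaced subtrees untouched, which is why the same traversal works for both injecting the DeepSpeed kernel and reverting back to the original layer.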
diff --git a/docs/_tutorials/transformer_kernel.md b/docs/_tutorials/transformer_kernel.md index 26e88406920e..9dbcf26e2a12 100755 --- a/docs/_tutorials/transformer_kernel.md +++ b/docs/_tutorials/transformer_kernel.md @@ -43,8 +43,8 @@ config = DeepSpeedTransformerConfig(batch_size = 64, normalize_invertible=False, gelu_checkpoint=False) self.layer = nn.ModuleList([ - copy.deepcopy(DeepSpeedTransformerLayer(i, cuda_config)) - for i in range(config.num_hidden_layers) + copy.deepcopy(DeepSpeedTransformerLayer(cuda_config)) + for _ in range(config.num_hidden_layers) ]) ``` ### Transformer kernel Parameters diff --git a/tests/unit/test_cuda_backward.py b/tests/unit/test_cuda_backward.py index 317cd7aa33c0..b4c40ddb532e 100755 --- a/tests/unit/test_cuda_backward.py +++ b/tests/unit/test_cuda_backward.py @@ -83,11 +83,10 @@ def __init__(self, config, weights, biases): super(DSEncoder, self).__init__() self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) self.layer = nn.ModuleList([ - copy.deepcopy(DeepSpeedTransformerLayer(i, - config, + copy.deepcopy(DeepSpeedTransformerLayer(config, weights, biases)) - for i in range(config.num_hidden_layers) + for _ in range(config.num_hidden_layers) ]) self.grads = [] self.pre_or_post = config.pre_layer_norm diff --git a/tests/unit/test_cuda_forward.py b/tests/unit/test_cuda_forward.py index 893b66c904bb..393e84f21b0a 100755 --- a/tests/unit/test_cuda_forward.py +++ b/tests/unit/test_cuda_forward.py @@ -48,11 +48,10 @@ def __init__(self, config, weights, biases): super(DSEncoder, self).__init__() self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) self.layer = nn.ModuleList([ - copy.deepcopy(DeepSpeedTransformerLayer(i, - config, + copy.deepcopy(DeepSpeedTransformerLayer(config, weights, biases)) - for i in range(config.num_hidden_layers) + for _ in range(config.num_hidden_layers) ]) self.grads = [] self.pre_or_post = config.pre_layer_norm @@ -88,11 +87,6 @@ def custom_forward(*inputs): else: for i, layer_module in enumerate(self.layer): hidden_states = layer_module(hidden_states, attention_mask) - hidden_states.register_hook( - lambda x, - i=i, - self=self: self.grads.append([x, - "hidden_state"])) if output_all_encoded_layers: all_encoder_layers.append(hidden_states) @@ -103,9 +97,6 @@ def custom_forward(*inputs): all_encoder_layers.append(hidden_states) return all_encoder_layers - def get_grads(self): - return self.grads - def create_models(ds_config): bert_config = BertConfig(vocab_size_or_config_json_file=119547, @@ -206,7 +197,7 @@ def run_forward(ds_config, seq_len, atol=1e-2, verbose=False, test_bsz=None): output_all_encoded_layers=False, checkpoint_activations=False) - # check grads + # check forward evaluation check_equal(base_results, ds_results, atol=atol, verbose=verbose) From e4b46fbb709c7d968d02fe73ec2fd42295759eba Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Fri, 12 Jun 2020 04:44:43 +0000 Subject: [PATCH 16/30] updating docstring --- deepspeed/ops/transformer/transformer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py index c29e07bbb31e..1bb78088aafd 100755 --- a/deepspeed/ops/transformer/transformer.py +++ b/deepspeed/ops/transformer/transformer.py @@ -421,10 +421,10 @@ def backward(ctx, grad_output): class DeepSpeedTransformerLayer(nn.Module): """Initialize the DeepSpeed Transformer Layer. 
+ Static variable: + layer_id: The layer-index counter starting from 0 and incrementing by 1 every time a layer object is instantiated, + e.g. if a model has 24 transformer layers, layer_id goes from 0 to 23. Arguments: - layer_id: The layer index starting from 0, e.g. if model has 24 transformer layers, - layer_id will be 0,1,2...23 when each layer object is instantiated - config: An object of DeepSpeedTransformerConfig initial_weights: Optional: Only used for unit test From 25ee5e72a9309d684dc89e588449deaef61f0266 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Fri, 12 Jun 2020 07:41:43 +0000 Subject: [PATCH 17/30] add inject --- deepspeed/module_inject/__init__.py | 0 deepspeed/module_inject/inject.py | 79 +++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100755 deepspeed/module_inject/__init__.py create mode 100755 deepspeed/module_inject/inject.py diff --git a/deepspeed/module_inject/__init__.py b/deepspeed/module_inject/__init__.py new file mode 100755 index 000000000000..e69de29bb2d1 diff --git a/deepspeed/module_inject/inject.py b/deepspeed/module_inject/inject.py new file mode 100755 index 000000000000..f0129d13c66d --- /dev/null +++ b/deepspeed/module_inject/inject.py @@ -0,0 +1,79 @@ +import copy +import torch +from deepspeed.ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig + + +def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, seed): + for name, child in model.named_children(): + if isinstance(child, layer_obj): + print('REPLACING BertLayer') + + cuda_config = DeepSpeedTransformerConfig( + batch_size=micro_batch_size, + max_seq_length=max_seq_length, + hidden_size=config.hidden_size, + heads=config.num_attention_heads, + attn_dropout_ratio=config.attention_probs_dropout_prob, + hidden_dropout_ratio=config.hidden_dropout_prob, + num_hidden_layers=config.num_hidden_layers, + initializer_range=config.initializer_range, + seed=seed, + fp16=True, + pre_layer_norm=True) + + new_module = DeepSpeedTransformerLayer(cuda_config) + + #TODO: copy relevant state from child -> new module + + setattr(model, name, copy.deepcopy(new_module)) + + else: + module_inject(layer_obj, + child, + config, + micro_batch_size, + max_seq_length, + seed) + + return model + + +def test_hi(): + from turing.nvidia_modelingpreln import BertConfig as BertConfigPreLN + from turing.nvidia_modelingpreln import BertForQuestionAnswering as BertForQuestionAnsweringPreLN + from turing.nvidia_modelingpreln import BertLayer + bert_model_config = { + "vocab_size_or_config_json_file": 119547, + "hidden_size": 1024, + "num_hidden_layers": 1, + "num_attention_heads": 16, + "intermediate_size": 4096, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02 + } + bert_config = BertConfigPreLN(**bert_model_config) + base_model = BertForQuestionAnsweringPreLN(bert_config, args=None) + + #base_model = LinearStack() + + test_model = copy.deepcopy(base_model) + test_model = module_inject(BertLayer, test_model, bert_config, 4, 384, 1234) + + print('BASE', base_model) + print('TEST', test_model) + + #base_model.eval() + #test_model.eval() + + #test_input = torch.rand(1, base_model.input_dim) + + #base_output = base_model(test_input) + #test_output = test_model(test_input) + # + #assert torch.allclose(base_output, test_output, atol=3e-8) From 
d5d10e9c9a59dac5a48585f4fa3f80043692fe21 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Wed, 17 Jun 2020 06:24:12 +0000 Subject: [PATCH 18/30] update inject PoC --- deepspeed/module_inject/inject.py | 24 +++++++++++++++++++++++- deepspeed/ops/transformer/transformer.py | 19 +++++++++++++++---- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/deepspeed/module_inject/inject.py b/deepspeed/module_inject/inject.py index f0129d13c66d..de217dd9aafa 100755 --- a/deepspeed/module_inject/inject.py +++ b/deepspeed/module_inject/inject.py @@ -23,7 +23,29 @@ def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, se new_module = DeepSpeedTransformerLayer(cuda_config) - #TODO: copy relevant state from child -> new module + # copy relevant state from child -> new module + qw = child.attention.self.query.weight + qb = child.attention.self.query.bias + kw = child.attention.self.key.weight + kb = child.attention.self.key.bias + vw = child.attention.self.value.weight + vb = child.attention.self.value.bias + + qkvw = torch.cat((qw, kw, vw), 0) + qkvb = torch.cat((qb, kb, vb), 0) + + new_module.attn_qkvw.data = qkvw + new_module.attn_qkvb.data = qkvb + new_module.attn_ow.data = child.attention.output.dense.weight + new_module.attn_ob.data = child.attention.output.dense.bias + new_module.attn_nw.data = child.attention.output.LayerNorm.weight + new_module.attn_nb.data = child.attention.output.LayerNorm.bias + new_module.inter_w.data = child.intermediate.dense.weight + new_module.inter_b.data = child.intermediate.dense.bias + new_module.output_w.data = child.output.dense.weight + new_module.output_b.data = child.output.dense.bias + new_module.norm_w.data = child.output.LayerNorm.weight + new_module.norm_b.data = child.output.LayerNorm.bias setattr(model, name, copy.deepcopy(new_module)) diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py index 1bb78088aafd..4f3db925c968 100755 --- a/deepspeed/ops/transformer/transformer.py +++ b/deepspeed/ops/transformer/transformer.py @@ -422,7 +422,7 @@ class DeepSpeedTransformerLayer(nn.Module): """Initialize the DeepSpeed Transformer Layer. Static variable: - layer_id: The layer-index counter starting from 0 and incrementing by 1 every time a layer object is instantiated, + layer_id: The layer-index counter starting from 0 and incrementing by 1 every time a layer object is instantiated, e.g. if a model has 24 transformer layers, layer_id goes from 0 to 23. 
Arguments: config: An object of DeepSpeedTransformerConfig @@ -535,11 +535,22 @@ def init_transformer_weights(self, adjust_init_range=False): self.norm_w.data.fill_(1.0) self.norm_b.data.zero_() - def forward(self, input, input_mask, grads=None): + #def forward(self, input, input_mask, grads=None): + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=False, + ): self.config.training = self.training self.config.is_grad_enabled = torch.is_grad_enabled() - return DeepSpeedTransformerFunction.apply(input, - input_mask, + # disable grad testing for now + grads = None + return DeepSpeedTransformerFunction.apply(hidden_states, + attention_mask, self, grads, self.config.layer_id, From e09004993ae03c821828b6870a79c19bae7c60fc Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Thu, 18 Jun 2020 16:16:08 +0000 Subject: [PATCH 19/30] fix the preln injection --- deepspeed/module_inject/inject.py | 32 +++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/deepspeed/module_inject/inject.py b/deepspeed/module_inject/inject.py index de217dd9aafa..7828808ad816 100755 --- a/deepspeed/module_inject/inject.py +++ b/deepspeed/module_inject/inject.py @@ -3,7 +3,7 @@ from deepspeed.ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig -def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, seed): +def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, seed, preln, fp16 = True): for name, child in model.named_children(): if isinstance(child, layer_obj): print('REPLACING BertLayer') @@ -18,8 +18,8 @@ def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, se num_hidden_layers=config.num_hidden_layers, initializer_range=config.initializer_range, seed=seed, - fp16=True, - pre_layer_norm=True) + fp16=fp16, + pre_layer_norm=preln) new_module = DeepSpeedTransformerLayer(cuda_config) @@ -38,14 +38,26 @@ def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, se new_module.attn_qkvb.data = qkvb new_module.attn_ow.data = child.attention.output.dense.weight new_module.attn_ob.data = child.attention.output.dense.bias - new_module.attn_nw.data = child.attention.output.LayerNorm.weight - new_module.attn_nb.data = child.attention.output.LayerNorm.bias - new_module.inter_w.data = child.intermediate.dense.weight - new_module.inter_b.data = child.intermediate.dense.bias + if preln: + attention_layerNorm = child.PostAttentionLayerNorm + else: + attention_layerNorm = child.attention.output.LayerNorm + new_module.attn_nw.data = attention_layerNorm.weight + new_module.attn_nb.data = attention_layerNorm.bias + if preln: + intermediate_FF = child.intermediate.dense_act + else: + intermediate_FF = child.intermediate.dense + new_module.inter_w.data = intermediate_FF.weight + new_module.inter_b.data = intermediate_FF.bias new_module.output_w.data = child.output.dense.weight new_module.output_b.data = child.output.dense.bias - new_module.norm_w.data = child.output.LayerNorm.weight - new_module.norm_b.data = child.output.LayerNorm.bias + if preln: + transformer_LayerNorm = child.PreAttentionLayerNorm + else: + transformer_LayerNorm = child.output.LayerNorm + new_module.norm_w.data = transformer_LayerNorm.weight + new_module.norm_b.data = transformer_LayerNorm.bias setattr(model, name, copy.deepcopy(new_module)) @@ -55,7 +67,7 @@ def module_inject(layer_obj, model, config, 
micro_batch_size, max_seq_length, se config, micro_batch_size, max_seq_length, - seed) + seed, preln, fp16) return model From 3df72f86d7acc6ef1e2d72c6cece0da06264f861 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Thu, 18 Jun 2020 16:17:03 +0000 Subject: [PATCH 20/30] fix the preln injection --- deepspeed/module_inject/inject.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/deepspeed/module_inject/inject.py b/deepspeed/module_inject/inject.py index 7828808ad816..a601ef10e1d2 100755 --- a/deepspeed/module_inject/inject.py +++ b/deepspeed/module_inject/inject.py @@ -3,7 +3,14 @@ from deepspeed.ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig -def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, seed, preln, fp16 = True): +def module_inject(layer_obj, + model, + config, + micro_batch_size, + max_seq_length, + seed, + preln, + fp16=True): for name, child in model.named_children(): if isinstance(child, layer_obj): print('REPLACING BertLayer') @@ -53,7 +60,7 @@ def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, se new_module.output_w.data = child.output.dense.weight new_module.output_b.data = child.output.dense.bias if preln: - transformer_LayerNorm = child.PreAttentionLayerNorm + transformer_LayerNorm = child.PreAttentionLayerNorm else: transformer_LayerNorm = child.output.LayerNorm new_module.norm_w.data = transformer_LayerNorm.weight @@ -67,7 +74,9 @@ def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, se config, micro_batch_size, max_seq_length, - seed, preln, fp16) + seed, + preln, + fp16) return model From 41cc4e6483db5a13936913fee1d15710f85eec22 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Fri, 19 Jun 2020 15:42:16 +0000 Subject: [PATCH 21/30] backward-test fixed --- deepspeed/ops/transformer/transformer.py | 3 +- tests/unit/test_cuda_backward.py | 2 +- tests/unit/test_cuda_forward.py | 48 +++++++----------------- 3 files changed, 16 insertions(+), 37 deletions(-) diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py index 4f3db925c968..e8ba79a5f260 100755 --- a/deepspeed/ops/transformer/transformer.py +++ b/deepspeed/ops/transformer/transformer.py @@ -544,11 +544,10 @@ def forward( encoder_hidden_states=None, encoder_attention_mask=None, output_attentions=False, + grads=None ): self.config.training = self.training self.config.is_grad_enabled = torch.is_grad_enabled() - # disable grad testing for now - grads = None return DeepSpeedTransformerFunction.apply(hidden_states, attention_mask, self, diff --git a/tests/unit/test_cuda_backward.py b/tests/unit/test_cuda_backward.py index b4c40ddb532e..c1d7e4dcec06 100755 --- a/tests/unit/test_cuda_backward.py +++ b/tests/unit/test_cuda_backward.py @@ -121,7 +121,7 @@ def custom_forward(*inputs): # decoder layers else: for i, layer_module in enumerate(self.layer): - hidden_states = layer_module(hidden_states, attention_mask, self.grads) + hidden_states = layer_module(hidden_states, attention_mask, grads=self.grads) hidden_states.register_hook( lambda x, self=self: self.grads.append([x, diff --git a/tests/unit/test_cuda_forward.py b/tests/unit/test_cuda_forward.py index 393e84f21b0a..a5c035f342d2 100755 --- a/tests/unit/test_cuda_forward.py +++ b/tests/unit/test_cuda_forward.py @@ -12,13 +12,9 @@ from modeling import BertEncoder as BertEncoderPostln from modeling import BertLayerNorm, BertConfig from deepspeed import DeepSpeedTransformerLayer, 
DeepSpeedTransformerConfig -import deepspeed import sys -#if not deepspeed.ops.__installed_ops__['transformer']: -# pytest.skip("transformer kernels are not installed", allow_module_level=True) - def check_equal(first, second, atol=1e-2, verbose=False): if verbose: @@ -53,7 +49,6 @@ def __init__(self, config, weights, biases): biases)) for _ in range(config.num_hidden_layers) ]) - self.grads = [] self.pre_or_post = config.pre_layer_norm def forward(self, @@ -98,13 +93,14 @@ def custom_forward(*inputs): return all_encoder_layers + def create_models(ds_config): bert_config = BertConfig(vocab_size_or_config_json_file=119547, hidden_size=ds_config.hidden_size, num_hidden_layers=ds_config.num_hidden_layers, num_attention_heads=ds_config.heads, batch_size=ds_config.batch_size, - intermediate_size=ds_config.intermediate_size, + intermediate_size=4 * ds_config.hidden_size, hidden_act="gelu", hidden_dropout_prob=ds_config.hidden_dropout_ratio, attention_probs_dropout_prob=ds_config.attn_dropout_ratio, @@ -125,12 +121,12 @@ def create_models(ds_config): weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) weights[4].data.fill_(1.0) weights.append( - nn.Parameter(torch.Tensor(ds_config.intermediate_size, + nn.Parameter(torch.Tensor(4 * ds_config.hidden_size, ds_config.hidden_size))) weights[5].data.normal_(mean=0.0, std=ds_config.initializer_range) weights.append( nn.Parameter(torch.Tensor(ds_config.hidden_size, - ds_config.intermediate_size))) + 4 * ds_config.hidden_size))) weights[6].data.normal_(mean=0.0, std=ds_config.initializer_range) weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) weights[7].data.fill_(1.0) @@ -140,7 +136,7 @@ def create_models(ds_config): for i in range(4): biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) biases[i + 1].data.zero_() - biases.append(nn.Parameter(torch.Tensor(ds_config.intermediate_size))) + biases.append(nn.Parameter(torch.Tensor(4 * ds_config.hidden_size))) biases[5].data.zero_() biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) biases[6].data.zero_() @@ -169,7 +165,7 @@ def set_seed(seed): torch.manual_seed(seed) -def run_forward(ds_config, seq_len, atol=1e-2, verbose=False, test_bsz=None): +def run_forward(ds_config, atol=1e-2, verbose=False, test_bsz=None): set_seed(123) bert_encoder, ds_encoder = create_models(ds_config) @@ -178,12 +174,10 @@ def run_forward(ds_config, seq_len, atol=1e-2, verbose=False, test_bsz=None): # prepare test data kwargs = kwargs_fp16 if ds_config.fp16 else kwargs_fp32 hidden_states = torch.randn(bsz, - seq_len, #ds_config.max_seq_length, + ds_config.max_seq_length, ds_config.hidden_size, **kwargs) - input_mask = torch.randn(bsz, 1, 1, - seq_len, #ds_config.max_seq_length, - **kwargs) + input_mask = torch.randn(bsz, 1, 1, ds_config.max_seq_length, **kwargs) # run baseline base_results = bert_encoder(hidden_states, @@ -204,21 +198,14 @@ def run_forward(ds_config, seq_len, atol=1e-2, verbose=False, test_bsz=None): # FP16 test cases can only run on the devices support FP16. 
@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', [ - (8,256,128,4,3,True,False), - (8,256,128,4,3,True,True), (64,1024,128,16,3,True,False), (64,1024,128,16,3,True,True), (8,1024,384,16,3,True,False), (8,1024,384,16,3,True,True), - (8,1024,384,16,3,True,True), - (8,1024,120,16,3,True,False), - (8,1024,120,16,3,True,True), (8,1024,512,16,3,True,False), (8,1024,512,16,3,True,True), - (64,1024,56,16,3,False,False), - (64,1024,56,16,3,False,True), - (64,1024,24,16,3,False,False), - (64,1024,24,16,3,False,True), + (64,1024,128,16,3,False,False), + (64,1024,128,16,3,False,True), (8,1024,384,16,3,False,False), (8,1024,384,16,3,False,True), (8,1024,512,16,3,False,False), @@ -229,10 +216,6 @@ def run_forward(ds_config, seq_len, atol=1e-2, verbose=False, test_bsz=None): (8,2048,128,32,3,False,True), (8,2560,128,40,3,False,False), (8,2560,128,40,3,False,True), - (8,128,128,2,3,True,False), - (8,128,128,2,3,True,True), - (8,4096,128,64,3,True,True), - (8,8192,128,64,3,False,True), ]) # yapf: disable def test_forward(batch_size, hidden_size, @@ -250,8 +233,7 @@ def test_forward(batch_size, ds_config.layer_id = None ds_config.batch_size = batch_size ds_config.hidden_size = hidden_size - ds_config.max_seq_length = 128 #seq_len - ds_config.intermediate_size = 4 * hidden_size + ds_config.max_seq_length = seq_len ds_config.heads = heads ds_config.attn_dropout_ratio = 0.0 ds_config.hidden_dropout_ratio = 0.0 @@ -260,7 +242,7 @@ def test_forward(batch_size, ds_config.initializer_range = 0.02 ds_config.fp16 = use_fp16 - run_forward(ds_config, seq_len, atol=2e-2) + run_forward(ds_config, atol=2e-2) @pytest.mark.parametrize('batch_size, small_bsz, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', @@ -287,7 +269,6 @@ def test_forward_with_small_bsz(batch_size, ds_config.layer_id = None ds_config.batch_size = batch_size ds_config.hidden_size = hidden_size - ds_config.intermediate_size = 4 * hidden_size ds_config.max_seq_length = seq_len ds_config.heads = heads ds_config.attn_dropout_ratio = 0.0 @@ -297,7 +278,7 @@ def test_forward_with_small_bsz(batch_size, ds_config.initializer_range = 0.02 ds_config.fp16 = use_fp16 - run_forward(ds_config, seq_len, atol=2e-2, test_bsz=small_bsz) + run_forward(ds_config, atol=2e-2, test_bsz=small_bsz) @pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', [ @@ -322,7 +303,6 @@ def test_forward_stochastic(batch_size, ds_config.layer_id = None ds_config.batch_size = batch_size ds_config.hidden_size = hidden_size - ds_config.intermediate_size = 4 * hidden_size ds_config.max_seq_length = seq_len ds_config.heads = heads ds_config.attn_dropout_ratio = 0.0 @@ -333,4 +313,4 @@ def test_forward_stochastic(batch_size, ds_config.fp16 = use_fp16 ds_config.stochastic_mode = True - run_forward(ds_config, seq_len, atol=7e-2) + run_forward(ds_config, atol=7e-2) From 24a3d24982d908cb717ae9750b0140749dc16f9f Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Fri, 19 Jun 2020 15:43:34 +0000 Subject: [PATCH 22/30] backward-test fixed --- deepspeed/ops/transformer/transformer.py | 18 +- tests/unit/test_cuda_backward.py | 4 +- tests/unit/test_cuda_forward.py | 318 +++++++++++++++++++++++ 3 files changed, 329 insertions(+), 11 deletions(-) diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py index e8ba79a5f260..3d17201d8811 100755 --- a/deepspeed/ops/transformer/transformer.py +++ b/deepspeed/ops/transformer/transformer.py @@ -536,16 +536,14 @@ def 
init_transformer_weights(self, adjust_init_range=False): self.norm_b.data.zero_() #def forward(self, input, input_mask, grads=None): - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - output_attentions=False, - grads=None - ): + def forward(self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=False, + grads=None): self.config.training = self.training self.config.is_grad_enabled = torch.is_grad_enabled() return DeepSpeedTransformerFunction.apply(hidden_states, diff --git a/tests/unit/test_cuda_backward.py b/tests/unit/test_cuda_backward.py index c1d7e4dcec06..c0783669e6ed 100755 --- a/tests/unit/test_cuda_backward.py +++ b/tests/unit/test_cuda_backward.py @@ -121,7 +121,9 @@ def custom_forward(*inputs): # decoder layers else: for i, layer_module in enumerate(self.layer): - hidden_states = layer_module(hidden_states, attention_mask, grads=self.grads) + hidden_states = layer_module(hidden_states, + attention_mask, + grads=self.grads) hidden_states.register_hook( lambda x, self=self: self.grads.append([x, diff --git a/tests/unit/test_cuda_forward.py b/tests/unit/test_cuda_forward.py index a5c035f342d2..76830c95a8da 100755 --- a/tests/unit/test_cuda_forward.py +++ b/tests/unit/test_cuda_forward.py @@ -1,3 +1,4 @@ +<<<<<<< HEAD import argparse import numpy as np import torch @@ -314,3 +315,320 @@ def test_forward_stochastic(batch_size, ds_config.stochastic_mode = True run_forward(ds_config, atol=7e-2) +======= +import argparse +import numpy as np +import torch +import torch.nn.functional as F +import pytest +import json +import random +import time +import copy +from torch import nn +from modelingpreln import BertEncoder as BertEncoderPreln +from modeling import BertEncoder as BertEncoderPostln +from modeling import BertLayerNorm, BertConfig +from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig + +import sys + + +def check_equal(first, second, atol=1e-2, verbose=False): + if verbose: + print() + for i, (x, y) in enumerate(zip(first, second)): + x = x[0].cpu().detach().numpy() + y = y[0].cpu().detach().numpy() + if verbose: + print("x = {}".format(x.flatten())) + print("y = {}".format(y.flatten())) + print('-' * 80) + np.testing.assert_allclose(x, y, err_msg="Index: {}".format(i), atol=atol) + + +def zero_grad(variables): + for variable in variables: + variable.grad.zero_() + + +device = torch.device("cuda") +kwargs_fp32 = {'dtype': torch.float, 'device': device, 'requires_grad': True} +kwargs_fp16 = {'dtype': torch.half, 'device': device, 'requires_grad': True} + + +class DSEncoder(nn.Module): + def __init__(self, config, weights, biases): + super(DSEncoder, self).__init__() + self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.layer = nn.ModuleList([ + copy.deepcopy(DeepSpeedTransformerLayer(config, + weights, + biases)) + for _ in range(config.num_hidden_layers) + ]) + self.pre_or_post = config.pre_layer_norm + + def forward(self, + hidden_states, + attention_mask, + output_all_encoded_layers=True, + checkpoint_activations=False): + all_encoder_layers = [] + + def custom(start, end): + def custom_forward(*inputs): + layers = self.layer[start:end] + x_ = inputs[0] + for layer in layers: + x_ = layer(x_, inputs[1]) + return x_ + + return custom_forward + + if checkpoint_activations: + l = 0 + num_layers = len(self.layer) + chunk_length = math.ceil(math.sqrt(num_layers)) 
+ while l < num_layers: + hidden_states = checkpoint.checkpoint(custom(l, + l + chunk_length), + hidden_states, + attention_mask * 1) + l += chunk_length + # decoder layers + else: + for i, layer_module in enumerate(self.layer): + hidden_states = layer_module(hidden_states, attention_mask) + + if output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + + if not output_all_encoded_layers or checkpoint_activations: + if (self.pre_or_post): + hidden_states = self.FinalLayerNorm(hidden_states) + all_encoder_layers.append(hidden_states) + return all_encoder_layers + + +def create_models(ds_config): + bert_config = BertConfig(vocab_size_or_config_json_file=119547, + hidden_size=ds_config.hidden_size, + num_hidden_layers=ds_config.num_hidden_layers, + num_attention_heads=ds_config.heads, + batch_size=ds_config.batch_size, + intermediate_size=4 * ds_config.hidden_size, + hidden_act="gelu", + hidden_dropout_prob=ds_config.hidden_dropout_ratio, + attention_probs_dropout_prob=ds_config.attn_dropout_ratio, + max_position_embeddings=ds_config.max_seq_length, + type_vocab_size=2, + initializer_range=ds_config.initializer_range, + fp16=ds_config.fp16) + + weights = [] + biases = [] + + for i in range(4): + weights.append( + nn.Parameter(torch.Tensor(ds_config.hidden_size, + ds_config.hidden_size))) + weights[i].data.normal_(mean=0.0, std=ds_config.initializer_range) + + weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) + weights[4].data.fill_(1.0) + weights.append( + nn.Parameter(torch.Tensor(4 * ds_config.hidden_size, + ds_config.hidden_size))) + weights[5].data.normal_(mean=0.0, std=ds_config.initializer_range) + weights.append( + nn.Parameter(torch.Tensor(ds_config.hidden_size, + 4 * ds_config.hidden_size))) + weights[6].data.normal_(mean=0.0, std=ds_config.initializer_range) + weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) + weights[7].data.fill_(1.0) + + biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) + biases[0].data.zero_() + for i in range(4): + biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) + biases[i + 1].data.zero_() + biases.append(nn.Parameter(torch.Tensor(4 * ds_config.hidden_size))) + biases[5].data.zero_() + biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) + biases[6].data.zero_() + biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) + biases[7].data.zero_() + + if (ds_config.pre_layer_norm): + bert_encoder = BertEncoderPreln(bert_config, weights, biases) + else: + bert_encoder = BertEncoderPostln(bert_config, weights, biases) + ds_encoder = DSEncoder(ds_config, weights, biases) + + if ds_config.fp16: + bert_encoder.half() + ds_encoder.half() + + bert_encoder.cuda() + ds_encoder.cuda() + + return bert_encoder, ds_encoder + + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + +def run_forward(ds_config, atol=1e-2, verbose=False, test_bsz=None): + set_seed(123) + bert_encoder, ds_encoder = create_models(ds_config) + + bsz = ds_config.batch_size if test_bsz is None else test_bsz + + # prepare test data + kwargs = kwargs_fp16 if ds_config.fp16 else kwargs_fp32 + hidden_states = torch.randn(bsz, + ds_config.max_seq_length, + ds_config.hidden_size, + **kwargs) + input_mask = torch.randn(bsz, 1, 1, ds_config.max_seq_length, **kwargs) + + # run baseline + base_results = bert_encoder(hidden_states, + input_mask, + output_all_encoded_layers=False, + checkpoint_activations=False) + + # run ds + ds_results = ds_encoder(hidden_states, + 
input_mask, + output_all_encoded_layers=False, + checkpoint_activations=False) + + # check forward evaluation + check_equal(base_results, ds_results, atol=atol, verbose=verbose) + + +# FP16 test cases can only run on the devices support FP16. +@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', + [ + (64,1024,128,16,3,True,False), + (64,1024,128,16,3,True,True), + (8,1024,384,16,3,True,False), + (8,1024,384,16,3,True,True), + (8,1024,512,16,3,True,False), + (8,1024,512,16,3,True,True), + (64,1024,128,16,3,False,False), + (64,1024,128,16,3,False,True), + (8,1024,384,16,3,False,False), + (8,1024,384,16,3,False,True), + (8,1024,512,16,3,False,False), + (8,1024,512,16,3,False,True), + (8,1536,128,24,3,False,False), + (8,1536,128,24,3,False,True), + (8,2048,128,32,3,False,False), + (8,2048,128,32,3,False,True), + (8,2560,128,40,3,False,False), + (8,2560,128,40,3,False,True), + ]) # yapf: disable +def test_forward(batch_size, + hidden_size, + seq_len, + heads, + num_layers, + is_preln, + use_fp16): + # Only run fp16 test cases on devices with 7+ capability. + major, _ = torch.cuda.get_device_capability() + if major < 7 and use_fp16 is True: + return + + ds_config = DeepSpeedTransformerConfig() + ds_config.layer_id = None + ds_config.batch_size = batch_size + ds_config.hidden_size = hidden_size + ds_config.max_seq_length = seq_len + ds_config.heads = heads + ds_config.attn_dropout_ratio = 0.0 + ds_config.hidden_dropout_ratio = 0.0 + ds_config.num_hidden_layers = num_layers + ds_config.pre_layer_norm = is_preln + ds_config.initializer_range = 0.02 + ds_config.fp16 = use_fp16 + + run_forward(ds_config, atol=2e-2) + + +@pytest.mark.parametrize('batch_size, small_bsz, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', + [ + (8,3,1024,512,16,3,True,False), + (8,7,1024,512,16,3,True,True), + (8,3,1024,512,16,3,False,False), + (8,7,1024,512,16,3,False,True), + ]) # yapf: disable +def test_forward_with_small_bsz(batch_size, + small_bsz, + hidden_size, + seq_len, + heads, + num_layers, + is_preln, + use_fp16): + # Only run fp16 test cases on devices with 7+ capability. + major, _ = torch.cuda.get_device_capability() + if major < 7 and use_fp16 is True: + return + + ds_config = DeepSpeedTransformerConfig() + ds_config.layer_id = None + ds_config.batch_size = batch_size + ds_config.hidden_size = hidden_size + ds_config.max_seq_length = seq_len + ds_config.heads = heads + ds_config.attn_dropout_ratio = 0.0 + ds_config.hidden_dropout_ratio = 0.0 + ds_config.num_hidden_layers = num_layers + ds_config.pre_layer_norm = is_preln + ds_config.initializer_range = 0.02 + ds_config.fp16 = use_fp16 + + run_forward(ds_config, atol=2e-2, test_bsz=small_bsz) + +@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', + [ + (64,1024,128,16,3,True,False), + (64,1024,128,16,3,True,True), + (64,1024,128,16,3,False,False), + (64,1024,128,16,3,False,True), + ]) # yapf: disable +def test_forward_stochastic(batch_size, + hidden_size, + seq_len, + heads, + num_layers, + is_preln, + use_fp16): + # Only run fp16 test cases on devices with 7+ capability. 
+ major, _ = torch.cuda.get_device_capability() + if major < 7 and use_fp16 is True: + return + + ds_config = DeepSpeedTransformerConfig() + ds_config.layer_id = None + ds_config.batch_size = batch_size + ds_config.hidden_size = hidden_size + ds_config.max_seq_length = seq_len + ds_config.heads = heads + ds_config.attn_dropout_ratio = 0.0 + ds_config.hidden_dropout_ratio = 0.0 + ds_config.num_hidden_layers = num_layers + ds_config.pre_layer_norm = is_preln + ds_config.initializer_range = 0.02 + ds_config.fp16 = use_fp16 + ds_config.stochastic_mode = True + + run_forward(ds_config, atol=7e-2) +>>>>>>> fca500f... backward-test fixed From 66b4e634bac58c32be11e3102d2165afc6946350 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Thu, 25 Jun 2020 17:58:56 +0000 Subject: [PATCH 23/30] update with replace module style --- deepspeed/__init__.py | 1 + deepspeed/ops/transformer/transformer.py | 31 ++++++++++++++++-------- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index 8ac0aad05562..04eedb203bb8 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -13,6 +13,7 @@ from .runtime.config import DeepSpeedConfig from .runtime.activation_checkpointing import checkpointing from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig +from .module_inject.ds_kernel_inject import replace_transformer_layer, revert_transformer_layer from .utils import log_dist from .pipe import PipelineModule diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py index 3d17201d8811..0e7104a9abba 100755 --- a/deepspeed/ops/transformer/transformer.py +++ b/deepspeed/ops/transformer/transformer.py @@ -89,6 +89,9 @@ class DeepSpeedTransformerConfig(TransformerConfig): that by enabling it, the pretraining tasks such as BERT are not affected and can obtain a high accuracy level. On the other hand, for the downstream tasks, such as fine-tuning, we recommend to turn it off in order to be able to reproduce the same result through the regular kernel execution. + + huggingface: Enbale if using the HuggingFace interface style for sending out the forward results. 
+ """ def __init__(self, batch_size=-1, @@ -108,7 +111,8 @@ def __init__(self, gelu_checkpoint=False, adjust_init_range=True, attn_dropout_checkpoint=False, - stochastic_mode=False): + stochastic_mode=False, + huggingface=False): super(DeepSpeedTransformerConfig, self).__init__( batch_size, @@ -132,6 +136,7 @@ def __init__(self, self.is_grad_enabled = True self.attn_dropout_checkpoint = attn_dropout_checkpoint self.stochastic_mode = stochastic_mode + self.huggingface = huggingface @classmethod def from_dict(cls, json_object): @@ -303,7 +308,10 @@ def forward(ctx, ctx.attn_layer_norm_var = attn_layer_norm_var ctx.layer_norm_var = layer_norm_var - return output + if config.huggingface: + return (output, ) # outputs -> (output) : outputs[0] = output + else: + return output @staticmethod def backward(ctx, grad_output): @@ -536,16 +544,19 @@ def init_transformer_weights(self, adjust_init_range=False): self.norm_b.data.zero_() #def forward(self, input, input_mask, grads=None): - def forward(self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - output_attentions=False, - grads=None): + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=False, + ): self.config.training = self.training self.config.is_grad_enabled = torch.is_grad_enabled() + # disable grad testing for now + grads = None return DeepSpeedTransformerFunction.apply(hidden_states, attention_mask, self, From ee4003446ec951aab6086e0398b084035beac5db Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Thu, 16 Jul 2020 22:10:55 +0000 Subject: [PATCH 24/30] add function to revert from ds kernel -> orig layer --- deepspeed/module_inject/replace_module.py | 120 ++++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100755 deepspeed/module_inject/replace_module.py diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py new file mode 100755 index 000000000000..9471e5bedfbb --- /dev/null +++ b/deepspeed/module_inject/replace_module.py @@ -0,0 +1,120 @@ +import copy +import torch +import deepspeed + + +def replace_transformer_layer(orig_layer_impl, + model, + micro_batch_size, + bert_config, + seed, + max_seq_length, + preln=False, + fp16=True, + huggingface=False): + """ Replace bert-style transformer layers with DeepSpeed's transformer layer + Arguments: + orig_layer_impl (torch.nn.Module): the original transformer layer implementation to look for, + e.g., transformers.modeling_bert.BertLayer. + model (torch.nn.Module): user's nn.module representing their model + micro_batch_size (int): micro batch size per gpu used during training/eval + bert_config (dict): model config containing hidden size, attention heads, etc. + seed (int): random seed value + max_seq_length (int): max sequence length for training + preln (bool): does the original layer implementation do pre or post layer norm? 
+ fp16 (bool): fp16 or fp32 + huggingface (bool): huggingface implementation is unique (supports both encoder/decoder modes) + + Returns: + Updated nn.module with replaced transformer layers + """ + def replace_fn(child): + transformer_config = deepspeed.DeepSpeedTransformerConfig( + batch_size=micro_batch_size, + max_seq_length=max_seq_length, + hidden_size=bert_config.hidden_size, + heads=bert_config.num_attention_heads, + attn_dropout_ratio=bert_config.attention_probs_dropout_prob, + hidden_dropout_ratio=bert_config.hidden_dropout_prob, + num_hidden_layers=bert_config.num_hidden_layers, + initializer_range=bert_config.initializer_range, + seed=seed, + fp16=fp16, + pre_layer_norm=preln, + huggingface=huggingface) + new_module = deepspeed.DeepSpeedTransformerLayer(transformer_config) + + # copy relevant state from child -> new module + qw = child.attention.self.query.weight + qb = child.attention.self.query.bias + kw = child.attention.self.key.weight + kb = child.attention.self.key.bias + vw = child.attention.self.value.weight + vb = child.attention.self.value.bias + + qkvw = torch.cat((qw, kw, vw), 0) + qkvb = torch.cat((qb, kb, vb), 0) + + new_module.attn_qkvw.data = qkvw + new_module.attn_qkvb.data = qkvb + new_module.attn_ow.data = child.attention.output.dense.weight + new_module.attn_ob.data = child.attention.output.dense.bias + if preln: + attention_layernorm = child.PostAttentionLayerNorm + else: + attention_layernorm = child.attention.output.LayerNorm + new_module.attn_nw.data = attention_layernorm.weight + new_module.attn_nb.data = attention_layernorm.bias + if preln: + intermediate_ff = child.intermediate.dense_act + else: + intermediate_ff = child.intermediate.dense + new_module.inter_w.data = intermediate_ff.weight + new_module.inter_b.data = intermediate_ff.bias + new_module.output_w.data = child.output.dense.weight + new_module.output_b.data = child.output.dense.bias + if preln: + transformer_layernorm = child.PreAttentionLayerNorm + else: + transformer_layernorm = child.output.LayerNorm + new_module.norm_w.data = transformer_layernorm.weight + new_module.norm_b.data = transformer_layernorm.bias + return new_module + + return replace_module(model=model, orig_class=orig_layer_impl, replace_fn=replace_fn) + + +def replace_module(model, orig_class, replace_fn): + """ Scan the model for instances of ``orig_clas:`` to replace using ``replace_fn``. + Arguments: + model (torch.nn.Module): the model to augment + orig_class (torch.nn.Module): the module to search for + replace_fn (method): a method to convert instances of ``orig_class`` to the + desired type and return a new instance. + + Returns: + A modified ``model``. + """ + policy = {orig_class: replace_fn} + return _replace_module(model, policy) + + +def _replace_module(model, policies): + """ Traverse model's children recursively and apply any transformations in ``policies``. + Arguments: + model (torch.nn.Module): model to augment + policies (dict): Mapping of source class to replacement function. + + Returns: + Modified ``model``. 
+ """ + for name, child in model.named_children(): + if child.__class__ in policies: + orig = repr(child) + setattr(model, name, policies[child.__class__](child)) + new = getattr(model, name) + print(f'{orig} -> {new}') + else: + _replace_module(child, policies) + + return model From e982c65d8104cd34d878b5d9dcee147da3356683 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Fri, 17 Jul 2020 15:42:33 +0000 Subject: [PATCH 25/30] add code from Elton to do ds kernel -> orig layer conversion --- deepspeed/module_inject/replace_module.py | 79 +++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 9471e5bedfbb..ea6f73db0487 100755 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -84,6 +84,85 @@ def replace_fn(child): return replace_module(model=model, orig_class=orig_layer_impl, replace_fn=replace_fn) +def revert_transformer_layer(orig_layer_impl, + model, + micro_batch_size, + bert_config, + seed, + max_seq_length, + preln=False, + fp16=True, + huggingface=False): + """ Revert DeepSpeed's transformer layer back to original bert-style transformer layer + Arguments: + orig_layer_impl (torch.nn.Module): the original transformer layer implementation that was replaced, + e.g., transformers.modeling_bert.BertLayer. + model (torch.nn.Module): user's nn.module representing their model + micro_batch_size (int): micro batch size per gpu used during training/eval + bert_config (dict): model config containing hidden size, attention heads, etc. + seed (int): random seed value + max_seq_length (int): max sequence length for training + preln (bool): does the original layer implementation do pre or post layer norm? + fp16 (bool): fp16 or fp32 + huggingface (bool): huggingface implementation is unique (supports both encoder/decoder modes) + + Returns: + Updated nn.module with original bert-style transformer layers + """ + def replace_fn(child): + #from turing.nvidia_modelingpreln import BertLayer + orig_module = orig_layer_impl(bert_config) + + # copy relevant state from child -> original module + qkvw = child.attn_qkvw.data + qkvb = child.attn_qkvb.data + + qw, kw, vw = torch.chunk(qkvw, 3, axis=0) + qb, kb, vb = torch.chunk(qkvb, 3, axis=0) + + orig_module.attention.self.query.weight = qw + orig_module.attention.self.query.bias = qb + orig_module.attention.self.key.weight = kw + orig_module.attention.self.key.bias = kb + orig_module.attention.self.value.weight = vw + orig_module.attention.self.value.bias = vb + + orig_module.attention.output.dense.weight = child.attn_ow.data + orig_module.attention.output.dense.bias = child.attn_ob.data + + attn_ln_w = child.attn_nw.data + attn_ln_b = child.attn_nb.data + if preln: + orig_module.PostAttentionLayerNorm.weight = attn_ln_w + orig_module.PostAttentionLayerNorm.bias = attn_ln_b + else: + orig_module.attention.output.LayerNorm.weight = attn_ln_w + orig_module.attention.output.LayerNorm.bias = attn_ln_b + + inter_ff_w = child.inter_w.data + inter_ff_b = child.inter_b.data + if preln: + orig_module.intermediate.dense_act.weight = inter_ff_w + orig_module.intermediate.dense_act.bias = inter_ff_b + else: + orig_module.intermediate.dense.weight = inter_ff_w + orig_module.intermediate.dense.bias = inter_ff_b + + orig_module.output.dense.weight = child.output_w.data + orig_module.output.dense.bias = child.output_b.data + + transformer_ln_w = child.norm_w.data + transformer_ln_b = child.norm_b.data + if preln: + 
orig_module.PreAttentionLayerNorm.weight = transformer_ln_w + orig_module.PreAttentionLayerNorm.bias = transformer_ln_b + else: + orig_module.output.LayerNorm.weight = transformer_ln_w + orig_module.output.LayerNorm.bias = transformer_ln_b + return orig_module + + return replace_module(model=model, orig_class=deepspeed.DeepSpeedTransformerLayer, replace_fn=replace_fn) + def replace_module(model, orig_class, replace_fn): """ Scan the model for instances of ``orig_clas:`` to replace using ``replace_fn``. Arguments: From fd4d0bc1c444e57bfc27a55e1291fdc2c900993e Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Fri, 17 Jul 2020 15:54:22 +0000 Subject: [PATCH 26/30] formatting --- deepspeed/module_inject/replace_module.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index ea6f73db0487..90523600bb7e 100755 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -161,7 +161,10 @@ def replace_fn(child): orig_module.output.LayerNorm.bias = transformer_ln_b return orig_module - return replace_module(model=model, orig_class=deepspeed.DeepSpeedTransformerLayer, replace_fn=replace_fn) + return replace_module(model=model, + orig_class=deepspeed.DeepSpeedTransformerLayer, + replace_fn=replace_fn) + def replace_module(model, orig_class, replace_fn): """ Scan the model for instances of ``orig_clas:`` to replace using ``replace_fn``. From e332d618daf08a4f22cb58e635d0bf447573b2ae Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Wed, 22 Jul 2020 22:43:33 +0000 Subject: [PATCH 27/30] update replace to fix runtime errors --- deepspeed/__init__.py | 10 +++++ deepspeed/module_inject/replace_module.py | 49 +++++++++-------------- 2 files changed, 28 insertions(+), 31 deletions(-) diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index 04eedb203bb8..eb92190de2d7 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -4,7 +4,17 @@ import sys import types +<<<<<<< HEAD from . import ops +======= +from deepspeed.pt.deepspeed_light import DeepSpeedLight +from deepspeed.pt.deepspeed_light import ADAM_OPTIMIZER, LAMB_OPTIMIZER +from deepspeed.pt.deepspeed_lr_schedules import add_tuning_arguments +from deepspeed.pt.log_utils import logger +from deepspeed.pt.deepspeed_cuda import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig +from deepspeed.pt.deepspeed_config import DeepSpeedConfig +from deepspeed.pt.replace_module import replace_transformer_layer, revert_transformer_layer +>>>>>>> 3161565... 
update replace to fix runtime errors from .runtime.engine import DeepSpeedEngine from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 90523600bb7e..023c60aa9eea 100755 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -84,27 +84,13 @@ def replace_fn(child): return replace_module(model=model, orig_class=orig_layer_impl, replace_fn=replace_fn) -def revert_transformer_layer(orig_layer_impl, - model, - micro_batch_size, - bert_config, - seed, - max_seq_length, - preln=False, - fp16=True, - huggingface=False): +def revert_transformer_layer(orig_layer_impl, model, bert_config, preln=False): """ Revert DeepSpeed's transformer layer back to original bert-style transformer layer Arguments: orig_layer_impl (torch.nn.Module): the original transformer layer implementation that was replaced, e.g., transformers.modeling_bert.BertLayer. model (torch.nn.Module): user's nn.module representing their model - micro_batch_size (int): micro batch size per gpu used during training/eval bert_config (dict): model config containing hidden size, attention heads, etc. - seed (int): random seed value - max_seq_length (int): max sequence length for training - preln (bool): does the original layer implementation do pre or post layer norm? - fp16 (bool): fp16 or fp32 - huggingface (bool): huggingface implementation is unique (supports both encoder/decoder modes) Returns: Updated nn.module with original bert-style transformer layers @@ -120,18 +106,19 @@ def replace_fn(child): qw, kw, vw = torch.chunk(qkvw, 3, axis=0) qb, kb, vb = torch.chunk(qkvb, 3, axis=0) - orig_module.attention.self.query.weight = qw - orig_module.attention.self.query.bias = qb - orig_module.attention.self.key.weight = kw - orig_module.attention.self.key.bias = kb - orig_module.attention.self.value.weight = vw - orig_module.attention.self.value.bias = vb + orig_module.attention.self.query.weight = torch.nn.Parameter(qw) + orig_module.attention.self.query.bias = torch.nn.Parameter(qb) + orig_module.attention.self.key.weight = torch.nn.Parameter(kw) + orig_module.attention.self.key.bias = torch.nn.Parameter(kb) + orig_module.attention.self.value.weight = torch.nn.Parameter(vw) + orig_module.attention.self.value.bias = torch.nn.Parameter(vb) - orig_module.attention.output.dense.weight = child.attn_ow.data - orig_module.attention.output.dense.bias = child.attn_ob.data + orig_module.attention.output.dense.weight = torch.nn.Parameter( + child.attn_ow.data) + orig_module.attention.output.dense.bias = torch.nn.Parameter(child.attn_ob.data) - attn_ln_w = child.attn_nw.data - attn_ln_b = child.attn_nb.data + attn_ln_w = torch.nn.Parameter(child.attn_nw.data) + attn_ln_b = torch.nn.Parameter(child.attn_nb.data) if preln: orig_module.PostAttentionLayerNorm.weight = attn_ln_w orig_module.PostAttentionLayerNorm.bias = attn_ln_b @@ -139,8 +126,8 @@ def replace_fn(child): orig_module.attention.output.LayerNorm.weight = attn_ln_w orig_module.attention.output.LayerNorm.bias = attn_ln_b - inter_ff_w = child.inter_w.data - inter_ff_b = child.inter_b.data + inter_ff_w = torch.nn.Parameter(child.inter_w.data) + inter_ff_b = torch.nn.Parameter(child.inter_b.data) if preln: orig_module.intermediate.dense_act.weight = inter_ff_w orig_module.intermediate.dense_act.bias = inter_ff_b @@ -148,11 +135,11 @@ def replace_fn(child): orig_module.intermediate.dense.weight = inter_ff_w orig_module.intermediate.dense.bias = 
inter_ff_b - orig_module.output.dense.weight = child.output_w.data - orig_module.output.dense.bias = child.output_b.data + orig_module.output.dense.weight = torch.nn.Parameter(child.output_w.data) + orig_module.output.dense.bias = torch.nn.Parameter(child.output_b.data) - transformer_ln_w = child.norm_w.data - transformer_ln_b = child.norm_b.data + transformer_ln_w = torch.nn.Parameter(child.norm_w.data) + transformer_ln_b = torch.nn.Parameter(child.norm_b.data) if preln: orig_module.PreAttentionLayerNorm.weight = transformer_ln_w orig_module.PreAttentionLayerNorm.bias = transformer_ln_b From 581425418034ad804c88b6e6a2d6800695039eb6 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Fri, 4 Dec 2020 03:29:19 +0000 Subject: [PATCH 28/30] rebase-complete --- deepspeed/__init__.py | 12 +- deepspeed/module_inject/replace_module.py | 60 ++-- deepspeed/ops/transformer/transformer.py | 28 +- tests/unit/test_cuda_forward.py | 366 ++-------------------- 4 files changed, 78 insertions(+), 388 deletions(-) diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index eb92190de2d7..85acb0bf2e89 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -4,17 +4,7 @@ import sys import types -<<<<<<< HEAD from . import ops -======= -from deepspeed.pt.deepspeed_light import DeepSpeedLight -from deepspeed.pt.deepspeed_light import ADAM_OPTIMIZER, LAMB_OPTIMIZER -from deepspeed.pt.deepspeed_lr_schedules import add_tuning_arguments -from deepspeed.pt.log_utils import logger -from deepspeed.pt.deepspeed_cuda import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig -from deepspeed.pt.deepspeed_config import DeepSpeedConfig -from deepspeed.pt.replace_module import replace_transformer_layer, revert_transformer_layer ->>>>>>> 3161565... update replace to fix runtime errors from .runtime.engine import DeepSpeedEngine from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER @@ -23,7 +13,7 @@ from .runtime.config import DeepSpeedConfig from .runtime.activation_checkpointing import checkpointing from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig -from .module_inject.ds_kernel_inject import replace_transformer_layer, revert_transformer_layer +from .module_inject.replace_module import replace_transformer_layer, revert_transformer_layer from .utils import log_dist from .pipe import PipelineModule diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 023c60aa9eea..8b4a1dbaeb84 100755 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -55,6 +55,9 @@ def replace_fn(child): qkvw = torch.cat((qw, kw, vw), 0) qkvb = torch.cat((qb, kb, vb), 0) + #qw.data,kw.data,vw.data = torch.chunk(qkvw, 3, axis=0) + #qb.data,kb.data,vb.data = torch.chunk(qkvb, 3, axis=0) + new_module.attn_qkvw.data = qkvw new_module.attn_qkvb.data = qkvb new_module.attn_ow.data = child.attention.output.dense.weight @@ -106,46 +109,45 @@ def replace_fn(child): qw, kw, vw = torch.chunk(qkvw, 3, axis=0) qb, kb, vb = torch.chunk(qkvb, 3, axis=0) - orig_module.attention.self.query.weight = torch.nn.Parameter(qw) - orig_module.attention.self.query.bias = torch.nn.Parameter(qb) - orig_module.attention.self.key.weight = torch.nn.Parameter(kw) - orig_module.attention.self.key.bias = torch.nn.Parameter(kb) - orig_module.attention.self.value.weight = torch.nn.Parameter(vw) - orig_module.attention.self.value.bias = torch.nn.Parameter(vb) + orig_module.attention.self.query.weight.data = qw + 
orig_module.attention.self.query.bias.data = qb + orig_module.attention.self.key.weight.data = kw + orig_module.attention.self.key.bias.data = kb + orig_module.attention.self.value.weight.data = vw + orig_module.attention.self.value.bias.data = vb - orig_module.attention.output.dense.weight = torch.nn.Parameter( - child.attn_ow.data) - orig_module.attention.output.dense.bias = torch.nn.Parameter(child.attn_ob.data) + orig_module.attention.output.dense.weight.data = child.attn_ow.data + orig_module.attention.output.dense.bias.data = child.attn_ob.data - attn_ln_w = torch.nn.Parameter(child.attn_nw.data) - attn_ln_b = torch.nn.Parameter(child.attn_nb.data) + attn_ln_w = child.attn_nw.data + attn_ln_b = child.attn_nb.data if preln: - orig_module.PostAttentionLayerNorm.weight = attn_ln_w - orig_module.PostAttentionLayerNorm.bias = attn_ln_b + orig_module.PostAttentionLayerNorm.weight.data = attn_ln_w + orig_module.PostAttentionLayerNorm.bias.data = attn_ln_b else: - orig_module.attention.output.LayerNorm.weight = attn_ln_w - orig_module.attention.output.LayerNorm.bias = attn_ln_b + orig_module.attention.output.LayerNorm.weight.data = attn_ln_w + orig_module.attention.output.LayerNorm.bias.data = attn_ln_b - inter_ff_w = torch.nn.Parameter(child.inter_w.data) - inter_ff_b = torch.nn.Parameter(child.inter_b.data) + inter_ff_w = child.inter_w.data + inter_ff_b = child.inter_b.data if preln: - orig_module.intermediate.dense_act.weight = inter_ff_w - orig_module.intermediate.dense_act.bias = inter_ff_b + orig_module.intermediate.dense_act.weight.data = inter_ff_w + orig_module.intermediate.dense_act.bias.data = inter_ff_b else: - orig_module.intermediate.dense.weight = inter_ff_w - orig_module.intermediate.dense.bias = inter_ff_b + orig_module.intermediate.dense.weight.data = inter_ff_w + orig_module.intermediate.dense.bias.data = inter_ff_b - orig_module.output.dense.weight = torch.nn.Parameter(child.output_w.data) - orig_module.output.dense.bias = torch.nn.Parameter(child.output_b.data) + orig_module.output.dense.weight.data = child.output_w.data + orig_module.output.dense.bias.data = child.output_b.data - transformer_ln_w = torch.nn.Parameter(child.norm_w.data) - transformer_ln_b = torch.nn.Parameter(child.norm_b.data) + transformer_ln_w = child.norm_w.data + transformer_ln_b = child.norm_b.data if preln: - orig_module.PreAttentionLayerNorm.weight = transformer_ln_w - orig_module.PreAttentionLayerNorm.bias = transformer_ln_b + orig_module.PreAttentionLayerNorm.weight.data = transformer_ln_w + orig_module.PreAttentionLayerNorm.bias.data = transformer_ln_b else: - orig_module.output.LayerNorm.weight = transformer_ln_w - orig_module.output.LayerNorm.bias = transformer_ln_b + orig_module.output.LayerNorm.weight.data = transformer_ln_w + orig_module.output.LayerNorm.bias.data = transformer_ln_b return orig_module return replace_module(model=model, diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py index 0e7104a9abba..6937ec91ee6d 100755 --- a/deepspeed/ops/transformer/transformer.py +++ b/deepspeed/ops/transformer/transformer.py @@ -91,7 +91,6 @@ class DeepSpeedTransformerConfig(TransformerConfig): to turn it off in order to be able to reproduce the same result through the regular kernel execution. huggingface: Enbale if using the HuggingFace interface style for sending out the forward results. 
- """ def __init__(self, batch_size=-1, @@ -429,10 +428,10 @@ def backward(ctx, grad_output): class DeepSpeedTransformerLayer(nn.Module): """Initialize the DeepSpeed Transformer Layer. - Static variable: - layer_id: The layer-index counter starting from 0 and incrementing by 1 every time a layer object is instantiated, - e.g. if a model has 24 transformer layers, layer_id goes from 0 to 23. Arguments: + layer_id: The layer index starting from 0, e.g. if model has 24 transformer layers, + layer_id will be 0,1,2...23 when each layer object is instantiated + config: An object of DeepSpeedTransformerConfig initial_weights: Optional: Only used for unit test @@ -446,6 +445,7 @@ def __init__(self, config, initial_weights=None, initial_biases=None): self.config = config self.config.layer_id = DeepSpeedTransformerLayer.layer_id + self.config.layer_id = DeepSpeedTransformerLayer.layer_id DeepSpeedTransformerLayer.layer_id = DeepSpeedTransformerLayer.layer_id + 1 print("DeepSpeed Transformer config is ", self.config.__dict__) @@ -543,20 +543,16 @@ def init_transformer_weights(self, adjust_init_range=False): self.norm_w.data.fill_(1.0) self.norm_b.data.zero_() - #def forward(self, input, input_mask, grads=None): - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - output_attentions=False, - ): + def forward(self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=False, + grads=None): self.config.training = self.training self.config.is_grad_enabled = torch.is_grad_enabled() - # disable grad testing for now - grads = None return DeepSpeedTransformerFunction.apply(hidden_states, attention_mask, self, diff --git a/tests/unit/test_cuda_forward.py b/tests/unit/test_cuda_forward.py index 76830c95a8da..393e84f21b0a 100755 --- a/tests/unit/test_cuda_forward.py +++ b/tests/unit/test_cuda_forward.py @@ -1,4 +1,3 @@ -<<<<<<< HEAD import argparse import numpy as np import torch @@ -13,9 +12,13 @@ from modeling import BertEncoder as BertEncoderPostln from modeling import BertLayerNorm, BertConfig from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig +import deepspeed import sys +#if not deepspeed.ops.__installed_ops__['transformer']: +# pytest.skip("transformer kernels are not installed", allow_module_level=True) + def check_equal(first, second, atol=1e-2, verbose=False): if verbose: @@ -50,6 +53,7 @@ def __init__(self, config, weights, biases): biases)) for _ in range(config.num_hidden_layers) ]) + self.grads = [] self.pre_or_post = config.pre_layer_norm def forward(self, @@ -94,14 +98,13 @@ def custom_forward(*inputs): return all_encoder_layers - def create_models(ds_config): bert_config = BertConfig(vocab_size_or_config_json_file=119547, hidden_size=ds_config.hidden_size, num_hidden_layers=ds_config.num_hidden_layers, num_attention_heads=ds_config.heads, batch_size=ds_config.batch_size, - intermediate_size=4 * ds_config.hidden_size, + intermediate_size=ds_config.intermediate_size, hidden_act="gelu", hidden_dropout_prob=ds_config.hidden_dropout_ratio, attention_probs_dropout_prob=ds_config.attn_dropout_ratio, @@ -122,12 +125,12 @@ def create_models(ds_config): weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) weights[4].data.fill_(1.0) weights.append( - nn.Parameter(torch.Tensor(4 * ds_config.hidden_size, + nn.Parameter(torch.Tensor(ds_config.intermediate_size, ds_config.hidden_size))) 
weights[5].data.normal_(mean=0.0, std=ds_config.initializer_range) weights.append( nn.Parameter(torch.Tensor(ds_config.hidden_size, - 4 * ds_config.hidden_size))) + ds_config.intermediate_size))) weights[6].data.normal_(mean=0.0, std=ds_config.initializer_range) weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) weights[7].data.fill_(1.0) @@ -137,7 +140,7 @@ def create_models(ds_config): for i in range(4): biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) biases[i + 1].data.zero_() - biases.append(nn.Parameter(torch.Tensor(4 * ds_config.hidden_size))) + biases.append(nn.Parameter(torch.Tensor(ds_config.intermediate_size))) biases[5].data.zero_() biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) biases[6].data.zero_() @@ -166,7 +169,7 @@ def set_seed(seed): torch.manual_seed(seed) -def run_forward(ds_config, atol=1e-2, verbose=False, test_bsz=None): +def run_forward(ds_config, seq_len, atol=1e-2, verbose=False, test_bsz=None): set_seed(123) bert_encoder, ds_encoder = create_models(ds_config) @@ -175,10 +178,12 @@ def run_forward(ds_config, atol=1e-2, verbose=False, test_bsz=None): # prepare test data kwargs = kwargs_fp16 if ds_config.fp16 else kwargs_fp32 hidden_states = torch.randn(bsz, - ds_config.max_seq_length, + seq_len, #ds_config.max_seq_length, ds_config.hidden_size, **kwargs) - input_mask = torch.randn(bsz, 1, 1, ds_config.max_seq_length, **kwargs) + input_mask = torch.randn(bsz, 1, 1, + seq_len, #ds_config.max_seq_length, + **kwargs) # run baseline base_results = bert_encoder(hidden_states, @@ -199,14 +204,21 @@ def run_forward(ds_config, atol=1e-2, verbose=False, test_bsz=None): # FP16 test cases can only run on the devices support FP16. @pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', [ + (8,256,128,4,3,True,False), + (8,256,128,4,3,True,True), (64,1024,128,16,3,True,False), (64,1024,128,16,3,True,True), (8,1024,384,16,3,True,False), (8,1024,384,16,3,True,True), + (8,1024,384,16,3,True,True), + (8,1024,120,16,3,True,False), + (8,1024,120,16,3,True,True), (8,1024,512,16,3,True,False), (8,1024,512,16,3,True,True), - (64,1024,128,16,3,False,False), - (64,1024,128,16,3,False,True), + (64,1024,56,16,3,False,False), + (64,1024,56,16,3,False,True), + (64,1024,24,16,3,False,False), + (64,1024,24,16,3,False,True), (8,1024,384,16,3,False,False), (8,1024,384,16,3,False,True), (8,1024,512,16,3,False,False), @@ -217,6 +229,10 @@ def run_forward(ds_config, atol=1e-2, verbose=False, test_bsz=None): (8,2048,128,32,3,False,True), (8,2560,128,40,3,False,False), (8,2560,128,40,3,False,True), + (8,128,128,2,3,True,False), + (8,128,128,2,3,True,True), + (8,4096,128,64,3,True,True), + (8,8192,128,64,3,False,True), ]) # yapf: disable def test_forward(batch_size, hidden_size, @@ -234,7 +250,8 @@ def test_forward(batch_size, ds_config.layer_id = None ds_config.batch_size = batch_size ds_config.hidden_size = hidden_size - ds_config.max_seq_length = seq_len + ds_config.max_seq_length = 128 #seq_len + ds_config.intermediate_size = 4 * hidden_size ds_config.heads = heads ds_config.attn_dropout_ratio = 0.0 ds_config.hidden_dropout_ratio = 0.0 @@ -243,7 +260,7 @@ def test_forward(batch_size, ds_config.initializer_range = 0.02 ds_config.fp16 = use_fp16 - run_forward(ds_config, atol=2e-2) + run_forward(ds_config, seq_len, atol=2e-2) @pytest.mark.parametrize('batch_size, small_bsz, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', @@ -270,6 +287,7 @@ def test_forward_with_small_bsz(batch_size, 
ds_config.layer_id = None ds_config.batch_size = batch_size ds_config.hidden_size = hidden_size + ds_config.intermediate_size = 4 * hidden_size ds_config.max_seq_length = seq_len ds_config.heads = heads ds_config.attn_dropout_ratio = 0.0 @@ -279,7 +297,7 @@ def test_forward_with_small_bsz(batch_size, ds_config.initializer_range = 0.02 ds_config.fp16 = use_fp16 - run_forward(ds_config, atol=2e-2, test_bsz=small_bsz) + run_forward(ds_config, seq_len, atol=2e-2, test_bsz=small_bsz) @pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', [ @@ -304,6 +322,7 @@ def test_forward_stochastic(batch_size, ds_config.layer_id = None ds_config.batch_size = batch_size ds_config.hidden_size = hidden_size + ds_config.intermediate_size = 4 * hidden_size ds_config.max_seq_length = seq_len ds_config.heads = heads ds_config.attn_dropout_ratio = 0.0 @@ -314,321 +333,4 @@ def test_forward_stochastic(batch_size, ds_config.fp16 = use_fp16 ds_config.stochastic_mode = True - run_forward(ds_config, atol=7e-2) -======= -import argparse -import numpy as np -import torch -import torch.nn.functional as F -import pytest -import json -import random -import time -import copy -from torch import nn -from modelingpreln import BertEncoder as BertEncoderPreln -from modeling import BertEncoder as BertEncoderPostln -from modeling import BertLayerNorm, BertConfig -from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig - -import sys - - -def check_equal(first, second, atol=1e-2, verbose=False): - if verbose: - print() - for i, (x, y) in enumerate(zip(first, second)): - x = x[0].cpu().detach().numpy() - y = y[0].cpu().detach().numpy() - if verbose: - print("x = {}".format(x.flatten())) - print("y = {}".format(y.flatten())) - print('-' * 80) - np.testing.assert_allclose(x, y, err_msg="Index: {}".format(i), atol=atol) - - -def zero_grad(variables): - for variable in variables: - variable.grad.zero_() - - -device = torch.device("cuda") -kwargs_fp32 = {'dtype': torch.float, 'device': device, 'requires_grad': True} -kwargs_fp16 = {'dtype': torch.half, 'device': device, 'requires_grad': True} - - -class DSEncoder(nn.Module): - def __init__(self, config, weights, biases): - super(DSEncoder, self).__init__() - self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - self.layer = nn.ModuleList([ - copy.deepcopy(DeepSpeedTransformerLayer(config, - weights, - biases)) - for _ in range(config.num_hidden_layers) - ]) - self.pre_or_post = config.pre_layer_norm - - def forward(self, - hidden_states, - attention_mask, - output_all_encoded_layers=True, - checkpoint_activations=False): - all_encoder_layers = [] - - def custom(start, end): - def custom_forward(*inputs): - layers = self.layer[start:end] - x_ = inputs[0] - for layer in layers: - x_ = layer(x_, inputs[1]) - return x_ - - return custom_forward - - if checkpoint_activations: - l = 0 - num_layers = len(self.layer) - chunk_length = math.ceil(math.sqrt(num_layers)) - while l < num_layers: - hidden_states = checkpoint.checkpoint(custom(l, - l + chunk_length), - hidden_states, - attention_mask * 1) - l += chunk_length - # decoder layers - else: - for i, layer_module in enumerate(self.layer): - hidden_states = layer_module(hidden_states, attention_mask) - - if output_all_encoded_layers: - all_encoder_layers.append(hidden_states) - - if not output_all_encoded_layers or checkpoint_activations: - if (self.pre_or_post): - hidden_states = self.FinalLayerNorm(hidden_states) - all_encoder_layers.append(hidden_states) - return 
all_encoder_layers - - -def create_models(ds_config): - bert_config = BertConfig(vocab_size_or_config_json_file=119547, - hidden_size=ds_config.hidden_size, - num_hidden_layers=ds_config.num_hidden_layers, - num_attention_heads=ds_config.heads, - batch_size=ds_config.batch_size, - intermediate_size=4 * ds_config.hidden_size, - hidden_act="gelu", - hidden_dropout_prob=ds_config.hidden_dropout_ratio, - attention_probs_dropout_prob=ds_config.attn_dropout_ratio, - max_position_embeddings=ds_config.max_seq_length, - type_vocab_size=2, - initializer_range=ds_config.initializer_range, - fp16=ds_config.fp16) - - weights = [] - biases = [] - - for i in range(4): - weights.append( - nn.Parameter(torch.Tensor(ds_config.hidden_size, - ds_config.hidden_size))) - weights[i].data.normal_(mean=0.0, std=ds_config.initializer_range) - - weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - weights[4].data.fill_(1.0) - weights.append( - nn.Parameter(torch.Tensor(4 * ds_config.hidden_size, - ds_config.hidden_size))) - weights[5].data.normal_(mean=0.0, std=ds_config.initializer_range) - weights.append( - nn.Parameter(torch.Tensor(ds_config.hidden_size, - 4 * ds_config.hidden_size))) - weights[6].data.normal_(mean=0.0, std=ds_config.initializer_range) - weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - weights[7].data.fill_(1.0) - - biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - biases[0].data.zero_() - for i in range(4): - biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - biases[i + 1].data.zero_() - biases.append(nn.Parameter(torch.Tensor(4 * ds_config.hidden_size))) - biases[5].data.zero_() - biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - biases[6].data.zero_() - biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - biases[7].data.zero_() - - if (ds_config.pre_layer_norm): - bert_encoder = BertEncoderPreln(bert_config, weights, biases) - else: - bert_encoder = BertEncoderPostln(bert_config, weights, biases) - ds_encoder = DSEncoder(ds_config, weights, biases) - - if ds_config.fp16: - bert_encoder.half() - ds_encoder.half() - - bert_encoder.cuda() - ds_encoder.cuda() - - return bert_encoder, ds_encoder - - -def set_seed(seed): - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - - -def run_forward(ds_config, atol=1e-2, verbose=False, test_bsz=None): - set_seed(123) - bert_encoder, ds_encoder = create_models(ds_config) - - bsz = ds_config.batch_size if test_bsz is None else test_bsz - - # prepare test data - kwargs = kwargs_fp16 if ds_config.fp16 else kwargs_fp32 - hidden_states = torch.randn(bsz, - ds_config.max_seq_length, - ds_config.hidden_size, - **kwargs) - input_mask = torch.randn(bsz, 1, 1, ds_config.max_seq_length, **kwargs) - - # run baseline - base_results = bert_encoder(hidden_states, - input_mask, - output_all_encoded_layers=False, - checkpoint_activations=False) - - # run ds - ds_results = ds_encoder(hidden_states, - input_mask, - output_all_encoded_layers=False, - checkpoint_activations=False) - - # check forward evaluation - check_equal(base_results, ds_results, atol=atol, verbose=verbose) - - -# FP16 test cases can only run on the devices support FP16. 
-@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', - [ - (64,1024,128,16,3,True,False), - (64,1024,128,16,3,True,True), - (8,1024,384,16,3,True,False), - (8,1024,384,16,3,True,True), - (8,1024,512,16,3,True,False), - (8,1024,512,16,3,True,True), - (64,1024,128,16,3,False,False), - (64,1024,128,16,3,False,True), - (8,1024,384,16,3,False,False), - (8,1024,384,16,3,False,True), - (8,1024,512,16,3,False,False), - (8,1024,512,16,3,False,True), - (8,1536,128,24,3,False,False), - (8,1536,128,24,3,False,True), - (8,2048,128,32,3,False,False), - (8,2048,128,32,3,False,True), - (8,2560,128,40,3,False,False), - (8,2560,128,40,3,False,True), - ]) # yapf: disable -def test_forward(batch_size, - hidden_size, - seq_len, - heads, - num_layers, - is_preln, - use_fp16): - # Only run fp16 test cases on devices with 7+ capability. - major, _ = torch.cuda.get_device_capability() - if major < 7 and use_fp16 is True: - return - - ds_config = DeepSpeedTransformerConfig() - ds_config.layer_id = None - ds_config.batch_size = batch_size - ds_config.hidden_size = hidden_size - ds_config.max_seq_length = seq_len - ds_config.heads = heads - ds_config.attn_dropout_ratio = 0.0 - ds_config.hidden_dropout_ratio = 0.0 - ds_config.num_hidden_layers = num_layers - ds_config.pre_layer_norm = is_preln - ds_config.initializer_range = 0.02 - ds_config.fp16 = use_fp16 - - run_forward(ds_config, atol=2e-2) - - -@pytest.mark.parametrize('batch_size, small_bsz, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', - [ - (8,3,1024,512,16,3,True,False), - (8,7,1024,512,16,3,True,True), - (8,3,1024,512,16,3,False,False), - (8,7,1024,512,16,3,False,True), - ]) # yapf: disable -def test_forward_with_small_bsz(batch_size, - small_bsz, - hidden_size, - seq_len, - heads, - num_layers, - is_preln, - use_fp16): - # Only run fp16 test cases on devices with 7+ capability. - major, _ = torch.cuda.get_device_capability() - if major < 7 and use_fp16 is True: - return - - ds_config = DeepSpeedTransformerConfig() - ds_config.layer_id = None - ds_config.batch_size = batch_size - ds_config.hidden_size = hidden_size - ds_config.max_seq_length = seq_len - ds_config.heads = heads - ds_config.attn_dropout_ratio = 0.0 - ds_config.hidden_dropout_ratio = 0.0 - ds_config.num_hidden_layers = num_layers - ds_config.pre_layer_norm = is_preln - ds_config.initializer_range = 0.02 - ds_config.fp16 = use_fp16 - - run_forward(ds_config, atol=2e-2, test_bsz=small_bsz) - -@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', - [ - (64,1024,128,16,3,True,False), - (64,1024,128,16,3,True,True), - (64,1024,128,16,3,False,False), - (64,1024,128,16,3,False,True), - ]) # yapf: disable -def test_forward_stochastic(batch_size, - hidden_size, - seq_len, - heads, - num_layers, - is_preln, - use_fp16): - # Only run fp16 test cases on devices with 7+ capability. - major, _ = torch.cuda.get_device_capability() - if major < 7 and use_fp16 is True: - return - - ds_config = DeepSpeedTransformerConfig() - ds_config.layer_id = None - ds_config.batch_size = batch_size - ds_config.hidden_size = hidden_size - ds_config.max_seq_length = seq_len - ds_config.heads = heads - ds_config.attn_dropout_ratio = 0.0 - ds_config.hidden_dropout_ratio = 0.0 - ds_config.num_hidden_layers = num_layers - ds_config.pre_layer_norm = is_preln - ds_config.initializer_range = 0.02 - ds_config.fp16 = use_fp16 - ds_config.stochastic_mode = True - - run_forward(ds_config, atol=7e-2) ->>>>>>> fca500f... 
backward-test fixed + run_forward(ds_config, seq_len, atol=7e-2) From f48c52a9f81c9d7ffb696367be6d5d77c16b0388 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Fri, 4 Dec 2020 03:36:01 +0000 Subject: [PATCH 29/30] resolve conflict --- deepspeed/ops/transformer/transformer.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py index e5db66f72745..f6baedbbbce8 100755 --- a/deepspeed/ops/transformer/transformer.py +++ b/deepspeed/ops/transformer/transformer.py @@ -449,10 +449,7 @@ def __init__(self, config, initial_weights=None, initial_biases=None): self.config = config self.config.layer_id = DeepSpeedTransformerLayer.layer_id -<<<<<<< HEAD:deepspeed/ops/transformer/transformer.py self.config.layer_id = DeepSpeedTransformerLayer.layer_id -======= ->>>>>>> 798e6d334db49f4eb03d10e7c0808865b7ddb230:deepspeed/pt/deepspeed_cuda.py DeepSpeedTransformerLayer.layer_id = DeepSpeedTransformerLayer.layer_id + 1 print("DeepSpeed Transformer config is ", self.config.__dict__) From 0df72a01cd492e5ae6e808effa987730cecc1d57 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Fri, 4 Dec 2020 20:21:55 +0000 Subject: [PATCH 30/30] remove dup line and add local-rank parameter to replace function --- deepspeed/module_inject/replace_module.py | 6 ++++-- deepspeed/ops/transformer/transformer.py | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 8b4a1dbaeb84..38b6dc5491df 100755 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -11,7 +11,8 @@ def replace_transformer_layer(orig_layer_impl, max_seq_length, preln=False, fp16=True, - huggingface=False): + huggingface=False, + local_rank=-1): """ Replace bert-style transformer layers with DeepSpeed's transformer layer Arguments: orig_layer_impl (torch.nn.Module): the original transformer layer implementation to look for, @@ -41,7 +42,8 @@ def replace_fn(child): seed=seed, fp16=fp16, pre_layer_norm=preln, - huggingface=huggingface) + huggingface=huggingface, + local_rank=local_rank) new_module = deepspeed.DeepSpeedTransformerLayer(transformer_config) # copy relevant state from child -> new module diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py index f6baedbbbce8..0775f2efa955 100755 --- a/deepspeed/ops/transformer/transformer.py +++ b/deepspeed/ops/transformer/transformer.py @@ -449,7 +449,6 @@ def __init__(self, config, initial_weights=None, initial_biases=None): self.config = config self.config.layer_id = DeepSpeedTransformerLayer.layer_id - self.config.layer_id = DeepSpeedTransformerLayer.layer_id DeepSpeedTransformerLayer.layer_id = DeepSpeedTransformerLayer.layer_id + 1 print("DeepSpeed Transformer config is ", self.config.__dict__)
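
Editor's note on the forward-test changes above: the test patch decouples two things that were previously tied together — `intermediate_size` no longer has to be hard-coded as `4 * hidden_size`, and the sequence length fed to the kernel at run time can differ from the configured `max_seq_length`. The sketch below mirrors how the unit tests populate the config by attribute assignment; it is illustrative only, the helper name `make_config` is hypothetical, and actually running it assumes a CUDA device with the DeepSpeed transformer op built.

```python
import torch
from deepspeed import DeepSpeedTransformerConfig, DeepSpeedTransformerLayer

def make_config(hidden_size=1024, heads=16, max_seq_length=128, batch_size=8, fp16=False):
    # Populate the config by attribute assignment, as the unit tests do.
    cfg = DeepSpeedTransformerConfig()
    cfg.layer_id = None
    cfg.batch_size = batch_size
    cfg.hidden_size = hidden_size
    # After this patch series the FFN width is its own field; it no longer
    # has to be exactly 4 * hidden_size.
    cfg.intermediate_size = 4 * hidden_size
    cfg.max_seq_length = max_seq_length
    cfg.heads = heads
    cfg.attn_dropout_ratio = 0.0
    cfg.hidden_dropout_ratio = 0.0
    cfg.num_hidden_layers = 1
    cfg.pre_layer_norm = True
    cfg.initializer_range = 0.02
    cfg.fp16 = fp16
    return cfg

cfg = make_config()
layer = DeepSpeedTransformerLayer(cfg).cuda()

# Run with a sequence length (56) shorter than cfg.max_seq_length (128),
# as the updated run_forward(ds_config, seq_len, ...) helper now exercises.
seq_len = 56
hidden_states = torch.randn(cfg.batch_size, seq_len, cfg.hidden_size, device="cuda")
input_mask = torch.randn(cfg.batch_size, 1, 1, seq_len, device="cuda")
out = layer(hidden_states, input_mask)
```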
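
The last two patches also tidy up the static `layer_id` counter on `DeepSpeedTransformerLayer`: the conflict resolution had left the `self.config.layer_id = DeepSpeedTransformerLayer.layer_id` assignment duplicated, and the final patch drops the extra copy (the duplicate was harmless but redundant, since the class counter is incremented only once per constructor call). The same patch also threads a `local_rank` argument from `replace_transformer_layer` into the transformer config it builds. A minimal sketch of the counter behaviour, reusing `cfg` from the previous sketch and again assuming a working CUDA build:

```python
import copy
from torch import nn

# Each constructor call stamps the current class-level counter into its own
# config copy and then increments the counter, so layers built in a loop —
# the pattern used by the unit-test encoders — end up with ids 0, 1, 2, ...
layers = nn.ModuleList(
    [copy.deepcopy(DeepSpeedTransformerLayer(cfg)) for _ in range(3)])

print([l.config.layer_id for l in layers])  # expected: [0, 1, 2]
```

Note that deep-copying a single pre-built layer (rather than constructing one per iteration) would reuse one `layer_id` across all copies, which is why the loop form above constructs a fresh layer each time.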