diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py
index 6fd055f0d..0b4cd99dd 100644
--- a/megatron/model/fused_softmax.py
+++ b/megatron/model/fused_softmax.py
@@ -16,7 +16,12 @@
 import torch
 import torch.nn as nn
+
+from megatron import logging
 from megatron.enums import AttnMaskType
+from megatron.model import utils
+
+logger = logging.get_logger(__name__)
 
 
 class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
     """
@@ -134,6 +139,13 @@ def forward(self, input, mask):
     def is_kernel_available(self, mask, b, np, sq, sk):
         attn_batches = b * np
 
+        print("is_kernel_available")
+        print(f"self.scaled_masked_softmax_fusion = {self.scaled_masked_softmax_fusion}")
+        print(f"self.input_in_float16 = {self.input_in_float16}")
+        print(f"mask is not None = {mask is not None}")
+        print(f"sq = {sq}")
+        print(f"sk = {sk}")
+        print(f"attn_batches = {attn_batches}")
         if (
             self.scaled_masked_softmax_fusion  # user want to fuse
             and self.input_in_float16  # input must be fp16
@@ -149,10 +161,12 @@ def is_kernel_available(self, mask, b, np, sq, sk):
                 if attn_batches % batch_per_block == 0:
                     return True
             else:
+                print(sq, batch_per_block)
                 if sq % batch_per_block == 0:
                     return True
         return False
 
+    @utils.log_debug_usage(logger, "Using fused softmax")
     def forward_fused_softmax(self, input, mask):
         b, np, sq, sk = input.size()
         scale = self.scale if self.scale is not None else 1.0
@@ -168,6 +182,7 @@ def forward_fused_softmax(self, input, mask):
             # input is 4D tensor (b, np, sq, sk)
             return ScaledMaskedSoftmax.apply(input, mask, scale)
 
+    @utils.log_debug_usage(logger, "Using torch softmax")
     def forward_torch_softmax(self, input, mask):
         if self.input_in_float16 and self.softmax_in_fp32:
             input = input.float()
diff --git a/tests/test_model.py b/tests/test_model.py
index 189fcce01..dfdebfe7c 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -6,7 +6,7 @@
 import torch
 
 from megatron import initialize_megatron, get_args, get_tokenizer, global_vars
-from megatron.testing_utils import TestCasePlus, mockenv_context
+from megatron.testing_utils import TestCasePlus, mockenv_context, CaptureStdout
 from megatron.training import setup_model_and_optimizer
 from pretrain_gpt import model_provider as gpt_model_provider, get_batch_pipe as get_gpt_batch_pipe
 from pretrain_prefix_lm import model_provider as prefix_lm_model_provider, get_batch_pipe as get_prefix_lm_batch_pipe
@@ -49,6 +49,10 @@ def get_default_args():
         "--checkpoint-activations": "",
 
         # DATA_ARGS
+
+        # LOGGING_ARGS
+        "--log-level": "debug",
+        "--log-level-replica": "info",
     }
 
 
@@ -257,10 +261,16 @@ def test_prefix_lm_wo_reset_attention_mask(self):
             # Make sure that the last prefix token predicts the first token.
             self.assertTrue(loss_mask[batch_id, id -1] == 1)
 
-        model(*input_batch)
+        with CaptureStdout() as cs:
+            model(*input_batch)
+
+        self.assertIn("Using fused softmax", cs.out)
+        self.assertIn("Using torch softmax", cs.out)
 
         #TODO: Check all invariants
 
+
+
     def test_gpt_rotary_embeddings(self):
         """Test rotary embeddings"""
         command_args = get_default_args()
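
Note: `utils.log_debug_usage` is referenced by the new decorators above but is not defined in this patch. A minimal sketch of a decorator with the behavior the new test relies on (emit a debug-level message whenever the wrapped softmax path runs, so that `CaptureStdout` can see it once `--log-level debug` routes logs to stdout) might look like the following; the names and the exact logging call are assumptions, not the repository's actual implementation.

```python
# Hypothetical sketch of megatron.model.utils.log_debug_usage -- an assumption,
# not the implementation shipped with this patch.
import functools
import logging


def log_debug_usage(logger: logging.Logger, msg: str):
    """Return a decorator that logs `msg` at DEBUG level each time the wrapped
    function is called, making the chosen code path observable in test output."""

    def decorator(func):
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            logger.debug(msg)  # e.g. "Using fused softmax" / "Using torch softmax"
            return func(*args, **kwargs)

        return wrapped

    return decorator
```

With `--log-level debug` and a handler that writes to stdout, these messages become visible to `CaptureStdout`, which is what the `assertIn` checks in `test_prefix_lm_wo_reset_attention_mask` exercise.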