diff --git a/README.md b/README.md index e7f61bf20..e63a59f28 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,6 @@ To install the remaining basic dependencies, run: pip install -r requirements/requirements.txt pip install -r requirements/requirements-wandb.txt # optional, if logging using WandB pip install -r requirements/requirements-tensorboard.txt # optional, if logging via tensorboard -python ./megatron/fused_kernels/setup.py install # optional, if using fused kernels ``` from the repository root. @@ -106,6 +105,16 @@ from the repository root. +### Fused Kernels +We now support AMD GPUs (MI100, MI250X) through JIT fused-kernel compilation. Fused kernels will be built and loaded as needed. To avoid waiting at job launch, you can also pre-build them manually as follows: + +```python +python +from megatron.fused_kernels import load +load() +``` +This automatically adapts the build process to the GPU vendor (AMD or NVIDIA) without platform-specific code changes. To further test the fused kernels, run `pytest tests/model/test_fused_kernels.py`. + ### Flash Attention To use [Flash-Attention](https://github.com/HazyResearch/flash-attention), install the additional dependencies in `./requirements/requirements-flashattention.txt` and set the attention type in your configuration accordingly (see [configs](./configs/)). This can provide significant speed-ups over regular attention on certain GPU architectures, including Ampere GPUs (such as A100s); see the repository for more details. @@ -640,7 +649,7 @@ If you need to supply a hostfile for use with the MPI-based DeepSpeed launcher, # Profiling -We support profiling with Nsight Systems and PyTorch Memory Profiling. +We support profiling with Nsight Systems, the PyTorch Profiler, and PyTorch Memory Profiling. 
## Nsight Systems Profiling @@ -656,6 +665,15 @@ The generated output file can then by viewed with the Nsight Systems GUI: ![Alt text](images/nsight_profiling.png) +## PyTorch Profiling + +To use the built-in PyTorch profiler, set config options `profile`, `profile_step_start`, and `profile_step_stop`. + +The PyTorch profiler will save traces to your `tensorboard` log directory. You can view these traces within +TensorBoard by following the steps [here](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). + +![Alt text](images/pytorch_profiling.png) + ## PyTorch Memory Profiling To use PyTorch Memory Profiling, set config options `memory_profiling` and `memory_profiling_path`. diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index e03265bca..1dbb4dd8a 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = fdc395f + Default = b68ba6d current git hash of repository diff --git a/images/pytorch_profiling.png b/images/pytorch_profiling.png new file mode 100644 index 000000000..e85324dc6 Binary files /dev/null and b/images/pytorch_profiling.png differ diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp index aca290854..9b062b050 100644 --- a/megatron/data/helpers.cpp +++ b/megatron/data/helpers.cpp @@ -428,9 +428,9 @@ py::array build_mapping_impl(const py::array_t& docs_, } } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { @@ -660,9 +660,9 @@ py::array build_blocks_mapping_impl(const py::array_t& docs_, num_sent = 0; } } // for (auto sent_index=sent_index_first; ... 
- } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/__init__.py index 1e4c9efac..3694e964b 100644 --- a/megatron/fused_kernels/__init__.py +++ b/megatron/fused_kernels/__init__.py @@ -135,8 +135,8 @@ def _cpp_extention_load_helper( srcpath / "fused_rotary_positional_embedding.cpp", srcpath / "fused_rotary_positional_embedding_cuda.cu", ] - fused_rotary_positional_embedding_cuda = _cpp_extention_load_helper( - "fused_rotary_positional_embedding_cuda", + fused_rotary_positional_embedding = _cpp_extention_load_helper( + "fused_rotary_positional_embedding", sources, extra_cuda_flags, extra_include_paths, @@ -174,7 +174,7 @@ def load_fused_kernels(): print(e) print("=" * 100) print( - f"ERROR: Fused kernels configured but not properly installed. Please run `pip install {str(srcpath)}` to install them" + f"ERROR: Fused kernels configured but not properly installed. 
Please run `from megatron.fused_kernels import load()` then `load()` to load them correctly" ) print("=" * 100) exit() diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index ff4f4bc21..98a444ea4 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -1070,8 +1070,8 @@ def calculate_derived(self): ), "Mamba does not yet have dropout implemented" if "rwkv" in self.attention_config: assert ( - not self.is_pipe_parallel and self.model_parallel_size == 1 - ), "RWKV not currently compatible with parallelism" + self.model_parallel_size == 1 + ), "RWKV not currently compatible with model parallelism" if isinstance(self.zero_stage, int): assert self.zero_stage <= 2, "Zero stage 3 not compatible with RWKV" assert ( diff --git a/megatron/training.py b/megatron/training.py index 6a4e843ab..3265680c5 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -970,7 +970,28 @@ def train( # to monitor if we've skipped many iterations in a row and trigger an early exit overflow_monitor = OverflowMonitor(optimizer) + + if neox_args.profile: + schedule = torch.profiler.schedule( + wait=neox_args.profile_step_start, + warmup=1, + active=neox_args.profile_step_stop - neox_args.profile_step_start, + ) + prof = torch.profiler.profile( + schedule=schedule, + on_trace_ready=torch.profiler.tensorboard_trace_handler( + neox_args.tensorboard_dir + ), + record_shapes=True, + profile_memory=True, + with_flops=True, + with_modules=True, + with_stack=True, + ) + prof.start() while iteration < neox_args.train_iters: + if neox_args.profile: + prof.step() if neox_args.profile and iteration == neox_args.profile_step_start: torch.cuda.cudart().cudaProfilerStart() loss_dict, skipped_iter = train_step( @@ -983,6 +1004,7 @@ def train( ) if neox_args.profile and iteration == neox_args.profile_step_stop: torch.cuda.cudart().cudaProfilerStop() + prof.stop() iteration += 1 neox_args.iteration = iteration if 
neox_args.precision == "fp16": diff --git a/tests/model/test_fused_kernels.py b/tests/model/test_fused_kernels.py index cc458bf4a..125eb6c52 100644 --- a/tests/model/test_fused_kernels.py +++ b/tests/model/test_fused_kernels.py @@ -30,9 +30,7 @@ ) -@pytest.mark.xfail( - reason="ModuleNotFoundError: No module named 'scaled_masked_softmax_cuda'" -) +@pytest.mark.xfail(reason="SystemExit: None") def test_load_fused_kernels(): load() try: