From e95086f3bd33441f3f8bd409ad4ec115b0ee3b78 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Mon, 28 Oct 2024 11:44:20 -0700 Subject: [PATCH] Formatting fixes --- csrc/aio/common/deepspeed_aio_common.cpp | 5 ++--- csrc/aio/py_lib/deepspeed_py_aio.cpp | 10 ++++------ csrc/aio/py_lib/deepspeed_py_aio_handle.cpp | 10 ++++------ csrc/aio/py_lib/deepspeed_py_copy.cpp | 2 +- .../evoformer_attn/gemm_kernel_utils.h | 9 ++++----- csrc/includes/simd.h | 2 +- csrc/xpu/includes/simd.h | 2 +- csrc/xpu/includes/type_shim.h | 10 +++++----- .../v2/model_implementations/phi3small/model.py | 16 ++++++++++++---- 9 files changed, 34 insertions(+), 32 deletions(-) mode change 100755 => 100644 csrc/xpu/includes/simd.h diff --git a/csrc/aio/common/deepspeed_aio_common.cpp b/csrc/aio/common/deepspeed_aio_common.cpp index 0f2895dfa328..a65cc500cc82 100644 --- a/csrc/aio/common/deepspeed_aio_common.cpp +++ b/csrc/aio/common/deepspeed_aio_common.cpp @@ -301,9 +301,8 @@ int regular_read(const char* filename, std::vector& buffer) } while (r > 0); if (read_bytes != num_bytes) { - std::cerr << "read error " - << " read_bytes (read) = " << read_bytes << " num_bytes (fstat) = " << num_bytes - << std::endl; + std::cerr << "read error " << " read_bytes (read) = " << read_bytes + << " num_bytes (fstat) = " << num_bytes << std::endl; } assert(read_bytes == num_bytes); close(fd); diff --git a/csrc/aio/py_lib/deepspeed_py_aio.cpp b/csrc/aio/py_lib/deepspeed_py_aio.cpp index 387b713f2bfc..0556f5aa8168 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio.cpp @@ -72,9 +72,8 @@ int deepspeed_py_aio_write(const torch::Tensor& buffer, const std::chrono::duration fn_time = std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " - << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 - << std::endl; + std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 + << " call = " << fn_time.count() * 1e6 << std::endl; return 0; } @@ -118,8 +117,7 @@ int deepspeed_py_aio_read(torch::Tensor& buffer, const std::chrono::duration fn_time = std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " - << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 - << std::endl; + std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 + << " call = " << fn_time.count() * 1e6 << std::endl; return 0; } diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp index c21e92de9449..23ddabe260d4 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp @@ -93,9 +93,8 @@ int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, co if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); } const std::chrono::duration fn_time = std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " - << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 - << std::endl; + std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 + << " call = " << fn_time.count() * 1e6 << std::endl; return 0; } @@ -128,9 +127,8 @@ int deepspeed_aio_handle_t::write(const torch::Tensor& buffer, const std::chrono::duration fn_time = std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " - << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 - << std::endl; + std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 + << " call = " << fn_time.count() * 1e6 << std::endl; return 0; } diff --git a/csrc/aio/py_lib/deepspeed_py_copy.cpp b/csrc/aio/py_lib/deepspeed_py_copy.cpp index 8a59107dd347..c597b91d05c9 100644 --- a/csrc/aio/py_lib/deepspeed_py_copy.cpp +++ b/csrc/aio/py_lib/deepspeed_py_copy.cpp @@ -10,7 +10,7 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. #include "deepspeed_py_copy.h" #include -#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) +#define ROUND_DOWN(size, step) ((size) & ~((step) - 1)) #if defined(__AVX512__) or defined(__AVX256__) union AVX_Data { diff --git a/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h b/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h index 2a4300c5cac1..c102234a4dfb 100644 --- a/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h +++ b/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h @@ -125,11 +125,10 @@ struct CheckArch { std::cerr << #PTR " is not correctly aligned\n"; \ return false; \ } -#define EVOFORMER_CHECK(COND, ERR) \ - if (!(COND)) { \ - std::cerr << "[Evoformer Attention]" \ - << "'" #COND "' failed: " << ERR << "\n"; \ - return false; \ +#define EVOFORMER_CHECK(COND, ERR) \ + if (!(COND)) { \ + std::cerr << "[Evoformer Attention]" << "'" #COND "' failed: " << ERR << "\n"; \ + return false; \ } #endif diff --git a/csrc/includes/simd.h b/csrc/includes/simd.h index f5bfb45dd2e2..a205026ec7c1 100644 --- a/csrc/includes/simd.h +++ b/csrc/includes/simd.h @@ -27,7 +27,7 @@ inline void writeAs(void* dst, const T& val) std::memcpy(dst, &val, sizeof(T)); } -#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) +#define ROUND_DOWN(size, step) ((size) & ~((step) - 1)) #if defined(__AVX512__) #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) diff --git a/csrc/xpu/includes/simd.h b/csrc/xpu/includes/simd.h old mode 100755 new mode 100644 index f77568be7835..097e2d8585cc --- a/csrc/xpu/includes/simd.h +++ b/csrc/xpu/includes/simd.h @@ -13,7 +13,7 @@ #define TILE (128 * 1024 * 1024) #if defined(__AVX512__) or defined(__AVX256__) -#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) +#define ROUND_DOWN(size, step) ((size) & ~((step) - 1)) #if defined(__AVX512__) #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) diff --git a/csrc/xpu/includes/type_shim.h b/csrc/xpu/includes/type_shim.h index fa41757c895b..1897afd1fea2 100644 --- a/csrc/xpu/includes/type_shim.h +++ b/csrc/xpu/includes/type_shim.h @@ -82,11 +82,11 @@ } template -__inline__ __attribute__((always_inline)) T reduce_block_into_lanes( - T* x, - T val, - int lanes = 1, - bool share_result = false) // lanes is intended to be <= 32. +__inline__ __attribute__((always_inline)) T +reduce_block_into_lanes(T* x, + T val, + int lanes = 1, + bool share_result = false) // lanes is intended to be <= 32. { auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); int tid = item_ct1.get_local_id(2) + item_ct1.get_local_id(1) * item_ct1.get_local_range(2); diff --git a/deepspeed/inference/v2/model_implementations/phi3small/model.py b/deepspeed/inference/v2/model_implementations/phi3small/model.py index 1f1e853fc167..532cb49c80c4 100644 --- a/deepspeed/inference/v2/model_implementations/phi3small/model.py +++ b/deepspeed/inference/v2/model_implementations/phi3small/model.py @@ -112,7 +112,6 @@ def positional_embedding_config(self) -> Optional[RotateHalfConfig]: def mup_embedding_multiplier(self) -> float: return 10.0 - """ Forward implementations """ @@ -160,7 +159,10 @@ def _forward_transformer_layer(self, layer_idx: int, residual: torch.Tensor, hid if self.tp_size > 1: dist.all_reduce(hidden_states, group=self._base_mp_group) - residual, hidden_states = self.norm(residual, hidden_states, cur_params.mlp_norm_gamma, beta=cur_params.mlp_norm_beta) + residual, hidden_states = self.norm(residual, + hidden_states, + cur_params.mlp_norm_gamma, + beta=cur_params.mlp_norm_beta) hidden_states = self.mlp_1(hidden_states, cur_params.mlp_1_w, b=None) hidden_states = self.mlp_2(hidden_states, cur_params.mlp_2_w, b=None) @@ -170,7 +172,10 @@ def _forward_transformer_layer(self, layer_idx: int, residual: torch.Tensor, hid if layer_idx != self.num_layers - 1: next_params = self._transformer[layer_idx + 1] - residual, hidden_states = self.norm(residual, hidden_states, next_params.attn_norm_gamma, beta=next_params.attn_norm_beta) + residual, hidden_states = self.norm(residual, + hidden_states, + next_params.attn_norm_gamma, + beta=next_params.attn_norm_beta) else: # On last layer, we just need to perform the residual add. Adding into the residual # here is safe. @@ -205,7 +210,10 @@ def _forward_unembed(self, hidden_states: torch.Tensor, ragged_batch_info: Ragge def forward(self, wrapped_batch: RaggedBatchWrapper) -> torch.Tensor: residual = self._forward_embed(wrapped_batch) - residual, hidden_states = self.norm(residual, None, gamma=self._transformer[0].attn_norm_gamma, beta=self._transformer[0].attn_norm_beta) + residual, hidden_states = self.norm(residual, + None, + gamma=self._transformer[0].attn_norm_gamma, + beta=self._transformer[0].attn_norm_beta) for layer_idx in range(self.num_layers): residual, hidden_states = self._forward_transformer_layer(layer_idx, residual, hidden_states,