From e95086f3bd33441f3f8bd409ad4ec115b0ee3b78 Mon Sep 17 00:00:00 2001
From: Logan Adams <loadams@microsoft.com>
Date: Mon, 28 Oct 2024 11:44:20 -0700
Subject: [PATCH] Formatting fixes

---
 csrc/aio/common/deepspeed_aio_common.cpp         |  5 ++---
 csrc/aio/py_lib/deepspeed_py_aio.cpp             | 10 ++++------
 csrc/aio/py_lib/deepspeed_py_aio_handle.cpp      | 10 ++++------
 csrc/aio/py_lib/deepspeed_py_copy.cpp            |  2 +-
 .../evoformer_attn/gemm_kernel_utils.h           |  9 ++++-----
 csrc/includes/simd.h                             |  2 +-
 csrc/xpu/includes/simd.h                         |  2 +-
 csrc/xpu/includes/type_shim.h                    | 10 +++++-----
 .../v2/model_implementations/phi3small/model.py  | 16 ++++++++++++----
 9 files changed, 34 insertions(+), 32 deletions(-)
 mode change 100755 => 100644 csrc/xpu/includes/simd.h
diff --git a/csrc/aio/common/deepspeed_aio_common.cpp b/csrc/aio/common/deepspeed_aio_common.cpp
index 0f2895dfa328..a65cc500cc82 100644
--- a/csrc/aio/common/deepspeed_aio_common.cpp
+++ b/csrc/aio/common/deepspeed_aio_common.cpp
@@ -301,9 +301,8 @@ int regular_read(const char* filename, std::vector<char>& buffer)
     } while (r > 0);
 
     if (read_bytes != num_bytes) {
-        std::cerr << "read error "
-                  << " read_bytes (read) = " << read_bytes << " num_bytes (fstat) = " << num_bytes
-                  << std::endl;
+        std::cerr << "read error " << " read_bytes (read) = " << read_bytes
+                  << " num_bytes (fstat) = " << num_bytes << std::endl;
     }
     assert(read_bytes == num_bytes);
     close(fd);
diff --git a/csrc/aio/py_lib/deepspeed_py_aio.cpp b/csrc/aio/py_lib/deepspeed_py_aio.cpp
index 387b713f2bfc..0556f5aa8168 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_aio.cpp
@@ -72,9 +72,8 @@ int deepspeed_py_aio_write(const torch::Tensor& buffer,
 
     const std::chrono::duration<double> fn_time =
         std::chrono::high_resolution_clock::now() - start_time;
-    std::cout << "Elapsed time(usec): "
-              << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
-              << std::endl;
+    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
+              << " call = " << fn_time.count() * 1e6 << std::endl;
     return 0;
 }
 
@@ -118,8 +117,7 @@ int deepspeed_py_aio_read(torch::Tensor& buffer,
 
     const std::chrono::duration<double> fn_time =
         std::chrono::high_resolution_clock::now() - start_time;
-    std::cout << "Elapsed time(usec): "
-              << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
-              << std::endl;
+    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
+              << " call = " << fn_time.count() * 1e6 << std::endl;
     return 0;
 }
diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
index c21e92de9449..23ddabe260d4 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
@@ -93,9 +93,8 @@ int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, co
     if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); }
     const std::chrono::duration<double> fn_time =
         std::chrono::high_resolution_clock::now() - start_time;
-    std::cout << "Elapsed time(usec): "
-              << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
-              << std::endl;
+    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
+              << " call = " << fn_time.count() * 1e6 << std::endl;
     return 0;
 }
 
@@ -128,9 +127,8 @@ int deepspeed_aio_handle_t::write(const torch::Tensor& buffer,
 
     const std::chrono::duration<double> fn_time =
         std::chrono::high_resolution_clock::now() - start_time;
-    std::cout << "Elapsed time(usec): "
-              << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
-              << std::endl;
+    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
+              << " call = " << fn_time.count() * 1e6 << std::endl;
     return 0;
 }
 
diff --git a/csrc/aio/py_lib/deepspeed_py_copy.cpp b/csrc/aio/py_lib/deepspeed_py_copy.cpp
index 8a59107dd347..c597b91d05c9 100644
--- a/csrc/aio/py_lib/deepspeed_py_copy.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_copy.cpp
@@ -10,7 +10,7 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 #include "deepspeed_py_copy.h"
 #include <omp.h>
 
-#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
+#define ROUND_DOWN(size, step) ((size) & ~((step) - 1))
 
 #if defined(__AVX512__) or defined(__AVX256__)
 union AVX_Data {
diff --git a/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h b/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h
index 2a4300c5cac1..c102234a4dfb 100644
--- a/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h
+++ b/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h
@@ -125,11 +125,10 @@ struct CheckArch {
         std::cerr << #PTR " is not correctly aligned\n"; \
         return false;                                    \
     }
-#define EVOFORMER_CHECK(COND, ERR)                          \
-    if (!(COND)) {                                          \
-        std::cerr << "[Evoformer Attention]"                \
-                  << "'" #COND "' failed: " << ERR << "\n"; \
-        return false;                                       \
+#define EVOFORMER_CHECK(COND, ERR)                                                     \
+    if (!(COND)) {                                                                     \
+        std::cerr << "[Evoformer Attention]" << "'" #COND "' failed: " << ERR << "\n"; \
+        return false;                                                                  \
     }
 #endif
 
diff --git a/csrc/includes/simd.h b/csrc/includes/simd.h
index f5bfb45dd2e2..a205026ec7c1 100644
--- a/csrc/includes/simd.h
+++ b/csrc/includes/simd.h
@@ -27,7 +27,7 @@ inline void writeAs(void* dst, const T& val)
     std::memcpy(dst, &val, sizeof(T));
 }
 
-#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
+#define ROUND_DOWN(size, step) ((size) & ~((step) - 1))
 
 #if defined(__AVX512__)
 #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
diff --git a/csrc/xpu/includes/simd.h b/csrc/xpu/includes/simd.h
old mode 100755
new mode 100644
index f77568be7835..097e2d8585cc
--- a/csrc/xpu/includes/simd.h
+++ b/csrc/xpu/includes/simd.h
@@ -13,7 +13,7 @@
 #define TILE (128 * 1024 * 1024)
 #if defined(__AVX512__) or defined(__AVX256__)
 
-#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
+#define ROUND_DOWN(size, step) ((size) & ~((step) - 1))
 
 #if defined(__AVX512__)
 #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
diff --git a/csrc/xpu/includes/type_shim.h b/csrc/xpu/includes/type_shim.h
index fa41757c895b..1897afd1fea2 100644
--- a/csrc/xpu/includes/type_shim.h
+++ b/csrc/xpu/includes/type_shim.h
@@ -82,11 +82,11 @@
     }
 
 template <typename T>
-__inline__ __attribute__((always_inline)) T reduce_block_into_lanes(
-    T* x,
-    T val,
-    int lanes = 1,
-    bool share_result = false)  // lanes is intended to be <= 32.
+__inline__ __attribute__((always_inline)) T
+reduce_block_into_lanes(T* x,
+                        T val,
+                        int lanes = 1,
+                        bool share_result = false)  // lanes is intended to be <= 32.
 {
     auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>();
     int tid = item_ct1.get_local_id(2) + item_ct1.get_local_id(1) * item_ct1.get_local_range(2);
diff --git a/deepspeed/inference/v2/model_implementations/phi3small/model.py b/deepspeed/inference/v2/model_implementations/phi3small/model.py
index 1f1e853fc167..532cb49c80c4 100644
--- a/deepspeed/inference/v2/model_implementations/phi3small/model.py
+++ b/deepspeed/inference/v2/model_implementations/phi3small/model.py
@@ -112,7 +112,6 @@ def positional_embedding_config(self) -> Optional[RotateHalfConfig]:
     def mup_embedding_multiplier(self) -> float:
         return 10.0
 
-
     """
     Forward implementations
     """
@@ -160,7 +159,10 @@ def _forward_transformer_layer(self, layer_idx: int, residual: torch.Tensor, hid
         if self.tp_size > 1:
             dist.all_reduce(hidden_states, group=self._base_mp_group)
 
-        residual, hidden_states = self.norm(residual, hidden_states, cur_params.mlp_norm_gamma, beta=cur_params.mlp_norm_beta)
+        residual, hidden_states = self.norm(residual,
+                                            hidden_states,
+                                            cur_params.mlp_norm_gamma,
+                                            beta=cur_params.mlp_norm_beta)
 
         hidden_states = self.mlp_1(hidden_states, cur_params.mlp_1_w, b=None)
         hidden_states = self.mlp_2(hidden_states, cur_params.mlp_2_w, b=None)
@@ -170,7 +172,10 @@ def _forward_transformer_layer(self, layer_idx: int, residual: torch.Tensor, hid
 
         if layer_idx != self.num_layers - 1:
             next_params = self._transformer[layer_idx + 1]
-            residual, hidden_states = self.norm(residual, hidden_states, next_params.attn_norm_gamma, beta=next_params.attn_norm_beta)
+            residual, hidden_states = self.norm(residual,
+                                                hidden_states,
+                                                next_params.attn_norm_gamma,
+                                                beta=next_params.attn_norm_beta)
         else:
             # On last layer, we just need to perform the residual add. Adding into the residual
             # here is safe.
@@ -205,7 +210,10 @@ def _forward_unembed(self, hidden_states: torch.Tensor, ragged_batch_info: Ragge
     def forward(self, wrapped_batch: RaggedBatchWrapper) -> torch.Tensor:
         residual = self._forward_embed(wrapped_batch)
 
-        residual, hidden_states = self.norm(residual, None, gamma=self._transformer[0].attn_norm_gamma, beta=self._transformer[0].attn_norm_beta)
+        residual, hidden_states = self.norm(residual,
+                                            None,
+                                            gamma=self._transformer[0].attn_norm_gamma,
+                                            beta=self._transformer[0].attn_norm_beta)
 
         for layer_idx in range(self.num_layers):
             residual, hidden_states = self._forward_transformer_layer(layer_idx, residual, hidden_states,