diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py
index dd793c02036ed4..e6f26d0df51969 100644
--- a/src/transformers/trainer_utils.py
+++ b/src/transformers/trainer_utils.py
@@ -459,6 +459,11 @@ def __init__(self, skip_memory_metrics=False):
         elif is_torch_xpu_available():
             import torch
 
+            self.torch = torch
+            self.gpu = {}
+        elif is_torch_npu_available():
+            import torch
+
             self.torch = torch
             self.gpu = {}
         else:
@@ -517,6 +522,9 @@ def start(self):
             elif is_torch_xpu_available():
                 self.torch.xpu.reset_peak_memory_stats()
                 self.torch.xpu.empty_cache()
+            elif is_torch_npu_available():
+                self.torch.npu.reset_peak_memory_stats()
+                self.torch.npu.empty_cache()
 
         # gpu
         if self.torch is not None:
@@ -524,6 +532,8 @@ def start(self):
                 self.gpu_mem_used_at_start = self.torch.cuda.memory_allocated()
             elif is_torch_xpu_available():
                 self.gpu_mem_used_at_start = self.torch.xpu.memory_allocated()
+            elif is_torch_npu_available():
+                self.gpu_mem_used_at_start = self.torch.npu.memory_allocated()
 
         # cpu
         self.cpu_mem_used_at_start = self.cpu_mem_used()
@@ -551,6 +561,8 @@ def stop(self, stage):
                 self.torch.cuda.empty_cache()
             elif is_torch_xpu_available():
                 self.torch.xpu.empty_cache()
+            elif is_torch_npu_available():
+                self.torch.npu.empty_cache()
 
         # concepts:
         # - alloc_delta:  the difference of allocated memory between the end and the start
@@ -565,6 +577,9 @@ def stop(self, stage):
             elif is_torch_xpu_available():
                 self.gpu_mem_used_now = self.torch.xpu.memory_allocated()
                 self.gpu_mem_used_peak = self.torch.xpu.max_memory_allocated()
+            elif is_torch_npu_available():
+                self.gpu_mem_used_now = self.torch.npu.memory_allocated()
+                self.gpu_mem_used_peak = self.torch.npu.max_memory_allocated()
             else:
                 raise ValueError("No available GPU device found!")
 
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index ae6d8f7ae388f5..9df5ac84d74761 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1944,18 +1944,18 @@ def check_mem_metrics(self, trainer, check_func):
         metrics = trainer.train().metrics
         check_func("init_mem_cpu_alloc_delta", metrics)
         check_func("train_mem_cpu_alloc_delta", metrics)
-        if torch.cuda.device_count() > 0:
+        if backend_device_count(torch_device) > 0:
             check_func("init_mem_gpu_alloc_delta", metrics)
             check_func("train_mem_gpu_alloc_delta", metrics)
 
         metrics = trainer.evaluate()
         check_func("eval_mem_cpu_alloc_delta", metrics)
-        if torch.cuda.device_count() > 0:
+        if backend_device_count(torch_device) > 0:
             check_func("eval_mem_gpu_alloc_delta", metrics)
 
         metrics = trainer.predict(RegressionDataset()).metrics
         check_func("test_mem_cpu_alloc_delta", metrics)
-        if torch.cuda.device_count() > 0:
+        if backend_device_count(torch_device) > 0:
             check_func("test_mem_gpu_alloc_delta", metrics)
 
     def test_mem_metrics(self):