Skip to content

Commit 007430e

Browse files
Camille7777 authored and flybird11111 committed
[llama] fix neftune & pbar with start_step (hpcaitech#5364)
1 parent 1f2b457 commit 007430e

File tree

2 files changed

+3
-3
lines changed

2 files changed

+3
-3
lines changed

applications/Colossal-LLaMA-2/colossal_llama2/utils/neftune_patch.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
def unwrap(model):
1919
if hasattr(model, "module"):
20-
return unwrap_model(model.module)
20+
return model.unwrap()
2121
else:
2222
return model
2323

applications/Colossal-LLaMA-2/train.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -329,9 +329,9 @@ def main() -> None:
329329

330330
for epoch in range(start_epoch, args.num_epochs):
331331
dataloader.sampler.set_epoch(epoch=epoch)
332-
pbar = tqdm(desc=f"Epoch {epoch}", disable=not coordinator.is_master(), total=num_steps_per_epoch)
332+
pbar = tqdm(desc=f"Epoch {epoch}", disable=not coordinator.is_master(), total=num_steps_per_epoch, initial=start_step // args.accumulation_steps)
333333
total_loss = torch.tensor(0.0, device=get_current_device())
334-
for step, batch in enumerate(dataloader):
334+
for step, batch in enumerate(dataloader, start=start_step):
335335
batch = {k: v.to(get_current_device()) for k, v in batch.items() if isinstance(v, torch.Tensor)}
336336

337337
batch_output = model(**batch)

0 commit comments

Comments (0)