From ffb8d315996537cfa34da8cb7e361316a64d0f81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=B8=E9=9B=81?= Date: Tue, 6 Aug 2024 00:00:57 +0800 Subject: [PATCH] deepspeed --- funasr/bin/train_ds.py | 7 ++++++- funasr/train_utils/trainer_ds.py | 12 ++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/funasr/bin/train_ds.py b/funasr/bin/train_ds.py index 415904ed3..90450953a 100644 --- a/funasr/bin/train_ds.py +++ b/funasr/bin/train_ds.py @@ -34,6 +34,7 @@ from funasr.train_utils.load_pretrained_model import load_pretrained_model from funasr.utils.misc import prepare_model_dir from funasr.train_utils.model_summary import model_summary from funasr import AutoModel +from datetime import timedelta try: import deepspeed @@ -81,7 +82,11 @@ def main(**kwargs): deepspeed.init_distributed(dist_backend=kwargs.get("backend", "nccl")) elif use_ddp or use_fsdp: logging.info(f"use_ddp: {use_ddp}, use_fsdp: {use_fsdp}") - dist.init_process_group(backend=kwargs.get("backend", "nccl"), init_method="env://") + dist.init_process_group( + backend=kwargs.get("backend", "nccl"), + init_method="env://", + timeout=timedelta(seconds=60 * 60), + ) torch.cuda.set_device(local_rank) # rank = dist.get_rank() diff --git a/funasr/train_utils/trainer_ds.py b/funasr/train_utils/trainer_ds.py index c369a9e57..a850d5c52 100644 --- a/funasr/train_utils/trainer_ds.py +++ b/funasr/train_utils/trainer_ds.py @@ -217,9 +217,9 @@ class Trainer: # Create output directory if it does not exist os.makedirs(self.output_dir, exist_ok=True) if step is None: - ckpt_name = f"model.pt.ep{epoch}" + ckpt_name = f"ds-model.pt.ep{epoch}" else: - ckpt_name = f"model.pt.ep{epoch}.{step}" + ckpt_name = f"ds-model.pt.ep{epoch}.{step}" filename = os.path.join(self.output_dir, ckpt_name) # torch.save(state, filename) @@ -239,11 +239,11 @@ class Trainer: >= self.val_acc_step_or_eoch[self.best_step_or_epoch] ): self.best_step_or_epoch = ckpt_name - best_ckpt = Path(os.path.join(self.output_dir, f"model.pt.best")) + best_ckpt = Path(os.path.join(self.output_dir, f"ds-model.pt.best")) # torch.save(state, best_ckpt) with torch.no_grad(): model.save_checkpoint( - save_dir=self.output_dir, tag=f"model.pt.best", client_state=state + save_dir=self.output_dir, tag=f"ds-model.pt.best", client_state=state ) logging.info( f"Update best acc: {self.val_acc_step_or_eoch[self.best_step_or_epoch]:.4f}, {best_ckpt}" @@ -258,11 +258,11 @@ class Trainer: <= self.val_loss_step_or_eoch[self.best_step_or_epoch] ): self.best_step_or_epoch = ckpt_name - best_ckpt = Path(os.path.join(self.output_dir, f"model.pt.best")) + best_ckpt = Path(os.path.join(self.output_dir, f"ds-model.pt.best")) # torch.save(state, best_ckpt) with torch.no_grad(): model.save_checkpoint( - save_dir=self.output_dir, tag=f"model.pt.best", client_state=state + save_dir=self.output_dir, tag=f"ds-model.pt.best", client_state=state ) logging.info( f"Update best loss: {self.val_loss_step_or_eoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"