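deepspeed: set a 1-hour timeout on DDP/FSDP process-group init; prefix trainer_ds checkpoint names and tags with "ds-"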

2025-09-15 14:48:36 +08:00 · 2024-08-06 00:00:57 +08:00 · 2024-08-06 00:00:57 +08:00 · ffb8d31599
commit ffb8d31599
parent 11586f7ebd
2 changed files with 12 additions and 7 deletions
--- a/funasr/bin/train_ds.py
+++ b/funasr/bin/train_ds.py
@@ -34,6 +34,7 @@ from funasr.train_utils.load_pretrained_model import load_pretrained_model
 from funasr.utils.misc import prepare_model_dir
 from funasr.train_utils.model_summary import model_summary
 from funasr import AutoModel
+from datetime import timedelta

 try:
    import deepspeed
@@ -81,7 +82,11 @@ def main(**kwargs):
        deepspeed.init_distributed(dist_backend=kwargs.get("backend", "nccl"))
    elif use_ddp or use_fsdp:
        logging.info(f"use_ddp: {use_ddp}, use_fsdp: {use_fsdp}")
-        dist.init_process_group(backend=kwargs.get("backend", "nccl"), init_method="env://")
+        dist.init_process_group(
+            backend=kwargs.get("backend", "nccl"),
+            init_method="env://",
+            timeout=timedelta(seconds=60 * 60),
+        )
        torch.cuda.set_device(local_rank)

    # rank = dist.get_rank()
--- a/funasr/train_utils/trainer_ds.py
+++ b/funasr/train_utils/trainer_ds.py
@@ -217,9 +217,9 @@ class Trainer:
            # Create output directory if it does not exist
            os.makedirs(self.output_dir, exist_ok=True)
            if step is None:
-                ckpt_name = f"model.pt.ep{epoch}"
+                ckpt_name = f"ds-model.pt.ep{epoch}"
            else:
-                ckpt_name = f"model.pt.ep{epoch}.{step}"
+                ckpt_name = f"ds-model.pt.ep{epoch}.{step}"
            filename = os.path.join(self.output_dir, ckpt_name)

            # torch.save(state, filename)
@@ -239,11 +239,11 @@ class Trainer:
                    >= self.val_acc_step_or_eoch[self.best_step_or_epoch]
                ):
                    self.best_step_or_epoch = ckpt_name
-                    best_ckpt = Path(os.path.join(self.output_dir, f"model.pt.best"))
+                    best_ckpt = Path(os.path.join(self.output_dir, f"ds-model.pt.best"))
                    # torch.save(state, best_ckpt)
                    with torch.no_grad():
                        model.save_checkpoint(
-                            save_dir=self.output_dir, tag=f"model.pt.best", client_state=state
+                            save_dir=self.output_dir, tag=f"ds-model.pt.best", client_state=state
                        )
                    logging.info(
                        f"Update best acc: {self.val_acc_step_or_eoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
@@ -258,11 +258,11 @@ class Trainer:
                    <= self.val_loss_step_or_eoch[self.best_step_or_epoch]
                ):
                    self.best_step_or_epoch = ckpt_name
-                    best_ckpt = Path(os.path.join(self.output_dir, f"model.pt.best"))
+                    best_ckpt = Path(os.path.join(self.output_dir, f"ds-model.pt.best"))
                    # torch.save(state, best_ckpt)
                    with torch.no_grad():
                        model.save_checkpoint(
-                            save_dir=self.output_dir, tag=f"model.pt.best", client_state=state
+                            save_dir=self.output_dir, tag=f"ds-model.pt.best", client_state=state
                        )
                    logging.info(
                        f"Update best loss: {self.val_loss_step_or_eoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"