deepspeed

This commit is contained in:
游雁 2024-08-06 00:00:57 +08:00
parent 11586f7ebd
commit ffb8d31599
2 changed files with 12 additions and 7 deletions

View File

@@ -34,6 +34,7 @@ from funasr.train_utils.load_pretrained_model import load_pretrained_model
from funasr.utils.misc import prepare_model_dir
from funasr.train_utils.model_summary import model_summary
from funasr import AutoModel
+from datetime import timedelta
try:
import deepspeed
@@ -81,7 +82,11 @@ def main(**kwargs):
deepspeed.init_distributed(dist_backend=kwargs.get("backend", "nccl"))
elif use_ddp or use_fsdp:
logging.info(f"use_ddp: {use_ddp}, use_fsdp: {use_fsdp}")
-dist.init_process_group(backend=kwargs.get("backend", "nccl"), init_method="env://")
+dist.init_process_group(
+backend=kwargs.get("backend", "nccl"),
+init_method="env://",
+timeout=timedelta(seconds=60 * 60),
+)
torch.cuda.set_device(local_rank)
# rank = dist.get_rank()

View File

@@ -217,9 +217,9 @@ class Trainer:
# Create output directory if it does not exist
os.makedirs(self.output_dir, exist_ok=True)
if step is None:
-ckpt_name = f"model.pt.ep{epoch}"
+ckpt_name = f"ds-model.pt.ep{epoch}"
else:
-ckpt_name = f"model.pt.ep{epoch}.{step}"
+ckpt_name = f"ds-model.pt.ep{epoch}.{step}"
filename = os.path.join(self.output_dir, ckpt_name)
# torch.save(state, filename)
@@ -239,11 +239,11 @@ class Trainer:
>= self.val_acc_step_or_eoch[self.best_step_or_epoch]
):
self.best_step_or_epoch = ckpt_name
-best_ckpt = Path(os.path.join(self.output_dir, f"model.pt.best"))
+best_ckpt = Path(os.path.join(self.output_dir, f"ds-model.pt.best"))
# torch.save(state, best_ckpt)
with torch.no_grad():
model.save_checkpoint(
-save_dir=self.output_dir, tag=f"model.pt.best", client_state=state
+save_dir=self.output_dir, tag=f"ds-model.pt.best", client_state=state
)
logging.info(
f"Update best acc: {self.val_acc_step_or_eoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
@@ -258,11 +258,11 @@ class Trainer:
<= self.val_loss_step_or_eoch[self.best_step_or_epoch]
):
self.best_step_or_epoch = ckpt_name
-best_ckpt = Path(os.path.join(self.output_dir, f"model.pt.best"))
+best_ckpt = Path(os.path.join(self.output_dir, f"ds-model.pt.best"))
# torch.save(state, best_ckpt)
with torch.no_grad():
model.save_checkpoint(
-save_dir=self.output_dir, tag=f"model.pt.best", client_state=state
+save_dir=self.output_dir, tag=f"ds-model.pt.best", client_state=state
)
logging.info(
f"Update best loss: {self.val_loss_step_or_eoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"