mirror of
https://github.com/modelscope/FunASR
synced 2025-09-15 14:48:36 +08:00
deepspeed
This commit is contained in:
parent
11586f7ebd
commit
ffb8d31599
@ -34,6 +34,7 @@ from funasr.train_utils.load_pretrained_model import load_pretrained_model
|
||||
from funasr.utils.misc import prepare_model_dir
|
||||
from funasr.train_utils.model_summary import model_summary
|
||||
from funasr import AutoModel
|
||||
from datetime import timedelta
|
||||
|
||||
try:
|
||||
import deepspeed
|
||||
@ -81,7 +82,11 @@ def main(**kwargs):
|
||||
deepspeed.init_distributed(dist_backend=kwargs.get("backend", "nccl"))
|
||||
elif use_ddp or use_fsdp:
|
||||
logging.info(f"use_ddp: {use_ddp}, use_fsdp: {use_fsdp}")
|
||||
dist.init_process_group(backend=kwargs.get("backend", "nccl"), init_method="env://")
|
||||
dist.init_process_group(
|
||||
backend=kwargs.get("backend", "nccl"),
|
||||
init_method="env://",
|
||||
timeout=timedelta(seconds=60 * 60),
|
||||
)
|
||||
torch.cuda.set_device(local_rank)
|
||||
|
||||
# rank = dist.get_rank()
|
||||
|
||||
@ -217,9 +217,9 @@ class Trainer:
|
||||
# Create output directory if it does not exist
|
||||
os.makedirs(self.output_dir, exist_ok=True)
|
||||
if step is None:
|
||||
ckpt_name = f"model.pt.ep{epoch}"
|
||||
ckpt_name = f"ds-model.pt.ep{epoch}"
|
||||
else:
|
||||
ckpt_name = f"model.pt.ep{epoch}.{step}"
|
||||
ckpt_name = f"ds-model.pt.ep{epoch}.{step}"
|
||||
filename = os.path.join(self.output_dir, ckpt_name)
|
||||
|
||||
# torch.save(state, filename)
|
||||
@ -239,11 +239,11 @@ class Trainer:
|
||||
>= self.val_acc_step_or_eoch[self.best_step_or_epoch]
|
||||
):
|
||||
self.best_step_or_epoch = ckpt_name
|
||||
best_ckpt = Path(os.path.join(self.output_dir, f"model.pt.best"))
|
||||
best_ckpt = Path(os.path.join(self.output_dir, f"ds-model.pt.best"))
|
||||
# torch.save(state, best_ckpt)
|
||||
with torch.no_grad():
|
||||
model.save_checkpoint(
|
||||
save_dir=self.output_dir, tag=f"model.pt.best", client_state=state
|
||||
save_dir=self.output_dir, tag=f"ds-model.pt.best", client_state=state
|
||||
)
|
||||
logging.info(
|
||||
f"Update best acc: {self.val_acc_step_or_eoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
|
||||
@ -258,11 +258,11 @@ class Trainer:
|
||||
<= self.val_loss_step_or_eoch[self.best_step_or_epoch]
|
||||
):
|
||||
self.best_step_or_epoch = ckpt_name
|
||||
best_ckpt = Path(os.path.join(self.output_dir, f"model.pt.best"))
|
||||
best_ckpt = Path(os.path.join(self.output_dir, f"ds-model.pt.best"))
|
||||
# torch.save(state, best_ckpt)
|
||||
with torch.no_grad():
|
||||
model.save_checkpoint(
|
||||
save_dir=self.output_dir, tag=f"model.pt.best", client_state=state
|
||||
save_dir=self.output_dir, tag=f"ds-model.pt.best", client_state=state
|
||||
)
|
||||
logging.info(
|
||||
f"Update best loss: {self.val_loss_step_or_eoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
|
||||
|
||||
Loading…
Reference in New Issue
Block a user