Bugfix: Only allow rank==0 to clean up old checkpoints (#2558)

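Fixes bug: https://github.com/modelscope/FunASR/issues/2557. Tracking of saved checkpoints in `self.saved_ckpts` and deletion of surplus checkpoint files now run only when `self.rank == 0`.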
This commit is contained in:
kmn1024 2025-06-25 16:34:30 +08:00 committed by GitHub
parent a3d6e48fe1
commit 443bc09c11
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -272,22 +272,23 @@ class Trainer:
)
else:
print("Undo")
self.saved_ckpts[ckpt_name] = getattr(
self, f"val_{self.avg_keep_nbest_models_type}_step_or_epoch"
)[ckpt_name]
if self.keep_nbest_models > 0:
if len(self.saved_ckpts) > self.keep_nbest_models:
if self.avg_keep_nbest_models_type == "acc":
key = min(self.saved_ckpts, key=self.saved_ckpts.get)
else:
key = max(self.saved_ckpts, key=self.saved_ckpts.get)
if key in self.saved_ckpts:
del self.saved_ckpts[key]
filename = os.path.join(self.output_dir, key)
logging.info(f"Delete: {filename}")
if os.path.exists(filename):
# os.remove(filename)
misc_utils.smart_remove(filename)
if self.rank == 0:
self.saved_ckpts[ckpt_name] = getattr(
self, f"val_{self.avg_keep_nbest_models_type}_step_or_epoch"
)[ckpt_name]
if self.keep_nbest_models > 0:
if len(self.saved_ckpts) > self.keep_nbest_models:
if self.avg_keep_nbest_models_type == "acc":
key = min(self.saved_ckpts, key=self.saved_ckpts.get)
else:
key = max(self.saved_ckpts, key=self.saved_ckpts.get)
if key in self.saved_ckpts:
del self.saved_ckpts[key]
filename = os.path.join(self.output_dir, key)
logging.info(f"Delete: {filename}")
if os.path.exists(filename):
# os.remove(filename)
misc_utils.smart_remove(filename)
elif self.use_fsdp:
pass