Bugfix: Only allow rank==0 to clean up old checkpoints (#2558)

Fixes bug: https://github.com/modelscope/FunASR/issues/2557
This commit is contained in:
kmn1024 2025-06-25 16:34:30 +08:00 committed by GitHub
parent a3d6e48fe1
commit 443bc09c11
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -272,22 +272,23 @@ class Trainer:
) )
else: else:
print("Undo") print("Undo")
self.saved_ckpts[ckpt_name] = getattr( if self.rank == 0:
self, f"val_{self.avg_keep_nbest_models_type}_step_or_epoch" self.saved_ckpts[ckpt_name] = getattr(
)[ckpt_name] self, f"val_{self.avg_keep_nbest_models_type}_step_or_epoch"
if self.keep_nbest_models > 0: )[ckpt_name]
if len(self.saved_ckpts) > self.keep_nbest_models: if self.keep_nbest_models > 0:
if self.avg_keep_nbest_models_type == "acc": if len(self.saved_ckpts) > self.keep_nbest_models:
key = min(self.saved_ckpts, key=self.saved_ckpts.get) if self.avg_keep_nbest_models_type == "acc":
else: key = min(self.saved_ckpts, key=self.saved_ckpts.get)
key = max(self.saved_ckpts, key=self.saved_ckpts.get) else:
if key in self.saved_ckpts: key = max(self.saved_ckpts, key=self.saved_ckpts.get)
del self.saved_ckpts[key] if key in self.saved_ckpts:
filename = os.path.join(self.output_dir, key) del self.saved_ckpts[key]
logging.info(f"Delete: {filename}") filename = os.path.join(self.output_dir, key)
if os.path.exists(filename): logging.info(f"Delete: {filename}")
# os.remove(filename) if os.path.exists(filename):
misc_utils.smart_remove(filename) # os.remove(filename)
misc_utils.smart_remove(filename)
elif self.use_fsdp: elif self.use_fsdp:
pass pass