Mirror of https://github.com/modelscope/FunASR, synced 2025-09-15 14:48:36 +08:00
Fix a few issues found during fine-tuning (#2582)
* Fix wandb log
* Fix validation loss not being logged: batch_idx is reset every epoch, so use the global step counter instead
* LR should only be updated per step, not per step plus per epoch
* Add early stopping
* Fix bf16 handling: a scaler is only needed for fp16
* More logs

---------

Co-authored-by: Tony Mak <tony@Tonys-MacBook-Air-1800.local>
parent 05c8eba11c
commit a750595594
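The early-stopping patience is read from train_conf and defaults to 0 (disabled), as the second hunk below shows. A minimal sketch of just that lookup, with a hypothetical value of 3; how train_conf is populated (YAML vs. keyword arguments) depends on your launch setup:

# A minimal sketch, not FunASR's CLI: the patience value the new check reads.
# 0 (the default) disables early stopping entirely.
kwargs = {"train_conf": {"early_stopping_patience": 3}}
early_stopping_patience = kwargs.get("train_conf", {}).get("early_stopping_patience", 0)
print(early_stopping_patience)  # 3 -> stop after 3 epochs with no val_loss improvement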
@@ -149,7 +149,7 @@ def main(**kwargs):
     dataloader = dataloader_class(**kwargs)
     # dataloader_tr, dataloader_val = dataloader_class(**kwargs)

-    scaler = GradScaler(enabled=True) if trainer.use_fp16 or trainer.use_bf16 else None
+    scaler = GradScaler(enabled=True) if trainer.use_fp16 else None
     scaler = ShardedGradScaler(enabled=trainer.use_fp16) if trainer.use_fsdp else scaler

     trainer.resume_checkpoint(
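This change follows standard PyTorch AMP practice: GradScaler exists to counter fp16 gradient underflow, while bf16 has the same exponent range as fp32 and trains without loss scaling. A minimal sketch independent of FunASR's Trainer, assuming a CUDA device:

import torch
from torch.cuda.amp import GradScaler

model = torch.nn.Linear(8, 1).cuda()
optim = torch.optim.SGD(model.parameters(), lr=0.01)
x, y = torch.randn(4, 8).cuda(), torch.randn(4, 1).cuda()

use_fp16 = False  # False here means bf16; flip to True for fp16 + GradScaler
scaler = GradScaler(enabled=True) if use_fp16 else None  # bf16 needs no loss scaling
dtype = torch.float16 if use_fp16 else torch.bfloat16

with torch.autocast(device_type="cuda", dtype=dtype):
    loss = torch.nn.functional.mse_loss(model(x), y)

if scaler is not None:
    # fp16 path: scale the loss, then step/update through the scaler
    scaler.scale(loss).backward()
    scaler.step(optim)
    scaler.update()
else:
    # bf16 path: plain backward and step
    loss.backward()
    optim.step()
optim.zero_grad()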
@@ -159,6 +159,10 @@ def main(**kwargs):
         scaler=scaler,
     )

+    early_stopping_patience = kwargs.get("train_conf", {}).get("early_stopping_patience", 0)
+    best_val_loss = float("inf")
+    epochs_no_improve = 0
+
     dataloader_tr, dataloader_val = None, None
     for epoch in range(trainer.start_epoch, trainer.max_epoch):
         time1 = time.perf_counter()
@@ -199,7 +203,19 @@ def main(**kwargs):
         trainer.start_data_split_i = 0
         trainer.validate_epoch(model=model, dataloader_val=dataloader_val, epoch=epoch + 1)
-        scheduler.step()
+        current_val = trainer.val_loss_avg
+
+        if current_val < best_val_loss:
+            logging.info(f"current_val: {current_val}, best_val_loss: {best_val_loss}")
+            best_val_loss = current_val
+            epochs_no_improve = 0
+        else:
+            epochs_no_improve += 1
+            logging.info(f"No val_loss improvement for {epochs_no_improve}/{early_stopping_patience} epochs")
+            if early_stopping_patience > 0 and epochs_no_improve >= early_stopping_patience:
+                logging.info(f"Early stopping triggered at epoch {epoch+1}")
+                break
+

         trainer.step_in_epoch = 0
         trainer.save_checkpoint(
             epoch + 1, model=model, optim=optim, scheduler=scheduler, scaler=scaler
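Removing the epoch-level scheduler.step() matches the commit note that the LR should only be updated per step: the scheduler is already stepped once per optimizer update, so the extra epoch-level call advanced the schedule twice. A minimal standalone sketch with a generic per-step schedule (LambdaLR here, not necessarily FunASR's scheduler):

import torch

model = torch.nn.Linear(8, 1)
optim = torch.optim.AdamW(model.parameters(), lr=1e-3)
# Warmup-style schedule that expects exactly one step() per optimizer update.
scheduler = torch.optim.lr_scheduler.LambdaLR(optim, lambda step: min(1.0, (step + 1) / 1000))

for epoch in range(2):
    for batch in range(5):  # stand-in for the real dataloader
        loss = model(torch.randn(4, 8)).sum()
        loss.backward()
        optim.step()
        optim.zero_grad()
        scheduler.step()  # advance the LR once per optimizer step ...
    # ... and NOT again here per epoch, otherwise the schedule runs ahead.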
@@ -715,7 +715,7 @@ class Trainer:
         if self.use_wandb and wandb is not None:
             wandb.log(
                 description_dict,
-                setp=self.batch_total,
+                step=self.batch_total,
             )

     def close(self, writer=None):
@@ -30,9 +30,8 @@ def maybe_autocast(dtype=None, use_deepspeed=False):
         yield
     else:
         if dtype == torch.float16 or dtype == torch.bfloat16:
-            yield
-            # with autocast(enabled=True, dtype=dtype):
-            #     yield
+            with autocast(enabled=True, dtype=dtype):
+                yield
         else:
             yield

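Before this change the half-precision branch yielded without ever entering autocast, so mixed precision was effectively a no-op on this code path. A simplified standalone sketch of the same contextmanager pattern (dropping the deepspeed branch, assuming torch.cuda.amp.autocast as in the diff and a CUDA device):

import contextlib
import torch
from torch.cuda.amp import autocast

@contextlib.contextmanager
def maybe_autocast(dtype=None):
    # Enter autocast only for half-precision dtypes; otherwise run unchanged.
    if dtype in (torch.float16, torch.bfloat16):
        with autocast(enabled=True, dtype=dtype):
            yield
    else:
        yield

with maybe_autocast(torch.bfloat16):
    out = torch.nn.Linear(8, 8).cuda()(torch.randn(2, 8).cuda())
    print(out.dtype)  # torch.bfloat16 inside the autocast region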
@@ -684,7 +683,7 @@ class Trainer:
             scaled_loss = model.backward(loss)
         else:
             loss = loss / self.accum_grad
-            if self.use_fp16 or self.use_bf16:
+            if scaler:
                 scaler.scale(loss).backward()
             else:
                 loss.backward()
@@ -712,7 +711,7 @@ class Trainer:
         # Execute an optimization step (update model parameters)
         if self.use_ddp or self.use_fsdp:
             dist.barrier()
-        if self.use_fp16 or self.use_bf16:
+        if scaler:
             scaler.step(optim)
             scaler.update()
         else:
@@ -736,6 +735,9 @@ class Trainer:
         Args:
             epoch (int): The current epoch number.
         """
+        self.val_loss_avg = 0.0
+        self.val_acc_avg = 0.0
+
         if self.use_ddp or self.use_fsdp or self.use_deepspeed:
             dist.barrier()
         logging.info(f"Validate epoch: {epoch}, rank: {self.rank}\n")
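Resetting val_loss_avg and val_acc_avg at the top of validation means the early-stopping check in main() always reads the current epoch's average rather than a stale value. A minimal sketch of the running-average bookkeeping this attribute typically holds; the exact update inside FunASR's validate_epoch may differ:

class MiniTrainer:
    def __init__(self):
        self.val_loss_avg = 0.0

    def validate_epoch(self, losses):
        # Reset first so a previous epoch's value can never leak through.
        self.val_loss_avg = 0.0
        for i, loss in enumerate(losses):
            # Incremental mean: avg_n = avg_{n-1} + (x_n - avg_{n-1}) / n
            self.val_loss_avg += (loss - self.val_loss_avg) / (i + 1)

t = MiniTrainer()
t.validate_epoch([0.9, 0.8, 0.7])
print(round(t.val_loss_avg, 3))  # 0.8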
@@ -757,7 +759,7 @@ class Trainer:
             "data_split_i": kwargs.get("data_split_i", 0),
             "data_split_num": kwargs.get("data_split_num", 1),
             "log_step": batch_idx + kwargs.get("start_step", 0),
-            "batch_total": batch_idx + 1,
+            "batch_total": self.batch_total,
             "step_in_epoch": batch_idx + 1,
             "lr": 0.0,
         }
@@ -883,7 +885,7 @@ class Trainer:
         if self.use_wandb and wandb is not None:
             wandb.log(
                 description_dict,
-                setp=batch_total,
+                step=batch_total,
             )

     def close(self, writer=None):
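Besides the setp/step typo, the related hunks switch the logged step from the per-epoch batch_idx + 1 to the global batch_total counter: wandb generally warns about and drops data logged with a step lower than the last one it has seen, so a counter that resets every epoch can silently lose the validation points. A minimal sketch of the pattern, assuming wandb is installed (the project name is hypothetical):

import wandb

wandb.init(project="funasr-finetune-demo", mode="offline")  # hypothetical project, offline for the sketch

batch_total = 0  # global step counter that never resets
for epoch in range(2):
    for batch_idx in range(100):
        batch_total += 1
        if batch_total % 50 == 0:
            # Log against the monotonically increasing global step,
            # not batch_idx, which restarts at 0 every epoch.
            wandb.log({"train/loss": 1.0 / batch_total}, step=batch_total)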