From 9a6c6ab5ea25bf2d6e874010ae101e29bde2a217 Mon Sep 17 00:00:00 2001 From: "shixian.shi" Date: Wed, 21 Feb 2024 17:35:14 +0800 Subject: [PATCH 1/2] update rwkv_bat --- funasr/models/rwkv_bat/model.py | 0 funasr/models/rwkv_bat/rwkv.py | 17 ++++++++--------- funasr/models/rwkv_bat/rwkv_attention.py | 15 ++++++--------- funasr/models/rwkv_bat/rwkv_encoder.py | 18 +++++++++++------- funasr/models/rwkv_bat/rwkv_feed_forward.py | 14 +++++--------- funasr/models/rwkv_bat/rwkv_subsampling.py | 20 +++++++------------- 6 files changed, 37 insertions(+), 47 deletions(-) delete mode 100644 funasr/models/rwkv_bat/model.py diff --git a/funasr/models/rwkv_bat/model.py b/funasr/models/rwkv_bat/model.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/funasr/models/rwkv_bat/rwkv.py b/funasr/models/rwkv_bat/rwkv.py index 422e1c8fe..bd218a282 100644 --- a/funasr/models/rwkv_bat/rwkv.py +++ b/funasr/models/rwkv_bat/rwkv.py @@ -1,16 +1,15 @@ -"""Receptance Weighted Key Value (RWKV) block definition. - -Based/modified from https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v4/src/model.py - -""" - -from typing import Dict, Optional, Tuple +#!/usr/bin/env python3 +# -*- encoding: utf-8 -*- +# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. +# MIT License (https://opensource.org/licenses/MIT) import torch +from typing import Dict, Optional, Tuple -from funasr.models.rwkv_bat.rwkv_attention import EncoderSelfAttention, DecoderSelfAttention -from funasr.models.rwkv_bat.rwkv_feed_forward import FeedForward from funasr.models.transformer.layer_norm import LayerNorm +from funasr.models.rwkv_bat.rwkv_feed_forward import FeedForward +from funasr.models.rwkv_bat.rwkv_attention import EncoderSelfAttention, DecoderSelfAttention + class RWKV(torch.nn.Module): """RWKV module. diff --git a/funasr/models/rwkv_bat/rwkv_attention.py b/funasr/models/rwkv_bat/rwkv_attention.py index 5384fb9ca..c085874e4 100644 --- a/funasr/models/rwkv_bat/rwkv_attention.py +++ b/funasr/models/rwkv_bat/rwkv_attention.py @@ -1,17 +1,14 @@ -"""Attention (time mixing) modules for RWKV block. - -Based/Modified from https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v4/src/model.py. - -Some variables are renamed according to https://github.com/huggingface/transformers/blob/main/src/transformers/models/rwkv/modeling_rwkv.py. - -""" # noqa +#!/usr/bin/env python3 +# -*- encoding: utf-8 -*- +# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. +# MIT License (https://opensource.org/licenses/MIT) import math -from importlib.util import find_spec +import torch from pathlib import Path +from importlib.util import find_spec from typing import List, Optional, Tuple, Union -import torch wkv_kernel_encoder = None wkv_kernel_decoder = None diff --git a/funasr/models/rwkv_bat/rwkv_encoder.py b/funasr/models/rwkv_bat/rwkv_encoder.py index af702e91b..c0e5f4255 100644 --- a/funasr/models/rwkv_bat/rwkv_encoder.py +++ b/funasr/models/rwkv_bat/rwkv_encoder.py @@ -1,17 +1,20 @@ -"""RWKV encoder definition for Transducer models.""" - -import math -from typing import Dict, List, Optional, Tuple +#!/usr/bin/env python3 +# -*- encoding: utf-8 -*- +# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. 
+# MIT License (https://opensource.org/licenses/MIT) import torch +from typing import Dict, List, Optional, Tuple -from funasr.models.encoder.abs_encoder import AbsEncoder +from funasr.register import tables from funasr.models.rwkv_bat.rwkv import RWKV from funasr.models.transformer.layer_norm import LayerNorm -from funasr.models.rwkv_bat.rwkv_subsampling import RWKVConvInput from funasr.models.transformer.utils.nets_utils import make_source_mask +from funasr.models.rwkv_bat.rwkv_subsampling import RWKVConvInput -class RWKVEncoder(AbsEncoder): + +@tables.register("encoder_classes", "RWKVEncoder") +class RWKVEncoder(torch.nn.Module): """RWKV encoder module. Based on https://arxiv.org/pdf/2305.13048.pdf. @@ -44,6 +47,7 @@ class RWKVEncoder(AbsEncoder): subsampling_factor: int =4, time_reduction_factor: int = 1, kernel: int = 3, + **kwargs, ) -> None: """Construct a RWKVEncoder object.""" super().__init__() diff --git a/funasr/models/rwkv_bat/rwkv_feed_forward.py b/funasr/models/rwkv_bat/rwkv_feed_forward.py index ddb42859e..32949ab02 100644 --- a/funasr/models/rwkv_bat/rwkv_feed_forward.py +++ b/funasr/models/rwkv_bat/rwkv_feed_forward.py @@ -1,14 +1,10 @@ -"""Feed-forward (channel mixing) module for RWKV block. - -Based/Modified from https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v4/src/model.py - -Some variables are renamed according to https://github.com/huggingface/transformers/blob/main/src/transformers/models/rwkv/modeling_rwkv.py. - -""" # noqa - -from typing import List, Optional, Tuple +#!/usr/bin/env python3 +# -*- encoding: utf-8 -*- +# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. +# MIT License (https://opensource.org/licenses/MIT) import torch +from typing import List, Optional, Tuple class FeedForward(torch.nn.Module): diff --git a/funasr/models/rwkv_bat/rwkv_subsampling.py b/funasr/models/rwkv_bat/rwkv_subsampling.py index 54ad1f5ad..a688acaff 100644 --- a/funasr/models/rwkv_bat/rwkv_subsampling.py +++ b/funasr/models/rwkv_bat/rwkv_subsampling.py @@ -1,19 +1,13 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- +# -*- encoding: utf-8 -*- +# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. +# MIT License (https://opensource.org/licenses/MIT) -# Copyright 2019 Shigeki Karita -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -"""Subsampling layer definition.""" -import numpy as np -import torch -import torch.nn.functional as F -from funasr.models.transformer.embedding import PositionalEncoding -import logging -from funasr.models.scama.utils import sequence_mask -from funasr.models.transformer.utils.nets_utils import sub_factor_to_params, pad_to_len -from typing import Optional, Tuple, Union import math +import torch +from typing import Optional, Tuple, Union +from funasr.models.transformer.utils.nets_utils import pad_to_len + class TooShortUttError(Exception): """Raised when the utt is too short for subsampling. 
From cdca62d933c4e0766a05044c6cba7cfa0596e615 Mon Sep 17 00:00:00 2001 From: zhifu gao Date: Wed, 21 Feb 2024 19:22:59 +0800 Subject: [PATCH 2/2] Dev gzf (#1377) * update train recipe * v1.0.8 * llm * update trainer --- .../paraformer/{infer_demo.sh => demo.sh} | 0 .../paraformer/finetune.sh | 2 +- funasr/datasets/audio_datasets/scp2jsonl.py | 16 +++++++-------- .../models/mossformer/mossformer_encoder.py | 2 +- funasr/models/paraformer/model.py | 4 +++- funasr/train_utils/trainer.py | 20 +++++++++---------- funasr/version.txt | 2 +- 7 files changed, 24 insertions(+), 22 deletions(-) rename examples/industrial_data_pretraining/paraformer/{infer_demo.sh => demo.sh} (100%) diff --git a/examples/industrial_data_pretraining/paraformer/infer_demo.sh b/examples/industrial_data_pretraining/paraformer/demo.sh similarity index 100% rename from examples/industrial_data_pretraining/paraformer/infer_demo.sh rename to examples/industrial_data_pretraining/paraformer/demo.sh diff --git a/examples/industrial_data_pretraining/paraformer/finetune.sh b/examples/industrial_data_pretraining/paraformer/finetune.sh index 8bdd8daaf..266346cca 100644 --- a/examples/industrial_data_pretraining/paraformer/finetune.sh +++ b/examples/industrial_data_pretraining/paraformer/finetune.sh @@ -6,7 +6,7 @@ #git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git ${local_path} ## generate jsonl from wav.scp and text.txt -#python funasr/datasets/audio_datasets/scp2jsonl.py \ +#python -m funasr.datasets.audio_datasets.scp2jsonl \ #++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \ #++data_type_list='["source", "target"]' \ #++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl diff --git a/funasr/datasets/audio_datasets/scp2jsonl.py b/funasr/datasets/audio_datasets/scp2jsonl.py index b6df34ae3..e09a84a61 100644 --- a/funasr/datasets/audio_datasets/scp2jsonl.py +++ b/funasr/datasets/audio_datasets/scp2jsonl.py @@ -72,14 +72,7 @@ def parse_context_length(data_list: list, data_type: str): @hydra.main(config_name=None, version_base=None) def main_hydra(cfg: DictConfig): - """ - python funasr/datasets/audio_datasets/scp2jsonl.py \ - ++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \ - ++data_type_list='["source", "target"]' \ - ++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl - - """ - + kwargs = OmegaConf.to_container(cfg, resolve=True) scp_file_list = kwargs.get("scp_file_list", ("/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt")) @@ -90,6 +83,13 @@ def main_hydra(cfg: DictConfig): gen_jsonl_from_wav_text_list(scp_file_list, data_type_list=data_type_list, jsonl_file_out=jsonl_file_out) +""" +python -m funasr.datasets.audio_datasets.scp2jsonl \ +++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \ +++data_type_list='["source", "target"]' \ +++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl +""" + if __name__ == "__main__": main_hydra() diff --git a/funasr/models/mossformer/mossformer_encoder.py b/funasr/models/mossformer/mossformer_encoder.py index d06af999d..a28c960e8 100644 --- a/funasr/models/mossformer/mossformer_encoder.py +++ b/funasr/models/mossformer/mossformer_encoder.py @@ -4,7 +4,7 @@ import torch.nn.functional as F try: from rotary_embedding_torch import RotaryEmbedding except: - 
print("Please install rotary_embedding_torch by: \n pip install -U rotary_embedding_torch") + print("If you want use mossformer, lease install rotary_embedding_torch by: \n pip install -U rotary_embedding_torch") from funasr.models.transformer.layer_norm import GlobalLayerNorm, CumulativeLayerNorm, ScaleNorm from funasr.models.transformer.embedding import ScaledSinuEmbedding from funasr.models.transformer.mossformer import FLASH_ShareA_FFConvM diff --git a/funasr/models/paraformer/model.py b/funasr/models/paraformer/model.py index 729b8f500..90ce162d4 100644 --- a/funasr/models/paraformer/model.py +++ b/funasr/models/paraformer/model.py @@ -455,7 +455,9 @@ class Paraformer(torch.nn.Module): speech, speech_lengths = data_in, data_lengths if len(speech.shape) < 3: speech = speech[None, :, :] - if speech_lengths is None: + if speech_lengths is not None: + speech_lengths = speech_lengths.squeeze(-1) + else: speech_lengths = speech.shape[1] else: # extract fbank feats diff --git a/funasr/train_utils/trainer.py b/funasr/train_utils/trainer.py index c2326424f..6a59f91a1 100644 --- a/funasr/train_utils/trainer.py +++ b/funasr/train_utils/trainer.py @@ -181,7 +181,7 @@ class Trainer: time2 = time.perf_counter() time_escaped = (time2 - time1)/3600.0 - print(f"\ntime_escaped_epoch: {time_escaped:.3f} hours, estimated to finish {self.max_epoch} epoch: {(self.max_epoch-epoch)*time_escaped:.3f}\n") + print(f"\nrank: {self.local_rank}, time_escaped_epoch: {time_escaped:.3f} hours, estimated to finish {self.max_epoch} epoch: {(self.max_epoch-epoch)*time_escaped:.3f}\n") if self.rank == 0: average_checkpoints(self.output_dir, self.avg_nbest_model) @@ -302,17 +302,14 @@ class Trainer: ) pbar.set_description(description) if self.writer: - self.writer.add_scalar(f'rank{self.local_rank}_Loss/train', loss.item(), - epoch*len(self.dataloader_train) + batch_idx) + self.writer.add_scalar(f'rank{self.local_rank}_Loss/train', loss.item(), self.batch_total) + self.writer.add_scalar(f'rank{self.local_rank}_lr/train', lr, self.batch_total) for key, var in stats.items(): - self.writer.add_scalar(f'rank{self.local_rank}_{key}/train', var.item(), - epoch * len(self.dataloader_train) + batch_idx) + self.writer.add_scalar(f'rank{self.local_rank}_{key}/train', var.item(), self.batch_total) for key, var in speed_stats.items(): - self.writer.add_scalar(f'rank{self.local_rank}_{key}/train', eval(var), - epoch * len(self.dataloader_train) + batch_idx) - - # if batch_idx == 2: - # break + self.writer.add_scalar(f'rank{self.local_rank}_{key}/train', eval(var), self.batch_total) + + pbar.close() def _validate_epoch(self, epoch): @@ -356,7 +353,10 @@ class Trainer: if (batch_idx+1) % self.log_interval == 0 or (batch_idx+1) == len(self.dataloader_val): pbar.update(self.log_interval) + time_now = datetime.now() + time_now = time_now.strftime("%Y-%m-%d %H:%M:%S") description = ( + f"{time_now}, " f"rank: {self.local_rank}, " f"validation epoch: {epoch}/{self.max_epoch}, " f"step: {batch_idx+1}/{len(self.dataloader_val)}, " diff --git a/funasr/version.txt b/funasr/version.txt index 238d6e882..b0f3d96f8 100644 --- a/funasr/version.txt +++ b/funasr/version.txt @@ -1 +1 @@ -1.0.7 +1.0.8