From 9a6c6ab5ea25bf2d6e874010ae101e29bde2a217 Mon Sep 17 00:00:00 2001 From: "shixian.shi" Date: Wed, 21 Feb 2024 17:35:14 +0800 Subject: [PATCH 1/2] update rwkv_bat --- funasr/models/rwkv_bat/model.py | 0 funasr/models/rwkv_bat/rwkv.py | 17 ++++++++--------- funasr/models/rwkv_bat/rwkv_attention.py | 15 ++++++--------- funasr/models/rwkv_bat/rwkv_encoder.py | 18 +++++++++++------- funasr/models/rwkv_bat/rwkv_feed_forward.py | 14 +++++--------- funasr/models/rwkv_bat/rwkv_subsampling.py | 20 +++++++------------- 6 files changed, 37 insertions(+), 47 deletions(-) delete mode 100644 funasr/models/rwkv_bat/model.py diff --git a/funasr/models/rwkv_bat/model.py b/funasr/models/rwkv_bat/model.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/funasr/models/rwkv_bat/rwkv.py b/funasr/models/rwkv_bat/rwkv.py index 422e1c8fe..bd218a282 100644 --- a/funasr/models/rwkv_bat/rwkv.py +++ b/funasr/models/rwkv_bat/rwkv.py @@ -1,16 +1,15 @@ -"""Receptance Weighted Key Value (RWKV) block definition. - -Based/modified from https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v4/src/model.py - -""" - -from typing import Dict, Optional, Tuple +#!/usr/bin/env python3 +# -*- encoding: utf-8 -*- +# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. +# MIT License (https://opensource.org/licenses/MIT) import torch +from typing import Dict, Optional, Tuple -from funasr.models.rwkv_bat.rwkv_attention import EncoderSelfAttention, DecoderSelfAttention -from funasr.models.rwkv_bat.rwkv_feed_forward import FeedForward from funasr.models.transformer.layer_norm import LayerNorm +from funasr.models.rwkv_bat.rwkv_feed_forward import FeedForward +from funasr.models.rwkv_bat.rwkv_attention import EncoderSelfAttention, DecoderSelfAttention + class RWKV(torch.nn.Module): """RWKV module. diff --git a/funasr/models/rwkv_bat/rwkv_attention.py b/funasr/models/rwkv_bat/rwkv_attention.py index 5384fb9ca..c085874e4 100644 --- a/funasr/models/rwkv_bat/rwkv_attention.py +++ b/funasr/models/rwkv_bat/rwkv_attention.py @@ -1,17 +1,14 @@ -"""Attention (time mixing) modules for RWKV block. - -Based/Modified from https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v4/src/model.py. - -Some variables are renamed according to https://github.com/huggingface/transformers/blob/main/src/transformers/models/rwkv/modeling_rwkv.py. - -""" # noqa +#!/usr/bin/env python3 +# -*- encoding: utf-8 -*- +# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. +# MIT License (https://opensource.org/licenses/MIT) import math -from importlib.util import find_spec +import torch from pathlib import Path +from importlib.util import find_spec from typing import List, Optional, Tuple, Union -import torch wkv_kernel_encoder = None wkv_kernel_decoder = None diff --git a/funasr/models/rwkv_bat/rwkv_encoder.py b/funasr/models/rwkv_bat/rwkv_encoder.py index af702e91b..c0e5f4255 100644 --- a/funasr/models/rwkv_bat/rwkv_encoder.py +++ b/funasr/models/rwkv_bat/rwkv_encoder.py @@ -1,17 +1,20 @@ -"""RWKV encoder definition for Transducer models.""" - -import math -from typing import Dict, List, Optional, Tuple +#!/usr/bin/env python3 +# -*- encoding: utf-8 -*- +# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. 
+# MIT License (https://opensource.org/licenses/MIT) import torch +from typing import Dict, List, Optional, Tuple -from funasr.models.encoder.abs_encoder import AbsEncoder +from funasr.register import tables from funasr.models.rwkv_bat.rwkv import RWKV from funasr.models.transformer.layer_norm import LayerNorm -from funasr.models.rwkv_bat.rwkv_subsampling import RWKVConvInput from funasr.models.transformer.utils.nets_utils import make_source_mask +from funasr.models.rwkv_bat.rwkv_subsampling import RWKVConvInput -class RWKVEncoder(AbsEncoder): + +@tables.register("encoder_classes", "RWKVEncoder") +class RWKVEncoder(torch.nn.Module): """RWKV encoder module. Based on https://arxiv.org/pdf/2305.13048.pdf. @@ -44,6 +47,7 @@ class RWKVEncoder(AbsEncoder): subsampling_factor: int =4, time_reduction_factor: int = 1, kernel: int = 3, + **kwargs, ) -> None: """Construct a RWKVEncoder object.""" super().__init__() diff --git a/funasr/models/rwkv_bat/rwkv_feed_forward.py b/funasr/models/rwkv_bat/rwkv_feed_forward.py index ddb42859e..32949ab02 100644 --- a/funasr/models/rwkv_bat/rwkv_feed_forward.py +++ b/funasr/models/rwkv_bat/rwkv_feed_forward.py @@ -1,14 +1,10 @@ -"""Feed-forward (channel mixing) module for RWKV block. - -Based/Modified from https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v4/src/model.py - -Some variables are renamed according to https://github.com/huggingface/transformers/blob/main/src/transformers/models/rwkv/modeling_rwkv.py. - -""" # noqa - -from typing import List, Optional, Tuple +#!/usr/bin/env python3 +# -*- encoding: utf-8 -*- +# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. +# MIT License (https://opensource.org/licenses/MIT) import torch +from typing import List, Optional, Tuple class FeedForward(torch.nn.Module): diff --git a/funasr/models/rwkv_bat/rwkv_subsampling.py b/funasr/models/rwkv_bat/rwkv_subsampling.py index 54ad1f5ad..a688acaff 100644 --- a/funasr/models/rwkv_bat/rwkv_subsampling.py +++ b/funasr/models/rwkv_bat/rwkv_subsampling.py @@ -1,19 +1,13 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- +# -*- encoding: utf-8 -*- +# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. +# MIT License (https://opensource.org/licenses/MIT) -# Copyright 2019 Shigeki Karita -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -"""Subsampling layer definition.""" -import numpy as np -import torch -import torch.nn.functional as F -from funasr.models.transformer.embedding import PositionalEncoding -import logging -from funasr.models.scama.utils import sequence_mask -from funasr.models.transformer.utils.nets_utils import sub_factor_to_params, pad_to_len -from typing import Optional, Tuple, Union import math +import torch +from typing import Optional, Tuple, Union +from funasr.models.transformer.utils.nets_utils import pad_to_len + class TooShortUttError(Exception): """Raised when the utt is too short for subsampling. 
From cdca62d933c4e0766a05044c6cba7cfa0596e615 Mon Sep 17 00:00:00 2001 From: zhifu gao Date: Wed, 21 Feb 2024 19:22:59 +0800 Subject: [PATCH 2/2] Dev gzf (#1377) * update train recipe * v1.0.8 * llm * update trainer --- .../paraformer/{infer_demo.sh => demo.sh} | 0 .../paraformer/finetune.sh | 2 +- funasr/datasets/audio_datasets/scp2jsonl.py | 16 +++++++-------- .../models/mossformer/mossformer_encoder.py | 2 +- funasr/models/paraformer/model.py | 4 +++- funasr/train_utils/trainer.py | 20 +++++++++---------- funasr/version.txt | 2 +- 7 files changed, 24 insertions(+), 22 deletions(-) rename examples/industrial_data_pretraining/paraformer/{infer_demo.sh => demo.sh} (100%) diff --git a/examples/industrial_data_pretraining/paraformer/infer_demo.sh b/examples/industrial_data_pretraining/paraformer/demo.sh similarity index 100% rename from examples/industrial_data_pretraining/paraformer/infer_demo.sh rename to examples/industrial_data_pretraining/paraformer/demo.sh diff --git a/examples/industrial_data_pretraining/paraformer/finetune.sh b/examples/industrial_data_pretraining/paraformer/finetune.sh index 8bdd8daaf..266346cca 100644 --- a/examples/industrial_data_pretraining/paraformer/finetune.sh +++ b/examples/industrial_data_pretraining/paraformer/finetune.sh @@ -6,7 +6,7 @@ #git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git ${local_path} ## generate jsonl from wav.scp and text.txt -#python funasr/datasets/audio_datasets/scp2jsonl.py \ +#python -m funasr.datasets.audio_datasets.scp2jsonl \ #++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \ #++data_type_list='["source", "target"]' \ #++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl diff --git a/funasr/datasets/audio_datasets/scp2jsonl.py b/funasr/datasets/audio_datasets/scp2jsonl.py index b6df34ae3..e09a84a61 100644 --- a/funasr/datasets/audio_datasets/scp2jsonl.py +++ b/funasr/datasets/audio_datasets/scp2jsonl.py @@ -72,14 +72,7 @@ def parse_context_length(data_list: list, data_type: str): @hydra.main(config_name=None, version_base=None) def main_hydra(cfg: DictConfig): - """ - python funasr/datasets/audio_datasets/scp2jsonl.py \ - ++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \ - ++data_type_list='["source", "target"]' \ - ++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl - - """ - + kwargs = OmegaConf.to_container(cfg, resolve=True) scp_file_list = kwargs.get("scp_file_list", ("/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt")) @@ -90,6 +83,13 @@ def main_hydra(cfg: DictConfig): gen_jsonl_from_wav_text_list(scp_file_list, data_type_list=data_type_list, jsonl_file_out=jsonl_file_out) +""" +python -m funasr.datasets.audio_datasets.scp2jsonl \ +++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \ +++data_type_list='["source", "target"]' \ +++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl +""" + if __name__ == "__main__": main_hydra() diff --git a/funasr/models/mossformer/mossformer_encoder.py b/funasr/models/mossformer/mossformer_encoder.py index d06af999d..a28c960e8 100644 --- a/funasr/models/mossformer/mossformer_encoder.py +++ b/funasr/models/mossformer/mossformer_encoder.py @@ -4,7 +4,7 @@ import torch.nn.functional as F try: from rotary_embedding_torch import RotaryEmbedding except: - 
print("Please install rotary_embedding_torch by: \n pip install -U rotary_embedding_torch") + print("If you want use mossformer, lease install rotary_embedding_torch by: \n pip install -U rotary_embedding_torch") from funasr.models.transformer.layer_norm import GlobalLayerNorm, CumulativeLayerNorm, ScaleNorm from funasr.models.transformer.embedding import ScaledSinuEmbedding from funasr.models.transformer.mossformer import FLASH_ShareA_FFConvM diff --git a/funasr/models/paraformer/model.py b/funasr/models/paraformer/model.py index 729b8f500..90ce162d4 100644 --- a/funasr/models/paraformer/model.py +++ b/funasr/models/paraformer/model.py @@ -455,7 +455,9 @@ class Paraformer(torch.nn.Module): speech, speech_lengths = data_in, data_lengths if len(speech.shape) < 3: speech = speech[None, :, :] - if speech_lengths is None: + if speech_lengths is not None: + speech_lengths = speech_lengths.squeeze(-1) + else: speech_lengths = speech.shape[1] else: # extract fbank feats diff --git a/funasr/train_utils/trainer.py b/funasr/train_utils/trainer.py index c2326424f..6a59f91a1 100644 --- a/funasr/train_utils/trainer.py +++ b/funasr/train_utils/trainer.py @@ -181,7 +181,7 @@ class Trainer: time2 = time.perf_counter() time_escaped = (time2 - time1)/3600.0 - print(f"\ntime_escaped_epoch: {time_escaped:.3f} hours, estimated to finish {self.max_epoch} epoch: {(self.max_epoch-epoch)*time_escaped:.3f}\n") + print(f"\nrank: {self.local_rank}, time_escaped_epoch: {time_escaped:.3f} hours, estimated to finish {self.max_epoch} epoch: {(self.max_epoch-epoch)*time_escaped:.3f}\n") if self.rank == 0: average_checkpoints(self.output_dir, self.avg_nbest_model) @@ -302,17 +302,14 @@ class Trainer: ) pbar.set_description(description) if self.writer: - self.writer.add_scalar(f'rank{self.local_rank}_Loss/train', loss.item(), - epoch*len(self.dataloader_train) + batch_idx) + self.writer.add_scalar(f'rank{self.local_rank}_Loss/train', loss.item(), self.batch_total) + self.writer.add_scalar(f'rank{self.local_rank}_lr/train', lr, self.batch_total) for key, var in stats.items(): - self.writer.add_scalar(f'rank{self.local_rank}_{key}/train', var.item(), - epoch * len(self.dataloader_train) + batch_idx) + self.writer.add_scalar(f'rank{self.local_rank}_{key}/train', var.item(), self.batch_total) for key, var in speed_stats.items(): - self.writer.add_scalar(f'rank{self.local_rank}_{key}/train', eval(var), - epoch * len(self.dataloader_train) + batch_idx) - - # if batch_idx == 2: - # break + self.writer.add_scalar(f'rank{self.local_rank}_{key}/train', eval(var), self.batch_total) + + pbar.close() def _validate_epoch(self, epoch): @@ -356,7 +353,10 @@ class Trainer: if (batch_idx+1) % self.log_interval == 0 or (batch_idx+1) == len(self.dataloader_val): pbar.update(self.log_interval) + time_now = datetime.now() + time_now = time_now.strftime("%Y-%m-%d %H:%M:%S") description = ( + f"{time_now}, " f"rank: {self.local_rank}, " f"validation epoch: {epoch}/{self.max_epoch}, " f"step: {batch_idx+1}/{len(self.dataloader_val)}, " diff --git a/funasr/version.txt b/funasr/version.txt index 238d6e882..b0f3d96f8 100644 --- a/funasr/version.txt +++ b/funasr/version.txt @@ -1 +1 @@ -1.0.7 +1.0.8