diff --git a/docs/tutorial/Tables.md b/docs/tutorial/Tables.md index 2cbfc6200..831a3ac8e 100644 --- a/docs/tutorial/Tables.md +++ b/docs/tutorial/Tables.md @@ -5,13 +5,13 @@ The original intention of the funasr-1.x.x version is to make model integration easier. The core feature is the registry and AutoModel: * The introduction of the registry enables the development of building blocks to access the model, compatible with a variety of tasks; - + * The newly designed AutoModel interface unifies modelscope, huggingface, and funasr inference and training interfaces, and supports free download of repositories; - + * Support model export, demo-level service deployment, and industrial-level multi-concurrent service deployment; - + * Unify academic and industrial model inference training scripts; - + # Quick to get started @@ -51,19 +51,19 @@ Model = AutoModel(model=[str], device=[str], ncpu=[int], output_dir=[str], batch ``` * `model`(str): [Model Warehouse](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo)The model name in, or the model path in the local disk - + * `device`(str): `cuda:0`(Default gpu0), using GPU for inference, specified. If`cpu`Then the CPU is used for inference - + * `ncpu`(int): `4`(Default), set the number of threads used for CPU internal operation parallelism - + * `output_dir`(str): `None`(Default) If set, the output path of the output result - + * `batch_size`(int): `1`(Default), batch processing during decoding, number of samples - + * `hub`(str):`ms`(Default) to download the model from modelscope. If`hf`To download the model from huggingface. - + * `**kwargs`(dict): All in`config.yaml`Parameters, which can be specified directly here, for example, the maximum cut length in the vad model.`max_single_segment_time=6000`(Milliseconds). - + #### AutoModel reasoning @@ -72,13 +72,13 @@ Res = model.generate(input=[str], output_dir=[str]) ``` * * wav file path, for example: asr\_example.wav - + * pcm file path, for example: asr\_example.pcm, you need to specify the audio sampling rate fs (default is 16000) - + * Audio byte stream, for example: microphone byte data - + * wav.scp,kaldi-style wav list (`wav_id \t wav_path`), for example: - + ```plaintext Asr_example1./audios/asr_example1.wav @@ -89,13 +89,13 @@ Asr_example2./audios/asr_example2.wav In this input * Audio sampling points, for example:`audio, rate = soundfile.read("asr_example_zh.wav")`Is numpy.ndarray. batch input is supported. The type is list:`[audio_sample1, audio_sample2, ..., audio_sampleN]` - + * fbank input, support group batch. shape is \[batch, frames, dim\], type is torch.Tensor, for example - + * `output_dir`: None (default), if set, the output path of the output result - + * `**kwargs`(dict): Model-related inference parameters, e.g,`beam_size=10`,`decoding_ctc_weight=0.1`. - + Detailed documentation link:[https://github.com/modelscope/FunASR/blob/main/examples/README\_zh.md](https://github.com/modelscope/FunASR/blob/main/examples/README_zh.md) @@ -128,7 +128,7 @@ encoder_conf: pos_enc_class: SinusoidalPositionEncoder normalize_before: true kernel_size: 11 - sanm_shfit: 0 + sanm_shift: 0 selfattention_layer_type: sanm @@ -208,10 +208,10 @@ Path resolution: configuration.json (not required) "model": {"type" : "funasr"}, "pipeline": {"type":"funasr-pipeline"}, "model_name_in_hub": { - "ms":"", + "ms":"", "hf":""}, "file_path_metas": { - "init_param":"model.pt", + "init_param":"model.pt", "config":"config.yaml", "tokenizer_conf": {"bpemodel": "chn_jpn_yue_eng_ko_spectok.bpe.model"}, "frontend_conf":{"cmvn_file": "am.mvn"}} @@ -274,7 +274,7 @@ class SenseVoiceSmall(nn.Module): def forward( self, **kwargs, - ): + ): def inference( self, @@ -320,9 +320,9 @@ from funasr.models.sense_voice.model import * ## Principles of Registration * Model: models are independent of each other. Each Model needs to create a new Model directory under funasr/models/. Do not use class inheritance method!!! Do not import from other model directories, and put everything you need into your own model directory!!! Do not modify the existing model code!!! - + * dataset,frontend,tokenizer, if you can reuse the existing one, reuse it directly, if you cannot reuse it, please register a new one, modify it again, and do not modify the original one!!! - + # Independent warehouse @@ -337,7 +337,7 @@ from funasr import AutoModel model = AutoModel ( model="iic/SenseVoiceSmall ", trust_remote_code=True -remote_code = "./model.py", +remote_code = "./model.py", ) ``` @@ -360,4 +360,4 @@ res = m.inference( print(text) ``` -Trim reference:[https://github.com/FunAudioLLM/SenseVoice/blob/main/finetune.sh](https://github.com/FunAudioLLM/SenseVoice/blob/main/finetune.sh) \ No newline at end of file +Trim reference:[https://github.com/FunAudioLLM/SenseVoice/blob/main/finetune.sh](https://github.com/FunAudioLLM/SenseVoice/blob/main/finetune.sh) diff --git a/docs/tutorial/Tables_zh.md b/docs/tutorial/Tables_zh.md index 72af82a0b..e9360e0d4 100644 --- a/docs/tutorial/Tables_zh.md +++ b/docs/tutorial/Tables_zh.md @@ -5,13 +5,13 @@ funasr-1.x.x 版本的设计初衷是【**让模型集成更简单**】,核心feature为注册表与AutoModel: * 注册表的引入,使得开发中可以用搭积木的方式接入模型,兼容多种task; - + * 新设计的AutoModel接口,统一modelscope、huggingface与funasr推理与训练接口,支持自由选择下载仓库; - + * 支持模型导出,demo级别服务部署,以及工业级别多并发服务部署; - + * 统一学术与工业模型推理训练脚本; - + # 快速上手 @@ -51,19 +51,19 @@ model = AutoModel(model=[str], device=[str], ncpu=[int], output_dir=[str], batch ``` * `model`(str): [模型仓库](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo) 中的模型名称,或本地磁盘中的模型路径 - + * `device`(str): `cuda:0`(默认gpu0),使用 GPU 进行推理,指定。如果为`cpu`,则使用 CPU 进行推理 - + * `ncpu`(int): `4` (默认),设置用于 CPU 内部操作并行性的线程数 - + * `output_dir`(str): `None` (默认),如果设置,输出结果的输出路径 - + * `batch_size`(int): `1` (默认),解码时的批处理,样本个数 - + * `hub`(str):`ms`(默认),从modelscope下载模型。如果为`hf`,从huggingface下载模型。 - + * `**kwargs`(dict): 所有在`config.yaml`中参数,均可以直接在此处指定,例如,vad模型中最大切割长度 `max_single_segment_time=6000` (毫秒)。 - + #### AutoModel 推理 @@ -72,13 +72,13 @@ res = model.generate(input=[str], output_dir=[str]) ``` * * wav文件路径, 例如: asr\_example.wav - + * pcm文件路径, 例如: asr\_example.pcm,此时需要指定音频采样率fs(默认为16000) - + * 音频字节数流,例如:麦克风的字节数数据 - + * wav.scp,kaldi 样式的 wav 列表 (`wav_id \t wav_path`), 例如: - + ```plaintext asr_example1 ./audios/asr_example1.wav @@ -89,13 +89,13 @@ asr_example2 ./audios/asr_example2.wav 在这种输入  * 音频采样点,例如:`audio, rate = soundfile.read("asr_example_zh.wav")`, 数据类型为 numpy.ndarray。支持batch输入,类型为list: `[audio_sample1, audio_sample2, ..., audio_sampleN]` - + * fbank输入,支持组batch。shape为\[batch, frames, dim\],类型为torch.Tensor,例如 - + * `output_dir`: None (默认),如果设置,输出结果的输出路径 - + * `**kwargs`(dict): 与模型相关的推理参数,例如,`beam_size=10`,`decoding_ctc_weight=0.1`。 - + 详细文档链接:[https://github.com/modelscope/FunASR/blob/main/examples/README\_zh.md](https://github.com/modelscope/FunASR/blob/main/examples/README_zh.md) @@ -128,7 +128,7 @@ encoder_conf: pos_enc_class: SinusoidalPositionEncoder normalize_before: true kernel_size: 11 - sanm_shfit: 0 + sanm_shift: 0 selfattention_layer_type: sanm @@ -208,10 +208,10 @@ scheduler_conf: "model": {"type" : "funasr"}, "pipeline": {"type":"funasr-pipeline"}, "model_name_in_hub": { - "ms":"", + "ms":"", "hf":""}, "file_path_metas": { - "init_param":"model.pt", + "init_param":"model.pt", "config":"config.yaml", "tokenizer_conf": {"bpemodel": "chn_jpn_yue_eng_ko_spectok.bpe.model"}, "frontend_conf":{"cmvn_file": "am.mvn"}} @@ -274,7 +274,7 @@ class SenseVoiceSmall(nn.Module): def forward( self, **kwargs, - ): + ): def inference( self, @@ -320,9 +320,9 @@ from funasr.models.sense_voice.model import * ## 注册原则 * Model:模型之间互相独立,每一个模型,都需要在funasr/models/下面新建一个模型目录,不要采用类的继承方法!!!不要从其他模型目录中import,所有需要用到的都单独放到自己的模型目录中!!!不要修改现有的模型代码!!! - + * dataset,frontend,tokenizer,如果能复用现有的,直接复用,如果不能复用,请注册一个新的,再修改,不要修改原来的!!! - + # 独立仓库 @@ -336,8 +336,8 @@ from funasr import AutoModel # trust_remote_code:`True` 表示 model 代码实现从 `remote_code` 处加载,`remote_code` 指定 `model` 具体代码的位置(例如,当前目录下的 `model.py`),支持绝对路径与相对路径,以及网络 url。 model = AutoModel( model="iic/SenseVoiceSmall", - trust_remote_code=True, - remote_code="./model.py", + trust_remote_code=True, + remote_code="./model.py", ) ``` @@ -360,4 +360,4 @@ res = m.inference( print(text) ``` -微调参考:[https://github.com/FunAudioLLM/SenseVoice/blob/main/finetune.sh](https://github.com/FunAudioLLM/SenseVoice/blob/main/finetune.sh) \ No newline at end of file +微调参考:[https://github.com/FunAudioLLM/SenseVoice/blob/main/finetune.sh](https://github.com/FunAudioLLM/SenseVoice/blob/main/finetune.sh) diff --git a/examples/industrial_data_pretraining/sanm_kws/conf/sanm_6e_320_256_fdim40_t2602.yaml b/examples/industrial_data_pretraining/sanm_kws/conf/sanm_6e_320_256_fdim40_t2602.yaml index c4d8c1826..4fa8f35ed 100644 --- a/examples/industrial_data_pretraining/sanm_kws/conf/sanm_6e_320_256_fdim40_t2602.yaml +++ b/examples/industrial_data_pretraining/sanm_kws/conf/sanm_6e_320_256_fdim40_t2602.yaml @@ -18,7 +18,7 @@ encoder_conf: pos_enc_class: SinusoidalPositionEncoder normalize_before: true kernel_size: 11 - sanm_shfit: 0 + sanm_shift: 0 selfattention_layer_type: sanm # frontend related diff --git a/examples/industrial_data_pretraining/sanm_kws_streaming/conf/sanm_6e_320_256_fdim40_t2602.yaml b/examples/industrial_data_pretraining/sanm_kws_streaming/conf/sanm_6e_320_256_fdim40_t2602.yaml index 664997c10..28b3ccbf0 100644 --- a/examples/industrial_data_pretraining/sanm_kws_streaming/conf/sanm_6e_320_256_fdim40_t2602.yaml +++ b/examples/industrial_data_pretraining/sanm_kws_streaming/conf/sanm_6e_320_256_fdim40_t2602.yaml @@ -18,7 +18,7 @@ encoder_conf: pos_enc_class: SinusoidalPositionEncoder normalize_before: true kernel_size: 11 - sanm_shfit: 0 + sanm_shift: 0 selfattention_layer_type: sanm chunk_size: - 16 diff --git a/funasr/models/bicif_paraformer/template.yaml b/funasr/models/bicif_paraformer/template.yaml index db7ce5555..710938c3d 100644 --- a/funasr/models/bicif_paraformer/template.yaml +++ b/funasr/models/bicif_paraformer/template.yaml @@ -30,7 +30,7 @@ encoder_conf: pos_enc_class: SinusoidalPositionEncoder normalize_before: true kernel_size: 11 - sanm_shfit: 0 + sanm_shift: 0 selfattention_layer_type: sanm # decoder @@ -45,7 +45,7 @@ decoder_conf: src_attention_dropout_rate: 0.1 att_layer_num: 16 kernel_size: 11 - sanm_shfit: 0 + sanm_shift: 0 predictor: CifPredictorV3 predictor_conf: diff --git a/funasr/models/contextual_paraformer/decoder.py b/funasr/models/contextual_paraformer/decoder.py index ba2ce9ade..958c46b54 100644 --- a/funasr/models/contextual_paraformer/decoder.py +++ b/funasr/models/contextual_paraformer/decoder.py @@ -137,7 +137,7 @@ class ContextualParaformerDecoder(ParaformerSANMDecoder): concat_after: bool = False, att_layer_num: int = 6, kernel_size: int = 21, - sanm_shfit: int = 0, + sanm_shift: int = 0, ): super().__init__( vocab_size=vocab_size, @@ -179,14 +179,14 @@ class ContextualParaformerDecoder(ParaformerSANMDecoder): self.att_layer_num = att_layer_num self.num_blocks = num_blocks - if sanm_shfit is None: - sanm_shfit = (kernel_size - 1) // 2 + if sanm_shift is None: + sanm_shift = (kernel_size - 1) // 2 self.decoders = repeat( att_layer_num - 1, lambda lnum: DecoderLayerSANM( attention_dim, MultiHeadedAttentionSANMDecoder( - attention_dim, self_attention_dropout_rate, kernel_size, sanm_shfit=sanm_shfit + attention_dim, self_attention_dropout_rate, kernel_size, sanm_shift=sanm_shift ), MultiHeadedAttentionCrossAtt( attention_heads, attention_dim, src_attention_dropout_rate @@ -210,7 +210,7 @@ class ContextualParaformerDecoder(ParaformerSANMDecoder): self.last_decoder = ContextualDecoderLayer( attention_dim, MultiHeadedAttentionSANMDecoder( - attention_dim, self_attention_dropout_rate, kernel_size, sanm_shfit=sanm_shfit + attention_dim, self_attention_dropout_rate, kernel_size, sanm_shift=sanm_shift ), MultiHeadedAttentionCrossAtt( attention_heads, attention_dim, src_attention_dropout_rate @@ -228,7 +228,7 @@ class ContextualParaformerDecoder(ParaformerSANMDecoder): lambda lnum: DecoderLayerSANM( attention_dim, MultiHeadedAttentionSANMDecoder( - attention_dim, self_attention_dropout_rate, kernel_size, sanm_shfit=0 + attention_dim, self_attention_dropout_rate, kernel_size, sanm_shift=0 ), None, PositionwiseFeedForwardDecoderSANM(attention_dim, linear_units, dropout_rate), diff --git a/funasr/models/contextual_paraformer/template.yaml b/funasr/models/contextual_paraformer/template.yaml index 22052500c..17e542a65 100644 --- a/funasr/models/contextual_paraformer/template.yaml +++ b/funasr/models/contextual_paraformer/template.yaml @@ -30,7 +30,7 @@ encoder_conf: pos_enc_class: SinusoidalPositionEncoder normalize_before: true kernel_size: 11 - sanm_shfit: 0 + sanm_shift: 0 selfattention_layer_type: sanm @@ -46,7 +46,7 @@ decoder_conf: src_attention_dropout_rate: 0.1 att_layer_num: 16 kernel_size: 11 - sanm_shfit: 0 + sanm_shift: 0 predictor: CifPredictorV2 predictor_conf: @@ -126,4 +126,4 @@ ctc_conf: ctc_type: builtin reduce: true ignore_nan_grad: true -normalize: null \ No newline at end of file +normalize: null diff --git a/funasr/models/ct_transformer/template.yaml b/funasr/models/ct_transformer/template.yaml index 2538e6b1d..7ad6e697d 100644 --- a/funasr/models/ct_transformer/template.yaml +++ b/funasr/models/ct_transformer/template.yaml @@ -41,7 +41,7 @@ encoder_conf: pos_enc_class: SinusoidalPositionEncoder normalize_before: true kernel_size: 11 - sanm_shfit: 0 + sanm_shift: 0 selfattention_layer_type: sanm padding_idx: 0 diff --git a/funasr/models/ct_transformer_streaming/attention.py b/funasr/models/ct_transformer_streaming/attention.py index 97e770bbb..be7113f89 100644 --- a/funasr/models/ct_transformer_streaming/attention.py +++ b/funasr/models/ct_transformer_streaming/attention.py @@ -11,9 +11,9 @@ class MultiHeadedAttentionSANMwithMask(MultiHeadedAttentionSANM): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - def forward(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None): + def forward(self, x, mask, mask_shift_chunk=None, mask_att_chunk_encoder=None): q_h, k_h, v_h, v = self.forward_qkv(x) - fsmn_memory = self.forward_fsmn(v, mask[0], mask_shfit_chunk) + fsmn_memory = self.forward_fsmn(v, mask[0], mask_shift_chunk) q_h = q_h * self.d_k ** (-0.5) scores = torch.matmul(q_h, k_h.transpose(-2, -1)) att_outs = self.forward_attention(v_h, scores, mask[1], mask_att_chunk_encoder) diff --git a/funasr/models/ct_transformer_streaming/encoder.py b/funasr/models/ct_transformer_streaming/encoder.py index a61319aa4..7d09875fb 100644 --- a/funasr/models/ct_transformer_streaming/encoder.py +++ b/funasr/models/ct_transformer_streaming/encoder.py @@ -56,7 +56,7 @@ class EncoderLayerSANM(torch.nn.Module): self.stochastic_depth_rate = stochastic_depth_rate self.dropout_rate = dropout_rate - def forward(self, x, mask, cache=None, mask_shfit_chunk=None, mask_att_chunk_encoder=None): + def forward(self, x, mask, cache=None, mask_shift_chunk=None, mask_att_chunk_encoder=None): """Compute encoded features. Args: @@ -93,7 +93,7 @@ class EncoderLayerSANM(torch.nn.Module): self.self_attn( x, mask, - mask_shfit_chunk=mask_shfit_chunk, + mask_shift_chunk=mask_shift_chunk, mask_att_chunk_encoder=mask_att_chunk_encoder, ), ), @@ -109,7 +109,7 @@ class EncoderLayerSANM(torch.nn.Module): self.self_attn( x, mask, - mask_shfit_chunk=mask_shfit_chunk, + mask_shift_chunk=mask_shift_chunk, mask_att_chunk_encoder=mask_att_chunk_encoder, ) ) @@ -118,7 +118,7 @@ class EncoderLayerSANM(torch.nn.Module): self.self_attn( x, mask, - mask_shfit_chunk=mask_shfit_chunk, + mask_shift_chunk=mask_shift_chunk, mask_att_chunk_encoder=mask_att_chunk_encoder, ) ) @@ -132,7 +132,7 @@ class EncoderLayerSANM(torch.nn.Module): if not self.normalize_before: x = self.norm2(x) - return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder + return x, mask, cache, mask_shift_chunk, mask_att_chunk_encoder def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0): """Compute encoded features. @@ -198,7 +198,7 @@ class SANMVadEncoder(torch.nn.Module): interctc_layer_idx: List[int] = [], interctc_use_conditioning: bool = False, kernel_size: int = 11, - sanm_shfit: int = 0, + sanm_shift: int = 0, selfattention_layer_type: str = "sanm", ): super().__init__() @@ -277,7 +277,7 @@ class SANMVadEncoder(torch.nn.Module): output_size, attention_dropout_rate, kernel_size, - sanm_shfit, + sanm_shift, ) encoder_selfattn_layer_args = ( @@ -286,7 +286,7 @@ class SANMVadEncoder(torch.nn.Module): output_size, attention_dropout_rate, kernel_size, - sanm_shfit, + sanm_shift, ) self.encoders0 = repeat( diff --git a/funasr/models/ct_transformer_streaming/template.yaml b/funasr/models/ct_transformer_streaming/template.yaml index 2477ac2be..ae59b4446 100644 --- a/funasr/models/ct_transformer_streaming/template.yaml +++ b/funasr/models/ct_transformer_streaming/template.yaml @@ -41,10 +41,10 @@ encoder_conf: pos_enc_class: SinusoidalPositionEncoder normalize_before: true kernel_size: 11 - sanm_shfit: 5 + sanm_shift: 5 selfattention_layer_type: sanm padding_idx: 0 tokenizer: CharTokenizer tokenizer_conf: - unk_symbol: \ No newline at end of file + unk_symbol: diff --git a/funasr/models/monotonic_aligner/template.yaml b/funasr/models/monotonic_aligner/template.yaml index f8d5ded7d..82abf35c6 100644 --- a/funasr/models/monotonic_aligner/template.yaml +++ b/funasr/models/monotonic_aligner/template.yaml @@ -25,7 +25,7 @@ encoder_conf: pos_enc_class: SinusoidalPositionEncoder normalize_before: true kernel_size: 11 - sanm_shfit: 0 + sanm_shift: 0 selfattention_layer_type: sanm predictor: CifPredictorV3 @@ -111,5 +111,5 @@ ctc_conf: ctc_type: builtin reduce: true ignore_nan_grad: true - + normalize: null diff --git a/funasr/models/paraformer/decoder.py b/funasr/models/paraformer/decoder.py index 7edd91a2a..fafb8d41a 100644 --- a/funasr/models/paraformer/decoder.py +++ b/funasr/models/paraformer/decoder.py @@ -248,7 +248,7 @@ class ParaformerSANMDecoder(BaseTransformerDecoder): concat_after: bool = False, att_layer_num: int = 6, kernel_size: int = 21, - sanm_shfit: int = 0, + sanm_shift: int = 0, lora_list: List[str] = None, lora_rank: int = 8, lora_alpha: int = 16, @@ -298,14 +298,14 @@ class ParaformerSANMDecoder(BaseTransformerDecoder): self.att_layer_num = att_layer_num self.num_blocks = num_blocks - if sanm_shfit is None: - sanm_shfit = (kernel_size - 1) // 2 + if sanm_shift is None: + sanm_shift = (kernel_size - 1) // 2 self.decoders = repeat( att_layer_num, lambda lnum: DecoderLayerSANM( attention_dim, MultiHeadedAttentionSANMDecoder( - attention_dim, self_attention_dropout_rate, kernel_size, sanm_shfit=sanm_shfit + attention_dim, self_attention_dropout_rate, kernel_size, sanm_shift=sanm_shift ), MultiHeadedAttentionCrossAtt( attention_heads, @@ -330,7 +330,7 @@ class ParaformerSANMDecoder(BaseTransformerDecoder): lambda lnum: DecoderLayerSANM( attention_dim, MultiHeadedAttentionSANMDecoder( - attention_dim, self_attention_dropout_rate, kernel_size, sanm_shfit=0 + attention_dim, self_attention_dropout_rate, kernel_size, sanm_shift=0 ), None, PositionwiseFeedForwardDecoderSANM(attention_dim, linear_units, dropout_rate), @@ -785,20 +785,20 @@ class ParaformerSANMDecoderExport(torch.nn.Module): for _ in range(cache_num) ] return (tgt, memory, pre_acoustic_embeds, cache) - + def is_optimizable(self): return True - + def get_input_names(self): cache_num = len(self.model.decoders) + len(self.model.decoders2) return ['tgt', 'memory', 'pre_acoustic_embeds'] \ + ['cache_%d' % i for i in range(cache_num)] - + def get_output_names(self): cache_num = len(self.model.decoders) + len(self.model.decoders2) return ['y'] \ + ['out_cache_%d' % i for i in range(cache_num)] - + def get_dynamic_axes(self): ret = { 'tgt': { diff --git a/funasr/models/paraformer/template.yaml b/funasr/models/paraformer/template.yaml index 249e88ca6..170c10be4 100644 --- a/funasr/models/paraformer/template.yaml +++ b/funasr/models/paraformer/template.yaml @@ -29,7 +29,7 @@ encoder_conf: pos_enc_class: SinusoidalPositionEncoder normalize_before: true kernel_size: 11 - sanm_shfit: 0 + sanm_shift: 0 selfattention_layer_type: sanm # decoder @@ -44,7 +44,7 @@ decoder_conf: src_attention_dropout_rate: 0.1 att_layer_num: 16 kernel_size: 11 - sanm_shfit: 0 + sanm_shift: 0 predictor: CifPredictorV2 predictor_conf: diff --git a/funasr/models/paraformer_streaming/model.py b/funasr/models/paraformer_streaming/model.py index 16021ceb6..bbc9668d0 100644 --- a/funasr/models/paraformer_streaming/model.py +++ b/funasr/models/paraformer_streaming/model.py @@ -198,10 +198,10 @@ class ParaformerStreaming(Paraformer): mask_chunk_predictor = self.encoder.overlap_chunk_cls.get_mask_chunk_predictor( None, device=encoder_out.device, batch_size=encoder_out.size(0) ) - mask_shfit_chunk = self.encoder.overlap_chunk_cls.get_mask_shfit_chunk( + mask_shift_chunk = self.encoder.overlap_chunk_cls.get_mask_shift_chunk( None, device=encoder_out.device, batch_size=encoder_out.size(0) ) - encoder_out = encoder_out * mask_shfit_chunk + encoder_out = encoder_out * mask_shift_chunk pre_acoustic_embeds, pre_token_length, pre_alphas, _ = self.predictor( encoder_out, ys_pad, @@ -357,10 +357,10 @@ class ParaformerStreaming(Paraformer): mask_chunk_predictor = self.encoder.overlap_chunk_cls.get_mask_chunk_predictor( None, device=encoder_out.device, batch_size=encoder_out.size(0) ) - mask_shfit_chunk = self.encoder.overlap_chunk_cls.get_mask_shfit_chunk( + mask_shift_chunk = self.encoder.overlap_chunk_cls.get_mask_shift_chunk( None, device=encoder_out.device, batch_size=encoder_out.size(0) ) - encoder_out = encoder_out * mask_shfit_chunk + encoder_out = encoder_out * mask_shift_chunk pre_acoustic_embeds, pre_token_length, pre_alphas, pre_peak_index = self.predictor( encoder_out, None, diff --git a/funasr/models/paraformer_streaming/template.yaml b/funasr/models/paraformer_streaming/template.yaml index 889971ad1..44cbbc7bb 100644 --- a/funasr/models/paraformer_streaming/template.yaml +++ b/funasr/models/paraformer_streaming/template.yaml @@ -29,7 +29,7 @@ encoder_conf: pos_enc_class: SinusoidalPositionEncoder normalize_before: true kernel_size: 11 - sanm_shfit: 0 + sanm_shift: 0 selfattention_layer_type: sanm chunk_size: - 12 @@ -59,7 +59,7 @@ decoder_conf: src_attention_dropout_rate: 0.1 att_layer_num: 16 kernel_size: 11 - sanm_shfit: 5 + sanm_shift: 5 predictor: CifPredictorV2 predictor_conf: diff --git a/funasr/models/sanm/attention.py b/funasr/models/sanm/attention.py index 47d60cb67..a9bb70fe8 100644 --- a/funasr/models/sanm/attention.py +++ b/funasr/models/sanm/attention.py @@ -154,7 +154,7 @@ class MultiHeadedAttentionSANM(nn.Module): n_feat, dropout_rate, kernel_size, - sanm_shfit=0, + sanm_shift=0, lora_list=None, lora_rank=8, lora_alpha=16, @@ -199,17 +199,17 @@ class MultiHeadedAttentionSANM(nn.Module): ) # padding left_padding = (kernel_size - 1) // 2 - if sanm_shfit > 0: - left_padding = left_padding + sanm_shfit + if sanm_shift > 0: + left_padding = left_padding + sanm_shift right_padding = kernel_size - 1 - left_padding self.pad_fn = nn.ConstantPad1d((left_padding, right_padding), 0.0) - def forward_fsmn(self, inputs, mask, mask_shfit_chunk=None): + def forward_fsmn(self, inputs, mask, mask_shift_chunk=None): b, t, d = inputs.size() if mask is not None: mask = torch.reshape(mask, (b, -1, 1)) - if mask_shfit_chunk is not None: - mask = mask * mask_shfit_chunk + if mask_shift_chunk is not None: + mask = mask * mask_shift_chunk inputs = inputs * mask x = inputs.transpose(1, 2) @@ -289,7 +289,7 @@ class MultiHeadedAttentionSANM(nn.Module): return self.linear_out(x) # (batch, time1, d_model) - def forward(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None): + def forward(self, x, mask, mask_shift_chunk=None, mask_att_chunk_encoder=None): """Compute scaled dot product attention. Args: @@ -304,7 +304,7 @@ class MultiHeadedAttentionSANM(nn.Module): """ q_h, k_h, v_h, v = self.forward_qkv(x) - fsmn_memory = self.forward_fsmn(v, mask, mask_shfit_chunk) + fsmn_memory = self.forward_fsmn(v, mask, mask_shift_chunk) q_h = q_h * self.d_k ** (-0.5) scores = torch.matmul(q_h, k_h.transpose(-2, -1)) att_outs = self.forward_attention(v_h, scores, mask, mask_att_chunk_encoder) @@ -478,7 +478,7 @@ class MultiHeadedAttentionSANMDecoder(nn.Module): """ - def __init__(self, n_feat, dropout_rate, kernel_size, sanm_shfit=0): + def __init__(self, n_feat, dropout_rate, kernel_size, sanm_shift=0): """Construct an MultiHeadedAttention object.""" super().__init__() @@ -490,13 +490,13 @@ class MultiHeadedAttentionSANMDecoder(nn.Module): # padding # padding left_padding = (kernel_size - 1) // 2 - if sanm_shfit > 0: - left_padding = left_padding + sanm_shfit + if sanm_shift > 0: + left_padding = left_padding + sanm_shift right_padding = kernel_size - 1 - left_padding self.pad_fn = nn.ConstantPad1d((left_padding, right_padding), 0.0) self.kernel_size = kernel_size - def forward(self, inputs, mask, cache=None, mask_shfit_chunk=None): + def forward(self, inputs, mask, cache=None, mask_shift_chunk=None): """ :param x: (#batch, time1, size). :param mask: Mask tensor (#batch, 1, time) @@ -509,9 +509,9 @@ class MultiHeadedAttentionSANMDecoder(nn.Module): if mask is not None: mask = torch.reshape(mask, (b, -1, 1)) # logging.info("in fsmn, mask: {}, {}".format(mask.size(), mask[0:100:50, :, :])) - if mask_shfit_chunk is not None: - # logging.info("in fsmn, mask_fsmn: {}, {}".format(mask_shfit_chunk.size(), mask_shfit_chunk[0:100:50, :, :])) - mask = mask * mask_shfit_chunk + if mask_shift_chunk is not None: + # logging.info("in fsmn, mask_fsmn: {}, {}".format(mask_shift_chunk.size(), mask_shift_chunk[0:100:50, :, :])) + mask = mask * mask_shift_chunk # logging.info("in fsmn, mask_after_fsmn: {}, {}".format(mask.size(), mask[0:100:50, :, :])) # print("in fsmn, mask", mask.size()) # print("in fsmn, inputs", inputs.size()) diff --git a/funasr/models/sanm/decoder.py b/funasr/models/sanm/decoder.py index 1a4fb26e2..01a5f0ece 100644 --- a/funasr/models/sanm/decoder.py +++ b/funasr/models/sanm/decoder.py @@ -226,7 +226,7 @@ class FsmnDecoder(BaseTransformerDecoder): concat_after: bool = False, att_layer_num: int = 6, kernel_size: int = 21, - sanm_shfit: int = None, + sanm_shift: int = None, concat_embeds: bool = False, attention_dim: int = None, tf2torch_tensor_name_prefix_torch: str = "decoder", @@ -271,14 +271,14 @@ class FsmnDecoder(BaseTransformerDecoder): self.att_layer_num = att_layer_num self.num_blocks = num_blocks - if sanm_shfit is None: - sanm_shfit = (kernel_size - 1) // 2 + if sanm_shift is None: + sanm_shift = (kernel_size - 1) // 2 self.decoders = repeat( att_layer_num, lambda lnum: DecoderLayerSANM( attention_dim, MultiHeadedAttentionSANMDecoder( - attention_dim, self_attention_dropout_rate, kernel_size, sanm_shfit=sanm_shfit + attention_dim, self_attention_dropout_rate, kernel_size, sanm_shift=sanm_shift ), MultiHeadedAttentionCrossAtt( attention_heads, @@ -303,7 +303,7 @@ class FsmnDecoder(BaseTransformerDecoder): attention_dim, self_attention_dropout_rate, kernel_size, - sanm_shfit=sanm_shfit, + sanm_shift=sanm_shift, ), None, PositionwiseFeedForwardDecoderSANM(attention_dim, linear_units, dropout_rate), diff --git a/funasr/models/sanm/encoder.py b/funasr/models/sanm/encoder.py index 0d39ca742..b590e2489 100644 --- a/funasr/models/sanm/encoder.py +++ b/funasr/models/sanm/encoder.py @@ -69,7 +69,7 @@ class EncoderLayerSANM(nn.Module): self.stochastic_depth_rate = stochastic_depth_rate self.dropout_rate = dropout_rate - def forward(self, x, mask, cache=None, mask_shfit_chunk=None, mask_att_chunk_encoder=None): + def forward(self, x, mask, cache=None, mask_shift_chunk=None, mask_att_chunk_encoder=None): """Compute encoded features. Args: @@ -106,7 +106,7 @@ class EncoderLayerSANM(nn.Module): self.self_attn( x, mask, - mask_shfit_chunk=mask_shfit_chunk, + mask_shift_chunk=mask_shift_chunk, mask_att_chunk_encoder=mask_att_chunk_encoder, ), ), @@ -122,7 +122,7 @@ class EncoderLayerSANM(nn.Module): self.self_attn( x, mask, - mask_shfit_chunk=mask_shfit_chunk, + mask_shift_chunk=mask_shift_chunk, mask_att_chunk_encoder=mask_att_chunk_encoder, ) ) @@ -131,7 +131,7 @@ class EncoderLayerSANM(nn.Module): self.self_attn( x, mask, - mask_shfit_chunk=mask_shfit_chunk, + mask_shift_chunk=mask_shift_chunk, mask_att_chunk_encoder=mask_att_chunk_encoder, ) ) @@ -145,7 +145,7 @@ class EncoderLayerSANM(nn.Module): if not self.normalize_before: x = self.norm2(x) - return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder + return x, mask, cache, mask_shift_chunk, mask_att_chunk_encoder def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0): """Compute encoded features. @@ -212,7 +212,7 @@ class SANMEncoder(nn.Module): interctc_layer_idx: List[int] = [], interctc_use_conditioning: bool = False, kernel_size: int = 11, - sanm_shfit: int = 0, + sanm_shift: int = 0, lora_list: List[str] = None, lora_rank: int = 8, lora_alpha: int = 16, @@ -299,7 +299,7 @@ class SANMEncoder(nn.Module): output_size, attention_dropout_rate, kernel_size, - sanm_shfit, + sanm_shift, lora_list, lora_rank, lora_alpha, @@ -312,7 +312,7 @@ class SANMEncoder(nn.Module): output_size, attention_dropout_rate, kernel_size, - sanm_shfit, + sanm_shift, lora_list, lora_rank, lora_alpha, diff --git a/funasr/models/sanm/template.yaml b/funasr/models/sanm/template.yaml index 316fe75cb..987fec2ae 100644 --- a/funasr/models/sanm/template.yaml +++ b/funasr/models/sanm/template.yaml @@ -26,7 +26,7 @@ encoder_conf: pos_enc_class: SinusoidalPositionEncoder normalize_before: true kernel_size: 11 - sanm_shfit: 0 + sanm_shift: 0 selfattention_layer_type: sanm # decoder @@ -41,7 +41,7 @@ decoder_conf: src_attention_dropout_rate: 0.1 att_layer_num: 16 kernel_size: 11 - sanm_shfit: 0 + sanm_shift: 0 diff --git a/funasr/models/scama/chunk_utilis.py b/funasr/models/scama/chunk_utilis.py index 2fe3fa4e3..d9b4aa996 100644 --- a/funasr/models/scama/chunk_utilis.py +++ b/funasr/models/scama/chunk_utilis.py @@ -21,7 +21,7 @@ class overlap_chunk: stride: tuple = (10,), pad_left: tuple = (0,), encoder_att_look_back_factor: tuple = (1,), - shfit_fsmn: int = 0, + shift_fsmn: int = 0, decoder_att_look_back_factor: tuple = (1,), ): @@ -45,11 +45,11 @@ class overlap_chunk: encoder_att_look_back_factor, decoder_att_look_back_factor, ) - self.shfit_fsmn = shfit_fsmn + self.shift_fsmn = shift_fsmn self.x_add_mask = None self.x_rm_mask = None self.x_len = None - self.mask_shfit_chunk = None + self.mask_shift_chunk = None self.mask_chunk_predictor = None self.mask_att_chunk_encoder = None self.mask_shift_att_chunk_decoder = None @@ -88,7 +88,7 @@ class overlap_chunk: stride, pad_left, encoder_att_look_back_factor, - chunk_size + self.shfit_fsmn, + chunk_size + self.shift_fsmn, decoder_att_look_back_factor, ) return ( @@ -118,13 +118,13 @@ class overlap_chunk: chunk_size, stride, pad_left, encoder_att_look_back_factor, chunk_size_pad_shift = ( self.get_chunk_size(ind) ) - shfit_fsmn = self.shfit_fsmn + shift_fsmn = self.shift_fsmn pad_right = chunk_size - stride - pad_left chunk_num_batch = np.ceil(x_len / stride).astype(np.int32) x_len_chunk = ( (chunk_num_batch - 1) * chunk_size_pad_shift - + shfit_fsmn + + shift_fsmn + pad_left + 0 + x_len @@ -138,13 +138,13 @@ class overlap_chunk: max_len_for_x_mask_tmp = max(chunk_size, x_len_max + pad_left) x_add_mask = np.zeros([0, max_len_for_x_mask_tmp], dtype=dtype) x_rm_mask = np.zeros([max_len_for_x_mask_tmp, 0], dtype=dtype) - mask_shfit_chunk = np.zeros([0, num_units], dtype=dtype) + mask_shift_chunk = np.zeros([0, num_units], dtype=dtype) mask_chunk_predictor = np.zeros([0, num_units_predictor], dtype=dtype) mask_shift_att_chunk_decoder = np.zeros([0, 1], dtype=dtype) mask_att_chunk_encoder = np.zeros([0, chunk_num * chunk_size_pad_shift], dtype=dtype) for chunk_ids in range(chunk_num): # x_mask add - fsmn_padding = np.zeros((shfit_fsmn, max_len_for_x_mask_tmp), dtype=dtype) + fsmn_padding = np.zeros((shift_fsmn, max_len_for_x_mask_tmp), dtype=dtype) x_mask_cur = np.diag(np.ones(chunk_size, dtype=np.float32)) x_mask_pad_left = np.zeros((chunk_size, chunk_ids * stride), dtype=dtype) x_mask_pad_right = np.zeros((chunk_size, max_len_for_x_mask_tmp), dtype=dtype) @@ -154,7 +154,7 @@ class overlap_chunk: x_add_mask = np.concatenate([x_add_mask, x_add_mask_fsmn], axis=0) # x_mask rm - fsmn_padding = np.zeros((max_len_for_x_mask_tmp, shfit_fsmn), dtype=dtype) + fsmn_padding = np.zeros((max_len_for_x_mask_tmp, shift_fsmn), dtype=dtype) padding_mask_left = np.zeros((max_len_for_x_mask_tmp, pad_left), dtype=dtype) padding_mask_right = np.zeros((max_len_for_x_mask_tmp, pad_right), dtype=dtype) x_mask_cur = np.diag(np.ones(stride, dtype=dtype)) @@ -170,13 +170,13 @@ class overlap_chunk: x_rm_mask = np.concatenate([x_rm_mask, x_rm_mask_cur_fsmn], axis=1) # fsmn_padding_mask - pad_shfit_mask = np.zeros([shfit_fsmn, num_units], dtype=dtype) + pad_shift_mask = np.zeros([shift_fsmn, num_units], dtype=dtype) ones_1 = np.ones([chunk_size, num_units], dtype=dtype) - mask_shfit_chunk_cur = np.concatenate([pad_shfit_mask, ones_1], axis=0) - mask_shfit_chunk = np.concatenate([mask_shfit_chunk, mask_shfit_chunk_cur], axis=0) + mask_shift_chunk_cur = np.concatenate([pad_shift_mask, ones_1], axis=0) + mask_shift_chunk = np.concatenate([mask_shift_chunk, mask_shift_chunk_cur], axis=0) # predictor mask - zeros_1 = np.zeros([shfit_fsmn + pad_left, num_units_predictor], dtype=dtype) + zeros_1 = np.zeros([shift_fsmn + pad_left, num_units_predictor], dtype=dtype) ones_2 = np.ones([stride, num_units_predictor], dtype=dtype) zeros_3 = np.zeros( [chunk_size - stride - pad_left, num_units_predictor], dtype=dtype @@ -188,13 +188,13 @@ class overlap_chunk: ) # encoder att mask - zeros_1_top = np.zeros([shfit_fsmn, chunk_num * chunk_size_pad_shift], dtype=dtype) + zeros_1_top = np.zeros([shift_fsmn, chunk_num * chunk_size_pad_shift], dtype=dtype) zeros_2_num = max(chunk_ids - encoder_att_look_back_factor, 0) zeros_2 = np.zeros([chunk_size, zeros_2_num * chunk_size_pad_shift], dtype=dtype) encoder_att_look_back_num = max(chunk_ids - zeros_2_num, 0) - zeros_2_left = np.zeros([chunk_size, shfit_fsmn], dtype=dtype) + zeros_2_left = np.zeros([chunk_size, shift_fsmn], dtype=dtype) ones_2_mid = np.ones([stride, stride], dtype=dtype) zeros_2_bottom = np.zeros([chunk_size - stride, stride], dtype=dtype) zeros_2_right = np.zeros([chunk_size, chunk_size - stride], dtype=dtype) @@ -202,7 +202,7 @@ class overlap_chunk: ones_2 = np.concatenate([zeros_2_left, ones_2, zeros_2_right], axis=1) ones_2 = np.tile(ones_2, [1, encoder_att_look_back_num]) - zeros_3_left = np.zeros([chunk_size, shfit_fsmn], dtype=dtype) + zeros_3_left = np.zeros([chunk_size, shift_fsmn], dtype=dtype) ones_3_right = np.ones([chunk_size, chunk_size], dtype=dtype) ones_3 = np.concatenate([zeros_3_left, ones_3_right], axis=1) @@ -218,7 +218,7 @@ class overlap_chunk: ) # decoder fsmn_shift_att_mask - zeros_1 = np.zeros([shfit_fsmn, 1]) + zeros_1 = np.zeros([shift_fsmn, 1]) ones_1 = np.ones([chunk_size, 1]) mask_shift_att_chunk_decoder_cur = np.concatenate([zeros_1, ones_1], axis=0) mask_shift_att_chunk_decoder = np.concatenate( @@ -229,7 +229,7 @@ class overlap_chunk: self.x_len_chunk = x_len_chunk self.x_rm_mask = x_rm_mask[:x_len_max, :x_len_chunk_max] self.x_len = x_len - self.mask_shfit_chunk = mask_shfit_chunk[:x_len_chunk_max, :] + self.mask_shift_chunk = mask_shift_chunk[:x_len_chunk_max, :] self.mask_chunk_predictor = mask_chunk_predictor[:x_len_chunk_max, :] self.mask_att_chunk_encoder = mask_att_chunk_encoder[:x_len_chunk_max, :x_len_chunk_max] self.mask_shift_att_chunk_decoder = mask_shift_att_chunk_decoder[:x_len_chunk_max, :] @@ -238,7 +238,7 @@ class overlap_chunk: self.x_len_chunk, self.x_rm_mask, self.x_len, - self.mask_shfit_chunk, + self.mask_shift_chunk, self.mask_chunk_predictor, self.mask_att_chunk_encoder, self.mask_shift_att_chunk_decoder, @@ -309,7 +309,7 @@ class overlap_chunk: x = torch.from_numpy(x).type(dtype).to(device) return x - def get_mask_shfit_chunk( + def get_mask_shift_chunk( self, chunk_outs=None, device="cpu", batch_size=1, num_units=1, idx=4, dtype=torch.float32 ): with torch.no_grad(): diff --git a/funasr/models/scama/decoder.py b/funasr/models/scama/decoder.py index 31b235778..f457b75bf 100644 --- a/funasr/models/scama/decoder.py +++ b/funasr/models/scama/decoder.py @@ -226,7 +226,7 @@ class FsmnDecoderSCAMAOpt(BaseTransformerDecoder): concat_after: bool = False, att_layer_num: int = 6, kernel_size: int = 21, - sanm_shfit: int = None, + sanm_shift: int = None, concat_embeds: bool = False, attention_dim: int = None, tf2torch_tensor_name_prefix_torch: str = "decoder", @@ -271,14 +271,14 @@ class FsmnDecoderSCAMAOpt(BaseTransformerDecoder): self.att_layer_num = att_layer_num self.num_blocks = num_blocks - if sanm_shfit is None: - sanm_shfit = (kernel_size - 1) // 2 + if sanm_shift is None: + sanm_shift = (kernel_size - 1) // 2 self.decoders = repeat( att_layer_num, lambda lnum: DecoderLayerSANM( attention_dim, MultiHeadedAttentionSANMDecoder( - attention_dim, self_attention_dropout_rate, kernel_size, sanm_shfit=sanm_shfit + attention_dim, self_attention_dropout_rate, kernel_size, sanm_shift=sanm_shift ), MultiHeadedAttentionCrossAtt( attention_heads, @@ -303,7 +303,7 @@ class FsmnDecoderSCAMAOpt(BaseTransformerDecoder): attention_dim, self_attention_dropout_rate, kernel_size, - sanm_shfit=sanm_shfit, + sanm_shift=sanm_shift, ), None, PositionwiseFeedForwardDecoderSANM(attention_dim, linear_units, dropout_rate), diff --git a/funasr/models/scama/encoder.py b/funasr/models/scama/encoder.py index e1fe9242e..0c871e107 100644 --- a/funasr/models/scama/encoder.py +++ b/funasr/models/scama/encoder.py @@ -69,7 +69,7 @@ class EncoderLayerSANM(nn.Module): self.stochastic_depth_rate = stochastic_depth_rate self.dropout_rate = dropout_rate - def forward(self, x, mask, cache=None, mask_shfit_chunk=None, mask_att_chunk_encoder=None): + def forward(self, x, mask, cache=None, mask_shift_chunk=None, mask_att_chunk_encoder=None): """Compute encoded features. Args: @@ -106,7 +106,7 @@ class EncoderLayerSANM(nn.Module): self.self_attn( x, mask, - mask_shfit_chunk=mask_shfit_chunk, + mask_shift_chunk=mask_shift_chunk, mask_att_chunk_encoder=mask_att_chunk_encoder, ), ), @@ -122,7 +122,7 @@ class EncoderLayerSANM(nn.Module): self.self_attn( x, mask, - mask_shfit_chunk=mask_shfit_chunk, + mask_shift_chunk=mask_shift_chunk, mask_att_chunk_encoder=mask_att_chunk_encoder, ) ) @@ -131,7 +131,7 @@ class EncoderLayerSANM(nn.Module): self.self_attn( x, mask, - mask_shfit_chunk=mask_shfit_chunk, + mask_shift_chunk=mask_shift_chunk, mask_att_chunk_encoder=mask_att_chunk_encoder, ) ) @@ -145,7 +145,7 @@ class EncoderLayerSANM(nn.Module): if not self.normalize_before: x = self.norm2(x) - return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder + return x, mask, cache, mask_shift_chunk, mask_att_chunk_encoder def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0): """Compute encoded features. @@ -212,7 +212,7 @@ class SANMEncoderChunkOpt(nn.Module): interctc_layer_idx: List[int] = [], interctc_use_conditioning: bool = False, kernel_size: int = 11, - sanm_shfit: int = 0, + sanm_shift: int = 0, selfattention_layer_type: str = "sanm", chunk_size: Union[int, Sequence[int]] = (16,), stride: Union[int, Sequence[int]] = (10,), @@ -299,7 +299,7 @@ class SANMEncoderChunkOpt(nn.Module): output_size, attention_dropout_rate, kernel_size, - sanm_shfit, + sanm_shift, ) encoder_selfattn_layer_args = ( @@ -308,7 +308,7 @@ class SANMEncoderChunkOpt(nn.Module): output_size, attention_dropout_rate, kernel_size, - sanm_shfit, + sanm_shift, ) self.encoders0 = repeat( 1, @@ -343,12 +343,12 @@ class SANMEncoderChunkOpt(nn.Module): assert 0 < min(interctc_layer_idx) and max(interctc_layer_idx) < num_blocks self.interctc_use_conditioning = interctc_use_conditioning self.conditioning_layer = None - shfit_fsmn = (kernel_size - 1) // 2 + shift_fsmn = (kernel_size - 1) // 2 self.overlap_chunk_cls = overlap_chunk( chunk_size=chunk_size, stride=stride, pad_left=pad_left, - shfit_fsmn=shfit_fsmn, + shift_fsmn=shift_fsmn, encoder_att_look_back_factor=encoder_att_look_back_factor, decoder_att_look_back_factor=decoder_att_look_back_factor, ) @@ -397,31 +397,31 @@ class SANMEncoderChunkOpt(nn.Module): else: xs_pad = self.embed(xs_pad) - mask_shfit_chunk, mask_att_chunk_encoder = None, None + mask_shift_chunk, mask_att_chunk_encoder = None, None if self.overlap_chunk_cls is not None: ilens = masks.squeeze(1).sum(1) chunk_outs = self.overlap_chunk_cls.gen_chunk_mask(ilens, ind) xs_pad, ilens = self.overlap_chunk_cls.split_chunk(xs_pad, ilens, chunk_outs=chunk_outs) masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device) - mask_shfit_chunk = self.overlap_chunk_cls.get_mask_shfit_chunk( + mask_shift_chunk = self.overlap_chunk_cls.get_mask_shift_chunk( chunk_outs, xs_pad.device, xs_pad.size(0), dtype=xs_pad.dtype ) mask_att_chunk_encoder = self.overlap_chunk_cls.get_mask_att_chunk_encoder( chunk_outs, xs_pad.device, xs_pad.size(0), dtype=xs_pad.dtype ) - encoder_outs = self.encoders0(xs_pad, masks, None, mask_shfit_chunk, mask_att_chunk_encoder) + encoder_outs = self.encoders0(xs_pad, masks, None, mask_shift_chunk, mask_att_chunk_encoder) xs_pad, masks = encoder_outs[0], encoder_outs[1] intermediate_outs = [] if len(self.interctc_layer_idx) == 0: encoder_outs = self.encoders( - xs_pad, masks, None, mask_shfit_chunk, mask_att_chunk_encoder + xs_pad, masks, None, mask_shift_chunk, mask_att_chunk_encoder ) xs_pad, masks = encoder_outs[0], encoder_outs[1] else: for layer_idx, encoder_layer in enumerate(self.encoders): encoder_outs = encoder_layer( - xs_pad, masks, None, mask_shfit_chunk, mask_att_chunk_encoder + xs_pad, masks, None, mask_shift_chunk, mask_att_chunk_encoder ) xs_pad, masks = encoder_outs[0], encoder_outs[1] if layer_idx + 1 in self.interctc_layer_idx: diff --git a/funasr/models/scama/model.py b/funasr/models/scama/model.py index c15f435a4..4a28435fb 100644 --- a/funasr/models/scama/model.py +++ b/funasr/models/scama/model.py @@ -321,10 +321,10 @@ class SCAMA(nn.Module): mask_chunk_predictor = self.encoder.overlap_chunk_cls.get_mask_chunk_predictor( None, device=encoder_out.device, batch_size=encoder_out.size(0) ) - mask_shfit_chunk = self.encoder.overlap_chunk_cls.get_mask_shfit_chunk( + mask_shift_chunk = self.encoder.overlap_chunk_cls.get_mask_shift_chunk( None, device=encoder_out.device, batch_size=encoder_out.size(0) ) - encoder_out = encoder_out * mask_shfit_chunk + encoder_out = encoder_out * mask_shift_chunk pre_acoustic_embeds, pre_token_length, pre_alphas, _ = self.predictor( encoder_out, ys_out_pad, @@ -415,10 +415,10 @@ class SCAMA(nn.Module): mask_chunk_predictor = self.encoder.overlap_chunk_cls.get_mask_chunk_predictor( None, device=encoder_out.device, batch_size=encoder_out.size(0) ) - mask_shfit_chunk = self.encoder.overlap_chunk_cls.get_mask_shfit_chunk( + mask_shift_chunk = self.encoder.overlap_chunk_cls.get_mask_shift_chunk( None, device=encoder_out.device, batch_size=encoder_out.size(0) ) - encoder_out = encoder_out * mask_shfit_chunk + encoder_out = encoder_out * mask_shift_chunk pre_acoustic_embeds, pre_token_length, pre_alphas, _ = self.predictor( encoder_out, ys_out_pad, diff --git a/funasr/models/scama/template.yaml b/funasr/models/scama/template.yaml index bc2e210b2..8e14cd38e 100644 --- a/funasr/models/scama/template.yaml +++ b/funasr/models/scama/template.yaml @@ -26,7 +26,7 @@ encoder_conf: pos_enc_class: SinusoidalPositionEncoder normalize_before: true kernel_size: 11 - sanm_shfit: 0 + sanm_shift: 0 selfattention_layer_type: sanm # decoder @@ -41,7 +41,7 @@ decoder_conf: src_attention_dropout_rate: 0.1 att_layer_num: 16 kernel_size: 11 - sanm_shfit: 0 + sanm_shift: 0 predictor: CifPredictorV2 predictor_conf: diff --git a/funasr/models/seaco_paraformer/template.yaml b/funasr/models/seaco_paraformer/template.yaml index fcaf5243d..2bf0825eb 100644 --- a/funasr/models/seaco_paraformer/template.yaml +++ b/funasr/models/seaco_paraformer/template.yaml @@ -36,7 +36,7 @@ encoder_conf: pos_enc_class: SinusoidalPositionEncoder normalize_before: true kernel_size: 11 - sanm_shfit: 0 + sanm_shift: 0 selfattention_layer_type: sanm # decoder @@ -51,7 +51,7 @@ decoder_conf: src_attention_dropout_rate: 0.1 att_layer_num: 16 kernel_size: 11 - sanm_shfit: 0 + sanm_shift: 0 # seaco decoder seaco_decoder: ParaformerSANMDecoder @@ -64,7 +64,7 @@ seaco_decoder_conf: self_attention_dropout_rate: 0.1 src_attention_dropout_rate: 0.1 kernel_size: 21 - sanm_shfit: 0 + sanm_shift: 0 use_output_layer: false wo_input_layer: true diff --git a/funasr/models/sense_voice/model.py b/funasr/models/sense_voice/model.py index 9d8ef8421..ca0c40a9d 100644 --- a/funasr/models/sense_voice/model.py +++ b/funasr/models/sense_voice/model.py @@ -95,7 +95,7 @@ class MultiHeadedAttentionSANM(nn.Module): n_feat, dropout_rate, kernel_size, - sanm_shfit=0, + sanm_shift=0, lora_list=None, lora_rank=8, lora_alpha=16, @@ -121,17 +121,17 @@ class MultiHeadedAttentionSANM(nn.Module): ) # padding left_padding = (kernel_size - 1) // 2 - if sanm_shfit > 0: - left_padding = left_padding + sanm_shfit + if sanm_shift > 0: + left_padding = left_padding + sanm_shift right_padding = kernel_size - 1 - left_padding self.pad_fn = nn.ConstantPad1d((left_padding, right_padding), 0.0) - def forward_fsmn(self, inputs, mask, mask_shfit_chunk=None): + def forward_fsmn(self, inputs, mask, mask_shift_chunk=None): b, t, d = inputs.size() if mask is not None: mask = torch.reshape(mask, (b, -1, 1)) - if mask_shfit_chunk is not None: - mask = mask * mask_shfit_chunk + if mask_shift_chunk is not None: + mask = mask * mask_shift_chunk inputs = inputs * mask x = inputs.transpose(1, 2) @@ -211,7 +211,7 @@ class MultiHeadedAttentionSANM(nn.Module): return self.linear_out(x) # (batch, time1, d_model) - def forward(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None): + def forward(self, x, mask, mask_shift_chunk=None, mask_att_chunk_encoder=None): """Compute scaled dot product attention. Args: @@ -226,7 +226,7 @@ class MultiHeadedAttentionSANM(nn.Module): """ q_h, k_h, v_h, v = self.forward_qkv(x) - fsmn_memory = self.forward_fsmn(v, mask, mask_shfit_chunk) + fsmn_memory = self.forward_fsmn(v, mask, mask_shift_chunk) q_h = q_h * self.d_k ** (-0.5) scores = torch.matmul(q_h, k_h.transpose(-2, -1)) att_outs = self.forward_attention(v_h, scores, mask, mask_att_chunk_encoder) @@ -326,7 +326,7 @@ class EncoderLayerSANM(nn.Module): self.stochastic_depth_rate = stochastic_depth_rate self.dropout_rate = dropout_rate - def forward(self, x, mask, cache=None, mask_shfit_chunk=None, mask_att_chunk_encoder=None): + def forward(self, x, mask, cache=None, mask_shift_chunk=None, mask_att_chunk_encoder=None): """Compute encoded features. Args: @@ -363,7 +363,7 @@ class EncoderLayerSANM(nn.Module): self.self_attn( x, mask, - mask_shfit_chunk=mask_shfit_chunk, + mask_shift_chunk=mask_shift_chunk, mask_att_chunk_encoder=mask_att_chunk_encoder, ), ), @@ -379,7 +379,7 @@ class EncoderLayerSANM(nn.Module): self.self_attn( x, mask, - mask_shfit_chunk=mask_shfit_chunk, + mask_shift_chunk=mask_shift_chunk, mask_att_chunk_encoder=mask_att_chunk_encoder, ) ) @@ -388,7 +388,7 @@ class EncoderLayerSANM(nn.Module): self.self_attn( x, mask, - mask_shfit_chunk=mask_shfit_chunk, + mask_shift_chunk=mask_shift_chunk, mask_att_chunk_encoder=mask_att_chunk_encoder, ) ) @@ -402,7 +402,7 @@ class EncoderLayerSANM(nn.Module): if not self.normalize_before: x = self.norm2(x) - return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder + return x, mask, cache, mask_shift_chunk, mask_att_chunk_encoder def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0): """Compute encoded features. @@ -469,7 +469,7 @@ class SenseVoiceEncoderSmall(nn.Module): positionwise_conv_kernel_size: int = 1, padding_idx: int = -1, kernel_size: int = 11, - sanm_shfit: int = 0, + sanm_shift: int = 0, selfattention_layer_type: str = "sanm", **kwargs, ): @@ -494,7 +494,7 @@ class SenseVoiceEncoderSmall(nn.Module): output_size, attention_dropout_rate, kernel_size, - sanm_shfit, + sanm_shift, ) encoder_selfattn_layer_args = ( attention_heads, @@ -502,7 +502,7 @@ class SenseVoiceEncoderSmall(nn.Module): output_size, attention_dropout_rate, kernel_size, - sanm_shfit, + sanm_shift, ) self.encoders0 = nn.ModuleList( diff --git a/funasr/models/sond/encoder/fsmn_encoder.py b/funasr/models/sond/encoder/fsmn_encoder.py index 9ec9912a4..5bccf433b 100644 --- a/funasr/models/sond/encoder/fsmn_encoder.py +++ b/funasr/models/sond/encoder/fsmn_encoder.py @@ -36,12 +36,12 @@ class FsmnBlock(torch.nn.Module): right_padding = kernel_size - 1 - left_padding self.pad_fn = nn.ConstantPad1d((left_padding, right_padding), 0.0) - def forward(self, inputs, mask, mask_shfit_chunk=None): + def forward(self, inputs, mask, mask_shift_chunk=None): b, t, d = inputs.size() if mask is not None: mask = torch.reshape(mask, (b, -1, 1)) - if mask_shfit_chunk is not None: - mask = mask * mask_shfit_chunk + if mask_shift_chunk is not None: + mask = mask * mask_shift_chunk inputs = inputs * mask x = inputs.transpose(1, 2) diff --git a/funasr/models/uniasr/model.py b/funasr/models/uniasr/model.py index bde637777..002dcdda2 100644 --- a/funasr/models/uniasr/model.py +++ b/funasr/models/uniasr/model.py @@ -521,10 +521,10 @@ class UniASR(torch.nn.Module): mask_chunk_predictor = self.encoder.overlap_chunk_cls.get_mask_chunk_predictor( None, device=encoder_out.device, batch_size=encoder_out.size(0) ) - mask_shfit_chunk = self.encoder.overlap_chunk_cls.get_mask_shfit_chunk( + mask_shift_chunk = self.encoder.overlap_chunk_cls.get_mask_shift_chunk( None, device=encoder_out.device, batch_size=encoder_out.size(0) ) - encoder_out = encoder_out * mask_shfit_chunk + encoder_out = encoder_out * mask_shift_chunk pre_acoustic_embeds, pre_token_length, pre_alphas, _ = self.predictor( encoder_out, ys_out_pad, @@ -622,10 +622,10 @@ class UniASR(torch.nn.Module): mask_chunk_predictor = self.encoder2.overlap_chunk_cls.get_mask_chunk_predictor( None, device=encoder_out.device, batch_size=encoder_out.size(0) ) - mask_shfit_chunk = self.encoder2.overlap_chunk_cls.get_mask_shfit_chunk( + mask_shift_chunk = self.encoder2.overlap_chunk_cls.get_mask_shift_chunk( None, device=encoder_out.device, batch_size=encoder_out.size(0) ) - encoder_out = encoder_out * mask_shfit_chunk + encoder_out = encoder_out * mask_shift_chunk pre_acoustic_embeds, pre_token_length, pre_alphas, _ = self.predictor2( encoder_out, ys_out_pad, @@ -724,10 +724,10 @@ class UniASR(torch.nn.Module): mask_chunk_predictor = self.encoder.overlap_chunk_cls.get_mask_chunk_predictor( None, device=encoder_out.device, batch_size=encoder_out.size(0) ) - mask_shfit_chunk = self.encoder.overlap_chunk_cls.get_mask_shfit_chunk( + mask_shift_chunk = self.encoder.overlap_chunk_cls.get_mask_shift_chunk( None, device=encoder_out.device, batch_size=encoder_out.size(0) ) - encoder_out = encoder_out * mask_shfit_chunk + encoder_out = encoder_out * mask_shift_chunk pre_acoustic_embeds, pre_token_length, pre_alphas, _ = self.predictor( encoder_out, ys_out_pad, @@ -806,10 +806,10 @@ class UniASR(torch.nn.Module): mask_chunk_predictor = self.encoder2.overlap_chunk_cls.get_mask_chunk_predictor( None, device=encoder_out.device, batch_size=encoder_out.size(0) ) - mask_shfit_chunk = self.encoder2.overlap_chunk_cls.get_mask_shfit_chunk( + mask_shift_chunk = self.encoder2.overlap_chunk_cls.get_mask_shift_chunk( None, device=encoder_out.device, batch_size=encoder_out.size(0) ) - encoder_out = encoder_out * mask_shfit_chunk + encoder_out = encoder_out * mask_shift_chunk pre_acoustic_embeds, pre_token_length, pre_alphas, _ = self.predictor2( encoder_out, ys_out_pad, diff --git a/funasr/models/uniasr/template.yaml b/funasr/models/uniasr/template.yaml index 43d55fc26..c4a4deb4e 100644 --- a/funasr/models/uniasr/template.yaml +++ b/funasr/models/uniasr/template.yaml @@ -33,7 +33,7 @@ encoder_conf: pos_enc_class: SinusoidalPositionEncoder normalize_before: true kernel_size: 11 - sanm_shfit: 0 + sanm_shift: 0 selfattention_layer_type: sanm chunk_size: - 20 @@ -89,7 +89,7 @@ encoder2_conf: pos_enc_class: SinusoidalPositionEncoder normalize_before: true kernel_size: 21 - sanm_shfit: 0 + sanm_shift: 0 selfattention_layer_type: sanm chunk_size: - 45 diff --git a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/DecoderConfEntity.cs b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/DecoderConfEntity.cs index d5c00573b..97b1d906b 100644 --- a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/DecoderConfEntity.cs +++ b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/DecoderConfEntity.cs @@ -19,7 +19,7 @@ namespace AliParaformerAsr.Model private float _src_attention_dropout_rate = 0.1F; private int _att_layer_num = 16; private int _kernel_size = 11; - private int _sanm_shfit = 0; + private int _sanm_shift = 0; public int attention_heads { get => _attention_heads; set => _attention_heads = value; } public int linear_units { get => _linear_units; set => _linear_units = value; } @@ -30,7 +30,7 @@ namespace AliParaformerAsr.Model public float src_attention_dropout_rate { get => _src_attention_dropout_rate; set => _src_attention_dropout_rate = value; } public int att_layer_num { get => _att_layer_num; set => _att_layer_num = value; } public int kernel_size { get => _kernel_size; set => _kernel_size = value; } - public int sanm_shfit { get => _sanm_shfit; set => _sanm_shfit = value; } - + public int sanm_shift { get => _sanm_shift; set => _sanm_shift = value; } + } } diff --git a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/EncoderConfEntity.cs b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/EncoderConfEntity.cs index ffe505e94..5d01266da 100644 --- a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/EncoderConfEntity.cs +++ b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/EncoderConfEntity.cs @@ -21,7 +21,7 @@ namespace AliParaformerAsr.Model private string _pos_enc_class = "SinusoidalPositionEncoder"; private bool _normalize_before = true; private int _kernel_size = 11; - private int _sanm_shfit = 0; + private int _sanm_shift = 0; private string _selfattention_layer_type = "sanm"; public int output_size { get => _output_size; set => _output_size = value; } @@ -35,7 +35,7 @@ namespace AliParaformerAsr.Model public string pos_enc_class { get => _pos_enc_class; set => _pos_enc_class = value; } public bool normalize_before { get => _normalize_before; set => _normalize_before = value; } public int kernel_size { get => _kernel_size; set => _kernel_size = value; } - public int sanm_shfit { get => _sanm_shfit; set => _sanm_shfit = value; } + public int sanm_shift { get => _sanm_shift; set => _sanm_shift = value; } public string selfattention_layer_type { get => _selfattention_layer_type; set => _selfattention_layer_type = value; } } } diff --git a/runtime/triton_gpu/model_repo_paraformer_large_online/feature_extractor/config.yaml b/runtime/triton_gpu/model_repo_paraformer_large_online/feature_extractor/config.yaml index 9b2266f06..cf5750441 100755 --- a/runtime/triton_gpu/model_repo_paraformer_large_online/feature_extractor/config.yaml +++ b/runtime/triton_gpu/model_repo_paraformer_large_online/feature_extractor/config.yaml @@ -8593,7 +8593,7 @@ encoder_conf: pos_enc_class: SinusoidalPositionEncoder normalize_before: true kernel_size: 11 - sanm_shfit: 0 + sanm_shift: 0 selfattention_layer_type: sanm chunk_size: - 12 @@ -8623,7 +8623,7 @@ decoder_conf: src_attention_dropout_rate: 0.1 att_layer_num: 16 kernel_size: 11 - sanm_shfit: 5 + sanm_shift: 5 predictor: cif_predictor_v2 predictor_conf: idim: 512 diff --git a/runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/config.yaml b/runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/config.yaml index 26bb9d3d2..a66f1ca45 100644 --- a/runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/config.yaml +++ b/runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/config.yaml @@ -12,7 +12,7 @@ encoder_conf: pos_enc_class: SinusoidalPositionEncoder normalize_before: true kernel_size: 11 - sanm_shfit: 0 + sanm_shift: 0 selfattention_layer_type: sanm