diff --git a/examples/industrial_data_pretraining/bicif_paraformer/demo.py b/examples/industrial_data_pretraining/bicif_paraformer/demo.py index 4d921ea46..57edb685e 100644 --- a/examples/industrial_data_pretraining/bicif_paraformer/demo.py +++ b/examples/industrial_data_pretraining/bicif_paraformer/demo.py @@ -8,7 +8,7 @@ from funasr import AutoModel model = AutoModel(model="damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revision="v2.0.0", vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", - vad_model_revision="v2.0.1", + vad_model_revision="v2.0.2", punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", punc_model_revision="v2.0.1", spk_model="/Users/shixian/code/modelscope_models/speech_campplus_sv_zh-cn_16k-common", @@ -21,7 +21,7 @@ print(res) model = AutoModel(model="damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revision="v2.0.0", vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", - vad_model_revision="v2.0.1", + vad_model_revision="v2.0.2", punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", punc_model_revision="v2.0.1", spk_model="/Users/shixian/code/modelscope_models/speech_campplus_sv_zh-cn_16k-common", diff --git a/examples/industrial_data_pretraining/emotion2vec/demo.py b/examples/industrial_data_pretraining/emotion2vec/demo.py index 365331340..abaa9f40e 100644 --- a/examples/industrial_data_pretraining/emotion2vec/demo.py +++ b/examples/industrial_data_pretraining/emotion2vec/demo.py @@ -5,7 +5,7 @@ from funasr import AutoModel -model = AutoModel(model="damo/emotion2vec_base", model_revision="v2.0.0") +model = AutoModel(model="damo/emotion2vec_base", model_revision="v2.0.1") res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", output_dir="./outputs") print(res) \ No newline at end of file diff --git a/examples/industrial_data_pretraining/fsmn_vad_streaming/demo.py b/examples/industrial_data_pretraining/fsmn_vad_streaming/demo.py index 4e3cb7092..459dfff41 100644 --- a/examples/industrial_data_pretraining/fsmn_vad_streaming/demo.py +++ b/examples/industrial_data_pretraining/fsmn_vad_streaming/demo.py @@ -7,7 +7,7 @@ from funasr import AutoModel wav_file = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav" chunk_size = 60000 # ms -model = AutoModel(model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", model_revision="v2.0.1") +model = AutoModel(model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", model_revision="v2.0.2") res = model(input=wav_file, chunk_size=chunk_size, ) print(res) diff --git a/examples/industrial_data_pretraining/fsmn_vad_streaming/infer.sh b/examples/industrial_data_pretraining/fsmn_vad_streaming/infer.sh index 08ef8bd7d..815c52a28 100644 --- a/examples/industrial_data_pretraining/fsmn_vad_streaming/infer.sh +++ b/examples/industrial_data_pretraining/fsmn_vad_streaming/infer.sh @@ -1,7 +1,7 @@ model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" -model_revision="v2.0.1" +model_revision="v2.0.2" python funasr/bin/inference.py \ +model=${model} \ diff --git a/examples/industrial_data_pretraining/paraformer-zh-spk/demo.py b/examples/industrial_data_pretraining/paraformer-zh-spk/demo.py index fcf5f60ca..fc3a63552 100644 --- a/examples/industrial_data_pretraining/paraformer-zh-spk/demo.py +++ b/examples/industrial_data_pretraining/paraformer-zh-spk/demo.py @@ -8,7 +8,7 @@ from funasr import AutoModel model = 
AutoModel(model="damo/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revision="v2.0.0", vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", - vad_model_revision="v2.0.1", + vad_model_revision="v2.0.2", punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", punc_model_revision="v2.0.1", spk_model="damo/speech_campplus_sv_zh-cn_16k-common", diff --git a/examples/industrial_data_pretraining/paraformer-zh-spk/infer.sh b/examples/industrial_data_pretraining/paraformer-zh-spk/infer.sh index 63347b6a1..f3fa90d20 100644 --- a/examples/industrial_data_pretraining/paraformer-zh-spk/infer.sh +++ b/examples/industrial_data_pretraining/paraformer-zh-spk/infer.sh @@ -2,7 +2,7 @@ model="damo/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" model_revision="v2.0.0" vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" -vad_model_revision="v2.0.1" +vad_model_revision="v2.0.2" punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" punc_model_revision="v2.0.1" spk_model="damo/speech_campplus_sv_zh-cn_16k-common" diff --git a/examples/industrial_data_pretraining/seaco_paraformer/demo.py b/examples/industrial_data_pretraining/seaco_paraformer/demo.py index 3b5963a4b..7f1fdb5db 100644 --- a/examples/industrial_data_pretraining/seaco_paraformer/demo.py +++ b/examples/industrial_data_pretraining/seaco_paraformer/demo.py @@ -8,7 +8,7 @@ from funasr import AutoModel model = AutoModel(model="damo/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revision="v2.0.0", vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", - vad_model_revision="v2.0.1", + vad_model_revision="v2.0.2", punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", punc_model_revision="v2.0.1", ) diff --git a/examples/industrial_data_pretraining/seaco_paraformer/infer.sh b/examples/industrial_data_pretraining/seaco_paraformer/infer.sh index c46449f74..ac5c1903d 100644 --- a/examples/industrial_data_pretraining/seaco_paraformer/infer.sh +++ b/examples/industrial_data_pretraining/seaco_paraformer/infer.sh @@ -2,7 +2,7 @@ model="damo/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" model_revision="v2.0.0" vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" -vad_model_revision="v2.0.1" +vad_model_revision="v2.0.2" punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" punc_model_revision="v2.0.1" diff --git a/funasr/download/download_from_hub.py b/funasr/download/download_from_hub.py index 27bd79d81..9779050dd 100644 --- a/funasr/download/download_from_hub.py +++ b/funasr/download/download_from_hub.py @@ -22,8 +22,8 @@ def download_from_ms(**kwargs): config = os.path.join(model_or_path, "config.yaml") if os.path.exists(config) and os.path.exists(os.path.join(model_or_path, "model.pb")): - cfg = OmegaConf.load(config) - kwargs = OmegaConf.merge(cfg, kwargs) + config = OmegaConf.load(config) + kwargs = OmegaConf.merge(config, kwargs) init_param = os.path.join(model_or_path, "model.pb") kwargs["init_param"] = init_param if os.path.exists(os.path.join(model_or_path, "tokens.txt")): @@ -34,7 +34,7 @@ def download_from_ms(**kwargs): kwargs["tokenizer_conf"]["seg_dict"] = os.path.join(model_or_path, "seg_dict") if os.path.exists(os.path.join(model_or_path, "bpe.model")): kwargs["tokenizer_conf"]["bpemodel"] = os.path.join(model_or_path, "bpe.model") - kwargs["model"] = cfg["model"] + kwargs["model"] = config["model"] if os.path.exists(os.path.join(model_or_path, 
"am.mvn")): kwargs["frontend_conf"]["cmvn_file"] = os.path.join(model_or_path, "am.mvn") if os.path.exists(os.path.join(model_or_path, "jieba_usr_dict")): @@ -43,14 +43,30 @@ def download_from_ms(**kwargs): assert os.path.exists(os.path.join(model_or_path, "configuration.json")) with open(os.path.join(model_or_path, "configuration.json"), 'r', encoding='utf-8') as f: conf_json = json.load(f) - config = os.path.join(model_or_path, conf_json["model_config"]) - cfg = OmegaConf.load(config) - kwargs = OmegaConf.merge(cfg, kwargs) - init_param = os.path.join(model_or_path, conf_json["model_file"]) - kwargs["init_param"] = init_param - kwargs["model"] = cfg["model"] + cfg = {} + add_file_root_path(model_or_path, conf_json["file_path_metas"], cfg) + cfg.update(kwargs) + config = OmegaConf.load(cfg["config"]) + kwargs = OmegaConf.merge(config, cfg) + kwargs["model"] = config["model"] return OmegaConf.to_container(kwargs, resolve=True) +def add_file_root_path(model_or_path: str, file_path_metas: dict, cfg = {}): + + if isinstance(file_path_metas, dict): + for k, v in file_path_metas.items(): + if isinstance(v, str): + p = os.path.join(model_or_path, v) + if os.path.exists(p): + cfg[k] = p + elif isinstance(v, dict): + if k not in cfg: + cfg[k] = {} + return add_file_root_path(model_or_path, v, cfg[k]) + + return cfg + + def get_or_download_model_dir( model, model_revision=None, diff --git a/funasr/models/ct_transformer_streaming/__init__.py b/funasr/models/ct_transformer_streaming/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/funasr/models/ct_transformer_streaming/attention.py b/funasr/models/ct_transformer_streaming/attention.py new file mode 100644 index 000000000..a35ddee57 --- /dev/null +++ b/funasr/models/ct_transformer_streaming/attention.py @@ -0,0 +1,1091 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2019 Shigeki Karita +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +"""Multi-Head Attention layer definition.""" + +import math + +import numpy +import torch +from torch import nn +from typing import Optional, Tuple + +import torch.nn.functional as F +from funasr.models.transformer.utils.nets_utils import make_pad_mask +import funasr.models.lora.layers as lora + +class MultiHeadedAttention(nn.Module): + """Multi-Head Attention layer. + + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + + """ + + def __init__(self, n_head, n_feat, dropout_rate): + """Construct an MultiHeadedAttention object.""" + super(MultiHeadedAttention, self).__init__() + assert n_feat % n_head == 0 + # We assume d_v always equals d_k + self.d_k = n_feat // n_head + self.h = n_head + self.linear_q = nn.Linear(n_feat, n_feat) + self.linear_k = nn.Linear(n_feat, n_feat) + self.linear_v = nn.Linear(n_feat, n_feat) + self.linear_out = nn.Linear(n_feat, n_feat) + self.attn = None + self.dropout = nn.Dropout(p=dropout_rate) + + def forward_qkv(self, query, key, value): + """Transform query, key and value. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + + Returns: + torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k). + torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k). + torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k). 
+ + """ + n_batch = query.size(0) + q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) + k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) + v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) + q = q.transpose(1, 2) # (batch, head, time1, d_k) + k = k.transpose(1, 2) # (batch, head, time2, d_k) + v = v.transpose(1, 2) # (batch, head, time2, d_k) + + return q, k, v + + def forward_attention(self, value, scores, mask): + """Compute attention context vector. + + Args: + value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k). + scores (torch.Tensor): Attention score (#batch, n_head, time1, time2). + mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2). + + Returns: + torch.Tensor: Transformed value (#batch, time1, d_model) + weighted by the attention score (#batch, time1, time2). + + """ + n_batch = value.size(0) + if mask is not None: + mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) + min_value = float( + numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min + ) + scores = scores.masked_fill(mask, min_value) + self.attn = torch.softmax(scores, dim=-1).masked_fill( + mask, 0.0 + ) # (batch, head, time1, time2) + else: + self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) + + p_attn = self.dropout(self.attn) + x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) + x = ( + x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k) + ) # (batch, time1, d_model) + + return self.linear_out(x) # (batch, time1, d_model) + + def forward(self, query, key, value, mask): + """Compute scaled dot product attention. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). + + """ + q, k, v = self.forward_qkv(query, key, value) + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) + return self.forward_attention(v, scores, mask) + + +class LegacyRelPositionMultiHeadedAttention(MultiHeadedAttention): + """Multi-Head Attention layer with relative position encoding (old version). + + Details can be found in https://github.com/espnet/espnet/pull/2816. + + Paper: https://arxiv.org/abs/1901.02860 + + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + zero_triu (bool): Whether to zero the upper triangular part of attention matrix. + + """ + + def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False): + """Construct an RelPositionMultiHeadedAttention object.""" + super().__init__(n_head, n_feat, dropout_rate) + self.zero_triu = zero_triu + # linear transformation for positional encoding + self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) + # these two learnable bias are used in matrix c and matrix d + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) + self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) + torch.nn.init.xavier_uniform_(self.pos_bias_u) + torch.nn.init.xavier_uniform_(self.pos_bias_v) + + def rel_shift(self, x): + """Compute relative positional encoding. + + Args: + x (torch.Tensor): Input tensor (batch, head, time1, time2). + + Returns: + torch.Tensor: Output tensor. 
+ + """ + zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=-1) + + x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2)) + x = x_padded[:, :, 1:].view_as(x) + + if self.zero_triu: + ones = torch.ones((x.size(2), x.size(3))) + x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] + + return x + + def forward(self, query, key, value, pos_emb, mask): + """Compute 'Scaled Dot Product Attention' with rel. positional encoding. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + pos_emb (torch.Tensor): Positional embedding tensor (#batch, time1, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). + + """ + q, k, v = self.forward_qkv(query, key, value) + q = q.transpose(1, 2) # (batch, time1, head, d_k) + + n_batch_pos = pos_emb.size(0) + p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) + p = p.transpose(1, 2) # (batch, head, time1, d_k) + + # (batch, head, time1, d_k) + q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) + # (batch, head, time1, d_k) + q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) + + # compute attention score + # first compute matrix a and matrix c + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + # (batch, head, time1, time2) + matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) + + # compute matrix b and matrix d + # (batch, head, time1, time1) + matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) + matrix_bd = self.rel_shift(matrix_bd) + + scores = (matrix_ac + matrix_bd) / math.sqrt( + self.d_k + ) # (batch, head, time1, time2) + + return self.forward_attention(v, scores, mask) + + +class RelPositionMultiHeadedAttention(MultiHeadedAttention): + """Multi-Head Attention layer with relative position encoding (new implementation). + + Details can be found in https://github.com/espnet/espnet/pull/2816. + + Paper: https://arxiv.org/abs/1901.02860 + + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + zero_triu (bool): Whether to zero the upper triangular part of attention matrix. + + """ + + def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False): + """Construct an RelPositionMultiHeadedAttention object.""" + super().__init__(n_head, n_feat, dropout_rate) + self.zero_triu = zero_triu + # linear transformation for positional encoding + self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) + # these two learnable bias are used in matrix c and matrix d + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) + self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) + torch.nn.init.xavier_uniform_(self.pos_bias_u) + torch.nn.init.xavier_uniform_(self.pos_bias_v) + + def rel_shift(self, x): + """Compute relative positional encoding. + + Args: + x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1). + time1 means the length of query vector. + + Returns: + torch.Tensor: Output tensor. 
+ + """ + zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=-1) + + x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2)) + x = x_padded[:, :, 1:].view_as(x)[ + :, :, :, : x.size(-1) // 2 + 1 + ] # only keep the positions from 0 to time2 + + if self.zero_triu: + ones = torch.ones((x.size(2), x.size(3)), device=x.device) + x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] + + return x + + def forward(self, query, key, value, pos_emb, mask): + """Compute 'Scaled Dot Product Attention' with rel. positional encoding. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + pos_emb (torch.Tensor): Positional embedding tensor + (#batch, 2*time1-1, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). + + """ + q, k, v = self.forward_qkv(query, key, value) + q = q.transpose(1, 2) # (batch, time1, head, d_k) + + n_batch_pos = pos_emb.size(0) + p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) + p = p.transpose(1, 2) # (batch, head, 2*time1-1, d_k) + + # (batch, head, time1, d_k) + q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) + # (batch, head, time1, d_k) + q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) + + # compute attention score + # first compute matrix a and matrix c + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + # (batch, head, time1, time2) + matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) + + # compute matrix b and matrix d + # (batch, head, time1, 2*time1-1) + matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) + matrix_bd = self.rel_shift(matrix_bd) + + scores = (matrix_ac + matrix_bd) / math.sqrt( + self.d_k + ) # (batch, head, time1, time2) + + return self.forward_attention(v, scores, mask) + + +class MultiHeadedAttentionSANM(nn.Module): + """Multi-Head Attention layer. + + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. 
+ + """ + + def __init__(self, n_head, in_feat, n_feat, dropout_rate, kernel_size, sanm_shfit=0, lora_list=None, lora_rank=8, lora_alpha=16, lora_dropout=0.1): + """Construct an MultiHeadedAttention object.""" + super(MultiHeadedAttentionSANM, self).__init__() + assert n_feat % n_head == 0 + # We assume d_v always equals d_k + self.d_k = n_feat // n_head + self.h = n_head + # self.linear_q = nn.Linear(n_feat, n_feat) + # self.linear_k = nn.Linear(n_feat, n_feat) + # self.linear_v = nn.Linear(n_feat, n_feat) + if lora_list is not None: + if "o" in lora_list: + self.linear_out = lora.Linear(n_feat, n_feat, r=lora_rank, lora_alpha=lora_alpha, lora_dropout=lora_dropout) + else: + self.linear_out = nn.Linear(n_feat, n_feat) + lora_qkv_list = ["q" in lora_list, "k" in lora_list, "v" in lora_list] + if lora_qkv_list == [False, False, False]: + self.linear_q_k_v = nn.Linear(in_feat, n_feat * 3) + else: + self.linear_q_k_v = lora.MergedLinear(in_feat, n_feat * 3, r=lora_rank, lora_alpha=lora_alpha, lora_dropout=lora_dropout, enable_lora=lora_qkv_list) + else: + self.linear_out = nn.Linear(n_feat, n_feat) + self.linear_q_k_v = nn.Linear(in_feat, n_feat * 3) + self.attn = None + self.dropout = nn.Dropout(p=dropout_rate) + + self.fsmn_block = nn.Conv1d(n_feat, n_feat, kernel_size, stride=1, padding=0, groups=n_feat, bias=False) + # padding + left_padding = (kernel_size - 1) // 2 + if sanm_shfit > 0: + left_padding = left_padding + sanm_shfit + right_padding = kernel_size - 1 - left_padding + self.pad_fn = nn.ConstantPad1d((left_padding, right_padding), 0.0) + + def forward_fsmn(self, inputs, mask, mask_shfit_chunk=None): + b, t, d = inputs.size() + if mask is not None: + mask = torch.reshape(mask, (b, -1, 1)) + if mask_shfit_chunk is not None: + mask = mask * mask_shfit_chunk + inputs = inputs * mask + + x = inputs.transpose(1, 2) + x = self.pad_fn(x) + x = self.fsmn_block(x) + x = x.transpose(1, 2) + x += inputs + x = self.dropout(x) + if mask is not None: + x = x * mask + return x + + def forward_qkv(self, x): + """Transform query, key and value. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + + Returns: + torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k). + torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k). + torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k). + + """ + b, t, d = x.size() + q_k_v = self.linear_q_k_v(x) + q, k, v = torch.split(q_k_v, int(self.h * self.d_k), dim=-1) + q_h = torch.reshape(q, (b, t, self.h, self.d_k)).transpose(1, 2) # (batch, head, time1, d_k) + k_h = torch.reshape(k, (b, t, self.h, self.d_k)).transpose(1, 2) # (batch, head, time2, d_k) + v_h = torch.reshape(v, (b, t, self.h, self.d_k)).transpose(1, 2) # (batch, head, time2, d_k) + + return q_h, k_h, v_h, v + + def forward_attention(self, value, scores, mask, mask_att_chunk_encoder=None): + """Compute attention context vector. + + Args: + value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k). + scores (torch.Tensor): Attention score (#batch, n_head, time1, time2). + mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2). + + Returns: + torch.Tensor: Transformed value (#batch, time1, d_model) + weighted by the attention score (#batch, time1, time2). 
+ + """ + n_batch = value.size(0) + if mask is not None: + if mask_att_chunk_encoder is not None: + mask = mask * mask_att_chunk_encoder + + mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) + + min_value = float( + numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min + ) + scores = scores.masked_fill(mask, min_value) + self.attn = torch.softmax(scores, dim=-1).masked_fill( + mask, 0.0 + ) # (batch, head, time1, time2) + else: + self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) + + p_attn = self.dropout(self.attn) + x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) + x = ( + x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k) + ) # (batch, time1, d_model) + + return self.linear_out(x) # (batch, time1, d_model) + + def forward(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None): + """Compute scaled dot product attention. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). + + """ + q_h, k_h, v_h, v = self.forward_qkv(x) + fsmn_memory = self.forward_fsmn(v, mask, mask_shfit_chunk) + q_h = q_h * self.d_k ** (-0.5) + scores = torch.matmul(q_h, k_h.transpose(-2, -1)) + att_outs = self.forward_attention(v_h, scores, mask, mask_att_chunk_encoder) + return att_outs + fsmn_memory + + def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0): + """Compute scaled dot product attention. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). 
+ + """ + q_h, k_h, v_h, v = self.forward_qkv(x) + if chunk_size is not None and look_back > 0 or look_back == -1: + if cache is not None: + k_h_stride = k_h[:, :, :-(chunk_size[2]), :] + v_h_stride = v_h[:, :, :-(chunk_size[2]), :] + k_h = torch.cat((cache["k"], k_h), dim=2) + v_h = torch.cat((cache["v"], v_h), dim=2) + + cache["k"] = torch.cat((cache["k"], k_h_stride), dim=2) + cache["v"] = torch.cat((cache["v"], v_h_stride), dim=2) + if look_back != -1: + cache["k"] = cache["k"][:, :, -(look_back * chunk_size[1]):, :] + cache["v"] = cache["v"][:, :, -(look_back * chunk_size[1]):, :] + else: + cache_tmp = {"k": k_h[:, :, :-(chunk_size[2]), :], + "v": v_h[:, :, :-(chunk_size[2]), :]} + cache = cache_tmp + fsmn_memory = self.forward_fsmn(v, None) + q_h = q_h * self.d_k ** (-0.5) + scores = torch.matmul(q_h, k_h.transpose(-2, -1)) + att_outs = self.forward_attention(v_h, scores, None) + return att_outs + fsmn_memory, cache + + +class MultiHeadedAttentionSANMwithMask(MultiHeadedAttentionSANM): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def forward(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None): + q_h, k_h, v_h, v = self.forward_qkv(x) + fsmn_memory = self.forward_fsmn(v, mask[0], mask_shfit_chunk) + q_h = q_h * self.d_k ** (-0.5) + scores = torch.matmul(q_h, k_h.transpose(-2, -1)) + att_outs = self.forward_attention(v_h, scores, mask[1], mask_att_chunk_encoder) + return att_outs + fsmn_memory + +class MultiHeadedAttentionSANMDecoder(nn.Module): + """Multi-Head Attention layer. + + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + + """ + + def __init__(self, n_feat, dropout_rate, kernel_size, sanm_shfit=0): + """Construct an MultiHeadedAttention object.""" + super(MultiHeadedAttentionSANMDecoder, self).__init__() + + self.dropout = nn.Dropout(p=dropout_rate) + + self.fsmn_block = nn.Conv1d(n_feat, n_feat, + kernel_size, stride=1, padding=0, groups=n_feat, bias=False) + # padding + # padding + left_padding = (kernel_size - 1) // 2 + if sanm_shfit > 0: + left_padding = left_padding + sanm_shfit + right_padding = kernel_size - 1 - left_padding + self.pad_fn = nn.ConstantPad1d((left_padding, right_padding), 0.0) + self.kernel_size = kernel_size + + def forward(self, inputs, mask, cache=None, mask_shfit_chunk=None): + ''' + :param x: (#batch, time1, size). 
+ :param mask: Mask tensor (#batch, 1, time) + :return: + ''' + # print("in fsmn, inputs", inputs.size()) + b, t, d = inputs.size() + # logging.info( + # "mask: {}".format(mask.size())) + if mask is not None: + mask = torch.reshape(mask, (b ,-1, 1)) + # logging.info("in fsmn, mask: {}, {}".format(mask.size(), mask[0:100:50, :, :])) + if mask_shfit_chunk is not None: + # logging.info("in fsmn, mask_fsmn: {}, {}".format(mask_shfit_chunk.size(), mask_shfit_chunk[0:100:50, :, :])) + mask = mask * mask_shfit_chunk + # logging.info("in fsmn, mask_after_fsmn: {}, {}".format(mask.size(), mask[0:100:50, :, :])) + # print("in fsmn, mask", mask.size()) + # print("in fsmn, inputs", inputs.size()) + inputs = inputs * mask + + x = inputs.transpose(1, 2) + b, d, t = x.size() + if cache is None: + # print("in fsmn, cache is None, x", x.size()) + + x = self.pad_fn(x) + if not self.training: + cache = x + else: + # print("in fsmn, cache is not None, x", x.size()) + # x = torch.cat((x, cache), dim=2)[:, :, :-1] + # if t < self.kernel_size: + # x = self.pad_fn(x) + x = torch.cat((cache[:, :, 1:], x), dim=2) + x = x[:, :, -(self.kernel_size+t-1):] + # print("in fsmn, cache is not None, x_cat", x.size()) + cache = x + x = self.fsmn_block(x) + x = x.transpose(1, 2) + # print("in fsmn, fsmn_out", x.size()) + if x.size(1) != inputs.size(1): + inputs = inputs[:, -1, :] + + x = x + inputs + x = self.dropout(x) + if mask is not None: + x = x * mask + return x, cache + +class MultiHeadedAttentionCrossAtt(nn.Module): + """Multi-Head Attention layer. + + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + + """ + + def __init__(self, n_head, n_feat, dropout_rate, lora_list=None, lora_rank=8, lora_alpha=16, lora_dropout=0.1, encoder_output_size=None): + """Construct an MultiHeadedAttention object.""" + super(MultiHeadedAttentionCrossAtt, self).__init__() + assert n_feat % n_head == 0 + # We assume d_v always equals d_k + self.d_k = n_feat // n_head + self.h = n_head + if lora_list is not None: + if "q" in lora_list: + self.linear_q = lora.Linear(n_feat, n_feat, r=lora_rank, lora_alpha=lora_alpha, lora_dropout=lora_dropout) + else: + self.linear_q = nn.Linear(n_feat, n_feat) + lora_kv_list = ["k" in lora_list, "v" in lora_list] + if lora_kv_list == [False, False]: + self.linear_k_v = nn.Linear(n_feat if encoder_output_size is None else encoder_output_size, n_feat*2) + else: + self.linear_k_v = lora.MergedLinear(n_feat if encoder_output_size is None else encoder_output_size, n_feat * 2, + r=lora_rank, lora_alpha=lora_alpha, lora_dropout=lora_dropout, enable_lora=lora_kv_list) + if "o" in lora_list: + self.linear_out = lora.Linear(n_feat, n_feat, r=lora_rank, lora_alpha=lora_alpha, lora_dropout=lora_dropout) + else: + self.linear_out = nn.Linear(n_feat, n_feat) + else: + self.linear_q = nn.Linear(n_feat, n_feat) + self.linear_k_v = nn.Linear(n_feat if encoder_output_size is None else encoder_output_size, n_feat*2) + self.linear_out = nn.Linear(n_feat, n_feat) + self.attn = None + self.dropout = nn.Dropout(p=dropout_rate) + + def forward_qkv(self, x, memory): + """Transform query, key and value. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + + Returns: + torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k). + torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k). 
+ torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k). + + """ + + # print("in forward_qkv, x", x.size()) + b = x.size(0) + q = self.linear_q(x) + q_h = torch.reshape(q, (b, -1, self.h, self.d_k)).transpose(1, 2) # (batch, head, time1, d_k) + + k_v = self.linear_k_v(memory) + k, v = torch.split(k_v, int(self.h*self.d_k), dim=-1) + k_h = torch.reshape(k, (b, -1, self.h, self.d_k)).transpose(1, 2) # (batch, head, time2, d_k) + v_h = torch.reshape(v, (b, -1, self.h, self.d_k)).transpose(1, 2) # (batch, head, time2, d_k) + + + return q_h, k_h, v_h + + def forward_attention(self, value, scores, mask): + """Compute attention context vector. + + Args: + value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k). + scores (torch.Tensor): Attention score (#batch, n_head, time1, time2). + mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2). + + Returns: + torch.Tensor: Transformed value (#batch, time1, d_model) + weighted by the attention score (#batch, time1, time2). + + """ + n_batch = value.size(0) + if mask is not None: + mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) + min_value = float( + numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min + ) + # logging.info( + # "scores: {}, mask_size: {}".format(scores.size(), mask.size())) + scores = scores.masked_fill(mask, min_value) + self.attn = torch.softmax(scores, dim=-1).masked_fill( + mask, 0.0 + ) # (batch, head, time1, time2) + else: + self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) + + p_attn = self.dropout(self.attn) + x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) + x = ( + x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k) + ) # (batch, time1, d_model) + + return self.linear_out(x) # (batch, time1, d_model) + + def forward(self, x, memory, memory_mask): + """Compute scaled dot product attention. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). + + """ + q_h, k_h, v_h = self.forward_qkv(x, memory) + q_h = q_h * self.d_k ** (-0.5) + scores = torch.matmul(q_h, k_h.transpose(-2, -1)) + return self.forward_attention(v_h, scores, memory_mask) + + def forward_chunk(self, x, memory, cache=None, chunk_size=None, look_back=0): + """Compute scaled dot product attention. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). 
+ + """ + q_h, k_h, v_h = self.forward_qkv(x, memory) + if chunk_size is not None and look_back > 0: + if cache is not None: + k_h = torch.cat((cache["k"], k_h), dim=2) + v_h = torch.cat((cache["v"], v_h), dim=2) + cache["k"] = k_h[:, :, -(look_back * chunk_size[1]):, :] + cache["v"] = v_h[:, :, -(look_back * chunk_size[1]):, :] + else: + cache_tmp = {"k": k_h[:, :, -(look_back * chunk_size[1]):, :], + "v": v_h[:, :, -(look_back * chunk_size[1]):, :]} + cache = cache_tmp + q_h = q_h * self.d_k ** (-0.5) + scores = torch.matmul(q_h, k_h.transpose(-2, -1)) + return self.forward_attention(v_h, scores, None), cache + + +class MultiHeadSelfAttention(nn.Module): + """Multi-Head Attention layer. + + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + + """ + + def __init__(self, n_head, in_feat, n_feat, dropout_rate): + """Construct an MultiHeadedAttention object.""" + super(MultiHeadSelfAttention, self).__init__() + assert n_feat % n_head == 0 + # We assume d_v always equals d_k + self.d_k = n_feat // n_head + self.h = n_head + self.linear_out = nn.Linear(n_feat, n_feat) + self.linear_q_k_v = nn.Linear(in_feat, n_feat * 3) + self.attn = None + self.dropout = nn.Dropout(p=dropout_rate) + + def forward_qkv(self, x): + """Transform query, key and value. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + + Returns: + torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k). + torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k). + torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k). + + """ + b, t, d = x.size() + q_k_v = self.linear_q_k_v(x) + q, k, v = torch.split(q_k_v, int(self.h * self.d_k), dim=-1) + q_h = torch.reshape(q, (b, t, self.h, self.d_k)).transpose(1, 2) # (batch, head, time1, d_k) + k_h = torch.reshape(k, (b, t, self.h, self.d_k)).transpose(1, 2) # (batch, head, time2, d_k) + v_h = torch.reshape(v, (b, t, self.h, self.d_k)).transpose(1, 2) # (batch, head, time2, d_k) + + return q_h, k_h, v_h, v + + def forward_attention(self, value, scores, mask, mask_att_chunk_encoder=None): + """Compute attention context vector. + + Args: + value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k). + scores (torch.Tensor): Attention score (#batch, n_head, time1, time2). + mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2). + + Returns: + torch.Tensor: Transformed value (#batch, time1, d_model) + weighted by the attention score (#batch, time1, time2). + + """ + n_batch = value.size(0) + if mask is not None: + if mask_att_chunk_encoder is not None: + mask = mask * mask_att_chunk_encoder + + mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) + + min_value = float( + numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min + ) + scores = scores.masked_fill(mask, min_value) + self.attn = torch.softmax(scores, dim=-1).masked_fill( + mask, 0.0 + ) # (batch, head, time1, time2) + else: + self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) + + p_attn = self.dropout(self.attn) + x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) + x = ( + x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k) + ) # (batch, time1, d_model) + + return self.linear_out(x) # (batch, time1, d_model) + + def forward(self, x, mask, mask_att_chunk_encoder=None): + """Compute scaled dot product attention. 
+ + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). + + """ + q_h, k_h, v_h, v = self.forward_qkv(x) + q_h = q_h * self.d_k ** (-0.5) + scores = torch.matmul(q_h, k_h.transpose(-2, -1)) + att_outs = self.forward_attention(v_h, scores, mask, mask_att_chunk_encoder) + return att_outs + +class RelPositionMultiHeadedAttentionChunk(torch.nn.Module): + """RelPositionMultiHeadedAttention definition. + Args: + num_heads: Number of attention heads. + embed_size: Embedding size. + dropout_rate: Dropout rate. + """ + + def __init__( + self, + num_heads: int, + embed_size: int, + dropout_rate: float = 0.0, + simplified_attention_score: bool = False, + ) -> None: + """Construct an MultiHeadedAttention object.""" + super().__init__() + + self.d_k = embed_size // num_heads + self.num_heads = num_heads + + assert self.d_k * num_heads == embed_size, ( + "embed_size (%d) must be divisible by num_heads (%d)", + (embed_size, num_heads), + ) + + self.linear_q = torch.nn.Linear(embed_size, embed_size) + self.linear_k = torch.nn.Linear(embed_size, embed_size) + self.linear_v = torch.nn.Linear(embed_size, embed_size) + + self.linear_out = torch.nn.Linear(embed_size, embed_size) + + if simplified_attention_score: + self.linear_pos = torch.nn.Linear(embed_size, num_heads) + + self.compute_att_score = self.compute_simplified_attention_score + else: + self.linear_pos = torch.nn.Linear(embed_size, embed_size, bias=False) + + self.pos_bias_u = torch.nn.Parameter(torch.Tensor(num_heads, self.d_k)) + self.pos_bias_v = torch.nn.Parameter(torch.Tensor(num_heads, self.d_k)) + torch.nn.init.xavier_uniform_(self.pos_bias_u) + torch.nn.init.xavier_uniform_(self.pos_bias_v) + + self.compute_att_score = self.compute_attention_score + + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.attn = None + + def rel_shift(self, x: torch.Tensor, left_context: int = 0) -> torch.Tensor: + """Compute relative positional encoding. + Args: + x: Input sequence. (B, H, T_1, 2 * T_1 - 1) + left_context: Number of frames in left context. + Returns: + x: Output sequence. (B, H, T_1, T_2) + """ + batch_size, n_heads, time1, n = x.shape + time2 = time1 + left_context + + batch_stride, n_heads_stride, time1_stride, n_stride = x.stride() + + return x.as_strided( + (batch_size, n_heads, time1, time2), + (batch_stride, n_heads_stride, time1_stride - n_stride, n_stride), + storage_offset=(n_stride * (time1 - 1)), + ) + + def compute_simplified_attention_score( + self, + query: torch.Tensor, + key: torch.Tensor, + pos_enc: torch.Tensor, + left_context: int = 0, + ) -> torch.Tensor: + """Simplified attention score computation. + Reference: https://github.com/k2-fsa/icefall/pull/458 + Args: + query: Transformed query tensor. (B, H, T_1, d_k) + key: Transformed key tensor. (B, H, T_2, d_k) + pos_enc: Positional embedding tensor. (B, 2 * T_1 - 1, size) + left_context: Number of frames in left context. + Returns: + : Attention score. 
(B, H, T_1, T_2) + """ + pos_enc = self.linear_pos(pos_enc) + + matrix_ac = torch.matmul(query, key.transpose(2, 3)) + + matrix_bd = self.rel_shift( + pos_enc.transpose(1, 2).unsqueeze(2).repeat(1, 1, query.size(2), 1), + left_context=left_context, + ) + + return (matrix_ac + matrix_bd) / math.sqrt(self.d_k) + + def compute_attention_score( + self, + query: torch.Tensor, + key: torch.Tensor, + pos_enc: torch.Tensor, + left_context: int = 0, + ) -> torch.Tensor: + """Attention score computation. + Args: + query: Transformed query tensor. (B, H, T_1, d_k) + key: Transformed key tensor. (B, H, T_2, d_k) + pos_enc: Positional embedding tensor. (B, 2 * T_1 - 1, size) + left_context: Number of frames in left context. + Returns: + : Attention score. (B, H, T_1, T_2) + """ + p = self.linear_pos(pos_enc).view(pos_enc.size(0), -1, self.num_heads, self.d_k) + + query = query.transpose(1, 2) + q_with_bias_u = (query + self.pos_bias_u).transpose(1, 2) + q_with_bias_v = (query + self.pos_bias_v).transpose(1, 2) + + matrix_ac = torch.matmul(q_with_bias_u, key.transpose(-2, -1)) + + matrix_bd = torch.matmul(q_with_bias_v, p.permute(0, 2, 3, 1)) + matrix_bd = self.rel_shift(matrix_bd, left_context=left_context) + + return (matrix_ac + matrix_bd) / math.sqrt(self.d_k) + + def forward_qkv( + self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Transform query, key and value. + Args: + query: Query tensor. (B, T_1, size) + key: Key tensor. (B, T_2, size) + v: Value tensor. (B, T_2, size) + Returns: + q: Transformed query tensor. (B, H, T_1, d_k) + k: Transformed key tensor. (B, H, T_2, d_k) + v: Transformed value tensor. (B, H, T_2, d_k) + """ + n_batch = query.size(0) + + q = ( + self.linear_q(query) + .view(n_batch, -1, self.num_heads, self.d_k) + .transpose(1, 2) + ) + k = ( + self.linear_k(key) + .view(n_batch, -1, self.num_heads, self.d_k) + .transpose(1, 2) + ) + v = ( + self.linear_v(value) + .view(n_batch, -1, self.num_heads, self.d_k) + .transpose(1, 2) + ) + + return q, k, v + + def forward_attention( + self, + value: torch.Tensor, + scores: torch.Tensor, + mask: torch.Tensor, + chunk_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Compute attention context vector. + Args: + value: Transformed value. (B, H, T_2, d_k) + scores: Attention score. (B, H, T_1, T_2) + mask: Source mask. (B, T_2) + chunk_mask: Chunk mask. (T_1, T_1) + Returns: + attn_output: Transformed value weighted by attention score. (B, T_1, H * d_k) + """ + batch_size = scores.size(0) + mask = mask.unsqueeze(1).unsqueeze(2) + if chunk_mask is not None: + mask = chunk_mask.unsqueeze(0).unsqueeze(1) | mask + scores = scores.masked_fill(mask, float("-inf")) + self.attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) + + attn_output = self.dropout(self.attn) + attn_output = torch.matmul(attn_output, value) + + attn_output = self.linear_out( + attn_output.transpose(1, 2) + .contiguous() + .view(batch_size, -1, self.num_heads * self.d_k) + ) + + return attn_output + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + pos_enc: torch.Tensor, + mask: torch.Tensor, + chunk_mask: Optional[torch.Tensor] = None, + left_context: int = 0, + ) -> torch.Tensor: + """Compute scaled dot product attention with rel. positional encoding. + Args: + query: Query tensor. (B, T_1, size) + key: Key tensor. (B, T_2, size) + value: Value tensor. (B, T_2, size) + pos_enc: Positional embedding tensor. 
(B, 2 * T_1 - 1, size) + mask: Source mask. (B, T_2) + chunk_mask: Chunk mask. (T_1, T_1) + left_context: Number of frames in left context. + Returns: + : Output tensor. (B, T_1, H * d_k) + """ + q, k, v = self.forward_qkv(query, key, value) + scores = self.compute_att_score(q, k, pos_enc, left_context=left_context) + return self.forward_attention(v, scores, mask, chunk_mask=chunk_mask) + + +class CosineDistanceAttention(nn.Module): + """ Compute Cosine Distance between spk decoder output and speaker profile + Args: + profile_path: speaker profile file path (.npy file) + """ + + def __init__(self): + super().__init__() + self.softmax = nn.Softmax(dim=-1) + + def forward(self, spk_decoder_out, profile, profile_lens=None): + """ + Args: + spk_decoder_out(torch.Tensor):(B, L, D) + spk_profiles(torch.Tensor):(B, N, D) + """ + x = spk_decoder_out.unsqueeze(2) # (B, L, 1, D) + if profile_lens is not None: + + mask = (make_pad_mask(profile_lens)[:, None, :]).to(profile.device) + min_value = float( + numpy.finfo(torch.tensor(0, dtype=x.dtype).numpy().dtype).min + ) + weights_not_softmax=F.cosine_similarity(x, profile.unsqueeze(1), dim=-1).masked_fill(mask, min_value) + weights = self.softmax(weights_not_softmax).masked_fill(mask, 0.0) # (B, L, N) + else: + x = x[:, -1:, :, :] + weights_not_softmax=F.cosine_similarity(x, profile.unsqueeze(1).to(x.device), dim=-1) + weights = self.softmax(weights_not_softmax) # (B, 1, N) + spk_embedding = torch.matmul(weights, profile.to(weights.device)) # (B, L, D) + + return spk_embedding, weights diff --git a/funasr/models/ct_transformer_streaming/encoder.py b/funasr/models/ct_transformer_streaming/encoder.py new file mode 100644 index 000000000..784baf37d --- /dev/null +++ b/funasr/models/ct_transformer_streaming/encoder.py @@ -0,0 +1,383 @@ +from typing import List +from typing import Optional +from typing import Sequence +from typing import Tuple +from typing import Union +import logging +import torch +import torch.nn as nn +import torch.nn.functional as F +from funasr.models.scama.chunk_utilis import overlap_chunk +import numpy as np +from funasr.train_utils.device_funcs import to_device +from funasr.models.transformer.utils.nets_utils import make_pad_mask +from funasr.models.sanm.attention import MultiHeadedAttention +from funasr.models.ct_transformer.attention import MultiHeadedAttentionSANMwithMask +from funasr.models.transformer.embedding import SinusoidalPositionEncoder, StreamSinusoidalPositionEncoder +from funasr.models.transformer.layer_norm import LayerNorm +from funasr.models.transformer.utils.multi_layer_conv import Conv1dLinear +from funasr.models.transformer.utils.multi_layer_conv import MultiLayeredConv1d +from funasr.models.transformer.positionwise_feed_forward import ( + PositionwiseFeedForward, # noqa: H301 +) +from funasr.models.transformer.utils.repeat import repeat +from funasr.models.transformer.utils.subsampling import Conv2dSubsampling +from funasr.models.transformer.utils.subsampling import Conv2dSubsampling2 +from funasr.models.transformer.utils.subsampling import Conv2dSubsampling6 +from funasr.models.transformer.utils.subsampling import Conv2dSubsampling8 +from funasr.models.transformer.utils.subsampling import TooShortUttError +from funasr.models.transformer.utils.subsampling import check_short_utt +from funasr.models.transformer.utils.mask import subsequent_mask, vad_mask + +from funasr.models.ctc.ctc import CTC + +from funasr.register import tables + +class EncoderLayerSANM(nn.Module): + def __init__( + self, + in_size, + size, + 
self_attn, + feed_forward, + dropout_rate, + normalize_before=True, + concat_after=False, + stochastic_depth_rate=0.0, + ): + """Construct an EncoderLayer object.""" + super(EncoderLayerSANM, self).__init__() + self.self_attn = self_attn + self.feed_forward = feed_forward + self.norm1 = LayerNorm(in_size) + self.norm2 = LayerNorm(size) + self.dropout = nn.Dropout(dropout_rate) + self.in_size = in_size + self.size = size + self.normalize_before = normalize_before + self.concat_after = concat_after + if self.concat_after: + self.concat_linear = nn.Linear(size + size, size) + self.stochastic_depth_rate = stochastic_depth_rate + self.dropout_rate = dropout_rate + + def forward(self, x, mask, cache=None, mask_shfit_chunk=None, mask_att_chunk_encoder=None): + """Compute encoded features. + + Args: + x_input (torch.Tensor): Input tensor (#batch, time, size). + mask (torch.Tensor): Mask tensor for the input (#batch, time). + cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size). + + Returns: + torch.Tensor: Output tensor (#batch, time, size). + torch.Tensor: Mask tensor (#batch, time). + + """ + skip_layer = False + # with stochastic depth, residual connection `x + f(x)` becomes + # `x <- x + 1 / (1 - p) * f(x)` at training time. + stoch_layer_coeff = 1.0 + if self.training and self.stochastic_depth_rate > 0: + skip_layer = torch.rand(1).item() < self.stochastic_depth_rate + stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate) + + if skip_layer: + if cache is not None: + x = torch.cat([cache, x], dim=1) + return x, mask + + residual = x + if self.normalize_before: + x = self.norm1(x) + + if self.concat_after: + x_concat = torch.cat((x, self.self_attn(x, mask, mask_shfit_chunk=mask_shfit_chunk, mask_att_chunk_encoder=mask_att_chunk_encoder)), dim=-1) + if self.in_size == self.size: + x = residual + stoch_layer_coeff * self.concat_linear(x_concat) + else: + x = stoch_layer_coeff * self.concat_linear(x_concat) + else: + if self.in_size == self.size: + x = residual + stoch_layer_coeff * self.dropout( + self.self_attn(x, mask, mask_shfit_chunk=mask_shfit_chunk, mask_att_chunk_encoder=mask_att_chunk_encoder) + ) + else: + x = stoch_layer_coeff * self.dropout( + self.self_attn(x, mask, mask_shfit_chunk=mask_shfit_chunk, mask_att_chunk_encoder=mask_att_chunk_encoder) + ) + if not self.normalize_before: + x = self.norm1(x) + + residual = x + if self.normalize_before: + x = self.norm2(x) + x = residual + stoch_layer_coeff * self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm2(x) + + return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder + + def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0): + """Compute encoded features. + + Args: + x_input (torch.Tensor): Input tensor (#batch, time, size). + mask (torch.Tensor): Mask tensor for the input (#batch, time). + cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size). + + Returns: + torch.Tensor: Output tensor (#batch, time, size). + torch.Tensor: Mask tensor (#batch, time). 
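EncoderLayerSANM.forward above implements stochastic depth: during training the whole sub-block is skipped with probability stochastic_depth_rate, and when it survives its output is scaled by 1/(1 - p), which keeps the expected output equal to the deterministic eval-time path. A toy check with a stand-in sub-block, not part of the patch:

import torch

p = 0.3                                 # stochastic_depth_rate
f = lambda t: 2.0 * t                   # stand-in for the attention / feed-forward block

def layer(x, training):
    if training and torch.rand(1).item() < p:
        return x                        # layer skipped entirely
    coeff = 1.0 / (1 - p) if training else 1.0
    return x + coeff * f(x)             # surviving path is up-scaled

torch.manual_seed(0)
x = torch.ones(4)
train_mean = torch.stack([layer(x, True) for _ in range(20000)]).mean(0)
print(train_mean)                       # approximately 3.0 everywhere
print(layer(x, False))                  # exactly x + f(x) = 3.0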
+ + """ + + residual = x + if self.normalize_before: + x = self.norm1(x) + + if self.in_size == self.size: + attn, cache = self.self_attn.forward_chunk(x, cache, chunk_size, look_back) + x = residual + attn + else: + x, cache = self.self_attn.forward_chunk(x, cache, chunk_size, look_back) + + if not self.normalize_before: + x = self.norm1(x) + + residual = x + if self.normalize_before: + x = self.norm2(x) + x = residual + self.feed_forward(x) + if not self.normalize_before: + x = self.norm2(x) + + return x, cache + + +@tables.register("encoder_classes", "SANMVadEncoder") +class SANMVadEncoder(nn.Module): + """ + Author: Speech Lab of DAMO Academy, Alibaba Group + + """ + + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + input_layer: Optional[str] = "conv2d", + pos_enc_class=SinusoidalPositionEncoder, + normalize_before: bool = True, + concat_after: bool = False, + positionwise_layer_type: str = "linear", + positionwise_conv_kernel_size: int = 1, + padding_idx: int = -1, + interctc_layer_idx: List[int] = [], + interctc_use_conditioning: bool = False, + kernel_size : int = 11, + sanm_shfit : int = 0, + selfattention_layer_type: str = "sanm", + ): + super().__init__() + self._output_size = output_size + + if input_layer == "linear": + self.embed = torch.nn.Sequential( + torch.nn.Linear(input_size, output_size), + torch.nn.LayerNorm(output_size), + torch.nn.Dropout(dropout_rate), + torch.nn.ReLU(), + pos_enc_class(output_size, positional_dropout_rate), + ) + elif input_layer == "conv2d": + self.embed = Conv2dSubsampling(input_size, output_size, dropout_rate) + elif input_layer == "conv2d2": + self.embed = Conv2dSubsampling2(input_size, output_size, dropout_rate) + elif input_layer == "conv2d6": + self.embed = Conv2dSubsampling6(input_size, output_size, dropout_rate) + elif input_layer == "conv2d8": + self.embed = Conv2dSubsampling8(input_size, output_size, dropout_rate) + elif input_layer == "embed": + self.embed = torch.nn.Sequential( + torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx), + SinusoidalPositionEncoder(), + ) + elif input_layer is None: + if input_size == output_size: + self.embed = None + else: + self.embed = torch.nn.Linear(input_size, output_size) + elif input_layer == "pe": + self.embed = SinusoidalPositionEncoder() + else: + raise ValueError("unknown input_layer: " + input_layer) + self.normalize_before = normalize_before + if positionwise_layer_type == "linear": + positionwise_layer = PositionwiseFeedForward + positionwise_layer_args = ( + output_size, + linear_units, + dropout_rate, + ) + elif positionwise_layer_type == "conv1d": + positionwise_layer = MultiLayeredConv1d + positionwise_layer_args = ( + output_size, + linear_units, + positionwise_conv_kernel_size, + dropout_rate, + ) + elif positionwise_layer_type == "conv1d-linear": + positionwise_layer = Conv1dLinear + positionwise_layer_args = ( + output_size, + linear_units, + positionwise_conv_kernel_size, + dropout_rate, + ) + else: + raise NotImplementedError("Support only linear or conv1d.") + + if selfattention_layer_type == "selfattn": + encoder_selfattn_layer = MultiHeadedAttention + encoder_selfattn_layer_args = ( + attention_heads, + output_size, + attention_dropout_rate, + ) + + elif selfattention_layer_type == "sanm": + self.encoder_selfattn_layer = MultiHeadedAttentionSANMwithMask + 
encoder_selfattn_layer_args0 = ( + attention_heads, + input_size, + output_size, + attention_dropout_rate, + kernel_size, + sanm_shfit, + ) + + encoder_selfattn_layer_args = ( + attention_heads, + output_size, + output_size, + attention_dropout_rate, + kernel_size, + sanm_shfit, + ) + + self.encoders0 = repeat( + 1, + lambda lnum: EncoderLayerSANM( + input_size, + output_size, + self.encoder_selfattn_layer(*encoder_selfattn_layer_args0), + positionwise_layer(*positionwise_layer_args), + dropout_rate, + normalize_before, + concat_after, + ), + ) + + self.encoders = repeat( + num_blocks-1, + lambda lnum: EncoderLayerSANM( + output_size, + output_size, + self.encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + dropout_rate, + normalize_before, + concat_after, + ), + ) + if self.normalize_before: + self.after_norm = LayerNorm(output_size) + + self.interctc_layer_idx = interctc_layer_idx + if len(interctc_layer_idx) > 0: + assert 0 < min(interctc_layer_idx) and max(interctc_layer_idx) < num_blocks + self.interctc_use_conditioning = interctc_use_conditioning + self.conditioning_layer = None + self.dropout = nn.Dropout(dropout_rate) + + def output_size(self) -> int: + return self._output_size + + def forward( + self, + xs_pad: torch.Tensor, + ilens: torch.Tensor, + vad_indexes: torch.Tensor, + prev_states: torch.Tensor = None, + ctc: CTC = None, + ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + """Embed positions in tensor. + + Args: + xs_pad: input tensor (B, L, D) + ilens: input length (B) + prev_states: Not to be used now. + Returns: + position embedded tensor and mask + """ + masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device) + sub_masks = subsequent_mask(masks.size(-1), device=xs_pad.device).unsqueeze(0) + no_future_masks = masks & sub_masks + xs_pad *= self.output_size()**0.5 + if self.embed is None: + xs_pad = xs_pad + elif (isinstance(self.embed, Conv2dSubsampling) or isinstance(self.embed, Conv2dSubsampling2) + or isinstance(self.embed, Conv2dSubsampling6) or isinstance(self.embed, Conv2dSubsampling8)): + short_status, limit_size = check_short_utt(self.embed, xs_pad.size(1)) + if short_status: + raise TooShortUttError( + f"has {xs_pad.size(1)} frames and is too short for subsampling " + + f"(it needs more than {limit_size} frames), return empty results", + xs_pad.size(1), + limit_size, + ) + xs_pad, masks = self.embed(xs_pad, masks) + else: + xs_pad = self.embed(xs_pad) + + # xs_pad = self.dropout(xs_pad) + mask_tup0 = [masks, no_future_masks] + encoder_outs = self.encoders0(xs_pad, mask_tup0) + xs_pad, _ = encoder_outs[0], encoder_outs[1] + intermediate_outs = [] + + + for layer_idx, encoder_layer in enumerate(self.encoders): + if layer_idx + 1 == len(self.encoders): + # This is last layer. 
+                corner_mask = torch.ones(masks.size(0),
+                                         masks.size(-1),
+                                         masks.size(-1),
+                                         device=xs_pad.device,
+                                         dtype=torch.bool)
+                for word_index, length in enumerate(ilens):
+                    corner_mask[word_index, :, :] = vad_mask(masks.size(-1),
+                                                             vad_indexes[word_index],
+                                                             device=xs_pad.device)
+                layer_mask = masks & corner_mask
+            else:
+                layer_mask = no_future_masks
+            mask_tup1 = [masks, layer_mask]
+            encoder_outs = encoder_layer(xs_pad, mask_tup1)
+            xs_pad, layer_mask = encoder_outs[0], encoder_outs[1]
+
+        if self.normalize_before:
+            xs_pad = self.after_norm(xs_pad)
+
+        olens = masks.squeeze(1).sum(1)
+        if len(intermediate_outs) > 0:
+            return (xs_pad, intermediate_outs), olens, None
+        return xs_pad, olens, None
diff --git a/funasr/models/ct_transformer_streaming/model.py b/funasr/models/ct_transformer_streaming/model.py
new file mode 100644
index 000000000..4c84261b6
--- /dev/null
+++ b/funasr/models/ct_transformer_streaming/model.py
@@ -0,0 +1,340 @@
+from typing import Any
+from typing import List
+from typing import Tuple
+from typing import Optional
+import numpy as np
+import torch.nn.functional as F
+
+from funasr.models.transformer.utils.nets_utils import make_pad_mask
+from funasr.train_utils.device_funcs import force_gatherable
+from funasr.train_utils.device_funcs import to_device
+import torch
+import torch.nn as nn
+from funasr.models.ct_transformer.utils import split_to_mini_sentence, split_words
+from funasr.utils.load_utils import load_audio_text_image_video
+
+from funasr.register import tables
+
+@tables.register("model_classes", "CTTransformerStreaming")
+class CTTransformerStreaming(nn.Module):
+    """
+    Author: Speech Lab of DAMO Academy, Alibaba Group
+    CT-Transformer: Controllable time-delay transformer for real-time punctuation prediction and disfluency detection
+    https://arxiv.org/pdf/2003.01309.pdf
+    """
+    def __init__(
+        self,
+        encoder: str = None,
+        encoder_conf: dict = None,
+        vocab_size: int = -1,
+        punc_list: list = None,
+        punc_weight: list = None,
+        embed_unit: int = 128,
+        att_unit: int = 256,
+        dropout_rate: float = 0.5,
+        ignore_id: int = -1,
+        sos: int = 1,
+        eos: int = 2,
+        sentence_end_id: int = 3,
+        **kwargs,
+    ):
+        super().__init__()
+
+        punc_size = len(punc_list)
+        if punc_weight is None:
+            punc_weight = [1] * punc_size
+
+
+        self.embed = nn.Embedding(vocab_size, embed_unit)
+        encoder_class = tables.encoder_classes.get(encoder.lower())
+        encoder = encoder_class(**encoder_conf)
+
+        self.decoder = nn.Linear(att_unit, punc_size)
+        self.encoder = encoder
+        self.punc_list = punc_list
+        self.punc_weight = punc_weight
+        self.ignore_id = ignore_id
+        self.sos = sos
+        self.eos = eos
+        self.sentence_end_id = sentence_end_id
+
+
+
+    def punc_forward(self, text: torch.Tensor, text_lengths: torch.Tensor) -> Tuple[torch.Tensor, None]:
+        """Compute punctuation logits for padded token sequences.
+
+        Args:
+            text (torch.Tensor): Input token ids. (batch, len)
+            text_lengths (torch.Tensor): Valid length of each sequence. (batch,)
+
+        """
+        x = self.embed(text)
+        # mask = self._target_mask(input)
+        h, _, _ = self.encoder(x, text_lengths)
+        y = self.decoder(h)
+        return y, None
+
+    def with_vad(self):
+        return False
+
+    def score(self, y: torch.Tensor, state: Any, x: torch.Tensor) -> Tuple[torch.Tensor, Any]:
+        """Score new token.
+
+        Args:
+            y (torch.Tensor): 1D torch.int64 prefix tokens.
+            state: Scorer state for prefix tokens
+            x (torch.Tensor): encoder feature that generates ys.
+ + Returns: + tuple[torch.Tensor, Any]: Tuple of + torch.float32 scores for next token (vocab_size) + and next state for ys + + """ + y = y.unsqueeze(0) + h, _, cache = self.encoder.forward_one_step(self.embed(y), self._target_mask(y), cache=state) + h = self.decoder(h[:, -1]) + logp = h.log_softmax(dim=-1).squeeze(0) + return logp, cache + + def batch_score(self, ys: torch.Tensor, states: List[Any], xs: torch.Tensor) -> Tuple[torch.Tensor, List[Any]]: + """Score new token batch. + + Args: + ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen). + states (List[Any]): Scorer states for prefix tokens. + xs (torch.Tensor): + The encoder feature that generates ys (n_batch, xlen, n_feat). + + Returns: + tuple[torch.Tensor, List[Any]]: Tuple of + batchfied scores for next token with shape of `(n_batch, vocab_size)` + and next state list for ys. + + """ + # merge states + n_batch = len(ys) + n_layers = len(self.encoder.encoders) + if states[0] is None: + batch_state = None + else: + # transpose state of [batch, layer] into [layer, batch] + batch_state = [torch.stack([states[b][i] for b in range(n_batch)]) for i in range(n_layers)] + + # batch decoding + h, _, states = self.encoder.forward_one_step(self.embed(ys), self._target_mask(ys), cache=batch_state) + h = self.decoder(h[:, -1]) + logp = h.log_softmax(dim=-1) + + # transpose state of [layer, batch] into [batch, layer] + state_list = [[states[i][b] for i in range(n_layers)] for b in range(n_batch)] + return logp, state_list + + def nll( + self, + text: torch.Tensor, + punc: torch.Tensor, + text_lengths: torch.Tensor, + punc_lengths: torch.Tensor, + max_length: Optional[int] = None, + vad_indexes: Optional[torch.Tensor] = None, + vad_indexes_lengths: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute negative log likelihood(nll) + + Normally, this function is called in batchify_nll. 
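+
+        Note: in eval mode (self.training == False) the tensor returned in place of the
+        NLL holds a micro-averaged F1 score against punc, repeated text_lengths.sum() times.
+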
+ Args: + text: (Batch, Length) + punc: (Batch, Length) + text_lengths: (Batch,) + max_lengths: int + """ + batch_size = text.size(0) + # For data parallel + if max_length is None: + text = text[:, :text_lengths.max()] + punc = punc[:, :text_lengths.max()] + else: + text = text[:, :max_length] + punc = punc[:, :max_length] + + if self.with_vad(): + # Should be VadRealtimeTransformer + assert vad_indexes is not None + y, _ = self.punc_forward(text, text_lengths, vad_indexes) + else: + # Should be TargetDelayTransformer, + y, _ = self.punc_forward(text, text_lengths) + + # Calc negative log likelihood + # nll: (BxL,) + if self.training == False: + _, indices = y.view(-1, y.shape[-1]).topk(1, dim=1) + from sklearn.metrics import f1_score + f1_score = f1_score(punc.view(-1).detach().cpu().numpy(), + indices.squeeze(-1).detach().cpu().numpy(), + average='micro') + nll = torch.Tensor([f1_score]).repeat(text_lengths.sum()) + return nll, text_lengths + else: + self.punc_weight = self.punc_weight.to(punc.device) + nll = F.cross_entropy(y.view(-1, y.shape[-1]), punc.view(-1), self.punc_weight, reduction="none", + ignore_index=self.ignore_id) + # nll: (BxL,) -> (BxL,) + if max_length is None: + nll.masked_fill_(make_pad_mask(text_lengths).to(nll.device).view(-1), 0.0) + else: + nll.masked_fill_( + make_pad_mask(text_lengths, maxlen=max_length + 1).to(nll.device).view(-1), + 0.0, + ) + # nll: (BxL,) -> (B, L) + nll = nll.view(batch_size, -1) + return nll, text_lengths + + + def forward( + self, + text: torch.Tensor, + punc: torch.Tensor, + text_lengths: torch.Tensor, + punc_lengths: torch.Tensor, + vad_indexes: Optional[torch.Tensor] = None, + vad_indexes_lengths: Optional[torch.Tensor] = None, + ): + nll, y_lengths = self.nll(text, punc, text_lengths, punc_lengths, vad_indexes=vad_indexes) + ntokens = y_lengths.sum() + loss = nll.sum() / ntokens + stats = dict(loss=loss.detach()) + + # force_gatherable: to-device and to-tensor if scalar for DataParallel + loss, stats, weight = force_gatherable((loss, stats, ntokens), loss.device) + return loss, stats, weight + + def generate(self, + data_in, + data_lengths=None, + key: list = None, + tokenizer=None, + frontend=None, + **kwargs, + ): + assert len(data_in) == 1 + text = load_audio_text_image_video(data_in, data_type=kwargs.get("kwargs", "text"))[0] + vad_indexes = kwargs.get("vad_indexes", None) + # text = data_in[0] + # text_lengths = data_lengths[0] if data_lengths is not None else None + split_size = kwargs.get("split_size", 20) + + jieba_usr_dict = kwargs.get("jieba_usr_dict", None) + if jieba_usr_dict and isinstance(jieba_usr_dict, str): + import jieba + jieba.load_userdict(jieba_usr_dict) + jieba_usr_dict = jieba + kwargs["jieba_usr_dict"] = "jieba_usr_dict" + tokens = split_words(text, jieba_usr_dict=jieba_usr_dict) + tokens_int = tokenizer.encode(tokens) + + mini_sentences = split_to_mini_sentence(tokens, split_size) + mini_sentences_id = split_to_mini_sentence(tokens_int, split_size) + assert len(mini_sentences) == len(mini_sentences_id) + cache_sent = [] + cache_sent_id = torch.from_numpy(np.array([], dtype='int32')) + new_mini_sentence = "" + new_mini_sentence_punc = [] + cache_pop_trigger_limit = 200 + results = [] + meta_data = {} + punc_array = None + for mini_sentence_i in range(len(mini_sentences)): + mini_sentence = mini_sentences[mini_sentence_i] + mini_sentence_id = mini_sentences_id[mini_sentence_i] + mini_sentence = cache_sent + mini_sentence + mini_sentence_id = np.concatenate((cache_sent_id, mini_sentence_id), axis=0) + data = { 
+ "text": torch.unsqueeze(torch.from_numpy(mini_sentence_id), 0), + "text_lengths": torch.from_numpy(np.array([len(mini_sentence_id)], dtype='int32')), + } + data = to_device(data, kwargs["device"]) + # y, _ = self.wrapped_model(**data) + y, _ = self.punc_forward(**data) + _, indices = y.view(-1, y.shape[-1]).topk(1, dim=1) + punctuations = indices + if indices.size()[0] != 1: + punctuations = torch.squeeze(indices) + assert punctuations.size()[0] == len(mini_sentence) + + # Search for the last Period/QuestionMark as cache + if mini_sentence_i < len(mini_sentences) - 1: + sentenceEnd = -1 + last_comma_index = -1 + for i in range(len(punctuations) - 2, 1, -1): + if self.punc_list[punctuations[i]] == "。" or self.punc_list[punctuations[i]] == "?": + sentenceEnd = i + break + if last_comma_index < 0 and self.punc_list[punctuations[i]] == ",": + last_comma_index = i + + if sentenceEnd < 0 and len(mini_sentence) > cache_pop_trigger_limit and last_comma_index >= 0: + # The sentence it too long, cut off at a comma. + sentenceEnd = last_comma_index + punctuations[sentenceEnd] = self.sentence_end_id + cache_sent = mini_sentence[sentenceEnd + 1:] + cache_sent_id = mini_sentence_id[sentenceEnd + 1:] + mini_sentence = mini_sentence[0:sentenceEnd + 1] + punctuations = punctuations[0:sentenceEnd + 1] + + # if len(punctuations) == 0: + # continue + + punctuations_np = punctuations.cpu().numpy() + new_mini_sentence_punc += [int(x) for x in punctuations_np] + words_with_punc = [] + for i in range(len(mini_sentence)): + if (i==0 or self.punc_list[punctuations[i-1]] == "。" or self.punc_list[punctuations[i-1]] == "?") and len(mini_sentence[i][0].encode()) == 1: + mini_sentence[i] = mini_sentence[i].capitalize() + if i == 0: + if len(mini_sentence[i][0].encode()) == 1: + mini_sentence[i] = " " + mini_sentence[i] + if i > 0: + if len(mini_sentence[i][0].encode()) == 1 and len(mini_sentence[i - 1][0].encode()) == 1: + mini_sentence[i] = " " + mini_sentence[i] + words_with_punc.append(mini_sentence[i]) + if self.punc_list[punctuations[i]] != "_": + punc_res = self.punc_list[punctuations[i]] + if len(mini_sentence[i][0].encode()) == 1: + if punc_res == ",": + punc_res = "," + elif punc_res == "。": + punc_res = "." + elif punc_res == "?": + punc_res = "?" + words_with_punc.append(punc_res) + new_mini_sentence += "".join(words_with_punc) + # Add Period for the end of the sentence + new_mini_sentence_out = new_mini_sentence + new_mini_sentence_punc_out = new_mini_sentence_punc + if mini_sentence_i == len(mini_sentences) - 1: + if new_mini_sentence[-1] == "," or new_mini_sentence[-1] == "、": + new_mini_sentence_out = new_mini_sentence[:-1] + "。" + new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.sentence_end_id] + elif new_mini_sentence[-1] == ",": + new_mini_sentence_out = new_mini_sentence[:-1] + "." + new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.sentence_end_id] + elif new_mini_sentence[-1] != "。" and new_mini_sentence[-1] != "?" and len(new_mini_sentence[-1].encode())==0: + new_mini_sentence_out = new_mini_sentence + "。" + new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.sentence_end_id] + elif new_mini_sentence[-1] != "." and new_mini_sentence[-1] != "?" and len(new_mini_sentence[-1].encode())==1: + new_mini_sentence_out = new_mini_sentence + "." 
+ new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.sentence_end_id] + # keep a punctuations array for punc segment + if punc_array is None: + punc_array = punctuations + else: + punc_array = torch.cat([punc_array, punctuations], dim=0) + + result_i = {"key": key[0], "text": new_mini_sentence_out, "punc_array": punc_array} + results.append(result_i) + + return results, meta_data + diff --git a/funasr/models/ct_transformer_streaming/template.yaml b/funasr/models/ct_transformer_streaming/template.yaml new file mode 100644 index 000000000..c20a09896 --- /dev/null +++ b/funasr/models/ct_transformer_streaming/template.yaml @@ -0,0 +1,53 @@ +# This is an example that demonstrates how to configure a model file. +# You can modify the configuration according to your own requirements. + +# to print the register_table: +# from funasr.register import tables +# tables.print() + +model: CTTransformerStreaming +model_conf: + ignore_id: 0 + embed_unit: 256 + att_unit: 256 + dropout_rate: 0.1 + punc_list: + - + - _ + - , + - 。 + - ? + - 、 + punc_weight: + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + sentence_end_id: 3 + +encoder: SANMEncoder +encoder_conf: + input_size: 256 + output_size: 256 + attention_heads: 8 + linear_units: 1024 + num_blocks: 4 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: pe + pos_enc_class: SinusoidalPositionEncoder + normalize_before: true + kernel_size: 11 + sanm_shfit: 0 + selfattention_layer_type: sanm + padding_idx: 0 + +tokenizer: CharTokenizer +tokenizer_conf: + unk_symbol: + + + diff --git a/funasr/models/ct_transformer_streaming/utils.py b/funasr/models/ct_transformer_streaming/utils.py new file mode 100644 index 000000000..917f2e035 --- /dev/null +++ b/funasr/models/ct_transformer_streaming/utils.py @@ -0,0 +1,111 @@ +import re + +def split_to_mini_sentence(words: list, word_limit: int = 20): + assert word_limit > 1 + if len(words) <= word_limit: + return [words] + sentences = [] + length = len(words) + sentence_len = length // word_limit + for i in range(sentence_len): + sentences.append(words[i * word_limit:(i + 1) * word_limit]) + if length % word_limit > 0: + sentences.append(words[sentence_len * word_limit:]) + return sentences + + +# def split_words(text: str, **kwargs): +# words = [] +# segs = text.split() +# for seg in segs: +# # There is no space in seg. +# current_word = "" +# for c in seg: +# if len(c.encode()) == 1: +# # This is an ASCII char. +# current_word += c +# else: +# # This is a Chinese char. 
+#                 if len(current_word) > 0:
+#                     words.append(current_word)
+#                     current_word = ""
+#                 words.append(c)
+#         if len(current_word) > 0:
+#             words.append(current_word)
+#
+#     return words
+
+def split_words(text: str, jieba_usr_dict=None, **kwargs):
+    if jieba_usr_dict:
+        input_list = text.split()
+        token_list_all = []
+        language_list = []
+        token_list_tmp = []
+        language_flag = None
+        for token in input_list:
+            if isEnglish(token) and language_flag == 'Chinese':
+                token_list_all.append(token_list_tmp)
+                language_list.append('Chinese')
+                token_list_tmp = []
+            elif not isEnglish(token) and language_flag == 'English':
+                token_list_all.append(token_list_tmp)
+                language_list.append('English')
+                token_list_tmp = []
+
+            token_list_tmp.append(token)
+
+            if isEnglish(token):
+                language_flag = 'English'
+            else:
+                language_flag = 'Chinese'
+
+        if token_list_tmp:
+            token_list_all.append(token_list_tmp)
+            language_list.append(language_flag)
+
+        result_list = []
+        for token_list_tmp, language_flag in zip(token_list_all, language_list):
+            if language_flag == 'English':
+                result_list.extend(token_list_tmp)
+            else:
+                seg_list = jieba_usr_dict.cut(join_chinese_and_english(token_list_tmp), HMM=False)
+                result_list.extend(seg_list)
+
+        return result_list
+
+    else:
+        words = []
+        segs = text.split()
+        for seg in segs:
+            # There is no space in seg.
+            current_word = ""
+            for c in seg:
+                if len(c.encode()) == 1:
+                    # This is an ASCII char.
+                    current_word += c
+                else:
+                    # This is a Chinese char.
+                    if len(current_word) > 0:
+                        words.append(current_word)
+                        current_word = ""
+                    words.append(c)
+            if len(current_word) > 0:
+                words.append(current_word)
+        return words
+
+def isEnglish(text: str):
+    if re.search('^[a-zA-Z\']+$', text):
+        return True
+    else:
+        return False
+
+def join_chinese_and_english(input_list):
+    line = ''
+    for token in input_list:
+        if isEnglish(token):
+            line = line + ' ' + token
+        else:
+            line = line + token
+
+    line = line.strip()
+    return line
diff --git a/funasr/models/ct_transformer_streaming/vad_realtime_transformer.py b/funasr/models/ct_transformer_streaming/vad_realtime_transformer.py
new file mode 100644
index 000000000..155057ce8
--- /dev/null
+++ b/funasr/models/ct_transformer_streaming/vad_realtime_transformer.py
@@ -0,0 +1,135 @@
+from typing import Any
+from typing import List
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+
+from funasr.models.transformer.embedding import SinusoidalPositionEncoder
+from funasr.models.ct_transformer.sanm_encoder import SANMVadEncoder as Encoder
+
+
+class VadRealtimeTransformer(torch.nn.Module):
+    """
+    Author: Speech Lab of DAMO Academy, Alibaba Group
+    CT-Transformer: Controllable time-delay transformer for real-time punctuation prediction and disfluency detection
+    https://arxiv.org/pdf/2003.01309.pdf
+    """
+    def __init__(
+        self,
+        vocab_size: int,
+        punc_size: int,
+        pos_enc: str = None,
+        embed_unit: int = 128,
+        att_unit: int = 256,
+        head: int = 2,
+        unit: int = 1024,
+        layer: int = 4,
+        dropout_rate: float = 0.5,
+        kernel_size: int = 11,
+        sanm_shfit: int = 0,
+    ):
+        super().__init__()
+        if pos_enc == "sinusoidal":
+            # pos_enc_class = PositionalEncoding
+            pos_enc_class = SinusoidalPositionEncoder
+        elif pos_enc is None:
+
+            def pos_enc_class(*args, **kwargs):
+                return nn.Sequential()  # identity
+
+        else:
+            raise ValueError(f"unknown pos-enc option: {pos_enc}")
+
+        self.embed = nn.Embedding(vocab_size, embed_unit)
+        self.encoder = Encoder(
+            input_size=embed_unit,
+            output_size=att_unit,
+            attention_heads=head,
+            linear_units=unit,
+
num_blocks=layer, + dropout_rate=dropout_rate, + input_layer="pe", + # pos_enc_class=pos_enc_class, + padding_idx=0, + kernel_size=kernel_size, + sanm_shfit=sanm_shfit, + ) + self.decoder = nn.Linear(att_unit, punc_size) + + +# def _target_mask(self, ys_in_pad): +# ys_mask = ys_in_pad != 0 +# m = subsequent_n_mask(ys_mask.size(-1), 5, device=ys_mask.device).unsqueeze(0) +# return ys_mask.unsqueeze(-2) & m + + def forward(self, input: torch.Tensor, text_lengths: torch.Tensor, + vad_indexes: torch.Tensor) -> Tuple[torch.Tensor, None]: + """Compute loss value from buffer sequences. + + Args: + input (torch.Tensor): Input ids. (batch, len) + hidden (torch.Tensor): Target ids. (batch, len) + + """ + x = self.embed(input) + # mask = self._target_mask(input) + h, _, _ = self.encoder(x, text_lengths, vad_indexes) + y = self.decoder(h) + return y, None + + def with_vad(self): + return True + + def score(self, y: torch.Tensor, state: Any, x: torch.Tensor) -> Tuple[torch.Tensor, Any]: + """Score new token. + + Args: + y (torch.Tensor): 1D torch.int64 prefix tokens. + state: Scorer state for prefix tokens + x (torch.Tensor): encoder feature that generates ys. + + Returns: + tuple[torch.Tensor, Any]: Tuple of + torch.float32 scores for next token (vocab_size) + and next state for ys + + """ + y = y.unsqueeze(0) + h, _, cache = self.encoder.forward_one_step(self.embed(y), self._target_mask(y), cache=state) + h = self.decoder(h[:, -1]) + logp = h.log_softmax(dim=-1).squeeze(0) + return logp, cache + + def batch_score(self, ys: torch.Tensor, states: List[Any], xs: torch.Tensor) -> Tuple[torch.Tensor, List[Any]]: + """Score new token batch. + + Args: + ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen). + states (List[Any]): Scorer states for prefix tokens. + xs (torch.Tensor): + The encoder feature that generates ys (n_batch, xlen, n_feat). + + Returns: + tuple[torch.Tensor, List[Any]]: Tuple of + batchfied scores for next token with shape of `(n_batch, vocab_size)` + and next state list for ys. + + """ + # merge states + n_batch = len(ys) + n_layers = len(self.encoder.encoders) + if states[0] is None: + batch_state = None + else: + # transpose state of [batch, layer] into [layer, batch] + batch_state = [torch.stack([states[b][i] for b in range(n_batch)]) for i in range(n_layers)] + + # batch decoding + h, _, states = self.encoder.forward_one_step(self.embed(ys), self._target_mask(ys), cache=batch_state) + h = self.decoder(h[:, -1]) + logp = h.log_softmax(dim=-1) + + # transpose state of [layer, batch] into [batch, layer] + state_list = [[states[i][b] for i in range(n_layers)] for b in range(n_batch)] + return logp, state_list
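
Reviewer note: the sketch below is not part of the patch. It is a minimal, hypothetical usage example of the two helpers added in funasr/models/ct_transformer_streaming/utils.py, mirroring the text preprocessing that CTTransformerStreaming.generate() performs before punctuation prediction. The sample string and the split size are illustrative only, and the import assumes the package's __init__ files expose the new module.

    # Minimal sketch (not in the patch): tokenization path used before punc prediction.
    from funasr.models.ct_transformer_streaming.utils import split_words, split_to_mini_sentence

    text = "we are going 今天天气不错 to the park"   # illustrative mixed zh/en input
    tokens = split_words(text)                       # English kept word-level, Chinese split per char
    mini_sentences = split_to_mini_sentence(tokens, word_limit=20)

    print(tokens)
    # -> ['we', 'are', 'going', '今', '天', '天', '气', '不', '错', 'to', 'the', 'park']
    print(len(mini_sentences))                       # 1, since 12 tokens <= word_limit

In generate(), each mini-sentence is additionally prepended with cache_sent, i.e. the tail of the previous chunk after its last sentence-ending punctuation, so sentence boundaries are never split across chunks.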