Merge branch 'dev_gzf_funasr2' of github.com:alibaba-damo-academy/FunASR into dev_gzf_funasr2

add
This commit is contained in:
游雁 2023-11-27 13:53:07 +08:00
commit 3cee2214b7
8 changed files with 67 additions and 57 deletions

View File

@ -1,3 +1,39 @@
FunASR Model Open Source License
Version 1.0
Copyright (C) [2023-2028] Alibaba Group. All rights reserved.
Thank you for choosing the FunASR open source models. The FunASR open source models contain a series of open-source models that allow everyone to use, modify, share, and learn from them.
To ensure better community collaboration, we have developed the following agreement and hope that you carefully read and abide by it.
1 Definitions
In this agreement, [FunASR software] refers to the FunASR open source model, and its derivatives, including fine-tuned models. [You] refers to individuals or organizations who use, modify, share, and learn from [FunASR software].
2 License and Restrictions
2.1 License
You are free to use, copy, modify, and share [FunASR software] under the conditions of this agreement.
2.2 Restrictions
You should indicate the code and model source and author information when using, copying, modifying and sharing [FunASR software]. You should keep the relevant names of models in [FunASR software].
3 Responsibility and Risk
[FunASR software] is for reference and learning purposes only and is not responsible for any direct or indirect losses caused by your use or modification of [FunASR software]. You should take responsibility and risks for your use and modification of [FunASR software].
4 Termination
If you violate any terms of this agreement, your license will be automatically terminated, and you must stop using, copying, modifying, and sharing [FunASR software].
5 Revision
This agreement may be updated and revised from time to time. The revised agreement will be published in the FunASR official repository and automatically take effect. If you continue to use, copy, modify, and share [FunASR software], it means you agree to the revised agreement.
6 Other Provisions
This agreement is subject to the laws of [Country/Region]. If any provisions are found to be illegal, invalid, or unenforceable, they shall be deemed deleted from this agreement, and the remaining provisions shall remain valid and binding.
If you have any questions or comments about this agreement, please contact us.
Copyright (c) [2023-2028] Alibaba Group. All rights reserved.
FunASR 模型开源协议
版本号1.0
@ -36,38 +72,3 @@ FunASR 模型开源协议
版权所有© [2023-2028] [阿里巴巴集团]。保留所有权利。
FunASR Model Open Source License
Version 1.0
Copyright (C) [2023-2028] Alibaba Group. All rights reserved.
Thank you for choosing the FunASR open source models. The FunASR open source models contain a series of open-source models that allow everyone to use, modify, share, and learn from them.
To ensure better community collaboration, we have developed the following agreement and hope that you carefully read and abide by it.
1 Definitions
In this agreement, [FunASR software] refers to the FunASR open source model, and its derivatives, including fine-tuned models. [You] refers to individuals or organizations who use, modify, share, and learn from [FunASR software].
2 License and Restrictions
2.1 License
You are free to use, copy, modify, and share [FunASR software] under the conditions of this agreement.
2.2 Restrictions
You should indicate the code and model source and author information when using, copying, modifying and sharing [FunASR software]. You should keep the relevant names of models in [FunASR software].
3 Responsibility and Risk
[FunASR software] is for reference and learning purposes only and is not responsible for any direct or indirect losses caused by your use or modification of [FunASR software]. You should take responsibility and risks for your use and modification of [FunASR software].
4 Termination
If you violate any terms of this agreement, your license will be automatically terminated, and you must stop using, copying, modifying, and sharing [FunASR software].
5 Revision
This agreement may be updated and revised from time to time. The revised agreement will be published in the FunASR official repository and automatically take effect. If you continue to use, copy, modify, and share [FunASR software], it means you agree to the revised agreement.
6 Other Provisions
This agreement is subject to the laws of [Country/Region]. If any provisions are found to be illegal, invalid, or unenforceable, they shall be deemed deleted from this agreement, and the remaining provisions shall remain valid and binding.
If you have any questions or comments about this agreement, please contact us.
Copyright (c) [2023-2028] Alibaba Group. All rights reserved.

View File

@ -956,24 +956,29 @@ def inference_paraformer_vad_speaker(
ed = int(vadsegment[1]) / 1000
vad_segments.append(
[st, ed, audio[int(st * 16000):int(ed * 16000)]])
check_audio_list(vad_segments)
# sv pipeline
segments = sv_chunk(vad_segments)
embeddings = []
for s in segments:
#_, embs = self.sv_pipeline([s[2]], output_emb=True)
# embeddings.append(embs)
wavs = sv_preprocess([s[2]])
# embs = self.forward(wavs)
embs = []
for x in wavs:
x = extract_feature([x])
embs.append(sv_model(x))
embs = torch.cat(embs)
embeddings.append(embs.detach().numpy())
embeddings = np.concatenate(embeddings)
labels = cb_model(embeddings)
sv_output = postprocess(segments, vad_segments, labels, embeddings)
audio_dur = check_audio_list(vad_segments)
if audio_dur > 5:
# sv pipeline
segments = sv_chunk(vad_segments)
embeddings = []
for s in segments:
#_, embs = self.sv_pipeline([s[2]], output_emb=True)
# embeddings.append(embs)
wavs = sv_preprocess([s[2]])
# embs = self.forward(wavs)
embs = []
for x in wavs:
x = extract_feature([x])
embs.append(sv_model(x))
embs = torch.cat(embs)
embeddings.append(embs.detach().numpy())
embeddings = np.concatenate(embeddings)
labels = cb_model(embeddings)
sv_output = postprocess(segments, vad_segments, labels, embeddings)
else:
# fake speaker result for a too short utterance
sv_output = [[0.0, vadsegments[-1][-1]/1000.0, 0]]
logging.warning("Too short utterence found: {}, return default speaker results.".format(keys))
speech, speech_lengths = batch["speech"], batch["speech_lengths"]

View File

@ -134,7 +134,7 @@ class NeatContextualParaformer(Paraformer):
text_lengths: torch.Tensor,
hotword_pad: torch.Tensor,
hotword_lengths: torch.Tensor,
ideal_attn: torch.Tensor,
dha_pad: torch.Tensor,
) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
"""Frontend + Encoder + Decoder + Calc loss
@ -207,7 +207,7 @@ class NeatContextualParaformer(Paraformer):
# 2b. Attention decoder branch
if self.ctc_weight != 1.0:
loss_att, acc_att, cer_att, wer_att, loss_pre, loss_ideal = self._calc_att_clas_loss(
encoder_out, encoder_out_lens, text, text_lengths, hotword_pad, hotword_lengths, ideal_attn
encoder_out, encoder_out_lens, text, text_lengths, hotword_pad, hotword_lengths
)
# 3. CTC-Att loss definition
@ -242,7 +242,6 @@ class NeatContextualParaformer(Paraformer):
ys_pad_lens: torch.Tensor,
hotword_pad: torch.Tensor,
hotword_lengths: torch.Tensor,
ideal_attn: torch.Tensor,
):
encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
encoder_out.device)

View File

@ -35,7 +35,8 @@ def check_audio_list(audio: list):
assert seg[0] >= audio[
i - 1][1], 'modelscope error: Wrong time stamps.'
audio_dur += seg[1] - seg[0]
assert audio_dur > 5, 'modelscope error: The effective audio duration is too short.'
return audio_dur
# assert audio_dur > 5, 'modelscope error: The effective audio duration is too short.'
def sv_preprocess(inputs: Union[np.ndarray, list]):

View File

@ -110,6 +110,7 @@ int FsmnVadOnline::OnlineLfrCmvn(vector<vector<float>> &vad_feats, bool input_fi
p.insert(p.end(), vad_feats[vad_feats.size() - 1].begin(), vad_feats[vad_feats.size() - 1].end());
}
out_feats.emplace_back(p);
p.clear();
} else {
lfr_splice_frame_idxs = i;
break;

View File

@ -264,6 +264,7 @@ void FsmnVad::LfrCmvn(std::vector<std::vector<float>> &vad_feats) {
p.insert(p.end(), vad_feats[vad_feats.size() - 1].begin(), vad_feats[vad_feats.size() - 1].end());
}
out_feats.emplace_back(p);
p.clear();
}
}
// Apply cmvn

View File

@ -164,6 +164,7 @@ int ParaformerOnline::OnlineLfrCmvn(vector<vector<float>> &wav_feats, bool input
p.insert(p.end(), wav_feats[wav_feats.size() - 1].begin(), wav_feats[wav_feats.size() - 1].end());
}
out_feats.emplace_back(p);
p.clear();
} else {
lfr_splice_frame_idxs = i;
break;

View File

@ -436,6 +436,7 @@ void Paraformer::LfrCmvn(std::vector<std::vector<float>> &asr_feats) {
p.insert(p.end(), asr_feats[asr_feats.size() - 1].begin(), asr_feats[asr_feats.size() - 1].end());
}
out_feats.emplace_back(p);
p.clear();
}
}
// Apply cmvn