From 05f05e7421da12e38109df8ba75be52e90f15092 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=B8=E9=9B=81?= Date: Tue, 11 Jun 2024 19:00:55 +0800 Subject: [PATCH] decoding --- .../llm_asr/demo_speech2text.sh | 54 ++--- funasr/bin/train_ds.py | 4 +- funasr/datasets/openai_datasets/datasets.py | 224 ++++++++++++++++++ 3 files changed, 253 insertions(+), 29 deletions(-) diff --git a/examples/industrial_data_pretraining/llm_asr/demo_speech2text.sh b/examples/industrial_data_pretraining/llm_asr/demo_speech2text.sh index d4c409bf2..8e22cb955 100644 --- a/examples/industrial_data_pretraining/llm_asr/demo_speech2text.sh +++ b/examples/industrial_data_pretraining/llm_asr/demo_speech2text.sh @@ -12,6 +12,7 @@ jsonl_dir="/nfs/beinian.lzr/workspace/GPT-4o/Data/Speech2Text/TestData" out_dir="${ckpt_dir}/inference-${ckpt_id}" mkdir -p ${out_dir} for data_set in "librispeech_test_clean_speech2text.jsonl" "librispeech_test_other_speech2text.jsonl"; do +{ jsonl=${jsonl_dir}/${data_set} output_dir=${out_dir}/${data_set} mkdir -p ${output_dir} @@ -22,10 +23,12 @@ for data_set in "librispeech_test_clean_speech2text.jsonl" "librispeech_test_oth python /mnt/workspace/zhifu.gzf/codebase/FunASR/funasr/metrics/wer.py ++ref_file=${ref_file} ++hyp_file=${pred_file} ++cer_file=${pred_file}.cer ++cn_postprocess=false +}& done +wait - -for data_set in "aishell1_test_speech2text.jsonl" "aishell2_ios_test_speech2text.jsonl" "librispeech_test_other_speech2text.jsonl"; do +for data_set in "aishell1_test_speech2text.jsonl" "aishell2_ios_test_speech2text.jsonl"; do +{ jsonl=${jsonl_dir}/${data_set} output_dir=${out_dir}/${data_set} mkdir -p ${output_dir} @@ -36,30 +39,27 @@ for data_set in "aishell1_test_speech2text.jsonl" "aishell2_ios_test_speech2text python /mnt/workspace/zhifu.gzf/codebase/FunASR/funasr/metrics/wer.py ++ref_file=${ref_file} ++hyp_file=${pred_file} ++cer_file=${pred_file}.cer ++cn_postprocess=true +}& done -for data_set in "s2tt_en2zh.v20240605.test.jsonl"; do - jsonl=${jsonl_dir}/${data_set} - output_dir=${out_dir}/${data_set} - mkdir -p ${output_dir} - pred_file=${output_dir}/1best_recog/text_tn - ref_file=${output_dir}/1best_recog/label - - python ./demo_speech2text.py ${ckpt_dir} ${ckpt_id} ${jsonl} ${output_dir} ${device} - - python /mnt/workspace/zhifu.gzf/codebase/FunASR/funasr/metrics/wer.py ++ref_file=${ref_file} ++hyp_file=${pred_file} ++cer_file=${pred_file}.cer ++cn_postprocess=true - -done - -for data_set in "s2tt_zh2en.v20240605.test.jsonl"; do - jsonl=${jsonl_dir}/${data_set} - output_dir=${out_dir}/${data_set} - mkdir -p ${output_dir} - pred_file=${output_dir}/1best_recog/text_tn - ref_file=${output_dir}/1best_recog/label - - python ./demo_speech2text.py ${ckpt_dir} ${ckpt_id} ${jsonl} ${output_dir} ${device} - - python /mnt/workspace/zhifu.gzf/codebase/FunASR/funasr/metrics/wer.py ++ref_file=${ref_file} ++hyp_file=${pred_file} ++cer_file=${pred_file}.cer ++cn_postprocess=false - -done \ No newline at end of file +#for data_set in "s2tt_en2zh.v20240605.test.jsonl"; do +# jsonl=${jsonl_dir}/${data_set} +# output_dir=${out_dir}/${data_set} +# mkdir -p ${output_dir} +# pred_file=${output_dir}/1best_recog/text_tn +# ref_file=${output_dir}/1best_recog/label +# +# python ./demo_speech2text.py ${ckpt_dir} ${ckpt_id} ${jsonl} ${output_dir} ${device} +# +#done +# +#for data_set in "s2tt_zh2en.v20240605.test.jsonl"; do +# jsonl=${jsonl_dir}/${data_set} +# output_dir=${out_dir}/${data_set} +# mkdir -p ${output_dir} +# pred_file=${output_dir}/1best_recog/text_tn +# ref_file=${output_dir}/1best_recog/label +# +# python ./demo_speech2text.py ${ckpt_dir} ${ckpt_id} ${jsonl} ${output_dir} ${device} +# +#done \ No newline at end of file diff --git a/funasr/bin/train_ds.py b/funasr/bin/train_ds.py index 5b1d4fd3b..67ed2ba93 100644 --- a/funasr/bin/train_ds.py +++ b/funasr/bin/train_ds.py @@ -182,7 +182,7 @@ def main(**kwargs): time_escaped = (time.perf_counter() - time_slice_i) / 3600.0 logging.info( - f"rank: {local_rank}, " + f"\n\nrank: {local_rank}, " f"time_escaped_epoch: {time_escaped:.3f} hours, " f"estimated to finish {dataloader.data_split_num} data_slices, remaining: {dataloader.data_split_num-data_split_i} slices, {(dataloader.data_split_num-data_split_i)*time_escaped:.3f} hours, " f"epoch: {trainer.max_epoch - epoch} epochs, {((trainer.max_epoch - epoch - 1)*dataloader.data_split_num + dataloader.data_split_num-data_split_i)*time_escaped:.3f} hours\n" @@ -199,7 +199,7 @@ def main(**kwargs): time2 = time.perf_counter() time_escaped = (time2 - time1) / 3600.0 logging.info( - f"rank: {local_rank}, " + f"\n\nrank: {local_rank}, " f"time_escaped_epoch: {time_escaped:.3f} hours, " f"estimated to finish {trainer.max_epoch} " f"epoch: {(trainer.max_epoch - epoch) * time_escaped:.3f} hours\n" diff --git a/funasr/datasets/openai_datasets/datasets.py b/funasr/datasets/openai_datasets/datasets.py index 8d243aced..630793041 100644 --- a/funasr/datasets/openai_datasets/datasets.py +++ b/funasr/datasets/openai_datasets/datasets.py @@ -222,3 +222,227 @@ class OpenAIDataset(torch.utils.data.Dataset): break return outputs + + +@tables.register("dataset_classes", "OpenAIDatasetMultiTurn") +class OpenAIDatasetMultiTurn(torch.utils.data.Dataset): + """ + SenseVoiceDataset + """ + + def __init__( + self, + path, + index_ds: str = None, + frontend=None, + tokenizer=None, + int_pad_value: int = -1, + float_pad_value: float = 0.0, + **kwargs, + ): + super().__init__() + index_ds_class = tables.index_ds_classes.get(index_ds) + self.index_ds = index_ds_class(path, **kwargs) + preprocessor_speech = kwargs.get("preprocessor_speech", None) + if preprocessor_speech: + preprocessor_speech_class = tables.preprocessor_classes.get(preprocessor_speech) + preprocessor_speech = preprocessor_speech_class( + **kwargs.get("preprocessor_speech_conf") + ) + self.preprocessor_speech = preprocessor_speech + preprocessor_text = kwargs.get("preprocessor_text", None) + if preprocessor_text: + preprocessor_text_class = tables.preprocessor_classes.get(preprocessor_text) + preprocessor_text = preprocessor_text_class(**kwargs.get("preprocessor_text_conf")) + self.preprocessor_text = preprocessor_text + + self.frontend = frontend + self.fs = 16000 if frontend is None else frontend.fs + self.data_type = "sound" + self.tokenizer = tokenizer + + self.int_pad_value = int_pad_value + self.float_pad_value = float_pad_value + self.sos = kwargs.get("sos", "<|startoftranscript|>") + self.eos = kwargs.get("eos", "<|endoftext|>") + self.batch_size = kwargs.get("batch_size") + self.batch_type = kwargs.get("batch_type") + self.prompt_ids_len = 0 + self.retry = kwargs.get("retry", 100) + + self.permute = False + from funasr.frontends.whisper_frontend import WhisperFrontend + + if isinstance(self.frontend, WhisperFrontend): + self.permute = True + + self.pattern = re.compile(r"(<\|startofspeech\|>.*?<\|endofspeech\|>)") + # self.kwargs = kwargs + self.max_token_length = kwargs.get("max_token_length", 1024) + self.batch_size_scale_ratio_max = kwargs.get("batch_size_scale_ratio_max", 1.5) + self.batch_size_token_max = kwargs.get("batch_size_token_max", 2500) + self.multiturn_num_max = kwargs.get("multiturn_num_max", 5) + + def get_source_len(self, index): + item = self.index_ds[index] + return self.index_ds.get_source_len(item) + + def get_target_len(self, index): + item = self.index_ds[index] + return self.index_ds.get_target_len(item) + + def __len__(self): + return len(self.index_ds) + + def __getitem__(self, index): + # import pdb; + # pdb.set_trace() + + output = None + + for idx in range(self.retry): + badcase_flag = False + if idx == 0: + index_cur = index + else: + index_cur = torch.randint(0, len(self.index_ds), ()).item() + + item = self.index_ds[index_cur] + + system = item["system"] + user = item["user"] + assistant = item["assistant"] + + input_ids, labels, fbank, fbank_lens, fbank_mask, fbank_beg = [], [], [], [], [], [] + + for i, (system_prompt, user_prompt, target_out) in enumerate( + zip(system, user, assistant) + ): + if i >= self.multiturn_num_max: + break + if i == 0: + source_input = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n" + else: + source_input = ( + f"<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n" + ) + + splits = self.pattern.split(source_input) + source_ids = [] + fbank_mask_i = [] + fbank_beg_i = [] + fbank_lens_i = [] + for k, sub_str in enumerate(splits): + if not sub_str.startswith("<|startofspeech|>"): + sub_token = self.tokenizer.encode(sub_str) + source_ids += sub_token + fbank_mask_i += [0] * len(sub_token) + else: + sub_str = sub_str.replace("<|startofspeech|>", "").replace( + "<|endofspeech|>", "" + ) + if sub_str.startswith("!"): + try: + data_src = load_audio_text_image_video(sub_str[1:], fs=self.fs) + except Exception as e: + logging.error( + f"Loading wav failed! {str(e)}, {traceback.format_exc()}" + ) + badcase_flag = True + continue + speech, speech_lengths = extract_fbank( + data_src, + data_type=self.data_type, + frontend=self.frontend, + is_final=True, + ) # speech: [b, T, d] + if self.permute: + speech = speech.permute(0, 2, 1) + # if speech_lengths > self.batch_size: + # continue + + olens = 1 + (speech_lengths[0].item() - 3 + 2 * 1) // 2 + olens = 1 + (olens - 3 + 2 * 1) // 2 + sub_token_len = (olens - 1) // 2 + 1 + sub_token = [0] * sub_token_len + fbank_beg_i = [len(source_ids)] + source_ids += sub_token + fbank_mask_i += [1] * len(sub_token) + + if badcase_flag: + continue + source_mask = [-100] * len(source_ids) + target_out = f"{target_out}<|im_end|>" + target_ids = self.tokenizer.encode(target_out) + input_ids += source_ids + target_ids + labels += source_mask + target_ids + fbank_mask += fbank_mask_i + fbank_beg.append(fbank_beg_i) + + if len(input_ids) > self.max_token_length: + logging.info( + f"input_ids > max_token_length: {len(input_ids)}>{self.max_token_length}, {item}" + ) + badcase_flag = True + if badcase_flag: + continue + input_ids = torch.tensor(input_ids, dtype=torch.int64) # [: self.max_token_length] + attention_mask = torch.tensor([1] * len(input_ids), dtype=torch.int32) + labels = torch.tensor(labels, dtype=torch.int64) # [: self.max_token_length] + + fbank = speech[0, :, :] + fbank_lens = speech_lengths + fbank_mask = torch.tensor(fbank_mask, dtype=torch.float32) + fbank_beg = torch.tensor(fbank_beg, dtype=torch.int32) + + output = { + "speech": fbank, + "speech_lengths": fbank_lens, + "fbank_mask": fbank_mask, + "fbank_beg": fbank_beg, + "input_ids": input_ids, + "attention_mask": attention_mask, + "labels_ids": labels, + } + break + + return output + + def collator(self, samples: list = None): + + for idx in range(self.retry): + badcase_flag = False + + outputs = {} + for sample in samples: + if sample is None: + continue + for key in sample.keys(): + if key not in outputs: + outputs[key] = [] + outputs[key].append(sample[key]) + + for key, data_list in outputs.items(): + if isinstance(data_list[0], torch.Tensor): + if data_list[0].dtype == torch.int64 or data_list[0].dtype == torch.int32: + + pad_value = self.int_pad_value + else: + pad_value = self.float_pad_value + + outputs[key] = torch.nn.utils.rnn.pad_sequence( + data_list, batch_first=True, padding_value=pad_value + ) + + if self.batch_type != "example": + b, t = outputs["input_ids"].shape + if b > 1 and b * t > self.batch_size_token_max: + logging.info( + f"Warning, {idx}th, b*t: {b}*{t}={b * t} > batch_size_sample_max: {self.batch_size_token_max}, drop last data" + ) + samples = samples[:-1] + continue + + break + + return outputs