From f2406e2d61218c848bfd6da933c36956a9b0a5aa Mon Sep 17 00:00:00 2001 From: zhifu gao Date: Tue, 5 Mar 2024 22:39:51 +0800 Subject: [PATCH] qwenaudio qwenaudiochat (#1433) --- README.md | 5 +- README_zh.md | 29 +++--- .../llm_asr/conf/template.yaml | 89 +++++++++++++++++++ .../llm_asr/demo_infer.sh | 14 +++ .../llm_asr/demo_train_or_finetune.sh | 47 ++++++++++ funasr/models/qwen_audio/model.py | 10 +++ funasr/tokenizer/hf_tokenizer.py | 3 +- funasr/tokenizer/whisper_tokenizer.py | 2 +- 8 files changed, 182 insertions(+), 17 deletions(-) create mode 100644 examples/industrial_data_pretraining/llm_asr/conf/template.yaml create mode 100644 examples/industrial_data_pretraining/llm_asr/demo_infer.sh create mode 100644 examples/industrial_data_pretraining/llm_asr/demo_train_or_finetune.sh diff --git a/README.md b/README.md index 970c5ebd0..d34249d19 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,8 @@ ## What's new: -- 2024/03/05:Added support for the Whisper-large-v3 model, a multitasking model that can perform multilingual speech recognition, speech translation, and language identification. It can be downloaded from the[modelscope](https://www.modelscope.cn/models/iic/Whisper-large-v3/summary), and [openai](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining/whisper). +- 2024/03/05:Added the Qwen-Audio and Qwen-Audio-Chat large-scale audio-text multimodal models, which have topped multiple audio domain leaderboards. These models support speech dialogue, [usage](examples/industrial_data_pretraining/qwen_audio). +- 2024/03/05:Added support for the Whisper-large-v3 model, a multitasking model that can perform multilingual speech recognition, speech translation, and language identification. It can be downloaded from the [modelscope](examples/industrial_data_pretraining/whisper/demo.py), and [openai](examples/industrial_data_pretraining/whisper/demo_from_openai.py). 
- 2024/03/05: Offline File Transcription Service 4.4, Offline File Transcription Service of English 1.5,Real-time Transcription Service 1.9 released,docker image supports ARM64 platform, update modelscope;([docs](runtime/readme.md)) - 2024/01/30:funasr-1.0 has been released ([docs](https://github.com/alibaba-damo-academy/FunASR/discussions/1319)) - 2024/01/30:emotion recognition models are new supported. [model link](https://www.modelscope.cn/models/iic/emotion2vec_base_finetuned/summary), modified from [repo](https://github.com/ddlBoJack/emotion2vec). @@ -83,6 +84,8 @@ FunASR has open-sourced a large number of pre-trained models on industrial data. | cam++
( [⭐](https://modelscope.cn/models/iic/speech_campplus_sv_zh-cn_16k-common/summary) [🤗](https://huggingface.co/funasr/campplus) ) | speaker verification/diarization | 5000 hours | 7.2M | | Whisper-large-v2
([⭐](https://www.modelscope.cn/models/iic/speech_whisper-large_asr_multilingual/summary) [🍀](https://github.com/openai/whisper) ) | speech recognition, with timestamps, non-streaming | multilingual | 1.5G | | Whisper-large-v3
([⭐](https://www.modelscope.cn/models/iic/Whisper-large-v3/summary) [🍀](https://github.com/openai/whisper) ) | speech recognition, with timestamps, non-streaming | multilingual | 1.5G | +| Qwen-Audio
([⭐](examples/industrial_data_pretraining/qwen_audio/demo.py) [🤗](https://huggingface.co/Qwen/Qwen-Audio) ) | audio-text multimodal models (pretraining) | multilingual | 8B | +| Qwen-Audio-Chat
([⭐](examples/industrial_data_pretraining/qwen_audio/demo_chat.py) [🤗](https://huggingface.co/Qwen/Qwen-Audio-Chat) ) | audio-text multimodal models (chat) | multilingual | 8B | diff --git a/README_zh.md b/README_zh.md index e6a106022..83e37fb26 100644 --- a/README_zh.md +++ b/README_zh.md @@ -29,7 +29,8 @@ FunASR希望在语音识别的学术研究和工业应用之间架起一座桥 ## 最新动态 -- 2024/03/05:新增加Whisper-large-v3模型支持,多语言语音识别/翻译/语种识别,支持从[modelscope](https://www.modelscope.cn/models/iic/Whisper-large-v3/summary)仓库下载,也支持从[openai](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining/whisper)仓库下载模型。 +- 2024/03/05:新增加Qwen-Audio与Qwen-Audio-Chat音频文本模态大模型,在多个音频领域测试榜单刷榜,中支持语音对话,详细用法见 [示例](examples/industrial_data_pretraining/qwen_audio)。 +- 2024/03/05:新增加Whisper-large-v3模型支持,多语言语音识别/翻译/语种识别,支持从 [modelscope](examples/industrial_data_pretraining/whisper/demo.py)仓库下载,也支持从 [openai](examples/industrial_data_pretraining/whisper/demo_from_openai.py)仓库下载模型。 - 2024/03/05: 中文离线文件转写服务 4.4、英文离线文件转写服务 1.5、中文实时语音听写服务 1.9 发布,docker镜像支持arm64平台,升级modelscope版本;详细信息参阅([部署文档](runtime/readme_cn.md)) - 2024/01/30:funasr-1.0发布,更新说明[文档](https://github.com/alibaba-damo-academy/FunASR/discussions/1319) - 2024/01/30:新增加情感识别 [模型链接](https://www.modelscope.cn/models/iic/emotion2vec_base_finetuned/summary),原始模型 [repo](https://github.com/ddlBoJack/emotion2vec). @@ -73,19 +74,19 @@ FunASR开源了大量在工业数据上预训练模型,您可以在[模型许 (注:⭐ 表示ModelScope模型仓库,🤗 表示Huggingface模型仓库,🍀表示OpenAI模型仓库) -| 模型名字 | 任务详情 | 训练数据 | 参数量 | -|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------:|:------------:|:----:| -| paraformer-zh
([⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) [🤗](https://huggingface.co/funasr/paraformer-tp) ) | 语音识别,带时间戳输出,非实时 | 60000小时,中文 | 220M | -| paraformer-zh-streaming
( [⭐](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [🤗](https://huggingface.co/funasr/paraformer-zh-streaming) ) | 语音识别,实时 | 60000小时,中文 | 220M | -| paraformer-en
( [⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [🤗](https://huggingface.co/funasr/paraformer-en) ) | 语音识别,非实时 | 50000小时,英文 | 220M | -| conformer-en
( [⭐](https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary) [🤗](https://huggingface.co/funasr/conformer-en) ) | 语音识别,非实时 | 50000小时,英文 | 220M | -| ct-punc
( [⭐](https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) [🤗](https://huggingface.co/funasr/ct-punc) ) | 标点恢复 | 100M,中文与英文 | 1.1G | -| fsmn-vad
( [⭐](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) [🤗](https://huggingface.co/funasr/fsmn-vad) ) | 语音端点检测,实时 | 5000小时,中文与英文 | 0.4M | -| fa-zh
( [⭐](https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) [🤗](https://huggingface.co/funasr/fa-zh) ) | 字级别时间戳预测 | 50000小时,中文 | 38M | -| cam++
( [⭐](https://modelscope.cn/models/iic/speech_campplus_sv_zh-cn_16k-common/summary) [🤗](https://huggingface.co/funasr/campplus) ) | 说话人确认/分割 | 5000小时 | 7.2M | -| Whisper-large-v2
([⭐](https://www.modelscope.cn/models/iic/speech_whisper-large_asr_multilingual/summary) [🍀](https://github.com/openai/whisper) ) | 语音识别,带时间戳输出,非实时 | 多语言 | 1G | -| Whisper-large-v3
([⭐](https://www.modelscope.cn/models/iic/Whisper-large-v3/summary) [🍀](https://github.com/openai/whisper) ) | 语音识别,带时间戳输出,非实时 | 多语言 | 1G | - +| 模型名字 | 任务详情 | 训练数据 | 参数量 | +|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------:|:------------:|:----:| +| paraformer-zh
([⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) [🤗](https://huggingface.co/funasr/paraformer-tp) ) | 语音识别,带时间戳输出,非实时 | 60000小时,中文 | 220M | +| paraformer-zh-streaming
( [⭐](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [🤗](https://huggingface.co/funasr/paraformer-zh-streaming) ) | 语音识别,实时 | 60000小时,中文 | 220M | +| paraformer-en
( [⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [🤗](https://huggingface.co/funasr/paraformer-en) ) | 语音识别,非实时 | 50000小时,英文 | 220M | +| conformer-en
( [⭐](https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary) [🤗](https://huggingface.co/funasr/conformer-en) ) | 语音识别,非实时 | 50000小时,英文 | 220M | +| ct-punc
( [⭐](https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) [🤗](https://huggingface.co/funasr/ct-punc) ) | 标点恢复 | 100M,中文与英文 | 1.1G | +| fsmn-vad
( [⭐](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) [🤗](https://huggingface.co/funasr/fsmn-vad) ) | 语音端点检测,实时 | 5000小时,中文与英文 | 0.4M | +| fa-zh
( [⭐](https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) [🤗](https://huggingface.co/funasr/fa-zh) ) | 字级别时间戳预测 | 50000小时,中文 | 38M | +| cam++
( [⭐](https://modelscope.cn/models/iic/speech_campplus_sv_zh-cn_16k-common/summary) [🤗](https://huggingface.co/funasr/campplus) ) | 说话人确认/分割 | 5000小时 | 7.2M | +| Whisper-large-v3
([⭐](https://www.modelscope.cn/models/iic/Whisper-large-v3/summary) [🍀](https://github.com/openai/whisper) ) | 语音识别,带时间戳输出,非实时 | 多语言 | 1G | +| Qwen-Audio
([⭐](examples/industrial_data_pretraining/qwen_audio/demo.py) [🤗](https://huggingface.co/Qwen/Qwen-Audio) ) | 音频文本多模态大模型(预训练) | 多语言 | 8B | +| Qwen-Audio-Chat
([⭐](examples/industrial_data_pretraining/qwen_audio/demo_chat.py) [🤗](https://huggingface.co/Qwen/Qwen-Audio-Chat) ) | 音频文本多模态大模型(chat版本) | 多语言 | 8B | ## 快速开始 diff --git a/examples/industrial_data_pretraining/llm_asr/conf/template.yaml b/examples/industrial_data_pretraining/llm_asr/conf/template.yaml new file mode 100644 index 000000000..3c51ff423 --- /dev/null +++ b/examples/industrial_data_pretraining/llm_asr/conf/template.yaml @@ -0,0 +1,89 @@ +# This is an example that demonstrates how to configure a model file. +# You can modify the configuration according to your own requirements. + +# to print the register_table: +# from funasr.register import tables +# tables.print() + +# network architecture +model: LLMASR +model_conf: + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: true + +# encoder +encoder: WhisperWarp +encoder_conf: + hub: funasr + init_param_path: "/nfs/maziyang.mzy/models/Whisper-large-v2" + freeze: true + +llm: Vicuna +llm_conf: + hub: hf + init_param_path: "/nfs/maziyang.mzy/models/vicuna-7b-v1.5" + freeze: true + +adaptor: Linear +adaptor_conf: + downsample_rate: 5 + llm_dim: 4096 + encoder_dim: 512 + +# frontend related +frontend: WhisperFrontend +frontend_conf: + fs: 16000 + whisper_model: large + do_pad_trim: true + + +specaug: SpecAugLFR +specaug_conf: + apply_time_warp: false + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 30 + lfr_rate: 6 + num_freq_mask: 1 + apply_time_mask: true + time_mask_width_range: + - 0 + - 12 + num_time_mask: 1 + +train_conf: + accum_grad: 1 + grad_clip: 5 + max_epoch: 150 + keep_nbest_models: 10 + log_interval: 10 + +optim: adamw +optim_conf: + lr: 0.0001 + weight_decay: 0.000001 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 1500 + +dataset: AudioLLMDataset +dataset_conf: + index_ds: IndexDSJsonl + batch_sampler: RankFullLocalShuffleBatchSampler + batch_type: example # example or length + batch_size: 8 # if batch_type is 
example, batch_size is the number of samples; if length, batch_size is source_token_len+target_token_len; + max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length, + buffer_size: 500 + shuffle: True + num_workers: 4 + preprocessor_text: TextPreprocessRemovePunctuation + +tokenizer: HuggingfaceTokenizer +tokenizer_conf: + unk_symbol: + init_param_path: "/nfs/maziyang.mzy/models/vicuna-7b-v1.5" + diff --git a/examples/industrial_data_pretraining/llm_asr/demo_infer.sh b/examples/industrial_data_pretraining/llm_asr/demo_infer.sh new file mode 100644 index 000000000..f8ebc4cf2 --- /dev/null +++ b/examples/industrial_data_pretraining/llm_asr/demo_infer.sh @@ -0,0 +1,14 @@ +# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. +# MIT License (https://opensource.org/licenses/MIT) + + + +python -m funasr.bin.inference \ +--config-path="/root/FunASR/examples/aishell/llm_asr_nar/conf" \ +--config-name="template.yaml" \ +++init_param="/mnt/workspace/FunASR/examples/aishell/paraformer/exp/baseline_paraformer_conformer_12e_6d_2048_256_zh_char_exp3/model.pt.ep38" \ +++input="/nfs/beinian.lzr/workspace/datasets/data/16k/opendata/aishell1/dev/wav/S0724/BAC009S0724W0121.wav" \ +++scope_map="encoder.model,audio_encoder,encoder_projector,adaptor" \ +++output_dir="./outputs/debug" \ +++device="cpu" \ + diff --git a/examples/industrial_data_pretraining/llm_asr/demo_train_or_finetune.sh b/examples/industrial_data_pretraining/llm_asr/demo_train_or_finetune.sh new file mode 100644 index 000000000..a518d57ac --- /dev/null +++ b/examples/industrial_data_pretraining/llm_asr/demo_train_or_finetune.sh @@ -0,0 +1,47 @@ +# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. 
+# MIT License (https://opensource.org/licenses/MIT) + + +# which gpu to train or finetune +export CUDA_VISIBLE_DEVICES="0" +gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') + +# data dir, which contains: train.json, val.json, tokens.jsonl/tokens.txt, am.mvn +#data_dir="/Users/zhifu/funasr1.0/data/list" + +## generate jsonl from wav.scp and text.txt +#python -m funasr.datasets.audio_datasets.scp2jsonl \ +#++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \ +#++data_type_list='["source", "target"]' \ +#++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl + +train_data="/nfs/zhifu.gzf/data/datalist/aishell1_aishell2_wav_speech_llm_train_data_del_tail500.json" +val_data="/nfs/zhifu.gzf/data/datalist/aishell1_aishell2_wav_speech_llm_train_data_tail500.json" + +# exp output dir +output_dir="/Users/zhifu/exp" +log_file="${output_dir}/log.txt" + +workspace=`pwd` +config="template.yaml" + +init_param="${output_dir}/model.pt" + +mkdir -p ${output_dir} +echo "log_file: ${log_file}" + +torchrun \ +--nnodes 1 \ +--nproc_per_node ${gpu_num} \ +../../../funasr/bin/train.py \ +--config-path "${workspace}/conf" \ +--config-name "${config}" \ +++train_data_set_list="${train_data}" \ +++valid_data_set_list="${val_data}" \ +++dataset_conf.batch_size=2 \ +++dataset_conf.batch_type="example" \ +++dataset_conf.num_workers=0 \ +++train_conf.max_epoch=11 \ +++optim_conf.lr=0.0002 \ +++init_param="${init_param}" \ +++output_dir="${output_dir}" &> ${log_file} diff --git a/funasr/models/qwen_audio/model.py b/funasr/models/qwen_audio/model.py index 3eba026c2..e419b1ef0 100644 --- a/funasr/models/qwen_audio/model.py +++ b/funasr/models/qwen_audio/model.py @@ -20,6 +20,11 @@ from funasr.register import tables @tables.register("model_classes", "QwenAudio") @tables.register("model_classes", "QwenAudioWarp") class QwenAudioWarp(nn.Module): + """ + Qwen-Audio: Advancing Universal Audio Understanding via 
Unified Large-Scale Audio-Language Models + https://arxiv.org/abs/2311.07919 + Modified from https://github.com/QwenLM/Qwen-Audio + """ def __init__(self, *args, **kwargs): super().__init__() @@ -72,6 +77,11 @@ class QwenAudioWarp(nn.Module): @tables.register("model_classes", "QwenAudioChatWarp") class QwenAudioChatWarp(nn.Module): def __init__(self, *args, **kwargs): + """ + Qwen-Audio: Advancing Universal Audio Understanding via Unified Large-Scale Audio-Language Models + https://arxiv.org/abs/2311.07919 + Modified from https://github.com/QwenLM/Qwen-Audio + """ super().__init__() model_or_path = kwargs.get("model_path", "QwenAudio") diff --git a/funasr/tokenizer/hf_tokenizer.py b/funasr/tokenizer/hf_tokenizer.py index c856b3d5d..81f553d79 100644 --- a/funasr/tokenizer/hf_tokenizer.py +++ b/funasr/tokenizer/hf_tokenizer.py @@ -2,7 +2,8 @@ try: from transformers import AutoTokenizer except: - print("If you want to use hugging, please `pip install -U transformers`") + # print("If you want to use hugging, please `pip install -U transformers`") + pass from funasr.register import tables diff --git a/funasr/tokenizer/whisper_tokenizer.py b/funasr/tokenizer/whisper_tokenizer.py index f41c8235e..3fb5b645c 100644 --- a/funasr/tokenizer/whisper_tokenizer.py +++ b/funasr/tokenizer/whisper_tokenizer.py @@ -2,7 +2,7 @@ try: from whisper.tokenizer import get_tokenizer except: - print("If you want to use hugging, please `pip install -U transformers`") + print("Notice: If you want to use whisper, please `pip install -U openai-whisper`") from funasr.register import tables