From f2406e2d61218c848bfd6da933c36956a9b0a5aa Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: Tue, 5 Mar 2024 22:39:51 +0800
Subject: [PATCH] qwenaudio qwenaudiochat (#1433)

---
 README.md                                     |  5 +-
 README_zh.md                                  | 29 +++---
 .../llm_asr/conf/template.yaml                | 89 +++++++++++++++++++
 .../llm_asr/demo_infer.sh                     | 14 +++
 .../llm_asr/demo_train_or_finetune.sh         | 47 ++++++++++
 funasr/models/qwen_audio/model.py             | 10 +++
 funasr/tokenizer/hf_tokenizer.py              |  3 +-
 funasr/tokenizer/whisper_tokenizer.py         |  2 +-
 8 files changed, 182 insertions(+), 17 deletions(-)
 create mode 100644 examples/industrial_data_pretraining/llm_asr/conf/template.yaml
 create mode 100644 examples/industrial_data_pretraining/llm_asr/demo_infer.sh
 create mode 100644 examples/industrial_data_pretraining/llm_asr/demo_train_or_finetune.sh
diff --git a/README.md b/README.md
index 970c5ebd0..d34249d19 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,8 @@
 
 <a name="whats-new"></a>
 ## What's new:
-- 2024/03/05：Added support for the Whisper-large-v3 model, a multitasking model that can perform multilingual speech recognition, speech translation, and language identification. It can be downloaded from the[modelscope](https://www.modelscope.cn/models/iic/Whisper-large-v3/summary), and [openai](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining/whisper).
+- 2024/03/05：Added the Qwen-Audio and Qwen-Audio-Chat large-scale audio-text multimodal models, which have topped multiple audio domain leaderboards. These models support speech dialogue; see [usage](examples/industrial_data_pretraining/qwen_audio).
+- 2024/03/05：Added support for the Whisper-large-v3 model, a multitasking model that can perform multilingual speech recognition, speech translation, and language identification. It can be downloaded from [modelscope](examples/industrial_data_pretraining/whisper/demo.py) and [openai](examples/industrial_data_pretraining/whisper/demo_from_openai.py).
 - 2024/03/05: Offline File Transcription Service 4.4, Offline File Transcription Service of English 1.5，Real-time Transcription Service 1.9 released，docker image supports ARM64 platform, update modelscope；([docs](runtime/readme.md))
 - 2024/01/30：funasr-1.0 has been released ([docs](https://github.com/alibaba-damo-academy/FunASR/discussions/1319))
 - 2024/01/30：emotion recognition models are new supported. [model link](https://www.modelscope.cn/models/iic/emotion2vec_base_finetuned/summary), modified from [repo](https://github.com/ddlBoJack/emotion2vec).
@@ -83,6 +84,8 @@ FunASR has open-sourced a large number of pre-trained models on industrial data.
 |                                       cam++ <br> ( [⭐](https://modelscope.cn/models/iic/speech_campplus_sv_zh-cn_16k-common/summary) [🤗](https://huggingface.co/funasr/campplus) )                                        |           speaker verification/diarization            |            5000 hours            |    7.2M    | 
 |                                                  Whisper-large-v2 <br> ([⭐](https://www.modelscope.cn/models/iic/speech_whisper-large_asr_multilingual/summary)  [🍀](https://github.com/openai/whisper) )                                                  |  speech recognition, with timestamps, non-streaming   |          multilingual            |    1.5G    |
 |                                                Whisper-large-v3 <br> ([⭐](https://www.modelscope.cn/models/iic/Whisper-large-v3/summary)  [🍀](https://github.com/openai/whisper) )                                                 |  speech recognition, with timestamps, non-streaming   |          multilingual            |    1.5G    |
+|                                         Qwen-Audio <br> ([⭐](examples/industrial_data_pretraining/qwen_audio/demo.py)  [🤗](https://huggingface.co/Qwen/Qwen-Audio) )                                         |      audio-text multimodal models (pretraining)       |     multilingual      |  8B  |
+|                   Qwen-Audio-Chat <br> ([⭐](examples/industrial_data_pretraining/qwen_audio/demo_chat.py)  [🤗](https://huggingface.co/Qwen/Qwen-Audio-Chat) )                                                |          audio-text multimodal models (chat)          |     multilingual      |  8B  |
 
 
 
diff --git a/README_zh.md b/README_zh.md
index e6a106022..83e37fb26 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -29,7 +29,8 @@ FunASR希望在语音识别的学术研究和工业应用之间架起一座桥
 
 <a name="最新动态"></a>
 ## 最新动态
-- 2024/03/05：新增加Whisper-large-v3模型支持，多语言语音识别/翻译/语种识别，支持从[modelscope](https://www.modelscope.cn/models/iic/Whisper-large-v3/summary)仓库下载，也支持从[openai](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining/whisper)仓库下载模型。
+- 2024/03/05：新增加Qwen-Audio与Qwen-Audio-Chat音频文本多模态大模型，在多个音频领域测试榜单刷榜，支持语音对话，详细用法见 [示例](examples/industrial_data_pretraining/qwen_audio)。
+- 2024/03/05：新增加Whisper-large-v3模型支持，多语言语音识别/翻译/语种识别，支持从 [modelscope](examples/industrial_data_pretraining/whisper/demo.py)仓库下载，也支持从 [openai](examples/industrial_data_pretraining/whisper/demo_from_openai.py)仓库下载模型。
 - 2024/03/05: 中文离线文件转写服务 4.4、英文离线文件转写服务 1.5、中文实时语音听写服务 1.9 发布，docker镜像支持arm64平台，升级modelscope版本；详细信息参阅([部署文档](runtime/readme_cn.md))
 - 2024/01/30：funasr-1.0发布，更新说明[文档](https://github.com/alibaba-damo-academy/FunASR/discussions/1319)
 - 2024/01/30：新增加情感识别 [模型链接](https://www.modelscope.cn/models/iic/emotion2vec_base_finetuned/summary)，原始模型 [repo](https://github.com/ddlBoJack/emotion2vec).
@@ -73,19 +74,19 @@ FunASR开源了大量在工业数据上预训练模型，您可以在[模型许
 （注：⭐ 表示ModelScope模型仓库，🤗 表示Huggingface模型仓库，🍀表示OpenAI模型仓库）
 
 
-|                                                                                                     模型名字                                                                                                      |      任务详情       |     训练数据     | 参数量  | 
-|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------:|:------------:|:----:|
-|    paraformer-zh <br> ([⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)  [🤗](https://huggingface.co/funasr/paraformer-tp) )    | 语音识别，带时间戳输出，非实时 |  60000小时，中文  | 220M |
-| paraformer-zh-streaming <br> ( [⭐](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [🤗](https://huggingface.co/funasr/paraformer-zh-streaming) ) |     语音识别，实时     |  60000小时，中文  | 220M |
-|         paraformer-en <br> ( [⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [🤗](https://huggingface.co/funasr/paraformer-en) )         |    语音识别，非实时     |  50000小时，英文  | 220M |
-|                      conformer-en <br> ( [⭐](https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary) [🤗](https://huggingface.co/funasr/conformer-en) )                      |    语音识别，非实时     |  50000小时，英文  | 220M |
-|                        ct-punc <br> ( [⭐](https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) [🤗](https://huggingface.co/funasr/ct-punc) )                         |      标点恢复       |  100M，中文与英文  | 1.1G | 
-|                            fsmn-vad <br> ( [⭐](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) [🤗](https://huggingface.co/funasr/fsmn-vad) )                             |    语音端点检测，实时    | 5000小时，中文与英文 | 0.4M | 
-|                              fa-zh <br> ( [⭐](https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) [🤗](https://huggingface.co/funasr/fa-zh) )                               |    字级别时间戳预测     |  50000小时，中文  | 38M  |
-|                                 cam++ <br> ( [⭐](https://modelscope.cn/models/iic/speech_campplus_sv_zh-cn_16k-common/summary) [🤗](https://huggingface.co/funasr/campplus) )                                 |    说话人确认/分割     |    5000小时    | 7.2M | 
-|                           Whisper-large-v2 <br> ([⭐](https://www.modelscope.cn/models/iic/speech_whisper-large_asr_multilingual/summary)  [🍀](https://github.com/openai/whisper) )                           | 语音识别，带时间戳输出，非实时 |     多语言      |  1G  |
-|                         Whisper-large-v3 <br> ([⭐](https://www.modelscope.cn/models/iic/Whisper-large-v3/summary)  [🍀](https://github.com/openai/whisper) )                          | 语音识别，带时间戳输出，非实时 |     多语言      |  1G  |
-
+|                                                                                                     模型名字                                                                                                      |        任务详情        |     训练数据     | 参数量  | 
+|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------:|:------------:|:----:|
+|    paraformer-zh <br> ([⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)  [🤗](https://huggingface.co/funasr/paraformer-tp) )    |  语音识别，带时间戳输出，非实时   |  60000小时，中文  | 220M |
+| paraformer-zh-streaming <br> ( [⭐](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [🤗](https://huggingface.co/funasr/paraformer-zh-streaming) ) |      语音识别，实时       |  60000小时，中文  | 220M |
+|         paraformer-en <br> ( [⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [🤗](https://huggingface.co/funasr/paraformer-en) )         |      语音识别，非实时      |  50000小时，英文  | 220M |
+|                      conformer-en <br> ( [⭐](https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary) [🤗](https://huggingface.co/funasr/conformer-en) )                      |      语音识别，非实时      |  50000小时，英文  | 220M |
+|                        ct-punc <br> ( [⭐](https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) [🤗](https://huggingface.co/funasr/ct-punc) )                         |        标点恢复        |  100M，中文与英文  | 1.1G | 
+|                            fsmn-vad <br> ( [⭐](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) [🤗](https://huggingface.co/funasr/fsmn-vad) )                             |     语音端点检测，实时      | 5000小时，中文与英文 | 0.4M | 
+|                              fa-zh <br> ( [⭐](https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) [🤗](https://huggingface.co/funasr/fa-zh) )                               |      字级别时间戳预测      |  50000小时，中文  | 38M  |
+|                                 cam++ <br> ( [⭐](https://modelscope.cn/models/iic/speech_campplus_sv_zh-cn_16k-common/summary) [🤗](https://huggingface.co/funasr/campplus) )                                 |      说话人确认/分割      |    5000小时    | 7.2M | 
+|                                     Whisper-large-v3 <br> ([⭐](https://www.modelscope.cn/models/iic/Whisper-large-v3/summary)  [🍀](https://github.com/openai/whisper) )                                      |  语音识别，带时间戳输出，非实时   |     多语言      |  1G  |
+|                                         Qwen-Audio <br> ([⭐](examples/industrial_data_pretraining/qwen_audio/demo.py)  [🤗](https://huggingface.co/Qwen/Qwen-Audio) )                                         |  音频文本多模态大模型（预训练）   |     多语言      |  8B  |
+|                   Qwen-Audio-Chat <br> ([⭐](examples/industrial_data_pretraining/qwen_audio/demo_chat.py)  [🤗](https://huggingface.co/Qwen/Qwen-Audio-Chat) )                                                | 音频文本多模态大模型（chat版本） |     多语言      |  8B  |
 
 <a name="快速开始"></a>
 ## 快速开始
diff --git a/examples/industrial_data_pretraining/llm_asr/conf/template.yaml b/examples/industrial_data_pretraining/llm_asr/conf/template.yaml
new file mode 100644
index 000000000..3c51ff423
--- /dev/null
+++ b/examples/industrial_data_pretraining/llm_asr/conf/template.yaml
@@ -0,0 +1,89 @@
+# This is an example that demonstrates how to configure a model file.
+# You can modify the configuration according to your own requirements.
+
+# to print the register_table:
+# from funasr.register import tables
+# tables.print()
+
+# network architecture
+model: LLMASR
+model_conf:
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: true
+
+# encoder
+encoder: WhisperWarp
+encoder_conf:
+    hub: funasr
+    init_param_path: "/nfs/maziyang.mzy/models/Whisper-large-v2"
+    freeze: true
+
+llm: Vicuna
+llm_conf:
+  hub: hf
+  init_param_path: "/nfs/maziyang.mzy/models/vicuna-7b-v1.5"
+  freeze: true
+
+adaptor: Linear
+adaptor_conf:
+  downsample_rate: 5
+  llm_dim: 4096
+  encoder_dim: 512
+
+# frontend related
+frontend: WhisperFrontend
+frontend_conf:
+    fs: 16000
+    whisper_model: large
+    do_pad_trim: true
+
+
+specaug: SpecAugLFR
+specaug_conf:
+    apply_time_warp: false
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 30
+    lfr_rate: 6
+    num_freq_mask: 1
+    apply_time_mask: true
+    time_mask_width_range:
+    - 0
+    - 12
+    num_time_mask: 1
+
+train_conf:
+  accum_grad: 1
+  grad_clip: 5
+  max_epoch: 150
+  keep_nbest_models: 10
+  log_interval: 10
+
+optim: adamw
+optim_conf:
+   lr: 0.0001
+   weight_decay: 0.000001
+scheduler: warmuplr
+scheduler_conf:
+   warmup_steps: 1500
+
+dataset: AudioLLMDataset
+dataset_conf:
+    index_ds: IndexDSJsonl
+    batch_sampler: RankFullLocalShuffleBatchSampler
+    batch_type: example # example or length
+    batch_size: 8 # if batch_type is example, batch_size is the number of samples; if length, batch_size is source_token_len+target_token_len
+    max_token_length: 2048 # filter out samples whose source_token_len+target_token_len > max_token_length
+    buffer_size: 500
+    shuffle: True
+    num_workers: 4
+    preprocessor_text: TextPreprocessRemovePunctuation
+
+tokenizer: HuggingfaceTokenizer
+tokenizer_conf:
+  unk_symbol: <unk>
+  init_param_path: "/nfs/maziyang.mzy/models/vicuna-7b-v1.5"
+
diff --git a/examples/industrial_data_pretraining/llm_asr/demo_infer.sh b/examples/industrial_data_pretraining/llm_asr/demo_infer.sh
new file mode 100644
index 000000000..f8ebc4cf2
--- /dev/null
+++ b/examples/industrial_data_pretraining/llm_asr/demo_infer.sh
@@ -0,0 +1,14 @@
+# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
+#  MIT License  (https://opensource.org/licenses/MIT)
+
+
+
+python -m funasr.bin.inference \
+--config-path="/root/FunASR/examples/aishell/llm_asr_nar/conf" \
+--config-name="template.yaml" \
+++init_param="/mnt/workspace/FunASR/examples/aishell/paraformer/exp/baseline_paraformer_conformer_12e_6d_2048_256_zh_char_exp3/model.pt.ep38" \
+++input="/nfs/beinian.lzr/workspace/datasets/data/16k/opendata/aishell1/dev/wav/S0724/BAC009S0724W0121.wav" \
+++scope_map="encoder.model,audio_encoder,encoder_projector,adaptor" \
+++output_dir="./outputs/debug" \
+++device="cpu" \
+
diff --git a/examples/industrial_data_pretraining/llm_asr/demo_train_or_finetune.sh b/examples/industrial_data_pretraining/llm_asr/demo_train_or_finetune.sh
new file mode 100644
index 000000000..a518d57ac
--- /dev/null
+++ b/examples/industrial_data_pretraining/llm_asr/demo_train_or_finetune.sh
@@ -0,0 +1,47 @@
+# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
+#  MIT License  (https://opensource.org/licenses/MIT)
+
+
+# which gpu to train or finetune
+export CUDA_VISIBLE_DEVICES="0"
+gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+
+# data dir, which contains: train.json, val.json, tokens.jsonl/tokens.txt, am.mvn
+#data_dir="/Users/zhifu/funasr1.0/data/list"
+
+## generate jsonl from wav.scp and text.txt
+#python -m funasr.datasets.audio_datasets.scp2jsonl \
+#++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \
+#++data_type_list='["source", "target"]' \
+#++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl
+
+train_data="/nfs/zhifu.gzf/data/datalist/aishell1_aishell2_wav_speech_llm_train_data_del_tail500.json"
+val_data="/nfs/zhifu.gzf/data/datalist/aishell1_aishell2_wav_speech_llm_train_data_tail500.json"
+
+# exp output dir
+output_dir="/Users/zhifu/exp"
+log_file="${output_dir}/log.txt"
+
+workspace=`pwd`
+config="template.yaml"
+
+init_param="${output_dir}/model.pt"
+
+mkdir -p ${output_dir}
+echo "log_file: ${log_file}"
+
+torchrun \
+--nnodes 1 \
+--nproc_per_node ${gpu_num} \
+../../../funasr/bin/train.py \
+--config-path "${workspace}/conf" \
+--config-name "${config}" \
+++train_data_set_list="${train_data}" \
+++valid_data_set_list="${val_data}" \
+++dataset_conf.batch_size=2 \
+++dataset_conf.batch_type="example" \
+++dataset_conf.num_workers=0 \
+++train_conf.max_epoch=11 \
+++optim_conf.lr=0.0002 \
+++init_param="${init_param}" \
+++output_dir="${output_dir}" &> ${log_file}
diff --git a/funasr/models/qwen_audio/model.py b/funasr/models/qwen_audio/model.py
index 3eba026c2..e419b1ef0 100644
--- a/funasr/models/qwen_audio/model.py
+++ b/funasr/models/qwen_audio/model.py
@@ -20,6 +20,11 @@ from funasr.register import tables
 @tables.register("model_classes", "QwenAudio")
 @tables.register("model_classes", "QwenAudioWarp")
 class QwenAudioWarp(nn.Module):
+    """
+    Qwen-Audio: Advancing Universal Audio Understanding via Unified Large-Scale Audio-Language Models
+    https://arxiv.org/abs/2311.07919
+    Modified from https://github.com/QwenLM/Qwen-Audio
+    """
     def __init__(self, *args, **kwargs):
         super().__init__()
 
@@ -72,6 +77,11 @@ class QwenAudioWarp(nn.Module):
 @tables.register("model_classes", "QwenAudioChatWarp")
 class QwenAudioChatWarp(nn.Module):
     def __init__(self, *args, **kwargs):
+        """
+        Qwen-Audio: Advancing Universal Audio Understanding via Unified Large-Scale Audio-Language Models
+        https://arxiv.org/abs/2311.07919
+        Modified from https://github.com/QwenLM/Qwen-Audio
+        """
         super().__init__()
         
         model_or_path = kwargs.get("model_path", "QwenAudio")
diff --git a/funasr/tokenizer/hf_tokenizer.py b/funasr/tokenizer/hf_tokenizer.py
index c856b3d5d..81f553d79 100644
--- a/funasr/tokenizer/hf_tokenizer.py
+++ b/funasr/tokenizer/hf_tokenizer.py
@@ -2,7 +2,8 @@
 try:
 	from transformers import AutoTokenizer
 except:
-	print("If you want to use hugging, please `pip install -U transformers`")
+	# print("If you want to use hugging, please `pip install -U transformers`")
+	pass
 
 from funasr.register import tables
 
diff --git a/funasr/tokenizer/whisper_tokenizer.py b/funasr/tokenizer/whisper_tokenizer.py
index f41c8235e..3fb5b645c 100644
--- a/funasr/tokenizer/whisper_tokenizer.py
+++ b/funasr/tokenizer/whisper_tokenizer.py
@@ -2,7 +2,7 @@
 try:
 	from whisper.tokenizer import get_tokenizer
 except:
-	print("If you want to use hugging, please `pip install -U transformers`")
+	print("Notice: If you want to use whisper, please `pip install -U openai-whisper`")
 
 from funasr.register import tables