mirror of
https://github.com/modelscope/FunASR
synced 2025-09-15 14:48:36 +08:00
parent
9595a9432f
commit
753d579531
15
examples/industrial_data_pretraining/qwen_audio/demo.py
Normal file
15
examples/industrial_data_pretraining/qwen_audio/demo.py
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- encoding: utf-8 -*-
|
||||||
|
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||||
|
# MIT License (https://opensource.org/licenses/MIT)
|
||||||
|
|
||||||
|
# To install requirements: pip3 install -U "funasr[llm]"
|
||||||
|
|
||||||
|
from funasr import AutoModel
|
||||||
|
|
||||||
|
model = AutoModel(model="Qwen/Qwen-Audio",
|
||||||
|
model_path=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", language=None)
|
||||||
|
print(res)
|
||||||
26
examples/industrial_data_pretraining/qwen_audio/demo_chat.py
Normal file
26
examples/industrial_data_pretraining/qwen_audio/demo_chat.py
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- encoding: utf-8 -*-
|
||||||
|
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||||
|
# MIT License (https://opensource.org/licenses/MIT)
|
||||||
|
|
||||||
|
# To install requirements: pip3 install -U "funasr[llm]"
|
||||||
|
|
||||||
|
from funasr import AutoModel
|
||||||
|
|
||||||
|
model = AutoModel(model="Qwen/Qwen-Audio-Chat",
|
||||||
|
model_path=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
audio_in = "https://github.com/QwenLM/Qwen-Audio/raw/main/assets/audio/1272-128104-0000.flac"
|
||||||
|
|
||||||
|
# 1st dialogue turn
|
||||||
|
prompt = 'what does the person say?'
|
||||||
|
cache = {"history": None}
|
||||||
|
res = model.generate(input=audio_in, prompt=prompt, cache=cache)
|
||||||
|
print(res)
|
||||||
|
|
||||||
|
prompt = 'Find the start time and end time of the word "middle classes"'
|
||||||
|
# 2nd dialogue turn
|
||||||
|
res = model.generate(input=None, prompt=prompt, cache=cache)
|
||||||
|
print(res)
|
||||||
|
|
||||||
@ -0,0 +1,26 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- encoding: utf-8 -*-
|
||||||
|
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||||
|
# MIT License (https://opensource.org/licenses/MIT)
|
||||||
|
|
||||||
|
# To install requirements: pip3 install -U "funasr[llm]"
|
||||||
|
|
||||||
|
from funasr import AutoModel
|
||||||
|
|
||||||
|
model = AutoModel(model="Qwen/Qwen-Audio-Chat",
|
||||||
|
model_path="/nfs/zhifu.gzf/init_model/qwen/Qwen-Audio-Chat",
|
||||||
|
)
|
||||||
|
|
||||||
|
audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
|
||||||
|
|
||||||
|
# 1st dialogue turn
|
||||||
|
prompt = 'what does the person say?'
|
||||||
|
cache = {"history": None}
|
||||||
|
res = model.generate(input=audio_in, prompt=prompt, cache=cache)
|
||||||
|
print(res)
|
||||||
|
|
||||||
|
prompt = 'Find the start time and end time of the word "middle classes"'
|
||||||
|
# 2nd dialogue turn
|
||||||
|
res = model.generate(input=None, prompt=prompt, cache=cache)
|
||||||
|
print(res)
|
||||||
|
|
||||||
@ -0,0 +1,15 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- encoding: utf-8 -*-
|
||||||
|
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||||
|
# MIT License (https://opensource.org/licenses/MIT)
|
||||||
|
|
||||||
|
# To install requirements: pip3 install -U "funasr[llm]"
|
||||||
|
|
||||||
|
from funasr import AutoModel
|
||||||
|
|
||||||
|
model = AutoModel(model="Qwen/Qwen-Audio",
|
||||||
|
model_path="/nfs/zhifu.gzf/init_model/qwen/Qwen-Audio",
|
||||||
|
)
|
||||||
|
|
||||||
|
res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", language=None)
|
||||||
|
print(res)
|
||||||
@ -245,7 +245,10 @@ class AutoModel:
|
|||||||
|
|
||||||
time1 = time.perf_counter()
|
time1 = time.perf_counter()
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
results, meta_data = model.inference(**batch, **kwargs)
|
res = model.inference(**batch, **kwargs)
|
||||||
|
if isinstance(res, (list, tuple)):
|
||||||
|
results = res[0]
|
||||||
|
meta_data = res[1] if len(res) > 1 else {}
|
||||||
time2 = time.perf_counter()
|
time2 = time.perf_counter()
|
||||||
|
|
||||||
asr_result_list.extend(results)
|
asr_result_list.extend(results)
|
||||||
|
|||||||
@ -13,10 +13,16 @@ def download_model(**kwargs):
|
|||||||
pass
|
pass
|
||||||
elif hub == "openai":
|
elif hub == "openai":
|
||||||
model_or_path = kwargs.get("model")
|
model_or_path = kwargs.get("model")
|
||||||
if model_or_path in name_maps_openai:
|
if os.path.exists(model_or_path):
|
||||||
model_or_path = name_maps_openai[model_or_path]
|
# local path
|
||||||
kwargs["model_path"] = model_or_path
|
kwargs["model_path"] = model_or_path
|
||||||
|
kwargs["model"] = "WhisperWarp"
|
||||||
|
else:
|
||||||
|
# model name
|
||||||
|
if model_or_path in name_maps_openai:
|
||||||
|
model_or_path = name_maps_openai[model_or_path]
|
||||||
|
kwargs["model_path"] = model_or_path
|
||||||
|
|
||||||
return kwargs
|
return kwargs
|
||||||
|
|
||||||
def download_from_ms(**kwargs):
|
def download_from_ms(**kwargs):
|
||||||
@ -24,7 +30,7 @@ def download_from_ms(**kwargs):
|
|||||||
if model_or_path in name_maps_ms:
|
if model_or_path in name_maps_ms:
|
||||||
model_or_path = name_maps_ms[model_or_path]
|
model_or_path = name_maps_ms[model_or_path]
|
||||||
model_revision = kwargs.get("model_revision")
|
model_revision = kwargs.get("model_revision")
|
||||||
if not os.path.exists(model_or_path):
|
if not os.path.exists(model_or_path) and "model_path" not in kwargs:
|
||||||
try:
|
try:
|
||||||
model_or_path = get_or_download_model_dir(model_or_path, model_revision,
|
model_or_path = get_or_download_model_dir(model_or_path, model_revision,
|
||||||
is_training=kwargs.get("is_training"),
|
is_training=kwargs.get("is_training"),
|
||||||
@ -32,7 +38,7 @@ def download_from_ms(**kwargs):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Download: {model_or_path} failed!: {e}")
|
print(f"Download: {model_or_path} failed!: {e}")
|
||||||
|
|
||||||
kwargs["model_path"] = model_or_path
|
# Keep a caller-supplied model_path, but treat an explicit ``model_path=None``
# (as the demo scripts pass) the same as "not given" so the resolved
# model id/path is not discarded and ``None`` never reaches the model class.
kwargs["model_path"] = model_or_path if kwargs.get("model_path") is None else kwargs["model_path"]
|
||||||
|
|
||||||
if os.path.exists(os.path.join(model_or_path, "configuration.json")):
|
if os.path.exists(os.path.join(model_or_path, "configuration.json")):
|
||||||
with open(os.path.join(model_or_path, "configuration.json"), 'r', encoding='utf-8') as f:
|
with open(os.path.join(model_or_path, "configuration.json"), 'r', encoding='utf-8') as f:
|
||||||
|
|||||||
@ -10,6 +10,7 @@ name_maps_ms = {
|
|||||||
"cam++": "damo/speech_campplus_sv_zh-cn_16k-common",
|
"cam++": "damo/speech_campplus_sv_zh-cn_16k-common",
|
||||||
"Whisper-large-v2": "iic/speech_whisper-large_asr_multilingual",
|
"Whisper-large-v2": "iic/speech_whisper-large_asr_multilingual",
|
||||||
"Whisper-large-v3": "iic/Whisper-large-v3",
|
"Whisper-large-v3": "iic/Whisper-large-v3",
|
||||||
|
"Qwen-Audio": "Qwen/Qwen-Audio",
|
||||||
}
|
}
|
||||||
|
|
||||||
name_maps_hf = {
|
name_maps_hf = {
|
||||||
|
|||||||
@ -9,25 +9,84 @@ from torch import Tensor
|
|||||||
from torch import nn
|
from torch import nn
|
||||||
import whisper
|
import whisper
|
||||||
from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
|
from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
|
||||||
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||||
|
from transformers.generation import GenerationConfig
|
||||||
|
|
||||||
from funasr.register import tables
|
from funasr.register import tables
|
||||||
|
|
||||||
|
@tables.register("model_classes", "Qwen/Qwen-Audio")
|
||||||
|
@tables.register("model_classes", "Qwen-Audio")
|
||||||
|
@tables.register("model_classes", "Qwen/QwenAudio")
|
||||||
|
@tables.register("model_classes", "QwenAudio")
|
||||||
@tables.register("model_classes", "QwenAudioWarp")
|
@tables.register("model_classes", "QwenAudioWarp")
|
||||||
class WhisperWarp(nn.Module):
|
class QwenAudioWarp(nn.Module):
|
||||||
def __init__(self, whisper_dims: dict, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
hub = kwargs.get("hub", "funasr")
|
|
||||||
if hub == "openai":
|
model_or_path = kwargs.get("model_path", "QwenAudio")
|
||||||
init_param_path = kwargs.get("init_param_path", "large-v3")
|
model = AutoModelForCausalLM.from_pretrained(model_or_path, device_map="cpu",
|
||||||
model = whisper.load_model(init_param_path)
|
trust_remote_code=True)
|
||||||
else:
|
tokenizer = AutoTokenizer.from_pretrained(model_or_path, trust_remote_code=True)
|
||||||
dims = whisper.model.ModelDimensions(**whisper_dims)
|
|
||||||
model = whisper.model.Whisper(dims=dims)
|
|
||||||
|
|
||||||
self.model = model
|
self.model = model
|
||||||
|
self.tokenizer = tokenizer
|
||||||
|
|
||||||
|
def forward(self, ):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def inference(self,
|
||||||
|
data_in,
|
||||||
|
data_lengths=None,
|
||||||
|
key: list = None,
|
||||||
|
tokenizer=None,
|
||||||
|
frontend=None,
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
|
if kwargs.get("batch_size", 1) > 1:
|
||||||
|
raise NotImplementedError("batch decoding is not implemented")
|
||||||
|
|
||||||
|
|
||||||
|
meta_data = {}
|
||||||
|
# meta_data["batch_data_time"] = -1
|
||||||
|
|
||||||
|
sp_prompt = "<|startoftranscription|><|en|><|transcribe|><|en|><|notimestamps|><|wo_itn|>"
|
||||||
|
query = f"<audio>{data_in[0]}</audio>{sp_prompt}"
|
||||||
|
audio_info = self.tokenizer.process_audio(query)
|
||||||
|
inputs = self.tokenizer(query, return_tensors='pt', audio_info=audio_info)
|
||||||
|
inputs = inputs.to(self.model.device)
|
||||||
|
pred = self.model.generate(**inputs, audio_info=audio_info)
|
||||||
|
# Decode with the model's own tokenizer (the one used above to build the
# query): the ``tokenizer`` argument of inference() defaults to None.
response = self.tokenizer.decode(pred.cpu()[0], skip_special_tokens=False, audio_info=audio_info)
|
||||||
|
|
||||||
|
results = []
|
||||||
|
result_i = {"key": key[0], "text": response}
|
||||||
|
|
||||||
|
results.append(result_i)
|
||||||
|
|
||||||
|
return results, meta_data
|
||||||
|
|
||||||
|
@tables.register("model_classes", "Qwen/Qwen-Audio-Chat")
|
||||||
|
@tables.register("model_classes", "Qwen/QwenAudioChat")
|
||||||
|
@tables.register("model_classes", "Qwen-Audio-Chat")
|
||||||
|
@tables.register("model_classes", "QwenAudioChat")
|
||||||
|
@tables.register("model_classes", "QwenAudioChatWarp")
|
||||||
|
class QwenAudioChatWarp(nn.Module):
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
model_or_path = kwargs.get("model_path", "QwenAudio")
|
||||||
|
bf16 = kwargs.get("bf16", False)
|
||||||
|
fp16 = kwargs.get("fp16", False)
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(model_or_path,
|
||||||
|
device_map="cpu",
|
||||||
|
bf16=bf16,
|
||||||
|
fp16=fp16,
|
||||||
|
trust_remote_code=True)
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(model_or_path, trust_remote_code=True)
|
||||||
|
|
||||||
|
self.model = model
|
||||||
|
self.tokenizer = tokenizer
|
||||||
|
|
||||||
def forward(self, ):
|
def forward(self, ):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -41,45 +100,29 @@ class WhisperWarp(nn.Module):
|
|||||||
):
|
):
|
||||||
if kwargs.get("batch_size", 1) > 1:
|
if kwargs.get("batch_size", 1) > 1:
|
||||||
raise NotImplementedError("batch decoding is not implemented")
|
raise NotImplementedError("batch decoding is not implemented")
|
||||||
|
|
||||||
|
|
||||||
meta_data = {}
|
meta_data = {}
|
||||||
if isinstance(data_in, torch.Tensor) and kwargs.get("data_type", "sound") == "fbank": # fbank
|
|
||||||
speech, speech_lengths = data_in, data_lengths
|
prompt = kwargs.get("prompt", "what does the person say?")
|
||||||
if len(speech.shape) < 3:
|
cache = kwargs.get("cache", {})
|
||||||
speech = speech[None, :, :]
|
history = cache.get("history", None)
|
||||||
if speech_lengths is None:
|
if data_in[0] is not None:
|
||||||
speech_lengths = speech.shape[1]
|
# 1st dialogue turn
|
||||||
|
query = self.tokenizer.from_list_format([
|
||||||
|
{'audio': data_in[0]}, # Either a local path or an url
|
||||||
|
{'text': prompt},
|
||||||
|
])
|
||||||
else:
|
else:
|
||||||
# extract fbank feats
|
query = prompt
|
||||||
time1 = time.perf_counter()
|
response, history = self.model.chat(self.tokenizer, query=query, history=history)
|
||||||
audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000),
|
cache["history"] = history
|
||||||
data_type=kwargs.get("data_type", "sound"),
|
# print(response)
|
||||||
tokenizer=tokenizer)
|
# The person says: "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel".
|
||||||
time2 = time.perf_counter()
|
|
||||||
meta_data["load_data"] = f"{time2 - time1:0.3f}"
|
|
||||||
speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"),
|
|
||||||
frontend=frontend)
|
|
||||||
time3 = time.perf_counter()
|
|
||||||
meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
|
|
||||||
frame_shift = frontend.frame_shift if hasattr(frontend, "frame_shift") else 10
|
|
||||||
lfr_n = frontend.lfr_n if hasattr(frontend, "lfr_n") else 1
|
|
||||||
meta_data["batch_data_time"] = speech_lengths.sum().item() * frame_shift * lfr_n / 1000
|
|
||||||
|
|
||||||
speech = speech.to(device=kwargs["device"])[0, :, :]
|
|
||||||
speech_lengths = speech_lengths.to(device=kwargs["device"])
|
|
||||||
|
|
||||||
# detect the spoken language
|
|
||||||
_, probs = self.model.detect_language(speech)
|
|
||||||
print(f"Detected language: {max(probs, key=probs.get)}")
|
|
||||||
|
|
||||||
# decode the audio
|
|
||||||
options = whisper.DecodingOptions(language=kwargs.get("language", None), fp16=False)
|
|
||||||
result = whisper.decode(self.model, speech, options)
|
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
result_i = {"key": key[0], "text": result.text}
|
result_i = {"key": key[0], "text": response}
|
||||||
|
|
||||||
results.append(result_i)
|
results.append(result_i)
|
||||||
|
|
||||||
return results, meta_data
|
return results, meta_data
|
||||||
|
|
||||||
@ -1 +1 @@
|
|||||||
1.0.13
|
1.0.14
|
||||||
|
|||||||
12
setup.py
12
setup.py
@ -41,6 +41,7 @@ requirements = {
|
|||||||
"jaconv",
|
"jaconv",
|
||||||
"hydra-core>=1.3.2",
|
"hydra-core>=1.3.2",
|
||||||
"tensorboardX",
|
"tensorboardX",
|
||||||
|
"rotary_embedding_torch",
|
||||||
],
|
],
|
||||||
# train: The modules invoked when training only.
|
# train: The modules invoked when training only.
|
||||||
"train": [
|
"train": [
|
||||||
@ -82,6 +83,17 @@ requirements = {
|
|||||||
"sphinx-markdown-tables>=0.0.12",
|
"sphinx-markdown-tables>=0.0.12",
|
||||||
"configargparse>=1.2.1"
|
"configargparse>=1.2.1"
|
||||||
],
|
],
|
||||||
|
"llm":[
|
||||||
|
"transformers>=4.32.0",
|
||||||
|
"accelerate",
|
||||||
|
"tiktoken",
|
||||||
|
"einops",
|
||||||
|
"transformers_stream_generator>=0.0.4",
|
||||||
|
"scipy",
|
||||||
|
"torchvision",
|
||||||
|
"pillow",
|
||||||
|
"matplotlib",
|
||||||
|
],
|
||||||
}
|
}
|
||||||
requirements["all"].extend(requirements["train"])
|
requirements["all"].extend(requirements["train"])
|
||||||
requirements["test"].extend(requirements["train"])
|
requirements["test"].extend(requirements["train"])
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user