# api.py -- FastAPI server exposing SenseVoice speech recognition over HTTP.
#
# Select the inference device with an environment variable (default cuda:0):
#   export SENSEVOICE_DEVICE=cuda:1
#
# NOTE(review): recovered from a mangled git patch; the same patch also adds
# "fastapi>=0.111.1" to requirements.txt alongside the existing dependencies.

import io
import os
import re
from enum import Enum
from typing import List

from fastapi import FastAPI, File, Form
from fastapi.responses import HTMLResponse
from typing_extensions import Annotated

import torchaudio
from funasr.utils.postprocess_utils import rich_transcription_postprocess
from model import SenseVoiceSmall


class Language(str, Enum):
    """Languages accepted by the ASR endpoint ("auto" lets the model decide)."""
    auto = "auto"
    zh = "zh"
    en = "en"
    yue = "yue"
    ja = "ja"
    ko = "ko"
    nospeech = "nospeech"


# Load the model once at import time; requests share this instance.
model_dir = "iic/SenseVoiceSmall"
m, kwargs = SenseVoiceSmall.from_pretrained(
    model=model_dir, device=os.getenv("SENSEVOICE_DEVICE", "cuda:0")
)
m.eval()

# Matches one special model token such as <|zh|> or <|NEUTRAL|>.
# Non-greedy ".*?" so text between two separate tokens survives -- a greedy
# ".*" would delete everything from the first "<|" to the last "|>".
regex = r"<\|.*?\|>"

app = FastAPI()


@app.get("/", response_class=HTMLResponse)
async def root():
    """Landing page linking to the auto-generated API docs at /docs."""
    return """
    <!DOCTYPE html>
    <html lang="en">
      <head>
        <meta charset="utf-8">
        <title>Api information</title>
      </head>
      <body>
        <a href="/docs">Documents of API</a>
      </body>
    </html>
    """


@app.post("/api/v1/asr")
async def create_files(
    files: Annotated[List[bytes], File(description="wav or mp3 audios in 16KHz")],
    keys: Annotated[str, Form(description="name of each audio joined with comma")],
    lang: Annotated[Language, Form(description="language of audio content")] = "auto",
):
    """Transcribe the uploaded audio files.

    Returns ``{"result": [...]}`` where each item carries the model's raw
    output (``raw_text``), a tag-stripped variant (``clean_text``), and the
    rich post-processed transcript (``text``).
    """
    audios = []
    audio_fs = 0
    for file in files:
        # torchaudio.load() expects a path or file-like object, not raw
        # bytes -- wrap each upload in BytesIO so decoding works in memory.
        waveform, audio_fs = torchaudio.load(io.BytesIO(file))
        # Downmix multi-channel audio to mono (mean over channel dim 0).
        audios.append(waveform.mean(0))
    if lang == "":
        lang = "auto"
    if keys == "":
        key = ["wav_file_tmp_name"]
    else:
        key = keys.split(",")
    res = m.inference(
        data_in=audios,
        language=lang,  # "zh", "en", "yue", "ja", "ko", "nospeech"
        use_itn=False,
        ban_emo_unk=False,
        key=key,
        fs=audio_fs,  # NOTE(review): only the last file's sample rate is used -- confirm uploads share one rate
        **kwargs,
    )
    if len(res) == 0:
        return {"result": []}
    for it in res[0]:
        it["raw_text"] = it["text"]
        it["clean_text"] = re.sub(regex, "", it["text"], 0, re.MULTILINE)
        it["text"] = rich_transcription_postprocess(it["text"])
    return {"result": res[0]}