# Set the device with the environment variable; default is cuda:0
#   export SENSEVOICE_DEVICE=cuda:1
import io
import os
import re
from enum import Enum
from typing import List

import torchaudio
from fastapi import FastAPI, File, Form
from fastapi.responses import HTMLResponse
from typing_extensions import Annotated

from funasr.utils.postprocess_utils import rich_transcription_postprocess
from model import SenseVoiceSmall


class Language(str, Enum):
    """Languages accepted by the ASR endpoint; "auto" lets the model detect."""

    auto = "auto"
    zh = "zh"
    en = "en"
    yue = "yue"
    ja = "ja"
    ko = "ko"
    nospeech = "nospeech"


model_dir = "iic/SenseVoiceSmall"
# Load the model once at process startup; device comes from SENSEVOICE_DEVICE.
m, kwargs = SenseVoiceSmall.from_pretrained(
    model=model_dir, device=os.getenv("SENSEVOICE_DEVICE", "cuda:0")
)
m.eval()

# Matches model special tokens of the form <|...|> so they can be stripped
# from the transcript to produce "clean_text".
regex = r"<\|.*\|>"

app = FastAPI()


@app.get("/", response_class=HTMLResponse)
async def root():
    """Minimal landing page for the service."""
    return """
    Api information
    Documents of API
    """


@app.post("/api/v1/asr")
async def turn_audio_to_text(
    files: Annotated[List[bytes], File(description="wav or mp3 audios in 16KHz")],
    keys: Annotated[str, Form(description="name of each audio joined with comma")],
    lang: Annotated[Language, Form(description="language of audio content")] = "auto",
):
    """Transcribe one or more uploaded audio files with SenseVoiceSmall.

    Parameters
    ----------
    files : list[bytes]
        Raw audio file contents (wav or mp3, 16 kHz per the form description).
    keys : str
        Comma-joined display names, one per uploaded file.
    lang : Language
        Language hint for the model; defaults to automatic detection.

    Returns
    -------
    dict
        ``{"result": [...]}`` where each item carries ``raw_text`` (model
        output), ``clean_text`` (special tokens stripped), and ``text``
        (rich post-processed transcript).
    """
    audios = []
    audio_fs = 0
    for file in files:
        # BUG FIX: torchaudio.load expects a path or a file-like object, not
        # raw bytes — wrap each upload in BytesIO before decoding.
        data_or_path_or_list, audio_fs = torchaudio.load(io.BytesIO(file))
        # Downmix multi-channel audio to mono by averaging across channels.
        data_or_path_or_list = data_or_path_or_list.mean(0)
        audios.append(data_or_path_or_list)
    # NOTE(review): only the LAST file's sample rate is forwarded as `fs`;
    # this assumes all uploads share one rate (16 kHz per the form) — confirm.
    if lang == "":
        lang = "auto"
    # Map the comma-joined names to a list; fall back to a placeholder name.
    if keys == "":
        key = ["wav_file_tmp_name"]
    else:
        key = keys.split(",")
    res = m.inference(
        data_in=audios,
        language=lang,  # "zh", "en", "yue", "ja", "ko", "nospeech"
        use_itn=False,
        ban_emo_unk=False,
        key=key,
        fs=audio_fs,
        **kwargs,
    )
    if len(res) == 0:
        return {"result": []}
    for it in res[0]:
        # Keep three views of the transcript: the raw model output, a
        # token-stripped variant, and the rich post-processed text.
        it["raw_text"] = it["text"]
        it["clean_text"] = re.sub(regex, "", it["text"], 0, re.MULTILINE)
        it["text"] = rich_transcription_postprocess(it["text"])
    return {"result": res[0]}