# api.py -- FastAPI server exposing SenseVoice speech recognition over HTTP.
#
# Select the inference device with an environment variable (default cuda:0):
#   export SENSEVOICE_DEVICE=cuda:1
#
# NOTE(review): recovered from a mangled git patch; the same patch also adds
# "fastapi>=0.111.1" to requirements.txt alongside the existing dependencies.

import io
import os
import re
from enum import Enum
from typing import List

from fastapi import FastAPI, File, Form
from fastapi.responses import HTMLResponse
from typing_extensions import Annotated

import torchaudio
from funasr.utils.postprocess_utils import rich_transcription_postprocess
from model import SenseVoiceSmall


class Language(str, Enum):
    """Languages accepted by the ASR endpoint ("auto" lets the model decide)."""
    auto = "auto"
    zh = "zh"
    en = "en"
    yue = "yue"
    ja = "ja"
    ko = "ko"
    nospeech = "nospeech"


# Load the model once at import time; requests share this instance.
model_dir = "iic/SenseVoiceSmall"
m, kwargs = SenseVoiceSmall.from_pretrained(
    model=model_dir, device=os.getenv("SENSEVOICE_DEVICE", "cuda:0")
)
m.eval()

# Matches one special model token such as <|zh|> or <|NEUTRAL|>.
# Non-greedy ".*?" so text between two separate tokens survives -- a greedy
# ".*" would delete everything from the first "<|" to the last "|>".
regex = r"<\|.*?\|>"

app = FastAPI()


@app.get("/", response_class=HTMLResponse)
async def root():
    """Landing page linking to the auto-generated API docs at /docs."""
    return """
    <!DOCTYPE html>
    <html lang="en">
      <head>
        <meta charset="utf-8">
        <title>Api information</title>
      </head>
      <body>
        <a href="/docs">Documents of API</a>
      </body>
    </html>
    """


@app.post("/api/v1/asr")
async def create_files(
    files: Annotated[List[bytes], File(description="wav or mp3 audios in 16KHz")],
    keys: Annotated[str, Form(description="name of each audio joined with comma")],
    lang: Annotated[Language, Form(description="language of audio content")] = "auto",
):
    """Transcribe the uploaded audio files.

    Returns ``{"result": [...]}`` where each item carries the model's raw
    output (``raw_text``), a tag-stripped variant (``clean_text``), and the
    rich post-processed transcript (``text``).
    """
    audios = []
    audio_fs = 0
    for file in files:
        # torchaudio.load() expects a path or file-like object, not raw
        # bytes -- wrap each upload in BytesIO so decoding works in memory.
        waveform, audio_fs = torchaudio.load(io.BytesIO(file))
        # Downmix multi-channel audio to mono (mean over channel dim 0).
        audios.append(waveform.mean(0))
    if lang == "":
        lang = "auto"
    if keys == "":
        key = ["wav_file_tmp_name"]
    else:
        key = keys.split(",")
    res = m.inference(
        data_in=audios,
        language=lang,  # "zh", "en", "yue", "ja", "ko", "nospeech"
        use_itn=False,
        ban_emo_unk=False,
        key=key,
        fs=audio_fs,  # NOTE(review): only the last file's sample rate is used -- confirm uploads share one rate
        **kwargs,
    )
    if len(res) == 0:
        return {"result": []}
    for it in res[0]:
        it["raw_text"] = it["text"]
        it["clean_text"] = re.sub(regex, "", it["text"], 0, re.MULTILINE)
        it["text"] = rich_transcription_postprocess(it["text"])
    return {"result": res[0]}