mirror of
https://github.com/modelscope/FunASR
synced 2025-09-15 14:48:36 +08:00
* update * update * update * update onnx * update with main (#1492) * contextual&seaco ONNX export (#1481) * contextual&seaco ONNX export * update ContextualEmbedderExport2 * update ContextualEmbedderExport2 * update code * onnx (#1482) * qwenaudio qwenaudiochat * qwenaudio qwenaudiochat * whisper * whisper * llm * llm * llm * llm * llm * llm * llm * llm * export onnx * export onnx * export onnx * dingding * dingding * llm * doc * onnx * onnx * onnx * onnx * onnx * onnx * v1.0.15 * qwenaudio * qwenaudio * issue doc * update * update * bugfix * onnx * update export calling * update codes * remove useless code * update code --------- Co-authored-by: zhifu gao <zhifu.gzf@alibaba-inc.com> * acknowledge --------- Co-authored-by: Shi Xian <40013335+R1ckShi@users.noreply.github.com> * update onnx * update onnx * train update * train update * train update * train update --------- Co-authored-by: Shi Xian <40013335+R1ckShi@users.noreply.github.com>
62 lines
1.9 KiB
Python
62 lines
1.9 KiB
Python
import os
|
|
import json
|
|
import torch
|
|
import logging
|
|
import hydra
|
|
from omegaconf import DictConfig, OmegaConf
|
|
import concurrent.futures
|
|
import librosa
|
|
import torch.distributed as dist
|
|
|
|
|
|
|
|
def gen_scp_from_jsonl(jsonl_file, data_type_list, wav_scp_file, text_file):
    """Split a JSONL manifest into a ``wav.scp``-style file and a text file.

    Every non-blank line of ``jsonl_file`` is parsed as one JSON object. For
    each object, one ``"<key>\\t<value>"`` line is written to each output file.

    Args:
        jsonl_file: Path of the input JSONL file (read as UTF-8).
        data_type_list: Sequence of two field names. Element 0 names the JSON
            field written to ``wav_scp_file``; element 1 names the JSON field
            written to ``text_file``.
        wav_scp_file: Output path for the ``key<TAB>source`` lines.
        text_file: Output path for the ``key<TAB>target`` lines.

    Raises:
        IndexError: If ``data_type_list`` has fewer than two elements.
        KeyError: If a record lacks ``"key"`` or one of the requested fields.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    source_field, target_field = data_type_list[0], data_type_list[1]

    # The input is opened first so that a missing manifest does not truncate
    # existing outputs. All three handles are closed (and flushed) by the
    # ``with`` statement even when a record fails to parse. Outputs are written
    # as UTF-8 to match the input encoding regardless of the platform locale.
    with open(jsonl_file, encoding="utf-8") as fin, \
            open(wav_scp_file, "w", encoding="utf-8") as wav_f, \
            open(text_file, "w", encoding="utf-8") as text_f:
        for line in fin:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            data = json.loads(line)

            source = data[source_field]
            target = data[target_field]
            if "aishell" in source:
                # Records whose source mentions "aishell" get the spaces
                # stripped out of their transcript.
                target = target.replace(" ", "")
            key = data["key"]

            wav_f.write(f"{key}\t{source}\n")
            text_f.write(f"{key}\t{target}\n")
|
|
|
|
|
|
|
|
@hydra.main(config_name=None, version_base=None)
def main_hydra(cfg: DictConfig):
    """Command-line entry point: read overrides and call ``gen_scp_from_jsonl``.

    Recognised config keys (all optional; the defaults are the original
    author's local test paths):

    * ``scp_file_list``: two output paths, ``(wav_scp_file, text_file)``.
    * ``data_type_list``: two JSON field names, default ``("source", "target")``.
    * ``jsonl_file_in``: path of the input JSONL file.

    ``scp_file_list`` and ``data_type_list`` may each be given either as a real
    list or as a string containing a Python list/tuple literal.

    Args:
        cfg: Configuration object supplied by Hydra.
    """
    # Local import keeps this block self-contained; only needed for
    # string-valued overrides.
    import ast

    kwargs = OmegaConf.to_container(cfg, resolve=True)

    scp_file_list = kwargs.get(
        "scp_file_list",
        ("/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"),
    )
    if isinstance(scp_file_list, str):
        # NOTE(security): this used to be ``eval`` on a command-line string.
        # ``ast.literal_eval`` accepts the same list/tuple literals but cannot
        # execute arbitrary code.
        scp_file_list = ast.literal_eval(scp_file_list)

    data_type_list = kwargs.get("data_type_list", ("source", "target"))
    if isinstance(data_type_list, str):
        # Parse string overrides the same way as scp_file_list; otherwise the
        # string would be indexed character by character downstream.
        data_type_list = ast.literal_eval(data_type_list)

    jsonl_file = kwargs.get("jsonl_file_in", "/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl")
    gen_scp_from_jsonl(jsonl_file, data_type_list, *scp_file_list)
|
|
|
|
|
|
# Example invocation, kept as a bare module-level string (it has no runtime
# effect and serves only as usage documentation):
"""
python -m funasr.datasets.audio_datasets.json2scp \
++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_in=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl
"""

# Run the Hydra entry point only when this module is executed as a script.
if __name__ == "__main__":
    main_hydra()
|
|
|
|
|