mirror of
https://github.com/modelscope/FunASR
synced 2025-09-15 14:48:36 +08:00
82 lines
2.2 KiB
Python
82 lines
2.2 KiB
Python
import logging
|
|
from pathlib import Path
|
|
from typing import Dict
|
|
from typing import List
|
|
from typing import Union
|
|
|
|
from typeguard import check_argument_types
|
|
|
|
|
|
def read_2column_text(path: Union[Path, str]) -> Dict[str, str]:
    """Read a two-column text file into a dict.

    Each line is split on its first run of whitespace: the first token is
    the key and the remainder of the line (with trailing whitespace
    removed) is the value. A line that holds only a key maps that key to
    an empty string. Lines that are empty or contain only whitespace are
    skipped.

    Args:
        path: Path of the UTF-8 encoded text file to read.

    Returns:
        A dict mapping each key to its value string.

    Raises:
        RuntimeError: If the same key appears on more than one line; the
            message names the key, the file and the line number.

    Examples:
        wav.scp:
            key1 /some/path/a.wav
            key2 /some/path/b.wav

        >>> read_2column_text('wav.scp')
        {'key1': '/some/path/a.wav', 'key2': '/some/path/b.wav'}

    """
    assert check_argument_types()

    data = {}
    with Path(path).open("r", encoding="utf-8") as f:
        for linenum, line in enumerate(f, 1):
            sps = line.rstrip().split(maxsplit=1)
            if not sps:
                # Blank line: there is no key to record, and unpacking an
                # empty list below would fail with an opaque ValueError.
                continue
            if len(sps) == 1:
                # Key without a value.
                k, v = sps[0], ""
            else:
                k, v = sps
            if k in data:
                raise RuntimeError(f"{k} is duplicated ({path}:{linenum})")
            data[k] = v
    return data
|
|
|
|
|
|
def load_num_sequence_text(
    path: Union[Path, str], loader_type: str = "csv_int"
) -> Dict[str, List[Union[float, int]]]:
    """Read a text file that maps keys to sequences of numbers.

    The file is first read with ``read_2column_text``; the value part of
    every line is then split on the delimiter selected by ``loader_type``
    and each piece is converted to a number.

    Args:
        path: Path of the text file to read.
        loader_type: Selects the delimiter and the element type:
            ``"text_int"`` (single space, int), ``"text_float"`` (single
            space, float), ``"csv_int"`` (comma, int) or ``"csv_float"``
            (comma, float).

    Returns:
        A dict mapping each key to a list of ints or floats.

    Raises:
        ValueError: If ``loader_type`` is not one of the supported names,
            or if a piece of a value cannot be converted to a number. In
            the latter case the offending path, key and value are logged
            before the exception is re-raised.

    Examples:
        text:
            key1 1 2 3
            key2 34 5 6

        >>> d = load_num_sequence_text('text', loader_type='text_int')
        >>> d["key1"]
        [1, 2, 3]

    """
    assert check_argument_types()
    if loader_type == "text_int":
        delimiter = " "
        dtype = int
    elif loader_type == "text_float":
        delimiter = " "
        dtype = float
    elif loader_type == "csv_int":
        delimiter = ","
        dtype = int
    elif loader_type == "csv_float":
        delimiter = ","
        dtype = float
    else:
        raise ValueError(f"Not supported loader_type={loader_type}")

    # With a csv loader, a file that looks like:
    #   utta 1,0
    #   uttb 3,4,5
    # -> returns {'utta': [1, 0],
    #             'uttb': [3, 4, 5]}
    d = read_2column_text(path)

    # Using for-loop instead of dict-comprehension for debuggability
    retval = {}
    for k, v in d.items():
        try:
            retval[k] = [dtype(i) for i in v.split(delimiter)]
        except (TypeError, ValueError):
            # int()/float() report an unparsable string with ValueError, so
            # it must be caught here for the diagnostic below to be logged.
            logging.error(f'Error happened with path="{path}", id="{k}", value="{v}"')
            raise
    return retval
|