FunASR/funasr/datasets/large_datasets/utils/tokenize.py

#!/usr/bin/env python

import numpy as np


def tokenize(data, vocab=None):
    """Replace each symbol in data["text"] with its vocabulary id."""
    assert "text" in data
    assert isinstance(vocab, dict)

    text = data["text"]
    token = []
    for x in text:
        if x in vocab:
            token.append(vocab[x])
        else:
            # Out-of-vocabulary symbols fall back to the unknown-token id.
            token.append(vocab["<unk>"])

    data["text"] = np.array(token)
    return data
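

if __name__ == "__main__":
    # Minimal usage sketch (hypothetical vocab and input, not part of the
    # original file): a tiny character-level vocabulary that includes the
    # required "<unk>" entry.
    demo_vocab = {"<unk>": 0, "h": 1, "i": 2}
    sample = {"text": "hi!"}
    # "!" is not in demo_vocab, so it maps to the <unk> id.
    print(tokenize(sample, vocab=demo_vocab)["text"])  # -> [1 2 0]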