mirror of
https://github.com/modelscope/FunASR
synced 2025-09-15 14:48:36 +08:00
18 lines
371 B
Python
18 lines
371 B
Python
#!/usr/bin/env python
|
|
import numpy as np
|
|
|
|
def tokenize(data,
             vocab=None):
    """Replace ``data["text"]`` with an array of vocabulary ids.

    Each element obtained by iterating over ``data["text"]`` is looked up in
    ``vocab``; elements that are not present are mapped to ``vocab['<unk>']``.

    Args:
        data: Mapping that contains a ``"text"`` entry holding an iterable of
            hashable symbols. It is modified in place.
        vocab: ``dict`` mapping symbols to ids. ``'<unk>'`` is only required
            when ``text`` contains a symbol that is missing from ``vocab``.

    Returns:
        The same ``data`` object, with ``data["text"]`` replaced by a
        ``numpy.ndarray`` of the looked-up ids.

    Raises:
        AssertionError: If ``"text"`` is missing from ``data`` or ``vocab`` is
            not a ``dict``.
        KeyError: If an unknown symbol is met and ``vocab`` has no ``'<unk>'``.
    """
    # Kept as assertions so callers observe the same exception type as before.
    assert "text" in data, 'data must contain a "text" entry'
    assert isinstance(vocab, dict), "vocab must be a dict"

    text = data["text"]
    # The '<unk>' lookup stays lazy: it only happens for unknown symbols, so a
    # vocab without '<unk>' still works when every symbol is known.
    token = [vocab[x] if x in vocab else vocab['<unk>'] for x in text]

    if token:
        data["text"] = np.array(token)
    else:
        # np.array([]) would default to float64; keep the empty result an
        # integer array like the non-empty case (assumes ids are ints).
        data["text"] = np.array(token, dtype=int)
    return data
|