shixian.shi 2023-06-27 19:17:26 +08:00
parent 41481008e2
commit bbea0265f1

@@ -59,7 +59,7 @@ def tokenize(data,
     pre_index = None
     for hw in hw_config['pre_hwlist']:
         hw = " ".join(seg_tokenize(hw, seg_dict))
-        _find = " ".join(text.find(hw))
+        _find = " ".join(text).find(hw)
         if _find != -1:
             _find = text[:_find].count(" ") # bpe sometimes
             pre_index = [_find, _find + max(hw.count(" "), 1)]
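For context on the one-line fix: the old expression `" ".join(text.find(hw))` cannot work, because `find` returns an integer (or is not defined at all if `text` is a list of tokens), so its result cannot be joined. The corrected line joins the tokens into one string first, searches for the hotword, and then converts the character offset back to a token index by counting spaces. Below is a minimal sketch of that lookup, assuming `text` is a list of BPE tokens and the hotword `hw` has already been segmented into the same token vocabulary (the `seg_tokenize`/`seg_dict` step from the diff is skipped); `locate_hotword` is a hypothetical helper name introduced only for illustration.

```python
# Sketch of the corrected hotword lookup; not the repository's actual helper.
def locate_hotword(text, hw):
    """Return [start, end] token indices covering the hotword, or None."""
    joined = " ".join(text)              # token list -> one space-joined string
    char_pos = joined.find(hw)           # character offset of the hotword, or -1
    if char_pos == -1:
        return None
    token_pos = joined[:char_pos].count(" ")   # char offset -> token index
    # Same end-index formula as the diff: at least one token wide.
    return [token_pos, token_pos + max(hw.count(" "), 1)]

# BPE may split a word into sub-tokens; searching the joined string still finds it.
tokens = ["he", "llo", "wor", "ld"]
print(locate_hotword(tokens, "llo wor ld"))  # [1, 3]
```

Counting spaces in the joined prefix is what maps the character position back onto token boundaries, which is exactly why the join has to happen before the `find` rather than around it.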