diff --git a/funasr/datasets/large_datasets/utils/hotword_utils.py b/funasr/datasets/large_datasets/utils/hotword_utils.py
index fccfea696..73f8bdd90 100644
--- a/funasr/datasets/large_datasets/utils/hotword_utils.py
+++ b/funasr/datasets/large_datasets/utils/hotword_utils.py
@@ -6,7 +6,8 @@ def sample_hotword(length,
                    sample_rate,
                    double_rate,
                    pre_prob,
-                   pre_index=None):
+                   pre_index=None,
+                   pre_hwlist=None):
     if length < hotword_min_length:
         return [-1]
     if random.random() < sample_rate:
diff --git a/funasr/datasets/large_datasets/utils/tokenize.py b/funasr/datasets/large_datasets/utils/tokenize.py
index 3bff172a8..c16e1dc7e 100644
--- a/funasr/datasets/large_datasets/utils/tokenize.py
+++ b/funasr/datasets/large_datasets/utils/tokenize.py
@@ -63,7 +63,6 @@ def tokenize(data,
                 if _find != -1:
                     # _find = text[:_find].count(" ") # bpe sometimes
                     pre_index = [_find, _find + max(hw.count(" "), 1)]
-                    # import pdb; pdb.set_trace()
                     break
         hotword_indxs = sample_hotword(length, **hw_config, pre_index=pre_index)
         data['hotword_indxs'] = hotword_indxs
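
For reference, a minimal usage sketch of the widened signature. Only the parameter names and the call shape are taken from the patch; the semantics of pre_hwlist (a caller-supplied list of pre-selected hotwords) are inferred from its name, and all argument values below are illustrative assumptions, not values from the repository.

# Hypothetical call site; values and pre_hwlist semantics are assumptions,
# not taken from the patch. In tokenize.py the same call is made as
# sample_hotword(length, **hw_config, pre_index=pre_index), so sample_rate,
# double_rate, and pre_prob are expected to arrive via hw_config.
from funasr.datasets.large_datasets.utils.hotword_utils import sample_hotword

hotword_indxs = sample_hotword(
    length=12,           # token count of the utterance (assumed)
    sample_rate=0.5,     # gate in the diff: random.random() < sample_rate
    double_rate=0.1,     # assumed: chance of sampling a second hotword
    pre_prob=0.5,        # assumed: chance of reusing the pre-located span
    pre_index=[3, 4],    # [start, end] span, as built in tokenize()
    pre_hwlist=None,     # new parameter added by this patch
)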