From 2502160fcdfaaaab35c6691484d7dda04bb5720d Mon Sep 17 00:00:00 2001 From: "shixian.shi" Date: Tue, 9 May 2023 21:02:22 +0800 Subject: [PATCH] update sentence timestamp for ClipVedio --- funasr/utils/timestamp_tools.py | 38 +++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/funasr/utils/timestamp_tools.py b/funasr/utils/timestamp_tools.py index 87cc49eee..489d317d5 100644 --- a/funasr/utils/timestamp_tools.py +++ b/funasr/utils/timestamp_tools.py @@ -94,19 +94,33 @@ def time_stamp_sentence(punc_id_list, time_stamp_postprocessed, text_postprocess res.append({ 'text': text_postprocessed.split(), "start": time_stamp_postprocessed[0][0], - "end": time_stamp_postprocessed[-1][1] + "end": time_stamp_postprocessed[-1][1], + 'text_seg': text_postprocessed.split(), + "ts_list": time_stamp_postprocessed, }) return res if len(punc_id_list) != len(time_stamp_postprocessed): print(" warning length mistach!!!!!!") - sentence_text = '' + sentence_text = "" + sentence_text_seg = "" + ts_list = [] sentence_start = time_stamp_postprocessed[0][0] sentence_end = time_stamp_postprocessed[0][1] texts = text_postprocessed.split() punc_stamp_text_list = list(zip_longest(punc_id_list, time_stamp_postprocessed, texts, fillvalue=None)) for punc_stamp_text in punc_stamp_text_list: punc_id, time_stamp, text = punc_stamp_text - sentence_text += text if text is not None else '' + # sentence_text += text if text is not None else '' + if text is not None: + if 'a' <= text[0] <= 'z' or 'A' <= text[0] <= 'Z': + sentence_text += ' ' + text + elif len(sentence_text) and ('a' <= sentence_text[-1] <= 'z' or 'A' <= sentence_text[-1] <= 'Z'): + sentence_text += ' ' + text + else: + sentence_text += text + sentence_text_seg += text + ' ' + ts_list.append(time_stamp) + punc_id = int(punc_id) if punc_id is not None else 1 sentence_end = time_stamp[1] if time_stamp is not None else sentence_end @@ -115,27 +129,39 @@ def time_stamp_sentence(punc_id_list, time_stamp_postprocessed, text_postprocess res.append({ 'text': sentence_text, "start": sentence_start, - "end": sentence_end + "end": sentence_end, + "text_seg": sentence_text_seg, + "ts_list": ts_list }) sentence_text = '' + sentence_text_seg = '' + ts_list = [] sentence_start = sentence_end elif punc_id == 3: sentence_text += '.' res.append({ 'text': sentence_text, "start": sentence_start, - "end": sentence_end + "end": sentence_end, + "text_seg": sentence_text_seg, + "ts_list": ts_list }) sentence_text = '' + sentence_text_seg = '' + ts_list = [] sentence_start = sentence_end elif punc_id == 4: sentence_text += '?' res.append({ 'text': sentence_text, "start": sentence_start, - "end": sentence_end + "end": sentence_end, + "text_seg": sentence_text_seg, + "ts_list": ts_list }) sentence_text = '' + sentence_text_seg = '' + ts_list = [] sentence_start = sentence_end return res