mirror of
https://github.com/modelscope/FunASR
synced 2025-09-15 14:48:36 +08:00
fix compute cer problems
This commit is contained in:
parent
cbbb300743
commit
cf843d144a
@ -45,8 +45,8 @@ def compute_wer(ref_file,
|
|||||||
if out_item['wrong'] > 0:
|
if out_item['wrong'] > 0:
|
||||||
rst['wrong_sentences'] += 1
|
rst['wrong_sentences'] += 1
|
||||||
cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
|
cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
|
||||||
cer_detail_writer.write("ref:" + '\t' + "".join(ref_dict[hyp_key]) + '\n')
|
cer_detail_writer.write("ref:" + '\t' + " ".join(list(map(lambda x: x.lower(), ref_dict[hyp_key]))) + '\n')
|
||||||
cer_detail_writer.write("hyp:" + '\t' + "".join(hyp_dict[hyp_key]) + '\n')
|
cer_detail_writer.write("hyp:" + '\t' + " ".join(list(map(lambda x: x.lower(), hyp_dict[hyp_key]))) + '\n')
|
||||||
|
|
||||||
if rst['Wrd'] > 0:
|
if rst['Wrd'] > 0:
|
||||||
rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)
|
rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)
|
||||||
|
|||||||
@ -74,7 +74,7 @@ def modelscope_infer(params):
|
|||||||
# If text exists, compute CER
|
# If text exists, compute CER
|
||||||
text_in = os.path.join(params["data_dir"], "text")
|
text_in = os.path.join(params["data_dir"], "text")
|
||||||
if os.path.exists(text_in):
|
if os.path.exists(text_in):
|
||||||
text_proc_file = os.path.join(best_recog_path, "token")
|
text_proc_file = os.path.join(best_recog_path, "text")
|
||||||
compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
|
compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -38,7 +38,7 @@ def modelscope_infer_after_finetune(params):
|
|||||||
# computer CER if GT text is set
|
# computer CER if GT text is set
|
||||||
text_in = os.path.join(params["data_dir"], "text")
|
text_in = os.path.join(params["data_dir"], "text")
|
||||||
if os.path.exists(text_in):
|
if os.path.exists(text_in):
|
||||||
text_proc_file = os.path.join(decoding_path, "1best_recog/token")
|
text_proc_file = os.path.join(decoding_path, "1best_recog/text")
|
||||||
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
|
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -74,7 +74,7 @@ def modelscope_infer(params):
|
|||||||
# If text exists, compute CER
|
# If text exists, compute CER
|
||||||
text_in = os.path.join(params["data_dir"], "text")
|
text_in = os.path.join(params["data_dir"], "text")
|
||||||
if os.path.exists(text_in):
|
if os.path.exists(text_in):
|
||||||
text_proc_file = os.path.join(best_recog_path, "token")
|
text_proc_file = os.path.join(best_recog_path, "text")
|
||||||
compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
|
compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -38,7 +38,7 @@ def modelscope_infer_after_finetune(params):
|
|||||||
# computer CER if GT text is set
|
# computer CER if GT text is set
|
||||||
text_in = os.path.join(params["data_dir"], "text")
|
text_in = os.path.join(params["data_dir"], "text")
|
||||||
if os.path.exists(text_in):
|
if os.path.exists(text_in):
|
||||||
text_proc_file = os.path.join(decoding_path, "1best_recog/token")
|
text_proc_file = os.path.join(decoding_path, "1best_recog/text")
|
||||||
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
|
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -63,8 +63,8 @@ fi
|
|||||||
|
|
||||||
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
|
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
|
||||||
echo "Computing WER ..."
|
echo "Computing WER ..."
|
||||||
python utils/proce_text.py ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
|
cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
|
||||||
python utils/proce_text.py ${data_dir}/text ${output_dir}/1best_recog/text.ref
|
cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
|
||||||
python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
|
python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
|
||||||
tail -n 3 ${output_dir}/1best_recog/text.cer
|
tail -n 3 ${output_dir}/1best_recog/text.cer
|
||||||
fi
|
fi
|
||||||
|
|||||||
@ -34,7 +34,7 @@ def modelscope_infer_after_finetune(params):
|
|||||||
# computer CER if GT text is set
|
# computer CER if GT text is set
|
||||||
text_in = os.path.join(params["data_dir"], "text")
|
text_in = os.path.join(params["data_dir"], "text")
|
||||||
if os.path.exists(text_in):
|
if os.path.exists(text_in):
|
||||||
text_proc_file = os.path.join(decoding_path, "1best_recog/token")
|
text_proc_file = os.path.join(decoding_path, "1best_recog/text")
|
||||||
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
|
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -63,8 +63,8 @@ fi
|
|||||||
|
|
||||||
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
|
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
|
||||||
echo "Computing WER ..."
|
echo "Computing WER ..."
|
||||||
python utils/proce_text.py ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
|
cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
|
||||||
python utils/proce_text.py ${data_dir}/text ${output_dir}/1best_recog/text.ref
|
cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
|
||||||
python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
|
python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
|
||||||
tail -n 3 ${output_dir}/1best_recog/text.cer
|
tail -n 3 ${output_dir}/1best_recog/text.cer
|
||||||
fi
|
fi
|
||||||
|
|||||||
@ -34,7 +34,7 @@ def modelscope_infer_after_finetune(params):
|
|||||||
# computer CER if GT text is set
|
# computer CER if GT text is set
|
||||||
text_in = os.path.join(params["data_dir"], "text")
|
text_in = os.path.join(params["data_dir"], "text")
|
||||||
if os.path.exists(text_in):
|
if os.path.exists(text_in):
|
||||||
text_proc_file = os.path.join(decoding_path, "1best_recog/token")
|
text_proc_file = os.path.join(decoding_path, "1best_recog/text")
|
||||||
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
|
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -75,7 +75,7 @@ def modelscope_infer(params):
|
|||||||
# If text exists, compute CER
|
# If text exists, compute CER
|
||||||
text_in = os.path.join(params["data_dir"], "text")
|
text_in = os.path.join(params["data_dir"], "text")
|
||||||
if os.path.exists(text_in):
|
if os.path.exists(text_in):
|
||||||
text_proc_file = os.path.join(best_recog_path, "token")
|
text_proc_file = os.path.join(best_recog_path, "text")
|
||||||
compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
|
compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -39,7 +39,7 @@ def modelscope_infer_after_finetune(params):
|
|||||||
# computer CER if GT text is set
|
# computer CER if GT text is set
|
||||||
text_in = os.path.join(params["data_dir"], "text")
|
text_in = os.path.join(params["data_dir"], "text")
|
||||||
if os.path.exists(text_in):
|
if os.path.exists(text_in):
|
||||||
text_proc_file = os.path.join(decoding_path, "1best_recog/token")
|
text_proc_file = os.path.join(decoding_path, "1best_recog/text")
|
||||||
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
|
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -75,7 +75,7 @@ def modelscope_infer(params):
|
|||||||
# If text exists, compute CER
|
# If text exists, compute CER
|
||||||
text_in = os.path.join(params["data_dir"], "text")
|
text_in = os.path.join(params["data_dir"], "text")
|
||||||
if os.path.exists(text_in):
|
if os.path.exists(text_in):
|
||||||
text_proc_file = os.path.join(best_recog_path, "token")
|
text_proc_file = os.path.join(best_recog_path, "text")
|
||||||
compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
|
compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -39,7 +39,7 @@ def modelscope_infer_after_finetune(params):
|
|||||||
# computer CER if GT text is set
|
# computer CER if GT text is set
|
||||||
text_in = os.path.join(params["data_dir"], "text")
|
text_in = os.path.join(params["data_dir"], "text")
|
||||||
if os.path.exists(text_in):
|
if os.path.exists(text_in):
|
||||||
text_proc_file = os.path.join(decoding_path, "1best_recog/token")
|
text_proc_file = os.path.join(decoding_path, "1best_recog/text")
|
||||||
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
|
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -797,7 +797,7 @@ def inference_modelscope(
|
|||||||
finish_count += 1
|
finish_count += 1
|
||||||
# asr_utils.print_progress(finish_count / file_count)
|
# asr_utils.print_progress(finish_count / file_count)
|
||||||
if writer is not None:
|
if writer is not None:
|
||||||
ibest_writer["text"][key] = text_postprocessed
|
ibest_writer["text"][key] = " ".join(word_lists)
|
||||||
|
|
||||||
logging.info("decoding, utt: {}, predictions: {}".format(key, text))
|
logging.info("decoding, utt: {}, predictions: {}".format(key, text))
|
||||||
rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor))
|
rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor))
|
||||||
|
|||||||
@ -338,7 +338,7 @@ def inference_modelscope(
|
|||||||
ibest_writer["token"][key] = " ".join(token)
|
ibest_writer["token"][key] = " ".join(token)
|
||||||
ibest_writer["token_int"][key] = " ".join(map(str, token_int))
|
ibest_writer["token_int"][key] = " ".join(map(str, token_int))
|
||||||
ibest_writer["vad"][key] = "{}".format(vadsegments)
|
ibest_writer["vad"][key] = "{}".format(vadsegments)
|
||||||
ibest_writer["text"][key] = text_postprocessed
|
ibest_writer["text"][key] = " ".join(word_lists)
|
||||||
ibest_writer["text_with_punc"][key] = text_postprocessed_punc
|
ibest_writer["text_with_punc"][key] = text_postprocessed_punc
|
||||||
if time_stamp_postprocessed is not None:
|
if time_stamp_postprocessed is not None:
|
||||||
ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed)
|
ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed)
|
||||||
|
|||||||
@ -670,7 +670,7 @@ def inference_modelscope(
|
|||||||
ibest_writer["token"][key] = " ".join(token)
|
ibest_writer["token"][key] = " ".join(token)
|
||||||
ibest_writer["token_int"][key] = " ".join(map(str, token_int))
|
ibest_writer["token_int"][key] = " ".join(map(str, token_int))
|
||||||
ibest_writer["vad"][key] = "{}".format(vadsegments)
|
ibest_writer["vad"][key] = "{}".format(vadsegments)
|
||||||
ibest_writer["text"][key] = text_postprocessed
|
ibest_writer["text"][key] = " ".join(word_lists)
|
||||||
ibest_writer["text_with_punc"][key] = text_postprocessed_punc
|
ibest_writer["text_with_punc"][key] = text_postprocessed_punc
|
||||||
if time_stamp_postprocessed is not None:
|
if time_stamp_postprocessed is not None:
|
||||||
ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed)
|
ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed)
|
||||||
|
|||||||
@ -738,13 +738,13 @@ def inference_modelscope(
|
|||||||
ibest_writer["rtf"][key] = rtf_cur
|
ibest_writer["rtf"][key] = rtf_cur
|
||||||
|
|
||||||
if text is not None:
|
if text is not None:
|
||||||
text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
|
text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token)
|
||||||
item = {'key': key, 'value': text_postprocessed}
|
item = {'key': key, 'value': text_postprocessed}
|
||||||
asr_result_list.append(item)
|
asr_result_list.append(item)
|
||||||
finish_count += 1
|
finish_count += 1
|
||||||
# asr_utils.print_progress(finish_count / file_count)
|
# asr_utils.print_progress(finish_count / file_count)
|
||||||
if writer is not None:
|
if writer is not None:
|
||||||
ibest_writer["text"][key] = text_postprocessed
|
ibest_writer["text"][key] = " ".join(word_lists)
|
||||||
|
|
||||||
logging.info("decoding, utt: {}, predictions: {}".format(key, text))
|
logging.info("decoding, utt: {}, predictions: {}".format(key, text))
|
||||||
rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor))
|
rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor))
|
||||||
|
|||||||
@ -507,13 +507,13 @@ def inference_modelscope(
|
|||||||
ibest_writer["score"][key] = str(hyp.score)
|
ibest_writer["score"][key] = str(hyp.score)
|
||||||
|
|
||||||
if text is not None:
|
if text is not None:
|
||||||
text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
|
text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token)
|
||||||
item = {'key': key, 'value': text_postprocessed}
|
item = {'key': key, 'value': text_postprocessed}
|
||||||
asr_result_list.append(item)
|
asr_result_list.append(item)
|
||||||
finish_count += 1
|
finish_count += 1
|
||||||
asr_utils.print_progress(finish_count / file_count)
|
asr_utils.print_progress(finish_count / file_count)
|
||||||
if writer is not None:
|
if writer is not None:
|
||||||
ibest_writer["text"][key] = text_postprocessed
|
ibest_writer["text"][key] = " ".join(word_lists)
|
||||||
return asr_result_list
|
return asr_result_list
|
||||||
|
|
||||||
return _forward
|
return _forward
|
||||||
|
|||||||
@ -507,13 +507,13 @@ def inference_modelscope(
|
|||||||
ibest_writer["score"][key] = str(hyp.score)
|
ibest_writer["score"][key] = str(hyp.score)
|
||||||
|
|
||||||
if text is not None:
|
if text is not None:
|
||||||
text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
|
text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token)
|
||||||
item = {'key': key, 'value': text_postprocessed}
|
item = {'key': key, 'value': text_postprocessed}
|
||||||
asr_result_list.append(item)
|
asr_result_list.append(item)
|
||||||
finish_count += 1
|
finish_count += 1
|
||||||
asr_utils.print_progress(finish_count / file_count)
|
asr_utils.print_progress(finish_count / file_count)
|
||||||
if writer is not None:
|
if writer is not None:
|
||||||
ibest_writer["text"][key] = text_postprocessed
|
ibest_writer["text"][key] = " ".join(word_lists)
|
||||||
return asr_result_list
|
return asr_result_list
|
||||||
|
|
||||||
return _forward
|
return _forward
|
||||||
|
|||||||
@ -45,8 +45,8 @@ def compute_wer(ref_file,
|
|||||||
if out_item['wrong'] > 0:
|
if out_item['wrong'] > 0:
|
||||||
rst['wrong_sentences'] += 1
|
rst['wrong_sentences'] += 1
|
||||||
cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
|
cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
|
||||||
cer_detail_writer.write("ref:" + '\t' + "".join(ref_dict[hyp_key]) + '\n')
|
cer_detail_writer.write("ref:" + '\t' + " ".join(list(map(lambda x: x.lower(), ref_dict[hyp_key]))) + '\n')
|
||||||
cer_detail_writer.write("hyp:" + '\t' + "".join(hyp_dict[hyp_key]) + '\n')
|
cer_detail_writer.write("hyp:" + '\t' + " ".join(list(map(lambda x: x.lower(), hyp_dict[hyp_key]))) + '\n')
|
||||||
|
|
||||||
if rst['Wrd'] > 0:
|
if rst['Wrd'] > 0:
|
||||||
rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)
|
rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user