From 05e48ca88077a36426ff1e7075c1d591f8aa5f8b Mon Sep 17 00:00:00 2001
From: jinzr
Date: Mon, 8 Apr 2024 17:19:42 +0800
Subject: [PATCH] misc. update

---
 egs/commonvoice/ASR/RESULTS.md                     | 10 ++++++----
 .../ASR/local/compute_fbank_commonvoice_splits.py  |  1 +
 egs/commonvoice/ASR/local/word_segment_yue.py      |  6 +++---
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/egs/commonvoice/ASR/RESULTS.md b/egs/commonvoice/ASR/RESULTS.md
index ce77e5bf3..f384f66a0 100644
--- a/egs/commonvoice/ASR/RESULTS.md
+++ b/egs/commonvoice/ASR/RESULTS.md
@@ -14,7 +14,7 @@ The best CER, for CommonVoice 16.1 (cv-corpus-16.1-2023-12-06/zh-HK) is below:
 | modified_beam_search | 0.98 | 1.11 | --epoch 24 --avg 5 |
 | fast_beam_search | 1.08 | 1.27 | --epoch 24 --avg 5 |
 
-When doing the cross-corpus validation on MDCC (w/o blank penalty),
+When doing the cross-corpus validation on [MDCC](https://arxiv.org/abs/2201.02419) (w/o blank penalty),
 the best CER is below:
 
 | | Dev | Test | Note |
@@ -23,7 +23,7 @@ the best CER is below:
 | modified_beam_search | 39.73 | 39.19| --epoch 24 --avg 5 |
 | fast_beam_search | 42.14 | 41.98| --epoch 24 --avg 5 |
 
-When doing the cross-corpus validation on MDCC (with blank penalty set to 2.2),
+When doing the cross-corpus validation on [MDCC](https://arxiv.org/abs/2201.02419) (with blank penalty set to 2.2),
 the best CER is below:
 
 | | Dev | Test | Note |
@@ -68,14 +68,16 @@ Detailed experimental results and pre-trained model are available at:
 
-### GigaSpeech BPE training results (Pruned Stateless Transducer 7)
+### CommonVoice English (en) BPE training results (Pruned Stateless Transducer 7)
 
 #### [pruned_transducer_stateless7](./pruned_transducer_stateless7)
 
 See #997 for more details.
 
 Number of model parameters: 70369391, i.e., 70.37 M
 
+Note that the result is obtained using a BPE model trained on GigaSpeech transcripts.
+
 The best WER, as of 2023-04-17, for Common Voice English 13.0 (cv-corpus-13.0-2023-03-09/en) is below:
 
 Results are:
diff --git a/egs/commonvoice/ASR/local/compute_fbank_commonvoice_splits.py b/egs/commonvoice/ASR/local/compute_fbank_commonvoice_splits.py
index 14fb9b446..aa672609a 100755
--- a/egs/commonvoice/ASR/local/compute_fbank_commonvoice_splits.py
+++ b/egs/commonvoice/ASR/local/compute_fbank_commonvoice_splits.py
@@ -47,6 +47,7 @@ def get_args():
         "--subset",
         type=str,
         default="train",
+        choices=["train", "validated", "invalidated"],
         help="""Dataset parts to compute fbank.
         """,
     )
diff --git a/egs/commonvoice/ASR/local/word_segment_yue.py b/egs/commonvoice/ASR/local/word_segment_yue.py
index e8448a0c2..35d262d10 100755
--- a/egs/commonvoice/ASR/local/word_segment_yue.py
+++ b/egs/commonvoice/ASR/local/word_segment_yue.py
@@ -130,11 +130,11 @@ if __name__ == "__main__":
     norm_lines = [normalize_text(line, lang) for line in lines]
     text_words_segments = get_word_segments(norm_lines)
 
-    with open(output_dir / "transcript_words.txt", "w+", encoding="utf-8") as f:
+    with open(output_dir / "transcript_words.txt", "w", encoding="utf-8") as f:
         f.writelines(text_words_segments)
 
     words = get_words(text_words_segments)[1:]  # remove "\n" from words
-    with open(output_dir / "words_no_ids.txt", "w+", encoding="utf-8") as f:
+    with open(output_dir / "words_no_ids.txt", "w", encoding="utf-8") as f:
         f.writelines([word + "\n" for word in sorted(words)])
 
     words = (
@@ -143,5 +143,5 @@ if __name__ == "__main__":
         + ["#0", "<s>", "</s>"]
     )
 
-    with open(output_dir / "words.txt", "w+", encoding="utf-8") as f:
+    with open(output_dir / "words.txt", "w", encoding="utf-8") as f:
         f.writelines([f"{word} {i}\n" for i, word in enumerate(words)])