diff --git a/egs/librispeech/ASR/streaming_conformer_ctc/README.md b/egs/librispeech/ASR/streaming_conformer_ctc/README.md
new file mode 100644
index 000000000..01be7090b
--- /dev/null
+++ b/egs/librispeech/ASR/streaming_conformer_ctc/README.md
@@ -0,0 +1,90 @@
+## Train and Decode
+The commands for the data preparation, training, and decoding steps are almost
+the same as in the ../conformer_ctc experiment, except for a few options.
+
+Please read the code and understand the following newly added options before
+running this experiment (a short sketch of the training options follows the
+list):
+
+  For data preparation:
+
+    Nothing new.
+
+  For streaming_conformer_ctc/train.py:
+
+    --dynamic-chunk-training
+    --short-chunk-proportion
+
+  For streaming_conformer_ctc/streaming_decode.py:
+
+    --chunk-size
+    --tailing-num-frames
+    --simulate-streaming
+
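+To build intuition for the two training options, here is a minimal Python
+sketch of one plausible sampling scheme for dynamic chunk training: each
+batch is trained either on the full context or on a randomly sized short
+chunk. This is only an illustration of the idea, not the exact code of this
+recipe; `sample_chunk_size` and its sampling distribution are hypothetical:
+```
+import random
+
+def sample_chunk_size(max_len: int, short_chunk_proportion: float) -> int:
+    """Pick the chunk size (in frames) used to train one batch.
+
+    Returns -1 to mean "attend to the full context".
+    """
+    # Some batches keep the full context, so decoding with
+    # --chunk-size=-1 (the "full" row of the table below) stays accurate.
+    if random.random() > short_chunk_proportion:
+        return -1
+    # The other batches see only a short chunk, which teaches the model to
+    # work with the limited right context available during streaming.
+    return random.randint(1, max(1, int(max_len * short_chunk_proportion)))
+```
+At decode time, --chunk-size then selects how much context the model sees,
+matching the chunk sizes evaluated in the table below.
+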
+## Performance and a trained model
+
+The latest results with this streaming code are shown in the following table
+(the durations in parentheses assume a 10 ms frame shift and the subsampling
+factor of 4, i.e. chunk size x 40 ms):
+
+chunk size | WER on test-clean | WER on test-other
+-- | -- | --
+full | 3.53 | 8.52
+40 (1.60s) | 3.78 | 9.38
+32 (1.28s) | 3.82 | 9.44
+24 (0.96s) | 3.95 | 9.76
+16 (0.64s) | 4.06 | 9.98
+8 (0.32s) | 4.30 | 10.55
+4 (0.16s) | 5.88 | 12.01
+
+A trained model is also provided. Download it by running
+```
+git clone https://huggingface.co/GuoLiyong/streaming_conformer
+
+# You may want to manually check the md5sum values of the downloaded files:
+# 8e633bc1de37f5ae57a2694ceee32a93  trained_streaming_conformer.pt
+# 4c0aeefe26c784ec64873cc9b95420f1  L.pt
+# d1f91d81005fb8ce4d65953a4a984ee7  Linv.pt
+# e1c1902feb7b9fc69cd8d26e663c2608  bpe.model
+# 8617e67159b0ff9118baa54f04db24cc  tokens.txt
+# 72b075ab5e851005cd854e666c82c3bb  words.txt
+```
+
+If any of the md5sum values differ, please run
+```
+cd streaming_models
+git lfs pull
+```
+and check the md5sum values again.
+
+Finally, the following files will be downloaded:
+```
+streaming_models/
+|-- lang_bpe
+|   |-- L.pt
+|   |-- Linv.pt
+|   |-- bpe.model
+|   |-- tokens.txt
+|   `-- words.txt
+`-- trained_streaming_conformer.pt
+```
+
+Run the following commands and you will get the same results as in the table
+above:
+```
+trained_models=/path/to/downloaded/streaming_models/
+for chunk_size in 4 8 16 24 32 40 -1; do
+  ./streaming_conformer_ctc/streaming_decode.py \
+    --chunk-size=${chunk_size} \
+    --trained-dir=${trained_models}
+done
+```
+The results of the following command are identical to those of the previous
+one, but the model consumes the features chunk by chunk, i.e. in a streaming
+fashion:
+```
+trained_models=/path/to/downloaded/streaming_models/
+for chunk_size in 4 8 16 24 32 40 -1; do
+  ./streaming_conformer_ctc/streaming_decode.py \
+    --simulate-streaming=True \
+    --chunk-size=${chunk_size} \
+    --trained-dir=${trained_models}
+done
+```
diff --git a/egs/librispeech/ASR/streaming_conformer_ctc/streaming_decode.py b/egs/librispeech/ASR/streaming_conformer_ctc/streaming_decode.py
index e88a4323c..a74c51836 100755
--- a/egs/librispeech/ASR/streaming_conformer_ctc/streaming_decode.py
+++ b/egs/librispeech/ASR/streaming_conformer_ctc/streaming_decode.py
@@ -121,6 +121,13 @@ def get_parser():
         help="The experiment dir",
     )
 
+    parser.add_argument(
+        "--trained-dir",
+        type=Path,
+        default=None,
+        help="Dir with the downloaded trained model and its lang_bpe files",
+    )
+
     parser.add_argument(
         "--lang-dir",
         type=Path,
@@ -144,7 +151,6 @@ def get_params() -> AttributeDict:
         {
             "exp_dir": Path("conformer_ctc/exp"),
             "lang_dir": Path("data/lang_bpe"),
-            "lm_dir": Path("data/lm"),
             # parameters for conformer
             "causal": True,
             "subsampling_factor": 4,
@@ -410,6 +416,12 @@ def main():
     logging.info("Decoding started")
     logging.info(params)
 
+    if params.trained_dir is not None:
+        params.lang_dir = Path(params.trained_dir) / "lang_bpe"
+        # used to name the result files
+        params.epoch = "trained_model"
+        params.avg = 1
+
     lexicon = Lexicon(params.lang_dir)
     max_token_id = max(lexicon.tokens)
     num_classes = max_token_id + 1  # +1 for the blank
@@ -441,7 +453,10 @@ def main():
         causal=params.causal,
     )
 
-    if params.avg == 1 and params.avg_models is not None:
+    if params.trained_dir is not None:
+        model_name = f"{params.trained_dir}/trained_streaming_conformer.pt"
+        load_checkpoint(model_name, model)
+    elif params.avg == 1 and params.avg_models is not None:
         load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
     else:
         filenames = []
diff --git a/egs/librispeech/ASR/streaming_conformer_ctc/train.py b/egs/librispeech/ASR/streaming_conformer_ctc/train.py
index 1881cfcd0..8b4d6701e 100755
--- a/egs/librispeech/ASR/streaming_conformer_ctc/train.py
+++ b/egs/librispeech/ASR/streaming_conformer_ctc/train.py
@@ -127,7 +127,7 @@ def get_parser():
     parser.add_argument(
         "--dynamic-chunk-training",
         type=str2bool,
-        default=False,
+        default=True,
         help="Whether to use dynamic right context during training.",
     )
 