From cbf8c18ebd274dfeea9b8aa224ff5faad713c28c Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sat, 19 Feb 2022 22:28:19 +0800 Subject: [PATCH] Minor fixes for aishell (#218) * Minor fixes to aishell. * Minor fixes. --- egs/aishell/ASR/RESULTS.md | 74 +++++------ .../ASR/local/display_manifest_statistics.py | 118 ++++++++++++++++++ egs/aishell/ASR/prepare.sh | 36 ++++-- egs/aishell/ASR/transducer_stateless/train.py | 1 - icefall/char_graph_compiler.py | 2 +- 5 files changed, 181 insertions(+), 50 deletions(-) create mode 100755 egs/aishell/ASR/local/display_manifest_statistics.py diff --git a/egs/aishell/ASR/RESULTS.md b/egs/aishell/ASR/RESULTS.md index 688e0f60c..ceb63b4cf 100644 --- a/egs/aishell/ASR/RESULTS.md +++ b/egs/aishell/ASR/RESULTS.md @@ -1,49 +1,49 @@ ## Results -### Aishell training result(Transducer-stateless) +### Aishell training result(Transducer-stateless) #### 2022-2-19 -(Duo Ma): The tensorboard log for training is available at https://tensorboard.dev/experiment/25PmX3MxSVGTdvIdhOwllw/#scalars +(Duo Ma): The tensorboard log for training is available at https://tensorboard.dev/experiment/25PmX3MxSVGTdvIdhOwllw/#scalars You can find a pretrained model by visiting https://huggingface.co/shuanguanma/icefall_aishell_transducer_stateless_context_size2_epoch60_2022_2_19 | | test |comment | |---------------------------|------|-----------------------------------------| | greedy search | 5.4 |--epoch 59, --avg 10, --max-duration 100 | -| beam search | 5.05|--epoch 59, --avg 10, --max-duration 100 | +| beam search | 5.05|--epoch 59, --avg 10, --max-duration 100 | You can use the following commands to reproduce our results: + +```bash +export CUDA_VISIBLE_DEVICES="0,1,2,3" +python3 ./transducer_stateless/train.py \ + --world-size 4 \ + --num-epochs 60 \ + --start-epoch 0 \ + --exp-dir exp/transducer_stateless_context_size2 \ + --max-duration 100 \ + --lr-factor 2.5 \ + --context-size 2 + +lang_dir=data/lang_char +dir=exp/transducer_stateless_context_size2 +python3 ./transducer_stateless/decode.py\ + --epoch 59 \ + --avg 10 \ + --exp-dir $dir \ + --lang-dir $lang_dir \ + --decoding-method greedy_search \ + --context-size 2 \ + --max-sym-per-frame 3 + +lang_dir=data/lang_char +dir=exp/transducer_stateless_context_size2 +python3 ./transducer_stateless/decode.py \ + --epoch 59\ + --avg 10\ + --exp-dir $dir \ + --lang-dir $lang_dir \ + --decoding-method beam_search \ + --context-size 2 \ + --max-sym-per-frame 3 ``` - export CUDA_VISIBLE_DEVICES="0,1,2,3" - python3 ./transducer_stateless/train.py \ - --world-size 4 \ - --num-epochs 60 \ - --start-epoch 0 \ - --exp-dir exp/transducer_stateless_context_size2 \ - --max-duration 100 \ - --lr-factor 2.5\ - --context-size 2 - - lang_dir=data/lang_char - dir=exp/transducer_stateless_context_size2 - python3 ./transducer_stateless/decode.py\ - --epoch 59\ - --avg 10\ - --exp-dir $dir \ - --lang-dir $lang_dir\ - --decoding-method greedy_search\ - --context-size 2\ - --max-sym-per-frame 3 - lang_dir=data/lang_char - dir=exp/transducer_stateless_context_size2 - python3 ./transducer_stateless/decode.py\ - --epoch 59\ - --avg 10\ - --exp-dir $dir \ - --lang-dir $lang_dir\ - --decoding-method beam_search\ - --context-size 2\ - --max-sym-per-frame 3 - ``` - - - + ### Aishell training results (Transducer-stateless) #### 2022-02-18 (Pingfeng Luo) : The tensorboard log for training is available at diff --git a/egs/aishell/ASR/local/display_manifest_statistics.py b/egs/aishell/ASR/local/display_manifest_statistics.py new file mode 100755 index 000000000..5e8b5cd3a --- /dev/null +++ b/egs/aishell/ASR/local/display_manifest_statistics.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This file displays duration statistics of utterances in a manifest. +You can use the displayed value to choose minimum/maximum duration +to remove short and long utterances during the training. + +See the function `remove_short_and_long_utt()` in transducer_stateless/train.py +for usage. +""" + + +from lhotse import load_manifest + + +def main(): + # path = "./data/fbank/cuts_train.json.gz" + # path = "./data/fbank/cuts_test.json.gz" + path = "./data/fbank/cuts_dev.json.gz" + + cuts = load_manifest(path) + cuts.describe() + + +if __name__ == "__main__": + main() + +""" +## train (after speed perturb) +Cuts count: 360294 +Total duration (hours): 455.6 +Speech duration (hours): 455.6 (100.0%) +*** +Duration statistics (seconds): +mean 4.6 +std 1.4 +min 1.1 +0.1% 1.8 +0.5% 2.2 +1% 2.3 +5% 2.7 +10% 3.0 +10% 3.0 +25% 3.5 +50% 4.3 +75% 5.4 +90% 6.5 +95% 7.2 +99% 8.8 +99.5% 9.4 +99.9% 10.9 +max 16.1 + +## test +Cuts count: 7176 +Total duration (hours): 10.0 +Speech duration (hours): 10.0 (100.0%) +*** +Duration statistics (seconds): +mean 5.0 +std 1.6 +min 1.9 +0.1% 2.2 +0.5% 2.4 +1% 2.6 +5% 3.0 +10% 3.2 +10% 3.2 +25% 3.8 +50% 4.7 +75% 5.9 +90% 7.3 +95% 8.2 +99% 9.9 +99.5% 10.7 +99.9% 11.9 +max 14.7 + +## dev +Cuts count: 14326 +Total duration (hours): 18.1 +Speech duration (hours): 18.1 (100.0%) +*** +Duration statistics (seconds): +mean 4.5 +std 1.3 +min 1.6 +0.1% 2.1 +0.5% 2.3 +1% 2.4 +5% 2.9 +10% 3.1 +10% 3.1 +25% 3.5 +50% 4.3 +75% 5.4 +90% 6.4 +95% 7.0 +99% 8.4 +99.5% 8.9 +99.9% 10.3 +max 12.5 +""" diff --git a/egs/aishell/ASR/prepare.sh b/egs/aishell/ASR/prepare.sh index a99558395..68f5c54d3 100755 --- a/egs/aishell/ASR/prepare.sh +++ b/egs/aishell/ASR/prepare.sh @@ -48,8 +48,9 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then log "stage -1: Download LM" # We assume that you have installed the git-lfs, if not, you could install it # using: `sudo apt-get install git-lfs && git-lfs install` - [ ! -e $dl_dir/lm ] && mkdir -p $dl_dir/lm - git clone https://huggingface.co/pkufool/aishell_lm $dl_dir/lm + if [ ! -f $dl_dir/lm/3-gram.unpruned.arpa ]; then + git clone https://huggingface.co/pkufool/aishell_lm $dl_dir/lm + fi fi if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then @@ -87,28 +88,41 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then log "Stage 1: Prepare aishell manifest" # We assume that you have downloaded the aishell corpus # to $dl_dir/aishell - mkdir -p data/manifests - lhotse prepare aishell -j $nj $dl_dir/aishell data/manifests + if [ ! -f data/manifests/.aishell_manifests.done ]; then + mkdir -p data/manifests + lhotse prepare aishell $dl_dir/aishell data/manifests + touch data/manifests/.aishell_manifests.done + fi fi if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then log "Stage 2: Prepare musan manifest" # We assume that you have downloaded the musan corpus # to data/musan - mkdir -p data/manifests - lhotse prepare musan $dl_dir/musan data/manifests + if [ ! -f data/manifests/.musan_manifests.done ]; then + log "It may take 6 minutes" + mkdir -p data/manifests + lhotse prepare musan $dl_dir/musan data/manifests + touch data/manifests/.musan_manifests.done + fi fi if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then log "Stage 3: Compute fbank for aishell" - mkdir -p data/fbank - ./local/compute_fbank_aishell.py + if [ ! -f data/fbank/.aishell.done ]; then + mkdir -p data/fbank + ./local/compute_fbank_aishell.py + touch data/fbank/.aishell.done + fi fi if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then log "Stage 4: Compute fbank for musan" - mkdir -p data/fbank - ./local/compute_fbank_musan.py + if [ ! -f data/fbank/.msuan.done ]; then + mkdir -p data/fbank + ./local/compute_fbank_musan.py + touch data/fbank/.msuan.done + fi fi lang_phone_dir=data/lang_phone @@ -134,7 +148,7 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then aishell_train_uid=$dl_dir/aishell/data_aishell/transcript/aishell_train_uid find $dl_dir/aishell/data_aishell/wav/train -name "*.wav" | sed 's/\.wav//g' | awk -F '/' '{print $NF}' > $aishell_train_uid awk 'NR==FNR{uid[$1]=$1} NR!=FNR{if($1 in uid) print $0}' $aishell_train_uid $aishell_text | - cut -d " " -f 2- > $lang_phone_dir/transcript_words.txt + cut -d " " -f 2- > $lang_phone_dir/transcript_words.txt fi if [ ! -f $lang_phone_dir/transcript_tokens.txt ]; then diff --git a/egs/aishell/ASR/transducer_stateless/train.py b/egs/aishell/ASR/transducer_stateless/train.py index b562f9dd4..cd37810dd 100755 --- a/egs/aishell/ASR/transducer_stateless/train.py +++ b/egs/aishell/ASR/transducer_stateless/train.py @@ -558,7 +558,6 @@ def run(rank, world_size, args): oov="", ) - # params.blank_id = graph_compiler.texts_to_ids("")[0][0] params.blank_id = 0 params.vocab_size = max(lexicon.tokens) + 1 diff --git a/icefall/char_graph_compiler.py b/icefall/char_graph_compiler.py index 4a79a300a..a50b57d40 100644 --- a/icefall/char_graph_compiler.py +++ b/icefall/char_graph_compiler.py @@ -36,7 +36,7 @@ class CharCtcTrainingGraphCompiler(object): """ Args: lexicon: - It is built from `data/lang/lexicon.txt`. + It is built from `data/lang_char/lexicon.txt`. device: The device to use for operations compiling transcripts to FSAs. oov: