From b93d5343b157a683b8925cb3611804b0aff6d940 Mon Sep 17 00:00:00 2001 From: Yuekai Zhang Date: Wed, 6 Jul 2022 20:59:50 +0800 Subject: [PATCH] fix aishell2 --- .../ASR/local/compute_fbank_aishell2.py | 2 +- .../ASR/local/display_manifest_statistics.py | 168 ++---------------- egs/aishell2/ASR/prepare.sh | 8 +- 3 files changed, 15 insertions(+), 163 deletions(-) diff --git a/egs/aishell2/ASR/local/compute_fbank_aishell2.py b/egs/aishell2/ASR/local/compute_fbank_aishell2.py index 5c5071578..7bc969a1a 100755 --- a/egs/aishell2/ASR/local/compute_fbank_aishell2.py +++ b/egs/aishell2/ASR/local/compute_fbank_aishell2.py @@ -17,7 +17,7 @@ """ -This file computes fbank features of the aishell dataset. +This file computes fbank features of the aishell2 dataset. It looks for manifests in the directory data/manifests. The generated fbank features are saved in data/fbank. diff --git a/egs/aishell2/ASR/local/display_manifest_statistics.py b/egs/aishell2/ASR/local/display_manifest_statistics.py index 9a805ee7c..5e81d6cfa 100755 --- a/egs/aishell2/ASR/local/display_manifest_statistics.py +++ b/egs/aishell2/ASR/local/display_manifest_statistics.py @@ -29,165 +29,17 @@ from lhotse import load_manifest_lazy def main(): - # path = "./data/fbank/aishell2_cuts_train.jsonl.gz" - # path = "./data/fbank/aishell2_cuts_test.jsonl.gz" - path = "./data/fbank/aishell2_cuts_dev.jsonl.gz" + paths = [ + "./data/fbank/aishell2_cuts_train.jsonl.gz", + "./data/fbank/aishell2_cuts_dev.jsonl.gz", + "./data/fbank/aishell2_cuts_test.jsonl.gz" + ] - cuts = load_manifest_lazy(path) - cuts.describe() + for path in paths: + print(f"Starting display the statistics for {path}") + cuts = load_manifest_lazy(path) + cuts.describe() if __name__ == "__main__": - main() - -""" -## train (after speed perturb) -Cuts count: 360294 -Total duration (hours): 455.6 -Speech duration (hours): 455.6 (100.0%) -*** -Duration statistics (seconds): -mean 4.6 -std 1.4 -min 1.1 -0.1% 1.8 -0.5% 2.2 -1% 2.3 -5% 2.7 -10% 3.0 -10% 3.0 
-25% 3.5 -50% 4.3 -75% 5.4 -90% 6.5 -95% 7.2 -99% 8.8 -99.5% 9.4 -99.9% 10.9 -max 16.1 - -## test -Cuts count: 7176 -Total duration (hours): 10.0 -Speech duration (hours): 10.0 (100.0%) -*** -Duration statistics (seconds): -mean 5.0 -std 1.6 -min 1.9 -0.1% 2.2 -0.5% 2.4 -1% 2.6 -5% 3.0 -10% 3.2 -10% 3.2 -25% 3.8 -50% 4.7 -75% 5.9 -90% 7.3 -95% 8.2 -99% 9.9 -99.5% 10.7 -99.9% 11.9 -max 14.7 - -## dev -Cuts count: 14326 -Total duration (hours): 18.1 -Speech duration (hours): 18.1 (100.0%) -*** -Duration statistics (seconds): -mean 4.5 -std 1.3 -min 1.6 -0.1% 2.1 -0.5% 2.3 -1% 2.4 -5% 2.9 -10% 3.1 -10% 3.1 -25% 3.5 -50% 4.3 -75% 5.4 -90% 6.4 -95% 7.0 -99% 8.4 -99.5% 8.9 -99.9% 10.3 -max 12.5 - -## aidatatang_200zh (train) -Cuts count: 164905 -Total duration (hours): 139.9 -Speech duration (hours): 139.9 (100.0%) -*** -Duration statistics (seconds): -mean 3.1 -std 1.1 -min 1.1 -0.1% 1.5 -0.5% 1.7 -1% 1.8 -5% 2.0 -10% 2.1 -10% 2.1 -25% 2.3 -50% 2.7 -75% 3.4 -90% 4.6 -95% 5.4 -99% 7.1 -99.5% 7.8 -99.9% 9.1 -max 16.3 - -## aidatatang_200zh (test) -Cuts count: 48144 -Total duration (hours): 40.2 -Speech duration (hours): 40.2 (100.0%) -*** -Duration statistics (seconds): -mean 3.0 -std 1.1 -min 0.9 -0.1% 1.5 -0.5% 1.8 -1% 1.8 -5% 2.0 -10% 2.1 -10% 2.1 -25% 2.3 -50% 2.6 -75% 3.4 -90% 4.4 -95% 5.2 -99% 6.9 -99.5% 7.5 -99.9% 9.0 -max 21.8 - -## aidatatang_200zh (dev) -Cuts count: 24216 -Total duration (hours): 20.2 -Speech duration (hours): 20.2 (100.0%) -*** -Duration statistics (seconds): -mean 3.0 -std 1.0 -min 1.2 -0.1% 1.6 -0.5% 1.7 -1% 1.8 -5% 2.0 -10% 2.1 -10% 2.1 -25% 2.3 -50% 2.7 -75% 3.4 -90% 4.4 -95% 5.1 -99% 6.7 -99.5% 7.3 -99.9% 8.8 -max 11.3 -""" + main() \ No newline at end of file diff --git a/egs/aishell2/ASR/prepare.sh b/egs/aishell2/ASR/prepare.sh index b0b1e7016..cffcb0bdd 100755 --- a/egs/aishell2/ASR/prepare.sh +++ b/egs/aishell2/ASR/prepare.sh @@ -3,8 +3,8 @@ set -eou pipefail nj=30 -stage=3 -stop_stage=3 +stage=0 +stop_stage=5 # We assume dl_dir 
(download dir) contains the following # directories and files. If not, you need to apply aishell2 through @@ -117,12 +117,12 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then mkdir -p $lang_char_dir # Prepare text. - grep "\"text\":" data/manifests/aishell2_supervisions_train.json \ + gunzip -c data/manifests/aishell2_supervisions_train.jsonl.gz | grep -o '"text": *"[^"]*"' \ | sed -e 's/["text:\t ]*//g' | sed 's/,//g' \ | ./local/text2token.py -t "char" > $lang_char_dir/text # Prepare words.txt - grep "\"text\":" data/manifests/aishell2_supervisions_train.json \ + gunzip -c data/manifests/aishell2_supervisions_train.jsonl.gz | grep -o '"text": *"[^"]*"' \ | sed -e 's/["text:\t]*//g' | sed 's/,//g' \ | ./local/text2token.py -t "char" > $lang_char_dir/text_words