Fix aishell2 recipe: docstring, manifest statistics script, and prepare.sh stages/manifest paths

2025-09-18 21:44:18 +00:00 · 2022-07-06 20:59:50 +08:00 · 2022-07-06 20:59:50 +08:00 · b93d5343b1
commit b93d5343b1
parent 30d0e1d5a1
3 changed files with 15 additions and 163 deletions
--- a/egs/aishell2/ASR/local/compute_fbank_aishell2.py
+++ b/egs/aishell2/ASR/local/compute_fbank_aishell2.py
@@ -17,7 +17,7 @@
 """
-This file computes fbank features of the aishell dataset.
+This file computes fbank features of the aishell2 dataset.
 It looks for manifests in the directory data/manifests.
 The generated fbank features are saved in data/fbank.
--- a/egs/aishell2/ASR/local/display_manifest_statistics.py
+++ b/egs/aishell2/ASR/local/display_manifest_statistics.py
@@ -29,165 +29,17 @@ from lhotse import load_manifest_lazy
 def main():
-    #  path = "./data/fbank/aishell2_cuts_train.jsonl.gz"
+    paths = [
-    #  path = "./data/fbank/aishell2_cuts_test.jsonl.gz"
+        "./data/fbank/aishell2_cuts_train.jsonl.gz",
-    path = "./data/fbank/aishell2_cuts_dev.jsonl.gz"
+        "./data/fbank/aishell2_cuts_dev.jsonl.gz",
        "./data/fbank/aishell2_cuts_test.jsonl.gz"
    ]
    for path in paths:
        print(f"Starting display the statistics for {path}")
        cuts = load_manifest_lazy(path)
        cuts.describe()
 if __name__ == "__main__":
    main()
 """
 ## train (after speed perturb)
 Cuts count: 360294
 Total duration (hours): 455.6
 Speech duration (hours): 455.6 (100.0%)
 ***
 Duration statistics (seconds):
 mean    4.6
 std     1.4
 min     1.1
 0.1%    1.8
 0.5%    2.2
 1%      2.3
 5%      2.7
 10%     3.0
 10%     3.0
 25%     3.5
 50%     4.3
 75%     5.4
 90%     6.5
 95%     7.2
 99%     8.8
 99.5%   9.4
 99.9%   10.9
 max     16.1
 ## test
 Cuts count: 7176
 Total duration (hours): 10.0
 Speech duration (hours): 10.0 (100.0%)
 ***
 Duration statistics (seconds):
 mean    5.0
 std     1.6
 min     1.9
 0.1%    2.2
 0.5%    2.4
 1%      2.6
 5%      3.0
 10%     3.2
 10%     3.2
 25%     3.8
 50%     4.7
 75%     5.9
 90%     7.3
 95%     8.2
 99%     9.9
 99.5%   10.7
 99.9%   11.9
 max     14.7
 ## dev
 Cuts count: 14326
 Total duration (hours): 18.1
 Speech duration (hours): 18.1 (100.0%)
 ***
 Duration statistics (seconds):
 mean    4.5
 std     1.3
 min     1.6
 0.1%    2.1
 0.5%    2.3
 1%      2.4
 5%      2.9
 10%     3.1
 10%     3.1
 25%     3.5
 50%     4.3
 75%     5.4
 90%     6.4
 95%     7.0
 99%     8.4
 99.5%   8.9
 99.9%   10.3
 max     12.5
 ## aidatatang_200zh (train)
 Cuts count: 164905
 Total duration (hours): 139.9
 Speech duration (hours): 139.9 (100.0%)
 ***
 Duration statistics (seconds):
 mean    3.1
 std     1.1
 min     1.1
 0.1%    1.5
 0.5%    1.7
 1%      1.8
 5%      2.0
 10%     2.1
 10%     2.1
 25%     2.3
 50%     2.7
 75%     3.4
 90%     4.6
 95%     5.4
 99%     7.1
 99.5%   7.8
 99.9%   9.1
 max     16.3
 ## aidatatang_200zh (test)
 Cuts count: 48144
 Total duration (hours): 40.2
 Speech duration (hours): 40.2 (100.0%)
 ***
 Duration statistics (seconds):
 mean    3.0
 std     1.1
 min     0.9
 0.1%    1.5
 0.5%    1.8
 1%      1.8
 5%      2.0
 10%     2.1
 10%     2.1
 25%     2.3
 50%     2.6
 75%     3.4
 90%     4.4
 95%     5.2
 99%     6.9
 99.5%   7.5
 99.9%   9.0
 max     21.8
 ## aidatatang_200zh (dev)
 Cuts count: 24216
 Total duration (hours): 20.2
 Speech duration (hours): 20.2 (100.0%)
 ***
 Duration statistics (seconds):
 mean    3.0
 std     1.0
 min     1.2
 0.1%    1.6
 0.5%    1.7
 1%      1.8
 5%      2.0
 10%     2.1
 10%     2.1
 25%     2.3
 50%     2.7
 75%     3.4
 90%     4.4
 95%     5.1
 99%     6.7
 99.5%   7.3
 99.9%   8.8
 max     11.3
 """
--- a/egs/aishell2/ASR/prepare.sh
+++ b/egs/aishell2/ASR/prepare.sh
@@ -3,8 +3,8 @@
 set -eou pipefail
 nj=30
-stage=3
+stage=0
-stop_stage=3
+stop_stage=5
 # We assume dl_dir (download dir) contains the following
 # directories and files. If not, you need to apply aishell2 through
@@ -117,12 +117,12 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  mkdir -p $lang_char_dir
  # Prepare text.
-  grep "\"text\":" data/manifests/aishell2_supervisions_train.json \
+  grep "\"text\":" data/manifests/aishell2_supervisions_train.jsonl.gz \
    | sed -e 's/["text:\t ]*//g' | sed 's/,//g' \
    | ./local/text2token.py -t "char" > $lang_char_dir/text
  # Prepare words.txt
-  grep "\"text\":" data/manifests/aishell2_supervisions_train.json \
+  grep "\"text\":" data/manifests/aishell2_supervisions_train.jsonl.gz \
    | sed -e 's/["text:\t]*//g' | sed 's/,//g' \
    | ./local/text2token.py -t "char" > $lang_char_dir/text_words