From 626a26fc2a569eb5719b364b2a6f5cd1fd99553b Mon Sep 17 00:00:00 2001 From: rxhmdia <41623136+rxhmdia@users.noreply.github.com> Date: Tue, 23 Aug 2022 17:30:03 +0800 Subject: [PATCH] some small changes for aidatatang_200zh (#542) * Update prepare.sh * Update compute_fbank_aidatatang_200zh.py --- .../local/compute_fbank_aidatatang_200zh.py | 2 +- egs/aidatatang_200zh/ASR/prepare.sh | 62 +++++++++---------- .../pruned_transducer_stateless2/decode.py | 57 ++--------------- 3 files changed, 35 insertions(+), 86 deletions(-) diff --git a/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py b/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py index 9850cf251..0b54fcb9a 100755 --- a/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py +++ b/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py @@ -43,7 +43,7 @@ torch.set_num_interop_threads(1) def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80): - src_dir = Path("data/manifests") + src_dir = Path("data/manifests/aidatatang_200zh") output_dir = Path("data/fbank") num_jobs = min(15, os.cpu_count()) diff --git a/egs/aidatatang_200zh/ASR/prepare.sh b/egs/aidatatang_200zh/ASR/prepare.sh index 3da783006..039951354 100755 --- a/egs/aidatatang_200zh/ASR/prepare.sh +++ b/egs/aidatatang_200zh/ASR/prepare.sh @@ -50,28 +50,19 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then fi if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then - log "Stage 2: Process aidatatang_200zh" - if [ ! -f data/fbank/aidatatang_200zh/.fbank.done ]; then - mkdir -p data/fbank/aidatatang_200zh - lhotse prepare aidatatang-200zh $dl_dir data/manifests/aidatatang_200zh - touch data/fbank/aidatatang_200zh/.fbank.done + log "Stage 2: Prepare musan manifest" + # We assume that you have downloaded the musan corpus + # to data/musan + if [ ! -f data/manifests/.manifests.done ]; then + log "It may take 6 minutes" + mkdir -p data/manifests/ + lhotse prepare musan $dl_dir/musan data/manifests/ + touch data/manifests/.manifests.done fi fi if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then - log "Stage 3: Prepare musan manifest" - # We assume that you have downloaded the musan corpus - # to data/musan - if [ ! -f data/manifests/.musan_manifests.done ]; then - log "It may take 6 minutes" - mkdir -p data/manifests - lhotse prepare musan $dl_dir/musan data/manifests - touch data/manifests/.musan_manifests.done - fi -fi - -if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then - log "Stage 4: Compute fbank for musan" + log "Stage 3: Compute fbank for musan" if [ ! -f data/fbank/.msuan.done ]; then mkdir -p data/fbank ./local/compute_fbank_musan.py @@ -79,8 +70,8 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then fi fi -if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then - log "Stage 5: Compute fbank for aidatatang_200zh" +if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then + log "Stage 4: Compute fbank for aidatatang_200zh" if [ ! -f data/fbank/.aidatatang_200zh.done ]; then mkdir -p data/fbank ./local/compute_fbank_aidatatang_200zh.py @@ -88,31 +79,38 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then fi fi -if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then - log "Stage 6: Prepare char based lang" +if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then + log "Stage 5: Prepare char based lang" lang_char_dir=data/lang_char mkdir -p $lang_char_dir - # Prepare text. - grep "\"text\":" data/manifests/aidatatang_200zh/supervisions_train.json \ - | sed -e 's/["text:\t ]*//g' | sed 's/,//g' \ - | ./local/text2token.py -t "char" > $lang_char_dir/text - + # Note: in Linux, you can install jq with the following command: + # 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64 + # 2. chmod +x ./jq + # 3. cp jq /usr/bin + if [ ! -f $lang_char_dir/text ]; then + gunzip -c data/manifests/aidatatang_200zh/aidatatang_supervisions_train.jsonl.gz \ + |jq '.text' |sed -e 's/["text:\t ]*//g' | sed 's/"//g' \ + | ./local/text2token.py -t "char" > $lang_char_dir/text + fi # Prepare words.txt - grep "\"text\":" data/manifests/aidatatang_200zh/supervisions_train.json \ - | sed -e 's/["text:\t]*//g' | sed 's/,//g' \ - | ./local/text2token.py -t "char" > $lang_char_dir/text_words + if [ ! -f $lang_char_dir/text_words ]; then + gunzip -c data/manifests/aidatatang_200zh/aidatatang_supervisions_train.jsonl.gz \ + | jq '.text' | sed -e 's/["text:\t]*//g' | sed 's/"//g' \ + | ./local/text2token.py -t "char" > $lang_char_dir/text_words + fi cat $lang_char_dir/text_words | sed 's/ /\n/g' | sort -u | sed '/^$/d' \ | uniq > $lang_char_dir/words_no_ids.txt if [ ! -f $lang_char_dir/words.txt ]; then ./local/prepare_words.py \ - --input-file $lang_char_dir/words_no_ids.txt - --output-file $lang_char_dir/words.txt + --input-file $lang_char_dir/words_no_ids.txt \ + --output-file $lang_char_dir/words.txt fi if [ ! -f $lang_char_dir/L_disambig.pt ]; then ./local/prepare_char.py fi fi + diff --git a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/decode.py b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/decode.py index a185567da..f0407f429 100755 --- a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/decode.py +++ b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/decode.py @@ -522,63 +522,14 @@ def main(): num_param = sum([p.numel() for p in model.parameters()]) logging.info(f"Number of model parameters: {num_param}") - # Note: Please use "pip install webdataset==0.1.103" - # for installing the webdataset. - import glob - import os - - from lhotse import CutSet - from lhotse.dataset.webdataset import export_to_webdataset - # we need cut ids to display recognition results. args.return_cuts = True aidatatang_200zh = Aidatatang_200zhAsrDataModule(args) - dev = "dev" - test = "test" - - if not os.path.exists(f"{dev}/shared-0.tar"): - os.makedirs(dev) - dev_cuts = aidatatang_200zh.valid_cuts() - export_to_webdataset( - dev_cuts, - output_path=f"{dev}/shared-%d.tar", - shard_size=300, - ) - - if not os.path.exists(f"{test}/shared-0.tar"): - os.makedirs(test) - test_cuts = aidatatang_200zh.test_cuts() - export_to_webdataset( - test_cuts, - output_path=f"{test}/shared-%d.tar", - shard_size=300, - ) - - dev_shards = [ - str(path) - for path in sorted(glob.glob(os.path.join(dev, "shared-*.tar"))) - ] - cuts_dev_webdataset = CutSet.from_webdataset( - dev_shards, - split_by_worker=True, - split_by_node=True, - shuffle_shards=True, - ) - - test_shards = [ - str(path) - for path in sorted(glob.glob(os.path.join(test, "shared-*.tar"))) - ] - cuts_test_webdataset = CutSet.from_webdataset( - test_shards, - split_by_worker=True, - split_by_node=True, - shuffle_shards=True, - ) - - dev_dl = aidatatang_200zh.valid_dataloaders(cuts_dev_webdataset) - test_dl = aidatatang_200zh.test_dataloaders(cuts_test_webdataset) + dev_cuts = aidatatang_200zh.valid_cuts() + test_cuts = aidatatang_200zh.test_cuts() + dev_dl = aidatatang_200zh.valid_dataloaders(dev_cuts) + test_dl = aidatatang_200zh.test_dataloaders(test_cuts) test_sets = ["dev", "test"] test_dl = [dev_dl, test_dl]