diff --git a/egs/aishell/ASR/local/compute_whisper_fbank_aishell.py b/egs/aishell/ASR/local/compute_whisper_fbank_aishell.py
index f1d8a7460..72f4b7acb 100644
--- a/egs/aishell/ASR/local/compute_whisper_fbank_aishell.py
+++ b/egs/aishell/ASR/local/compute_whisper_fbank_aishell.py
@@ -49,8 +49,8 @@ def compute_fbank_aishell(num_mel_bins: int = 80, perturb_speed: bool = False):
     dataset_parts = (
         "train",
-        #"dev",
-        #"test",
+        "dev",
+        "test",
     )
     prefix = "aishell"
     suffix = "jsonl.gz"
@@ -69,7 +69,7 @@ def compute_fbank_aishell(num_mel_bins: int = 80, perturb_speed: bool = False):
         dataset_parts,
     )

-    extractor = WhisperFbank(WhisperFbankConfig(device='cuda'))
+    extractor = WhisperFbank(WhisperFbankConfig(num_filters=num_mel_bins, device='cuda'))

     with get_executor() as ex:  # Initialize the executor only once.
         for partition, m in manifests.items():
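The hunk above makes the extractor honor the script's --num-mel-bins argument instead of lhotse's 80-filter default: Whisper large-v3 expects 128 mel bins, while earlier checkpoints use 80. A minimal sketch of the extractor in isolation (not part of the patch; it assumes a recent lhotse that exports WhisperFbank at the top level, and "audio.wav" is a placeholder path):

    # Extract 128-bin Whisper fbank features for a single recording.
    # The patch runs this on CUDA; CPU is used here so the sketch runs anywhere.
    from lhotse import Recording, WhisperFbank, WhisperFbankConfig

    extractor = WhisperFbank(WhisperFbankConfig(num_filters=128, device="cpu"))
    cut = Recording.from_file("audio.wav").to_cut()
    feats = cut.compute_features(extractor)
    print(feats.shape)  # (num_frames, 128)
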
diff --git a/egs/aishell/ASR/prepare.sh b/egs/aishell/ASR/prepare.sh
index 78c635690..97dc721c2 100755
--- a/egs/aishell/ASR/prepare.sh
+++ b/egs/aishell/ASR/prepare.sh
@@ -83,9 +83,9 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   #
   # ln -sfv /path/to/musan $dl_dir/musan
   #
-  if [ ! -d $dl_dir/musan ]; then
-    lhotse download musan $dl_dir
-  fi
+  # if [ ! -d $dl_dir/musan ]; then
+  #   lhotse download musan $dl_dir
+  # fi
 fi

 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
@@ -99,17 +99,17 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
   fi
 fi

-if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Prepare musan manifest"
-  # We assume that you have downloaded the musan corpus
-  # to data/musan
-  if [ ! -f data/manifests/.musan_manifests.done ]; then
-    log "It may take 6 minutes"
-    mkdir -p data/manifests
-    lhotse prepare musan $dl_dir/musan data/manifests
-    touch data/manifests/.musan_manifests.done
-  fi
-fi
+# if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+#   log "Stage 2: Prepare musan manifest"
+#   # We assume that you have downloaded the musan corpus
+#   # to data/musan
+#   if [ ! -f data/manifests/.musan_manifests.done ]; then
+#     log "It may take 6 minutes"
+#     mkdir -p data/manifests
+#     lhotse prepare musan $dl_dir/musan data/manifests
+#     touch data/manifests/.musan_manifests.done
+#   fi
+# fi

 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   log "Stage 3: Compute fbank for aishell"
@@ -120,47 +120,56 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   fi
 fi

-if [ $stage -le 30 ] && [ $stop_stage -ge 30 ]; then
+# if [ $stage -le 30 ] && [ $stop_stage -ge 30 ]; then
+#   log "Stage 30: Compute whisper fbank for aishell"
+#   if [ ! -f data/fbank/.aishell.done ]; then
+#     mkdir -p data/fbank
+#     ./local/compute_whisper_fbank_aishell.py --perturb-speed True
+#     touch data/fbank/.aishell.done
+#   fi
+# fi
+
+if [ $stage -le 300 ] && [ $stop_stage -ge 300 ]; then
-  log "Stage 30: Compute whisper fbank for aishell"
+  log "Stage 300: Compute whisper fbank for aishell"
   if [ ! -f data/fbank/.aishell.done ]; then
     mkdir -p data/fbank
-    ./local/compute_whisper_fbank_aishell.py --perturb-speed True
+    ./local/compute_whisper_fbank_aishell.py --perturb-speed True --num-mel-bins 128
     touch data/fbank/.aishell.done
   fi
 fi

-if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-  log "Stage 4: Compute fbank for musan"
-  if [ ! -f data/fbank/.msuan.done ]; then
-    mkdir -p data/fbank
-    ./local/compute_fbank_musan.py
-    touch data/fbank/.msuan.done
-  fi
-fi
+# if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+#   log "Stage 4: Compute fbank for musan"
+#   if [ ! -f data/fbank/.msuan.done ]; then
+#     mkdir -p data/fbank
+#     ./local/compute_fbank_musan.py
+#     touch data/fbank/.msuan.done
+#   fi
+# fi

-if [ $stage -le 40 ] && [ $stop_stage -ge 40 ]; then
-  log "Stage 4: Compute fbank for musan"
-  if [ ! -f data/fbank/.msuan.done ]; then
-    mkdir -p data/fbank
-    ./local/compute_whisper_fbank_musan.py
-    touch data/fbank/.msuan.done
-  fi
-fi
+# if [ $stage -le 40 ] && [ $stop_stage -ge 40 ]; then
+#   log "Stage 4: Compute fbank for musan"
+#   if [ ! -f data/fbank/.msuan.done ]; then
+#     mkdir -p data/fbank
+#     ./local/compute_whisper_fbank_musan.py
+#     touch data/fbank/.msuan.done
+#   fi
+# fi

-lang_phone_dir=data/lang_phone
-if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 5: Prepare phone based lang"
-  mkdir -p $lang_phone_dir
+# lang_phone_dir=data/lang_phone
+# if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+#   log "Stage 5: Prepare phone based lang"
+#   mkdir -p $lang_phone_dir

-  (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
-    cat - $dl_dir/aishell/resource_aishell/lexicon.txt |
-    sort | uniq > $lang_phone_dir/lexicon.txt
+#   (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
+#     cat - $dl_dir/aishell/resource_aishell/lexicon.txt |
+#     sort | uniq > $lang_phone_dir/lexicon.txt

-  ./local/generate_unique_lexicon.py --lang-dir $lang_phone_dir
+#   ./local/generate_unique_lexicon.py --lang-dir $lang_phone_dir

-  if [ ! -f $lang_phone_dir/L_disambig.pt ]; then
-    ./local/prepare_lang.py --lang-dir $lang_phone_dir
-  fi
+#   if [ ! -f $lang_phone_dir/L_disambig.pt ]; then
+#     ./local/prepare_lang.py --lang-dir $lang_phone_dir
+#   fi

   # Train a bigram P for MMI training

@@ -173,93 +182,93 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
       cut -d " " -f 2- > $lang_phone_dir/transcript_words.txt
   fi

-  if [ ! -f $lang_phone_dir/transcript_tokens.txt ]; then
-    ./local/convert_transcript_words_to_tokens.py \
-      --lexicon $lang_phone_dir/uniq_lexicon.txt \
-      --transcript $lang_phone_dir/transcript_words.txt \
-      --oov "<UNK>" \
-      > $lang_phone_dir/transcript_tokens.txt
-  fi
+#   if [ ! -f $lang_phone_dir/transcript_tokens.txt ]; then
+#     ./local/convert_transcript_words_to_tokens.py \
+#       --lexicon $lang_phone_dir/uniq_lexicon.txt \
+#       --transcript $lang_phone_dir/transcript_words.txt \
+#       --oov "<UNK>" \
+#       > $lang_phone_dir/transcript_tokens.txt
+#   fi

-  if [ ! -f $lang_phone_dir/P.arpa ]; then
-    ./shared/make_kn_lm.py \
-      -ngram-order 2 \
-      -text $lang_phone_dir/transcript_tokens.txt \
-      -lm $lang_phone_dir/P.arpa
-  fi
+#   if [ ! -f $lang_phone_dir/P.arpa ]; then
+#     ./shared/make_kn_lm.py \
+#       -ngram-order 2 \
+#       -text $lang_phone_dir/transcript_tokens.txt \
+#       -lm $lang_phone_dir/P.arpa
+#   fi

-  if [ ! -f $lang_phone_dir/P.fst.txt ]; then
-    python3 -m kaldilm \
-      --read-symbol-table="$lang_phone_dir/tokens.txt" \
-      --disambig-symbol='#0' \
-      --max-order=2 \
-      $lang_phone_dir/P.arpa > $lang_phone_dir/P.fst.txt
-  fi
-fi
+#   if [ ! -f $lang_phone_dir/P.fst.txt ]; then
+#     python3 -m kaldilm \
+#       --read-symbol-table="$lang_phone_dir/tokens.txt" \
+#       --disambig-symbol='#0' \
+#       --max-order=2 \
+#       $lang_phone_dir/P.arpa > $lang_phone_dir/P.fst.txt
+#   fi
+# fi

-lang_char_dir=data/lang_char
-if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "Stage 6: Prepare char based lang"
-  mkdir -p $lang_char_dir
-  # We reuse words.txt from phone based lexicon
-  # so that the two can share G.pt later.
+# lang_char_dir=data/lang_char
+# if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
+#   log "Stage 6: Prepare char based lang"
+#   mkdir -p $lang_char_dir
+#   # We reuse words.txt from phone based lexicon
+#   # so that the two can share G.pt later.

-  # The transcripts in training set, generated in stage 5
-  cp $lang_phone_dir/transcript_words.txt $lang_char_dir/transcript_words.txt
+#   # The transcripts in training set, generated in stage 5
+#   cp $lang_phone_dir/transcript_words.txt $lang_char_dir/transcript_words.txt

-  cat $dl_dir/aishell/data_aishell/transcript/aishell_transcript_v0.8.txt |
-    cut -d " " -f 2- > $lang_char_dir/text
+#   cat $dl_dir/aishell/data_aishell/transcript/aishell_transcript_v0.8.txt |
+#     cut -d " " -f 2- > $lang_char_dir/text

-  (echo '<eps> 0'; echo '!SIL 1'; echo '<SPOKEN_NOISE> 2'; echo '<UNK> 3';) \
-    > $lang_char_dir/words.txt
+#   (echo '<eps> 0'; echo '!SIL 1'; echo '<SPOKEN_NOISE> 2'; echo '<UNK> 3';) \
+#     > $lang_char_dir/words.txt

-  cat $lang_char_dir/text | sed 's/ /\n/g' | sort -u | sed '/^$/d' \
-    | awk '{print $1" "NR+3}' >> $lang_char_dir/words.txt
+#   cat $lang_char_dir/text | sed 's/ /\n/g' | sort -u | sed '/^$/d' \
+#     | awk '{print $1" "NR+3}' >> $lang_char_dir/words.txt

-  num_lines=$(< $lang_char_dir/words.txt wc -l)
-  (echo "#0 $num_lines"; echo "<s> $(($num_lines + 1))"; echo "</s> $(($num_lines + 2))";) \
-    >> $lang_char_dir/words.txt
+#   num_lines=$(< $lang_char_dir/words.txt wc -l)
+#   (echo "#0 $num_lines"; echo "<s> $(($num_lines + 1))"; echo "</s> $(($num_lines + 2))";) \
+#     >> $lang_char_dir/words.txt

-  if [ ! -f $lang_char_dir/L_disambig.pt ]; then
-    ./local/prepare_char.py --lang-dir $lang_char_dir
-  fi
-fi
+#   if [ ! -f $lang_char_dir/L_disambig.pt ]; then
+#     ./local/prepare_char.py --lang-dir $lang_char_dir
+#   fi
+# fi

-if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
-  log "Stage 7: Prepare Byte BPE based lang"
+# if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
+#   log "Stage 7: Prepare Byte BPE based lang"

-  for vocab_size in ${vocab_sizes[@]}; do
-    lang_dir=data/lang_bbpe_${vocab_size}
-    mkdir -p $lang_dir
+#   for vocab_size in ${vocab_sizes[@]}; do
+#     lang_dir=data/lang_bbpe_${vocab_size}
+#     mkdir -p $lang_dir

-    cp $lang_char_dir/words.txt $lang_dir
-    cp $lang_char_dir/text $lang_dir
+#     cp $lang_char_dir/words.txt $lang_dir
+#     cp $lang_char_dir/text $lang_dir

-    if [ ! -f $lang_dir/bbpe.model ]; then
-      ./local/train_bbpe_model.py \
-        --lang-dir $lang_dir \
-        --vocab-size $vocab_size \
-        --transcript $lang_dir/text
-    fi
+#     if [ ! -f $lang_dir/bbpe.model ]; then
+#       ./local/train_bbpe_model.py \
+#         --lang-dir $lang_dir \
+#         --vocab-size $vocab_size \
+#         --transcript $lang_dir/text
+#     fi

-    if [ ! -f $lang_dir/L_disambig.pt ]; then
-      ./local/prepare_lang_bbpe.py --lang-dir $lang_dir
-    fi
-  done
-fi
+#     if [ ! -f $lang_dir/L_disambig.pt ]; then
+#       ./local/prepare_lang_bbpe.py --lang-dir $lang_dir
+#     fi
+#   done
+# fi

-if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
-  log "Stage 8: Prepare G"
+# if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
+#   log "Stage 8: Prepare G"

-  mkdir -p data/lm
+#   mkdir -p data/lm

-  # Train LM on transcripts
-  if [ ! -f data/lm/3-gram.unpruned.arpa ]; then
-    python3 ./shared/make_kn_lm.py \
-      -ngram-order 3 \
-      -text $lang_char_dir/transcript_words.txt \
-      -lm data/lm/3-gram.unpruned.arpa
-  fi
+#   # Train LM on transcripts
+#   if [ ! -f data/lm/3-gram.unpruned.arpa ]; then
+#     python3 ./shared/make_kn_lm.py \
+#       -ngram-order 3 \
+#       -text $lang_char_dir/transcript_words.txt \
+#       -lm data/lm/3-gram.unpruned.arpa
+#   fi

   # We assume you have installed kaldilm, if not, please install
   # it using: pip install kaldilm

@@ -285,112 +294,112 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
   fi
 fi

-if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
-  log "Stage 9: Compile LG & HLG"
-  ./local/compile_hlg.py --lang-dir $lang_phone_dir --lm G_3_gram_phone
-  ./local/compile_hlg.py --lang-dir $lang_char_dir --lm G_3_gram_char
-  for vocab_size in ${vocab_sizes[@]}; do
-    lang_dir=data/lang_bbpe_${vocab_size}
-    ./local/compile_hlg.py --lang-dir $lang_dir --lm G_3_gram_char
-  done
+# if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
+#   log "Stage 9: Compile LG & HLG"
+#   ./local/compile_hlg.py --lang-dir $lang_phone_dir --lm G_3_gram_phone
+#   ./local/compile_hlg.py --lang-dir $lang_char_dir --lm G_3_gram_char
+#   for vocab_size in ${vocab_sizes[@]}; do
+#     lang_dir=data/lang_bbpe_${vocab_size}
+#     ./local/compile_hlg.py --lang-dir $lang_dir --lm G_3_gram_char
+#   done

-  ./local/compile_lg.py --lang-dir $lang_phone_dir --lm G_3_gram_phone
-  ./local/compile_lg.py --lang-dir $lang_char_dir --lm G_3_gram_char
-  for vocab_size in ${vocab_sizes[@]}; do
-    lang_dir=data/lang_bbpe_${vocab_size}
-    ./local/compile_lg.py --lang-dir $lang_dir --lm G_3_gram_char
-  done
-fi
+#   ./local/compile_lg.py --lang-dir $lang_phone_dir --lm G_3_gram_phone
+#   ./local/compile_lg.py --lang-dir $lang_char_dir --lm G_3_gram_char
+#   for vocab_size in ${vocab_sizes[@]}; do
+#     lang_dir=data/lang_bbpe_${vocab_size}
+#     ./local/compile_lg.py --lang-dir $lang_dir --lm G_3_gram_char
+#   done
+# fi

-if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
-  log "Stage 10: Generate LM training data"
+# if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
+#   log "Stage 10: Generate LM training data"

-  log "Processing char based data"
-  out_dir=data/lm_training_char
-  mkdir -p $out_dir $dl_dir/lm
+#   log "Processing char based data"
+#   out_dir=data/lm_training_char
+#   mkdir -p $out_dir $dl_dir/lm

-  if [ ! -f $dl_dir/lm/aishell-train-word.txt ]; then
-    cp $lang_phone_dir/transcript_words.txt $dl_dir/lm/aishell-train-word.txt
-  fi
+#   if [ ! -f $dl_dir/lm/aishell-train-word.txt ]; then
+#     cp $lang_phone_dir/transcript_words.txt $dl_dir/lm/aishell-train-word.txt
+#   fi

-  # training words
-  ./local/prepare_char_lm_training_data.py \
-    --lang-char data/lang_char \
-    --lm-data $dl_dir/lm/aishell-train-word.txt \
-    --lm-archive $out_dir/lm_data.pt
+#   # training words
+#   ./local/prepare_char_lm_training_data.py \
+#     --lang-char data/lang_char \
+#     --lm-data $dl_dir/lm/aishell-train-word.txt \
+#     --lm-archive $out_dir/lm_data.pt

-  # valid words
-  if [ ! -f $dl_dir/lm/aishell-valid-word.txt ]; then
-    aishell_text=$dl_dir/aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
-    aishell_valid_uid=$dl_dir/aishell/data_aishell/transcript/aishell_valid_uid
-    find $dl_dir/aishell/data_aishell/wav/dev -name "*.wav" | sed 's/\.wav//g' | awk -F '/' '{print $NF}' > $aishell_valid_uid
-    awk 'NR==FNR{uid[$1]=$1} NR!=FNR{if($1 in uid) print $0}' $aishell_valid_uid $aishell_text |
-      cut -d " " -f 2- > $dl_dir/lm/aishell-valid-word.txt
-  fi
+#   # valid words
+#   if [ ! -f $dl_dir/lm/aishell-valid-word.txt ]; then
+#     aishell_text=$dl_dir/aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
+#     aishell_valid_uid=$dl_dir/aishell/data_aishell/transcript/aishell_valid_uid
+#     find $dl_dir/aishell/data_aishell/wav/dev -name "*.wav" | sed 's/\.wav//g' | awk -F '/' '{print $NF}' > $aishell_valid_uid
+#     awk 'NR==FNR{uid[$1]=$1} NR!=FNR{if($1 in uid) print $0}' $aishell_valid_uid $aishell_text |
+#       cut -d " " -f 2- > $dl_dir/lm/aishell-valid-word.txt
+#   fi

-  ./local/prepare_char_lm_training_data.py \
-    --lang-char data/lang_char \
-    --lm-data $dl_dir/lm/aishell-valid-word.txt \
-    --lm-archive $out_dir/lm_data_valid.pt
+#   ./local/prepare_char_lm_training_data.py \
+#     --lang-char data/lang_char \
+#     --lm-data $dl_dir/lm/aishell-valid-word.txt \
+#     --lm-archive $out_dir/lm_data_valid.pt

-  # test words
-  if [ ! -f $dl_dir/lm/aishell-test-word.txt ]; then
-    aishell_text=$dl_dir/aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
-    aishell_test_uid=$dl_dir/aishell/data_aishell/transcript/aishell_test_uid
-    find $dl_dir/aishell/data_aishell/wav/test -name "*.wav" | sed 's/\.wav//g' | awk -F '/' '{print $NF}' > $aishell_test_uid
-    awk 'NR==FNR{uid[$1]=$1} NR!=FNR{if($1 in uid) print $0}' $aishell_test_uid $aishell_text |
-      cut -d " " -f 2- > $dl_dir/lm/aishell-test-word.txt
-  fi
+#   # test words
+#   if [ ! -f $dl_dir/lm/aishell-test-word.txt ]; then
+#     aishell_text=$dl_dir/aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
+#     aishell_test_uid=$dl_dir/aishell/data_aishell/transcript/aishell_test_uid
+#     find $dl_dir/aishell/data_aishell/wav/test -name "*.wav" | sed 's/\.wav//g' | awk -F '/' '{print $NF}' > $aishell_test_uid
+#     awk 'NR==FNR{uid[$1]=$1} NR!=FNR{if($1 in uid) print $0}' $aishell_test_uid $aishell_text |
+#       cut -d " " -f 2- > $dl_dir/lm/aishell-test-word.txt
+#   fi

-  ./local/prepare_char_lm_training_data.py \
-    --lang-char data/lang_char \
-    --lm-data $dl_dir/lm/aishell-test-word.txt \
-    --lm-archive $out_dir/lm_data_test.pt
-fi
+#   ./local/prepare_char_lm_training_data.py \
+#     --lang-char data/lang_char \
+#     --lm-data $dl_dir/lm/aishell-test-word.txt \
+#     --lm-archive $out_dir/lm_data_test.pt
+# fi

-if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
-  log "Stage 11: Sort LM training data"
-  # Sort LM training data by sentence length in descending order
-  # for ease of training.
-  #
-  # Sentence length equals to the number of tokens
-  # in a sentence.
+# if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
+#   log "Stage 11: Sort LM training data"
+#   # Sort LM training data by sentence length in descending order
+#   # for ease of training.
+#   #
+#   # Sentence length equals to the number of tokens
+#   # in a sentence.

-  out_dir=data/lm_training_char
-  mkdir -p $out_dir
-  ln -snf ../../../librispeech/ASR/local/sort_lm_training_data.py local/
+#   out_dir=data/lm_training_char
+#   mkdir -p $out_dir
+#   ln -snf ../../../librispeech/ASR/local/sort_lm_training_data.py local/

-  ./local/sort_lm_training_data.py \
-    --in-lm-data $out_dir/lm_data.pt \
-    --out-lm-data $out_dir/sorted_lm_data.pt \
-    --out-statistics $out_dir/statistics.txt
+#   ./local/sort_lm_training_data.py \
+#     --in-lm-data $out_dir/lm_data.pt \
+#     --out-lm-data $out_dir/sorted_lm_data.pt \
+#     --out-statistics $out_dir/statistics.txt

-  ./local/sort_lm_training_data.py \
-    --in-lm-data $out_dir/lm_data_valid.pt \
-    --out-lm-data $out_dir/sorted_lm_data-valid.pt \
-    --out-statistics $out_dir/statistics-valid.txt
+#   ./local/sort_lm_training_data.py \
+#     --in-lm-data $out_dir/lm_data_valid.pt \
+#     --out-lm-data $out_dir/sorted_lm_data-valid.pt \
+#     --out-statistics $out_dir/statistics-valid.txt

-  ./local/sort_lm_training_data.py \
-    --in-lm-data $out_dir/lm_data_test.pt \
-    --out-lm-data $out_dir/sorted_lm_data-test.pt \
-    --out-statistics $out_dir/statistics-test.txt
-fi
+#   ./local/sort_lm_training_data.py \
+#     --in-lm-data $out_dir/lm_data_test.pt \
+#     --out-lm-data $out_dir/sorted_lm_data-test.pt \
+#     --out-statistics $out_dir/statistics-test.txt
+# fi

-if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
-  log "Stage 11: Train RNN LM model"
-  python ../../../icefall/rnn_lm/train.py \
-    --start-epoch 0 \
-    --world-size 1 \
-    --num-epochs 20 \
-    --use-fp16 0 \
-    --embedding-dim 512 \
-    --hidden-dim 512 \
-    --num-layers 2 \
-    --batch-size 400 \
-    --exp-dir rnnlm_char/exp \
-    --lm-data $out_dir/sorted_lm_data.pt \
-    --lm-data-valid $out_dir/sorted_lm_data-valid.pt \
-    --vocab-size 4336 \
-    --master-port 12345
-fi
+# if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
+#   log "Stage 11: Train RNN LM model"
+#   python ../../../icefall/rnn_lm/train.py \
+#     --start-epoch 0 \
+#     --world-size 1 \
+#     --num-epochs 20 \
+#     --use-fp16 0 \
+#     --embedding-dim 512 \
+#     --hidden-dim 512 \
+#     --num-layers 2 \
+#     --batch-size 400 \
+#     --exp-dir rnnlm_char/exp \
+#     --lm-data $out_dir/sorted_lm_data.pt \
+#     --lm-data-valid $out_dir/sorted_lm_data-valid.pt \
+#     --vocab-size 4336 \
+#     --master-port 12345
+# fi
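With MUSAN and all of the lang/LM stages switched off, the whisper fine-tuning path only needs the download and manifest stages plus the new feature stage, e.g. ./prepare.sh --stage 300 --stop-stage 300 once the data is in place. A quick sanity check after stage 300 (not part of the patch; the manifest path follows the recipe's aishell_cuts_* naming convention):

    # Confirm the training cuts now carry 128-dim Whisper fbank features.
    from lhotse import load_manifest_lazy

    cuts = load_manifest_lazy("data/fbank/aishell_cuts_train.jsonl.gz")
    first = next(iter(cuts))
    print(first.features.num_features)  # expected: 128
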
diff --git a/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py b/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py
index 293a1a569..df21a9508 100644
--- a/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py
+++ b/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py
@@ -176,7 +176,7 @@
         group.add_argument(
             "--enable-musan",
             type=str2bool,
-            default=True,
+            default=False,
             help="When enabled, select noise from MUSAN and mix it"
             "with training dataset. ",
         )
@@ -192,11 +192,11 @@
           The state dict for the training sampler.
         """
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz")

         transforms = []
         if self.args.enable_musan:
             logging.info("Enable MUSAN")
+            cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz")
             transforms.append(
                 CutMix(cuts=cuts_musan, p=0.5, snr=(10, 20), preserve_id=True)
             )
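Moving the load_manifest call inside the branch means a run with --enable-musan False (now the default) never touches musan_cuts.jsonl.gz, which is why the MUSAN download and manifest stages could be commented out of prepare.sh above. A condensed sketch of the resulting control flow (the args namespace is hypothetical; the CutMix arguments mirror the datamodule's own call):

    from lhotse import load_manifest
    from lhotse.dataset import CutMix

    def build_transforms(args):
        transforms = []
        if args.enable_musan:  # defaults to False after this patch
            cuts_musan = load_manifest(args.manifest_dir / "musan_cuts.jsonl.gz")
            transforms.append(
                CutMix(cuts=cuts_musan, p=0.5, snr=(10, 20), preserve_id=True)
            )
        return transforms
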
diff --git a/egs/aishell/ASR/whisper/decode.py b/egs/aishell/ASR/whisper/decode.py
index 578546bbe..2d8dbbfc3 100644
--- a/egs/aishell/ASR/whisper/decode.py
+++ b/egs/aishell/ASR/whisper/decode.py
@@ -127,6 +127,15 @@
         help="The experiment dir",
     )

+    parser.add_argument(
+        "--model-name",
+        type=str,
+        default="large-v2",
+        choices=["large-v2", "large-v3", "medium", "small", "tiny"],
+        help="""The model name to use.
+        """,
+    )
+
     return parser


@@ -370,7 +379,7 @@ def main():

     logging.info(f"device: {device}")

-    model = whisper.load_model("medium")
+    model = whisper.load_model(params.model_name)
     if params.epoch > 0:
         if params.avg > 1:
             start = params.epoch - params.avg
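The new --model-name flag replaces the hard-coded "medium" so decoding loads the same architecture that was fine-tuned. Under the hood this is just openai-whisper's loader resolving a name against its _MODELS table, which this patch extends with large-v3 in model.py further down:

    # Sketch: download (or reuse a cached copy of) a checkpoint by name.
    import whisper

    model = whisper.load_model("large-v2")  # any of the --model-name choices
    print(model.dims.n_vocab)  # 51865 for large-v2, 51866 for large-v3
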
diff --git a/egs/aishell/ASR/whisper/ds_config_zero1.json b/egs/aishell/ASR/whisper/ds_config_zero1.json
index 59318968e..0d69f83b9 100644
--- a/egs/aishell/ASR/whisper/ds_config_zero1.json
+++ b/egs/aishell/ASR/whisper/ds_config_zero1.json
@@ -2,10 +2,10 @@
   "fp16": {
     "enabled": true,
     "loss_scale": 0,
-    "loss_scale_window": 1000,
+    "loss_scale_window": 100,
     "initial_scale_power": 16,
     "hysteresis": 2,
-    "min_loss_scale": 1
+    "min_loss_scale": 0.01
   },
   "zero_optimization": {
     "stage": 1,
@@ -19,8 +19,8 @@
   "scheduler": {
     "type": "WarmupLR",
     "params": {
-      "warmup_min_lr": 5e-6,
-      "warmup_max_lr": 1e-5,
+      "warmup_min_lr": 1e-6,
+      "warmup_max_lr": 5e-6,
       "warmup_num_steps": 100
     }
   },
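The fp16 changes (a 10x shorter loss-scale window and a floor of 0.01 instead of 1) let the dynamic loss scaler react faster and back off further when fine-tuning overflows, and the warmup LR range is lowered to suit a large pretrained model. The JSON is consumed as-is by deepspeed.initialize; a minimal sketch with a stand-in module:

    import deepspeed
    import torch.nn as nn

    model = nn.Linear(10, 10)  # stand-in for the Whisper model
    engine, optimizer, _, scheduler = deepspeed.initialize(
        model=model,
        model_parameters=model.parameters(),
        config="ds_config_zero1.json",  # the file edited above
    )
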
diff --git a/egs/aishell/ASR/whisper/model.py b/egs/aishell/ASR/whisper/model.py
index 89d76383a..4e0ef28fa 100644
--- a/egs/aishell/ASR/whisper/model.py
+++ b/egs/aishell/ASR/whisper/model.py
@@ -2,6 +2,8 @@ import torch
 import torch.nn as nn
 import base64
 import gzip
+import warnings
+from tqdm import tqdm
 from dataclasses import dataclass
 from typing import Dict, Iterable, Optional, Union
 import os
@@ -275,6 +277,10 @@ class Whisper(nn.Module):
     @property
     def is_multilingual(self):
-        return self.dims.n_vocab == 51865
+        return self.dims.n_vocab >= 51865
+
+    @property
+    def num_languages(self):
+        return self.dims.n_vocab - 51765 - int(self.is_multilingual)

     def install_kv_cache_hooks(self, cache: Optional[dict] = None):
         """
@@ -324,6 +330,7 @@ _MODELS = {
     "medium": "https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt",
     "large-v1": "https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large-v1.pt",
     "large-v2": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
+    "large-v3": "https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt",
     "large": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
 }

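The two properties follow upstream openai-whisper: every multilingual vocabulary has at least 51865 entries, and large-v3 adds exactly one more language. A worked check of the arithmetic:

    # num_languages = n_vocab - 51765 - int(is_multilingual)
    def num_languages(n_vocab: int) -> int:
        is_multilingual = n_vocab >= 51865
        return n_vocab - 51765 - int(is_multilingual)

    assert num_languages(51865) == 99   # large-v2 and earlier multilingual models
    assert num_languages(51866) == 100  # large-v3 adds one language (Cantonese)
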
""" logging.info("About to get Musan cuts") - cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz") transforms = [] if self.args.enable_musan: logging.info("Enable MUSAN") + cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz") transforms.append( CutMix(cuts=cuts_musan, p=0.5, snr=(10, 20), preserve_id=True) ) diff --git a/egs/aishell/ASR/whisper/decode.py b/egs/aishell/ASR/whisper/decode.py index 578546bbe..2d8dbbfc3 100644 --- a/egs/aishell/ASR/whisper/decode.py +++ b/egs/aishell/ASR/whisper/decode.py @@ -127,6 +127,15 @@ def get_parser(): help="The experiment dir", ) + parser.add_argument( + "--model-name", + type=str, + default="large-v2", + choices=["large-v2", "large-v3", "medium", "small", "tiny"], + help="""The model name to use. + """, + ) + return parser @@ -370,7 +379,7 @@ def main(): logging.info(f"device: {device}") - model = whisper.load_model("medium") + model = whisper.load_model(params.model_name) if params.epoch > 0: if params.avg > 1: start = params.epoch - params.avg diff --git a/egs/aishell/ASR/whisper/ds_config_zero1.json b/egs/aishell/ASR/whisper/ds_config_zero1.json index 59318968e..0d69f83b9 100644 --- a/egs/aishell/ASR/whisper/ds_config_zero1.json +++ b/egs/aishell/ASR/whisper/ds_config_zero1.json @@ -2,10 +2,10 @@ "fp16": { "enabled": true, "loss_scale": 0, - "loss_scale_window": 1000, + "loss_scale_window": 100, "initial_scale_power": 16, "hysteresis": 2, - "min_loss_scale": 1 + "min_loss_scale": 0.01 }, "zero_optimization": { "stage": 1, @@ -19,8 +19,8 @@ "scheduler": { "type": "WarmupLR", "params": { - "warmup_min_lr": 5e-6, - "warmup_max_lr": 1e-5, + "warmup_min_lr": 1e-6, + "warmup_max_lr": 5e-6, "warmup_num_steps": 100 } }, diff --git a/egs/aishell/ASR/whisper/model.py b/egs/aishell/ASR/whisper/model.py index 89d76383a..4e0ef28fa 100644 --- a/egs/aishell/ASR/whisper/model.py +++ b/egs/aishell/ASR/whisper/model.py @@ -2,6 +2,8 @@ import torch import torch.nn as nn import base64 import gzip +import warnings +from tqdm import tqdm from dataclasses import dataclass from typing import Dict, Iterable, Optional, Union import os @@ -275,6 +277,11 @@ class Whisper(nn.Module): @property def is_multilingual(self): return self.dims.n_vocab == 51865 + return self.dims.n_vocab >= 51865 + + @property + def num_languages(self): + return self.dims.n_vocab - 51765 - int(self.is_multilingual) def install_kv_cache_hooks(self, cache: Optional[dict] = None): """ @@ -324,6 +331,7 @@ _MODELS = { "medium": "https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt", "large-v1": "https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large-v1.pt", "large-v2": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt", + "large-v3": "https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt", "large": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt", } diff --git a/egs/aishell/ASR/whisper/requirements.txt b/egs/aishell/ASR/whisper/requirements.txt index b71e4a4ad..52bcf1196 100644 --- a/egs/aishell/ASR/whisper/requirements.txt +++ b/egs/aishell/ASR/whisper/requirements.txt @@ -1,11 +1,11 @@ k2 kaldialign -lhotse==1.18 -#git+https://github.com/lhotse-speech/lhotse +#lhotse==1.18 
diff --git a/egs/aishell/ASR/whisper/train.py b/egs/aishell/ASR/whisper/train.py
index 158ad9443..d2937c9ee 100644
--- a/egs/aishell/ASR/whisper/train.py
+++ b/egs/aishell/ASR/whisper/train.py
@@ -796,7 +796,7 @@ def run(rank, world_size, args):
     logging.info(f"Number of model parameters: {num_param}")

     tokenizer = whisper.tokenizer.get_tokenizer(
-        model.is_multilingual, language="zh", task="transcribe"
+        model.is_multilingual, num_languages=model.num_languages, language="zh", task="transcribe"
     )

     assert params.save_every_n >= params.average_period
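Recent openai-whisper releases (such as the 20231117 pin in requirements.txt above) accept a num_languages argument in get_tokenizer so the language-token offsets stay correct for large-v3; passing model.num_languages, added in model.py, keeps both model generations working. For reference, the call with concrete values substituted (a sketch, not the training code itself):

    import whisper

    tokenizer = whisper.tokenizer.get_tokenizer(
        True,               # model.is_multilingual
        num_languages=100,  # model.num_languages; 99 for large-v2-era checkpoints
        language="zh",
        task="transcribe",
    )
    print(tokenizer.sot_sequence)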