support large-v3

Yuekai Zhang 2024-01-14 18:27:41 +08:00
parent fa7ad4dc72
commit 2ce09809cd
8 changed files with 246 additions and 220 deletions

View File

@@ -49,8 +49,8 @@ def compute_fbank_aishell(num_mel_bins: int = 80, perturb_speed: bool = False):
     dataset_parts = (
         "train",
-        #"dev",
-        #"test",
+        "dev",
+        "test",
     )
     prefix = "aishell"
     suffix = "jsonl.gz"
@@ -69,7 +69,7 @@ def compute_fbank_aishell(num_mel_bins: int = 80, perturb_speed: bool = False):
         dataset_parts,
     )
-    extractor = WhisperFbank(WhisperFbankConfig(device='cuda'))
+    extractor = WhisperFbank(WhisperFbankConfig(num_filters=num_mel_bins, device='cuda'))
     with get_executor() as ex:  # Initialize the executor only once.
         for partition, m in manifests.items():
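
Note: large-v3 moves Whisper's front end from 80 to 128 mel bins, which is why the filter count is now plumbed through from --num-mel-bins. A minimal sketch of the new extractor call, assuming lhotse installed from git (see requirements.txt below) and a hypothetical 16 kHz mono wav:

    from lhotse import Recording, WhisperFbank, WhisperFbankConfig

    # large-v3 expects 128 mel bins; earlier Whisper checkpoints use 80.
    extractor = WhisperFbank(WhisperFbankConfig(num_filters=128, device="cuda"))
    recording = Recording.from_file("BAC009S0002W0122.wav")  # hypothetical file
    samples = recording.load_audio()[0]  # mono samples as a 1-D array
    feats = extractor.extract(samples, sampling_rate=recording.sampling_rate)
    print(feats.shape)  # (num_frames, 128)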

View File

@@ -83,9 +83,9 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   #
   # ln -sfv /path/to/musan $dl_dir/musan
   #
-  if [ ! -d $dl_dir/musan ]; then
-    lhotse download musan $dl_dir
-  fi
+  # if [ ! -d $dl_dir/musan ]; then
+  #   lhotse download musan $dl_dir
+  # fi
 fi

 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
@@ -99,17 +99,17 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
   fi
 fi

-if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Prepare musan manifest"
-  # We assume that you have downloaded the musan corpus
-  # to data/musan
-  if [ ! -f data/manifests/.musan_manifests.done ]; then
-    log "It may take 6 minutes"
-    mkdir -p data/manifests
-    lhotse prepare musan $dl_dir/musan data/manifests
-    touch data/manifests/.musan_manifests.done
-  fi
-fi
+# if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+#   log "Stage 2: Prepare musan manifest"
+#   # We assume that you have downloaded the musan corpus
+#   # to data/musan
+#   if [ ! -f data/manifests/.musan_manifests.done ]; then
+#     log "It may take 6 minutes"
+#     mkdir -p data/manifests
+#     lhotse prepare musan $dl_dir/musan data/manifests
+#     touch data/manifests/.musan_manifests.done
+#   fi
+# fi

 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   log "Stage 3: Compute fbank for aishell"
@@ -120,47 +120,56 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   fi
 fi

-if [ $stage -le 30 ] && [ $stop_stage -ge 30 ]; then
-  log "Stage 30: Compute whisper fbank for aishell"
-  if [ ! -f data/fbank/.aishell.done ]; then
-    mkdir -p data/fbank
-    ./local/compute_whisper_fbank_aishell.py --perturb-speed True
-    touch data/fbank/.aishell.done
-  fi
-fi
+# if [ $stage -le 30 ] && [ $stop_stage -ge 30 ]; then
+#   log "Stage 30: Compute whisper fbank for aishell"
+#   if [ ! -f data/fbank/.aishell.done ]; then
+#     mkdir -p data/fbank
+#     ./local/compute_whisper_fbank_aishell.py --perturb-speed True
+#     touch data/fbank/.aishell.done
+#   fi
+# fi
+
+if [ $stage -le 300 ] && [ $stop_stage -ge 300 ]; then
+  log "Stage 30: Compute whisper fbank for aishell"
+  if [ ! -f data/fbank/.aishell.done ]; then
+    mkdir -p data/fbank
+    ./local/compute_whisper_fbank_aishell.py --perturb-speed True --num-mel-bins 128
+    touch data/fbank/.aishell.done
+  fi
+fi

-if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-  log "Stage 4: Compute fbank for musan"
-  if [ ! -f data/fbank/.msuan.done ]; then
-    mkdir -p data/fbank
-    ./local/compute_fbank_musan.py
-    touch data/fbank/.msuan.done
-  fi
-fi
+# if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+#   log "Stage 4: Compute fbank for musan"
+#   if [ ! -f data/fbank/.msuan.done ]; then
+#     mkdir -p data/fbank
+#     ./local/compute_fbank_musan.py
+#     touch data/fbank/.msuan.done
+#   fi
+# fi

-if [ $stage -le 40 ] && [ $stop_stage -ge 40 ]; then
-  log "Stage 4: Compute fbank for musan"
-  if [ ! -f data/fbank/.msuan.done ]; then
-    mkdir -p data/fbank
-    ./local/compute_whisper_fbank_musan.py
-    touch data/fbank/.msuan.done
-  fi
-fi
+# if [ $stage -le 40 ] && [ $stop_stage -ge 40 ]; then
+#   log "Stage 4: Compute fbank for musan"
+#   if [ ! -f data/fbank/.msuan.done ]; then
+#     mkdir -p data/fbank
+#     ./local/compute_whisper_fbank_musan.py
+#     touch data/fbank/.msuan.done
+#   fi
+# fi

-lang_phone_dir=data/lang_phone
-if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 5: Prepare phone based lang"
-  mkdir -p $lang_phone_dir
-  (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
-    cat - $dl_dir/aishell/resource_aishell/lexicon.txt |
-    sort | uniq > $lang_phone_dir/lexicon.txt
-  ./local/generate_unique_lexicon.py --lang-dir $lang_phone_dir
-  if [ ! -f $lang_phone_dir/L_disambig.pt ]; then
-    ./local/prepare_lang.py --lang-dir $lang_phone_dir
-  fi
+# lang_phone_dir=data/lang_phone
+# if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+#   log "Stage 5: Prepare phone based lang"
+#   mkdir -p $lang_phone_dir
+#   (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
+#     cat - $dl_dir/aishell/resource_aishell/lexicon.txt |
+#     sort | uniq > $lang_phone_dir/lexicon.txt
+#   ./local/generate_unique_lexicon.py --lang-dir $lang_phone_dir
+#   if [ ! -f $lang_phone_dir/L_disambig.pt ]; then
+#     ./local/prepare_lang.py --lang-dir $lang_phone_dir
+#   fi

   # Train a bigram P for MMI training
@@ -173,93 +182,93 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
       cut -d " " -f 2- > $lang_phone_dir/transcript_words.txt
   fi

-  if [ ! -f $lang_phone_dir/transcript_tokens.txt ]; then
-    ./local/convert_transcript_words_to_tokens.py \
-      --lexicon $lang_phone_dir/uniq_lexicon.txt \
-      --transcript $lang_phone_dir/transcript_words.txt \
-      --oov "<UNK>" \
-      > $lang_phone_dir/transcript_tokens.txt
-  fi
+#   if [ ! -f $lang_phone_dir/transcript_tokens.txt ]; then
+#     ./local/convert_transcript_words_to_tokens.py \
+#       --lexicon $lang_phone_dir/uniq_lexicon.txt \
+#       --transcript $lang_phone_dir/transcript_words.txt \
+#       --oov "<UNK>" \
+#       > $lang_phone_dir/transcript_tokens.txt
+#   fi

-  if [ ! -f $lang_phone_dir/P.arpa ]; then
-    ./shared/make_kn_lm.py \
-      -ngram-order 2 \
-      -text $lang_phone_dir/transcript_tokens.txt \
-      -lm $lang_phone_dir/P.arpa
-  fi
+#   if [ ! -f $lang_phone_dir/P.arpa ]; then
+#     ./shared/make_kn_lm.py \
+#       -ngram-order 2 \
+#       -text $lang_phone_dir/transcript_tokens.txt \
+#       -lm $lang_phone_dir/P.arpa
+#   fi

-  if [ ! -f $lang_phone_dir/P.fst.txt ]; then
-    python3 -m kaldilm \
-      --read-symbol-table="$lang_phone_dir/tokens.txt" \
-      --disambig-symbol='#0' \
-      --max-order=2 \
-      $lang_phone_dir/P.arpa > $lang_phone_dir/P.fst.txt
-  fi
-fi
+#   if [ ! -f $lang_phone_dir/P.fst.txt ]; then
+#     python3 -m kaldilm \
+#       --read-symbol-table="$lang_phone_dir/tokens.txt" \
+#       --disambig-symbol='#0' \
+#       --max-order=2 \
+#       $lang_phone_dir/P.arpa > $lang_phone_dir/P.fst.txt
+#   fi
+# fi

-lang_char_dir=data/lang_char
-if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "Stage 6: Prepare char based lang"
-  mkdir -p $lang_char_dir
-  # We reuse words.txt from phone based lexicon
-  # so that the two can share G.pt later.
-  # The transcripts in training set, generated in stage 5
-  cp $lang_phone_dir/transcript_words.txt $lang_char_dir/transcript_words.txt
-  cat $dl_dir/aishell/data_aishell/transcript/aishell_transcript_v0.8.txt |
-    cut -d " " -f 2- > $lang_char_dir/text
-  (echo '<eps> 0'; echo '!SIL 1'; echo '<SPOKEN_NOISE> 2'; echo '<UNK> 3';) \
-    > $lang_char_dir/words.txt
-  cat $lang_char_dir/text | sed 's/ /\n/g' | sort -u | sed '/^$/d' \
-    | awk '{print $1" "NR+3}' >> $lang_char_dir/words.txt
-  num_lines=$(< $lang_char_dir/words.txt wc -l)
-  (echo "#0 $num_lines"; echo "<s> $(($num_lines + 1))"; echo "</s> $(($num_lines + 2))";) \
-    >> $lang_char_dir/words.txt
-  if [ ! -f $lang_char_dir/L_disambig.pt ]; then
-    ./local/prepare_char.py --lang-dir $lang_char_dir
-  fi
-fi
+# lang_char_dir=data/lang_char
+# if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
+#   log "Stage 6: Prepare char based lang"
+#   mkdir -p $lang_char_dir
+#   # We reuse words.txt from phone based lexicon
+#   # so that the two can share G.pt later.
+#   # The transcripts in training set, generated in stage 5
+#   cp $lang_phone_dir/transcript_words.txt $lang_char_dir/transcript_words.txt
+#   cat $dl_dir/aishell/data_aishell/transcript/aishell_transcript_v0.8.txt |
+#     cut -d " " -f 2- > $lang_char_dir/text
+#   (echo '<eps> 0'; echo '!SIL 1'; echo '<SPOKEN_NOISE> 2'; echo '<UNK> 3';) \
+#     > $lang_char_dir/words.txt
+#   cat $lang_char_dir/text | sed 's/ /\n/g' | sort -u | sed '/^$/d' \
+#     | awk '{print $1" "NR+3}' >> $lang_char_dir/words.txt
+#   num_lines=$(< $lang_char_dir/words.txt wc -l)
+#   (echo "#0 $num_lines"; echo "<s> $(($num_lines + 1))"; echo "</s> $(($num_lines + 2))";) \
+#     >> $lang_char_dir/words.txt
+#   if [ ! -f $lang_char_dir/L_disambig.pt ]; then
+#     ./local/prepare_char.py --lang-dir $lang_char_dir
+#   fi
+# fi

-if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
-  log "Stage 7: Prepare Byte BPE based lang"
-  for vocab_size in ${vocab_sizes[@]}; do
-    lang_dir=data/lang_bbpe_${vocab_size}
-    mkdir -p $lang_dir
-    cp $lang_char_dir/words.txt $lang_dir
-    cp $lang_char_dir/text $lang_dir
-    if [ ! -f $lang_dir/bbpe.model ]; then
-      ./local/train_bbpe_model.py \
-        --lang-dir $lang_dir \
-        --vocab-size $vocab_size \
-        --transcript $lang_dir/text
-    fi
-    if [ ! -f $lang_dir/L_disambig.pt ]; then
-      ./local/prepare_lang_bbpe.py --lang-dir $lang_dir
-    fi
-  done
-fi
+# if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
+#   log "Stage 7: Prepare Byte BPE based lang"
+#   for vocab_size in ${vocab_sizes[@]}; do
+#     lang_dir=data/lang_bbpe_${vocab_size}
+#     mkdir -p $lang_dir
+#     cp $lang_char_dir/words.txt $lang_dir
+#     cp $lang_char_dir/text $lang_dir
+#     if [ ! -f $lang_dir/bbpe.model ]; then
+#       ./local/train_bbpe_model.py \
+#         --lang-dir $lang_dir \
+#         --vocab-size $vocab_size \
+#         --transcript $lang_dir/text
+#     fi
+#     if [ ! -f $lang_dir/L_disambig.pt ]; then
+#       ./local/prepare_lang_bbpe.py --lang-dir $lang_dir
+#     fi
+#   done
+# fi

-if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
-  log "Stage 8: Prepare G"
-  mkdir -p data/lm
-  # Train LM on transcripts
-  if [ ! -f data/lm/3-gram.unpruned.arpa ]; then
-    python3 ./shared/make_kn_lm.py \
-      -ngram-order 3 \
-      -text $lang_char_dir/transcript_words.txt \
-      -lm data/lm/3-gram.unpruned.arpa
-  fi
+# if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
+#   log "Stage 8: Prepare G"
+#   mkdir -p data/lm
+#   # Train LM on transcripts
+#   if [ ! -f data/lm/3-gram.unpruned.arpa ]; then
+#     python3 ./shared/make_kn_lm.py \
+#       -ngram-order 3 \
+#       -text $lang_char_dir/transcript_words.txt \
+#       -lm data/lm/3-gram.unpruned.arpa
+#   fi

   # We assume you have installed kaldilm, if not, please install
   # it using: pip install kaldilm
@@ -285,112 +294,112 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
   fi
 fi

-if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
-  log "Stage 9: Compile LG & HLG"
-  ./local/compile_hlg.py --lang-dir $lang_phone_dir --lm G_3_gram_phone
-  ./local/compile_hlg.py --lang-dir $lang_char_dir --lm G_3_gram_char
-  for vocab_size in ${vocab_sizes[@]}; do
-    lang_dir=data/lang_bbpe_${vocab_size}
-    ./local/compile_hlg.py --lang-dir $lang_dir --lm G_3_gram_char
-  done
-  ./local/compile_lg.py --lang-dir $lang_phone_dir --lm G_3_gram_phone
-  ./local/compile_lg.py --lang-dir $lang_char_dir --lm G_3_gram_char
-  for vocab_size in ${vocab_sizes[@]}; do
-    lang_dir=data/lang_bbpe_${vocab_size}
-    ./local/compile_lg.py --lang-dir $lang_dir --lm G_3_gram_char
-  done
-fi
+# if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
+#   log "Stage 9: Compile LG & HLG"
+#   ./local/compile_hlg.py --lang-dir $lang_phone_dir --lm G_3_gram_phone
+#   ./local/compile_hlg.py --lang-dir $lang_char_dir --lm G_3_gram_char
+#   for vocab_size in ${vocab_sizes[@]}; do
+#     lang_dir=data/lang_bbpe_${vocab_size}
+#     ./local/compile_hlg.py --lang-dir $lang_dir --lm G_3_gram_char
+#   done
+#   ./local/compile_lg.py --lang-dir $lang_phone_dir --lm G_3_gram_phone
+#   ./local/compile_lg.py --lang-dir $lang_char_dir --lm G_3_gram_char
+#   for vocab_size in ${vocab_sizes[@]}; do
+#     lang_dir=data/lang_bbpe_${vocab_size}
+#     ./local/compile_lg.py --lang-dir $lang_dir --lm G_3_gram_char
+#   done
+# fi

-if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
-  log "Stage 10: Generate LM training data"
-  log "Processing char based data"
-  out_dir=data/lm_training_char
-  mkdir -p $out_dir $dl_dir/lm
-  if [ ! -f $dl_dir/lm/aishell-train-word.txt ]; then
-    cp $lang_phone_dir/transcript_words.txt $dl_dir/lm/aishell-train-word.txt
-  fi
-  # training words
-  ./local/prepare_char_lm_training_data.py \
-    --lang-char data/lang_char \
-    --lm-data $dl_dir/lm/aishell-train-word.txt \
-    --lm-archive $out_dir/lm_data.pt
-  # valid words
-  if [ ! -f $dl_dir/lm/aishell-valid-word.txt ]; then
-    aishell_text=$dl_dir/aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
-    aishell_valid_uid=$dl_dir/aishell/data_aishell/transcript/aishell_valid_uid
-    find $dl_dir/aishell/data_aishell/wav/dev -name "*.wav" | sed 's/\.wav//g' | awk -F '/' '{print $NF}' > $aishell_valid_uid
-    awk 'NR==FNR{uid[$1]=$1} NR!=FNR{if($1 in uid) print $0}' $aishell_valid_uid $aishell_text |
-      cut -d " " -f 2- > $dl_dir/lm/aishell-valid-word.txt
-  fi
-  ./local/prepare_char_lm_training_data.py \
-    --lang-char data/lang_char \
-    --lm-data $dl_dir/lm/aishell-valid-word.txt \
-    --lm-archive $out_dir/lm_data_valid.pt
-  # test words
-  if [ ! -f $dl_dir/lm/aishell-test-word.txt ]; then
-    aishell_text=$dl_dir/aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
-    aishell_test_uid=$dl_dir/aishell/data_aishell/transcript/aishell_test_uid
-    find $dl_dir/aishell/data_aishell/wav/test -name "*.wav" | sed 's/\.wav//g' | awk -F '/' '{print $NF}' > $aishell_test_uid
-    awk 'NR==FNR{uid[$1]=$1} NR!=FNR{if($1 in uid) print $0}' $aishell_test_uid $aishell_text |
-      cut -d " " -f 2- > $dl_dir/lm/aishell-test-word.txt
-  fi
-  ./local/prepare_char_lm_training_data.py \
-    --lang-char data/lang_char \
-    --lm-data $dl_dir/lm/aishell-test-word.txt \
-    --lm-archive $out_dir/lm_data_test.pt
-fi
+# if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
+#   log "Stage 10: Generate LM training data"
+#   log "Processing char based data"
+#   out_dir=data/lm_training_char
+#   mkdir -p $out_dir $dl_dir/lm
+#   if [ ! -f $dl_dir/lm/aishell-train-word.txt ]; then
+#     cp $lang_phone_dir/transcript_words.txt $dl_dir/lm/aishell-train-word.txt
+#   fi
+#   # training words
+#   ./local/prepare_char_lm_training_data.py \
+#     --lang-char data/lang_char \
+#     --lm-data $dl_dir/lm/aishell-train-word.txt \
+#     --lm-archive $out_dir/lm_data.pt
+#   # valid words
+#   if [ ! -f $dl_dir/lm/aishell-valid-word.txt ]; then
+#     aishell_text=$dl_dir/aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
+#     aishell_valid_uid=$dl_dir/aishell/data_aishell/transcript/aishell_valid_uid
+#     find $dl_dir/aishell/data_aishell/wav/dev -name "*.wav" | sed 's/\.wav//g' | awk -F '/' '{print $NF}' > $aishell_valid_uid
+#     awk 'NR==FNR{uid[$1]=$1} NR!=FNR{if($1 in uid) print $0}' $aishell_valid_uid $aishell_text |
+#       cut -d " " -f 2- > $dl_dir/lm/aishell-valid-word.txt
+#   fi
+#   ./local/prepare_char_lm_training_data.py \
+#     --lang-char data/lang_char \
+#     --lm-data $dl_dir/lm/aishell-valid-word.txt \
+#     --lm-archive $out_dir/lm_data_valid.pt
+#   # test words
+#   if [ ! -f $dl_dir/lm/aishell-test-word.txt ]; then
+#     aishell_text=$dl_dir/aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
+#     aishell_test_uid=$dl_dir/aishell/data_aishell/transcript/aishell_test_uid
+#     find $dl_dir/aishell/data_aishell/wav/test -name "*.wav" | sed 's/\.wav//g' | awk -F '/' '{print $NF}' > $aishell_test_uid
+#     awk 'NR==FNR{uid[$1]=$1} NR!=FNR{if($1 in uid) print $0}' $aishell_test_uid $aishell_text |
+#       cut -d " " -f 2- > $dl_dir/lm/aishell-test-word.txt
+#   fi
+#   ./local/prepare_char_lm_training_data.py \
+#     --lang-char data/lang_char \
+#     --lm-data $dl_dir/lm/aishell-test-word.txt \
+#     --lm-archive $out_dir/lm_data_test.pt
+# fi

-if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
-  log "Stage 11: Sort LM training data"
-  # Sort LM training data by sentence length in descending order
-  # for ease of training.
-  #
-  # Sentence length equals to the number of tokens
-  # in a sentence.
-  out_dir=data/lm_training_char
-  mkdir -p $out_dir
-  ln -snf ../../../librispeech/ASR/local/sort_lm_training_data.py local/
-  ./local/sort_lm_training_data.py \
-    --in-lm-data $out_dir/lm_data.pt \
-    --out-lm-data $out_dir/sorted_lm_data.pt \
-    --out-statistics $out_dir/statistics.txt
-  ./local/sort_lm_training_data.py \
-    --in-lm-data $out_dir/lm_data_valid.pt \
-    --out-lm-data $out_dir/sorted_lm_data-valid.pt \
-    --out-statistics $out_dir/statistics-valid.txt
-  ./local/sort_lm_training_data.py \
-    --in-lm-data $out_dir/lm_data_test.pt \
-    --out-lm-data $out_dir/sorted_lm_data-test.pt \
-    --out-statistics $out_dir/statistics-test.txt
-fi
+# if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
+#   log "Stage 11: Sort LM training data"
+#   # Sort LM training data by sentence length in descending order
+#   # for ease of training.
+#   #
+#   # Sentence length equals to the number of tokens
+#   # in a sentence.
+#   out_dir=data/lm_training_char
+#   mkdir -p $out_dir
+#   ln -snf ../../../librispeech/ASR/local/sort_lm_training_data.py local/
+#   ./local/sort_lm_training_data.py \
+#     --in-lm-data $out_dir/lm_data.pt \
+#     --out-lm-data $out_dir/sorted_lm_data.pt \
+#     --out-statistics $out_dir/statistics.txt
+#   ./local/sort_lm_training_data.py \
+#     --in-lm-data $out_dir/lm_data_valid.pt \
+#     --out-lm-data $out_dir/sorted_lm_data-valid.pt \
+#     --out-statistics $out_dir/statistics-valid.txt
+#   ./local/sort_lm_training_data.py \
+#     --in-lm-data $out_dir/lm_data_test.pt \
+#     --out-lm-data $out_dir/sorted_lm_data-test.pt \
+#     --out-statistics $out_dir/statistics-test.txt
+# fi

-if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
-  log "Stage 11: Train RNN LM model"
-  python ../../../icefall/rnn_lm/train.py \
-    --start-epoch 0 \
-    --world-size 1 \
-    --num-epochs 20 \
-    --use-fp16 0 \
-    --embedding-dim 512 \
-    --hidden-dim 512 \
-    --num-layers 2 \
-    --batch-size 400 \
-    --exp-dir rnnlm_char/exp \
-    --lm-data $out_dir/sorted_lm_data.pt \
-    --lm-data-valid $out_dir/sorted_lm_data-valid.pt \
-    --vocab-size 4336 \
-    --master-port 12345
-fi
+# if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
+#   log "Stage 11: Train RNN LM model"
+#   python ../../../icefall/rnn_lm/train.py \
+#     --start-epoch 0 \
+#     --world-size 1 \
+#     --num-epochs 20 \
+#     --use-fp16 0 \
+#     --embedding-dim 512 \
+#     --hidden-dim 512 \
+#     --num-layers 2 \
+#     --batch-size 400 \
+#     --exp-dir rnnlm_char/exp \
+#     --lm-data $out_dir/sorted_lm_data.pt \
+#     --lm-data-valid $out_dir/sorted_lm_data-valid.pt \
+#     --vocab-size 4336 \
+#     --master-port 12345
+# fi
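
Note: with the musan, lexicon, and LM stages commented out, the active path of this script is stages 0-3 plus the new stage 300, which recomputes the aishell features with 128 mel bins for large-v3. Assuming the usual icefall option parsing, that stage runs as ./prepare.sh --stage 300 --stop-stage 300.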

View File

@@ -176,7 +176,7 @@ class AishellAsrDataModule:
         group.add_argument(
             "--enable-musan",
             type=str2bool,
-            default=True,
+            default=False,
             help="When enabled, select noise from MUSAN and mix it"
             "with training dataset. ",
         )
@@ -192,11 +192,11 @@ class AishellAsrDataModule:
             The state dict for the training sampler.
         """
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz")

         transforms = []
         if self.args.enable_musan:
             logging.info("Enable MUSAN")
+            cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz")
             transforms.append(
                 CutMix(cuts=cuts_musan, p=0.5, snr=(10, 20), preserve_id=True)
             )
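
Note: besides flipping the default to False, the load_manifest call moves inside the enable_musan branch, so musan_cuts.jsonl.gz is only read when MUSAN mixing is actually requested; this matches the musan download, manifest, and fbank stages being commented out of prepare.sh above.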

View File

@@ -127,6 +127,15 @@ def get_parser():
         help="The experiment dir",
     )

+    parser.add_argument(
+        "--model-name",
+        type=str,
+        default="large-v2",
+        choices=["large-v2", "large-v3", "medium", "small", "tiny"],
+        help="""The model name to use.
+        """,
+    )
+
     return parser
@@ -370,7 +379,7 @@ def main():
     logging.info(f"device: {device}")

-    model = whisper.load_model("medium")
+    model = whisper.load_model(params.model_name)
     if params.epoch > 0:
         if params.avg > 1:
             start = params.epoch - params.avg
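
Note: a sketch of what the new flag enables, assuming openai-whisper 20231117 as pinned in requirements.txt (large-v3 resolves through the _MODELS entry added below and is downloaded on first use):

    import whisper

    model = whisper.load_model("large-v3")
    print(model.dims.n_mels)   # 128 mel bins (80 for large-v2 and earlier)
    print(model.dims.n_vocab)  # 51866: one more language token than large-v2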

View File

@@ -2,10 +2,10 @@
   "fp16": {
     "enabled": true,
     "loss_scale": 0,
-    "loss_scale_window": 1000,
+    "loss_scale_window": 100,
     "initial_scale_power": 16,
     "hysteresis": 2,
-    "min_loss_scale": 1
+    "min_loss_scale": 0.01
   },
   "zero_optimization": {
     "stage": 1,
@@ -19,8 +19,8 @@
   "scheduler": {
     "type": "WarmupLR",
     "params": {
-      "warmup_min_lr": 5e-6,
-      "warmup_max_lr": 1e-5,
+      "warmup_min_lr": 1e-6,
+      "warmup_max_lr": 5e-6,
       "warmup_num_steps": 100
     }
   },
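
Note: a plausible reading, not stated in the commit: the shorter loss-scale window, the much lower min_loss_scale floor, and the reduced warmup learning rates are all aimed at keeping fp16 fine-tuning of a model as large as large-v3 from running into loss-scale overflows.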

View File

@@ -2,6 +2,8 @@ import torch
 import torch.nn as nn
 import base64
 import gzip
+import warnings
+from tqdm import tqdm
 from dataclasses import dataclass
 from typing import Dict, Iterable, Optional, Union
 import os
@@ -275,6 +277,11 @@ class Whisper(nn.Module):
     @property
     def is_multilingual(self):
-        return self.dims.n_vocab == 51865
+        return self.dims.n_vocab >= 51865
+
+    @property
+    def num_languages(self):
+        return self.dims.n_vocab - 51765 - int(self.is_multilingual)

     def install_kv_cache_hooks(self, cache: Optional[dict] = None):
         """
@@ -324,6 +331,7 @@ _MODELS = {
     "medium": "https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt",
     "large-v1": "https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large-v1.pt",
     "large-v2": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
+    "large-v3": "https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt",
     "large": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
 }
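
Note: the num_languages arithmetic treats 51765 as the size of the vocabulary before the language tokens, so the released checkpoints work out as below; large-v3 adds Cantonese as the 100th language. A worked check:

    # Worked check of num_languages against the released vocab sizes.
    for name, n_vocab in [("large-v2", 51865), ("large-v3", 51866)]:
        is_multilingual = n_vocab >= 51865
        num_languages = n_vocab - 51765 - int(is_multilingual)
        print(name, num_languages)  # large-v2 -> 99, large-v3 -> 100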

View File

@@ -1,11 +1,11 @@
 k2
 kaldialign
-lhotse==1.18
-#git+https://github.com/lhotse-speech/lhotse
+#lhotse==1.18
+git+https://github.com/lhotse-speech/lhotse
 sentencepiece
 tensorboard
 librosa
-openai-whisper
+openai-whisper==20231117
 zhconv
 WeTextProcessing
 deepspeed
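
Note: the pins line up with the rest of the commit: large-v3 and the num_languages tokenizer argument first shipped in openai-whisper's 20231106 release (20231117 is the follow-up), and lhotse switches from the 1.18 release to git, presumably for the configurable num_filters used by the 128-bin WhisperFbank in stage 300.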

View File

@@ -796,7 +796,7 @@ def run(rank, world_size, args):
     logging.info(f"Number of model parameters: {num_param}")

     tokenizer = whisper.tokenizer.get_tokenizer(
-        model.is_multilingual, language="zh", task="transcribe"
+        model.is_multilingual, num_languages=model.num_languages, language="zh", task="transcribe"
     )

     assert params.save_every_n >= params.average_period
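
Note: passing the model's own language count matters because get_tokenizer defaults num_languages to 99; with large-v3's 100 languages, the language-token bookkeeping would otherwise be off by one. A sketch under the pinned openai-whisper version:

    import whisper

    model = whisper.load_model("large-v3")
    tokenizer = whisper.tokenizer.get_tokenizer(
        model.is_multilingual,
        num_languages=model.num_languages,  # 100 for large-v3; default is 99
        language="zh",
        task="transcribe",
    )
    print(tokenizer.sot_sequence)  # start-of-transcript prompt tokens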