Mirror of https://github.com/k2-fsa/icefall.git (synced 2025-08-26 18:24:18 +00:00)

Commit 32a7d2222d (parent 266e840475): minor updates to the scripts
@@ -236,7 +236,7 @@ class TransformerDecoder(nn.Module):
         causal_mask = subsequent_mask(x.shape[0], device=x.device)  # (seq_len, seq_len)
         attn_mask = torch.logical_or(
             padding_mask.unsqueeze(1),  # (batch, 1, seq_len)
-            torch.logical_not(causal_mask).unsqueeze(0)  # (1, seq_len, seq_len)
+            torch.logical_not(causal_mask).unsqueeze(0),  # (1, seq_len, seq_len)
         )  # (batch, seq_len, seq_len)

         if memory is not None:
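The hunk above only adds a trailing comma, but the mask logic it touches is worth a note: the padding mask (batch, 1, seq_len) and the inverted causal mask (1, seq_len, seq_len) broadcast into one (batch, seq_len, seq_len) boolean mask in which True marks positions that must not be attended to. A minimal sketch, not part of the commit, with made-up lengths:

# Sketch only: how the padding mask and causal mask broadcast together.
import torch

batch, seq_len = 2, 4
lengths = torch.tensor([4, 2])  # hypothetical per-utterance lengths

# (batch, seq_len): True at padded frames
padding_mask = torch.arange(seq_len).unsqueeze(0) >= lengths.unsqueeze(1)

# (seq_len, seq_len): True at allowed (non-future) positions
causal_mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))

attn_mask = torch.logical_or(
    padding_mask.unsqueeze(1),                    # (batch, 1, seq_len)
    torch.logical_not(causal_mask).unsqueeze(0),  # (1, seq_len, seq_len)
)  # (batch, seq_len, seq_len)

print(attn_mask.shape)  # torch.Size([2, 4, 4])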
@@ -367,7 +367,9 @@ class MultiHeadAttention(nn.Module):
         self.num_heads = num_heads
         self.head_dim = attention_dim // num_heads
         assert self.head_dim * num_heads == attention_dim, (
-            self.head_dim, num_heads, attention_dim
+            self.head_dim,
+            num_heads,
+            attention_dim,
         )
         self.dropout = dropout
         self.name = None  # will be overwritten in training code; for diagnostics.
@@ -437,15 +439,19 @@ class MultiHeadAttention(nn.Module):
         if key_padding_mask is not None:
             assert key_padding_mask.shape == (batch, src_len), key_padding_mask.shape
             attn_weights = attn_weights.masked_fill(
-                key_padding_mask.unsqueeze(1).unsqueeze(2), float("-inf"),
+                key_padding_mask.unsqueeze(1).unsqueeze(2),
+                float("-inf"),
             )

         if attn_mask is not None:
-            assert (
-                attn_mask.shape == (batch, 1, src_len)
-                or attn_mask.shape == (batch, tgt_len, src_len)
+            assert attn_mask.shape == (batch, 1, src_len) or attn_mask.shape == (
+                batch,
+                tgt_len,
+                src_len,
             ), attn_mask.shape
-            attn_weights = attn_weights.masked_fill(attn_mask.unsqueeze(1), float("-inf"))
+            attn_weights = attn_weights.masked_fill(
+                attn_mask.unsqueeze(1), float("-inf")
+            )

         attn_weights = attn_weights.view(batch * num_heads, tgt_len, src_len)
         attn_weights = nn.functional.softmax(attn_weights, dim=-1)
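Both masking branches above fill masked positions with float("-inf") before the softmax, so those positions end up with exactly zero attention weight. A tiny self-contained sketch with made-up scores (not icefall code):

# Why -inf before softmax: the masked slot gets zero probability.
import torch

scores = torch.tensor([[2.0, 1.0, 0.5]])
mask = torch.tensor([[False, False, True]])  # last position is padding

weights = torch.softmax(scores.masked_fill(mask, float("-inf")), dim=-1)
print(weights)  # approx tensor([[0.7311, 0.2689, 0.0000]])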
@@ -456,7 +462,11 @@ class MultiHeadAttention(nn.Module):

         # (batch * head, tgt_len, head_dim)
         attn_output = torch.bmm(attn_weights, v)
-        assert attn_output.shape == (batch * num_heads, tgt_len, head_dim), attn_output.shape
+        assert attn_output.shape == (
+            batch * num_heads,
+            tgt_len,
+            head_dim,
+        ), attn_output.shape

         attn_output = attn_output.transpose(0, 1).contiguous()
         attn_output = attn_output.view(tgt_len, batch, num_heads * head_dim)
@@ -487,6 +487,7 @@ def export_encoder_model_onnx(

     add_meta_data(filename=encoder_filename, meta_data=meta_data)

+
 def export_decoder_model_onnx(
     decoder_model: OnnxDecoder,
     decoder_filename: str,
@@ -754,24 +755,25 @@ def main():
     )
     logging.info(f"Exported joiner to {joiner_filename}")

-    if(params.fp16) :
+    if params.fp16:
         from onnxconverter_common import float16
+
         logging.info("Generate fp16 models")

         encoder = onnx.load(encoder_filename)
         encoder_fp16 = float16.convert_float_to_float16(encoder, keep_io_types=True)
         encoder_filename_fp16 = params.exp_dir / f"encoder-{suffix}.fp16.onnx"
-        onnx.save(encoder_fp16,encoder_filename_fp16)
+        onnx.save(encoder_fp16, encoder_filename_fp16)

         decoder = onnx.load(decoder_filename)
         decoder_fp16 = float16.convert_float_to_float16(decoder, keep_io_types=True)
         decoder_filename_fp16 = params.exp_dir / f"decoder-{suffix}.fp16.onnx"
-        onnx.save(decoder_fp16,decoder_filename_fp16)
+        onnx.save(decoder_fp16, decoder_filename_fp16)

         joiner = onnx.load(joiner_filename)
         joiner_fp16 = float16.convert_float_to_float16(joiner, keep_io_types=True)
         joiner_filename_fp16 = params.exp_dir / f"joiner-{suffix}.fp16.onnx"
-        onnx.save(joiner_fp16,joiner_filename_fp16)
+        onnx.save(joiner_fp16, joiner_filename_fp16)

         # Generate int8 quantization models
         # See https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html#data-type-selection
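The fp16 branch above converts each exported ONNX graph (encoder, decoder, joiner) with onnxconverter_common. A stripped-down sketch of the same conversion, using a hypothetical model path:

# Minimal sketch of the fp16 conversion; "model.onnx" is a hypothetical path.
# keep_io_types=True keeps the model's inputs/outputs in float32.
import onnx
from onnxconverter_common import float16

model = onnx.load("model.onnx")
model_fp16 = float16.convert_float_to_float16(model, keep_io_types=True)
onnx.save(model_fp16, "model.fp16.onnx")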
@@ -592,23 +592,23 @@ def main():
     )
     logging.info(f"Exported joiner to {joiner_filename}")

-    if(params.fp16) :
+    if params.fp16:
         logging.info("Generate fp16 models")

         encoder = onnx.load(encoder_filename)
         encoder_fp16 = float16.convert_float_to_float16(encoder, keep_io_types=True)
         encoder_filename_fp16 = params.exp_dir / f"encoder-{suffix}.fp16.onnx"
-        onnx.save(encoder_fp16,encoder_filename_fp16)
+        onnx.save(encoder_fp16, encoder_filename_fp16)

         decoder = onnx.load(decoder_filename)
         decoder_fp16 = float16.convert_float_to_float16(decoder, keep_io_types=True)
         decoder_filename_fp16 = params.exp_dir / f"decoder-{suffix}.fp16.onnx"
-        onnx.save(decoder_fp16,decoder_filename_fp16)
+        onnx.save(decoder_fp16, decoder_filename_fp16)

         joiner = onnx.load(joiner_filename)
         joiner_fp16 = float16.convert_float_to_float16(joiner, keep_io_types=True)
         joiner_filename_fp16 = params.exp_dir / f"joiner-{suffix}.fp16.onnx"
-        onnx.save(joiner_fp16,joiner_filename_fp16)
+        onnx.save(joiner_fp16, joiner_filename_fp16)

         # Generate int8 quantization models
         # See https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html#data-type-selection
@@ -124,7 +124,7 @@ def compute_fbank_libritts(
                 supervisions=m["supervisions"],
             )
             if sampling_rate != 24000:
-                logging.info(f"Resampling audio to {sampling_rate}")
+                logging.info(f"Resampling audio to {sampling_rate}Hz")
                 cut_set = cut_set.resample(sampling_rate)
             if "train" in partition:
                 if perturb_speed:
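The change above only touches the log message; the resampling itself goes through lhotse's CutSet.resample. Roughly, and not as a copy of compute_fbank_libritts.py (the manifest path below is hypothetical):

# Hedged sketch: resampling a lhotse CutSet before feature extraction.
from lhotse import CutSet

cuts = CutSet.from_file("data/manifests/libritts_cuts_train-clean-100.jsonl.gz")
cuts = cuts.resample(16000)  # resample to 16 kHz, matching sampling_rate=16000 in prepare.sh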
egs/libritts/ASR/local/convert_transcript_words_to_tokens.py (new symbolic link)
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/convert_transcript_words_to_tokens.py

egs/libritts/ASR/local/download_lm.py (new symbolic link)
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/download_lm.py

egs/libritts/ASR/local/norm_text.py (new symbolic link)
@@ -0,0 +1 @@
+../../../libriheavy/ASR/local/norm_text.py

egs/libritts/ASR/local/prepare_lang.py (new symbolic link)
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/prepare_lang.py

egs/libritts/ASR/local/prepare_lang_bpe.py (new symbolic link)
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/prepare_lang_bpe.py

egs/libritts/ASR/local/prepare_lang_fst.py (new symbolic link)
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/prepare_lang_fst.py

egs/libritts/ASR/local/prepare_lm_training_data.py (new symbolic link)
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/prepare_lm_training_data.py

egs/libritts/ASR/local/train_bpe_model.py (new symbolic link)
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/train_bpe_model.py

egs/libritts/ASR/local/validate_bpe_lexicon.py (new symbolic link)
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/validate_bpe_lexicon.py
@@ -7,9 +7,15 @@ set -eou pipefail

 stage=0
 stop_stage=100
-sampling_rate=24000
+sampling_rate=16000
 nj=32
 perturb_speed=true
+vocab_sizes=(
+  # 5000
+  # 2000
+  # 1000
+  500
+)

 dl_dir=$PWD/download
@@ -27,6 +33,15 @@ log() {

 log "dl_dir: $dl_dir"

+if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
+  log "Stage -1: Download LM"  # we directly use the librispeech lm here
+  mkdir -p $dl_dir/lm
+  if [ ! -e $dl_dir/lm/.done ]; then
+    ./local/download_lm.py --out-dir=$dl_dir/lm
+    touch $dl_dir/lm/.done
+  fi
+fi
+
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   log "Stage 0: Download data"

@@ -107,3 +122,73 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
     touch data/fbank/.msuan.done
   fi
 fi
+
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+  log "Stage 5: Train BPE model for normalized text"
+
+  if [ ! -f data/texts ]; then
+    gunzip -c data/manifests/libritts_supervisions_train-clean-100.jsonl.gz \
+      | jq ".text" | sed 's/"//g' \
+      | ./local/norm_text.py > data/texts
+
+    gunzip -c data/manifests/libritts_supervisions_train-clean-360.jsonl.gz \
+      | jq ".text" | sed 's/"//g' \
+      | ./local/norm_text.py >> data/texts
+
+    gunzip -c data/manifests/libritts_supervisions_train-other-500.jsonl.gz \
+      | jq ".text" | sed 's/"//g' \
+      | ./local/norm_text.py >> data/texts
+  fi
+
+  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=data/lang_bpe_${vocab_size}
+    mkdir -p $lang_dir
+
+    cp data/texts $lang_dir/text
+
+    if [ ! -f $lang_dir/bpe.model ]; then
+      ./local/train_bpe_model.py \
+        --lang-dir $lang_dir \
+        --vocab-size $vocab_size \
+        --transcript $lang_dir/text
+    fi
+  done
+fi
+
+if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
+  log "Stage 6: Prepare phone based lang"
+  lang_dir=data/lang_phone
+  mkdir -p $lang_dir
+
+  if [ ! -f $dl_dir/lm/librispeech-lexicon.txt ]; then
+    log "No lexicon file in $dl_dir/lm, please run:"
+    log "prepare.sh --stage -1 --stop-stage -1"
+    exit -1
+  fi
+
+  if [ ! -f $lang_dir/lexicon.txt ]; then
+    (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
+      cat - $dl_dir/lm/librispeech-lexicon.txt |
+      sort | uniq > $lang_dir/lexicon.txt
+  fi
+
+  if [ ! -f $lang_dir/L_disambig.pt ]; then
+    ./local/prepare_lang.py --lang-dir $lang_dir
+  fi
+
+  if [ ! -f $lang_dir/L.fst ]; then
+    log "Converting L.pt to L.fst"
+    ./shared/convert-k2-to-openfst.py \
+      --olabels aux_labels \
+      $lang_dir/L.pt \
+      $lang_dir/L.fst
+  fi
+
+  if [ ! -f $lang_dir/L_disambig.fst ]; then
+    log "Converting L_disambig.pt to L_disambig.fst"
+    ./shared/convert-k2-to-openfst.py \
+      --olabels aux_labels \
+      $lang_dir/L_disambig.pt \
+      $lang_dir/L_disambig.fst
+  fi
+fi
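Stage 5 above feeds the normalized transcripts to ./local/train_bpe_model.py (symlinked from the librispeech recipe). Purely as an illustration of the idea, not a copy of that script, a subword model could be trained on $lang_dir/text with sentencepiece; the flags below are assumptions:

# Illustrative sketch only: training a subword model on the normalized text.
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input="data/lang_bpe_500/text",        # output of the jq/norm_text pipeline above
    model_prefix="data/lang_bpe_500/bpe",  # writes bpe.model and bpe.vocab
    vocab_size=500,
    character_coverage=1.0,
)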
egs/libritts/ASR/prepare_lm.sh (new executable file, 264 lines)
@@ -0,0 +1,264 @@
#!/usr/bin/env bash

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

set -eou pipefail

# This script generate Ngram LM / NNLM and related files that needed by decoding.

# We assume dl_dir (download dir) contains the following
# directories and files. If not, they will be downloaded
# by this script automatically.
#
#  - $dl_dir/lm
#      This directory contains the following files downloaded from
#      http://www.openslr.org/resources/11
#
#        - 3-gram.pruned.1e-7.arpa.gz
#        - 3-gram.pruned.1e-7.arpa
#        - 4-gram.arpa.gz
#        - 4-gram.arpa
#        - librispeech-vocab.txt
#        - librispeech-lexicon.txt
#        - librispeech-lm-norm.txt.gz
#

. prepare.sh --stage -1 --stop-stage 6 || exit 1

log "Running prepare_lm.sh"

stage=0
stop_stage=100

. shared/parse_options.sh || exit 1

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Prepare BPE based lexicon."

  for vocab_size in ${vocab_sizes[@]}; do
    lang_dir=data/lang_bpe_${vocab_size}
    # We reuse words.txt from phone based lexicon
    # so that the two can share G.pt later.
    cp data/lang_phone/words.txt $lang_dir

    if [ ! -f $lang_dir/L_disambig.pt ]; then
      ./local/prepare_lang_bpe.py --lang-dir $lang_dir

      log "Validating $lang_dir/lexicon.txt"
      ./local/validate_bpe_lexicon.py \
        --lexicon $lang_dir/lexicon.txt \
        --bpe-model $lang_dir/bpe.model
    fi

    if [ ! -f $lang_dir/L.fst ]; then
      log "Converting L.pt to L.fst"
      ./shared/convert-k2-to-openfst.py \
        --olabels aux_labels \
        $lang_dir/L.pt \
        $lang_dir/L.fst
    fi

    if [ ! -f $lang_dir/L_disambig.fst ]; then
      log "Converting L_disambig.pt to L_disambig.fst"
      ./shared/convert-k2-to-openfst.py \
        --olabels aux_labels \
        $lang_dir/L_disambig.pt \
        $lang_dir/L_disambig.fst
    fi
  done
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare word level G"
  # We assume you have installed kaldilm, if not, please install
  # it using: pip install kaldilm

  mkdir -p data/lm
  if [ ! -f data/lm/G_3_gram.fst.txt ]; then
    # It is used in building HLG
    python3 -m kaldilm \
      --read-symbol-table="data/lang_phone/words.txt" \
      --disambig-symbol='#0' \
      --max-order=3 \
      $dl_dir/lm/3-gram.pruned.1e-7.arpa > data/lm/G_3_gram.fst.txt
  fi

  if [ ! -f data/lm/G_4_gram.fst.txt ]; then
    # It is used for LM rescoring
    python3 -m kaldilm \
      --read-symbol-table="data/lang_phone/words.txt" \
      --disambig-symbol='#0' \
      --max-order=4 \
      $dl_dir/lm/4-gram.arpa > data/lm/G_4_gram.fst.txt
  fi

  for vocab_size in ${vocab_sizes[@]}; do
    lang_dir=data/lang_bpe_${vocab_size}

    if [ ! -f $lang_dir/HL.fst ]; then
      ./local/prepare_lang_fst.py \
        --lang-dir $lang_dir \
        --ngram-G ./data/lm/G_3_gram.fst.txt
    fi
  done
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Compile HLG"
  ./local/compile_hlg.py --lang-dir data/lang_phone

  # Note If ./local/compile_hlg.py throws OOM,
  # please switch to the following command
  #
  # ./local/compile_hlg_using_openfst.py --lang-dir data/lang_phone

  for vocab_size in ${vocab_sizes[@]}; do
    lang_dir=data/lang_bpe_${vocab_size}
    ./local/compile_hlg.py --lang-dir $lang_dir

    # Note If ./local/compile_hlg.py throws OOM,
    # please switch to the following command
    #
    # ./local/compile_hlg_using_openfst.py --lang-dir $lang_dir
  done
fi

# Compile LG for RNN-T fast_beam_search decoding
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Compile LG"
  ./local/compile_lg.py --lang-dir data/lang_phone

  for vocab_size in ${vocab_sizes[@]}; do
    lang_dir=data/lang_bpe_${vocab_size}
    ./local/compile_lg.py --lang-dir $lang_dir
  done
fi

if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Prepare token level ngram G"
  for vocab_size in ${vocab_sizes[@]}; do
    lang_dir=data/lang_bpe_${vocab_size}

    if [ ! -f $lang_dir/transcript_tokens.txt ]; then
      ./local/convert_transcript_words_to_tokens.py \
        --lexicon $lang_dir/lexicon.txt \
        --transcript $lang_dir/transcript_words.txt \
        --oov "<UNK>" \
        > $lang_dir/transcript_tokens.txt
    fi

    for ngram in 2 3 4 5; do
      if [ ! -f $lang_dir/${ngram}gram.arpa ]; then
        ./shared/make_kn_lm.py \
          -ngram-order ${ngram} \
          -text $lang_dir/transcript_tokens.txt \
          -lm $lang_dir/${ngram}gram.arpa
      fi

      if [ ! -f $lang_dir/${ngram}gram.fst.txt ]; then
        python3 -m kaldilm \
          --read-symbol-table="$lang_dir/tokens.txt" \
          --disambig-symbol='#0' \
          --max-order=${ngram} \
          $lang_dir/${ngram}gram.arpa > $lang_dir/${ngram}gram.fst.txt
      fi
    done
  done
fi

if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Generate NNLM training data"

  for vocab_size in ${vocab_sizes[@]}; do
    log "Processing vocab_size == ${vocab_size}"
    lang_dir=data/lang_bpe_${vocab_size}
    out_dir=data/lm_training_bpe_${vocab_size}
    mkdir -p $out_dir

    ./local/prepare_lm_training_data.py \
      --bpe-model $lang_dir/bpe.model \
      --lm-data $dl_dir/lm/librispeech-lm-norm.txt \
      --lm-archive $out_dir/lm_data.pt
  done
fi

if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  log "Stage 6: Generate NNLM validation data"

  for vocab_size in ${vocab_sizes[@]}; do
    log "Processing vocab_size == ${vocab_size}"
    out_dir=data/lm_training_bpe_${vocab_size}
    mkdir -p $out_dir

    if [ ! -f $out_dir/valid.txt ]; then
      gunzip -c data/manifests/libritts_supervisions_dev-clean.jsonl.gz \
        | jq ".text" | sed 's/"//g' \
        | ./local/norm_text.py > $out_dir/valid.txt

      gunzip -c data/manifests/libritts_supervisions_dev-other.jsonl.gz \
        | jq ".text" | sed 's/"//g' \
        | ./local/norm_text.py >> $out_dir/valid.txt
    fi

    lang_dir=data/lang_bpe_${vocab_size}
    ./local/prepare_lm_training_data.py \
      --bpe-model $lang_dir/bpe.model \
      --lm-data $out_dir/valid.txt \
      --lm-archive $out_dir/lm_data-valid.pt
  done
fi

if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
  log "Stage 7: Generate NNLM test data"

  for vocab_size in ${vocab_sizes[@]}; do
    log "Processing vocab_size == ${vocab_size}"
    out_dir=data/lm_training_bpe_${vocab_size}
    mkdir -p $out_dir

    if [ ! -f $out_dir/test.txt ]; then
      gunzip -c data/manifests/libritts_supervisions_test-clean.jsonl.gz \
        | jq ".text" | sed 's/"//g' \
        | ./local/norm_text.py > $out_dir/test.txt

      gunzip -c data/manifests/libritts_supervisions_test-other.jsonl.gz \
        | jq ".text" | sed 's/"//g' \
        | ./local/norm_text.py >> $out_dir/test.txt
    fi

    lang_dir=data/lang_bpe_${vocab_size}
    ./local/prepare_lm_training_data.py \
      --bpe-model $lang_dir/bpe.model \
      --lm-data $out_dir/test.txt \
      --lm-archive $out_dir/lm_data-test.pt
  done
fi

if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
  log "Stage 8: Sort NNLM training data"
  # Sort LM training data by sentence length in descending order
  # for ease of training.
  #
  # Sentence length equals to the number of BPE tokens
  # in a sentence.

  for vocab_size in ${vocab_sizes[@]}; do
    out_dir=data/lm_training_bpe_${vocab_size}
    mkdir -p $out_dir
    ./local/sort_lm_training_data.py \
      --in-lm-data $out_dir/lm_data.pt \
      --out-lm-data $out_dir/sorted_lm_data.pt \
      --out-statistics $out_dir/statistics.txt

    ./local/sort_lm_training_data.py \
      --in-lm-data $out_dir/lm_data-valid.pt \
      --out-lm-data $out_dir/sorted_lm_data-valid.pt \
      --out-statistics $out_dir/statistics-valid.txt

    ./local/sort_lm_training_data.py \
      --in-lm-data $out_dir/lm_data-test.pt \
      --out-lm-data $out_dir/sorted_lm_data-test.pt \
      --out-statistics $out_dir/statistics-test.txt
  done
fi
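Several stages of prepare_lm.sh (and prepare.sh) repeat the same shell pipeline: gunzip a supervisions manifest, pull the "text" field with jq, strip the quotes with sed, and normalize with ./local/norm_text.py. A rough Python equivalent for a single manifest, assuming the manifest is JSON Lines (the extra normalization done by norm_text.py is not reproduced here):

# Rough equivalent of: gunzip -c <manifest> | jq ".text" | sed 's/"//g'
import gzip
import json

path = "data/manifests/libritts_supervisions_dev-clean.jsonl.gz"
with gzip.open(path, "rt") as f:
    for line in f:
        sup = json.loads(line)
        print(sup["text"])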
@@ -1041,13 +1041,13 @@ def main():

     # we need cut ids to display recognition results.
     args.return_cuts = True
-    librispeech = LibriTTSAsrDataModule(args)
+    libritts = LibriTTSAsrDataModule(args)

-    test_clean_cuts = librispeech.test_clean_cuts()
-    test_other_cuts = librispeech.test_other_cuts()
+    test_clean_cuts = libritts.test_clean_cuts()
+    test_other_cuts = libritts.test_other_cuts()

-    test_clean_dl = librispeech.test_dataloaders(test_clean_cuts)
-    test_other_dl = librispeech.test_dataloaders(test_other_cuts)
+    test_clean_dl = libritts.test_dataloaders(test_clean_cuts)
+    test_other_dl = libritts.test_dataloaders(test_other_cuts)

     test_sets = ["test-clean", "test-other"]
     test_dl = [test_clean_dl, test_other_dl]
@@ -864,10 +864,10 @@ def main():
     num_param = sum([p.numel() for p in model.parameters()])
     logging.info(f"Number of model parameters: {num_param}")

-    librispeech = LibriTTSAsrDataModule(args)
+    libritts = LibriTTSAsrDataModule(args)

-    test_clean_cuts = librispeech.test_clean_cuts()
-    test_other_cuts = librispeech.test_other_cuts()
+    test_clean_cuts = libritts.test_clean_cuts()
+    test_other_cuts = libritts.test_other_cuts()

     test_sets = ["test-clean", "test-other"]
     test_cuts = [test_clean_cuts, test_other_cuts]
@@ -603,6 +603,15 @@ def _to_int_tuple(s: str):
     return tuple(map(int, s.split(",")))


+def remove_punc_to_upper(text: str) -> str:
+    text = text.replace("‘", "'")
+    text = text.replace("’", "'")
+    tokens = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'")
+    s_list = [x.upper() if x in tokens else " " for x in text]
+    s = " ".join("".join(s_list).split()).strip()
+    return s
+
+
 def get_encoder_embed(params: AttributeDict) -> nn.Module:
     # encoder_embed converts the input of shape (N, T, num_features)
     # to the shape (N, (T - 7) // 2, encoder_dims).
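A quick illustration (my own input string, not from the commit) of what the new remove_punc_to_upper helper produces: curly apostrophes are normalized, anything outside letters/digits/apostrophe becomes whitespace, and the result is upper-cased with collapsed spaces.

# Assuming remove_punc_to_upper defined above is importable:
text = "Don’t panic - it's only a test!"
print(remove_punc_to_upper(text))  # -> DON'T PANIC IT'S ONLY A TEST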
@@ -1284,21 +1293,26 @@ def run(rank, world_size, args):
     if params.inf_check:
         register_inf_check_hooks(model)

-    librispeech = LibriTTSAsrDataModule(args)
+    libritts = LibriTTSAsrDataModule(args)

     if params.full_libri:
-        train_cuts = librispeech.train_all_shuf_cuts()
+        train_cuts = libritts.train_all_shuf_cuts()

         # previously we used the following code to load all training cuts,
         # strictly speaking, shuffled training cuts should be used instead,
         # but we leave the code here to demonstrate that there is an option
         # like this to combine multiple cutsets

-        # train_cuts = librispeech.train_clean_100_cuts()
-        # train_cuts += librispeech.train_clean_360_cuts()
-        # train_cuts += librispeech.train_other_500_cuts()
+        # train_cuts = libritts.train_clean_100_cuts()
+        # train_cuts += libritts.train_clean_360_cuts()
+        # train_cuts += libritts.train_other_500_cuts()
     else:
-        train_cuts = librispeech.train_clean_100_cuts()
+        train_cuts = libritts.train_clean_100_cuts()

+    def normalize_text(c: Cut):
+        text = remove_punc_to_upper(c.supervisions[0].text)
+        c.supervisions[0].text = text
+        return c
+
     def remove_short_and_long_utt(c: Cut):
         # Keep only utterances with duration between 1 second and 20 seconds
@@ -1338,6 +1352,7 @@ def run(rank, world_size, args):
         return True

     train_cuts = train_cuts.filter(remove_short_and_long_utt)
+    train_cuts = train_cuts.map(normalize_text)

     if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
         # We only load the sampler's state dict when it loads a checkpoint
@@ -1346,13 +1361,13 @@ def run(rank, world_size, args):
     else:
         sampler_state_dict = None

-    train_dl = librispeech.train_dataloaders(
+    train_dl = libritts.train_dataloaders(
         train_cuts, sampler_state_dict=sampler_state_dict
     )

-    valid_cuts = librispeech.dev_clean_cuts()
-    valid_cuts += librispeech.dev_other_cuts()
-    valid_dl = librispeech.valid_dataloaders(valid_cuts)
+    valid_cuts = libritts.dev_clean_cuts()
+    valid_cuts += libritts.dev_other_cuts()
+    valid_dl = libritts.valid_dataloaders(valid_cuts)

     if not params.print_diagnostics:
         scan_pessimistic_batches_for_oom(
@@ -37,15 +37,6 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   if [ ! -d $dl_dir/LibriTTS ]; then
     lhotse download libritts $dl_dir
   fi
-
-  # If you have pre-downloaded it to /path/to/musan,
-  # you can create a symlink
-  #
-  #   ln -sfv /path/to/musan $dl_dir/musan
-  #
-  if [ ! -d $dl_dir/musan ]; then
-    lhotse download musan $dl_dir
-  fi
 fi

 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then