minor updates to the scripts

JinZr 2024-10-07 20:55:53 +08:00
parent 266e840475
commit 32a7d2222d
19 changed files with 422 additions and 46 deletions

View File

@@ -236,7 +236,7 @@ class TransformerDecoder(nn.Module):
         causal_mask = subsequent_mask(x.shape[0], device=x.device)  # (seq_len, seq_len)
         attn_mask = torch.logical_or(
             padding_mask.unsqueeze(1),  # (batch, 1, seq_len)
-            torch.logical_not(causal_mask).unsqueeze(0)  # (1, seq_len, seq_len)
+            torch.logical_not(causal_mask).unsqueeze(0),  # (1, seq_len, seq_len)
         )  # (batch, seq_len, seq_len)

         if memory is not None:
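A minimal standalone sketch (not part of the commit) of how the two masks above combine by broadcasting; torch.tril stands in for icefall's subsequent_mask, which likewise returns a lower-triangular "allowed" mask:

import torch

batch, seq_len = 2, 4
padding_mask = torch.zeros(batch, seq_len, dtype=torch.bool)
padding_mask[1, 3:] = True  # True marks padded positions in the second sequence

causal_mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))

attn_mask = torch.logical_or(
    padding_mask.unsqueeze(1),  # (batch, 1, seq_len)
    torch.logical_not(causal_mask).unsqueeze(0),  # (1, seq_len, seq_len)
)
print(attn_mask.shape)  # torch.Size([2, 4, 4]); True = position is masked out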
@@ -367,7 +367,9 @@ class MultiHeadAttention(nn.Module):
         self.num_heads = num_heads
         self.head_dim = attention_dim // num_heads
         assert self.head_dim * num_heads == attention_dim, (
-            self.head_dim, num_heads, attention_dim
+            self.head_dim,
+            num_heads,
+            attention_dim,
         )
         self.dropout = dropout
         self.name = None  # will be overwritten in training code; for diagnostics.
@@ -437,15 +439,19 @@ class MultiHeadAttention(nn.Module):
         if key_padding_mask is not None:
             assert key_padding_mask.shape == (batch, src_len), key_padding_mask.shape
             attn_weights = attn_weights.masked_fill(
-                key_padding_mask.unsqueeze(1).unsqueeze(2), float("-inf"),
+                key_padding_mask.unsqueeze(1).unsqueeze(2),
+                float("-inf"),
             )

         if attn_mask is not None:
-            assert (
-                attn_mask.shape == (batch, 1, src_len)
-                or attn_mask.shape == (batch, tgt_len, src_len)
+            assert attn_mask.shape == (batch, 1, src_len) or attn_mask.shape == (
+                batch,
+                tgt_len,
+                src_len,
             ), attn_mask.shape
-            attn_weights = attn_weights.masked_fill(attn_mask.unsqueeze(1), float("-inf"))
+            attn_weights = attn_weights.masked_fill(
+                attn_mask.unsqueeze(1), float("-inf")
+            )

         attn_weights = attn_weights.view(batch * num_heads, tgt_len, src_len)
         attn_weights = nn.functional.softmax(attn_weights, dim=-1)
@@ -456,7 +462,11 @@ class MultiHeadAttention(nn.Module):
         # (batch * head, tgt_len, head_dim)
         attn_output = torch.bmm(attn_weights, v)
-        assert attn_output.shape == (batch * num_heads, tgt_len, head_dim), attn_output.shape
+        assert attn_output.shape == (
+            batch * num_heads,
+            tgt_len,
+            head_dim,
+        ), attn_output.shape
         attn_output = attn_output.transpose(0, 1).contiguous()
         attn_output = attn_output.view(tgt_len, batch, num_heads * head_dim)
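For reference, a tiny self-contained check (not from the commit) of why masked_fill with -inf is used before the softmax above: masked positions receive exactly zero attention weight.

import torch

scores = torch.tensor([[1.0, 2.0, 3.0]])
mask = torch.tensor([[False, False, True]])  # True = masked
weights = torch.softmax(scores.masked_fill(mask, float("-inf")), dim=-1)
print(weights)  # tensor([[0.2689, 0.7311, 0.0000]])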

View File

@@ -487,6 +487,7 @@ def export_encoder_model_onnx(
     add_meta_data(filename=encoder_filename, meta_data=meta_data)


 def export_decoder_model_onnx(
     decoder_model: OnnxDecoder,
     decoder_filename: str,
@@ -754,30 +755,31 @@ def main():
     )
     logging.info(f"Exported joiner to {joiner_filename}")

-    if(params.fp16) :
+    if params.fp16:
         from onnxconverter_common import float16

         logging.info("Generate fp16 models")

         encoder = onnx.load(encoder_filename)
         encoder_fp16 = float16.convert_float_to_float16(encoder, keep_io_types=True)
         encoder_filename_fp16 = params.exp_dir / f"encoder-{suffix}.fp16.onnx"
-        onnx.save(encoder_fp16,encoder_filename_fp16)
+        onnx.save(encoder_fp16, encoder_filename_fp16)

         decoder = onnx.load(decoder_filename)
         decoder_fp16 = float16.convert_float_to_float16(decoder, keep_io_types=True)
         decoder_filename_fp16 = params.exp_dir / f"decoder-{suffix}.fp16.onnx"
-        onnx.save(decoder_fp16,decoder_filename_fp16)
+        onnx.save(decoder_fp16, decoder_filename_fp16)

         joiner = onnx.load(joiner_filename)
         joiner_fp16 = float16.convert_float_to_float16(joiner, keep_io_types=True)
         joiner_filename_fp16 = params.exp_dir / f"joiner-{suffix}.fp16.onnx"
-        onnx.save(joiner_fp16,joiner_filename_fp16)
+        onnx.save(joiner_fp16, joiner_filename_fp16)

     # Generate int8 quantization models
     # See https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html#data-type-selection

     logging.info("Generate int8 quantization models")

     encoder_filename_int8 = params.exp_dir / f"encoder-{suffix}.int8.onnx"
     quantize_dynamic(
         model_input=encoder_filename,
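For context, the two post-export conversions used above in one self-contained sketch; the paths are placeholders, and QInt8 weights are an assumption mirroring typical icefall exports:

import onnx
from onnxconverter_common import float16
from onnxruntime.quantization import QuantType, quantize_dynamic

# fp16: halves weight storage; keep_io_types retains fp32 inputs/outputs
model = onnx.load("encoder.onnx")  # placeholder path
model_fp16 = float16.convert_float_to_float16(model, keep_io_types=True)
onnx.save(model_fp16, "encoder.fp16.onnx")

# int8: dynamic quantization of weights, per the onnxruntime docs linked above
quantize_dynamic(
    model_input="encoder.onnx",
    model_output="encoder.int8.onnx",
    weight_type=QuantType.QInt8,
)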

View File

@@ -592,23 +592,23 @@ def main():
     )
     logging.info(f"Exported joiner to {joiner_filename}")

-    if(params.fp16) :
+    if params.fp16:
         logging.info("Generate fp16 models")

         encoder = onnx.load(encoder_filename)
         encoder_fp16 = float16.convert_float_to_float16(encoder, keep_io_types=True)
         encoder_filename_fp16 = params.exp_dir / f"encoder-{suffix}.fp16.onnx"
-        onnx.save(encoder_fp16,encoder_filename_fp16)
+        onnx.save(encoder_fp16, encoder_filename_fp16)

         decoder = onnx.load(decoder_filename)
         decoder_fp16 = float16.convert_float_to_float16(decoder, keep_io_types=True)
         decoder_filename_fp16 = params.exp_dir / f"decoder-{suffix}.fp16.onnx"
-        onnx.save(decoder_fp16,decoder_filename_fp16)
+        onnx.save(decoder_fp16, decoder_filename_fp16)

         joiner = onnx.load(joiner_filename)
         joiner_fp16 = float16.convert_float_to_float16(joiner, keep_io_types=True)
         joiner_filename_fp16 = params.exp_dir / f"joiner-{suffix}.fp16.onnx"
-        onnx.save(joiner_fp16,joiner_filename_fp16)
+        onnx.save(joiner_fp16, joiner_filename_fp16)

     # Generate int8 quantization models
     # See https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html#data-type-selection

View File

@@ -124,7 +124,7 @@ def compute_fbank_libritts(
                 supervisions=m["supervisions"],
             )

             if sampling_rate != 24000:
-                logging.info(f"Resampling audio to {sampling_rate}")
+                logging.info(f"Resampling audio to {sampling_rate}Hz")
                 cut_set = cut_set.resample(sampling_rate)

             if "train" in partition:
                 if perturb_speed:
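For illustration (manifest path hypothetical): LibriTTS audio is natively 24 kHz, and lhotse's CutSet.resample returns a new CutSet whose audio is converted lazily when it is actually loaded, so the call above is cheap at manifest time.

from lhotse import CutSet

sampling_rate = 16000
cut_set = CutSet.from_file("data/manifests/libritts_cuts_train-clean-100.jsonl.gz")
if sampling_rate != 24000:
    cut_set = cut_set.resample(sampling_rate)  # lazy; applied on audio load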

View File

@@ -0,0 +1 @@
../../../librispeech/ASR/local/convert_transcript_words_to_tokens.py

View File

@@ -0,0 +1 @@
../../../librispeech/ASR/local/download_lm.py

View File

@@ -0,0 +1 @@
../../../libriheavy/ASR/local/norm_text.py

View File

@@ -0,0 +1 @@
../../../librispeech/ASR/local/prepare_lang.py

View File

@@ -0,0 +1 @@
../../../librispeech/ASR/local/prepare_lang_bpe.py

View File

@@ -0,0 +1 @@
../../../librispeech/ASR/local/prepare_lang_fst.py

View File

@@ -0,0 +1 @@
../../../librispeech/ASR/local/prepare_lm_training_data.py

View File

@@ -0,0 +1 @@
../../../librispeech/ASR/local/train_bpe_model.py

View File

@@ -0,0 +1 @@
../../../librispeech/ASR/local/validate_bpe_lexicon.py

View File

@@ -7,9 +7,15 @@ set -eou pipefail

 stage=0
 stop_stage=100
-sampling_rate=24000
+sampling_rate=16000
 nj=32
 perturb_speed=true
+vocab_sizes=(
+  # 5000
+  # 2000
+  # 1000
+  500
+)

 dl_dir=$PWD/download
@@ -27,6 +33,15 @@ log() {

 log "dl_dir: $dl_dir"

+if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
+  log "Stage -1: Download LM"  # we directly use the librispeech lm here
+  mkdir -p $dl_dir/lm
+  if [ ! -e $dl_dir/lm/.done ]; then
+    ./local/download_lm.py --out-dir=$dl_dir/lm
+    touch $dl_dir/lm/.done
+  fi
+fi
+
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   log "Stage 0: Download data"
@@ -107,3 +122,73 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
     touch data/fbank/.msuan.done
   fi
 fi
+
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+  log "Stage 5: Train BPE model for normalized text"
+
+  if [ ! -f data/texts ]; then
+    gunzip -c data/manifests/libritts_supervisions_train-clean-100.jsonl.gz \
+      | jq ".text" | sed 's/"//g' \
+      | ./local/norm_text.py > data/texts
+    gunzip -c data/manifests/libritts_supervisions_train-clean-360.jsonl.gz \
+      | jq ".text" | sed 's/"//g' \
+      | ./local/norm_text.py >> data/texts
+    gunzip -c data/manifests/libritts_supervisions_train-other-500.jsonl.gz \
+      | jq ".text" | sed 's/"//g' \
+      | ./local/norm_text.py >> data/texts
+  fi
+
+  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=data/lang_bpe_${vocab_size}
+    mkdir -p $lang_dir
+
+    cp data/texts $lang_dir/text
+
+    if [ ! -f $lang_dir/bpe.model ]; then
+      ./local/train_bpe_model.py \
+        --lang-dir $lang_dir \
+        --vocab-size $vocab_size \
+        --transcript $lang_dir/text
+    fi
+  done
+fi
+
+if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
+  log "Stage 6: Prepare phone based lang"
+  lang_dir=data/lang_phone
+  mkdir -p $lang_dir
+
+  if [ ! -f $dl_dir/lm/librispeech-lexicon.txt ]; then
+    log "No lexicon file in $dl_dir/lm, please run:"
+    log "prepare.sh --stage -1 --stop-stage -1"
+    exit 1
+  fi
+
+  if [ ! -f $lang_dir/lexicon.txt ]; then
+    (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
+      cat - $dl_dir/lm/librispeech-lexicon.txt |
+      sort | uniq > $lang_dir/lexicon.txt
+  fi
+
+  if [ ! -f $lang_dir/L_disambig.pt ]; then
+    ./local/prepare_lang.py --lang-dir $lang_dir
+  fi
+
+  if [ ! -f $lang_dir/L.fst ]; then
+    log "Converting L.pt to L.fst"
+    ./shared/convert-k2-to-openfst.py \
+      --olabels aux_labels \
+      $lang_dir/L.pt \
+      $lang_dir/L.fst
+  fi
+
+  if [ ! -f $lang_dir/L_disambig.fst ]; then
+    log "Converting L_disambig.pt to L_disambig.fst"
+    ./shared/convert-k2-to-openfst.py \
+      --olabels aux_labels \
+      $lang_dir/L_disambig.pt \
+      $lang_dir/L_disambig.fst
+  fi
+fi
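Stage 5 above delegates the BPE training to ./local/train_bpe_model.py (symlinked from the librispeech recipe). Roughly, that script wraps sentencepiece along these lines; the exact options here (user symbols, ids) follow icefall's usual convention and are an assumption, not a copy of the script:

import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input="data/lang_bpe_500/text",  # normalized transcripts, one per line
    model_prefix="data/lang_bpe_500/bpe",
    model_type="bpe",
    vocab_size=500,
    character_coverage=1.0,
    user_defined_symbols=["<blk>", "<sos/eos>"],  # occupy ids 0 and 1
    unk_id=2,
    bos_id=-1,
    eos_id=-1,
)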

egs/libritts/ASR/prepare_lm.sh (new executable file, 264 lines)
View File

@@ -0,0 +1,264 @@
#!/usr/bin/env bash

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

set -eou pipefail

# This script generates the n-gram LM / NNLM and related files needed by decoding.
#
# We assume dl_dir (download dir) contains the following
# directories and files. If not, they will be downloaded
# by this script automatically.
#
#  - $dl_dir/lm
#      This directory contains the following files downloaded from
#      http://www.openslr.org/resources/11
#
#        - 3-gram.pruned.1e-7.arpa.gz
#        - 3-gram.pruned.1e-7.arpa
#        - 4-gram.arpa.gz
#        - 4-gram.arpa
#        - librispeech-vocab.txt
#        - librispeech-lexicon.txt
#        - librispeech-lm-norm.txt.gz

. prepare.sh --stage -1 --stop-stage 6 || exit 1

log "Running prepare_lm.sh"

stage=0
stop_stage=100

. shared/parse_options.sh || exit 1

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Prepare BPE based lexicon."

  for vocab_size in ${vocab_sizes[@]}; do
    lang_dir=data/lang_bpe_${vocab_size}
    # We reuse words.txt from phone based lexicon
    # so that the two can share G.pt later.
    cp data/lang_phone/words.txt $lang_dir

    if [ ! -f $lang_dir/L_disambig.pt ]; then
      ./local/prepare_lang_bpe.py --lang-dir $lang_dir

      log "Validating $lang_dir/lexicon.txt"
      ./local/validate_bpe_lexicon.py \
        --lexicon $lang_dir/lexicon.txt \
        --bpe-model $lang_dir/bpe.model
    fi

    if [ ! -f $lang_dir/L.fst ]; then
      log "Converting L.pt to L.fst"
      ./shared/convert-k2-to-openfst.py \
        --olabels aux_labels \
        $lang_dir/L.pt \
        $lang_dir/L.fst
    fi

    if [ ! -f $lang_dir/L_disambig.fst ]; then
      log "Converting L_disambig.pt to L_disambig.fst"
      ./shared/convert-k2-to-openfst.py \
        --olabels aux_labels \
        $lang_dir/L_disambig.pt \
        $lang_dir/L_disambig.fst
    fi
  done
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare word level G"
  # We assume you have installed kaldilm, if not, please install
  # it using: pip install kaldilm

  mkdir -p data/lm

  if [ ! -f data/lm/G_3_gram.fst.txt ]; then
    # It is used in building HLG
    python3 -m kaldilm \
      --read-symbol-table="data/lang_phone/words.txt" \
      --disambig-symbol='#0' \
      --max-order=3 \
      $dl_dir/lm/3-gram.pruned.1e-7.arpa > data/lm/G_3_gram.fst.txt
  fi

  if [ ! -f data/lm/G_4_gram.fst.txt ]; then
    # It is used for LM rescoring
    python3 -m kaldilm \
      --read-symbol-table="data/lang_phone/words.txt" \
      --disambig-symbol='#0' \
      --max-order=4 \
      $dl_dir/lm/4-gram.arpa > data/lm/G_4_gram.fst.txt
  fi

  for vocab_size in ${vocab_sizes[@]}; do
    lang_dir=data/lang_bpe_${vocab_size}

    if [ ! -f $lang_dir/HL.fst ]; then
      ./local/prepare_lang_fst.py \
        --lang-dir $lang_dir \
        --ngram-G ./data/lm/G_3_gram.fst.txt
    fi
  done
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Compile HLG"
  ./local/compile_hlg.py --lang-dir data/lang_phone

  # Note: If ./local/compile_hlg.py throws OOM,
  # please switch to the following command
  #
  # ./local/compile_hlg_using_openfst.py --lang-dir data/lang_phone

  for vocab_size in ${vocab_sizes[@]}; do
    lang_dir=data/lang_bpe_${vocab_size}
    ./local/compile_hlg.py --lang-dir $lang_dir

    # Note: If ./local/compile_hlg.py throws OOM,
    # please switch to the following command
    #
    # ./local/compile_hlg_using_openfst.py --lang-dir $lang_dir
  done
fi

# Compile LG for RNN-T fast_beam_search decoding
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Compile LG"
  ./local/compile_lg.py --lang-dir data/lang_phone

  for vocab_size in ${vocab_sizes[@]}; do
    lang_dir=data/lang_bpe_${vocab_size}
    ./local/compile_lg.py --lang-dir $lang_dir
  done
fi

if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Prepare token level ngram G"
  for vocab_size in ${vocab_sizes[@]}; do
    lang_dir=data/lang_bpe_${vocab_size}

    if [ ! -f $lang_dir/transcript_tokens.txt ]; then
      ./local/convert_transcript_words_to_tokens.py \
        --lexicon $lang_dir/lexicon.txt \
        --transcript $lang_dir/transcript_words.txt \
        --oov "<UNK>" \
        > $lang_dir/transcript_tokens.txt
    fi

    for ngram in 2 3 4 5; do
      if [ ! -f $lang_dir/${ngram}gram.arpa ]; then
        ./shared/make_kn_lm.py \
          -ngram-order ${ngram} \
          -text $lang_dir/transcript_tokens.txt \
          -lm $lang_dir/${ngram}gram.arpa
      fi

      if [ ! -f $lang_dir/${ngram}gram.fst.txt ]; then
        python3 -m kaldilm \
          --read-symbol-table="$lang_dir/tokens.txt" \
          --disambig-symbol='#0' \
          --max-order=${ngram} \
          $lang_dir/${ngram}gram.arpa > $lang_dir/${ngram}gram.fst.txt
      fi
    done
  done
fi

if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Generate NNLM training data"

  for vocab_size in ${vocab_sizes[@]}; do
    log "Processing vocab_size == ${vocab_size}"
    lang_dir=data/lang_bpe_${vocab_size}
    out_dir=data/lm_training_bpe_${vocab_size}
    mkdir -p $out_dir

    ./local/prepare_lm_training_data.py \
      --bpe-model $lang_dir/bpe.model \
      --lm-data $dl_dir/lm/librispeech-lm-norm.txt \
      --lm-archive $out_dir/lm_data.pt
  done
fi

if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  log "Stage 6: Generate NNLM validation data"

  for vocab_size in ${vocab_sizes[@]}; do
    log "Processing vocab_size == ${vocab_size}"
    out_dir=data/lm_training_bpe_${vocab_size}
    mkdir -p $out_dir

    if [ ! -f $out_dir/valid.txt ]; then
      gunzip -c data/manifests/libritts_supervisions_dev-clean.jsonl.gz \
        | jq ".text" | sed 's/"//g' \
        | ./local/norm_text.py > $out_dir/valid.txt
      gunzip -c data/manifests/libritts_supervisions_dev-other.jsonl.gz \
        | jq ".text" | sed 's/"//g' \
        | ./local/norm_text.py >> $out_dir/valid.txt
    fi

    lang_dir=data/lang_bpe_${vocab_size}
    ./local/prepare_lm_training_data.py \
      --bpe-model $lang_dir/bpe.model \
      --lm-data $out_dir/valid.txt \
      --lm-archive $out_dir/lm_data-valid.pt
  done
fi

if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
  log "Stage 7: Generate NNLM test data"

  for vocab_size in ${vocab_sizes[@]}; do
    log "Processing vocab_size == ${vocab_size}"
    out_dir=data/lm_training_bpe_${vocab_size}
    mkdir -p $out_dir

    if [ ! -f $out_dir/test.txt ]; then
      gunzip -c data/manifests/libritts_supervisions_test-clean.jsonl.gz \
        | jq ".text" | sed 's/"//g' \
        | ./local/norm_text.py > $out_dir/test.txt
      gunzip -c data/manifests/libritts_supervisions_test-other.jsonl.gz \
        | jq ".text" | sed 's/"//g' \
        | ./local/norm_text.py >> $out_dir/test.txt
    fi

    lang_dir=data/lang_bpe_${vocab_size}
    ./local/prepare_lm_training_data.py \
      --bpe-model $lang_dir/bpe.model \
      --lm-data $out_dir/test.txt \
      --lm-archive $out_dir/lm_data-test.pt
  done
fi

if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
  log "Stage 8: Sort NNLM training data"
  # Sort LM training data by sentence length in descending order
  # for ease of training.
  #
  # Sentence length equals the number of BPE tokens in a sentence.

  for vocab_size in ${vocab_sizes[@]}; do
    out_dir=data/lm_training_bpe_${vocab_size}
    mkdir -p $out_dir

    ./local/sort_lm_training_data.py \
      --in-lm-data $out_dir/lm_data.pt \
      --out-lm-data $out_dir/sorted_lm_data.pt \
      --out-statistics $out_dir/statistics.txt

    ./local/sort_lm_training_data.py \
      --in-lm-data $out_dir/lm_data-valid.pt \
      --out-lm-data $out_dir/sorted_lm_data-valid.pt \
      --out-statistics $out_dir/statistics-valid.txt

    ./local/sort_lm_training_data.py \
      --in-lm-data $out_dir/lm_data-test.pt \
      --out-lm-data $out_dir/sorted_lm_data-test.pt \
      --out-statistics $out_dir/statistics-test.txt
  done
fi
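A small aside on Stage 8: sorting sentences by BPE-token length before batching means each batch pads to a similar length and wastes little compute. A toy sketch of the idea (not the actual sort_lm_training_data.py):

import torch

sentences = [[1, 2, 3], [4, 5], [6, 7, 8, 9], [10]]  # token ids
sentences.sort(key=len, reverse=True)  # longest first
for i in range(0, len(sentences), 2):  # batches of 2
    batch = sentences[i : i + 2]
    max_len = max(len(s) for s in batch)
    padded = [s + [0] * (max_len - len(s)) for s in batch]
    print(torch.tensor(padded).shape)  # (2, 4) then (2, 2): minimal padding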

View File

@@ -1041,13 +1041,13 @@ def main():
     # we need cut ids to display recognition results.
     args.return_cuts = True
-    librispeech = LibriTTSAsrDataModule(args)
+    libritts = LibriTTSAsrDataModule(args)

-    test_clean_cuts = librispeech.test_clean_cuts()
-    test_other_cuts = librispeech.test_other_cuts()
+    test_clean_cuts = libritts.test_clean_cuts()
+    test_other_cuts = libritts.test_other_cuts()

-    test_clean_dl = librispeech.test_dataloaders(test_clean_cuts)
-    test_other_dl = librispeech.test_dataloaders(test_other_cuts)
+    test_clean_dl = libritts.test_dataloaders(test_clean_cuts)
+    test_other_dl = libritts.test_dataloaders(test_other_cuts)

     test_sets = ["test-clean", "test-other"]
     test_dl = [test_clean_dl, test_other_dl]

View File

@@ -864,10 +864,10 @@ def main():
     num_param = sum([p.numel() for p in model.parameters()])
     logging.info(f"Number of model parameters: {num_param}")

-    librispeech = LibriTTSAsrDataModule(args)
+    libritts = LibriTTSAsrDataModule(args)

-    test_clean_cuts = librispeech.test_clean_cuts()
-    test_other_cuts = librispeech.test_other_cuts()
+    test_clean_cuts = libritts.test_clean_cuts()
+    test_other_cuts = libritts.test_other_cuts()

     test_sets = ["test-clean", "test-other"]
     test_cuts = [test_clean_cuts, test_other_cuts]

View File

@@ -603,6 +603,15 @@ def _to_int_tuple(s: str):
     return tuple(map(int, s.split(",")))


+def remove_punc_to_upper(text: str) -> str:
+    text = text.replace("‘", "'")
+    text = text.replace("’", "'")
+    tokens = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'")
+    s_list = [x.upper() if x in tokens else " " for x in text]
+    s = " ".join("".join(s_list).split()).strip()
+    return s
+
+
 def get_encoder_embed(params: AttributeDict) -> nn.Module:
     # encoder_embed converts the input of shape (N, T, num_features)
     # to the shape (N, (T - 7) // 2, encoder_dims).
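For illustration, the behavior of the new helper, assuming remove_punc_to_upper as defined above is in scope (the expected output is traced by hand from that definition):

print(remove_punc_to_upper("It’s a test, isn't it?"))
# -> "IT'S A TEST ISN'T IT"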
@@ -1284,21 +1293,26 @@ def run(rank, world_size, args):
     if params.inf_check:
         register_inf_check_hooks(model)

-    librispeech = LibriTTSAsrDataModule(args)
+    libritts = LibriTTSAsrDataModule(args)

     if params.full_libri:
-        train_cuts = librispeech.train_all_shuf_cuts()
+        train_cuts = libritts.train_all_shuf_cuts()

         # previously we used the following code to load all training cuts,
         # strictly speaking, shuffled training cuts should be used instead,
         # but we leave the code here to demonstrate that there is an option
         # like this to combine multiple cutsets

-        # train_cuts = librispeech.train_clean_100_cuts()
-        # train_cuts += librispeech.train_clean_360_cuts()
-        # train_cuts += librispeech.train_other_500_cuts()
+        # train_cuts = libritts.train_clean_100_cuts()
+        # train_cuts += libritts.train_clean_360_cuts()
+        # train_cuts += libritts.train_other_500_cuts()
     else:
-        train_cuts = librispeech.train_clean_100_cuts()
+        train_cuts = libritts.train_clean_100_cuts()
+
+    def normalize_text(c: Cut):
+        text = remove_punc_to_upper(c.supervisions[0].text)
+        c.supervisions[0].text = text
+        return c

     def remove_short_and_long_utt(c: Cut):
         # Keep only utterances with duration between 1 second and 20 seconds
@@ -1338,6 +1352,7 @@ def run(rank, world_size, args):
         return True

     train_cuts = train_cuts.filter(remove_short_and_long_utt)
+    train_cuts = train_cuts.map(normalize_text)

     if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
         # We only load the sampler's state dict when it loads a checkpoint
@@ -1346,13 +1361,13 @@ def run(rank, world_size, args):
     else:
         sampler_state_dict = None

-    train_dl = librispeech.train_dataloaders(
+    train_dl = libritts.train_dataloaders(
         train_cuts, sampler_state_dict=sampler_state_dict
     )

-    valid_cuts = librispeech.dev_clean_cuts()
-    valid_cuts += librispeech.dev_other_cuts()
-    valid_dl = librispeech.valid_dataloaders(valid_cuts)
+    valid_cuts = libritts.dev_clean_cuts()
+    valid_cuts += libritts.dev_other_cuts()
+    valid_dl = libritts.valid_dataloaders(valid_cuts)

     if not params.print_diagnostics:
         scan_pessimistic_batches_for_oom(
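The chained transforms above use lhotse's lazy CutSet API; filter and map both return new CutSets that are applied on the fly during iteration. A minimal sketch of the pattern (path and helper names hypothetical):

from lhotse import CutSet

def keep_1_to_20_seconds(cut) -> bool:  # same idea as remove_short_and_long_utt
    return 1.0 <= cut.duration <= 20.0

def upper_case_text(cut):  # same idea as normalize_text; must return the cut
    cut.supervisions[0].text = cut.supervisions[0].text.upper()
    return cut

cuts = CutSet.from_file("data/fbank/libritts_cuts_train-clean-100.jsonl.gz")
cuts = cuts.filter(keep_1_to_20_seconds).map(upper_case_text)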

View File

@@ -37,15 +37,6 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   if [ ! -d $dl_dir/LibriTTS ]; then
     lhotse download libritts $dl_dir
   fi
-
-  # If you have pre-downloaded it to /path/to/musan,
-  # you can create a symlink
-  #
-  #   ln -sfv /path/to/musan $dl_dir/musan
-  #
-  if [ ! -d $dl_dir/musan ]; then
-    lhotse download musan $dl_dir
-  fi
 fi

 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then