Add multidataset (#1010)

* Add Common Voice for multidataset

* Add prepare_multidataset.sh

* Add dataset mixing


* Update prepare_multidataset.sh

* Update prepare_giga_speech.sh

* Update comments

* Add split and shuffle mechanism

* Add multi-dataset train

* Fix for deleting

* Fix for modifying

* Add comments

* Change type for perturb_speed

* Fix for style check

* Small fix

* Add filter

* Remove warning
Yifan Yang 2023-04-21 18:09:41 +08:00 committed by GitHub
parent 57d6482a79
commit d67a49afe4
7 changed files with 624 additions and 38 deletions

local/compute_fbank_librispeech.py

@@ -35,7 +35,7 @@ from filter_cuts import filter_cuts
 from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached

-from icefall.utils import get_executor
+from icefall.utils import get_executor, str2bool

 # Torch's multithreaded behavior needs to be disabled or
 # it wastes a lot of CPU and slow things down.
@@ -61,12 +61,20 @@ def get_args():
         help="""Dataset parts to compute fbank. If None, we will use all""",
     )

+    parser.add_argument(
+        "--perturb-speed",
+        type=str2bool,
+        default=True,
+        help="""Perturb speed with factor 0.9 and 1.1 on train subset.""",
+    )
+
     return parser.parse_args()


 def compute_fbank_librispeech(
     bpe_model: Optional[str] = None,
     dataset: Optional[str] = None,
+    perturb_speed: Optional[bool] = True,
 ):
     src_dir = Path("data/manifests")
     output_dir = Path("data/fbank")
@@ -125,9 +133,13 @@ def compute_fbank_librispeech(
         if "train" in partition:
             if bpe_model:
                 cut_set = filter_cuts(cut_set, sp)
-            cut_set = (
-                cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
-            )
+            if perturb_speed:
+                logging.info("Doing speed perturbation")
+                cut_set = (
+                    cut_set
+                    + cut_set.perturb_speed(0.9)
+                    + cut_set.perturb_speed(1.1)
+                )
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
                 storage_path=f"{output_dir}/{prefix}_feats_{partition}",
@@ -145,4 +157,8 @@ if __name__ == "__main__":
     logging.basicConfig(format=formatter, level=logging.INFO)
     args = get_args()
     logging.info(vars(args))
-    compute_fbank_librispeech(bpe_model=args.bpe_model, dataset=args.dataset)
+    compute_fbank_librispeech(
+        bpe_model=args.bpe_model,
+        dataset=args.dataset,
+        perturb_speed=args.perturb_speed,
+    )
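
For reference, a minimal sketch of what the new --perturb-speed flag toggles (it restates the branch above; the helper name maybe_perturb_speed is not part of the PR, and lhotse's CutSet.perturb_speed is assumed as already used by this script):

    from lhotse import CutSet

    def maybe_perturb_speed(cut_set: CutSet, perturb_speed: bool) -> CutSet:
        # With --perturb-speed True the train cuts are tripled by adding
        # 0.9x and 1.1x speed-perturbed copies; with False they are kept as-is.
        if perturb_speed:
            return cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
        return cut_set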

prepare_common_voice.sh (new file)

@@ -0,0 +1,117 @@
#!/usr/bin/env bash
set -eou pipefail
nj=16
stage=-1
stop_stage=100
# Split the ${lang} train subset into this number of pieces.
# This is to avoid OOM during feature extraction.
num_splits=1000
# We assume dl_dir (download dir) contains the following
# directories and files. If not, they will be downloaded
# by this script automatically.
#
# - $dl_dir/$release/$lang
# This directory contains the following files downloaded from
# https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/${release}/${release}-${lang}.tar.gz
#
# - clips
# - dev.tsv
# - invalidated.tsv
# - other.tsv
# - reported.tsv
# - test.tsv
# - train.tsv
# - validated.tsv
dl_dir=$PWD/download
release=cv-corpus-13.0-2023-03-09
lang=en
. shared/parse_options.sh || exit 1
# All files generated by this script are saved in "data/${lang}".
# You can safely remove "data/${lang}" and rerun this script to regenerate it.
mkdir -p data/${lang}
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
log "dl_dir: $dl_dir"
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
log "Stage 0: Download data"
# If you have pre-downloaded it to /path/to/$release,
# you can create a symlink
#
# ln -sfv /path/to/$release $dl_dir/$release
#
if [ ! -d $dl_dir/$release/$lang/clips ]; then
lhotse download commonvoice --languages $lang --release $release $dl_dir
fi
fi
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
log "Stage 1: Prepare CommonVoice manifest"
# We assume that you have downloaded the CommonVoice corpus
# to $dl_dir/$release
mkdir -p data/${lang}/manifests
if [ ! -e data/${lang}/manifests/.cv-${lang}.done ]; then
lhotse prepare commonvoice --language $lang -j $nj $dl_dir/$release data/${lang}/manifests
touch data/${lang}/manifests/.cv-${lang}.done
fi
fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
log "Stage 2: Preprocess CommonVoice manifest"
if [ ! -e data/${lang}/fbank/.preprocess_complete ]; then
./local/preprocess_commonvoice.py --language $lang
touch data/${lang}/fbank/.preprocess_complete
fi
fi
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
log "Stage 3: Compute fbank for dev and test subsets of CommonVoice"
mkdir -p data/${lang}/fbank
if [ ! -e data/${lang}/fbank/.cv-${lang}_dev_test.done ]; then
./local/compute_fbank_commonvoice_dev_test.py --language $lang
touch data/${lang}/fbank/.cv-${lang}_dev_test.done
fi
fi
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
log "Stage 4: Split train subset into ${num_splits} pieces"
split_dir=data/${lang}/fbank/cv-${lang}_train_split_${num_splits}
if [ ! -e $split_dir/.cv-${lang}_train_split.done ]; then
lhotse split $num_splits ./data/${lang}/fbank/cv-${lang}_cuts_train_raw.jsonl.gz $split_dir
touch $split_dir/.cv-${lang}_train_split.done
fi
fi
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
log "Stage 5: Compute features for train subset of CommonVoice"
if [ ! -e data/${lang}/fbank/.cv-${lang}_train.done ]; then
./local/compute_fbank_commonvoice_splits.py \
--num-workers $nj \
--batch-duration 600 \
--start 0 \
--num-splits $num_splits \
--language $lang
touch data/${lang}/fbank/.cv-${lang}_train.done
fi
fi
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
log "Stage 6: Combine features for train"
if [ ! -f data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz ]; then
pieces=$(find data/${lang}/fbank/cv-${lang}_train_split_${num_splits} -name "cv-${lang}_cuts_train.*.jsonl.gz")
lhotse combine $pieces data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz
fi
fi

prepare_giga_speech.sh

@@ -95,39 +95,45 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
   log "Stage 1: Prepare GigaSpeech manifest (may take 30 minutes)"
   # We assume that you have downloaded the GigaSpeech corpus
   # to $dl_dir/GigaSpeech
-  mkdir -p data/manifests
-  lhotse prepare gigaspeech \
-    --subset XL \
-    --subset L \
-    --subset M \
-    --subset S \
-    --subset XS \
-    --subset DEV \
-    --subset TEST \
-    -j $nj \
-    $dl_dir/GigaSpeech data/manifests
+  if [ ! -f data/manifests/.gigaspeech.done ]; then
+    mkdir -p data/manifests
+    lhotse prepare gigaspeech \
+      --subset XL \
+      --subset L \
+      --subset M \
+      --subset S \
+      --subset XS \
+      --subset DEV \
+      --subset TEST \
+      -j $nj \
+      $dl_dir/GigaSpeech data/manifests
+    touch data/manifests/.gigaspeech.done
+  fi
 fi

 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
   log "Stage 2: Preprocess GigaSpeech manifest"
-  if [ ! -f data/fbank/.preprocess_complete ]; then
+  if [ ! -f data/fbank/.gigaspeech_preprocess.done ]; then
     log "It may take 2 hours for this stage"
-    python3 ./local/preprocess_gigaspeech.py
-    touch data/fbank/.preprocess_complete
+    ./local/preprocess_gigaspeech.py
+    touch data/fbank/.gigaspeech_preprocess.done
   fi
 fi

 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   log "Stage 3: Compute features for DEV and TEST subsets of GigaSpeech (may take 2 minutes)"
-  python3 ./local/compute_fbank_gigaspeech_dev_test.py
+  if [ ! -f data/fbank/.gigaspeech_dev_test.done ]; then
+    ./local/compute_fbank_gigaspeech_dev_test.py
+    touch data/fbank/.gigaspeech_dev_test.done
+  fi
 fi

 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
   log "Stage 4: Split XL subset into ${num_splits} pieces"
   split_dir=data/fbank/gigaspeech_XL_split_${num_splits}
-  if [ ! -f $split_dir/.split_completed ]; then
+  if [ ! -f $split_dir/.gigaspeech_XL_split.done ]; then
     lhotse split-lazy ./data/fbank/gigaspeech_cuts_XL_raw.jsonl.gz $split_dir $chunk_size
-    touch $split_dir/.split_completed
+    touch $split_dir/.gigaspeech_XL_split.done
   fi
 fi
@@ -135,8 +141,19 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   log "Stage 5: Compute features for XL"
   # Note: The script supports --start and --stop options.
   # You can use several machines to compute the features in parallel.
-  python3 ./local/compute_fbank_gigaspeech_splits.py \
-    --num-workers $nj \
-    --batch-duration 600 \
-    --num-splits $num_splits
+  if [ ! -f data/fbank/.gigaspeech_XL.done ]; then
+    ./local/compute_fbank_gigaspeech_splits.py \
+      --num-workers $nj \
+      --batch-duration 600 \
+      --num-splits $num_splits
+    touch data/fbank/.gigaspeech_XL.done
+  fi
+fi
+
+if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
+  log "Stage 6: Combine features for XL (may take 15 hours)"
+  if [ ! -f data/fbank/gigaspeech_cuts_XL.jsonl.gz ]; then
+    pieces=$(find data/fbank/gigaspeech_XL_split_${num_splits} -name "gigaspeech_cuts_XL.*.jsonl.gz")
+    lhotse combine $pieces data/fbank/gigaspeech_cuts_XL.jsonl.gz
+  fi
 fi

prepare_multidataset.sh (new file)

@@ -0,0 +1,373 @@
#!/usr/bin/env bash
# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
set -eou pipefail
nj=16
stage=-1
stop_stage=100
# We assume dl_dir (download dir) contains the following
# directories and files. If not, they will be downloaded
# by this script automatically.
#
# - $dl_dir/LibriSpeech
# You can find BOOKS.TXT, test-clean, train-clean-360, etc, inside it.
# You can download them from https://www.openslr.org/12
#
# - $dl_dir/lm
# This directory contains the following files downloaded from
# http://www.openslr.org/resources/11
#
# - 3-gram.pruned.1e-7.arpa.gz
# - 3-gram.pruned.1e-7.arpa
# - 4-gram.arpa.gz
# - 4-gram.arpa
# - librispeech-vocab.txt
# - librispeech-lexicon.txt
# - librispeech-lm-norm.txt.gz
#
# - $dl_dir/musan
# This directory contains the following directories downloaded from
# http://www.openslr.org/17/
#
# - music
# - noise
# - speech
# Split each dataset into this number of pieces and mix the pieces
# from all datasets into multidataset pieces, with shuffling.
num_splits=1998
dl_dir=$PWD/download
. shared/parse_options.sh || exit 1
# vocab size for sentence piece models.
# It will generate data/lang_bpe_xxx,
# data/lang_bpe_yyy if the array contains xxx, yyy
vocab_sizes=(
# 5000
# 2000
# 1000
500
)
# Multidataset list.
# LibriSpeech and musan are required.
# The others are optional.
multidataset=(
"gigaspeech"
"commonvoice"
)
# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
log "dl_dir: $dl_dir"
log "Dataset: LibriSpeech and musan"
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
log "Stage -1: Download LM"
mkdir -p $dl_dir/lm
if [ ! -e $dl_dir/lm/.done ]; then
./local/download_lm.py --out-dir=$dl_dir/lm
touch $dl_dir/lm/.done
fi
fi
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
log "Stage 0: Download data"
# If you have pre-downloaded it to /path/to/LibriSpeech,
# you can create a symlink
#
# ln -sfv /path/to/LibriSpeech $dl_dir/LibriSpeech
#
if [ ! -d $dl_dir/LibriSpeech/train-other-500 ]; then
lhotse download librispeech --full $dl_dir
fi
# If you have pre-downloaded it to /path/to/musan,
# you can create a symlink
#
# ln -sfv /path/to/musan $dl_dir/
#
if [ ! -d $dl_dir/musan ]; then
lhotse download musan $dl_dir
fi
fi
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
log "Stage 1: Prepare LibriSpeech manifest"
# We assume that you have downloaded the LibriSpeech corpus
# to $dl_dir/LibriSpeech
mkdir -p data/manifests
if [ ! -e data/manifests/.librispeech.done ]; then
lhotse prepare librispeech -j $nj $dl_dir/LibriSpeech data/manifests
touch data/manifests/.librispeech.done
fi
fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
log "Stage 2: Prepare musan manifest"
# We assume that you have downloaded the musan corpus
# to data/musan
mkdir -p data/manifests
if [ ! -e data/manifests/.musan.done ]; then
lhotse prepare musan $dl_dir/musan data/manifests
touch data/manifests/.musan.done
fi
fi
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
log "Stage 3: Compute fbank for librispeech"
mkdir -p data/fbank
if [ ! -e data/fbank/.librispeech.done ]; then
./local/compute_fbank_librispeech.py --perturb-speed False
touch data/fbank/.librispeech.done
fi
if [ ! -f data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz ]; then
cat <(gunzip -c data/fbank/librispeech_cuts_train-clean-100.jsonl.gz) \
<(gunzip -c data/fbank/librispeech_cuts_train-clean-360.jsonl.gz) \
<(gunzip -c data/fbank/librispeech_cuts_train-other-500.jsonl.gz) | \
shuf | gzip -c > data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz
fi
if [ ! -e data/fbank/.librispeech-validated.done ]; then
log "Validating data/fbank for LibriSpeech"
parts=(
train-clean-100
train-clean-360
train-other-500
test-clean
test-other
dev-clean
dev-other
)
for part in ${parts[@]}; do
python3 ./local/validate_manifest.py \
data/fbank/librispeech_cuts_${part}.jsonl.gz
done
touch data/fbank/.librispeech-validated.done
fi
fi
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
log "Stage 4: Compute fbank for musan"
mkdir -p data/fbank
if [ ! -e data/fbank/.musan.done ]; then
./local/compute_fbank_musan.py
touch data/fbank/.musan.done
fi
fi
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
log "Stage 5: Prepare phone based lang"
lang_dir=data/lang_phone
mkdir -p $lang_dir
(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
cat - $dl_dir/lm/librispeech-lexicon.txt |
sort | uniq > $lang_dir/lexicon.txt
if [ ! -f $lang_dir/L_disambig.pt ]; then
./local/prepare_lang.py --lang-dir $lang_dir
fi
if [ ! -f $lang_dir/L.fst ]; then
log "Converting L.pt to L.fst"
./shared/convert-k2-to-openfst.py \
--olabels aux_labels \
$lang_dir/L.pt \
$lang_dir/L.fst
fi
if [ ! -f $lang_dir/L_disambig.fst ]; then
log "Converting L_disambig.pt to L_disambig.fst"
./shared/convert-k2-to-openfst.py \
--olabels aux_labels \
$lang_dir/L_disambig.pt \
$lang_dir/L_disambig.fst
fi
fi
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
log "Stage 6: Prepare BPE based lang"
for vocab_size in ${vocab_sizes[@]}; do
lang_dir=data/lang_bpe_${vocab_size}
mkdir -p $lang_dir
# We reuse words.txt from phone based lexicon
# so that the two can share G.pt later.
cp data/lang_phone/words.txt $lang_dir
if [ ! -f $lang_dir/transcript_words.txt ]; then
log "Generate data for BPE training"
files=$(
find "$dl_dir/LibriSpeech/train-clean-100" -name "*.trans.txt"
find "$dl_dir/LibriSpeech/train-clean-360" -name "*.trans.txt"
find "$dl_dir/LibriSpeech/train-other-500" -name "*.trans.txt"
)
for f in ${files[@]}; do
cat $f | cut -d " " -f 2-
done > $lang_dir/transcript_words.txt
fi
if [ ! -f $lang_dir/bpe.model ]; then
./local/train_bpe_model.py \
--lang-dir $lang_dir \
--vocab-size $vocab_size \
--transcript $lang_dir/transcript_words.txt
fi
if [ ! -f $lang_dir/L_disambig.pt ]; then
./local/prepare_lang_bpe.py --lang-dir $lang_dir
log "Validating $lang_dir/lexicon.txt"
./local/validate_bpe_lexicon.py \
--lexicon $lang_dir/lexicon.txt \
--bpe-model $lang_dir/bpe.model
fi
if [ ! -f $lang_dir/L.fst ]; then
log "Converting L.pt to L.fst"
./shared/convert-k2-to-openfst.py \
--olabels aux_labels \
$lang_dir/L.pt \
$lang_dir/L.fst
fi
if [ ! -f $lang_dir/L_disambig.fst ]; then
log "Converting L_disambig.pt to L_disambig.fst"
./shared/convert-k2-to-openfst.py \
--olabels aux_labels \
$lang_dir/L_disambig.pt \
$lang_dir/L_disambig.fst
fi
done
fi
if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
log "Stage 7: Prepare G"
# We assume you have installed kaldilm. If not, please install
# it using: pip install kaldilm
mkdir -p data/lm
if [ ! -f data/lm/G_3_gram.fst.txt ]; then
# It is used in building HLG
python3 -m kaldilm \
--read-symbol-table="data/lang_phone/words.txt" \
--disambig-symbol='#0' \
--max-order=3 \
$dl_dir/lm/3-gram.pruned.1e-7.arpa > data/lm/G_3_gram.fst.txt
fi
if [ ! -f data/lm/G_4_gram.fst.txt ]; then
# It is used for LM rescoring
python3 -m kaldilm \
--read-symbol-table="data/lang_phone/words.txt" \
--disambig-symbol='#0' \
--max-order=4 \
$dl_dir/lm/4-gram.arpa > data/lm/G_4_gram.fst.txt
fi
fi
if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
log "Stage 8: Compile HLG"
./local/compile_hlg.py --lang-dir data/lang_phone
# Note If ./local/compile_hlg.py throws OOM,
# please switch to the following command
#
# ./local/compile_hlg_using_openfst.py --lang-dir data/lang_phone
for vocab_size in ${vocab_sizes[@]}; do
lang_dir=data/lang_bpe_${vocab_size}
./local/compile_hlg.py --lang-dir $lang_dir
# Note If ./local/compile_hlg.py throws OOM,
# please switch to the following command
#
# ./local/compile_hlg_using_openfst.py --lang-dir $lang_dir
done
fi
# Compile LG for RNN-T fast_beam_search decoding
if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
log "Stage 9: Compile LG"
./local/compile_lg.py --lang-dir data/lang_phone
for vocab_size in ${vocab_sizes[@]}; do
lang_dir=data/lang_bpe_${vocab_size}
./local/compile_lg.py --lang-dir $lang_dir
done
fi
if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
log "Stage 10: Prepare the other datasets"
# GigaSpeech
if [[ "${multidataset[@]}" =~ "gigaspeech" ]]; then
log "Dataset: GigaSpeech"
./prepare_giga_speech.sh --stop_stage 5
fi
# CommonVoice
if [[ "${multidataset[@]}" =~ "commonvoice" ]]; then
log "Dataset: CommonVoice"
./prepare_common_voice.sh
fi
fi
if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
log "Stage 11: Create multidataset"
split_dir=data/fbank/multidataset_split_${num_splits}
if [ ! -f $split_dir/.multidataset.done ]; then
mkdir -p $split_dir/multidataset
log "Split LibriSpeech"
if [ ! -f $split_dir/.librispeech_split.done ]; then
lhotse split $num_splits ./data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz $split_dir
touch $split_dir/.librispeech_split.done
fi
if [[ "${multidataset[@]}" =~ "gigaspeech" ]]; then
log "Split GigaSpeech XL"
if [ ! -f $split_dir/.gigaspeech_XL_split.done ]; then
cd $split_dir
ln -sv ../gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz .
cd ../../..
touch $split_dir/.gigaspeech_XL_split.done
fi
fi
if [[ "${multidataset[@]}" =~ "commonvoice" ]]; then
log "Split CommonVoice"
if [ ! -f $split_dir/.cv-en_train_split.done ]; then
lhotse split $num_splits ./data/en/fbank/cv-en_cuts_train.jsonl.gz $split_dir
touch $split_dir/.cv-en_train_split.done
fi
fi
if [ ! -f $split_dir/.multidataset_mix.done ]; then
log "Mix multidataset"
for ((seq=1; seq<=$num_splits; seq++)); do
fseq=$(printf "%04d" $seq)
gunzip -c $split_dir/*.*${fseq}.jsonl.gz | \
shuf | gzip -c > $split_dir/multidataset/multidataset_cuts_train.${fseq}.jsonl.gz
done
touch $split_dir/.multidataset_mix.done
fi
touch $split_dir/.multidataset.done
fi
fi
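
As a rough illustration, the per-piece shuffle-and-mix that Stage 11 performs with gunzip/shuf/gzip could also be expressed with lhotse's Python API. The following is only a sketch, not part of the PR; it assumes the directory layout created above and lhotse's load_manifest, combine, shuffle, and to_file helpers:

    from pathlib import Path

    import lhotse

    split_dir = Path("data/fbank/multidataset_split_1998")
    out_dir = split_dir / "multidataset"
    out_dir.mkdir(parents=True, exist_ok=True)

    for seq in range(1, 1998 + 1):
        fseq = f"{seq:04d}"
        # Pieces sharing the same index, one per dataset
        # (LibriSpeech, GigaSpeech, CommonVoice).
        pieces = [lhotse.load_manifest(p) for p in split_dir.glob(f"*.*{fseq}.jsonl.gz")]
        # Combine them and shuffle the cuts before writing the mixed piece.
        mixed = lhotse.combine(pieces).shuffle()
        mixed.to_file(out_dir / f"multidataset_cuts_train.{fseq}.jsonl.gz")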

gigaspeech.py

@@ -30,7 +30,7 @@ class GigaSpeech:
         """
         Args:
           manifest_dir:
-            It is expected to contain the following files::
+            It is expected to contain the following files:

             - gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz
             - gigaspeech_cuts_L_raw.jsonl.gz

multidataset.py (new file)

@@ -0,0 +1,53 @@
# Copyright 2023 Xiaomi Corp. (authors: Yifan Yang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import logging
import re
from pathlib import Path
import lhotse
from lhotse import CutSet, load_manifest_lazy
class MultiDataset:
    def __init__(self, manifest_dir: str):
        """
        Args:
          manifest_dir:
            It is expected to contain the following files:

            - multidataset_split_1998/multidataset/multidataset_cuts_train.*.jsonl.gz
        """
        self.manifest_dir = Path(manifest_dir)

    def train_cuts(self) -> CutSet:
        logging.info("About to get multidataset train cuts")
        filenames = glob.glob(
            f"{self.manifest_dir}/multidataset_split_1998/multidataset/multidataset_cuts_train.*.jsonl.gz"
        )
        pattern = re.compile(r"multidataset_cuts_train.([0-9]+).jsonl.gz")
        idx_filenames = ((int(pattern.search(f).group(1)), f) for f in filenames)
        idx_filenames = sorted(idx_filenames, key=lambda x: x[0])
        sorted_filenames = [f[1] for f in idx_filenames]

        logging.info(f"Loading {len(sorted_filenames)} splits")

        return lhotse.combine(lhotse.load_manifest_lazy(p) for p in sorted_filenames)
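
Example usage (a sketch mirroring how train.py below consumes this class; it assumes the manifest directory is data/fbank, as laid out by prepare_multidataset.sh):

    from multidataset import MultiDataset

    multidataset = MultiDataset("data/fbank")
    # A single lazily-combined CutSet over all mixed training pieces.
    train_cuts = multidataset.train_cuts()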

train.py

@@ -1,8 +1,9 @@
 #!/usr/bin/env python3
-# Copyright 2021-2022 Xiaomi Corp. (authors: Fangjun Kuang,
+# Copyright 2021-2023 Xiaomi Corp. (authors: Fangjun Kuang,
 #                                            Wei Kang,
-#                                            Mingshuang Luo,)
-#                                            Zengwei Yao)
+#                                            Mingshuang Luo,
+#                                            Zengwei Yao,
+#                                            Yifan Yang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -59,6 +60,7 @@ import torch
 import torch.multiprocessing as mp
 import torch.nn as nn
 from asr_datamodule import LibriSpeechAsrDataModule
+from multidataset import MultiDataset
 from decoder import Decoder
 from joiner import Joiner
 from lhotse.cut import Cut
@@ -374,6 +376,13 @@ def get_parser():
         help="Whether to use half precision training.",
     )

+    parser.add_argument(
+        "--use-multidataset",
+        type=str2bool,
+        default=False,
+        help="Whether to use multidataset to train.",
+    )
+
     add_model_arguments(parser)

     return parser
@@ -1043,10 +1052,14 @@ def run(rank, world_size, args):
     librispeech = LibriSpeechAsrDataModule(args)

-    if params.full_libri:
-        train_cuts = librispeech.train_all_shuf_cuts()
+    if params.use_multidataset:
+        multidataset = MultiDataset(params.manifest_dir)
+        train_cuts = multidataset.train_cuts()
     else:
-        train_cuts = librispeech.train_clean_100_cuts()
+        if params.full_libri:
+            train_cuts = librispeech.train_all_shuf_cuts()
+        else:
+            train_cuts = librispeech.train_clean_100_cuts()

     def remove_short_and_long_utt(c: Cut):
         # Keep only utterances with duration between 1 second and 20 seconds
@@ -1058,9 +1071,6 @@ def run(rank, world_size, args):
         # an utterance duration distribution for your dataset to select
         # the threshold
         if c.duration < 1.0 or c.duration > 20.0:
-            logging.warning(
-                f"Exclude cut with ID {c.id} from training. Duration: {c.duration}"
-            )
             return False

         # In pruned RNN-T, we require that T >= S
@@ -1102,7 +1112,7 @@ def run(rank, world_size, args):
     valid_cuts += librispeech.dev_other_cuts()
     valid_dl = librispeech.valid_dataloaders(valid_cuts)

-    if not params.print_diagnostics:
+    if not params.use_multidataset and not params.print_diagnostics:
         scan_pessimistic_batches_for_oom(
             model=model,
             train_dl=train_dl,