Mirror of https://github.com/k2-fsa/icefall.git (synced 2025-08-09 01:52:41 +00:00)
Use CutSet.mux for multidataset (#1020)
* Use CutSet.mux
* Remove unintended change
* Fix for style check
This commit is contained in:
parent d67a49afe4
commit 2096e69bda
@@ -328,46 +328,3 @@ if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
     ./prepare_common_voice.sh
   fi
 fi
-
-if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
-  log "Stage 11: Create multidataset"
-  split_dir=data/fbank/multidataset_split_${num_splits}
-  if [ ! -f data/fbank/multidataset_split/.multidataset.done ]; then
-    mkdir -p $split_dir/multidataset
-    log "Split LibriSpeech"
-    if [ ! -f $split_dir/.librispeech_split.done ]; then
-      lhotse split $num_splits ./data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz $split_dir
-      touch $split_dir/.librispeech_split.done
-    fi
-
-    if [[ "${multidataset[@]}" =~ "gigaspeech" ]]; then
-      log "Split GigaSpeech XL"
-      if [ ! -f $split_dir/.gigaspeech_XL_split.done ]; then
-        cd $split_dir
-        ln -sv ../gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz .
-        cd ../../..
-        touch $split_dir/.gigaspeech_XL_split.done
-      fi
-    fi
-
-    if [[ "${multidataset[@]}" =~ "commonvoice" ]]; then
-      log "Split CommonVoice"
-      if [ ! -f $split_dir/.cv-en_train_split.done ]; then
-        lhotse split $num_splits ./data/en/fbank/cv-en_cuts_train.jsonl.gz $split_dir
-        touch $split_dir/.cv-en_train_split.done
-      fi
-    fi
-
-    if [ ! -f $split_dir/.multidataset_mix.done ]; then
-      log "Mix multidataset"
-      for ((seq=1; seq<=$num_splits; seq++)); do
-        fseq=$(printf "%04d" $seq)
-        gunzip -c $split_dir/*.*${fseq}.jsonl.gz | \
-          shuf | gzip -c > $split_dir/multidataset/multidataset_cuts_train.${fseq}.jsonl.gz
-      done
-      touch $split_dir/.multidataset_mix.done
-    fi
-
-    touch data/fbank/multidataset_split/.multidataset.done
-  fi
-fi
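The removed Stage 11 pre-mixed the corpora on disk (lhotse split per corpus, then gunzip | shuf | gzip per split). After this commit that mixing happens lazily at iteration time through lhotse's CutSet.mux. A minimal sketch of the idea using two of the corpora, assuming the manifest paths used elsewhere in this recipe; the explicit weights are illustrative only and are not used in the committed code:

    from lhotse import CutSet, load_manifest_lazy

    # Open each corpus manifest lazily; nothing is decompressed or shuffled on disk.
    librispeech = load_manifest_lazy("data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz")
    commonvoice = load_manifest_lazy("data/en/fbank/cv-en_cuts_train.jsonl.gz")

    # Interleave the two streams at iteration time.  The weights are hypothetical;
    # the committed code calls CutSet.mux() without explicit weights.
    train_cuts = CutSet.mux(librispeech, commonvoice, weights=[0.7, 0.3])

    # Cuts arrive already mixed, so no pre-mixed multidataset_cuts_train.*.jsonl.gz
    # files are needed on disk anymore.
    first_cut = next(iter(train_cuts))
    print(first_cut.id)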
@@ -25,29 +25,53 @@ from lhotse import CutSet, load_manifest_lazy
 
 
 class MultiDataset:
-    def __init__(self, manifest_dir: str):
+    def __init__(self, manifest_dir: str, cv_manifest_dir: str):
         """
         Args:
           manifest_dir:
             It is expected to contain the following files:
 
-            - multidataset_split_1998/multidataset/multidataset_cuts_train.*.jsonl.gz
+            - librispeech_cuts_train-all-shuf.jsonl.gz
+            - gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz
+
+          cv_manifest_dir:
+            It is expected to contain the following files:
+
+            - cv-en_cuts_train.jsonl.gz
         """
         self.manifest_dir = Path(manifest_dir)
+        self.cv_manifest_dir = Path(cv_manifest_dir)
 
     def train_cuts(self) -> CutSet:
         logging.info("About to get multidataset train cuts")
 
-        filenames = glob.glob(
-            f"{self.manifest_dir}/multidataset_split_1998/multidataset/multidataset_cuts_train.*.jsonl.gz"
+        # LibriSpeech
+        logging.info(f"Loading LibriSpeech in lazy mode")
+        librispeech_cuts = load_manifest_lazy(
+            self.manifest_dir / "librispeech_cuts_train-all-shuf.jsonl.gz"
         )
 
-        pattern = re.compile(r"multidataset_cuts_train.([0-9]+).jsonl.gz")
+        # GigaSpeech
+        filenames = glob.glob(
+            f"{self.manifest_dir}/gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz"
+        )
+
+        pattern = re.compile(r"gigaspeech_cuts_XL.([0-9]+).jsonl.gz")
         idx_filenames = ((int(pattern.search(f).group(1)), f) for f in filenames)
         idx_filenames = sorted(idx_filenames, key=lambda x: x[0])
 
         sorted_filenames = [f[1] for f in idx_filenames]
 
-        logging.info(f"Loading {len(sorted_filenames)} splits")
+        logging.info(f"Loading GigaSpeech {len(sorted_filenames)} splits in lazy mode")
 
-        return lhotse.combine(lhotse.load_manifest_lazy(p) for p in sorted_filenames)
+        gigaspeech_cuts = lhotse.combine(
+            lhotse.load_manifest_lazy(p) for p in sorted_filenames
+        )
+
+        # CommonVoice
+        logging.info(f"Loading CommonVoice in lazy mode")
+        commonvoice_cuts = load_manifest_lazy(
+            self.cv_manifest_dir / f"cv-en_cuts_train.jsonl.gz"
+        )
+
+        return CutSet.mux(librispeech_cuts, gigaspeech_cuts, commonvoice_cuts)
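A design note on the new return statement: mux is called without weights, so the relative sampling between LibriSpeech, GigaSpeech and CommonVoice follows lhotse's default behaviour. If an explicit corpus balance were wanted, CutSet.mux also accepts a weights argument; a hedged sketch with purely illustrative proportions (the helper name and the values are not part of the commit):

    from lhotse import CutSet

    def mux_with_weights(librispeech_cuts, gigaspeech_cuts, commonvoice_cuts) -> CutSet:
        # Illustrative weights only; the committed train_cuts() relies on mux defaults.
        return CutSet.mux(
            librispeech_cuts,
            gigaspeech_cuts,
            commonvoice_cuts,
            weights=[0.3, 0.5, 0.2],  # relative sampling probability per source
        )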
@@ -1053,7 +1053,7 @@ def run(rank, world_size, args):
     librispeech = LibriSpeechAsrDataModule(args)
 
     if params.use_multidataset:
-        multidataset = MultiDataset(params.manifest_dir)
+        multidataset = MultiDataset(params.manifest_dir, params.cv_manifest_dir)
         train_cuts = multidataset.train_cuts()
     else:
         if params.full_libri:
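For completeness, downstream of this branch the selected cuts feed the usual dataloader construction. A minimal sketch of how the pieces fit together, assuming LibriSpeechAsrDataModule exposes train_dataloaders() and the train_*_cuts() helpers found in other icefall librispeech recipes (the else-path helper names are assumptions, not part of this diff):

    def build_train_dataloader(params, librispeech, multidataset):
        # Mirrors the branch shown above; the else-path helper names are assumed.
        if params.use_multidataset:
            train_cuts = multidataset.train_cuts()  # lazily muxed CutSet
        elif params.full_libri:
            train_cuts = librispeech.train_all_shuf_cuts()  # assumed helper
        else:
            train_cuts = librispeech.train_clean_100_cuts()  # assumed helper
        # train_dataloaders() is assumed to accept a CutSet, as elsewhere in icefall.
        return librispeech.train_dataloaders(train_cuts)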