mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-08 17:42:21 +00:00
Use shuffled LibriSpeech cuts instead (#1450)
* use shuffled LibriSpeech cuts instead
* leave the old code in comments for reference
This commit is contained in:
parent
b9b56eb879
commit
5445ea6df6
@ -952,10 +952,19 @@ def run(rank, world_size, args):
|
|||||||
|
|
||||||
librispeech = LibriSpeechAsrDataModule(args)
|
librispeech = LibriSpeechAsrDataModule(args)
|
||||||
|
|
||||||
train_cuts = librispeech.train_clean_100_cuts()
|
|
||||||
if params.full_libri:
|
if params.full_libri:
|
||||||
train_cuts += librispeech.train_clean_360_cuts()
|
train_cuts = librispeech.train_all_shuf_cuts()
|
||||||
train_cuts += librispeech.train_other_500_cuts()
|
|
||||||
|
# previously we used the following code to load all training cuts
|
||||||
|
# strictly speaking, shuffled training cuts should be used instead
|
||||||
|
# but we leave the code here to demonstrate that there is an option
|
||||||
|
# like this to combine multiple cutsets
|
||||||
|
|
||||||
|
# train_cuts = librispeech.train_clean_100_cuts()
|
||||||
|
# train_cuts += librispeech.train_clean_360_cuts()
|
||||||
|
# train_cuts += librispeech.train_other_500_cuts()
|
||||||
|
else:
|
||||||
|
train_cuts = librispeech.train_clean_100_cuts()
|
||||||
|
|
||||||
def remove_short_and_long_utt(c: Cut):
|
def remove_short_and_long_utt(c: Cut):
|
||||||
# Keep only utterances with duration between 1 second and 20 seconds
|
# Keep only utterances with duration between 1 second and 20 seconds
|
||||||
|
@ -771,10 +771,20 @@ def run(rank, world_size, args):
|
|||||||
valid_ali = None
|
valid_ali = None
|
||||||
|
|
||||||
librispeech = LibriSpeechAsrDataModule(args)
|
librispeech = LibriSpeechAsrDataModule(args)
|
||||||
train_cuts = librispeech.train_clean_100_cuts()
|
|
||||||
if params.full_libri:
|
if params.full_libri:
|
||||||
train_cuts += librispeech.train_clean_360_cuts()
|
train_cuts = librispeech.train_all_shuf_cuts()
|
||||||
train_cuts += librispeech.train_other_500_cuts()
|
|
||||||
|
# previously we used the following code to load all training cuts,
|
||||||
|
# strictly speaking, shuffled training cuts should be used instead,
|
||||||
|
# but we leave the code here to demonstrate that there is an option
|
||||||
|
# like this to combine multiple cutsets
|
||||||
|
|
||||||
|
# train_cuts = librispeech.train_clean_100_cuts()
|
||||||
|
# train_cuts += librispeech.train_clean_360_cuts()
|
||||||
|
# train_cuts += librispeech.train_other_500_cuts()
|
||||||
|
else:
|
||||||
|
train_cuts = librispeech.train_clean_100_cuts()
|
||||||
|
|
||||||
def remove_short_and_long_utt(c: Cut):
|
def remove_short_and_long_utt(c: Cut):
|
||||||
# Keep only utterances with duration between 1 second and 20 seconds
|
# Keep only utterances with duration between 1 second and 20 seconds
|
||||||
|
@ -989,10 +989,19 @@ def run(rank, world_size, args):
|
|||||||
|
|
||||||
librispeech = LibriSpeechAsrDataModule(args)
|
librispeech = LibriSpeechAsrDataModule(args)
|
||||||
|
|
||||||
train_cuts = librispeech.train_clean_100_cuts()
|
|
||||||
if params.full_libri:
|
if params.full_libri:
|
||||||
train_cuts += librispeech.train_clean_360_cuts()
|
train_cuts = librispeech.train_all_shuf_cuts()
|
||||||
train_cuts += librispeech.train_other_500_cuts()
|
|
||||||
|
# previously we used the following code to load all training cuts,
|
||||||
|
# strictly speaking, shuffled training cuts should be used instead,
|
||||||
|
# but we leave the code here to demonstrate that there is an option
|
||||||
|
# like this to combine multiple cutsets
|
||||||
|
|
||||||
|
# train_cuts = librispeech.train_clean_100_cuts()
|
||||||
|
# train_cuts += librispeech.train_clean_360_cuts()
|
||||||
|
# train_cuts += librispeech.train_other_500_cuts()
|
||||||
|
else:
|
||||||
|
train_cuts = librispeech.train_clean_100_cuts()
|
||||||
|
|
||||||
def remove_short_and_long_utt(c: Cut):
|
def remove_short_and_long_utt(c: Cut):
|
||||||
# Keep only utterances with duration between 1 second and 20 seconds
|
# Keep only utterances with duration between 1 second and 20 seconds
|
||||||
|
@ -817,10 +817,19 @@ def run(rank, world_size, args):
|
|||||||
|
|
||||||
librispeech = LibriSpeechAsrDataModule(args)
|
librispeech = LibriSpeechAsrDataModule(args)
|
||||||
|
|
||||||
train_cuts = librispeech.train_clean_100_cuts()
|
|
||||||
if params.full_libri:
|
if params.full_libri:
|
||||||
train_cuts += librispeech.train_clean_360_cuts()
|
train_cuts = librispeech.train_all_shuf_cuts()
|
||||||
train_cuts += librispeech.train_other_500_cuts()
|
|
||||||
|
# previously we used the following code to load all training cuts,
|
||||||
|
# strictly speaking, shuffled training cuts should be used instead,
|
||||||
|
# but we leave the code here to demonstrate that there is an option
|
||||||
|
# like this to combine multiple cutsets
|
||||||
|
|
||||||
|
# train_cuts = librispeech.train_clean_100_cuts()
|
||||||
|
# train_cuts += librispeech.train_clean_360_cuts()
|
||||||
|
# train_cuts += librispeech.train_other_500_cuts()
|
||||||
|
else:
|
||||||
|
train_cuts = librispeech.train_clean_100_cuts()
|
||||||
|
|
||||||
def remove_short_and_long_utt(c: Cut):
|
def remove_short_and_long_utt(c: Cut):
|
||||||
# Keep only utterances with duration between 1 second and 20 seconds
|
# Keep only utterances with duration between 1 second and 20 seconds
|
||||||
|
@ -1038,13 +1038,26 @@ def run(rank, world_size, args):
|
|||||||
|
|
||||||
librispeech = LibriSpeechAsrDataModule(args)
|
librispeech = LibriSpeechAsrDataModule(args)
|
||||||
|
|
||||||
|
assert not (
|
||||||
|
params.mini_libri and params.full_libri
|
||||||
|
), f"Cannot set both mini-libri and full-libri flags to True, now mini-libri {params.mini_libri} and full-libri {params.full_libri}"
|
||||||
|
|
||||||
if params.mini_libri:
|
if params.mini_libri:
|
||||||
train_cuts = librispeech.train_clean_5_cuts()
|
train_cuts = librispeech.train_clean_5_cuts()
|
||||||
else:
|
else:
|
||||||
train_cuts = librispeech.train_clean_100_cuts()
|
|
||||||
if params.full_libri:
|
if params.full_libri:
|
||||||
train_cuts += librispeech.train_clean_360_cuts()
|
train_cuts = librispeech.train_all_shuf_cuts()
|
||||||
train_cuts += librispeech.train_other_500_cuts()
|
|
||||||
|
# previously we used the following code to load all training cuts,
|
||||||
|
# strictly speaking, shuffled training cuts should be used instead,
|
||||||
|
# but we leave the code here to demonstrate that there is an option
|
||||||
|
# like this to combine multiple cutsets
|
||||||
|
|
||||||
|
# train_cuts = librispeech.train_clean_100_cuts()
|
||||||
|
# train_cuts += librispeech.train_clean_360_cuts()
|
||||||
|
# train_cuts += librispeech.train_other_500_cuts()
|
||||||
|
else:
|
||||||
|
train_cuts = librispeech.train_clean_100_cuts()
|
||||||
|
|
||||||
def remove_short_and_long_utt(c: Cut):
|
def remove_short_and_long_utt(c: Cut):
|
||||||
# Keep only utterances with duration between 1 second and 20 seconds
|
# Keep only utterances with duration between 1 second and 20 seconds
|
||||||
|
@ -1150,10 +1150,15 @@ def run(rank, world_size, args):
|
|||||||
|
|
||||||
librispeech = LibriSpeech(manifest_dir=args.manifest_dir)
|
librispeech = LibriSpeech(manifest_dir=args.manifest_dir)
|
||||||
|
|
||||||
train_cuts = librispeech.train_clean_100_cuts()
|
|
||||||
if params.full_libri:
|
if params.full_libri:
|
||||||
train_cuts += librispeech.train_clean_360_cuts()
|
train_cuts = librispeech.train_all_shuf_cuts()
|
||||||
train_cuts += librispeech.train_other_500_cuts()
|
|
||||||
|
# previously we used the following code to load all training cuts,
|
||||||
|
# strictly speaking, shuffled training cuts should be used instead,
|
||||||
|
# but we leave the code here to demonstrate that there is an option
|
||||||
|
# like this to combine multiple cutsets
|
||||||
|
else:
|
||||||
|
train_cuts = librispeech.train_clean_100_cuts()
|
||||||
|
|
||||||
train_cuts = filter_short_and_long_utterances(train_cuts, sp)
|
train_cuts = filter_short_and_long_utterances(train_cuts, sp)
|
||||||
|
|
||||||
|
@ -1174,10 +1174,19 @@ def run(rank, world_size, args):
|
|||||||
|
|
||||||
librispeech = LibriSpeechAsrDataModule(args)
|
librispeech = LibriSpeechAsrDataModule(args)
|
||||||
|
|
||||||
train_cuts = librispeech.train_clean_100_cuts()
|
|
||||||
if params.full_libri:
|
if params.full_libri:
|
||||||
train_cuts += librispeech.train_clean_360_cuts()
|
train_cuts = librispeech.train_all_shuf_cuts()
|
||||||
train_cuts += librispeech.train_other_500_cuts()
|
|
||||||
|
# previously we used the following code to load all training cuts,
|
||||||
|
# strictly speaking, shuffled training cuts should be used instead,
|
||||||
|
# but we leave the code here to demonstrate that there is an option
|
||||||
|
# like this to combine multiple cutsets
|
||||||
|
|
||||||
|
# train_cuts = librispeech.train_clean_100_cuts()
|
||||||
|
# train_cuts += librispeech.train_clean_360_cuts()
|
||||||
|
# train_cuts += librispeech.train_other_500_cuts()
|
||||||
|
else:
|
||||||
|
train_cuts = librispeech.train_clean_100_cuts()
|
||||||
|
|
||||||
def remove_short_and_long_utt(c: Cut):
|
def remove_short_and_long_utt(c: Cut):
|
||||||
# Keep only utterances with duration between 1 second and 20 seconds
|
# Keep only utterances with duration between 1 second and 20 seconds
|
||||||
|
@ -990,11 +990,13 @@ def run(rank, world_size, args):
|
|||||||
|
|
||||||
librispeech = LibriSpeechAsrDataModule(args)
|
librispeech = LibriSpeechAsrDataModule(args)
|
||||||
|
|
||||||
# train_cuts = librispeech.train_clean_100_cuts()
|
|
||||||
if params.full_libri:
|
if params.full_libri:
|
||||||
# train_cuts += librispeech.train_clean_360_cuts()
|
|
||||||
# train_cuts += librispeech.train_other_500_cuts()
|
|
||||||
train_cuts = librispeech.train_all_shuf_cuts()
|
train_cuts = librispeech.train_all_shuf_cuts()
|
||||||
|
|
||||||
|
# previously we used the following code to load all training cuts,
|
||||||
|
# strictly speaking, shuffled training cuts should be used instead,
|
||||||
|
# but we leave the code here to demonstrate that there is an option
|
||||||
|
# like this to combine multiple cutsets
|
||||||
else:
|
else:
|
||||||
train_cuts = librispeech.train_clean_100_cuts()
|
train_cuts = librispeech.train_clean_100_cuts()
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user