mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-26 18:24:18 +00:00
misc. update
This commit is contained in:
parent
030365f168
commit
06bca2ffed
@ -1,5 +1,6 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# Copyright 2023 Xiaomi Corp. (Yifan Yang)
|
# Copyright 2023-2024 Xiaomi Corp. (Yifan Yang,
|
||||||
|
# Zengrui Jin)
|
||||||
#
|
#
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
#
|
#
|
||||||
@ -74,21 +75,21 @@ def get_args():
|
|||||||
"--num-splits",
|
"--num-splits",
|
||||||
type=int,
|
type=int,
|
||||||
required=True,
|
required=True,
|
||||||
help="The number of splits of the train subset",
|
help="The number of splits of the subset",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--start",
|
"--start",
|
||||||
type=int,
|
type=int,
|
||||||
default=0,
|
default=0,
|
||||||
help="Process pieces starting from this number (inclusive).",
|
help="Process pieces starting from this number (included).",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--stop",
|
"--stop",
|
||||||
type=int,
|
type=int,
|
||||||
default=-1,
|
default=-1,
|
||||||
help="Stop processing pieces until this number (exclusive).",
|
help="Stop processing pieces until this number (excluded).",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
|
@ -257,12 +257,14 @@ if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
|
|||||||
log "Also combine features for validated data"
|
log "Also combine features for validated data"
|
||||||
pieces=$(find data/${lang}/fbank/cv-${lang}_validated_split_${num_splits} -name "cv-${lang}_cuts_validated.*.jsonl.gz")
|
pieces=$(find data/${lang}/fbank/cv-${lang}_validated_split_${num_splits} -name "cv-${lang}_cuts_validated.*.jsonl.gz")
|
||||||
lhotse combine $pieces data/${lang}/fbank/cv-${lang}_cuts_validated.jsonl.gz
|
lhotse combine $pieces data/${lang}/fbank/cv-${lang}_cuts_validated.jsonl.gz
|
||||||
|
touch data/${lang}/fbank/.cv-${lang}_validated.done
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $use_invalidated = true ] && [ -f data/${lang}/fbank/.cv-${lang}_invalidated.done ]; then
|
if [ $use_invalidated = true ] && [ -f data/${lang}/fbank/.cv-${lang}_invalidated.done ]; then
|
||||||
log "Also combine features for invalidated data"
|
log "Also combine features for invalidated data"
|
||||||
pieces=$(find data/${lang}/fbank/cv-${lang}_inalidated_split_${num_splits} -name "cv-${lang}_cuts_invalidated.*.jsonl.gz")
|
pieces=$(find data/${lang}/fbank/cv-${lang}_invalidated_split_${num_splits} -name "cv-${lang}_cuts_invalidated.*.jsonl.gz")
|
||||||
lhotse combine $pieces data/${lang}/fbank/cv-${lang}_cuts_invalidated.jsonl.gz
|
lhotse combine $pieces data/${lang}/fbank/cv-${lang}_cuts_invalidated.jsonl.gz
|
||||||
|
touch data/${lang}/fbank/.cv-${lang}_invalidated.done
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@ -289,8 +291,18 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
|
|||||||
# 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
|
# 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
|
||||||
# 2. chmod +x ./jq
|
# 2. chmod +x ./jq
|
||||||
# 3. cp jq /usr/bin
|
# 3. cp jq /usr/bin
|
||||||
gunzip -c data/${lang}/manifests/cv-${lang}_supervisions_train.jsonl.gz \
|
if [ $use_validated = true ]; then
|
||||||
| jq '.text' | sed 's/"//g' > $lang_dir/text
|
gunzip -c data/${lang}/manifests/cv-${lang}_supervisions_validated.jsonl.gz \
|
||||||
|
| jq '.text' | sed 's/"//g' >> $lang_dir/text
|
||||||
|
else
|
||||||
|
gunzip -c data/${lang}/manifests/cv-${lang}_supervisions_train.jsonl.gz \
|
||||||
|
| jq '.text' | sed 's/"//g' > $lang_dir/text
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $use_invalidated = true ]; then
|
||||||
|
gunzip -c data/${lang}/manifests/cv-${lang}_supervisions_invalidated.jsonl.gz \
|
||||||
|
| jq '.text' | sed 's/"//g' >> $lang_dir/text
|
||||||
|
fi
|
||||||
|
|
||||||
if [ $lang == "yue" ] || [ $lang == "zh-HK" ]; then
|
if [ $lang == "yue" ] || [ $lang == "zh-HK" ]; then
|
||||||
# Get words.txt and words_no_ids.txt
|
# Get words.txt and words_no_ids.txt
|
||||||
|
@ -417,6 +417,14 @@ class CommonVoiceAsrDataModule:
|
|||||||
/ f"cv-{self.args.language}_cuts_validated.jsonl.gz"
|
/ f"cv-{self.args.language}_cuts_validated.jsonl.gz"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@lru_cache()
def invalidated_cuts(self) -> CutSet:
    """Return the CommonVoice *invalidated* cuts for the configured language.

    The manifest is read from
    ``<cv_manifest_dir>/cv-<language>_cuts_invalidated.jsonl.gz`` via
    ``load_manifest_lazy``.

    The method is named ``invalidated_cuts`` because that is what the
    training scripts call when ``--use-invalidated-set`` is enabled
    (``train_cuts += commonvoice.invalidated_cuts()``). Declaring it as
    ``validated_cuts`` would leave those calls unresolved.

    Returns:
      The CutSet produced by ``load_manifest_lazy`` for the invalidated
      manifest.
    """
    logging.info("About to get invalidated cuts")
    return load_manifest_lazy(
        self.args.cv_manifest_dir
        / f"cv-{self.args.language}_cuts_invalidated.jsonl.gz"
    )
|
||||||
|
|
||||||
@lru_cache()
|
@lru_cache()
|
||||||
def dev_cuts(self) -> CutSet:
|
def dev_cuts(self) -> CutSet:
|
||||||
logging.info("About to get dev cuts")
|
logging.info("About to get dev cuts")
|
||||||
|
@ -258,6 +258,15 @@ def get_parser():
|
|||||||
""",
|
""",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--use-invalidated-set",
|
||||||
|
type=str2bool,
|
||||||
|
default=False,
|
||||||
|
help="""Use the invalidated set for training.
|
||||||
|
In case you want to take the risk and utilize more data for training.
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--base-lr",
|
"--base-lr",
|
||||||
type=float,
|
type=float,
|
||||||
@ -1047,6 +1056,9 @@ def run(rank, world_size, args):
|
|||||||
else:
|
else:
|
||||||
train_cuts = commonvoice.validated_cuts()
|
train_cuts = commonvoice.validated_cuts()
|
||||||
|
|
||||||
|
if args.use_invalidated_set:
|
||||||
|
train_cuts += commonvoice.invalidated_cuts()
|
||||||
|
|
||||||
def remove_short_and_long_utt(c: Cut):
|
def remove_short_and_long_utt(c: Cut):
|
||||||
# Keep only utterances with duration between 1 second and 20 seconds
|
# Keep only utterances with duration between 1 second and 20 seconds
|
||||||
#
|
#
|
||||||
|
@ -274,6 +274,15 @@ def get_parser():
|
|||||||
""",
|
""",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--use-invalidated-set",
|
||||||
|
type=str2bool,
|
||||||
|
default=False,
|
||||||
|
help="""Use the invalidated set for training.
|
||||||
|
In case you want to take the risk and utilize more data for training.
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--base-lr",
|
"--base-lr",
|
||||||
type=float,
|
type=float,
|
||||||
@ -1064,6 +1073,9 @@ def run(rank, world_size, args):
|
|||||||
else:
|
else:
|
||||||
train_cuts = commonvoice.validated_cuts()
|
train_cuts = commonvoice.validated_cuts()
|
||||||
|
|
||||||
|
if args.use_invalidated_set:
|
||||||
|
train_cuts += commonvoice.invalidated_cuts()
|
||||||
|
|
||||||
def remove_short_and_long_utt(c: Cut):
|
def remove_short_and_long_utt(c: Cut):
|
||||||
# Keep only utterances with duration between 1 second and 20 seconds
|
# Keep only utterances with duration between 1 second and 20 seconds
|
||||||
#
|
#
|
||||||
|
@ -337,6 +337,15 @@ def get_parser():
|
|||||||
""",
|
""",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--use-invalidated-set",
|
||||||
|
type=str2bool,
|
||||||
|
default=False,
|
||||||
|
help="""Use the invalidated set for training.
|
||||||
|
In case you want to take the risk and utilize more data for training.
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--base-lr",
|
"--base-lr",
|
||||||
type=float,
|
type=float,
|
||||||
@ -1191,6 +1200,9 @@ def run(rank, world_size, args):
|
|||||||
else:
|
else:
|
||||||
train_cuts = commonvoice.validated_cuts()
|
train_cuts = commonvoice.validated_cuts()
|
||||||
|
|
||||||
|
if args.use_invalidated_set:
|
||||||
|
train_cuts += commonvoice.invalidated_cuts()
|
||||||
|
|
||||||
def remove_short_and_long_utt(c: Cut):
|
def remove_short_and_long_utt(c: Cut):
|
||||||
# Keep only utterances with duration between 1 second and 20 seconds
|
# Keep only utterances with duration between 1 second and 20 seconds
|
||||||
#
|
#
|
||||||
|
@ -184,6 +184,15 @@ def get_parser():
|
|||||||
""",
|
""",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--use-invalidated-set",
|
||||||
|
type=str2bool,
|
||||||
|
default=False,
|
||||||
|
help="""Use the invalidated set for training.
|
||||||
|
In case you want to take the risk and utilize more data for training.
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--base-lr",
|
"--base-lr",
|
||||||
type=float,
|
type=float,
|
||||||
@ -904,6 +913,9 @@ def run(rank, world_size, args):
|
|||||||
else:
|
else:
|
||||||
train_cuts = commonvoice.validated_cuts()
|
train_cuts = commonvoice.validated_cuts()
|
||||||
|
|
||||||
|
if args.use_invalidated_set:
|
||||||
|
train_cuts += commonvoice.invalidated_cuts()
|
||||||
|
|
||||||
def remove_short_and_long_utt(c: Cut):
|
def remove_short_and_long_utt(c: Cut):
|
||||||
# Keep only utterances with duration between 1 second and 20 seconds
|
# Keep only utterances with duration between 1 second and 20 seconds
|
||||||
#
|
#
|
||||||
|
Loading…
x
Reference in New Issue
Block a user