misc. update

This commit is contained in:
jinzr 2024-03-15 10:43:33 +08:00
parent 030365f168
commit 06bca2ffed
7 changed files with 76 additions and 7 deletions

View File

@ -1,5 +1,6 @@
#!/usr/bin/env python3
# Copyright 2023 Xiaomi Corp. (Yifan Yang)
# Copyright 2023-2024 Xiaomi Corp. (Yifan Yang,
# Zengrui Jin)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
@ -74,21 +75,21 @@ def get_args():
"--num-splits",
type=int,
required=True,
help="The number of splits of the train subset",
help="The number of splits of the subset",
)
parser.add_argument(
"--start",
type=int,
default=0,
help="Process pieces starting from this number (inclusive).",
help="Process pieces starting from this number (included).",
)
parser.add_argument(
"--stop",
type=int,
default=-1,
help="Stop processing pieces until this number (exclusive).",
help="Stop processing pieces until this number (excluded).",
)
parser.add_argument(

View File

@ -257,12 +257,14 @@ if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
log "Also combine features for validated data"
pieces=$(find data/${lang}/fbank/cv-${lang}_validated_split_${num_splits} -name "cv-${lang}_cuts_validated.*.jsonl.gz")
lhotse combine $pieces data/${lang}/fbank/cv-${lang}_cuts_validated.jsonl.gz
touch data/${lang}/fbank/.cv-${lang}_validated.done
fi
if [ $use_invalidated = true ] && [ -f data/${lang}/fbank/.cv-${lang}_invalidated.done ]; then
log "Also combine features for invalidated data"
pieces=$(find data/${lang}/fbank/cv-${lang}_inalidated_split_${num_splits} -name "cv-${lang}_cuts_invalidated.*.jsonl.gz")
pieces=$(find data/${lang}/fbank/cv-${lang}_invalidated_split_${num_splits} -name "cv-${lang}_cuts_invalidated.*.jsonl.gz")
lhotse combine $pieces data/${lang}/fbank/cv-${lang}_cuts_invalidated.jsonl.gz
touch data/${lang}/fbank/.cv-${lang}_invalidated.done
fi
fi
@ -289,8 +291,18 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
# 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
# 2. chmod +x ./jq
# 3. cp jq /usr/bin
# Build $lang_dir/text from the supervision manifests: `jq '.text'` extracts
# each transcript and `sed` strips the double quotes from the output.
#
# The first write must truncate (`>`) in BOTH branches; appending here would
# duplicate every transcript if $lang_dir/text already exists from an
# earlier (possibly interrupted) run.
if [ $use_validated = true ]; then
  gunzip -c data/${lang}/manifests/cv-${lang}_supervisions_validated.jsonl.gz \
    | jq '.text' | sed 's/"//g' > $lang_dir/text
else
  gunzip -c data/${lang}/manifests/cv-${lang}_supervisions_train.jsonl.gz \
    | jq '.text' | sed 's/"//g' > $lang_dir/text
fi
# The invalidated transcripts are an optional add-on, so they are appended
# (`>>`) to whatever the branch above just wrote.
if [ $use_invalidated = true ]; then
  gunzip -c data/${lang}/manifests/cv-${lang}_supervisions_invalidated.jsonl.gz \
    | jq '.text' | sed 's/"//g' >> $lang_dir/text
fi
if [ $lang == "yue" ] || [ $lang == "zh-HK" ]; then
# Get words.txt and words_no_ids.txt

View File

@ -417,6 +417,14 @@ class CommonVoiceAsrDataModule:
/ f"cv-{self.args.language}_cuts_validated.jsonl.gz"
)
@lru_cache()
def invalidated_cuts(self) -> CutSet:
    """Lazily load the cuts manifest of the *invalidated* subset.

    The manifest is read from
    ``self.args.cv_manifest_dir / "cv-{language}_cuts_invalidated.jsonl.gz"``.

    NOTE: this method must be named ``invalidated_cuts``. Defining it as
    ``validated_cuts`` would shadow the loader for the validated subset
    and leave the ``commonvoice.invalidated_cuts()`` calls in the
    training scripts without a target.

    Returns:
      The ``CutSet`` produced by ``load_manifest_lazy`` for that file.
    """
    logging.info("About to get invalidated cuts")
    return load_manifest_lazy(
        self.args.cv_manifest_dir
        / f"cv-{self.args.language}_cuts_invalidated.jsonl.gz"
    )
@lru_cache()
def dev_cuts(self) -> CutSet:
logging.info("About to get dev cuts")

View File

@ -258,6 +258,15 @@ def get_parser():
""",
)
parser.add_argument(
"--use-invalidated-set",
type=str2bool,
default=False,
help="""Use the invalidated set for training.
In case you want to take the risk and utilize more data for training.
""",
)
parser.add_argument(
"--base-lr",
type=float,
@ -1047,6 +1056,9 @@ def run(rank, world_size, args):
else:
train_cuts = commonvoice.validated_cuts()
if args.use_invalidated_set:
train_cuts += commonvoice.invalidated_cuts()
def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 20 seconds
#

View File

@ -274,6 +274,15 @@ def get_parser():
""",
)
parser.add_argument(
"--use-invalidated-set",
type=str2bool,
default=False,
help="""Use the invalidated set for training.
In case you want to take the risk and utilize more data for training.
""",
)
parser.add_argument(
"--base-lr",
type=float,
@ -1064,6 +1073,9 @@ def run(rank, world_size, args):
else:
train_cuts = commonvoice.validated_cuts()
if args.use_invalidated_set:
train_cuts += commonvoice.invalidated_cuts()
def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 20 seconds
#

View File

@ -337,6 +337,15 @@ def get_parser():
""",
)
parser.add_argument(
"--use-invalidated-set",
type=str2bool,
default=False,
help="""Use the invalidated set for training.
In case you want to take the risk and utilize more data for training.
""",
)
parser.add_argument(
"--base-lr",
type=float,
@ -1191,6 +1200,9 @@ def run(rank, world_size, args):
else:
train_cuts = commonvoice.validated_cuts()
if args.use_invalidated_set:
train_cuts += commonvoice.invalidated_cuts()
def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 20 seconds
#

View File

@ -184,6 +184,15 @@ def get_parser():
""",
)
parser.add_argument(
"--use-invalidated-set",
type=str2bool,
default=False,
help="""Use the invalidated set for training.
In case you want to take the risk and utilize more data for training.
""",
)
parser.add_argument(
"--base-lr",
type=float,
@ -904,6 +913,9 @@ def run(rank, world_size, args):
else:
train_cuts = commonvoice.validated_cuts()
if args.use_invalidated_set:
train_cuts += commonvoice.invalidated_cuts()
def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 20 seconds
#