misc. update

This commit is contained in:
jinzr 2024-03-15 10:43:33 +08:00
parent 030365f168
commit 06bca2ffed
7 changed files with 76 additions and 7 deletions

View File

@ -1,5 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# Copyright 2023 Xiaomi Corp. (Yifan Yang) # Copyright 2023-2024 Xiaomi Corp. (Yifan Yang,
# Zengrui Jin)
# #
# See ../../../../LICENSE for clarification regarding multiple authors # See ../../../../LICENSE for clarification regarding multiple authors
# #
@ -74,21 +75,21 @@ def get_args():
"--num-splits", "--num-splits",
type=int, type=int,
required=True, required=True,
help="The number of splits of the train subset", help="The number of splits of the subset",
) )
parser.add_argument( parser.add_argument(
"--start", "--start",
type=int, type=int,
default=0, default=0,
help="Process pieces starting from this number (inclusive).", help="Process pieces starting from this number (included).",
) )
parser.add_argument( parser.add_argument(
"--stop", "--stop",
type=int, type=int,
default=-1, default=-1,
help="Stop processing pieces until this number (exclusive).", help="Stop processing pieces until this number (excluded).",
) )
parser.add_argument( parser.add_argument(

View File

@ -257,12 +257,14 @@ if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
log "Also combine features for validated data" log "Also combine features for validated data"
pieces=$(find data/${lang}/fbank/cv-${lang}_validated_split_${num_splits} -name "cv-${lang}_cuts_validated.*.jsonl.gz") pieces=$(find data/${lang}/fbank/cv-${lang}_validated_split_${num_splits} -name "cv-${lang}_cuts_validated.*.jsonl.gz")
lhotse combine $pieces data/${lang}/fbank/cv-${lang}_cuts_validated.jsonl.gz lhotse combine $pieces data/${lang}/fbank/cv-${lang}_cuts_validated.jsonl.gz
touch data/${lang}/fbank/.cv-${lang}_validated.done
fi fi
if [ $use_invalidated = true ] && [ -f data/${lang}/fbank/.cv-${lang}_invalidated.done ]; then if [ $use_invalidated = true ] && [ -f data/${lang}/fbank/.cv-${lang}_invalidated.done ]; then
log "Also combine features for invalidated data" log "Also combine features for invalidated data"
pieces=$(find data/${lang}/fbank/cv-${lang}_inalidated_split_${num_splits} -name "cv-${lang}_cuts_invalidated.*.jsonl.gz") pieces=$(find data/${lang}/fbank/cv-${lang}_invalidated_split_${num_splits} -name "cv-${lang}_cuts_invalidated.*.jsonl.gz")
lhotse combine $pieces data/${lang}/fbank/cv-${lang}_cuts_invalidated.jsonl.gz lhotse combine $pieces data/${lang}/fbank/cv-${lang}_cuts_invalidated.jsonl.gz
touch data/${lang}/fbank/.cv-${lang}_invalidated.done
fi fi
fi fi
@ -289,8 +291,18 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
# 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64 # 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
# 2. chmod +x ./jq # 2. chmod +x ./jq
# 3. cp jq /usr/bin # 3. cp jq /usr/bin
gunzip -c data/${lang}/manifests/cv-${lang}_supervisions_train.jsonl.gz \ if [ $use_validated = true ]; then
| jq '.text' | sed 's/"//g' > $lang_dir/text gunzip -c data/${lang}/manifests/cv-${lang}_supervisions_validated.jsonl.gz \
| jq '.text' | sed 's/"//g' >> $lang_dir/text
else
gunzip -c data/${lang}/manifests/cv-${lang}_supervisions_train.jsonl.gz \
| jq '.text' | sed 's/"//g' > $lang_dir/text
fi
if [ $use_invalidated = true ]; then
gunzip -c data/${lang}/manifests/cv-${lang}_supervisions_invalidated.jsonl.gz \
| jq '.text' | sed 's/"//g' >> $lang_dir/text
fi
if [ $lang == "yue" ] || [ $lang == "zh-HK" ]; then if [ $lang == "yue" ] || [ $lang == "zh-HK" ]; then
# Get words.txt and words_no_ids.txt # Get words.txt and words_no_ids.txt

View File

@ -417,6 +417,14 @@ class CommonVoiceAsrDataModule:
/ f"cv-{self.args.language}_cuts_validated.jsonl.gz" / f"cv-{self.args.language}_cuts_validated.jsonl.gz"
) )
@lru_cache()
def invalidated_cuts(self) -> CutSet:
    """Load the cuts manifest of the invalidated subset.

    The accessor is named ``invalidated_cuts`` because that is the name the
    training scripts call when ``--use-invalidated-set`` is enabled.  Naming
    it ``validated_cuts`` would redefine the accessor for the validated
    subset and make it return the invalidated manifest instead.

    Returns:
      The result of ``load_manifest_lazy`` for the file
      ``cv-{language}_cuts_invalidated.jsonl.gz`` located under
      ``self.args.cv_manifest_dir``.
    """
    logging.info("About to get invalidated cuts")
    return load_manifest_lazy(
        self.args.cv_manifest_dir
        / f"cv-{self.args.language}_cuts_invalidated.jsonl.gz"
    )
@lru_cache() @lru_cache()
def dev_cuts(self) -> CutSet: def dev_cuts(self) -> CutSet:
logging.info("About to get dev cuts") logging.info("About to get dev cuts")

View File

@ -258,6 +258,15 @@ def get_parser():
""", """,
) )
parser.add_argument(
"--use-invalidated-set",
type=str2bool,
default=False,
help="""Use the invalidated set for training.
In case you want to take the risk and utilize more data for training.
""",
)
parser.add_argument( parser.add_argument(
"--base-lr", "--base-lr",
type=float, type=float,
@ -1047,6 +1056,9 @@ def run(rank, world_size, args):
else: else:
train_cuts = commonvoice.validated_cuts() train_cuts = commonvoice.validated_cuts()
if args.use_invalidated_set:
train_cuts += commonvoice.invalidated_cuts()
def remove_short_and_long_utt(c: Cut): def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 20 seconds # Keep only utterances with duration between 1 second and 20 seconds
# #

View File

@ -274,6 +274,15 @@ def get_parser():
""", """,
) )
parser.add_argument(
"--use-invalidated-set",
type=str2bool,
default=False,
help="""Use the invalidated set for training.
In case you want to take the risk and utilize more data for training.
""",
)
parser.add_argument( parser.add_argument(
"--base-lr", "--base-lr",
type=float, type=float,
@ -1064,6 +1073,9 @@ def run(rank, world_size, args):
else: else:
train_cuts = commonvoice.validated_cuts() train_cuts = commonvoice.validated_cuts()
if args.use_invalidated_set:
train_cuts += commonvoice.invalidated_cuts()
def remove_short_and_long_utt(c: Cut): def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 20 seconds # Keep only utterances with duration between 1 second and 20 seconds
# #

View File

@ -337,6 +337,15 @@ def get_parser():
""", """,
) )
parser.add_argument(
"--use-invalidated-set",
type=str2bool,
default=False,
help="""Use the invalidated set for training.
In case you want to take the risk and utilize more data for training.
""",
)
parser.add_argument( parser.add_argument(
"--base-lr", "--base-lr",
type=float, type=float,
@ -1191,6 +1200,9 @@ def run(rank, world_size, args):
else: else:
train_cuts = commonvoice.validated_cuts() train_cuts = commonvoice.validated_cuts()
if args.use_invalidated_set:
train_cuts += commonvoice.invalidated_cuts()
def remove_short_and_long_utt(c: Cut): def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 20 seconds # Keep only utterances with duration between 1 second and 20 seconds
# #

View File

@ -184,6 +184,15 @@ def get_parser():
""", """,
) )
parser.add_argument(
"--use-invalidated-set",
type=str2bool,
default=False,
help="""Use the invalidated set for training.
In case you want to take the risk and utilize more data for training.
""",
)
parser.add_argument( parser.add_argument(
"--base-lr", "--base-lr",
type=float, type=float,
@ -904,6 +913,9 @@ def run(rank, world_size, args):
else: else:
train_cuts = commonvoice.validated_cuts() train_cuts = commonvoice.validated_cuts()
if args.use_invalidated_set:
train_cuts += commonvoice.invalidated_cuts()
def remove_short_and_long_utt(c: Cut): def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 20 seconds # Keep only utterances with duration between 1 second and 20 seconds
# #