misc. update

2025-08-26 18:24:18 +00:00 · 2024-03-15 10:43:33 +08:00 · 2024-03-15 10:43:33 +08:00 · 06bca2ffed
commit 06bca2ffed
parent 030365f168
7 changed files with 76 additions and 7 deletions
--- a/egs/commonvoice/ASR/local/compute_fbank_commonvoice_splits.py
+++ b/egs/commonvoice/ASR/local/compute_fbank_commonvoice_splits.py
@ -1,5 +1,6 @@
 #!/usr/bin/env python3
-# Copyright    2023  Xiaomi Corp.             (Yifan Yang)
+# Copyright    2023-2024  Xiaomi Corp.             (Yifan Yang,
+#                                                   Zengrui Jin)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@ -74,21 +75,21 @@ def get_args():
        "--num-splits",
        type=int,
        required=True,
-        help="The number of splits of the train subset",
+        help="The number of splits of the subset",
    )

    parser.add_argument(
        "--start",
        type=int,
        default=0,
-        help="Process pieces starting from this number (inclusive).",
+        help="Process pieces starting from this number (included).",
    )

    parser.add_argument(
        "--stop",
        type=int,
        default=-1,
-        help="Stop processing pieces until this number (exclusive).",
+        help="Stop processing pieces until this number (excluded).",
    )

    parser.add_argument(
--- a/egs/commonvoice/ASR/prepare.sh
+++ b/egs/commonvoice/ASR/prepare.sh
@ -257,12 +257,14 @@ if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
    log "Also combine features for validated data"
    pieces=$(find data/${lang}/fbank/cv-${lang}_validated_split_${num_splits} -name "cv-${lang}_cuts_validated.*.jsonl.gz")
    lhotse combine $pieces data/${lang}/fbank/cv-${lang}_cuts_validated.jsonl.gz
+    touch data/${lang}/fbank/.cv-${lang}_validated.done
  fi

  if [ $use_invalidated = true ] && [ -f data/${lang}/fbank/.cv-${lang}_invalidated.done ]; then
    log "Also combine features for invalidated data"
-    pieces=$(find data/${lang}/fbank/cv-${lang}_inalidated_split_${num_splits} -name "cv-${lang}_cuts_invalidated.*.jsonl.gz")
+    pieces=$(find data/${lang}/fbank/cv-${lang}_invalidated_split_${num_splits} -name "cv-${lang}_cuts_invalidated.*.jsonl.gz")
    lhotse combine $pieces data/${lang}/fbank/cv-${lang}_cuts_invalidated.jsonl.gz
+    touch data/${lang}/fbank/.cv-${lang}_invalidated.done
  fi
 fi

@ -289,8 +291,18 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
        # 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
        # 2. chmod +x ./jq
        # 3. cp jq /usr/bin
+        if [ $use_validated = true ]; then
+          gunzip -c data/${lang}/manifests/cv-${lang}_supervisions_validated.jsonl.gz \
+            | jq '.text' | sed 's/"//g' > $lang_dir/text
+        else
          gunzip -c data/${lang}/manifests/cv-${lang}_supervisions_train.jsonl.gz \
            | jq '.text' | sed 's/"//g' > $lang_dir/text
+        fi
+        
+        if [ $use_invalidated = true ]; then
+          gunzip -c data/${lang}/manifests/cv-${lang}_supervisions_invalidated.jsonl.gz \
+            | jq '.text' | sed 's/"//g' >> $lang_dir/text
+        fi

        if [ $lang == "yue" ] || [ $lang == "zh-HK" ]; then
          # Get words.txt and words_no_ids.txt
--- a/egs/commonvoice/ASR/pruned_transducer_stateless7/asr_datamodule.py
+++ b/egs/commonvoice/ASR/pruned_transducer_stateless7/asr_datamodule.py
@ -417,6 +417,14 @@ class CommonVoiceAsrDataModule:
            / f"cv-{self.args.language}_cuts_validated.jsonl.gz"
        )

+    @lru_cache()
+    def invalidated_cuts(self) -> CutSet:
+        logging.info("About to get invalidated cuts")
+        return load_manifest_lazy(
+            self.args.cv_manifest_dir
+            / f"cv-{self.args.language}_cuts_invalidated.jsonl.gz"
+        )
+
    @lru_cache()
    def dev_cuts(self) -> CutSet:
        logging.info("About to get dev cuts")
--- a/egs/commonvoice/ASR/pruned_transducer_stateless7/train.py
+++ b/egs/commonvoice/ASR/pruned_transducer_stateless7/train.py
@ -258,6 +258,15 @@ def get_parser():
        """,
    )

+    parser.add_argument(
+        "--use-invalidated-set",
+        type=str2bool,
+        default=False,
+        help="""Use the invalidated set for training.
+        In case you want to take the risk and utilize more data for training.
+        """,
+    )
+
    parser.add_argument(
        "--base-lr",
        type=float,
@ -1047,6 +1056,9 @@ def run(rank, world_size, args):
    else:
        train_cuts = commonvoice.validated_cuts()

+    if args.use_invalidated_set:
+        train_cuts += commonvoice.invalidated_cuts()
+
    def remove_short_and_long_utt(c: Cut):
        # Keep only utterances with duration between 1 second and 20 seconds
        #
--- a/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/train.py
+++ b/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/train.py
@ -274,6 +274,15 @@ def get_parser():
        """,
    )

+    parser.add_argument(
+        "--use-invalidated-set",
+        type=str2bool,
+        default=False,
+        help="""Use the invalidated set for training.
+        In case you want to take the risk and utilize more data for training.
+        """,
+    )
+
    parser.add_argument(
        "--base-lr",
        type=float,
@ -1064,6 +1073,9 @@ def run(rank, world_size, args):
    else:
        train_cuts = commonvoice.validated_cuts()

+    if args.use_invalidated_set:
+        train_cuts += commonvoice.invalidated_cuts()
+
    def remove_short_and_long_utt(c: Cut):
        # Keep only utterances with duration between 1 second and 20 seconds
        #
--- a/egs/commonvoice/ASR/zipformer/train.py
+++ b/egs/commonvoice/ASR/zipformer/train.py
@ -337,6 +337,15 @@ def get_parser():
        """,
    )

+    parser.add_argument(
+        "--use-invalidated-set",
+        type=str2bool,
+        default=False,
+        help="""Use the invalidated set for training.
+        In case you want to take the risk and utilize more data for training.
+        """,
+    )
+
    parser.add_argument(
        "--base-lr",
        type=float,
@ -1191,6 +1200,9 @@ def run(rank, world_size, args):
    else:
        train_cuts = commonvoice.validated_cuts()

+    if args.use_invalidated_set:
+        train_cuts += commonvoice.invalidated_cuts()
+
    def remove_short_and_long_utt(c: Cut):
        # Keep only utterances with duration between 1 second and 20 seconds
        #
--- a/egs/commonvoice/ASR/zipformer/train_char.py
+++ b/egs/commonvoice/ASR/zipformer/train_char.py
@ -184,6 +184,15 @@ def get_parser():
        """,
    )

+    parser.add_argument(
+        "--use-invalidated-set",
+        type=str2bool,
+        default=False,
+        help="""Use the invalidated set for training.
+        In case you want to take the risk and utilize more data for training.
+        """,
+    )
+
    parser.add_argument(
        "--base-lr",
        type=float,
@ -904,6 +913,9 @@ def run(rank, world_size, args):
    else:
        train_cuts = commonvoice.validated_cuts()

+    if args.use_invalidated_set:
+        train_cuts += commonvoice.invalidated_cuts()
+
    def remove_short_and_long_utt(c: Cut):
        # Keep only utterances with duration between 1 second and 20 seconds
        #