Do some changes and add text normalization

luomingshuang 2022-06-07 12:16:51 +08:00
parent 4215ec434a
commit ddc55423b1
3 changed files with 73 additions and 49 deletions

View File

@@ -29,7 +29,7 @@ import os
from pathlib import Path
import torch
from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor
@@ -53,11 +53,13 @@ def compute_fbank_aishell4(num_mel_bins: int = 80):
"train_L",
"test",
)
prefix = "aishell4"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix="aishell4",
suffix="jsonl.gz",
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
@@ -65,7 +67,8 @@ def compute_fbank_aishell4(num_mel_bins: int = 80):
with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.jsonl").is_file():
cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
if (output_dir / cuts_filename).is_file():
logging.info(f"{partition} already exists - skipping.")
continue
logging.info(f"Processing {partition}")
@@ -81,11 +84,11 @@ def compute_fbank_aishell4(num_mel_bins: int = 80):
)
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}",
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 80,
executor=ex,
storage_type=ChunkedLilcomHdf5Writer,
storage_type=LilcomChunkyWriter,
)
logging.info("About splitting cuts into smaller chunks")
@@ -94,7 +97,7 @@ def compute_fbank_aishell4(num_mel_bins: int = 80):
min_duration=None,
)
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
cut_set.to_json(output_dir / cuts_filename)
def get_args():
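
The change above swaps Lhotse's ChunkedLilcomHdf5Writer for LilcomChunkyWriter and adds an "aishell4" prefix plus a .jsonl.gz suffix to the feature and cut manifests. A minimal sketch of the same pattern in isolation, assuming the public Lhotse API (compute_and_store_features, LilcomChunkyWriter, CutSet.to_file); the helper name and paths are illustrative, not part of the recipe:

from pathlib import Path

from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter


def extract_fbank(cut_set: CutSet, output_dir: Path, prefix: str, partition: str) -> None:
    # 80-dim filterbank features, as in the recipe.
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    cut_set = cut_set.compute_and_store_features(
        extractor=extractor,
        storage_path=f"{output_dir}/{prefix}_feats_{partition}",
        num_jobs=4,
        storage_type=LilcomChunkyWriter,  # replaces ChunkedLilcomHdf5Writer
    )
    # Cut manifests now carry the dataset prefix and the .jsonl.gz suffix.
    cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.jsonl.gz")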

View File

@@ -48,7 +48,7 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
if [ ! -f $dl_dir/aishell4/train_L ]; then
lhotse download aishell4 $dl_dir/aishell4
fi
# If you have pre-downloaded it to /path/to/musan,
# you can create a symlink
#
@@ -117,9 +117,26 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
# Prepare text.
# Note: in Linux, you can install jq with the following command:
# wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
gunzip -c data/manifests/aishell4/supervisions_train_L.jsonl.gz \
| jq ".text" | sed 's/"//g' | sed 's/<sil>//g' \
| ./local/text2token.py -t "char" > $lang_char_dir/text
gunzip -c data/manifests/aishell4/aishell4_supervisions_train_S.jsonl.gz \
| jq ".text" | sed 's/"//g' \
| ./local/text2token.py -t "char" > $lang_char_dir/text_S
gunzip -c data/manifests/aishell4/aishell4_supervisions_train_M.jsonl.gz \
| jq ".text" | sed 's/"//g' \
| ./local/text2token.py -t "char" > $lang_char_dir/text_M
gunzip -c data/manifests/aishell4/aishell4_supervisions_train_L.jsonl.gz \
| jq ".text" | sed 's/"//g' \
| ./local/text2token.py -t "char" > $lang_char_dir/text_L
for r in text_S text_M text_L ; do
cat $lang_char_dir/$r >> $lang_char_dir/text_full
done
# Prepare text normalization
python ./local/text_normalize.py \
--input $lang_char_dir/text_full \
--output $lang_char_dir/text
# Prepare word segments
python ./local/text2segments.py \
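
The shell changes extract the raw transcripts of each training subset (S/M/L) from the newly prefixed supervision manifests with jq/sed, concatenate them into text_full, and then run local/text_normalize.py. A rough Python equivalent of the jq/sed extraction step, assuming Lhotse's load_manifest_lazy and the SupervisionSegment.text field; the output paths and helper name are illustrative, and the later text_normalize/text2token steps are still handled by the local scripts:

from lhotse import load_manifest_lazy


def dump_transcripts(manifest_path: str, out_path: str) -> None:
    # Stream the supervision manifest and write one raw transcript per line.
    with open(out_path, "w", encoding="utf-8") as out:
        for sup in load_manifest_lazy(manifest_path):
            out.write(sup.text + "\n")


for part in ["S", "M", "L"]:
    dump_transcripts(
        f"data/manifests/aishell4/aishell4_supervisions_train_{part}.jsonl.gz",
        f"data/lang_char/text_{part}",  # illustrative; the script uses $lang_char_dir
    )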

View File

@@ -23,14 +23,8 @@ from pathlib import Path
from typing import Any, Dict, List, Optional
import torch
from lhotse import (
CutSet,
Fbank,
FbankConfig,
load_manifest,
set_caching_enabled,
)
from lhotse.dataset import (
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import ( # noqa F401 for PrecomputedFeatures
CutConcatenate,
CutMix,
DynamicBucketingSampler,
@@ -39,15 +33,15 @@ from lhotse.dataset import (
SingleCutSampler,
SpecAugment,
)
from lhotse.dataset.input_strategies import OnTheFlyFeatures
from lhotse.dataset.input_strategies import ( # noqa F401 for AudioSamples
AudioSamples,
OnTheFlyFeatures,
)
from lhotse.utils import fix_random_seed
from torch.utils.data import DataLoader
from icefall.utils import str2bool
set_caching_enabled(False)
torch.set_num_threads(1)
class _SeedWorkers:
def __init__(self, seed: int):
@@ -85,12 +79,14 @@ class Aishell4AsrDataModule:
"effective batch sizes, sampling strategies, applied data "
"augmentations, etc.",
)
group.add_argument(
"--manifest-dir",
type=Path,
default=Path("data/fbank"),
help="Path to directory with train/valid/test cuts.",
)
group.add_argument(
"--max-duration",
type=int,
@@ -98,6 +94,7 @@ class Aishell4AsrDataModule:
help="Maximum pooled recordings duration (seconds) in a "
"single batch. You can reduce it if it causes CUDA OOM.",
)
group.add_argument(
"--bucketing-sampler",
type=str2bool,
@@ -105,6 +102,7 @@ class Aishell4AsrDataModule:
help="When enabled, the batches will come from buckets of "
"similar duration (saves padding frames).",
)
group.add_argument(
"--num-buckets",
type=int,
@@ -112,6 +110,7 @@ class Aishell4AsrDataModule:
help="The number of buckets for the DynamicBucketingSampler"
"(you might want to increase it for larger datasets).",
)
group.add_argument(
"--concatenate-cuts",
type=str2bool,
@@ -119,6 +118,7 @@ class Aishell4AsrDataModule:
help="When enabled, utterances (cuts) will be concatenated "
"to minimize the amount of padding.",
)
group.add_argument(
"--duration-factor",
type=float,
@@ -126,6 +126,7 @@ class Aishell4AsrDataModule:
help="Determines the maximum duration of a concatenated cut "
"relative to the duration of the longest cut in a batch.",
)
group.add_argument(
"--gap",
type=float,
@@ -134,6 +135,7 @@ class Aishell4AsrDataModule:
"concatenated cuts. This padding is filled with noise when "
"noise augmentation is used.",
)
group.add_argument(
"--on-the-fly-feats",
type=str2bool,
@@ -142,6 +144,7 @@ class Aishell4AsrDataModule:
"extraction. Will drop existing precomputed feature manifests "
"if available.",
)
group.add_argument(
"--shuffle",
type=str2bool,
@@ -149,6 +152,14 @@ class Aishell4AsrDataModule:
help="When enabled (=default), the examples will be "
"shuffled for each epoch.",
)
group.add_argument(
"--drop-last",
type=str2bool,
default=True,
help="Whether to drop last batch. Used by sampler.",
)
group.add_argument(
"--return-cuts",
type=str2bool,
@@ -192,10 +203,10 @@ class Aishell4AsrDataModule:
)
group.add_argument(
"--lazy-load",
type=str2bool,
default=True,
help="lazily open CutSets to avoid OOM (for L|XL subset)",
"--input-strategy",
type=str,
default="PrecomputedFeatures",
help="AudioSamples or PrecomputedFeatures",
)
group.add_argument(
@@ -218,8 +229,8 @@ class Aishell4AsrDataModule:
The state dict for the training sampler.
"""
logging.info("About to get Musan cuts")
cuts_musan = load_manifest(
self.args.manifest_dir / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "cuts_musan.jsonl.gz"
)
transforms = []
@@ -277,6 +288,7 @@ class Aishell4AsrDataModule:
logging.info("About to create train dataset")
train = K2SpeechRecognitionDataset(
input_strategy=eval(self.args.input_strategy)(),
cut_transforms=transforms,
input_transforms=input_transforms,
return_cuts=self.args.return_cuts,
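
The train dataset now builds its input strategy from the new --input-strategy string via eval(). A small eval-free sketch of the same lookup, assuming only the two Lhotse strategies the help text mentions; the helper name is illustrative:

from lhotse.dataset.input_strategies import AudioSamples, PrecomputedFeatures

_INPUT_STRATEGIES = {
    "AudioSamples": AudioSamples,
    "PrecomputedFeatures": PrecomputedFeatures,
}


def make_input_strategy(name: str):
    # Equivalent to eval(name)() for the supported values, but fails with a
    # clear KeyError on anything else instead of evaluating arbitrary code.
    return _INPUT_STRATEGIES[name]()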
@@ -310,7 +322,7 @@ class Aishell4AsrDataModule:
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
buffer_size=30000,
drop_last=True,
drop_last=self.args.drop_last,
)
else:
logging.info("Using SingleCutSampler.")
@@ -367,8 +379,6 @@ class Aishell4AsrDataModule:
valid_sampler = DynamicBucketingSampler(
cuts_valid,
max_duration=self.args.max_duration,
rank=0,
world_size=1,
shuffle=False,
)
logging.info("About to create dev dataloader")
@@ -393,14 +403,12 @@ class Aishell4AsrDataModule:
test = K2SpeechRecognitionDataset(
input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
if self.args.on_the_fly_feats
else PrecomputedFeatures(),
else eval(self.args.input_strategy)(),
return_cuts=self.args.return_cuts,
)
sampler = DynamicBucketingSampler(
cuts,
max_duration=self.args.max_duration,
rank=0,
world_size=1,
shuffle=False,
)
from lhotse.dataset.iterable_dataset import IterableDatasetWrapper
@@ -419,26 +427,22 @@ class Aishell4AsrDataModule:
@lru_cache()
def train_cuts(self) -> CutSet:
logging.info("About to get train cuts")
if self.args.lazy_load:
logging.info("use lazy cuts")
cuts_train = CutSet.from_jsonl_lazy(
self.args.manifest_dir
/ f"cuts_train_{self.args.training_subset}.json.gz"
)
else:
cuts_train = CutSet.from_file(
self.args.manifest_dir
/ f"cuts_train_{self.args.training_subset}.json.gz"
)
return cuts_train
return load_manifest_lazy(
self.args.manifest_dir
/ "aishell4_cuts_train_{self.args.training_subset}.jsonl.gz"
)
@lru_cache()
def valid_cuts(self) -> CutSet:
logging.info("About to get dev cuts")
# Aishell4 doesn't have dev data; use the test set as the dev set instead.
return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "aishell4_cuts_test.jsonl.gz"
)
@lru_cache()
def test_cuts(self) -> List[CutSet]:
logging.info("About to get test cuts")
return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "aishell4_cuts_test.jsonl.gz"
)
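
All cut manifests are now opened lazily through load_manifest_lazy with the aishell4_ prefix and .jsonl.gz suffix, replacing the old --lazy-load branch. A short sketch of the resulting loading pattern, assuming the default data/fbank value of --manifest-dir; "train_L" is just an example subset:

from lhotse import CutSet, load_manifest_lazy

manifest_dir = "data/fbank"  # default of --manifest-dir

# Lazily opened: cuts are read on demand, so even the large L subset
# does not need to fit in memory.
train_cuts: CutSet = load_manifest_lazy(
    f"{manifest_dir}/aishell4_cuts_train_L.jsonl.gz"
)
test_cuts: CutSet = load_manifest_lazy(
    f"{manifest_dir}/aishell4_cuts_test.jsonl.gz"
)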