Begin to use multiple datasets.

Author: Fangjun Kuang
Date:   2022-02-15 20:24:48 +08:00
Commit: fb1e2ffdc1 (parent: 70a3c56a18)

7 changed files with 462 additions and 4 deletions

View File

@@ -28,7 +28,7 @@ import os
 from pathlib import Path

 import torch
-from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
+from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
 from lhotse.recipes.utils import read_manifests_if_cached

 from icefall.utils import get_executor
@@ -85,7 +85,7 @@ def compute_fbank_librispeech():
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=LilcomHdf5Writer,
+                storage_type=ChunkedLilcomHdf5Writer,
             )
             cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")

View File

@@ -28,7 +28,7 @@ import os
 from pathlib import Path

 import torch
-from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
+from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig, combine
 from lhotse.recipes.utils import read_manifests_if_cached

 from icefall.utils import get_executor
@@ -82,7 +82,7 @@ def compute_fbank_musan():
                 storage_path=f"{output_dir}/feats_musan",
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=LilcomHdf5Writer,
+                storage_type=ChunkedLilcomHdf5Writer,
             )
         )
         musan_cuts.to_json(musan_cuts_path)
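
Both feature-extraction scripts above swap LilcomHdf5Writer for ChunkedLilcomHdf5Writer, which stores each feature matrix as fixed-size lilcom-compressed chunks inside the HDF5 file, so a slice of a long recording can be read back without decompressing the whole array. A minimal sketch of the writer used on its own (the manifest path is an assumption for illustration; the 80-bin Fbank config mirrors the scripts in this commit):

```python
from lhotse import ChunkedLilcomHdf5Writer, Fbank, FbankConfig, load_manifest

# Assumed path, not part of this commit.
cuts = load_manifest("data/fbank/cuts_dev-clean.json.gz")
cuts = cuts.compute_and_store_features(
    extractor=Fbank(FbankConfig(num_mel_bins=80)),
    storage_path="data/fbank/feats_dev-clean",
    # Chunked storage: features remain readable in slices.
    storage_type=ChunkedLilcomHdf5Writer,
    num_jobs=4,
)
```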

View File

@@ -0,0 +1,123 @@
#!/usr/bin/env python3
# Copyright 2021 Johns Hopkins University (Piotr Żelasko)
# Copyright 2021 Xiaomi Corp. (Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import re
from pathlib import Path

from lhotse import CutSet, SupervisionSegment
from lhotse.recipes.utils import read_manifests_if_cached


# Similar text filtering and normalization procedure as in:
# https://github.com/SpeechColab/GigaSpeech/blob/main/toolkits/kaldi/gigaspeech_data_prep.sh
def normalize_text(
    utt: str,
    punct_pattern=re.compile(r"<(COMMA|PERIOD|QUESTIONMARK|EXCLAMATIONPOINT)>"),
    whitespace_pattern=re.compile(r"\s\s+"),
) -> str:
    return whitespace_pattern.sub(" ", punct_pattern.sub("", utt))


def has_no_oov(
    sup: SupervisionSegment,
    oov_pattern=re.compile(r"<(SIL|MUSIC|NOISE|OTHER)>"),
) -> bool:
    return oov_pattern.search(sup.text) is None


def preprocess_giga_speech():
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    output_dir.mkdir(exist_ok=True)

    dataset_parts = (
        "DEV",
        "TEST",
        "XS",
        "S",
        "M",
        "L",
        "XL",
    )

    logging.info("Loading manifest (may take 4 minutes)")
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
        output_dir=src_dir,
        prefix="gigaspeech",
        suffix="jsonl.gz",
    )
    assert manifests is not None

    for partition, m in manifests.items():
        logging.info(f"Processing {partition}")
        raw_cuts_path = output_dir / f"cuts_{partition}_raw.jsonl.gz"
        if raw_cuts_path.is_file():
            logging.info(f"{partition} already exists - skipping")
            continue

        # Note this step makes the recipe different from LibriSpeech:
        # We must filter out some utterances and remove punctuation
        # to be consistent with Kaldi.
        logging.info("Filtering OOV utterances from supervisions")
        m["supervisions"] = m["supervisions"].filter(has_no_oov)

        logging.info(f"Normalizing text in {partition}")
        for sup in m["supervisions"]:
            sup.text = normalize_text(sup.text)
            sup.custom = {"origin": "giga"}

        # Create long-recording cut manifests.
        logging.info(f"Creating cut manifests for {partition}")
        cut_set = CutSet.from_manifests(
            recordings=m["recordings"],
            supervisions=m["supervisions"],
        )

        # Run data augmentation that needs to be done in the
        # time domain.
        if partition not in ["DEV", "TEST"]:
            logging.info(
                f"Speed perturb for {partition} with factors 0.9 and 1.1 "
                "(Perturbing may take 8 minutes and saving may take 20 minutes)"
            )
            cut_set = (
                cut_set
                + cut_set.perturb_speed(0.9)
                + cut_set.perturb_speed(1.1)
            )

        logging.info("About to split cuts into smaller chunks.")
        cut_set = cut_set.trim_to_supervisions(
            keep_overlapping=False, min_duration=None
        )

        logging.info(f"Saving to {raw_cuts_path}")
        cut_set.to_file(raw_cuts_path)


def main():
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO)
    preprocess_giga_speech()


if __name__ == "__main__":
    main()
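
For a concrete feel of the two helpers in this file, here is a small sketch with made-up strings (GigaSpeech transcripts mark punctuation and non-speech events with tags like `<COMMA>` and `<MUSIC>`):

```python
from lhotse import SupervisionSegment

# Punctuation tags are removed and leftover double spaces collapsed.
assert normalize_text("HELLO <COMMA> WORLD") == "HELLO WORLD"

# Supervisions containing non-speech tags are filtered out entirely.
sup = SupervisionSegment(
    id="utt1", recording_id="rec1", start=0.0, duration=1.0,
    text="<MUSIC> HELLO",
)
assert not has_no_oov(sup)
```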

View File

@@ -0,0 +1,204 @@
# Copyright 2021 Piotr Żelasko
# 2022 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
from typing import Optional

from torch.utils.data import DataLoader

from lhotse import CutSet, Fbank, FbankConfig
from lhotse.dataset import (
    CutMix,
    DynamicBucketingSampler,
    K2SpeechRecognitionDataset,
    SpecAugment,
)
from lhotse.dataset.input_strategies import OnTheFlyFeatures

from icefall.utils import str2bool


class AsrDataset:
    def __init__(self, args: argparse.Namespace):
        self.args = args

    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser):
        group = parser.add_argument_group(
            title="ASR data related options",
            description="These options are used for the preparation of "
            "PyTorch DataLoaders from Lhotse CutSets -- they control the "
            "effective batch sizes, sampling strategies, applied data "
            "augmentations, etc.",
        )
        group.add_argument(
            "--max-duration",
            type=int,
            default=200,
            help="Maximum pooled recordings duration (seconds) in a "
            "single batch. You can reduce it if it causes CUDA OOM.",
        )
        group.add_argument(
            "--bucketing-sampler",
            type=str2bool,
            default=True,
            help="When enabled, the batches will come from buckets of "
            "similar duration (saves padding frames).",
        )
        group.add_argument(
            "--num-buckets",
            type=int,
            default=30,
            help="The number of buckets for the DynamicBucketingSampler "
            "(you might want to increase it for larger datasets).",
        )
        group.add_argument(
            "--on-the-fly-feats",
            type=str2bool,
            default=False,
            help="When enabled, use on-the-fly cut mixing and feature "
            "extraction. Will drop existing precomputed feature manifests "
            "if available.",
        )
        group.add_argument(
            "--shuffle",
            type=str2bool,
            default=True,
            help="When enabled (=default), the examples will be "
            "shuffled for each epoch.",
        )
        group.add_argument(
            "--return-cuts",
            type=str2bool,
            default=True,
            help="When enabled, each batch will have the "
            "field: batch['supervisions']['cut'] with the cuts that "
            "were used to construct it.",
        )
        group.add_argument(
            "--num-workers",
            type=int,
            default=2,
            help="The number of training dataloader workers that "
            "collect the batches.",
        )
        group.add_argument(
            "--enable-spec-aug",
            type=str2bool,
            default=True,
            help="When enabled, use SpecAugment for the training dataset.",
        )
        group.add_argument(
            "--spec-aug-time-warp-factor",
            type=int,
            default=80,
            help="Used only when --enable-spec-aug is True. "
            "It specifies the factor for time warping in SpecAugment. "
            "Larger values mean more warping. "
            "A value less than 1 means to disable time warp.",
        )
        group.add_argument(
            "--enable-musan",
            type=str2bool,
            default=True,
            help="When enabled, select noise from MUSAN and mix it "
            "with the training dataset.",
        )
        group.add_argument(
            "--manifest-dir",
            type=Path,
            default=Path("data/fbank"),
            help="Path to directory with train/valid/test cuts.",
        )

    def train_dataloaders(
        self, cuts_train: CutSet, cuts_musan: Optional[CutSet] = None
    ) -> DataLoader:
        transforms = []
        if cuts_musan is not None:
            logging.info("Enable MUSAN")
            transforms.append(
                CutMix(
                    cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True
                )
            )
        else:
            logging.info("Disable MUSAN")

        input_transforms = []
        if self.args.enable_spec_aug:
            logging.info("Enable SpecAugment")
            logging.info(
                f"Time warp factor: {self.args.spec_aug_time_warp_factor}"
            )
            input_transforms.append(
                SpecAugment(
                    time_warp_factor=self.args.spec_aug_time_warp_factor,
                    num_frame_masks=2,
                    features_mask_size=27,
                    num_feature_masks=2,
                    frames_mask_size=100,
                )
            )
        else:
            logging.info("Disable SpecAugment")

        logging.info("About to create train dataset")
        train = K2SpeechRecognitionDataset(
            cut_transforms=transforms,
            input_transforms=input_transforms,
            return_cuts=self.args.return_cuts,
        )

        # NOTE: the PerturbSpeed transform should be added only if we
        # remove it from data prep stage.
        # Add on-the-fly speed perturbation; since originally it would
        # have increased epoch size by 3, we will apply prob 2/3 and use
        # 3x more epochs.
        # Speed perturbation probably should come first before
        # concatenation, but in principle the transforms order doesn't have
        # to be strict (e.g. could be randomized)
        # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms  # noqa
        # Drop feats to be on the safe side.
        if self.args.on_the_fly_feats:
            train = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                input_strategy=OnTheFlyFeatures(
                    Fbank(FbankConfig(num_mel_bins=80))
                ),
                input_transforms=input_transforms,
                return_cuts=self.args.return_cuts,
            )

        logging.info("Using DynamicBucketingSampler.")
        train_sampler = DynamicBucketingSampler(
            cuts_train,
            max_duration=self.args.max_duration,
            shuffle=self.args.shuffle,
            num_buckets=self.args.num_buckets,
            drop_last=True,
        )

        logging.info("About to create train dataloader")
        train_dl = DataLoader(
            train,
            sampler=train_sampler,
            batch_size=None,
            num_workers=self.args.num_workers,
            persistent_workers=False,
        )
        return train_dl
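
A hedged sketch of how a training script might drive this class; the manifest paths and the LibriSpeech helper (defined later in this commit) are illustrative assumptions, not something this file prescribes:

```python
import argparse

from lhotse import load_manifest

parser = argparse.ArgumentParser()
AsrDataset.add_arguments(parser)
args = parser.parse_args()

asr = AsrDataset(args)

# Assumed inputs: train cuts via the LibriSpeech helper below, and
# MUSAN cuts at a hypothetical path produced by the MUSAN fbank script.
cuts_train = LibriSpeech("data/fbank").train_clean_100_cuts()
cuts_musan = load_manifest("data/fbank/cuts_musan.json.gz")

train_dl = asr.train_dataloaders(cuts_train, cuts_musan=cuts_musan)
for batch in train_dl:
    features = batch["inputs"]            # (N, T, num_mel_bins) tensor
    supervisions = batch["supervisions"]  # texts and frame boundaries
    break
```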

View File

@@ -0,0 +1,57 @@
# Copyright 2021 Piotr Żelasko
# 2022 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path

from lhotse import CutSet, load_manifest


class GigaSpeech:
    def __init__(self, manifest_dir: str):
        """
        Args:
          manifest_dir:
            It is expected to contain the following files::

                - cuts_L.jsonl.gz
                - cuts_XL.jsonl.gz
                - cuts_TEST.jsonl.gz
                - cuts_DEV.jsonl.gz
        """
        self.manifest_dir = Path(manifest_dir)

    def train_L_cuts(self) -> CutSet:
        f = self.manifest_dir / "cuts_L.jsonl.gz"
        logging.info(f"About to get train-L cuts from {f}")
        return CutSet.from_jsonl_lazy(f)

    def train_XL_cuts(self) -> CutSet:
        f = self.manifest_dir / "cuts_XL.jsonl.gz"
        logging.info(f"About to get train-XL cuts from {f}")
        return CutSet.from_jsonl_lazy(f)

    def test_cuts(self) -> CutSet:
        f = self.manifest_dir / "cuts_TEST.jsonl.gz"
        logging.info(f"About to get TEST cuts from {f}")
        return load_manifest(f)

    def dev_cuts(self) -> CutSet:
        f = self.manifest_dir / "cuts_DEV.jsonl.gz"
        logging.info(f"About to get DEV cuts from {f}")
        return load_manifest(f)
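
Note the split above: DEV and TEST are small enough to load eagerly with load_manifest, while the L and XL subsets use CutSet.from_jsonl_lazy, which opens the manifest as a stream instead of materializing millions of cuts in memory. A sketch of the practical difference (paths assumed as in the docstring):

```python
giga = GigaSpeech("data/fbank")

# Lazy: the file is read incrementally while you iterate; the call
# itself is cheap even for the 10,000-hour XL subset.
xl_cuts = giga.train_XL_cuts()
first_cut = next(iter(xl_cuts))

# Eager: the whole DEV manifest is parsed into memory up front,
# so it supports len() and random access.
dev_cuts = giga.dev_cuts()
print(len(dev_cuts))
```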

View File

@@ -0,0 +1,74 @@
# Copyright 2021 Piotr Żelasko
# 2022 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path

from lhotse import CutSet, load_manifest


class LibriSpeech:
    def __init__(self, manifest_dir: str):
        """
        Args:
          manifest_dir:
            It is expected to contain the following files::

                - cuts_dev-clean.json.gz
                - cuts_dev-other.json.gz
                - cuts_test-clean.json.gz
                - cuts_test-other.json.gz
                - cuts_train-clean-100.json.gz
                - cuts_train-clean-360.json.gz
                - cuts_train-other-500.json.gz
        """
        self.manifest_dir = Path(manifest_dir)

    def train_clean_100_cuts(self) -> CutSet:
        f = self.manifest_dir / "cuts_train-clean-100.json.gz"
        logging.info(f"About to get train-clean-100 cuts from {f}")
        return load_manifest(f)

    def train_clean_360_cuts(self) -> CutSet:
        f = self.manifest_dir / "cuts_train-clean-360.json.gz"
        logging.info(f"About to get train-clean-360 cuts from {f}")
        return load_manifest(f)

    def train_other_500_cuts(self) -> CutSet:
        f = self.manifest_dir / "cuts_train-other-500.json.gz"
        logging.info(f"About to get train-other-500 cuts from {f}")
        return load_manifest(f)

    def test_clean_cuts(self) -> CutSet:
        f = self.manifest_dir / "cuts_test-clean.json.gz"
        logging.info(f"About to get test-clean cuts from {f}")
        return load_manifest(f)

    def test_other_cuts(self) -> CutSet:
        f = self.manifest_dir / "cuts_test-other.json.gz"
        logging.info(f"About to get test-other cuts from {f}")
        return load_manifest(f)

    def dev_clean_cuts(self) -> CutSet:
        f = self.manifest_dir / "cuts_dev-clean.json.gz"
        logging.info(f"About to get dev-clean cuts from {f}")
        return load_manifest(f)

    def dev_other_cuts(self) -> CutSet:
        f = self.manifest_dir / "cuts_dev-other.json.gz"
        logging.info(f"About to get dev-other cuts from {f}")
        return load_manifest(f)
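
Putting the pieces together: the commit title says "Begin to use multiple datasets", and these two helper classes make it possible to feed one dataloader from both corpora. A sketch of one way to do it, assuming a lhotse version that provides CutSet.mux (which samples lazily from several cut sets with given weights; lhotse.combine, already imported in the MUSAN script, is the eager alternative). The weights, paths, and args are illustrative only:

```python
from lhotse import CutSet

libri = LibriSpeech("data/fbank")
giga = GigaSpeech("data/fbank")

# Interleave LibriSpeech and GigaSpeech cuts; the weights control how
# often each source is sampled (values here are made up).
train_cuts = CutSet.mux(
    libri.train_clean_100_cuts(),
    giga.train_XL_cuts(),
    weights=[0.5, 0.5],
)

# `args` as in the AsrDataset sketch earlier on this page.
train_dl = AsrDataset(args).train_dataloaders(train_cuts)
```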