From cb4f4a941baea8c3c8a6605ae283d11c88a8455c Mon Sep 17 00:00:00 2001
From: Bailey Hirota
Date: Wed, 16 Apr 2025 13:25:40 +0900
Subject: [PATCH] complete musan reazonspeech integration

---
 .../ASR/local/compute_fbank_musan.py    | 176 ++++++++++++++++++
 .../ASR/musan-k2-v2-reazonspeech-medium |   1 +
 egs/reazonspeech/ASR/prepare.sh         |  54 +++++-
 egs/reazonspeech/ASR/zipformer/train.py |   1 +
 4 files changed, 223 insertions(+), 9 deletions(-)
 create mode 100755 egs/reazonspeech/ASR/local/compute_fbank_musan.py
 create mode 160000 egs/reazonspeech/ASR/musan-k2-v2-reazonspeech-medium

diff --git a/egs/reazonspeech/ASR/local/compute_fbank_musan.py b/egs/reazonspeech/ASR/local/compute_fbank_musan.py
new file mode 100755
index 000000000..72c61f326
--- /dev/null
+++ b/egs/reazonspeech/ASR/local/compute_fbank_musan.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python3
+# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This file computes fbank features of the musan dataset.
+It looks for manifests in the directory data/manifests.
+
+The generated fbank features are saved in the directory given by
+--output-dir (default: data/manifests).
+"""
+import argparse
+import logging
+import os
+from pathlib import Path
+
+import torch
+from lhotse import (
+    CutSet,
+    Fbank,
+    FbankConfig,
+    LilcomChunkyWriter,
+    MonoCut,
+    WhisperFbank,
+    WhisperFbankConfig,
+    combine,
+)
+from lhotse.recipes.utils import read_manifests_if_cached
+
+from icefall.utils import get_executor, str2bool
+
+# Torch's multithreaded behavior needs to be disabled or
+# it wastes a lot of CPU and slows things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking the main (e.g. when spawning subprocesses).
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
+
+def is_cut_long(c: MonoCut) -> bool:
+    """Return True if the cut is longer than 5 seconds."""
+    return c.duration > 5
+
+
+def compute_fbank_musan(
+    num_mel_bins: int = 80,
+    whisper_fbank: bool = False,
+    output_dir: str = "data/manifests",
+):
+    """Compute fbank features for the musan corpus and save the cuts.
+
+    Reads the music/speech/noise recording manifests from data/manifests,
+    cuts the recordings into 10-second windows, drops windows that are not
+    longer than 5 seconds, extracts features and writes the resulting cut
+    set to <output_dir>/musan_cuts.jsonl.gz. Returns early, without
+    extracting anything, if that file already exists.
+
+    Args:
+      num_mel_bins:
+        Number of mel bins (filters) of the feature extractor.
+      whisper_fbank:
+        If True, use WhisperFbank instead of Fbank.
+      output_dir:
+        Directory where the features and the cut manifest are written.
+    """
+    src_dir = Path("data/manifests")
+    output_dir = Path(output_dir)
+    # os.cpu_count() returns None when the CPU count cannot be determined.
+    num_jobs = min(15, os.cpu_count() or 1)
+
+    dataset_parts = (
+        "music",
+        "speech",
+        "noise",
+    )
+    prefix = "musan"
+    suffix = "jsonl.gz"
+    manifests = read_manifests_if_cached(
+        dataset_parts=dataset_parts,
+        output_dir=src_dir,
+        prefix=prefix,
+        suffix=suffix,
+    )
+    assert manifests is not None
+
+    assert len(manifests) == len(dataset_parts), (
+        len(manifests),
+        len(dataset_parts),
+        list(manifests.keys()),
+        dataset_parts,
+    )
+
+    musan_cuts_path = output_dir / "musan_cuts.jsonl.gz"
+
+    if musan_cuts_path.is_file():
+        logging.info(f"{musan_cuts_path} already exists - skipping")
+        return
+
+    logging.info("Extracting features for Musan")
+
+    if whisper_fbank:
+        # Fall back to the CPU when CUDA is not available instead of crashing.
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        extractor = WhisperFbank(
+            WhisperFbankConfig(num_filters=num_mel_bins, device=device)
+        )
+    else:
+        extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
+
+    with get_executor() as ex:  # Initialize the executor only once.
+        # create chunks of Musan with duration 5 - 10 seconds
+        musan_cuts = (
+            CutSet.from_manifests(
+                recordings=combine(part["recordings"] for part in manifests.values())
+            )
+            .cut_into_windows(10.0)
+            .filter(is_cut_long)
+            .compute_and_store_features(
+                extractor=extractor,
+                storage_path=f"{output_dir}/musan_feats",
+                num_jobs=num_jobs if ex is None else 80,
+                executor=ex,
+                storage_type=LilcomChunkyWriter,
+            )
+        )
+        musan_cuts.to_file(musan_cuts_path)
+
+
+def get_args():
+    """Parse the command line arguments of this script."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--num-mel-bins",
+        type=int,
+        default=80,
+        help="""The number of mel bins for Fbank""",
+    )
+    parser.add_argument(
+        "--whisper-fbank",
+        type=str2bool,
+        default=False,
+        help="Use WhisperFbank instead of Fbank. Default: False.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="data/manifests",
+        help="Output directory. Default: data/manifests.",
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+    args = get_args()
+    compute_fbank_musan(
+        num_mel_bins=args.num_mel_bins,
+        whisper_fbank=args.whisper_fbank,
+        output_dir=args.output_dir,
+    )
diff --git a/egs/reazonspeech/ASR/musan-k2-v2-reazonspeech-medium b/egs/reazonspeech/ASR/musan-k2-v2-reazonspeech-medium
new file mode 160000
index 000000000..e6ca0bf17
--- /dev/null
+++ b/egs/reazonspeech/ASR/musan-k2-v2-reazonspeech-medium
@@ -0,0 +1 @@
+Subproject commit e6ca0bf179779b512a2ce5dd3fdc3e3e17570459
diff --git a/egs/reazonspeech/ASR/prepare.sh b/egs/reazonspeech/ASR/prepare.sh
index d5e0a9491..76a2e90e9 100755
--- a/egs/reazonspeech/ASR/prepare.sh
+++ b/egs/reazonspeech/ASR/prepare.sh
@@ -17,8 +17,16 @@ stop_stage=100
 #      You can find FLAC files in this directory.
 #      You can download them from https://huggingface.co/datasets/reazon-research/reazonspeech
 #
-#  - $dl_dir/dataset.json
+#  - $dl_dir/ReazonSpeech/dataset.json
 #      The metadata of the ReazonSpeech dataset.
+#
+#  - $dl_dir/musan
+#      This directory contains the following directories downloaded from
+#       http://www.openslr.org/17/
+#
+#     - music
+#     - noise
+#     - speech
 
 dl_dir=$PWD/download
 
@@ -48,7 +56,15 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   #
   if [ ! -d $dl_dir/ReazonSpeech/downloads ]; then
-    # Download small-v1 by default.
-    lhotse download reazonspeech --subset small-v1 $dl_dir
+    # Download the medium subset by default.
+    lhotse download reazonspeech --subset medium $dl_dir
+  fi
+  # If you have pre-downloaded it to /path/to/musan,
+  # you can create a symlink
+  #
+  #   ln -sfv /path/to/musan $dl_dir/
+  #
+  if [ ! -d $dl_dir/musan ]; then
+    lhotse download musan $dl_dir
   fi
 fi
 
@@ -64,7 +80,18 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
 fi
 
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Compute ReazonSpeech fbank"
+  log "Stage 2: Prepare musan manifest"
+  # We assume that you have downloaded the musan corpus
+  # to $dl_dir/musan
+  mkdir -p data/manifests
+  if [ ! -e data/manifests/.musan_prep.done ]; then
+    lhotse prepare musan $dl_dir/musan data/manifests
+    touch data/manifests/.musan_prep.done
+  fi
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  log "Stage 3: Compute ReazonSpeech fbank"
   if [ ! -e data/manifests/.reazonspeech-validated.done ]; then
     python local/compute_fbank_reazonspeech.py --manifest-dir data/manifests
     python local/validate_manifest.py --manifest data/manifests/reazonspeech_cuts_train.jsonl.gz
@@ -74,13 +101,22 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
   fi
 fi
 
-if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-  log "Stage 3: Prepare ReazonSpeech lang_char"
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+  log "Stage 4: Compute fbank for musan"
+  mkdir -p data/manifests
+  if [ ! -e data/manifests/.musan_fbank.done ]; then
+    ./local/compute_fbank_musan.py
+    touch data/manifests/.musan_fbank.done
+  fi
+fi
+
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+  log "Stage 5: Prepare ReazonSpeech lang_char"
   python local/prepare_lang_char.py data/manifests/reazonspeech_cuts_train.jsonl.gz
 fi
 
-if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-  log "Stage 4: Show manifest statistics"
+if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
+  log "Stage 6: Show manifest statistics"
   python local/display_manifest_statistics.py --manifest-dir data/manifests > data/manifests/manifest_statistics.txt
   cat data/manifests/manifest_statistics.txt
-fi
\ No newline at end of file
+fi
diff --git a/egs/reazonspeech/ASR/zipformer/train.py b/egs/reazonspeech/ASR/zipformer/train.py
index 54b4a9950..b4d641935 100755
--- a/egs/reazonspeech/ASR/zipformer/train.py
+++ b/egs/reazonspeech/ASR/zipformer/train.py
@@ -68,6 +68,7 @@ from joiner import Joiner
 from lhotse.cut import Cut
 from lhotse.dataset.sampling.base import CutSampler
 from lhotse.utils import fix_random_seed
+from lhotse import load_manifest
 from model import AsrModel
 from optim import Eden, ScaledAdam
 from scaling import ScheduledFloat