Updating musan part to make it compatible with recent lhotse nomenclature

This commit is contained in:
s-mousmita 2022-07-26 10:30:34 -04:00
parent 34524acf44
commit 67e3607863
2 changed files with 18 additions and 12 deletions

View File

@ -28,7 +28,7 @@ import os
from pathlib import Path from pathlib import Path
import torch import torch
from lhotse import CutSet, Fbank, FbankConfig, combine from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter, combine
from lhotse.recipes.utils import read_manifests_if_cached from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor from icefall.utils import get_executor
@ -52,12 +52,22 @@ def compute_fbank_musan():
"speech", "speech",
"noise", "noise",
) )
prefix = "musan"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached( manifests = read_manifests_if_cached(
dataset_parts=dataset_parts, output_dir=src_dir, lazy=True, suffix="jsonl" dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
) )
assert manifests is not None assert manifests is not None
musan_cuts_path = output_dir / "cuts_musan.json.gz" assert len(manifests) == len(dataset_parts), (
len(manifests),
len(dataset_parts),
)
musan_cuts_path = output_dir / "musan_cuts.jsonl.gz"
if musan_cuts_path.is_file(): if musan_cuts_path.is_file():
logging.info(f"{musan_cuts_path} already exists - skipping") logging.info(f"{musan_cuts_path} already exists - skipping")
@ -65,7 +75,7 @@ def compute_fbank_musan():
logging.info("Extracting features for Musan") logging.info("Extracting features for Musan")
extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins, sampling_rate=sampling_rate)) extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins, sampling_rate=sampling_rate)
with get_executor() as ex: # Initialize the executor only once. with get_executor() as ex: # Initialize the executor only once.
# create chunks of Musan with duration 5 - 10 seconds # create chunks of Musan with duration 5 - 10 seconds
@ -79,12 +89,13 @@ def compute_fbank_musan():
.filter(lambda c: c.duration > 5) .filter(lambda c: c.duration > 5)
.compute_and_store_features( .compute_and_store_features(
extractor=extractor, extractor=extractor,
storage_path=f"{output_dir}/feats_musan", storage_path=f"{output_dir}/musan_feats",
num_jobs=num_jobs if ex is None else 80, num_jobs=num_jobs if ex is None else 80,
executor=ex, executor=ex,
storage_type=LilcomChunkyWriter,
) )
) )
musan_cuts.to_json(musan_cuts_path) musan_cuts.to_file(musan_cuts_path)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -183,12 +183,7 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
# We assume that you have downloaded the musan corpus # We assume that you have downloaded the musan corpus
# to data/musan # to data/musan
mkdir -p data/manifests/musan mkdir -p data/manifests/musan
lhotse prepare musan $dl_dir/musan data/manifests/musan lhotse prepare musan $dl_dir/musan data/manifests/musan
for name in music noise speech ; do
jq -c '.[]' data/manifests/musan/recordings_${name}.json > data/manifests/recordings_${name}.jsonl
gzip -c data/manifests/recordings_${name}.jsonl > data/manifests/recordings_${name}.jsonl.gz
done
fi fi
if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then