Updating musan part to make it compatible with recent lhotse nomenclature

This commit is contained in:
s-mousmita 2022-07-26 10:30:34 -04:00
parent 34524acf44
commit 67e3607863
2 changed files with 18 additions and 12 deletions

View File

@ -28,7 +28,7 @@ import os
from pathlib import Path
import torch
from lhotse import CutSet, Fbank, FbankConfig, combine
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter, combine
from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor
@ -52,12 +52,22 @@ def compute_fbank_musan():
"speech",
"noise",
)
prefix = "musan"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
dataset_parts=dataset_parts, output_dir=src_dir, lazy=True, suffix="jsonl"
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
musan_cuts_path = output_dir / "cuts_musan.json.gz"
assert len(manifests) == len(dataset_parts), (
len(manifests),
len(dataset_parts),
)
musan_cuts_path = output_dir / "musan_cuts.jsonl.gz"
if musan_cuts_path.is_file():
logging.info(f"{musan_cuts_path} already exists - skipping")
@ -65,7 +75,7 @@ def compute_fbank_musan():
logging.info("Extracting features for Musan")
extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins, sampling_rate=sampling_rate))
extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins, sampling_rate=sampling_rate)
with get_executor() as ex: # Initialize the executor only once.
# create chunks of Musan with duration 5 - 10 seconds
@ -79,12 +89,13 @@ def compute_fbank_musan():
.filter(lambda c: c.duration > 5)
.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/feats_musan",
storage_path=f"{output_dir}/musan_feats",
num_jobs=num_jobs if ex is None else 80,
executor=ex,
storage_type=LilcomChunkyWriter,
)
)
musan_cuts.to_json(musan_cuts_path)
musan_cuts.to_file(musan_cuts_path)
if __name__ == "__main__":

View File

@ -183,12 +183,7 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
# We assume that you have downloaded the musan corpus
# to data/musan
mkdir -p data/manifests/musan
lhotse prepare musan $dl_dir/musan data/manifests/musan
for name in music noise speech ; do
jq -c '.[]' data/manifests/musan/recordings_${name}.json > data/manifests/recordings_${name}.jsonl
gzip -c data/manifests/recordings_${name}.jsonl > data/manifests/recordings_${name}.jsonl.gz
done
lhotse prepare musan $dl_dir/musan data/manifests/musan
fi
if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then