add fbank

Bailey Hirota 2025-05-02 03:31:55 +09:00
parent 21d1bf73bb
commit 82bd37cacd
3 changed files with 117 additions and 151 deletions

local/compute_fbank_mls_english.py

@@ -1,20 +1,8 @@
 #!/usr/bin/env python3
-# Copyright 2023 The University of Electro-Communications (Author: Teo Wen Shen)  # noqa
+# Copyright 2023 The University of Electro-Communications
+#                (Author: Teo Wen Shen)
 #
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Apache-2.0

 import argparse
 import logging
@@ -23,119 +11,106 @@ from pathlib import Path
 from typing import List, Tuple

 import torch
-# fmt: off
-from lhotse import (
-    CutSet,
-    Fbank,
-    FbankConfig,
-    LilcomChunkyWriter,
-    RecordingSet,
-    SupervisionSet,
-)
+# See the following for why LilcomChunkyWriter is preferred:
+# https://github.com/k2-fsa/icefall/pull/404
+# https://github.com/lhotse-speech/lhotse/pull/527
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.utils import is_module_available
-# fmt: on

-# Torch's multithreaded behavior needs to be disabled or
-# it wastes a lot of CPU and slow things down.
-# Do this outside of main() in case it needs to take effect
-# even when we are not invoking the main (e.g. when spawning subprocesses).
+# Disable PyTorch intra-/inter-op threading overhead.
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)

-RNG_SEED = 42
-concat_params = {"gap": 1.0, "maxlen": 10.0}


 def make_cutset_blueprints(
     mls_eng_hf_dataset_path: str = "parler-tts/mls_eng",
 ) -> List[Tuple[str, CutSet]]:
-    cut_sets = []
-
     if not is_module_available("datasets"):
         raise ImportError(
-            "To process the MLS English HF corpus, please install optional dependency: pip install datasets"
+            "To process the MLS English HF corpus, please install datasets: pip install datasets"
         )

     from datasets import load_dataset

-    print(f"{mls_eng_hf_dataset_path=}")
     dataset = load_dataset(str(mls_eng_hf_dataset_path))

-    # Create test dataset
-    logging.info("Creating test cuts.")
-    cut_sets.append(
-        (
-            "test",
-            CutSet.from_huggingface_dataset(dataset["test"], text_key="transcript"),
-        )
-    )
-    # Create dev dataset
-    logging.info("Creating dev cuts.")
-    cut_sets.append(
-        ("dev", CutSet.from_huggingface_dataset(dataset["dev"], text_key="transcript"))
-    )
-    # Create train dataset
-    logging.info("Creating train cuts.")
-    cut_sets.append(
-        (
-            "train",
-            CutSet.from_huggingface_dataset(dataset["train"], text_key="transcript"),
-        )
-    )
-    return cut_sets
+    return [
+        ("test", CutSet.from_huggingface_dataset(dataset["test"], text_key="transcript")),
+        ("dev", CutSet.from_huggingface_dataset(dataset["dev"], text_key="transcript")),
+        ("train", CutSet.from_huggingface_dataset(dataset["train"], text_key="transcript")),
+    ]


 def get_args():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument("-m", "--manifest-dir", type=Path)
-    parser.add_argument("-a", "--audio-dir", type=Path)
-    parser.add_argument("-d", "--dl-dir", type=Path)
-    return parser.parse_args()
+    p = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    p.add_argument(
+        "-m", "--manifest-dir",
+        type=Path,
+        default=Path("data/manifests"),
+        help="Where to write JSONL cuts",
+    )
+    p.add_argument(
+        "-a", "--audio-dir",
+        type=Path,
+        default=Path("data/audio"),
+        help="Where to copy raw audio",
+    )
+    p.add_argument(
+        "-d", "--dl-dir",
+        type=Path,
+        required=True,
+        help="Where the HF dataset was cloned",
+    )
+    p.add_argument(
+        "--fbank-dir",
+        type=Path,
+        default=Path("data/fbank"),
+        help="Where to write FBANK features",
+    )
+    return p.parse_args()


 def main():
     args = get_args()
+
+    # Set up logging first, so the skip message below is actually emitted.
+    logging.basicConfig(
+        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
+        level=logging.INFO,
+    )
+
+    # Make sure our directories exist.
+    for d in (args.manifest_dir, args.audio_dir, args.fbank_dir):
+        d.mkdir(parents=True, exist_ok=True)
+
+    # If we've already computed FBANK, skip.
+    done_marker = args.fbank_dir / ".mls_eng-fbank.done"
+    if done_marker.exists():
+        logging.info("Found done-marker at %s. Skipping FBANK computation.", done_marker)
+        return
+
+    # Prepare Lhotse cut blueprints from the HF dataset.
+    cut_sets = make_cutset_blueprints(str(args.dl_dir))
+
+    # Feature extractor.
     extractor = Fbank(FbankConfig(num_mel_bins=80))
     num_jobs = min(16, os.cpu_count())
-    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
-    logging.basicConfig(format=formatter, level=logging.INFO)
-    if (args.manifest_dir / ".mls-eng-fbank.done").exists():
-        logging.info(
-            "Previous fbank computed for MLS English found. "
-            f"Delete {args.manifest_dir / '.mls-eng-fbank.done'} to allow recomputing fbank."
-        )
-        return
-    else:
-        mls_eng_hf_dataset_path = args.dl_dir  # "/root/datasets/parler-tts--mls_eng"
-        cut_sets = make_cutset_blueprints(mls_eng_hf_dataset_path)

     for part, cut_set in cut_sets:
-        logging.info(f"Processing {part}")
-        cut_set = cut_set.save_audios(
-            num_jobs=num_jobs,
-            storage_path=(args.audio_dir / part).as_posix(),
-        )  # makes new cutset that loads audio from paths to actual audio files
+        logging.info("===== Processing split: %s =====", part)
+
+        # 1) Compute & store FBANK features into fbank-dir.
         cut_set = cut_set.compute_and_store_features(
             extractor=extractor,
             num_jobs=num_jobs,
-            storage_path=(args.manifest_dir / f"feats_{part}").as_posix(),
+            storage_path=(args.fbank_dir / f"mls_eng_feats_{part}").as_posix(),
             storage_type=LilcomChunkyWriter,
         )
-        cut_set.to_file(args.manifest_dir / f"mls_eng_cuts_{part}.jsonl.gz")

-    logging.info("All fbank computed for MLS English.")
-    (args.manifest_dir / ".mls-eng-fbank.done").touch()
+        # 2) Copy raw audio into audio-dir/<split>/.
+        cut_set = cut_set.save_audios(args.audio_dir / part)
+
+        # 3) Write final cuts JSONL into manifest-dir.
+        out_manifest = args.manifest_dir / f"mls_eng_cuts_{part}.jsonl.gz"
+        cut_set.to_file(out_manifest)
+        logging.info("Wrote cuts manifest to %s", out_manifest)
+
+    # Touch the done-marker so later runs skip this work.
+    done_marker.touch()
+    logging.info("All FBANK computed. Done marker created at %s", done_marker)


 if __name__ == "__main__":
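
For a quick sanity check of the new output layout, the cuts manifests written by this script can be loaded back with lhotse. A minimal sketch, assuming Stage 1 of the prepare script below has already run with the default paths:

    # Read back one cuts manifest and its precomputed FBANK features.
    # Paths assume the argparse defaults above (data/manifests, 80 mel bins).
    from lhotse import CutSet

    cuts = CutSet.from_file("data/manifests/mls_eng_cuts_dev.jsonl.gz")
    cut = next(iter(cuts))
    feats = cut.load_features()  # decodes the lilcom-compressed FBANK matrix
    print(cut.id, cut.duration, feats.shape)  # feats.shape == (num_frames, 80)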

(prepare script for the MLS English ASR recipe)

@@ -1,92 +1,82 @@
 #!/usr/bin/env bash
 # Prepare script for MLS English ASR recipe in icefall

-# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

 set -eou pipefail

 stage=-1
 stop_stage=100

 # Configuration for BPE tokenizer
-vocab_sizes=(2000) # You can add more sizes like (500 1000 2000) for comparison
+vocab_sizes=(500)

 # Directory where dataset will be downloaded
 dl_dir=$PWD/download

 . shared/parse_options.sh || exit 1

-# All files generated by this script are saved in "data".
-mkdir -p data
-mkdir -p data/audio
-mkdir -p data/manifests
-mkdir -p data/lang
+# All files generated by this script are saved in "data/".
+mkdir -p data/manifests data/fbank data/audio data/lang

 log() {
   local fname=${BASH_SOURCE[1]##*/}
-  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${LINENO}:${FUNCNAME[1]}) $*"
 }

 log "Starting MLS English data preparation"

+# Stage 0: Download corpus
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   log "Stage 0: Download MLS English dataset"
   if [ ! -d $dl_dir/mls_english ]; then
-    if ! git clone https://huggingface.co/datasets/parler-tts/mls_eng $dl_dir/mls_english; then
-      log "Failed to download MLS English dataset"
-      exit 1
-    fi
+    git clone https://huggingface.co/datasets/parler-tts/mls_eng \
+      $dl_dir/mls_english || {
+      log "Failed to download MLS English dataset"; exit 1; }
   fi
 fi

-# if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
-#   log "Stage 1: Prepare MLS English manifest"
-#   # We assume that you have downloaded the MLS English corpus
-#   # to $dl_dir/mls_english
-#   if [ ! -e data/manifests/.mls_english.done ]; then
-#     # lhotse prepare mls_english -j $nj $dl_dir/mls_english data/manifests
-#     python local/utils/save_audios.py --num-jobs 8 --dataset-dir $dl_dir/mls_english --audio-dir ./data/audio --manifest-dir ./data/manifests
-#     touch data/manifests/.mls_english.done
-#   fi
-# fi
-
+# Stage 1: Compute fbank & emit manifests
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
-  log "Stage 1: Compute MLS English fbank"
-  if [ ! -e data/manifests/.mls_english-validated.done ]; then
+  log "Stage 1: Compute & validate MLS English fbank"
+  # data/manifests, data/fbank and data/audio were created by the mkdir above.
+  if [ ! -e data/fbank/.mls_eng-fbank.done ]; then
     python local/compute_fbank_mls_english.py \
       --manifest-dir data/manifests \
       --audio-dir data/audio \
-      --dl-dir $dl_dir/mls_english
-      # --dl-dir /root/datasets/parler-tts--mls_eng
-    python local/validate_manifest.py --manifest data/manifests/mls_eng_cuts_train.jsonl.gz
-    python local/validate_manifest.py --manifest data/manifests/mls_eng_cuts_dev.jsonl.gz
-    python local/validate_manifest.py --manifest data/manifests/mls_eng_cuts_test.jsonl.gz
-    touch data/manifests/.mls_english-validated.done
+      --dl-dir $dl_dir/mls_english \
+      --fbank-dir data/fbank
+
+    # Validate each split's manifest
+    for split in train dev test; do
+      python local/validate_manifest.py \
+        --manifest data/manifests/mls_eng_cuts_${split}.jsonl.gz
+    done
+
+    touch data/fbank/.mls_eng-fbank.done
+    log "fbank + manifest generation complete."
+  else
+    log "Skipping: fbank already done (data/fbank/.mls_eng-fbank.done exists)."
   fi
 fi

+# Stage 2: Prepare transcript for BPE
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Prepare transcript for BPE training"
+  log "Stage 2: Generate transcript for BPE"
   if [ ! -f data/lang/transcript.txt ]; then
-    log "Generating transcripts for BPE training"
-    python local/utils/generate_transcript.py \
-      --dataset-path $dl_dir/mls_english \
-      --lang-dir data/lang \
-      --split train
+    ./local/utils/generate_transcript.py --lang-dir data/lang
   fi
 fi

+# Stage 3: Train BPE models
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-  log "Stage 3: Prepare BPE tokenizer"
-  for vocab_size in ${vocab_sizes[@]}; do
-    log "Training BPE model with vocab_size=${vocab_size}"
-    bpe_dir=data/lang/bpe_${vocab_size}
+  log "Stage 3: Train BPE models"
+  for vocab_size in "${vocab_sizes[@]}"; do
+    bpe_dir=data/lang_bpe_${vocab_size}
     mkdir -p $bpe_dir
     if [ ! -f $bpe_dir/bpe.model ]; then
-      python local/train_bpe_model.py \
+      ./local/train_bpe_model.py \
         --lang-dir $bpe_dir \
         --vocab-size $vocab_size \
         --transcript data/lang/transcript.txt
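
Stage 3 delegates to local/train_bpe_model.py, which is not part of this commit. For orientation, the core of such a BPE training step is a single sentencepiece call; a minimal sketch, assuming the sentencepiece package and the Stage 2 transcript (the real script's flags may differ):

    # Illustrative only: train a 500-token BPE model on the Stage 2 transcript.
    # local/train_bpe_model.py is not shown in this diff, so its options may differ.
    import sentencepiece as spm

    spm.SentencePieceTrainer.train(
        input="data/lang/transcript.txt",      # written by Stage 2
        model_prefix="data/lang_bpe_500/bpe",  # yields bpe.model / bpe.vocab
        vocab_size=500,                        # matches vocab_sizes=(500)
        model_type="bpe",
    )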

@ -0,0 +1 @@
Subproject commit 157dc62a6439f03de1fcccb4914fc0d30b6f21b8