Mirror of https://github.com/k2-fsa/icefall.git
Synced 2025-08-10 18:42:19 +00:00

Revert "add fbank"

This reverts commit ba603e0a0a514056ec6d32677053c41743a1a5dd.

parent 82bd37cacd
commit b2df5bbb83
local/compute_fbank_mls_english.py

@@ -1,8 +1,20 @@
 #!/usr/bin/env python3
-# Copyright 2023 The University of Electro-Communications
-# (Author: Teo Wen Shen)
+# Copyright 2023 The University of Electro-Communications (Author: Teo Wen Shen) # noqa
 #
-# Apache-2.0
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+

 import argparse
 import logging
@@ -11,106 +23,115 @@ from pathlib import Path
 from typing import List, Tuple

 import torch
-from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
+
+# fmt: off
+from lhotse import (  # See the following for why LilcomChunkyWriter is preferred
+    # https://github.com/k2-fsa/icefall/pull/404
+    # https://github.com/lhotse-speech/lhotse/pull/527
+    CutSet,
+    Fbank,
+    FbankConfig,
+    LilcomChunkyWriter,
+    RecordingSet,
+    SupervisionSet,
+)
 from lhotse.utils import is_module_available

-# Disable PyTorch intra/inter op threading overhead
+# fmt: on

+# Torch's multithreaded behavior needs to be disabled or
+# it wastes a lot of CPU and slow things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking the main (e.g. when spawning subprocesses).
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)

+RNG_SEED = 42
+concat_params = {"gap": 1.0, "maxlen": 10.0}


 def make_cutset_blueprints(
     mls_eng_hf_dataset_path: str = "parler-tts/mls_eng",
 ) -> List[Tuple[str, CutSet]]:
+    cut_sets = []

     if not is_module_available("datasets"):
         raise ImportError(
-            "To process the MLS English HF corpus, please install datasets: pip install datasets"
+            "To process the MLS English HF corpus, please install optional dependency: pip install datasets"
         )

     from datasets import load_dataset

+    print(f"{mls_eng_hf_dataset_path=}")
     dataset = load_dataset(str(mls_eng_hf_dataset_path))

-    return [
-        ("test", CutSet.from_huggingface_dataset(dataset["test"], text_key="transcript")),
-        ("dev", CutSet.from_huggingface_dataset(dataset["dev"], text_key="transcript")),
-        ("train", CutSet.from_huggingface_dataset(dataset["train"], text_key="transcript")),
-    ]
+    # Create test dataset
+    logging.info("Creating test cuts.")
+    cut_sets.append(
+        (
+            "test",
+            CutSet.from_huggingface_dataset(dataset["test"], text_key="transcript"),
+        )
+    )
+
+    # Create dev dataset
+    logging.info("Creating dev cuts.")
+    cut_sets.append(
+        ("dev", CutSet.from_huggingface_dataset(dataset["dev"], text_key="transcript"))
+    )
+
+    # Create train dataset
+    logging.info("Creating train cuts.")
+    cut_sets.append(
+        (
+            "train",
+            CutSet.from_huggingface_dataset(dataset["train"], text_key="transcript"),
+        )
+    )
+    return cut_sets


 def get_args():
-    p = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
     )
-    p.add_argument("-m", "--manifest-dir",
-                   type=Path,
-                   default=Path("data/manifests"),
-                   help="Where to write JSONL cuts")
-    p.add_argument("-a", "--audio-dir",
-                   type=Path,
-                   default=Path("data/audio"),
-                   help="Where to copy raw audio")
-    p.add_argument("-d", "--dl-dir",
-                   type=Path,
-                   required=True,
-                   help="Where the HF dataset was cloned")
-    p.add_argument("--fbank-dir",
-                   type=Path,
-                   default=Path("data/fbank"),
-                   help="Where to write FBANK features")
-    return p.parse_args()
+    parser.add_argument("-m", "--manifest-dir", type=Path)
+    parser.add_argument("-a", "--audio-dir", type=Path)
+    parser.add_argument("-d", "--dl-dir", type=Path)
+    return parser.parse_args()


 def main():
     args = get_args()

-    # Make sure our directories exist
-    for d in (args.manifest_dir, args.audio_dir, args.fbank_dir):
-        d.mkdir(parents=True, exist_ok=True)
-
-    # If we've already computed FBANK, skip.
-    done_marker = args.fbank_dir / ".mls_eng-fbank.done"
-    if done_marker.exists():
-        logging.info(
-            "Found done-marker at %s. Skipping FBANK computation.",
-            done_marker
-        )
-        return
-
-    # Set up logging
-    logging.basicConfig(
-        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
-        level=logging.INFO,
-    )
-
-    # Prepare Lhotse cut blueprints from HF dataset
-    cut_sets = make_cutset_blueprints(str(args.dl_dir))
-
-    # Feature extractor
     extractor = Fbank(FbankConfig(num_mel_bins=80))
     num_jobs = min(16, os.cpu_count())

-    for part, cut_set in cut_sets:
-        logging.info("===== Processing split: %s =====", part)
-
-        # 1) compute & store FBANK features into fbank-dir
-        cut_set = cut_set.compute_and_store_features(
-            extractor=extractor,
-            num_jobs=num_jobs,
-            storage_path=(args.fbank_dir / f"mls_eng_feats_{part}").as_posix(),
-            storage_type=LilcomChunkyWriter,
-        )
-
-        # 2) copy raw audio into audio-dir/<split>/
-        cut_set = cut_set.save_audios(args.audio_dir / part)
-
-        # 3) write final cuts JSONL into manifest-dir
-        out_manifest = args.manifest_dir / f"mls_eng_cuts_{part}.jsonl.gz"
-        cut_set.to_file(out_manifest)
-        logging.info("Wrote cuts manifest to %s", out_manifest)
-
-    # Touch the done marker so next runs skip
-    done_marker.touch()
-    logging.info("All FBANK computed. Done marker created at %s", done_marker)
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    if (args.manifest_dir / ".mls-eng-fbank.done").exists():
+        logging.info(
+            "Previous fbank computed for MLS English found. "
+            f"Delete {args.manifest_dir / '.mls-eng-fbank.done'} to allow recomputing fbank."
+        )
+        return
+    else:
+        mls_eng_hf_dataset_path = args.dl_dir  # "/root/datasets/parler-tts--mls_eng"
+
+        cut_sets = make_cutset_blueprints(mls_eng_hf_dataset_path)
+        for part, cut_set in cut_sets:
+            logging.info(f"Processing {part}")
+            cut_set = cut_set.compute_and_store_features(
+                extractor=extractor,
+                num_jobs=num_jobs,
+                storage_path=(args.manifest_dir / f"feats_{part}").as_posix(),
+                storage_type=LilcomChunkyWriter,
+            )

+            cut_set = cut_set.save_audios(args.audio_dir / part)  # makes new cutset that uses paths to actual audio files
+            cut_set.to_file(args.manifest_dir / f"mls_eng_cuts_{part}.jsonl.gz")
+
+        logging.info("All fbank computed for MLS English.")
+        (args.manifest_dir / ".mls-eng-fbank.done").touch()


 if __name__ == "__main__":
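In both versions the script boils down to one lhotse pipeline: build cuts from the HF dataset, compute fbank features, and dump a cuts manifest per split. The sketch below is a minimal, illustrative condensation of the restored flow, not the script itself; the paths and num_jobs value are placeholder choices, while the datasets/lhotse calls (load_dataset, CutSet.from_huggingface_dataset, compute_and_store_features, save_audios, to_file) are exactly the ones used in the diff above.

# Minimal sketch of the restored pipeline; paths and num_jobs are
# illustrative placeholders.
import logging
from pathlib import Path

import torch
from datasets import load_dataset
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter

torch.set_num_threads(1)  # keep worker processes from oversubscribing CPUs
logging.basicConfig(level=logging.INFO)

manifest_dir = Path("data/manifests")
audio_dir = Path("data/audio")
dataset = load_dataset("parler-tts/mls_eng")

for part in ("test", "dev", "train"):
    logging.info(f"Processing {part}")
    cuts = CutSet.from_huggingface_dataset(dataset[part], text_key="transcript")
    cuts = cuts.compute_and_store_features(
        extractor=Fbank(FbankConfig(num_mel_bins=80)),  # 80-dim fbank, as in the diff
        num_jobs=4,
        storage_path=(manifest_dir / f"feats_{part}").as_posix(),
        storage_type=LilcomChunkyWriter,  # see icefall#404 / lhotse#527
    )
    cuts = cuts.save_audios(audio_dir / part)  # materialize audio to disk
    cuts.to_file(manifest_dir / f"mls_eng_cuts_{part}.jsonl.gz")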
prepare.sh

@@ -1,87 +1,94 @@
 #!/usr/bin/env bash

 # Prepare script for MLS English ASR recipe in icefall

+# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

 set -eou pipefail

 stage=-1
 stop_stage=100

 # Configuration for BPE tokenizer
-vocab_sizes=(500)
+vocab_sizes=(2000) # You can add more sizes like (500 1000 2000) for comparison

 # Directory where dataset will be downloaded
 dl_dir=$PWD/download

 . shared/parse_options.sh || exit 1

-# All files generated by this script are saved in "data/".
-mkdir -p data/manifests data/fbank data/audio data/lang
+# All files generated by this script are saved in "data".
+mkdir -p data
+mkdir -p data/audio # Add this line
+mkdir -p data/manifests
+mkdir -p data/lang

 log() {
   local fname=${BASH_SOURCE[1]##*/}
-  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${LINENO}:${FUNCNAME[1]}) $*"
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }

 log "Starting MLS English data preparation"

-# Stage 0: Download corpus
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   log "Stage 0: Download MLS English dataset"
   if [ ! -d $dl_dir/mls_english ]; then
-    git clone https://huggingface.co/datasets/parler-tts/mls_eng \
-      $dl_dir/mls_english || {
-      log "Failed to download MLS English dataset"; exit 1; }
+    if ! git clone https://huggingface.co/datasets/parler-tts/mls_eng $dl_dir/mls_english; then
+      log "Failed to download MLS English dataset"
+      exit 1
+    fi
   fi
 fi

-# Stage 1: Compute fbank & emit manifests
+# if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+#   log "Stage 1: Prepare MLS English manifest"
+#   # We assume that you have downloaded the MLS English corpus
+#   # to $dl_dir/mls_english
+#   if [ ! -e data/manifests/.mls_english.done ]; then
+#     # lhotse prepare mls_english -j $nj $dl_dir/mls_english data/manifests
+#     python local/utils/save_audios.py --num-jobs 8 --dataset-dir $dl_dir/mls_english --audio-dir ./data/audio --manifest-dir ./data/manifests
+#     touch data/manifests/.mls_english.done
+#   fi
+# fi
+
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
-  log "Stage 1: Compute & validate MLS English fbank"
-  # we already did `mkdir -p data/manifests data/fbank data/audio` above
-  if [ ! -e data/fbank/.mls_eng-fbank.done ]; then
-    python local/compute_fbank_mls_english.py \
-      --manifest-dir data/manifests \
-      --audio-dir data/audio \
-      --dl-dir $dl_dir/mls_english \
-      --fbank-dir data/fbank
-
-    # Validate each split's manifest
-    for split in train dev test; do
-      python local/validate_manifest.py \
-        --manifest data/manifests/mls_eng_cuts_${split}.jsonl.gz
-    done
-
-    touch data/fbank/.mls_eng-fbank.done
-    log "fbank + manifest generation complete."
-  else
-    log "Skipping: fbank already done (data/fbank/.mls_eng-fbank.done exists)."
-  fi
+  log "Stage 1: Compute MLS English fbank"
+  if [ ! -e data/manifests/.mls_english-validated.done ]; then
+    python local/compute_fbank_mls_english.py \
+      --manifest-dir data/manifests \
+      --audio-dir data/audio \
+      --dl-dir $dl_dir/mls_english
+      # --dl-dir /root/datasets/parler-tts--mls_eng
+    python local/validate_manifest.py --manifest data/manifests/mls_english_cuts_train.jsonl.gz
+    python local/validate_manifest.py --manifest data/manifests/mls_english_cuts_dev.jsonl.gz
+    python local/validate_manifest.py --manifest data/manifests/mls_english_cuts_test.jsonl.gz
+    touch data/manifests/.mls_english-validated.done
+  fi
 fi

-# Stage 2: Prepare transcript for BPE
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Generate transcript for BPE"
+  log "Stage 2: Prepare transcript for BPE training"
   if [ ! -f data/lang/transcript.txt ]; then
+    log "Generating transcripts for BPE training"
     ./local/utils/generate_transcript.py --lang-dir data/lang
   fi
 fi

-# Stage 3: Train BPE models
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-  log "Stage 3: Train BPE models"
-  for vocab_size in "${vocab_sizes[@]}"; do
-    bpe_dir=data/lang_bpe_${vocab_size}
+  log "Stage 3: Prepare BPE tokenizer"
+  for vocab_size in ${vocab_sizes[@]}; do
+    log "Training BPE model with vocab_size=${vocab_size}"
+    bpe_dir=data/lang/bpe_${vocab_size}
     mkdir -p $bpe_dir

     if [ ! -f $bpe_dir/bpe.model ]; then
       ./local/train_bpe_model.py \
         --lang-dir $bpe_dir \
         --vocab-size $vocab_size \
         --transcript data/lang/transcript.txt
     fi
   done
 fi

 log "MLS English data preparation completed successfully"
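Both versions keep the standard icefall stage gating, so individual stages can be re-run in isolation. Assuming shared/parse_options.sh maps --stage/--stop-stage flags onto the stage/stop_stage variables above (the usual kaldi-style convention, not shown in this diff), a typical invocation looks like:

  # full pipeline: download, fbank + manifests, transcript, BPE
  ./prepare.sh --stage 0 --stop-stage 3

  # redo only the tokenizer stage after editing vocab_sizes
  ./prepare.sh --stage 3 --stop-stage 3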
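Stage 3 invokes ./local/train_bpe_model.py, which is not part of this diff. In icefall recipes such a script is typically a thin wrapper around sentencepiece; the sketch below is a hypothetical stand-in showing roughly how the flags passed above would be wired up (the model_type choice and file layout are assumptions, only the SentencePieceTrainer.train call is the standard sentencepiece API).

# Hypothetical stand-in for train_bpe_model.py; argument wiring mirrors
# the --transcript/--lang-dir/--vocab-size flags used in Stage 3 above.
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input="data/lang/transcript.txt",       # --transcript
    model_prefix="data/lang/bpe_2000/bpe",  # written under --lang-dir
    vocab_size=2000,                        # --vocab-size
    model_type="bpe",                       # assumed; produces bpe.model
)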