diff --git a/egs/libritts/ASR/prepare.sh b/egs/libritts/ASR/prepare.sh index f3a78bdb8..23c84e838 100755 --- a/egs/libritts/ASR/prepare.sh +++ b/egs/libritts/ASR/prepare.sh @@ -85,10 +85,10 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then # Here we shuffle and combine the train-clean-100, train-clean-360 and # train-other-500 together to form the training set. if [ ! -f data/fbank/libritts_cuts_train-all-shuf.jsonl.gz ]; then - cat <(gunzip -c ./libritts_cuts_train-clean-100.jsonl.gz) \ - <(gunzip -c ./libritts_cuts_train-clean-360.jsonl.gz) \ - <(gunzip -c ./libritts_cuts_train-other-500.jsonl.gz) | \ - shuf | gzip -c > ./libritts_cuts_train-all-shuf.jsonl.gz + cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \ + <(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) \ + <(gunzip -c data/fbank/libritts_cuts_train-other-500.jsonl.gz) | \ + shuf | gzip -c > data/fbank/libritts_cuts_train-all-shuf.jsonl.gz fi if [ ! -e data/fbank/.libritts-validated.done ]; then @@ -106,4 +106,4 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then ./local/compute_fbank_musan.py touch data/fbank/.msuan.done fi -fi \ No newline at end of file +fi diff --git a/egs/libritts/ASR/local/compute_spectrogram_libritts.py b/egs/libritts/CODEC/local/compute_spectrogram_libritts.py similarity index 97% rename from egs/libritts/ASR/local/compute_spectrogram_libritts.py rename to egs/libritts/CODEC/local/compute_spectrogram_libritts.py index 6cdc55bc8..8d864db92 100755 --- a/egs/libritts/ASR/local/compute_spectrogram_libritts.py +++ b/egs/libritts/CODEC/local/compute_spectrogram_libritts.py @@ -46,6 +46,7 @@ from icefall.utils import get_executor torch.set_num_threads(1) torch.set_num_interop_threads(1) + def get_args(): parser = argparse.ArgumentParser() @@ -64,12 +65,13 @@ def get_args(): return parser.parse_args() -def compute_spectrogram_libritts(dataset: Optional[str] = None, sampling_rate: int = 24000,): +def compute_spectrogram_libritts( + dataset: 
Optional[str] = None, sampling_rate: int = 24000 +): src_dir = Path("data/manifests") output_dir = Path("data/spectrogram") num_jobs = min(32, os.cpu_count()) - frame_length = 1024 / sampling_rate # (in second) frame_shift = 256 / sampling_rate # (in second) use_fft_mag = True diff --git a/egs/libritts/CODEC/local/display_manifest_statistics.py b/egs/libritts/CODEC/local/display_manifest_statistics.py new file mode 100755 index 000000000..ec00e0454 --- /dev/null +++ b/egs/libritts/CODEC/local/display_manifest_statistics.py @@ -0,0 +1,341 @@ +#!/usr/bin/env python3 +# Copyright 2023 Xiaomi Corp. (authors: Zengwei Yao) +# 2024 The Chinese Univ. of HK (authors: Zengrui Jin) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This file displays duration statistics of utterances in a manifest. +You can use the displayed value to choose minimum/maximum duration +to remove short and long utterances during the training. 
+""" + + +from lhotse import load_manifest_lazy + + +def main(): + paths = [ + "./data/spectrogram/libritts_cuts_train-clean-100.jsonl.gz", + "./data/spectrogram/libritts_cuts_train-clean-360.jsonl.gz", + "./data/spectrogram/libritts_cuts_train-other-500.jsonl.gz", + "./data/spectrogram/libritts_cuts_dev-clean.jsonl.gz", + "./data/spectrogram/libritts_cuts_dev-other.jsonl.gz", + "./data/spectrogram/libritts_cuts_test-clean.jsonl.gz", + "./data/spectrogram/libritts_cuts_test-other.jsonl.gz", + ] + for path in paths: + cuts = load_manifest_lazy(path) + cuts.describe() + + +if __name__ == "__main__": + main() + +""" +./data/spectrogram/libritts_cuts_train-clean-100.jsonl.gz statistics: +________________________________________ +_ Cuts count: _ 33236 _ +________________________________________ +_ Total duration (hh:mm:ss) _ 53:47:18 _ +________________________________________ +_ mean _ 5.8 _ +________________________________________ +_ std _ 4.6 _ +________________________________________ +_ min _ 0.2 _ +________________________________________ +_ 25% _ 2.4 _ +________________________________________ +_ 50% _ 4.5 _ +________________________________________ +_ 75% _ 7.9 _ +________________________________________ +_ 99% _ 21.4 _ +________________________________________ +_ 99.5% _ 23.7 _ +________________________________________ +_ 99.9% _ 27.8 _ +________________________________________ +_ max _ 33.2 _ +________________________________________ +_ Recordings available: _ 33236 _ +________________________________________ +_ Features available: _ 33236 _ +________________________________________ +_ Supervisions available: _ 33236 _ +________________________________________ +SUPERVISION custom fields: +Speech duration statistics: +__________________________________________________________________ +_ Total speech duration _ 53:47:18 _ 100.00% of recording _ +__________________________________________________________________ +_ Total speaking time duration _ 53:47:18 _ 
100.00% of recording _ +__________________________________________________________________ +_ Total silence duration _ 00:00:01 _ 0.00% of recording _ +__________________________________________________________________ + +./data/spectrogram/libritts_cuts_train-clean-360.jsonl.gz statistics: +_________________________________________ +_ Cuts count: _ 116500 _ +_________________________________________ +_ Total duration (hh:mm:ss) _ 191:17:42 _ +_________________________________________ +_ mean _ 5.9 _ +_________________________________________ +_ std _ 4.6 _ +_________________________________________ +_ min _ 0.1 _ +_________________________________________ +_ 25% _ 2.4 _ +_________________________________________ +_ 50% _ 4.6 _ +_________________________________________ +_ 75% _ 8.1 _ +_________________________________________ +_ 99% _ 21.3 _ +_________________________________________ +_ 99.5% _ 23.4 _ +_________________________________________ +_ 99.9% _ 27.4 _ +_________________________________________ +_ max _ 40.4 _ +_________________________________________ +_ Recordings available: _ 116500 _ +_________________________________________ +_ Features available: _ 116500 _ +_________________________________________ +_ Supervisions available: _ 116500 _ +_________________________________________ +SUPERVISION custom fields: +Speech duration statistics: +___________________________________________________________________ +_ Total speech duration _ 191:17:42 _ 100.00% of recording _ +___________________________________________________________________ +_ Total speaking time duration _ 191:17:42 _ 100.00% of recording _ +___________________________________________________________________ +_ Total silence duration _ 00:00:01 _ 0.00% of recording _ +___________________________________________________________________ + +./data/spectrogram/libritts_cuts_train-other-500.jsonl.gz statistics: +_________________________________________ +_ Cuts count: _ 205043 _ 
+_________________________________________ +_ Total duration (hh:mm:ss) _ 310:04:36 _ +_________________________________________ +_ mean _ 5.4 _ +_________________________________________ +_ std _ 4.4 _ +_________________________________________ +_ min _ 0.1 _ +_________________________________________ +_ 25% _ 2.3 _ +_________________________________________ +_ 50% _ 4.2 _ +_________________________________________ +_ 75% _ 7.3 _ +_________________________________________ +_ 99% _ 20.6 _ +_________________________________________ +_ 99.5% _ 22.8 _ +_________________________________________ +_ 99.9% _ 27.4 _ +_________________________________________ +_ max _ 43.9 _ +_________________________________________ +_ Recordings available: _ 205043 _ +_________________________________________ +_ Features available: _ 205043 _ +_________________________________________ +_ Supervisions available: _ 205043 _ +_________________________________________ +SUPERVISION custom fields: +Speech duration statistics: +___________________________________________________________________ +_ Total speech duration _ 310:04:36 _ 100.00% of recording _ +___________________________________________________________________ +_ Total speaking time duration _ 310:04:36 _ 100.00% of recording _ +___________________________________________________________________ +_ Total silence duration _ 00:00:01 _ 0.00% of recording _ +___________________________________________________________________ + +./data/spectrogram/libritts_cuts_dev-clean.jsonl.gz statistics: +________________________________________ +_ Cuts count: _ 5736 _ +________________________________________ +_ Total duration (hh:mm:ss) _ 08:58:13 _ +________________________________________ +_ mean _ 5.6 _ +________________________________________ +_ std _ 4.3 _ +________________________________________ +_ min _ 0.3 _ +________________________________________ +_ 25% _ 2.4 _ +________________________________________ +_ 50% _ 4.4 _ 
+________________________________________ +_ 75% _ 7.8 _ +________________________________________ +_ 99% _ 19.9 _ +________________________________________ +_ 99.5% _ 21.9 _ +________________________________________ +_ 99.9% _ 26.3 _ +________________________________________ +_ max _ 30.1 _ +________________________________________ +_ Recordings available: _ 5736 _ +________________________________________ +_ Features available: _ 5736 _ +________________________________________ +_ Supervisions available: _ 5736 _ +________________________________________ +SUPERVISION custom fields: +Speech duration statistics: +__________________________________________________________________ +_ Total speech duration _ 08:58:13 _ 100.00% of recording _ +__________________________________________________________________ +_ Total speaking time duration _ 08:58:13 _ 100.00% of recording _ +__________________________________________________________________ +_ Total silence duration _ 00:00:01 _ 0.00% of recording _ +__________________________________________________________________ + +./data/spectrogram/libritts_cuts_dev-other.jsonl.gz statistics: +________________________________________ +_ Cuts count: _ 4613 _ +________________________________________ +_ Total duration (hh:mm:ss) _ 06:25:52 _ +________________________________________ +_ mean _ 5.0 _ +________________________________________ +_ std _ 4.1 _ +________________________________________ +_ min _ 0.3 _ +________________________________________ +_ 25% _ 2.2 _ +________________________________________ +_ 50% _ 3.8 _ +________________________________________ +_ 75% _ 6.5 _ +________________________________________ +_ 99% _ 19.7 _ +________________________________________ +_ 99.5% _ 24.5 _ +________________________________________ +_ 99.9% _ 31.0 _ +________________________________________ +_ max _ 32.6 _ +________________________________________ +_ Recordings available: _ 4613 _ +________________________________________ +_ 
Features available: _ 4613 _ +________________________________________ +_ Supervisions available: _ 4613 _ +________________________________________ +SUPERVISION custom fields: +Speech duration statistics: +__________________________________________________________________ +_ Total speech duration _ 06:25:52 _ 100.00% of recording _ +__________________________________________________________________ +_ Total speaking time duration _ 06:25:52 _ 100.00% of recording _ +__________________________________________________________________ +_ Total silence duration _ 00:00:01 _ 0.00% of recording _ +__________________________________________________________________ + +./data/spectrogram/libritts_cuts_test-clean.jsonl.gz statistics: +________________________________________ +_ Cuts count: _ 4837 _ +________________________________________ +_ Total duration (hh:mm:ss) _ 08:34:09 _ +________________________________________ +_ mean _ 6.4 _ +________________________________________ +_ std _ 5.1 _ +________________________________________ +_ min _ 0.3 _ +________________________________________ +_ 25% _ 2.4 _ +________________________________________ +_ 50% _ 4.8 _ +________________________________________ +_ 75% _ 8.9 _ +________________________________________ +_ 99% _ 22.6 _ +________________________________________ +_ 99.5% _ 24.4 _ +________________________________________ +_ 99.9% _ 29.6 _ +________________________________________ +_ max _ 36.7 _ +________________________________________ +_ Recordings available: _ 4837 _ +________________________________________ +_ Features available: _ 4837 _ +________________________________________ +_ Supervisions available: _ 4837 _ +________________________________________ +SUPERVISION custom fields: +Speech duration statistics: +__________________________________________________________________ +_ Total speech duration _ 08:34:09 _ 100.00% of recording _ +__________________________________________________________________ +_ Total 
speaking time duration _ 08:34:09 _ 100.00% of recording _ +__________________________________________________________________ +_ Total silence duration _ 00:00:01 _ 0.00% of recording _ +__________________________________________________________________ + +./data/spectrogram/libritts_cuts_test-other.jsonl.gz statistics: +________________________________________ +_ Cuts count: _ 5120 _ +________________________________________ +_ Total duration (hh:mm:ss) _ 06:41:31 _ +________________________________________ +_ mean _ 4.7 _ +________________________________________ +_ std _ 3.8 _ +________________________________________ +_ min _ 0.3 _ +________________________________________ +_ 25% _ 1.8 _ +________________________________________ +_ 50% _ 3.6 _ +________________________________________ +_ 75% _ 6.5 _ +________________________________________ +_ 99% _ 17.8 _ +________________________________________ +_ 99.5% _ 20.4 _ +________________________________________ +_ 99.9% _ 23.8 _ +________________________________________ +_ max _ 27.3 _ +________________________________________ +_ Recordings available: _ 5120 _ +________________________________________ +_ Features available: _ 5120 _ +________________________________________ +_ Supervisions available: _ 5120 _ +________________________________________ +SUPERVISION custom fields: +Speech duration statistics: +__________________________________________________________________ +_ Total speech duration _ 06:41:31 _ 100.00% of recording _ +__________________________________________________________________ +_ Total speaking time duration _ 06:41:31 _ 100.00% of recording _ +__________________________________________________________________ +_ Total silence duration _ 00:00:01 _ 0.00% of recording _ +__________________________________________________________________ +""" diff --git a/egs/libritts/CODEC/local/validate_manifest.py b/egs/libritts/CODEC/local/validate_manifest.py new file mode 120000 index 000000000..b4d52ebca 
--- /dev/null +++ b/egs/libritts/CODEC/local/validate_manifest.py @@ -0,0 +1 @@ +../../../ljspeech/TTS/local/validate_manifest.py \ No newline at end of file diff --git a/egs/libritts/CODEC/prepare.sh b/egs/libritts/CODEC/prepare.sh new file mode 100755 index 000000000..3dcb73474 --- /dev/null +++ b/egs/libritts/CODEC/prepare.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash + +# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674 +export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python + +set -eou pipefail + +stage=0 +stop_stage=100 +sampling_rate=24000 +nj=32 + +dl_dir=$PWD/download + +. shared/parse_options.sh || exit 1 + +# All files generated by this script are saved in "data". +# You can safely remove "data" and rerun this script to regenerate it. +mkdir -p data + +log() { + # This function is from espnet + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} + +log "dl_dir: $dl_dir" + +if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then + log "Stage 0: Download data" + + # If you have pre-downloaded it to /path/to/LibriTTS, + # you can create a symlink + # + # ln -sfv /path/to/LibriTTS $dl_dir/LibriTTS + # + if [ ! -d $dl_dir/LibriTTS ]; then + lhotse download libritts $dl_dir + fi + + # If you have pre-downloaded it to /path/to/musan, + # you can create a symlink + # + # ln -sfv /path/to/musan $dl_dir/musan + # + if [ ! -d $dl_dir/musan ]; then + lhotse download musan $dl_dir + fi +fi + +if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then + log "Stage 1: Prepare LibriTTS manifest" + # We assume that you have downloaded the LibriTTS corpus + # to $dl_dir/LibriTTS + mkdir -p data/manifests + if [ ! 
-e data/manifests/.libritts.done ]; then + lhotse prepare libritts --num-jobs $nj $dl_dir/LibriTTS data/manifests + touch data/manifests/.libritts.done + fi +fi + + +if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then + log "Stage 2: Compute Spectrogram for LibriTTS" + mkdir -p data/spectrogram + if [ ! -e data/spectrogram/.libritts.done ]; then + ./local/compute_spectrogram_libritts.py --sampling-rate $sampling_rate + touch data/spectrogram/.libritts.done + fi + + # Here we shuffle and combine the train-clean-100, train-clean-360 and + # train-other-500 together to form the training set. + if [ ! -f data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz ]; then + cat <(gunzip -c data/spectrogram/libritts_cuts_train-clean-100.jsonl.gz) \ + <(gunzip -c data/spectrogram/libritts_cuts_train-clean-360.jsonl.gz) \ + <(gunzip -c data/spectrogram/libritts_cuts_train-other-500.jsonl.gz) | \ + shuf | gzip -c > data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz + fi + + if [ ! -e data/spectrogram/.libritts-validated.done ]; then + log "Validating data/spectrogram for LibriTTS" + ./local/validate_manifest.py \ + data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz + touch data/spectrogram/.libritts-validated.done + fi +fi + diff --git a/egs/libritts/CODEC/shared b/egs/libritts/CODEC/shared new file mode 120000 index 000000000..4c5e91438 --- /dev/null +++ b/egs/libritts/CODEC/shared @@ -0,0 +1 @@ +../../../icefall/shared/ \ No newline at end of file