minor updates

2025-12-09 14:05:33 +00:00 · 2024-09-08 11:16:12 +08:00 · 2024-09-08 11:16:12 +08:00 · d45b400805
commit d45b400805
parent c236757674
6 changed files with 439 additions and 7 deletions
--- a/egs/libritts/ASR/prepare.sh
+++ b/egs/libritts/ASR/prepare.sh
@ -85,10 +85,10 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  # Here we shuffle and combine the train-clean-100, train-clean-360 and 
  # train-other-500 together to form the training set.
  if [ ! -f data/fbank/libritts_cuts_train-all-shuf.jsonl.gz ]; then
-    cat <(gunzip -c ./libritts_cuts_train-clean-100.jsonl.gz) \
+    cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
-      <(gunzip -c ./libritts_cuts_train-clean-360.jsonl.gz) \
+      <(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) \
-      <(gunzip -c ./libritts_cuts_train-other-500.jsonl.gz) | \
+      <(gunzip -c data/fbank/libritts_cuts_train-other-500.jsonl.gz) | \
-      shuf | gzip -c > ./libritts_cuts_train-all-shuf.jsonl.gz
+      shuf | gzip -c > data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
  fi
  if [ ! -e data/fbank/.libritts-validated.done ]; then
@ -106,4 +106,4 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
    ./local/compute_fbank_musan.py
    touch data/fbank/.msuan.done
  fi
-fi
+fi
--- a/egs/libritts/CODEC/local/compute_spectrogram_libritts.py
+++ b/egs/libritts/CODEC/local/compute_spectrogram_libritts.py
@ -46,6 +46,7 @@ from icefall.utils import get_executor
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 def get_args():
    parser = argparse.ArgumentParser()
@ -64,12 +65,13 @@ def get_args():
    return parser.parse_args()
-def compute_spectrogram_libritts(dataset: Optional[str] = None, sampling_rate: int = 24000,):
+def compute_spectrogram_libritts(
    dataset: Optional[str] = None, sampling_rate: int = 24000
 ):
    src_dir = Path("data/manifests")
    output_dir = Path("data/spectrogram")
    num_jobs = min(32, os.cpu_count())
    frame_length = 1024 / sampling_rate  # (in second)
    frame_shift = 256 / sampling_rate  # (in second)
    use_fft_mag = True
--- a/egs/libritts/CODEC/local/display_manifest_statistics.py
+++ b/egs/libritts/CODEC/local/display_manifest_statistics.py
@ -0,0 +1,341 @@
 #!/usr/bin/env python3
 # Copyright    2023  Xiaomi Corp.             (authors: Zengwei Yao)
 #              2024  The Chinese Univ. of HK  (authors: Zengrui Jin)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This file displays duration statistics of utterances in a manifest.
 You can use the displayed value to choose minimum/maximum duration
 to remove short and long utterances during the training.
 """
 from lhotse import load_manifest_lazy
 def main():
    """Print summary statistics for each LibriTTS spectrogram cut manifest.

    Every manifest listed below is opened with ``load_manifest_lazy`` and
    summarized with its ``describe()`` method. A ``<path> statistics:``
    line is printed before each summary so that the seven tables can be
    told apart, as in the sample output recorded at the bottom of this file.
    """
    paths = [
        "./data/spectrogram/libritts_cuts_train-clean-100.jsonl.gz",
        "./data/spectrogram/libritts_cuts_train-clean-360.jsonl.gz",
        "./data/spectrogram/libritts_cuts_train-other-500.jsonl.gz",
        "./data/spectrogram/libritts_cuts_dev-clean.jsonl.gz",
        "./data/spectrogram/libritts_cuts_dev-other.jsonl.gz",
        "./data/spectrogram/libritts_cuts_test-clean.jsonl.gz",
        "./data/spectrogram/libritts_cuts_test-other.jsonl.gz",
    ]
    for path in paths:
        # Label the output; describe() itself is not given the path.
        print(f"{path} statistics:")
        cuts = load_manifest_lazy(path)
        cuts.describe()
 if __name__ == "__main__":
    main()
 """
 ./data/spectrogram/libritts_cuts_train-clean-100.jsonl.gz statistics:
 ________________________________________
 _ Cuts count:               _ 33236    _
 ________________________________________
 _ Total duration (hh:mm:ss) _ 53:47:18 _
 ________________________________________
 _ mean                      _ 5.8      _
 ________________________________________
 _ std                       _ 4.6      _
 ________________________________________
 _ min                       _ 0.2      _
 ________________________________________
 _ 25%                       _ 2.4      _
 ________________________________________
 _ 50%                       _ 4.5      _
 ________________________________________
 _ 75%                       _ 7.9      _
 ________________________________________
 _ 99%                       _ 21.4     _
 ________________________________________
 _ 99.5%                     _ 23.7     _
 ________________________________________
 _ 99.9%                     _ 27.8     _
 ________________________________________
 _ max                       _ 33.2     _
 ________________________________________
 _ Recordings available:     _ 33236    _
 ________________________________________
 _ Features available:       _ 33236    _
 ________________________________________
 _ Supervisions available:   _ 33236    _
 ________________________________________
 SUPERVISION custom fields:
 Speech duration statistics:
 __________________________________________________________________
 _ Total speech duration        _ 53:47:18 _ 100.00% of recording _
 __________________________________________________________________
 _ Total speaking time duration _ 53:47:18 _ 100.00% of recording _
 __________________________________________________________________
 _ Total silence duration       _ 00:00:01 _ 0.00% of recording   _
 __________________________________________________________________
 ./data/spectrogram/libritts_cuts_train-clean-360.jsonl.gz statistics:
 _________________________________________
 _ Cuts count:               _ 116500    _
 _________________________________________
 _ Total duration (hh:mm:ss) _ 191:17:42 _
 _________________________________________
 _ mean                      _ 5.9       _
 _________________________________________
 _ std                       _ 4.6       _
 _________________________________________
 _ min                       _ 0.1       _
 _________________________________________
 _ 25%                       _ 2.4       _
 _________________________________________
 _ 50%                       _ 4.6       _
 _________________________________________
 _ 75%                       _ 8.1       _
 _________________________________________
 _ 99%                       _ 21.3      _
 _________________________________________
 _ 99.5%                     _ 23.4      _
 _________________________________________
 _ 99.9%                     _ 27.4      _
 _________________________________________
 _ max                       _ 40.4      _
 _________________________________________
 _ Recordings available:     _ 116500    _
 _________________________________________
 _ Features available:       _ 116500    _
 _________________________________________
 _ Supervisions available:   _ 116500    _
 _________________________________________
 SUPERVISION custom fields:
 Speech duration statistics:
 ___________________________________________________________________
 _ Total speech duration        _ 191:17:42 _ 100.00% of recording _
 ___________________________________________________________________
 _ Total speaking time duration _ 191:17:42 _ 100.00% of recording _
 ___________________________________________________________________
 _ Total silence duration       _ 00:00:01  _ 0.00% of recording   _
 ___________________________________________________________________
 ./data/spectrogram/libritts_cuts_train-other-500.jsonl.gz statistics:
 _________________________________________
 _ Cuts count:               _ 205043    _
 _________________________________________
 _ Total duration (hh:mm:ss) _ 310:04:36 _
 _________________________________________
 _ mean                      _ 5.4       _
 _________________________________________
 _ std                       _ 4.4       _
 _________________________________________
 _ min                       _ 0.1       _
 _________________________________________
 _ 25%                       _ 2.3       _
 _________________________________________
 _ 50%                       _ 4.2       _
 _________________________________________
 _ 75%                       _ 7.3       _
 _________________________________________
 _ 99%                       _ 20.6      _
 _________________________________________
 _ 99.5%                     _ 22.8      _
 _________________________________________
 _ 99.9%                     _ 27.4      _
 _________________________________________
 _ max                       _ 43.9      _
 _________________________________________
 _ Recordings available:     _ 205043    _
 _________________________________________
 _ Features available:       _ 205043    _
 _________________________________________
 _ Supervisions available:   _ 205043    _
 _________________________________________
 SUPERVISION custom fields:
 Speech duration statistics:
 ___________________________________________________________________
 _ Total speech duration        _ 310:04:36 _ 100.00% of recording _
 ___________________________________________________________________
 _ Total speaking time duration _ 310:04:36 _ 100.00% of recording _
 ___________________________________________________________________
 _ Total silence duration       _ 00:00:01  _ 0.00% of recording   _
 ___________________________________________________________________
 ./data/spectrogram/libritts_cuts_dev-clean.jsonl.gz statistics:
 ________________________________________
 _ Cuts count:               _ 5736     _
 ________________________________________
 _ Total duration (hh:mm:ss) _ 08:58:13 _
 ________________________________________
 _ mean                      _ 5.6      _
 ________________________________________
 _ std                       _ 4.3      _
 ________________________________________
 _ min                       _ 0.3      _
 ________________________________________
 _ 25%                       _ 2.4      _
 ________________________________________
 _ 50%                       _ 4.4      _
 ________________________________________
 _ 75%                       _ 7.8      _
 ________________________________________
 _ 99%                       _ 19.9     _
 ________________________________________
 _ 99.5%                     _ 21.9     _
 ________________________________________
 _ 99.9%                     _ 26.3     _
 ________________________________________
 _ max                       _ 30.1     _
 ________________________________________
 _ Recordings available:     _ 5736     _
 ________________________________________
 _ Features available:       _ 5736     _
 ________________________________________
 _ Supervisions available:   _ 5736     _
 ________________________________________
 SUPERVISION custom fields:
 Speech duration statistics:
 __________________________________________________________________
 _ Total speech duration        _ 08:58:13 _ 100.00% of recording _
 __________________________________________________________________
 _ Total speaking time duration _ 08:58:13 _ 100.00% of recording _
 __________________________________________________________________
 _ Total silence duration       _ 00:00:01 _ 0.00% of recording   _
 __________________________________________________________________
 ./data/spectrogram/libritts_cuts_dev-other.jsonl.gz statistics:
 ________________________________________
 _ Cuts count:               _ 4613     _
 ________________________________________
 _ Total duration (hh:mm:ss) _ 06:25:52 _
 ________________________________________
 _ mean                      _ 5.0      _
 ________________________________________
 _ std                       _ 4.1      _
 ________________________________________
 _ min                       _ 0.3      _
 ________________________________________
 _ 25%                       _ 2.2      _
 ________________________________________
 _ 50%                       _ 3.8      _
 ________________________________________
 _ 75%                       _ 6.5      _
 ________________________________________
 _ 99%                       _ 19.7     _
 ________________________________________
 _ 99.5%                     _ 24.5     _
 ________________________________________
 _ 99.9%                     _ 31.0     _
 ________________________________________
 _ max                       _ 32.6     _
 ________________________________________
 _ Recordings available:     _ 4613     _
 ________________________________________
 _ Features available:       _ 4613     _
 ________________________________________
 _ Supervisions available:   _ 4613     _
 ________________________________________
 SUPERVISION custom fields:
 Speech duration statistics:
 __________________________________________________________________
 _ Total speech duration        _ 06:25:52 _ 100.00% of recording _
 __________________________________________________________________
 _ Total speaking time duration _ 06:25:52 _ 100.00% of recording _
 __________________________________________________________________
 _ Total silence duration       _ 00:00:01 _ 0.00% of recording   _
 __________________________________________________________________
 ./data/spectrogram/libritts_cuts_test-clean.jsonl.gz statistics:
 ________________________________________
 _ Cuts count:               _ 4837     _
 ________________________________________
 _ Total duration (hh:mm:ss) _ 08:34:09 _
 ________________________________________
 _ mean                      _ 6.4      _
 ________________________________________
 _ std                       _ 5.1      _
 ________________________________________
 _ min                       _ 0.3      _
 ________________________________________
 _ 25%                       _ 2.4      _
 ________________________________________
 _ 50%                       _ 4.8      _
 ________________________________________
 _ 75%                       _ 8.9      _
 ________________________________________
 _ 99%                       _ 22.6     _
 ________________________________________
 _ 99.5%                     _ 24.4     _
 ________________________________________
 _ 99.9%                     _ 29.6     _
 ________________________________________
 _ max                       _ 36.7     _
 ________________________________________
 _ Recordings available:     _ 4837     _
 ________________________________________
 _ Features available:       _ 4837     _
 ________________________________________
 _ Supervisions available:   _ 4837     _
 ________________________________________
 SUPERVISION custom fields:
 Speech duration statistics:
 __________________________________________________________________
 _ Total speech duration        _ 08:34:09 _ 100.00% of recording _
 __________________________________________________________________
 _ Total speaking time duration _ 08:34:09 _ 100.00% of recording _
 __________________________________________________________________
 _ Total silence duration       _ 00:00:01 _ 0.00% of recording   _
 __________________________________________________________________
 ./data/spectrogram/libritts_cuts_test-other.jsonl.gz statistics:
 ________________________________________
 _ Cuts count:               _ 5120     _
 ________________________________________
 _ Total duration (hh:mm:ss) _ 06:41:31 _
 ________________________________________
 _ mean                      _ 4.7      _
 ________________________________________
 _ std                       _ 3.8      _
 ________________________________________
 _ min                       _ 0.3      _
 ________________________________________
 _ 25%                       _ 1.8      _
 ________________________________________
 _ 50%                       _ 3.6      _
 ________________________________________
 _ 75%                       _ 6.5      _
 ________________________________________
 _ 99%                       _ 17.8     _
 ________________________________________
 _ 99.5%                     _ 20.4     _
 ________________________________________
 _ 99.9%                     _ 23.8     _
 ________________________________________
 _ max                       _ 27.3     _
 ________________________________________
 _ Recordings available:     _ 5120     _
 ________________________________________
 _ Features available:       _ 5120     _
 ________________________________________
 _ Supervisions available:   _ 5120     _
 ________________________________________
 SUPERVISION custom fields:
 Speech duration statistics:
 __________________________________________________________________
 _ Total speech duration        _ 06:41:31 _ 100.00% of recording _
 __________________________________________________________________
 _ Total speaking time duration _ 06:41:31 _ 100.00% of recording _
 __________________________________________________________________
 _ Total silence duration       _ 00:00:01 _ 0.00% of recording   _
 __________________________________________________________________
 """
--- a/egs/libritts/CODEC/local/validate_manifest.py
+++ b/egs/libritts/CODEC/local/validate_manifest.py
@ -0,0 +1 @@
 ../../../ljspeech/TTS/local/validate_manifest.py
--- a/egs/libritts/CODEC/prepare.sh
+++ b/egs/libritts/CODEC/prepare.sh
@ -0,0 +1,87 @@
 #!/usr/bin/env bash
 # fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 # Abort on the first failing command (-e), on use of an unset variable (-u),
 # and when any command of a pipeline fails (-o pipefail).
 set -eou pipefail
 # Only stages whose number lies in [stage, stop_stage] are executed.
 stage=0
 stop_stage=100
 # Passed to ./local/compute_spectrogram_libritts.py as --sampling-rate.
 sampling_rate=24000
 # Parallel job count (presumably; the name follows the usual `nj` convention).
 nj=32
 # Directory the corpora are downloaded into (or symlinked from).
 dl_dir=$PWD/download
 # NOTE(review): presumably lets command-line flags override the defaults
 # above -- confirm against shared/parse_options.sh.
 . shared/parse_options.sh || exit 1
 # All files generated by this script are saved in "data".
 # You can safely remove "data" and rerun this script to regenerate it.
 mkdir -p data
 log() {
  # Timestamped logger (borrowed from espnet): prints
  # "<date> (<caller file>:<caller line>:<caller function>) <message>".
  local caller_file=${BASH_SOURCE[1]##*/}
  local caller_line=${BASH_LINENO[0]}
  local caller_func=${FUNCNAME[1]}
  local stamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${stamp} (${caller_file}:${caller_line}:${caller_func}) $*"
 }
 log "dl_dir: $dl_dir"
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Download data"
  # $dl_dir is derived from $PWD, so it is quoted everywhere below to keep
  # the tests and downloads working when the path contains whitespace.

  # If you have pre-downloaded it to /path/to/LibriTTS,
  # you can create a symlink
  #
  #   ln -sfv /path/to/LibriTTS $dl_dir/LibriTTS
  #
  if [ ! -d "$dl_dir/LibriTTS" ]; then
    lhotse download libritts "$dl_dir"
  fi
  # If you have pre-downloaded it to /path/to/musan,
  # you can create a symlink
  #
  #   ln -sfv /path/to/musan $dl_dir/musan
  #
  # NOTE(review): no later stage of this script reads $dl_dir/musan --
  # confirm whether this download is still needed for the CODEC recipe.
  if [ ! -d "$dl_dir/musan" ]; then
    lhotse download musan "$dl_dir"
  fi
 fi
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare LibriTTS manifest"
  # We assume that you have downloaded the LibriTTS corpus
  # to $dl_dir/LibriTTS
  mkdir -p data/manifests
  # The .libritts.done marker turns this stage into a no-op on reruns.
  if [ ! -e data/manifests/.libritts.done ]; then
    # Honour the configurable $nj (default 32) instead of a hard-coded job
    # count; $dl_dir is quoted so paths containing whitespace still work.
    lhotse prepare libritts --num-jobs "$nj" "$dl_dir/LibriTTS" data/manifests
    touch data/manifests/.libritts.done
  fi
 fi
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Compute Spectrogram for LibriTTS"
  mkdir -p data/spectrogram
  # The .libritts.done marker turns feature extraction into a no-op on reruns.
  if [ ! -e data/spectrogram/.libritts.done ]; then
    ./local/compute_spectrogram_libritts.py --sampling-rate $sampling_rate
    touch data/spectrogram/.libritts.done
  fi
  # Here we shuffle and combine the train-clean-100, train-clean-360 and
  # train-other-500 together to form the training set.
  if [ ! -f data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz ]; then
    # All three inputs live under data/spectrogram/. A wrong path here is not
    # caught by `set -e`/pipefail because gunzip runs inside a process
    # substitution, so the subset would silently be left out.
    cat <(gunzip -c data/spectrogram/libritts_cuts_train-clean-100.jsonl.gz) \
      <(gunzip -c data/spectrogram/libritts_cuts_train-clean-360.jsonl.gz) \
      <(gunzip -c data/spectrogram/libritts_cuts_train-other-500.jsonl.gz) | \
      shuf | gzip -c > data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz
  fi
  if [ ! -e data/spectrogram/.libritts-validated.done ]; then
    log "Validating data/spectrogram for LibriTTS"
    ./local/validate_manifest.py \
      data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz
    touch data/spectrogram/.libritts-validated.done
  fi
 fi
--- a/egs/libritts/CODEC/shared
+++ b/egs/libritts/CODEC/shared
@ -0,0 +1 @@
 ../../../icefall/shared/
		`@ -0,0 +1 @@`
							`../../../ljspeech/TTS/local/validate_manifest.py`