Merge branch 'k2-fsa:master' into dev/k2ssl

2025-08-09 01:52:41 +00:00 · 2025-01-06 17:30:59 +08:00 · 2025-01-06 17:30:59 +08:00 · ab44ac0f9e
commit ab44ac0f9e
parent c64a9bac05 3b6d54007b
14 changed files with 127 additions and 13 deletions
--- a/.github/scripts/docker/Dockerfile
+++ b/.github/scripts/docker/Dockerfile
@@ -49,7 +49,7 @@ RUN pip install --no-cache-dir \
      kaldifst \
      kaldilm \
      librosa \
-      matplotlib \
+      "matplotlib<=3.9.4" \
      multi_quantization \
      numba \
      "numpy<2.0" \
--- a/.github/scripts/ljspeech/TTS/run-matcha.sh
+++ b/.github/scripts/ljspeech/TTS/run-matcha.sh
@@ -77,7 +77,7 @@ function export_onnx() {
  popd
  pushd data/fbank
-  rm -v *.json
+  rm -fv *.json
  curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28/resolve/main/data/cmvn.json
  popd
@@ -115,6 +115,37 @@ function export_onnx() {
  ls -lh /icefall/*.wav
  soxi /icefall/generated-matcha-tts-steps-6-*.wav
  cp ./model-steps-*.onnx /icefall
  d=matcha-icefall-en_US-ljspeech
  mkdir $d
  cp -v data/tokens.txt $d
  cp model-steps-3.onnx $d
  pushd $d
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
  tar xf espeak-ng-data.tar.bz2
  rm espeak-ng-data.tar.bz2
 cat >README.md <<EOF
 # Introduction
 This model is trained using the dataset from
 https://keithito.com/LJ-Speech-Dataset/
 The dataset contains only 1 female speaker.
 You can find the training code at
 https://github.com/k2-fsa/icefall/tree/master/egs/ljspeech/TTS#matcha
 EOF
  ls -lh
  popd
  tar cvjf $d.tar.bz2 $d
  mv $d.tar.bz2 /icefall
  mv $d /icefall
 }
 prepare_data
--- a/.github/workflows/ljspeech.yml
+++ b/.github/workflows/ljspeech.yml
@@ -30,8 +30,8 @@ jobs:
        id: set-matrix
        run: |
          # outputting for debugging purposes
-          python ./.github/scripts/docker/generate_build_matrix.py
+          python ./.github/scripts/docker/generate_build_matrix.py --min-torch-version "2.3"
-          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
+          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --min-torch-version "2.3")
          echo "::set-output name=matrix::${MATRIX}"
  ljspeech:
@@ -70,6 +70,10 @@ jobs:
              cd /icefall
              git config --global --add safe.directory /icefall
              pip install "matplotlib<=3.9.4"
              pip list
              .github/scripts/ljspeech/TTS/run-matcha.sh
              .github/scripts/ljspeech/TTS/run.sh
@@ -94,3 +98,69 @@ jobs:
          repo_name: k2-fsa/sherpa-onnx
          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
          tag: tts-models
      - uses: actions/upload-artifact@v4
        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
        with:
          name: step-2
          path: ./model-steps-2.onnx
      - uses: actions/upload-artifact@v4
        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
        with:
          name: step-3
          path: ./model-steps-3.onnx
      - uses: actions/upload-artifact@v4
        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
        with:
          name: step-4
          path: ./model-steps-4.onnx
      - uses: actions/upload-artifact@v4
        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
        with:
          name: step-5
          path: ./model-steps-5.onnx
      - uses: actions/upload-artifact@v4
        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
        with:
          name: step-6
          path: ./model-steps-6.onnx
      - name: Upload models to huggingface
        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
        shell: bash
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          d=matcha-icefall-en_US-ljspeech
          GIT_LFS_SKIP_SMUDGE=1  git clone https://huggingface.co/csukuangfj/$d hf
          cp -av $d/* hf/
          pushd hf
          git lfs track "cmn_dict"
          git lfs track "ru_dict"
          git add .
          git config --global user.name "csukuangfj"
          git config --global user.email "csukuangfj@gmail.com"
          git config --global lfs.allowincompletepush true
          git commit -m "upload model" && git push https://csukuangfj:${HF_TOKEN}@huggingface.co/csukuangfj/$d main || true
          popd
      - name: Release exported onnx models
        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
        uses: svenstaro/upload-release-action@v2
        with:
          file_glob: true
          overwrite: true
          file: matcha-icefall-*.tar.bz2
          repo_name: k2-fsa/sherpa-onnx
          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
          tag: tts-models
--- a/egs/librispeech/ASR/conformer_ctc/conformer.py
+++ b/egs/librispeech/ASR/conformer_ctc/conformer.py
@@ -32,7 +32,7 @@ class Conformer(Transformer):
        subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
        d_model (int): attention dimension
        nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
        num_encoder_layers (int): number of encoder layers
        num_decoder_layers (int): number of decoder layers
        dropout (float): dropout rate
--- a/egs/librispeech/ASR/conformer_ctc2/conformer.py
+++ b/egs/librispeech/ASR/conformer_ctc2/conformer.py
@@ -42,7 +42,7 @@ class Conformer(Transformer):
        subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
        d_model (int): attention dimension, also the output dimension
        nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
        num_encoder_layers (int): number of encoder layers
        num_decoder_layers (int): number of decoder layers
        dropout (float): dropout rate
--- a/egs/librispeech/ASR/conformer_mmi/conformer.py
+++ b/egs/librispeech/ASR/conformer_mmi/conformer.py
@@ -33,7 +33,7 @@ class Conformer(Transformer):
        subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
        d_model (int): attention dimension
        nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
        num_encoder_layers (int): number of encoder layers
        num_decoder_layers (int): number of decoder layers
        dropout (float): dropout rate
--- a/egs/librispeech/ASR/pruned2_knowledge/conformer.py
+++ b/egs/librispeech/ASR/pruned2_knowledge/conformer.py
@@ -42,7 +42,7 @@ class Conformer(EncoderInterface):
        subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
        d_model (int): attention dimension, also the output dimension
        nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
        num_encoder_layers (int): number of encoder layers
        dropout (float): dropout rate
        layer_dropout (float): layer-dropout rate.
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py
@@ -42,7 +42,7 @@ class Conformer(EncoderInterface):
        subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
        d_model (int): attention dimension, also the output dimension
        nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
        num_encoder_layers (int): number of encoder layers
        dropout (float): dropout rate
        layer_dropout (float): layer-dropout rate.
--- a/egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py
@@ -42,7 +42,7 @@ class Conformer(EncoderInterface):
        subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
        d_model (int): attention dimension, also the output dimension
        nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
        num_encoder_layers (int): number of encoder layers
        dropout (float): dropout rate
        layer_dropout (float): layer-dropout rate.
--- a/egs/librispeech/ASR/pruned_transducer_stateless6/conformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless6/conformer.py
@@ -42,7 +42,7 @@ class Conformer(EncoderInterface):
        subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
        d_model (int): attention dimension, also the output dimension
        nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
        num_encoder_layers (int): number of encoder layers
        dropout (float): dropout rate
        layer_dropout (float): layer-dropout rate.
--- a/egs/librispeech/ASR/streaming_conformer_ctc/conformer.py
+++ b/egs/librispeech/ASR/streaming_conformer_ctc/conformer.py
@@ -69,7 +69,7 @@ class Conformer(Transformer):
        subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
        d_model (int): attention dimension
        nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
        num_encoder_layers (int): number of encoder layers
        num_decoder_layers (int): number of decoder layers
        dropout (float): dropout rate
--- a/egs/librispeech/ASR/transducer_stateless/conformer.py
+++ b/egs/librispeech/ASR/transducer_stateless/conformer.py
@@ -35,7 +35,7 @@ class Conformer(Transformer):
        subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
        d_model (int): attention dimension
        nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
        num_encoder_layers (int): number of encoder layers
        dropout (float): dropout rate
        cnn_module_kernel (int): Kernel size of convolution module
--- a/egs/ljspeech/TTS/README.md
+++ b/egs/ljspeech/TTS/README.md
@@ -176,6 +176,15 @@ The above command generates the following files:
 where the 2 in `model-steps-2.onnx` means it uses 2 steps for the ODE solver.
 **HINT**: If you get the following error while running `export_onnx.py`:
 ```
 torch.onnx.errors.UnsupportedOperatorError: Exporting the operator
 'aten::scaled_dot_product_attention' to ONNX opset version 14 is not supported.
 ```
 please use `torch>=2.2.0`.
 To export the Hifigan vocoder to onnx, please use:
--- a/egs/ljspeech/TTS/matcha/export_onnx.py
+++ b/egs/ljspeech/TTS/matcha/export_onnx.py
@@ -176,12 +176,16 @@ def main():
            "language": "English",
            "voice": "en-us",
            "has_espeak": 1,
            "jieba": 0,
            "n_speakers": 1,
            "sample_rate": 22050,
            "version": 1,
            "pad_id": tokenizer.pad_id,
            "model_author": "icefall",
            "maintainer": "k2-fsa",
            "use_eos_bos": 1,
            "dataset": "LJ Speech",
            "dataset_url": "https://keithito.com/LJ-Speech-Dataset/",
            "num_ode_steps": num_steps,
        }
        add_meta_data(filename=filename, meta_data=meta_data)