From 3b263539cd34fb14b53d72339bc7c095028f4578 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Thu, 2 Jan 2025 15:54:34 +0800
Subject: [PATCH 1/2] Publish MatchaTTS onnx models trained with LJSpeech to
 huggingface (#1854)

---
 .github/scripts/docker/Dockerfile          |  2 +-
 .github/scripts/ljspeech/TTS/run-matcha.sh | 33 +++++++++-
 .github/workflows/ljspeech.yml             | 74 +++++++++++++++++++++-
 egs/ljspeech/TTS/README.md                 |  9 +++
 egs/ljspeech/TTS/matcha/export_onnx.py     |  4 ++
 5 files changed, 118 insertions(+), 4 deletions(-)

diff --git a/.github/scripts/docker/Dockerfile b/.github/scripts/docker/Dockerfile
index 94e8d8e1e..cf0523401 100644
--- a/.github/scripts/docker/Dockerfile
+++ b/.github/scripts/docker/Dockerfile
@@ -49,7 +49,7 @@ RUN pip install --no-cache-dir \
       kaldifst \
       kaldilm \
       librosa \
-      matplotlib \
+      "matplotlib<=3.9.4" \
       multi_quantization \
       numba \
       "numpy<2.0" \
diff --git a/.github/scripts/ljspeech/TTS/run-matcha.sh b/.github/scripts/ljspeech/TTS/run-matcha.sh
index 954dd5bd8..bfb37fb6d 100755
--- a/.github/scripts/ljspeech/TTS/run-matcha.sh
+++ b/.github/scripts/ljspeech/TTS/run-matcha.sh
@@ -77,7 +77,7 @@ function export_onnx() {
   popd

   pushd data/fbank
-  rm -v *.json
+  rm -fv *.json
   curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28/resolve/main/data/cmvn.json
   popd
@@ -115,6 +115,37 @@ function export_onnx() {
   ls -lh /icefall/*.wav
   soxi /icefall/generated-matcha-tts-steps-6-*.wav
+
+  cp ./model-steps-*.onnx /icefall
+
+  d=matcha-icefall-en_US-ljspeech
+  mkdir $d
+  cp -v data/tokens.txt $d
+  cp model-steps-3.onnx $d
+  pushd $d
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
+  tar xf espeak-ng-data.tar.bz2
+  rm espeak-ng-data.tar.bz2
+
+cat >README.md <
+ To export the Hifigan vocoder to onnx, please use: diff --git a/egs/ljspeech/TTS/matcha/export_onnx.py b/egs/ljspeech/TTS/matcha/export_onnx.py index 39709cc36..3c653fbf1 100755 --- a/egs/ljspeech/TTS/matcha/export_onnx.py +++ b/egs/ljspeech/TTS/matcha/export_onnx.py @@ -176,12 +176,16 @@ def main(): "language": "English", "voice": "en-us", "has_espeak": 1, + "jieba": 0, "n_speakers": 1, "sample_rate": 22050, "version": 1, + "pad_id": tokenizer.pad_id, "model_author": "icefall", "maintainer": "k2-fsa", + "use_eos_bos": 1, "dataset": "LJ Speech", + "dataset_url": "https://keithito.com/LJ-Speech-Dataset/", "num_ode_steps": num_steps, } add_meta_data(filename=filename, meta_data=meta_data) From 3b6d54007b7b9d0f2ee28ced3d91caed773ae3c1 Mon Sep 17 00:00:00 2001 From: Seonuk Kim <49300300+snkii@users.noreply.github.com> Date: Mon, 6 Jan 2025 14:17:02 +0900 Subject: [PATCH 2/2] Update conformer.py (#1857) * Update conformer.py feedforward dimention -> feedforward dimension * Update conformer.py feedforward dimention -> feedforward dimension * Update conformer.py feedforward dimention -> feedforward dimension * Update conformer.py feedforward dimention -> feedforward dimension * Update conformer.py feedforward dimention -> feedforward dimension * Update conformer.py feedforward dimention -> feedforward dimension * Update conformer.py feedforward dimention -> feedforward dimension * Update conformer.py feedforward dimention -> feedforward dimension * Update conformer.py feedforward dimention -> feedforward dimension --- egs/librispeech/ASR/conformer_ctc/conformer.py | 2 +- egs/librispeech/ASR/conformer_ctc2/conformer.py | 2 +- egs/librispeech/ASR/conformer_mmi/conformer.py | 2 +- egs/librispeech/ASR/pruned2_knowledge/conformer.py | 2 +- egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py | 2 +- egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py | 2 +- egs/librispeech/ASR/pruned_transducer_stateless6/conformer.py | 2 +- egs/librispeech/ASR/streaming_conformer_ctc/conformer.py | 2 +- egs/librispeech/ASR/transducer_stateless/conformer.py | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/egs/librispeech/ASR/conformer_ctc/conformer.py b/egs/librispeech/ASR/conformer_ctc/conformer.py index a1cfe6e75..3ac60e32f 100644 --- a/egs/librispeech/ASR/conformer_ctc/conformer.py +++ b/egs/librispeech/ASR/conformer_ctc/conformer.py @@ -32,7 +32,7 @@ class Conformer(Transformer): subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers) d_model (int): attention dimension nhead (int): number of head - dim_feedforward (int): feedforward dimention + dim_feedforward (int): feedforward dimension num_encoder_layers (int): number of encoder layers num_decoder_layers (int): number of decoder layers dropout (float): dropout rate diff --git a/egs/librispeech/ASR/conformer_ctc2/conformer.py b/egs/librispeech/ASR/conformer_ctc2/conformer.py index 09f1eb000..02ea80a46 100644 --- a/egs/librispeech/ASR/conformer_ctc2/conformer.py +++ b/egs/librispeech/ASR/conformer_ctc2/conformer.py @@ -42,7 +42,7 @@ class Conformer(Transformer): subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers) d_model (int): attention dimension, also the output dimension nhead (int): number of head - dim_feedforward (int): feedforward dimention + dim_feedforward (int): feedforward dimension num_encoder_layers (int): number of encoder layers num_decoder_layers (int): number of decoder layers dropout (float): dropout rate diff 
diff --git a/egs/librispeech/ASR/conformer_mmi/conformer.py b/egs/librispeech/ASR/conformer_mmi/conformer.py
index 53e48eb13..cffe3df28 100644
--- a/egs/librispeech/ASR/conformer_mmi/conformer.py
+++ b/egs/librispeech/ASR/conformer_mmi/conformer.py
@@ -33,7 +33,7 @@ class Conformer(Transformer):
         subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
         d_model (int): attention dimension
         nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
         num_encoder_layers (int): number of encoder layers
         num_decoder_layers (int): number of decoder layers
         dropout (float): dropout rate
diff --git a/egs/librispeech/ASR/pruned2_knowledge/conformer.py b/egs/librispeech/ASR/pruned2_knowledge/conformer.py
index de367c234..69cc59756 100644
--- a/egs/librispeech/ASR/pruned2_knowledge/conformer.py
+++ b/egs/librispeech/ASR/pruned2_knowledge/conformer.py
@@ -42,7 +42,7 @@ class Conformer(EncoderInterface):
         subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
         d_model (int): attention dimension, also the output dimension
         nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
         num_encoder_layers (int): number of encoder layers
         dropout (float): dropout rate
         layer_dropout (float): layer-dropout rate.
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py
index ab46e233b..85e61ebab 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py
@@ -42,7 +42,7 @@ class Conformer(EncoderInterface):
         subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
         d_model (int): attention dimension, also the output dimension
         nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
         num_encoder_layers (int): number of encoder layers
         dropout (float): dropout rate
         layer_dropout (float): layer-dropout rate.
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py
index 8bbceec61..968ea4150 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py
@@ -42,7 +42,7 @@ class Conformer(EncoderInterface):
         subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
         d_model (int): attention dimension, also the output dimension
         nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
         num_encoder_layers (int): number of encoder layers
         dropout (float): dropout rate
         layer_dropout (float): layer-dropout rate.
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless6/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless6/conformer.py
index 0667e7f61..8c1529500 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless6/conformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless6/conformer.py
@@ -42,7 +42,7 @@ class Conformer(EncoderInterface):
         subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
         d_model (int): attention dimension, also the output dimension
         nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
         num_encoder_layers (int): number of encoder layers
         dropout (float): dropout rate
         layer_dropout (float): layer-dropout rate.
diff --git a/egs/librispeech/ASR/streaming_conformer_ctc/conformer.py b/egs/librispeech/ASR/streaming_conformer_ctc/conformer.py
index 0b982f4bf..72842cc28 100644
--- a/egs/librispeech/ASR/streaming_conformer_ctc/conformer.py
+++ b/egs/librispeech/ASR/streaming_conformer_ctc/conformer.py
@@ -69,7 +69,7 @@ class Conformer(Transformer):
         subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
         d_model (int): attention dimension
         nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
         num_encoder_layers (int): number of encoder layers
         num_decoder_layers (int): number of decoder layers
         dropout (float): dropout rate
diff --git a/egs/librispeech/ASR/transducer_stateless/conformer.py b/egs/librispeech/ASR/transducer_stateless/conformer.py
index 90b722bde..9b11df673 100644
--- a/egs/librispeech/ASR/transducer_stateless/conformer.py
+++ b/egs/librispeech/ASR/transducer_stateless/conformer.py
@@ -35,7 +35,7 @@ class Conformer(Transformer):
         subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
         d_model (int): attention dimension
         nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
         num_encoder_layers (int): number of encoder layers
         dropout (float): dropout rate
         cnn_module_kernel (int): Kernel size of convolution module
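
Not part of the patches above: a minimal sketch of how the metadata keys that export_onnx.py now embeds (jieba, pad_id, use_eos_bos, dataset_url) can be read back from an exported model. It assumes a locally exported model-steps-3.onnx, as packaged by run-matcha.sh; the file name and key names come from the diffs above, everything else is illustrative.

    # check_matcha_metadata.py -- inspect the custom metadata of an exported MatchaTTS model.
    import onnxruntime as ort

    # "model-steps-3.onnx" is an assumed local path; any model-steps-*.onnx
    # written by egs/ljspeech/TTS/matcha/export_onnx.py should work here.
    sess = ort.InferenceSession("model-steps-3.onnx", providers=["CPUExecutionProvider"])

    # custom_metadata_map is a plain dict of str -> str holding the keys set via add_meta_data().
    meta = sess.get_modelmeta().custom_metadata_map
    for key in ("language", "voice", "has_espeak", "jieba", "pad_id",
                "use_eos_bos", "dataset", "dataset_url", "num_ode_steps"):
        print(f"{key} = {meta.get(key, '<missing>')}")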