From 3b263539cd34fb14b53d72339bc7c095028f4578 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Thu, 2 Jan 2025 15:54:34 +0800
Subject: [PATCH 1/2] Publish MatchaTTS onnx models trained with LJSpeech to
 huggingface (#1854)

---
 .github/scripts/docker/Dockerfile          |  2 +-
 .github/scripts/ljspeech/TTS/run-matcha.sh | 33 +++++++++-
 .github/workflows/ljspeech.yml             | 74 +++++++++++++++++++++-
 egs/ljspeech/TTS/README.md                 |  9 +++
 egs/ljspeech/TTS/matcha/export_onnx.py     |  4 ++
 5 files changed, 118 insertions(+), 4 deletions(-)

diff --git a/.github/scripts/docker/Dockerfile b/.github/scripts/docker/Dockerfile
index 94e8d8e1e..cf0523401 100644
--- a/.github/scripts/docker/Dockerfile
+++ b/.github/scripts/docker/Dockerfile
@@ -49,7 +49,7 @@ RUN pip install --no-cache-dir \
       kaldifst \
       kaldilm \
       librosa \
-      matplotlib \
+      "matplotlib<=3.9.4" \
       multi_quantization \
       numba \
       "numpy<2.0" \
diff --git a/.github/scripts/ljspeech/TTS/run-matcha.sh b/.github/scripts/ljspeech/TTS/run-matcha.sh
index 954dd5bd8..bfb37fb6d 100755
--- a/.github/scripts/ljspeech/TTS/run-matcha.sh
+++ b/.github/scripts/ljspeech/TTS/run-matcha.sh
@@ -77,7 +77,7 @@ function export_onnx() {
   popd

   pushd data/fbank
-  rm -v *.json
+  rm -fv *.json
   curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28/resolve/main/data/cmvn.json
   popd
@@ -115,6 +115,37 @@ function export_onnx() {
   ls -lh /icefall/*.wav
   soxi /icefall/generated-matcha-tts-steps-6-*.wav
+
+  cp ./model-steps-*.onnx /icefall
+
+  d=matcha-icefall-en_US-ljspeech
+  mkdir $d
+  cp -v data/tokens.txt $d
+  cp model-steps-3.onnx $d
+  pushd $d
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
+  tar xf espeak-ng-data.tar.bz2
+  rm espeak-ng-data.tar.bz2
+
+cat >README.md <
+ To export the Hifigan vocoder to onnx, please use: diff --git a/egs/ljspeech/TTS/matcha/export_onnx.py b/egs/ljspeech/TTS/matcha/export_onnx.py index 39709cc36..3c653fbf1 100755 --- a/egs/ljspeech/TTS/matcha/export_onnx.py +++ b/egs/ljspeech/TTS/matcha/export_onnx.py @@ -176,12 +176,16 @@ def main(): "language": "English", "voice": "en-us", "has_espeak": 1, + "jieba": 0, "n_speakers": 1, "sample_rate": 22050, "version": 1, + "pad_id": tokenizer.pad_id, "model_author": "icefall", "maintainer": "k2-fsa", + "use_eos_bos": 1, "dataset": "LJ Speech", + "dataset_url": "https://keithito.com/LJ-Speech-Dataset/", "num_ode_steps": num_steps, } add_meta_data(filename=filename, meta_data=meta_data) From 3b6d54007b7b9d0f2ee28ced3d91caed773ae3c1 Mon Sep 17 00:00:00 2001 From: Seonuk Kim <49300300+snkii@users.noreply.github.com> Date: Mon, 6 Jan 2025 14:17:02 +0900 Subject: [PATCH 2/2] Update conformer.py (#1857) * Update conformer.py feedforward dimention -> feedforward dimension * Update conformer.py feedforward dimention -> feedforward dimension * Update conformer.py feedforward dimention -> feedforward dimension * Update conformer.py feedforward dimention -> feedforward dimension * Update conformer.py feedforward dimention -> feedforward dimension * Update conformer.py feedforward dimention -> feedforward dimension * Update conformer.py feedforward dimention -> feedforward dimension * Update conformer.py feedforward dimention -> feedforward dimension * Update conformer.py feedforward dimention -> feedforward dimension --- egs/librispeech/ASR/conformer_ctc/conformer.py | 2 +- egs/librispeech/ASR/conformer_ctc2/conformer.py | 2 +- egs/librispeech/ASR/conformer_mmi/conformer.py | 2 +- egs/librispeech/ASR/pruned2_knowledge/conformer.py | 2 +- egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py | 2 +- egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py | 2 +- egs/librispeech/ASR/pruned_transducer_stateless6/conformer.py | 2 +- egs/librispeech/ASR/streaming_conformer_ctc/conformer.py | 2 +- egs/librispeech/ASR/transducer_stateless/conformer.py | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/egs/librispeech/ASR/conformer_ctc/conformer.py b/egs/librispeech/ASR/conformer_ctc/conformer.py index a1cfe6e75..3ac60e32f 100644 --- a/egs/librispeech/ASR/conformer_ctc/conformer.py +++ b/egs/librispeech/ASR/conformer_ctc/conformer.py @@ -32,7 +32,7 @@ class Conformer(Transformer): subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers) d_model (int): attention dimension nhead (int): number of head - dim_feedforward (int): feedforward dimention + dim_feedforward (int): feedforward dimension num_encoder_layers (int): number of encoder layers num_decoder_layers (int): number of decoder layers dropout (float): dropout rate diff --git a/egs/librispeech/ASR/conformer_ctc2/conformer.py b/egs/librispeech/ASR/conformer_ctc2/conformer.py index 09f1eb000..02ea80a46 100644 --- a/egs/librispeech/ASR/conformer_ctc2/conformer.py +++ b/egs/librispeech/ASR/conformer_ctc2/conformer.py @@ -42,7 +42,7 @@ class Conformer(Transformer): subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers) d_model (int): attention dimension, also the output dimension nhead (int): number of head - dim_feedforward (int): feedforward dimention + dim_feedforward (int): feedforward dimension num_encoder_layers (int): number of encoder layers num_decoder_layers (int): number of decoder layers dropout (float): dropout rate diff 
diff --git a/egs/librispeech/ASR/conformer_mmi/conformer.py b/egs/librispeech/ASR/conformer_mmi/conformer.py
index 53e48eb13..cffe3df28 100644
--- a/egs/librispeech/ASR/conformer_mmi/conformer.py
+++ b/egs/librispeech/ASR/conformer_mmi/conformer.py
@@ -33,7 +33,7 @@ class Conformer(Transformer):
         subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
         d_model (int): attention dimension
         nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
         num_encoder_layers (int): number of encoder layers
         num_decoder_layers (int): number of decoder layers
         dropout (float): dropout rate
diff --git a/egs/librispeech/ASR/pruned2_knowledge/conformer.py b/egs/librispeech/ASR/pruned2_knowledge/conformer.py
index de367c234..69cc59756 100644
--- a/egs/librispeech/ASR/pruned2_knowledge/conformer.py
+++ b/egs/librispeech/ASR/pruned2_knowledge/conformer.py
@@ -42,7 +42,7 @@ class Conformer(EncoderInterface):
         subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
         d_model (int): attention dimension, also the output dimension
         nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
         num_encoder_layers (int): number of encoder layers
         dropout (float): dropout rate
         layer_dropout (float): layer-dropout rate.
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py
index ab46e233b..85e61ebab 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py
@@ -42,7 +42,7 @@ class Conformer(EncoderInterface):
         subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
         d_model (int): attention dimension, also the output dimension
         nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
         num_encoder_layers (int): number of encoder layers
         dropout (float): dropout rate
         layer_dropout (float): layer-dropout rate.
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py
index 8bbceec61..968ea4150 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py
@@ -42,7 +42,7 @@ class Conformer(EncoderInterface):
         subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
         d_model (int): attention dimension, also the output dimension
         nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
         num_encoder_layers (int): number of encoder layers
         dropout (float): dropout rate
         layer_dropout (float): layer-dropout rate.
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless6/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless6/conformer.py
index 0667e7f61..8c1529500 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless6/conformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless6/conformer.py
@@ -42,7 +42,7 @@ class Conformer(EncoderInterface):
         subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
         d_model (int): attention dimension, also the output dimension
         nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
         num_encoder_layers (int): number of encoder layers
         dropout (float): dropout rate
         layer_dropout (float): layer-dropout rate.
diff --git a/egs/librispeech/ASR/streaming_conformer_ctc/conformer.py b/egs/librispeech/ASR/streaming_conformer_ctc/conformer.py
index 0b982f4bf..72842cc28 100644
--- a/egs/librispeech/ASR/streaming_conformer_ctc/conformer.py
+++ b/egs/librispeech/ASR/streaming_conformer_ctc/conformer.py
@@ -69,7 +69,7 @@ class Conformer(Transformer):
         subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
         d_model (int): attention dimension
         nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
         num_encoder_layers (int): number of encoder layers
         num_decoder_layers (int): number of decoder layers
         dropout (float): dropout rate
diff --git a/egs/librispeech/ASR/transducer_stateless/conformer.py b/egs/librispeech/ASR/transducer_stateless/conformer.py
index 90b722bde..9b11df673 100644
--- a/egs/librispeech/ASR/transducer_stateless/conformer.py
+++ b/egs/librispeech/ASR/transducer_stateless/conformer.py
@@ -35,7 +35,7 @@ class Conformer(Transformer):
         subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
         d_model (int): attention dimension
         nhead (int): number of head
-        dim_feedforward (int): feedforward dimention
+        dim_feedforward (int): feedforward dimension
         num_encoder_layers (int): number of encoder layers
         dropout (float): dropout rate
         cnn_module_kernel (int): Kernel size of convolution module
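
Not part of the patches above: a minimal sketch of how the metadata keys that export_onnx.py now embeds (jieba, pad_id, use_eos_bos, dataset_url) can be read back from an exported model. It assumes a locally exported model-steps-3.onnx, as packaged by run-matcha.sh; the file name and key names come from the diffs above, everything else is illustrative.

    # check_matcha_metadata.py -- inspect the custom metadata of an exported MatchaTTS model.
    import onnxruntime as ort

    # "model-steps-3.onnx" is an assumed local path; any model-steps-*.onnx
    # written by egs/ljspeech/TTS/matcha/export_onnx.py should work here.
    sess = ort.InferenceSession("model-steps-3.onnx", providers=["CPUExecutionProvider"])

    # custom_metadata_map is a plain dict of str -> str holding the keys set via add_meta_data().
    meta = sess.get_modelmeta().custom_metadata_map
    for key in ("language", "voice", "has_espeak", "jieba", "pad_id",
                "use_eos_bos", "dataset", "dataset_url", "num_ode_steps"):
        print(f"{key} = {meta.get(key, '<missing>')}")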