diff --git a/.github/scripts/docker/generate_build_matrix.py b/.github/scripts/docker/generate_build_matrix.py
index 9c53a38df..c5a1a54cb 100755
--- a/.github/scripts/docker/generate_build_matrix.py
+++ b/.github/scripts/docker/generate_build_matrix.py
@@ -45,13 +45,13 @@ def get_torchaudio_version(torch_version):
 def get_matrix():
     k2_version = "1.24.4.dev20241029"
     kaldifeat_version = "1.25.5.dev20241029"
-    version = "20241029"
+    version = "20241218"

     # torchaudio 2.5.0 does not support python 3.13
     python_version = ["3.8", "3.9", "3.10", "3.11", "3.12"]
     torch_version = []
-    # torch_version += ["1.13.0", "1.13.1"]
-    # torch_version += ["2.0.0", "2.0.1"]
+    torch_version += ["1.13.0", "1.13.1"]
+    torch_version += ["2.0.0", "2.0.1"]
     # torch_version += ["2.1.0", "2.1.1", "2.1.2"]
     # torch_version += ["2.2.0", "2.2.1", "2.2.2"]
     # Test only torch >= 2.3.0
@@ -59,6 +59,7 @@ def get_matrix():
     torch_version += ["2.4.0"]
     torch_version += ["2.4.1"]
     torch_version += ["2.5.0"]
+    torch_version += ["2.5.1"]

     matrix = []
     for p in python_version:
@@ -79,8 +80,12 @@ def get_matrix():
                 # torch>=2.5 requires python 3.10
                 continue

-            k2_version_2 = k2_version
-            kaldifeat_version_2 = kaldifeat_version
+            if t == "2.5.1":
+                k2_version_2 = "1.24.4.dev20241122"
+                kaldifeat_version_2 = "1.25.5.dev20241126"
+            else:
+                k2_version_2 = k2_version
+                kaldifeat_version_2 = kaldifeat_version

             matrix.append(
                 {
diff --git a/.github/scripts/ljspeech/TTS/run-matcha.sh b/.github/scripts/ljspeech/TTS/run-matcha.sh
index 0876cb47f..352d685a0 100755
--- a/.github/scripts/ljspeech/TTS/run-matcha.sh
+++ b/.github/scripts/ljspeech/TTS/run-matcha.sh
@@ -57,6 +57,7 @@ function infer() {
   curl -SL -O https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1

   ./matcha/infer.py \
+    --num-buckets 2 \
     --epoch 1 \
     --exp-dir ./matcha/exp \
     --tokens data/tokens.txt \
@@ -97,19 +98,23 @@ function export_onnx() {
     python3 ./matcha/export_onnx_hifigan.py
   else
     curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28/resolve/main/exp/hifigan_v1.onnx
+    curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28/resolve/main/exp/hifigan_v2.onnx
+    curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28/resolve/main/exp/hifigan_v3.onnx
   fi

   ls -lh *.onnx

-  python3 ./matcha/onnx_pretrained.py \
-    --acoustic-model ./model-steps-6.onnx \
-    --vocoder ./hifigan_v1.onnx \
-    --tokens ./data/tokens.txt \
-    --input-text "how are you doing?" \
-    --output-wav /icefall/generated-matcha-tts-steps-6-v1.wav
+  for v in v1 v2 v3; do
+    python3 ./matcha/onnx_pretrained.py \
+      --acoustic-model ./model-steps-6.onnx \
+      --vocoder ./hifigan_$v.onnx \
+      --tokens ./data/tokens.txt \
+      --input-text "how are you doing?" \
+      --output-wav /icefall/generated-matcha-tts-steps-6-$v.wav
+  done

   ls -lh /icefall/*.wav
-  soxi /icefall/generated-matcha-tts-steps-6-v1.wav
+  soxi /icefall/generated-matcha-tts-steps-6-*.wav
 }

 prepare_data
@@ -118,3 +123,4 @@ infer
 export_onnx

 rm -rfv generator_v* matcha/exp
+git checkout .
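A note on the `generate_build_matrix.py` change above: torch 2.5.1 gets its own k2/kaldifeat dev wheels, while every other torch release keeps the default versions. A minimal sketch of the same pinning logic written as a lookup table, so that future one-off pins only need a new dict entry; the `PINNED_DEPS` name and `resolve_versions` helper are hypothetical and not part of this patch:

```python
# Hypothetical sketch of the per-torch-version pinning introduced above,
# written as a lookup table instead of an if/else chain.
PINNED_DEPS = {
    # torch version -> (k2 wheel version, kaldifeat wheel version)
    "2.5.1": ("1.24.4.dev20241122", "1.25.5.dev20241126"),
}


def resolve_versions(torch_ver: str, k2_default: str, kaldifeat_default: str):
    """Return the (k2, kaldifeat) versions to install for a torch release."""
    return PINNED_DEPS.get(torch_ver, (k2_default, kaldifeat_default))


# Mirrors the patched branch: torch 2.5.1 resolves to the newer dev wheels,
# everything else falls back to the defaults declared in get_matrix().
assert resolve_versions("2.5.1", "1.24.4.dev20241029", "1.25.5.dev20241029") == (
    "1.24.4.dev20241122",
    "1.25.5.dev20241126",
)
```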
diff --git a/egs/ljspeech/TTS/matcha/export_onnx.py b/egs/ljspeech/TTS/matcha/export_onnx.py
index 487ea2995..623517431 100755
--- a/egs/ljspeech/TTS/matcha/export_onnx.py
+++ b/egs/ljspeech/TTS/matcha/export_onnx.py
@@ -163,7 +163,7 @@ def main():
         (x, x_lengths, temperature, length_scale),
         filename,
         opset_version=opset_version,
-        input_names=["x", "x_length", "temperature", "length_scale"],
+        input_names=["x", "x_length", "noise_scale", "length_scale"],
         output_names=["mel"],
         dynamic_axes={
             "x": {0: "N", 1: "L"},
diff --git a/egs/ljspeech/TTS/matcha/onnx_pretrained.py b/egs/ljspeech/TTS/matcha/onnx_pretrained.py
index 4eff9a084..6d92b16eb 100755
--- a/egs/ljspeech/TTS/matcha/onnx_pretrained.py
+++ b/egs/ljspeech/TTS/matcha/onnx_pretrained.py
@@ -89,6 +89,7 @@ class OnnxHifiGANModel:
                 self.model.get_inputs()[0].name: x.numpy(),
             },
         )[0]
+        # audio: (batch_size, num_samples)

         return torch.from_numpy(audio)

@@ -97,19 +98,24 @@ class OnnxModel:
     def __init__(
         self,
         filename: str,
+        tokens: str,
     ):
         session_opts = ort.SessionOptions()
         session_opts.inter_op_num_threads = 1
         session_opts.intra_op_num_threads = 2

         self.session_opts = session_opts
-        self.tokenizer = Tokenizer("./data/tokens.txt")
+        self.tokenizer = Tokenizer(tokens)
         self.model = ort.InferenceSession(
             filename,
             sess_options=self.session_opts,
             providers=["CPUExecutionProvider"],
         )

+        logging.info(f"{self.model.get_modelmeta().custom_metadata_map}")
+        metadata = self.model.get_modelmeta().custom_metadata_map
+        self.sample_rate = int(metadata["sample_rate"])
+
         for i in self.model.get_inputs():
             print(i)

@@ -138,6 +144,7 @@ class OnnxModel:
                 self.model.get_inputs()[3].name: length_scale.numpy(),
             },
         )[0]
+        # mel: (batch_size, feat_dim, num_frames)

         return torch.from_numpy(mel)

@@ -147,7 +154,7 @@ def main():
     params = get_parser().parse_args()
     logging.info(vars(params))

-    model = OnnxModel(params.acoustic_model)
+    model = OnnxModel(params.acoustic_model, params.tokens)
     vocoder = OnnxHifiGANModel(params.vocoder)
     text = params.input_text
     x = model.tokenizer.texts_to_token_ids([text], add_sos=True, add_eos=True)
@@ -164,15 +171,17 @@ def main():
     print("audio", audio.shape)  # (1, 1, num_samples)
     audio = audio.squeeze()

+    sample_rate = model.sample_rate
+
     t = (end_t - start_t).total_seconds()
     t2 = (end_t2 - start_t2).total_seconds()
-    rtf_am = t * 22050 / audio.shape[-1]
-    rtf_vocoder = t2 * 22050 / audio.shape[-1]
+    rtf_am = t * sample_rate / audio.shape[-1]
+    rtf_vocoder = t2 * sample_rate / audio.shape[-1]
     print("RTF for acoustic model ", rtf_am)
     print("RTF for vocoder", rtf_vocoder)

     # skip denoiser
-    sf.write(params.output_wav, audio, 22050, "PCM_16")
+    sf.write(params.output_wav, audio, sample_rate, "PCM_16")
     logging.info(f"Saved to {params.output_wav}")
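The new `self.sample_rate = int(metadata["sample_rate"])` line in `OnnxModel.__init__` assumes the acoustic model's ONNX file carries a `sample_rate` entry in its custom metadata. A minimal sketch of how that round trip typically works with `onnx`/`onnxruntime`; the file name and the `add_meta_data` helper are illustrative, and the exporter in this recipe is what actually writes the real metadata:

```python
import onnx
import onnxruntime as ort


def add_meta_data(filename: str, meta_data: dict) -> None:
    """Write key/value pairs into an ONNX model's metadata_props in place."""
    model = onnx.load(filename)
    for key, value in meta_data.items():
        entry = model.metadata_props.add()
        entry.key = key
        entry.value = str(value)
    onnx.save(model, filename)


# Writer side (done once at export time):
add_meta_data("model-steps-6.onnx", {"sample_rate": 22050})

# Reader side, mirroring the OnnxModel.__init__ change above:
session = ort.InferenceSession(
    "model-steps-6.onnx", providers=["CPUExecutionProvider"]
)
metadata = session.get_modelmeta().custom_metadata_map  # dict of str -> str
sample_rate = int(metadata["sample_rate"])  # 22050 for LJSpeech
```

Reading the rate from model metadata lets `onnx_pretrained.py` compute RTF and write the wav correctly for models trained at any sampling rate, instead of hardcoding 22050 as before.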