Merge branch 'k2-fsa:master' into dev/k2ssl

Yifan Yang 2024-12-22 10:30:21 +08:00 committed by GitHub
commit 01e2c2a566
4 changed files with 38 additions and 18 deletions

View File

@@ -45,13 +45,13 @@ def get_torchaudio_version(torch_version):
 def get_matrix():
     k2_version = "1.24.4.dev20241029"
     kaldifeat_version = "1.25.5.dev20241029"
-    version = "20241029"
+    version = "20241218"

     # torchaudio 2.5.0 does not support python 3.13
     python_version = ["3.8", "3.9", "3.10", "3.11", "3.12"]
     torch_version = []
-    # torch_version += ["1.13.0", "1.13.1"]
-    # torch_version += ["2.0.0", "2.0.1"]
+    torch_version += ["1.13.0", "1.13.1"]
+    torch_version += ["2.0.0", "2.0.1"]
     # torch_version += ["2.1.0", "2.1.1", "2.1.2"]
     # torch_version += ["2.2.0", "2.2.1", "2.2.2"]
     # Test only torch >= 2.3.0
@@ -59,6 +59,7 @@ def get_matrix():
     torch_version += ["2.4.0"]
     torch_version += ["2.4.1"]
     torch_version += ["2.5.0"]
+    torch_version += ["2.5.1"]

     matrix = []
     for p in python_version:
@@ -79,8 +80,12 @@ def get_matrix():
                 # torch>=2.5 requires python 3.10
                 continue

-            k2_version_2 = k2_version
-            kaldifeat_version_2 = kaldifeat_version
+            if t == "2.5.1":
+                k2_version_2 = "1.24.4.dev20241122"
+                kaldifeat_version_2 = "1.25.5.dev20241126"
+            else:
+                k2_version_2 = k2_version
+                kaldifeat_version_2 = kaldifeat_version

             matrix.append(
                 {
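Note: the three hunks above only change which k2/kaldifeat dev wheels get paired with each torch release in the test matrix. A minimal sketch of the resulting pattern is shown below; the dictionary keys are illustrative assumptions, not necessarily the exact field names this script emits.

def get_matrix_sketch():
    # Default dev wheels used for most torch releases
    k2_version = "1.24.4.dev20241029"
    kaldifeat_version = "1.25.5.dev20241029"

    python_version = ["3.10", "3.11", "3.12"]
    torch_version = ["2.4.1", "2.5.0", "2.5.1"]

    matrix = []
    for p in python_version:
        for t in torch_version:
            # torch 2.5.1 needs newer k2/kaldifeat dev wheels
            if t == "2.5.1":
                k2_version_2 = "1.24.4.dev20241122"
                kaldifeat_version_2 = "1.25.5.dev20241126"
            else:
                k2_version_2 = k2_version
                kaldifeat_version_2 = kaldifeat_version
            matrix.append(
                {
                    "python-version": p,  # key names are assumed
                    "torch-version": t,
                    "k2-version": k2_version_2,
                    "kaldifeat-version": kaldifeat_version_2,
                }
            )
    return matrix


if __name__ == "__main__":
    for entry in get_matrix_sketch():
        print(entry)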

View File

@@ -57,6 +57,7 @@ function infer() {
   curl -SL -O https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1

   ./matcha/infer.py \
+    --num-buckets 2 \
     --epoch 1 \
     --exp-dir ./matcha/exp \
     --tokens data/tokens.txt \
@@ -97,19 +98,23 @@ function export_onnx() {
     python3 ./matcha/export_onnx_hifigan.py
   else
     curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28/resolve/main/exp/hifigan_v1.onnx
+    curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28/resolve/main/exp/hifigan_v2.onnx
+    curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28/resolve/main/exp/hifigan_v3.onnx
   fi

   ls -lh *.onnx

-  python3 ./matcha/onnx_pretrained.py \
-    --acoustic-model ./model-steps-6.onnx \
-    --vocoder ./hifigan_v1.onnx \
-    --tokens ./data/tokens.txt \
-    --input-text "how are you doing?" \
-    --output-wav /icefall/generated-matcha-tts-steps-6-v1.wav
+  for v in v1 v2 v3; do
+    python3 ./matcha/onnx_pretrained.py \
+      --acoustic-model ./model-steps-6.onnx \
+      --vocoder ./hifigan_$v.onnx \
+      --tokens ./data/tokens.txt \
+      --input-text "how are you doing?" \
+      --output-wav /icefall/generated-matcha-tts-steps-6-$v.wav
+  done

   ls -lh /icefall/*.wav
-  soxi /icefall/generated-matcha-tts-steps-6-v1.wav
+  soxi /icefall/generated-matcha-tts-steps-6-*.wav
 }
@@ -118,3 +123,4 @@ infer
 export_onnx

 rm -rfv generator_v* matcha/exp
+git checkout .
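Note: the loop above now exercises all three HiFi-GAN vocoder variants and inspects the generated wavs with soxi. A rough Python equivalent of that inspection step, assuming the same output paths and using soundfile (already used by onnx_pretrained.py):

import soundfile as sf

# Print sample rate and duration for each generated wav, similar to `soxi`.
# The paths follow the CI naming above; adjust them if you run elsewhere.
for v in ["v1", "v2", "v3"]:
    wav_path = f"/icefall/generated-matcha-tts-steps-6-{v}.wav"
    audio, sample_rate = sf.read(wav_path)
    print(wav_path, sample_rate, f"{len(audio) / sample_rate:.2f} s")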

View File

@@ -163,7 +163,7 @@ def main():
         (x, x_lengths, temperature, length_scale),
         filename,
         opset_version=opset_version,
-        input_names=["x", "x_length", "temperature", "length_scale"],
+        input_names=["x", "x_length", "noise_scale", "length_scale"],
         output_names=["mel"],
         dynamic_axes={
             "x": {0: "N", 1: "L"},

View File

@@ -89,6 +89,7 @@ class OnnxHifiGANModel:
                 self.model.get_inputs()[0].name: x.numpy(),
             },
         )[0]
+        # audio: (batch_size, num_samples)

         return torch.from_numpy(audio)
@@ -97,19 +98,24 @@ class OnnxModel:
     def __init__(
         self,
         filename: str,
+        tokens: str,
     ):
         session_opts = ort.SessionOptions()
         session_opts.inter_op_num_threads = 1
         session_opts.intra_op_num_threads = 2

         self.session_opts = session_opts
-        self.tokenizer = Tokenizer("./data/tokens.txt")
+        self.tokenizer = Tokenizer(tokens)
         self.model = ort.InferenceSession(
             filename,
             sess_options=self.session_opts,
             providers=["CPUExecutionProvider"],
         )
+        logging.info(f"{self.model.get_modelmeta().custom_metadata_map}")
+
+        metadata = self.model.get_modelmeta().custom_metadata_map
+        self.sample_rate = int(metadata["sample_rate"])

         for i in self.model.get_inputs():
             print(i)
@@ -138,6 +144,7 @@ class OnnxModel:
                 self.model.get_inputs()[3].name: length_scale.numpy(),
             },
         )[0]
+        # mel: (batch_size, feat_dim, num_frames)

         return torch.from_numpy(mel)
@@ -147,7 +154,7 @@ def main():
     params = get_parser().parse_args()
     logging.info(vars(params))

-    model = OnnxModel(params.acoustic_model)
+    model = OnnxModel(params.acoustic_model, params.tokens)
     vocoder = OnnxHifiGANModel(params.vocoder)
     text = params.input_text
     x = model.tokenizer.texts_to_token_ids([text], add_sos=True, add_eos=True)
@@ -164,15 +171,17 @@ def main():
     print("audio", audio.shape)  # (1, 1, num_samples)
     audio = audio.squeeze()

+    sample_rate = model.sample_rate
+
     t = (end_t - start_t).total_seconds()
     t2 = (end_t2 - start_t2).total_seconds()
-    rtf_am = t * 22050 / audio.shape[-1]
-    rtf_vocoder = t2 * 22050 / audio.shape[-1]
+    rtf_am = t * sample_rate / audio.shape[-1]
+    rtf_vocoder = t2 * sample_rate / audio.shape[-1]
     print("RTF for acoustic model ", rtf_am)
     print("RTF for vocoder", rtf_vocoder)

     # skip denoiser
-    sf.write(params.output_wav, audio, 22050, "PCM_16")
+    sf.write(params.output_wav, audio, sample_rate, "PCM_16")
     logging.info(f"Saved to {params.output_wav}")