Merge with master

2024-02-21 07:59:10 +08:00 · 2024-02-21 07:59:10 +08:00 · 5cc68f614b
commit 5cc68f614b
parent 2b117fe570 027302c902
20 changed files with 201 additions and 42 deletions
--- a/.github/scripts/docker/Dockerfile
+++ b/.github/scripts/docker/Dockerfile
@ -11,6 +11,7 @@ ARG _KALDIFEAT_VERSION="${KALDIFEAT_VERSION}+cpu.torch${TORCH_VERSION}"

 RUN apt-get update -y && \
    apt-get install -qq -y \
+    cmake \
    ffmpeg \
    git \
    git-lfs \
--- a/.github/scripts/docker/generate_build_matrix.py
+++ b/.github/scripts/docker/generate_build_matrix.py
@ -6,8 +6,8 @@ import json


 def version_gt(a, b):
-    a_major, a_minor = a.split(".")[:2]
-    b_major, b_minor = b.split(".")[:2]
+    a_major, a_minor = list(map(int, a.split(".")))[:2]
+    b_major, b_minor = list(map(int, b.split(".")))[:2]
    if a_major > b_major:
        return True

@ -18,8 +18,8 @@ def version_gt(a, b):


 def version_ge(a, b):
-    a_major, a_minor = a.split(".")[:2]
-    b_major, b_minor = b.split(".")[:2]
+    a_major, a_minor = list(map(int, a.split(".")))[:2]
+    b_major, b_minor = list(map(int, b.split(".")))[:2]
    if a_major > b_major:
        return True

@ -43,11 +43,12 @@ def get_torchaudio_version(torch_version):


 def get_matrix():
-    k2_version = "1.24.4.dev20231220"
-    kaldifeat_version = "1.25.3.dev20231221"
-    version = "1.3"
-    python_version = ["3.8", "3.9", "3.10", "3.11"]
+    k2_version = "1.24.4.dev20240218"
+    kaldifeat_version = "1.25.4.dev20240218"
+    version = "1.4"
+    python_version = ["3.8", "3.9", "3.10", "3.11", "3.12"]
    torch_version = ["1.13.0", "1.13.1", "2.0.0", "2.0.1", "2.1.0", "2.1.1", "2.1.2"]
+    torch_version += ["2.2.0"]

    matrix = []
    for p in python_version:
@ -57,6 +58,10 @@ def get_matrix():
            if version_gt(p, "3.10") and not version_gt(t, "2.0"):
                continue

+            # only torch>=2.2.0 supports python 3.12
+            if version_gt(p, "3.11") and not version_gt(t, "2.1"):
+                continue
+
            matrix.append(
                {
                    "k2-version": k2_version,
--- a/.github/workflows/build-docker-image.yml
+++ b/.github/workflows/build-docker-image.yml
@ -16,7 +16,7 @@ jobs:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
-        image: ["torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
+        image: ["torch2.2.0-cuda12.1", "torch2.2.0-cuda11.8", "torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]

    steps:
      # refer to https://github.com/actions/checkout
--- a/.github/workflows/run-docker-image.yml
+++ b/.github/workflows/run-docker-image.yml
@ -14,13 +14,20 @@ jobs:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
-        image: ["torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
+        image: ["torch2.2.0-cuda12.1", "torch2.2.0-cuda11.8", "torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
    steps:
      # refer to https://github.com/actions/checkout
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0

+      - name: Free space
+        shell: bash
+        run: |
+          df -h
+          rm -rf /opt/hostedtoolcache
+          df -h
+
      - name: Run the build process with Docker
        uses: addnab/docker-run-action@v3
        with:
--- a/.github/workflows/yesno.yml
+++ b/.github/workflows/yesno.yml
@ -59,4 +59,7 @@ jobs:
              cd /icefall
              git config --global --add safe.directory /icefall

+              python3 -m torch.utils.collect_env
+              python3 -m k2.version
+
              .github/scripts/yesno/ASR/run.sh
--- a/docker/torch1.12.1-cuda11.3.dockerfile
+++ b/docker/torch1.12.1-cuda11.3.dockerfile
@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.7
-ARG K2_VERSION="1.24.4.dev20230725+cuda11.3.torch1.12.1"
-ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.3.torch1.12.1"
+ARG K2_VERSION="1.24.4.dev20240211+cuda11.3.torch1.12.1"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda11.3.torch1.12.1"
 ARG TORCHAUDIO_VERSION="0.12.1+cu113"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
--- a/docker/torch1.13.0-cuda11.6.dockerfile
+++ b/docker/torch1.13.0-cuda11.6.dockerfile
@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.9
-ARG K2_VERSION="1.24.4.dev20231021+cuda11.6.torch1.13.0"
-ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.6.torch1.13.0"
+ARG K2_VERSION="1.24.4.dev20240211+cuda11.6.torch1.13.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda11.6.torch1.13.0"
 ARG TORCHAUDIO_VERSION="0.13.0+cu116"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
--- a/docker/torch1.9.0-cuda10.2.dockerfile
+++ b/docker/torch1.9.0-cuda10.2.dockerfile
@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.7
-ARG K2_VERSION="1.24.3.dev20230726+cuda10.2.torch1.9.0"
-ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda10.2.torch1.9.0"
+ARG K2_VERSION="1.24.4.dev20240211+cuda10.2.torch1.9.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda10.2.torch1.9.0"
 ARG TORCHAUDIO_VERSION="0.9.0"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
--- a/docker/torch2.0.0-cuda11.7.dockerfile
+++ b/docker/torch2.0.0-cuda11.7.dockerfile
@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.10
-ARG K2_VERSION="1.24.4.dev20231021+cuda11.7.torch2.0.0"
-ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.7.torch2.0.0"
+ARG K2_VERSION="1.24.4.dev20240211+cuda11.7.torch2.0.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda11.7.torch2.0.0"
 ARG TORCHAUDIO_VERSION="2.0.0+cu117"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
--- a/docker/torch2.1.0-cuda11.8.dockerfile
+++ b/docker/torch2.1.0-cuda11.8.dockerfile
@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.10
-ARG K2_VERSION="1.24.4.dev20231021+cuda11.8.torch2.1.0"
-ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.8.torch2.1.0"
+ARG K2_VERSION="1.24.4.dev20240211+cuda11.8.torch2.1.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda11.8.torch2.1.0"
 ARG TORCHAUDIO_VERSION="2.1.0+cu118"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
--- a/docker/torch2.1.0-cuda12.1.dockerfile
+++ b/docker/torch2.1.0-cuda12.1.dockerfile
@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.10
-ARG K2_VERSION="1.24.4.dev20231021+cuda12.1.torch2.1.0"
-ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda12.1.torch2.1.0"
+ARG K2_VERSION="1.24.4.dev20240211+cuda12.1.torch2.1.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda12.1.torch2.1.0"
 ARG TORCHAUDIO_VERSION="2.1.0+cu121"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
--- a/docker/torch2.2.0-cuda11.8.dockerfile
+++ b/docker/torch2.2.0-cuda11.8.dockerfile
@ -0,0 +1,70 @@
+FROM pytorch/pytorch:2.2.0-cuda11.8-cudnn8-devel
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# python 3.10
+ARG K2_VERSION="1.24.4.dev20240211+cuda11.8.torch2.2.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda11.8.torch2.2.0"
+ARG TORCHAUDIO_VERSION="2.2.0+cu118"
+
+LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        curl \
+        vim \
+        libssl-dev \
+        autoconf \
+        automake \
+        bzip2 \
+        ca-certificates \
+        ffmpeg \
+        g++ \
+        gfortran \
+        git \
+        libtool \
+        make \
+        patch \
+        sox \
+        subversion \
+        unzip \
+        valgrind \
+        wget \
+        zlib1g-dev \
+        && rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies (torchaudio/k2/kaldifeat are pinned via the ARGs above).
+# NOTE: a requirement containing '>' must be quoted; unquoted, the shell treats it as an output redirection.
+RUN pip install --no-cache-dir \
+      torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+      k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+      git+https://github.com/lhotse-speech/lhotse \
+      kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+      kaldi_native_io \
+      kaldialign \
+      kaldifst \
+      kaldilm \
+      "sentencepiece>=0.1.96" \
+      tensorboard \
+      typeguard \
+      dill \
+      onnx \
+      onnxruntime \
+      onnxmltools \
+      multi_quantization \
+      numpy \
+      pytest \
+      graphviz
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+    cd /workspace/icefall && \
+    pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
--- a/docker/torch2.2.0-cuda12.1.dockerfile
+++ b/docker/torch2.2.0-cuda12.1.dockerfile
@ -0,0 +1,70 @@
+FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-devel
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# python 3.10
+ARG K2_VERSION="1.24.4.dev20240211+cuda12.1.torch2.2.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda12.1.torch2.2.0"
+ARG TORCHAUDIO_VERSION="2.2.0+cu121"
+
+LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        curl \
+        vim \
+        libssl-dev \
+        autoconf \
+        automake \
+        bzip2 \
+        ca-certificates \
+        ffmpeg \
+        g++ \
+        gfortran \
+        git \
+        libtool \
+        make \
+        patch \
+        sox \
+        subversion \
+        unzip \
+        valgrind \
+        wget \
+        zlib1g-dev \
+        && rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies (torchaudio/k2/kaldifeat are pinned via the ARGs above).
+# NOTE: a requirement containing '>' must be quoted; unquoted, the shell treats it as an output redirection.
+RUN pip install --no-cache-dir \
+      torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+      k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+      git+https://github.com/lhotse-speech/lhotse \
+      kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+      kaldi_native_io \
+      kaldialign \
+      kaldifst \
+      kaldilm \
+      "sentencepiece>=0.1.96" \
+      tensorboard \
+      typeguard \
+      dill \
+      onnx \
+      onnxruntime \
+      onnxmltools \
+      multi_quantization \
+      numpy \
+      pytest \
+      graphviz
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+    cd /workspace/icefall && \
+    pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
--- a/docs/source/decoding-with-langugage-models/LODR.rst
+++ b/docs/source/decoding-with-langugage-models/LODR.rst
@ -30,7 +30,7 @@ of langugae model integration.
 First, let's have a look at some background information. As the predecessor of LODR, Density Ratio (DR) is first proposed `here <https://arxiv.org/abs/2002.11268>`_
 to address the language information mismatch between the training
 corpus (source domain) and the testing corpus (target domain). Assuming that the source domain and the test domain
-are acoustically similar, DR derives the following formular for decoding with Bayes' theorem:
+are acoustically similar, DR derives the following formula for decoding with Bayes' theorem:

 .. math::

@ -41,7 +41,7 @@ are acoustically similar, DR derives the following formular for decoding with Ba


 where :math:`\lambda_1` and :math:`\lambda_2` are the weights of LM scores for target domain and source domain respectively.
-Here, the source domain LM is trained on the training corpus. The only difference in the above formular compared to
+Here, the source domain LM is trained on the training corpus. The only difference in the above formula compared to
 shallow fusion is the subtraction of the source domain LM.

 Some works treat the predictor and the joiner of the neural transducer as its internal LM. However, the LM is
@ -58,7 +58,7 @@ during decoding for transducer model:

 In LODR, an additional bi-gram LM estimated on the source domain (e.g training corpus) is required. Compared to DR,
 the only difference lies in the choice of source domain LM. According to the original `paper <https://arxiv.org/abs/2203.16776>`_,
-LODR achieves similar performance compared DR in both intra-domain and cross-domain settings.
+LODR achieves similar performance compared to DR in both intra-domain and cross-domain settings.
 As a bi-gram is much faster to evaluate, LODR is usually much faster.

 Now, we will show you how to use LODR in ``icefall``.
--- a/docs/source/decoding-with-langugage-models/shallow-fusion.rst
+++ b/docs/source/decoding-with-langugage-models/shallow-fusion.rst
@ -9,9 +9,9 @@ to improve the word-error-rate of a transducer model.

 .. note::

-    This tutorial is based on the recipe 
+    This tutorial is based on the recipe
    `pruned_transducer_stateless7_streaming <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming>`_,
-    which is a streaming transducer model trained on `LibriSpeech`_. 
+    which is a streaming transducer model trained on `LibriSpeech`_.
    However, you can easily apply shallow fusion to other recipes.
    If you encounter any problems, please open an issue here `icefall <https://github.com/k2-fsa/icefall/issues>`_.

@ -69,11 +69,11 @@ Training a language model usually takes a long time, we can download a pre-train
 .. code-block:: bash

    $ # download the external LM
-    $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm 
+    $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm
    $ # create a symbolic link so that the checkpoint can be loaded
    $ pushd icefall-librispeech-rnn-lm/exp
    $ git lfs pull --include "pretrained.pt"
-    $ ln -s pretrained.pt epoch-99.pt 
+    $ ln -s pretrained.pt epoch-99.pt
    $ popd

 .. note::
@ -85,7 +85,7 @@ Training a language model usually takes a long time, we can download a pre-train
 To use shallow fusion for decoding, we can execute the following command:

 .. code-block:: bash
-    
+
    $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
    $ lm_dir=./icefall-librispeech-rnn-lm/exp
    $ lm_scale=0.29
@ -133,16 +133,16 @@ The decoding result obtained with the above command are shown below.
    $ For test-other, WER of different settings are:
    $ beam_size_4	7.08	best for test-other

-The improvement of shallow fusion is very obvious! The relative WER reduction on test-other is around 10.5%. 
+The improvement of shallow fusion is very obvious! The relative WER reduction on test-other is around 10.5%.
 A few parameters can be tuned to further boost the performance of shallow fusion:

- ``--lm-scale`` 
+- ``--lm-scale``

-    Controls the scale of the LM. If too small, the external language model may not be fully utilized; if too large, 
-    the LM score may dominant during decoding, leading to bad WER. A typical value of this is around 0.3.
+    Controls the scale of the LM. If too small, the external language model may not be fully utilized; if too large,
+    the LM score might be dominant during decoding, leading to bad WER. A typical value of this is around 0.3.
+
+- ``--beam-size``

- ``--beam-size`` 
-    
    The number of active paths in the search beam. It controls the trade-off between decoding efficiency and accuracy.

 Here, we also show how ``--beam-size`` affects the WER and decoding time:
@ -176,4 +176,4 @@ As we see, a larger beam size during shallow fusion improves the WER, but is als



- 
+
--- a/docs/source/docker/intro.rst
+++ b/docs/source/docker/intro.rst
@ -34,6 +34,8 @@ which will give you something like below:

 .. code-block:: bash

+  "torch2.2.0-cuda12.1"
+  "torch2.2.0-cuda11.8"
  "torch2.1.0-cuda12.1"
  "torch2.1.0-cuda11.8"
  "torch2.0.0-cuda11.7"
--- a/egs/ljspeech/TTS/shared
+++ b/egs/ljspeech/TTS/shared
@ -0,0 +1 @@
+../../../icefall/shared/
--- a/egs/ljspeech/TTS/shared/parse_options.sh
+++ b/egs/ljspeech/TTS/shared/parse_options.sh
@ -1 +0,0 @@
-../../../librispeech/ASR/shared/parse_options.sh
--- a/egs/ljspeech/TTS/vits/tokenizer.py
+++ b/egs/ljspeech/TTS/vits/tokenizer.py
@ -74,7 +74,7 @@ class Tokenizer(object):
            if intersperse_blank:
                token_ids = intersperse(token_ids, self.blank_id)

-                token_ids_list.append(token_ids)
+            token_ids_list.append(token_ids)

        return token_ids_list

@ -103,6 +103,7 @@ class Tokenizer(object):

            if intersperse_blank:
                token_ids = intersperse(token_ids, self.blank_id)
-                token_ids_list.append(token_ids)
+
+            token_ids_list.append(token_ids)

        return token_ids_list
--- a/icefall/lm_wrapper.py
+++ b/icefall/lm_wrapper.py
@ -159,7 +159,7 @@ class LmScorer(torch.nn.Module):
        """
        if lm_type == "rnn":
            model = RnnLmModel(
-                vocab_size=params.vocab_size,
+                vocab_size=params.lm_vocab_size,
                embedding_dim=params.rnn_lm_embedding_dim,
                hidden_dim=params.rnn_lm_hidden_dim,
                num_layers=params.rnn_lm_num_layers,
@ -183,7 +183,7 @@ class LmScorer(torch.nn.Module):

        elif lm_type == "transformer":
            model = TransformerLM(
-                vocab_size=params.vocab_size,
+                vocab_size=params.lm_vocab_size,
                d_model=params.transformer_lm_encoder_dim,
                embedding_dim=params.transformer_lm_embedding_dim,
                dim_feedforward=params.transformer_lm_dim_feedforward,
				`@ -1 +0,0 @@`
				`../../../librispeech/ASR/shared/parse_options.sh`