Merge with master

This commit is contained in:
pkufool 2024-02-21 07:59:10 +08:00
commit 5cc68f614b
20 changed files with 201 additions and 42 deletions

View File

@ -11,6 +11,7 @@ ARG _KALDIFEAT_VERSION="${KALDIFEAT_VERSION}+cpu.torch${TORCH_VERSION}"
RUN apt-get update -y && \ RUN apt-get update -y && \
apt-get install -qq -y \ apt-get install -qq -y \
cmake \
ffmpeg \ ffmpeg \
git \ git \
git-lfs \ git-lfs \

View File

@ -6,8 +6,8 @@ import json
def version_gt(a, b): def version_gt(a, b):
a_major, a_minor = a.split(".")[:2] a_major, a_minor = list(map(int, a.split(".")))[:2]
b_major, b_minor = b.split(".")[:2] b_major, b_minor = list(map(int, b.split(".")))[:2]
if a_major > b_major: if a_major > b_major:
return True return True
@ -18,8 +18,8 @@ def version_gt(a, b):
def version_ge(a, b): def version_ge(a, b):
a_major, a_minor = a.split(".")[:2] a_major, a_minor = list(map(int, a.split(".")))[:2]
b_major, b_minor = b.split(".")[:2] b_major, b_minor = list(map(int, b.split(".")))[:2]
if a_major > b_major: if a_major > b_major:
return True return True
@ -43,11 +43,12 @@ def get_torchaudio_version(torch_version):
def get_matrix(): def get_matrix():
k2_version = "1.24.4.dev20231220" k2_version = "1.24.4.dev20240218"
kaldifeat_version = "1.25.3.dev20231221" kaldifeat_version = "1.25.4.dev20240218"
version = "1.3" version = "1.4"
python_version = ["3.8", "3.9", "3.10", "3.11"] python_version = ["3.8", "3.9", "3.10", "3.11", "3.12"]
torch_version = ["1.13.0", "1.13.1", "2.0.0", "2.0.1", "2.1.0", "2.1.1", "2.1.2"] torch_version = ["1.13.0", "1.13.1", "2.0.0", "2.0.1", "2.1.0", "2.1.1", "2.1.2"]
torch_version += ["2.2.0"]
matrix = [] matrix = []
for p in python_version: for p in python_version:
@ -57,6 +58,10 @@ def get_matrix():
if version_gt(p, "3.10") and not version_gt(t, "2.0"): if version_gt(p, "3.10") and not version_gt(t, "2.0"):
continue continue
# only torch>=2.2.0 supports python 3.12
if version_gt(p, "3.11") and not version_gt(t, "2.1"):
continue
matrix.append( matrix.append(
{ {
"k2-version": k2_version, "k2-version": k2_version,

View File

@ -16,7 +16,7 @@ jobs:
fail-fast: false fail-fast: false
matrix: matrix:
os: [ubuntu-latest] os: [ubuntu-latest]
image: ["torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"] image: ["torch2.2.0-cuda12.1", "torch2.2.0-cuda11.8", "torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
steps: steps:
# refer to https://github.com/actions/checkout # refer to https://github.com/actions/checkout

View File

@ -14,13 +14,20 @@ jobs:
fail-fast: false fail-fast: false
matrix: matrix:
os: [ubuntu-latest] os: [ubuntu-latest]
image: ["torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"] image: ["torch2.2.0-cuda12.1", "torch2.2.0-cuda11.8", "torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
steps: steps:
# refer to https://github.com/actions/checkout # refer to https://github.com/actions/checkout
- uses: actions/checkout@v2 - uses: actions/checkout@v2
with: with:
fetch-depth: 0 fetch-depth: 0
- name: Free space
shell: bash
run: |
df -h
rm -rf /opt/hostedtoolcache
df -h
- name: Run the build process with Docker - name: Run the build process with Docker
uses: addnab/docker-run-action@v3 uses: addnab/docker-run-action@v3
with: with:

View File

@ -59,4 +59,7 @@ jobs:
cd /icefall cd /icefall
git config --global --add safe.directory /icefall git config --global --add safe.directory /icefall
python3 -m torch.utils.collect_env
python3 -m k2.version
.github/scripts/yesno/ASR/run.sh .github/scripts/yesno/ASR/run.sh

View File

@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
ARG DEBIAN_FRONTEND=noninteractive ARG DEBIAN_FRONTEND=noninteractive
# python 3.7 # python 3.7
ARG K2_VERSION="1.24.4.dev20230725+cuda11.3.torch1.12.1" ARG K2_VERSION="1.24.4.dev20240211+cuda11.3.torch1.12.1"
ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.3.torch1.12.1" ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda11.3.torch1.12.1"
ARG TORCHAUDIO_VERSION="0.12.1+cu113" ARG TORCHAUDIO_VERSION="0.12.1+cu113"
LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>" LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"

View File

@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
ARG DEBIAN_FRONTEND=noninteractive ARG DEBIAN_FRONTEND=noninteractive
# python 3.9 # python 3.9
ARG K2_VERSION="1.24.4.dev20231021+cuda11.6.torch1.13.0" ARG K2_VERSION="1.24.4.dev20240211+cuda11.6.torch1.13.0"
ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.6.torch1.13.0" ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda11.6.torch1.13.0"
ARG TORCHAUDIO_VERSION="0.13.0+cu116" ARG TORCHAUDIO_VERSION="0.13.0+cu116"
LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>" LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"

View File

@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
ARG DEBIAN_FRONTEND=noninteractive ARG DEBIAN_FRONTEND=noninteractive
# python 3.7 # python 3.7
ARG K2_VERSION="1.24.3.dev20230726+cuda10.2.torch1.9.0" ARG K2_VERSION="1.24.4.dev20240211+cuda10.2.torch1.9.0"
ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda10.2.torch1.9.0" ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda10.2.torch1.9.0"
ARG TORCHAUDIO_VERSION="0.9.0" ARG TORCHAUDIO_VERSION="0.9.0"
LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>" LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"

View File

@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
ARG DEBIAN_FRONTEND=noninteractive ARG DEBIAN_FRONTEND=noninteractive
# python 3.10 # python 3.10
ARG K2_VERSION="1.24.4.dev20231021+cuda11.7.torch2.0.0" ARG K2_VERSION="1.24.4.dev20240211+cuda11.7.torch2.0.0"
ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.7.torch2.0.0" ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda11.7.torch2.0.0"
ARG TORCHAUDIO_VERSION="2.0.0+cu117" ARG TORCHAUDIO_VERSION="2.0.0+cu117"
LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>" LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"

View File

@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
ARG DEBIAN_FRONTEND=noninteractive ARG DEBIAN_FRONTEND=noninteractive
# python 3.10 # python 3.10
ARG K2_VERSION="1.24.4.dev20231021+cuda11.8.torch2.1.0" ARG K2_VERSION="1.24.4.dev20240211+cuda11.8.torch2.1.0"
ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.8.torch2.1.0" ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda11.8.torch2.1.0"
ARG TORCHAUDIO_VERSION="2.1.0+cu118" ARG TORCHAUDIO_VERSION="2.1.0+cu118"
LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>" LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"

View File

@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
ARG DEBIAN_FRONTEND=noninteractive ARG DEBIAN_FRONTEND=noninteractive
# python 3.10 # python 3.10
ARG K2_VERSION="1.24.4.dev20231021+cuda12.1.torch2.1.0" ARG K2_VERSION="1.24.4.dev20240211+cuda12.1.torch2.1.0"
ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda12.1.torch2.1.0" ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda12.1.torch2.1.0"
ARG TORCHAUDIO_VERSION="2.1.0+cu121" ARG TORCHAUDIO_VERSION="2.1.0+cu121"
LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>" LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"

View File

@ -0,0 +1,70 @@
FROM pytorch/pytorch:2.2.0-cuda11.8-cudnn8-devel
# icefall image built on PyTorch 2.2.0 / CUDA 11.8.
# Installs system build tools, then torchaudio, k2, lhotse and kaldifeat wheels
# matching this torch/CUDA combination, and finally clones icefall itself.

ENV LC_ALL=C.UTF-8

ARG DEBIAN_FRONTEND=noninteractive

# python 3.10
ARG K2_VERSION="1.24.4.dev20240211+cuda11.8.torch2.2.0"
ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda11.8.torch2.2.0"
ARG TORCHAUDIO_VERSION="2.2.0+cu118"

LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
LABEL k2_version=${K2_VERSION}
LABEL kaldifeat_version=${KALDIFEAT_VERSION}
LABEL github_repo="https://github.com/k2-fsa/icefall"

# System packages; the apt lists are removed in the same layer to keep it small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        curl \
        vim \
        libssl-dev \
        autoconf \
        automake \
        bzip2 \
        ca-certificates \
        ffmpeg \
        g++ \
        gfortran \
        git \
        libtool \
        make \
        patch \
        sox \
        subversion \
        unzip \
        valgrind \
        wget \
        zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies
# NOTE: the sentencepiece specifier must be quoted. Unquoted, the shell treats
# ">=0.1.96" as a redirection of stdout to a file named "=0.1.96", which drops
# the version constraint and hides pip's output.
RUN pip install --no-cache-dir \
        torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
        k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
        git+https://github.com/lhotse-speech/lhotse \
        kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
        kaldi_native_io \
        kaldialign \
        kaldifst \
        kaldilm \
        "sentencepiece>=0.1.96" \
        tensorboard \
        typeguard \
        dill \
        onnx \
        onnxruntime \
        onnxmltools \
        multi_quantization \
        numpy \
        pytest \
        graphviz

# Fetch icefall and install its remaining Python requirements.
RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
    cd /workspace/icefall && \
    pip install --no-cache-dir -r requirements.txt

# Make the icefall package importable without installing it.
ENV PYTHONPATH=/workspace/icefall:$PYTHONPATH

WORKDIR /workspace/icefall

View File

@ -0,0 +1,70 @@
FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-devel
# icefall image built on PyTorch 2.2.0 / CUDA 12.1.
# Installs system build tools, then torchaudio, k2, lhotse and kaldifeat wheels
# matching this torch/CUDA combination, and finally clones icefall itself.

ENV LC_ALL=C.UTF-8

ARG DEBIAN_FRONTEND=noninteractive

# python 3.10
ARG K2_VERSION="1.24.4.dev20240211+cuda12.1.torch2.2.0"
ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda12.1.torch2.2.0"
ARG TORCHAUDIO_VERSION="2.2.0+cu121"

LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
LABEL k2_version=${K2_VERSION}
LABEL kaldifeat_version=${KALDIFEAT_VERSION}
LABEL github_repo="https://github.com/k2-fsa/icefall"

# System packages; the apt lists are removed in the same layer to keep it small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        curl \
        vim \
        libssl-dev \
        autoconf \
        automake \
        bzip2 \
        ca-certificates \
        ffmpeg \
        g++ \
        gfortran \
        git \
        libtool \
        make \
        patch \
        sox \
        subversion \
        unzip \
        valgrind \
        wget \
        zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies
# NOTE: the sentencepiece specifier must be quoted. Unquoted, the shell treats
# ">=0.1.96" as a redirection of stdout to a file named "=0.1.96", which drops
# the version constraint and hides pip's output.
RUN pip install --no-cache-dir \
        torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
        k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
        git+https://github.com/lhotse-speech/lhotse \
        kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
        kaldi_native_io \
        kaldialign \
        kaldifst \
        kaldilm \
        "sentencepiece>=0.1.96" \
        tensorboard \
        typeguard \
        dill \
        onnx \
        onnxruntime \
        onnxmltools \
        multi_quantization \
        numpy \
        pytest \
        graphviz

# Fetch icefall and install its remaining Python requirements.
RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
    cd /workspace/icefall && \
    pip install --no-cache-dir -r requirements.txt

# Make the icefall package importable without installing it.
ENV PYTHONPATH=/workspace/icefall:$PYTHONPATH

WORKDIR /workspace/icefall

View File

@ -30,7 +30,7 @@ of language model integration.
First, let's have a look at some background information. As the predecessor of LODR, Density Ratio (DR) is first proposed `here <https://arxiv.org/abs/2002.11268>`_ First, let's have a look at some background information. As the predecessor of LODR, Density Ratio (DR) is first proposed `here <https://arxiv.org/abs/2002.11268>`_
to address the language information mismatch between the training to address the language information mismatch between the training
corpus (source domain) and the testing corpus (target domain). Assuming that the source domain and the test domain corpus (source domain) and the testing corpus (target domain). Assuming that the source domain and the test domain
are acoustically similar, DR derives the following formular for decoding with Bayes' theorem: are acoustically similar, DR derives the following formula for decoding with Bayes' theorem:
.. math:: .. math::
@ -41,7 +41,7 @@ are acoustically similar, DR derives the following formular for decoding with Ba
where :math:`\lambda_1` and :math:`\lambda_2` are the weights of LM scores for target domain and source domain respectively. where :math:`\lambda_1` and :math:`\lambda_2` are the weights of LM scores for target domain and source domain respectively.
Here, the source domain LM is trained on the training corpus. The only difference in the above formular compared to Here, the source domain LM is trained on the training corpus. The only difference in the above formula compared to
shallow fusion is the subtraction of the source domain LM. shallow fusion is the subtraction of the source domain LM.
Some works treat the predictor and the joiner of the neural transducer as its internal LM. However, the LM is Some works treat the predictor and the joiner of the neural transducer as its internal LM. However, the LM is
@ -58,7 +58,7 @@ during decoding for transducer model:
In LODR, an additional bi-gram LM estimated on the source domain (e.g., training corpus) is required. Compared to DR, In LODR, an additional bi-gram LM estimated on the source domain (e.g., training corpus) is required. Compared to DR,
the only difference lies in the choice of source domain LM. According to the original `paper <https://arxiv.org/abs/2203.16776>`_, the only difference lies in the choice of source domain LM. According to the original `paper <https://arxiv.org/abs/2203.16776>`_,
LODR achieves similar performance compared DR in both intra-domain and cross-domain settings. LODR achieves similar performance compared to DR in both intra-domain and cross-domain settings.
As a bi-gram is much faster to evaluate, LODR is usually much faster. As a bi-gram is much faster to evaluate, LODR is usually much faster.
Now, we will show you how to use LODR in ``icefall``. Now, we will show you how to use LODR in ``icefall``.

View File

@ -139,7 +139,7 @@ A few parameters can be tuned to further boost the performance of shallow fusion
- ``--lm-scale`` - ``--lm-scale``
Controls the scale of the LM. If too small, the external language model may not be fully utilized; if too large, Controls the scale of the LM. If too small, the external language model may not be fully utilized; if too large,
the LM score may dominant during decoding, leading to bad WER. A typical value of this is around 0.3. the LM score might be dominant during decoding, leading to bad WER. A typical value of this is around 0.3.
- ``--beam-size`` - ``--beam-size``

View File

@ -34,6 +34,8 @@ which will give you something like below:
.. code-block:: bash .. code-block:: bash
"torch2.2.0-cuda12.1"
"torch2.2.0-cuda11.8"
"torch2.1.0-cuda12.1" "torch2.1.0-cuda12.1"
"torch2.1.0-cuda11.8" "torch2.1.0-cuda11.8"
"torch2.0.0-cuda11.7" "torch2.0.0-cuda11.7"

1
egs/ljspeech/TTS/shared Symbolic link
View File

@ -0,0 +1 @@
../../../icefall/shared/

View File

@ -1 +0,0 @@
../../../librispeech/ASR/shared/parse_options.sh

View File

@ -74,7 +74,7 @@ class Tokenizer(object):
if intersperse_blank: if intersperse_blank:
token_ids = intersperse(token_ids, self.blank_id) token_ids = intersperse(token_ids, self.blank_id)
token_ids_list.append(token_ids) token_ids_list.append(token_ids)
return token_ids_list return token_ids_list
@ -103,6 +103,7 @@ class Tokenizer(object):
if intersperse_blank: if intersperse_blank:
token_ids = intersperse(token_ids, self.blank_id) token_ids = intersperse(token_ids, self.blank_id)
token_ids_list.append(token_ids)
token_ids_list.append(token_ids)
return token_ids_list return token_ids_list

View File

@ -159,7 +159,7 @@ class LmScorer(torch.nn.Module):
""" """
if lm_type == "rnn": if lm_type == "rnn":
model = RnnLmModel( model = RnnLmModel(
vocab_size=params.vocab_size, vocab_size=params.lm_vocab_size,
embedding_dim=params.rnn_lm_embedding_dim, embedding_dim=params.rnn_lm_embedding_dim,
hidden_dim=params.rnn_lm_hidden_dim, hidden_dim=params.rnn_lm_hidden_dim,
num_layers=params.rnn_lm_num_layers, num_layers=params.rnn_lm_num_layers,
@ -183,7 +183,7 @@ class LmScorer(torch.nn.Module):
elif lm_type == "transformer": elif lm_type == "transformer":
model = TransformerLM( model = TransformerLM(
vocab_size=params.vocab_size, vocab_size=params.lm_vocab_size,
d_model=params.transformer_lm_encoder_dim, d_model=params.transformer_lm_encoder_dim,
embedding_dim=params.transformer_lm_embedding_dim, embedding_dim=params.transformer_lm_embedding_dim,
dim_feedforward=params.transformer_lm_dim_feedforward, dim_feedforward=params.transformer_lm_dim_feedforward,