This commit is contained in:
Desh Raj 2023-01-15 13:44:45 -05:00
commit b978c6de55
238 changed files with 35545 additions and 1075 deletions

View File

@ -1,7 +1,7 @@
[flake8]
show-source=true
statistics=true
max-line-length = 80
max-line-length = 88
per-file-ignores =
# line too long
icefall/diagnostics.py: E501,
@ -12,6 +12,7 @@ per-file-ignores =
egs/librispeech/ASR/lstm_transducer_stateless*/*.py: E501, E203
egs/librispeech/ASR/conv_emformer_transducer_stateless*/*.py: E501, E203
egs/librispeech/ASR/conformer_ctc*/*py: E501,
egs/librispeech/ASR/zipformer_mmi/*.py: E501, E203
egs/librispeech/ASR/RESULTS.md: E999,
# invalid escape sequence (caused by TeX formula), W605

View File

@ -13,7 +13,6 @@ cd egs/librispeech/ASR
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-conformer-ctc3-2022-11-27
log "Downloading pre-trained model from $repo_url"
git lfs install
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
@ -23,7 +22,12 @@ soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp
git lfs pull --include "data/*"
git lfs pull --include "data/lang_bpe_500/HLG.pt"
git lfs pull --include "data/lang_bpe_500/L.pt"
git lfs pull --include "data/lang_bpe_500/LG.pt"
git lfs pull --include "data/lang_bpe_500/Linv.pt"
git lfs pull --include "data/lang_bpe_500/bpe.model"
git lfs pull --include "data/lm/G_4_gram.pt"
git lfs pull --include "exp/jit_trace.pt"
git lfs pull --include "exp/pretrained.pt"
ln -s pretrained.pt epoch-99.pt

View File

@ -193,7 +193,7 @@ if [[ x"${GITHUB_EVENT_LABEL_NAME}" == x"shallow-fusion" ]]; then
ls -lh data
ls -lh lstm_transducer_stateless2/exp
log "Decoding test-clean and test-other"
log "Decoding test-clean and test-other with RNN LM"
./lstm_transducer_stateless2/decode.py \
--use-averaged-model 0 \
@ -201,12 +201,14 @@ if [[ x"${GITHUB_EVENT_LABEL_NAME}" == x"shallow-fusion" ]]; then
--avg 1 \
--exp-dir lstm_transducer_stateless2/exp \
--max-duration 600 \
--decoding-method modified_beam_search_rnnlm_shallow_fusion \
--decoding-method modified_beam_search_lm_shallow_fusion \
--beam 4 \
--rnn-lm-scale 0.3 \
--rnn-lm-exp-dir $lm_repo/exp \
--rnn-lm-epoch 88 \
--rnn-lm-avg 1 \
--use-shallow-fusion 1 \
--lm-type rnn \
--lm-exp-dir $lm_repo/exp \
--lm-epoch 88 \
--lm-avg 1 \
--lm-scale 0.3 \
--rnn-lm-num-layers 3 \
--rnn-lm-tie-weights 1
fi
@ -245,11 +247,13 @@ if [[ x"${GITHUB_EVENT_LABEL_NAME}" == x"LODR" ]]; then
--avg 1 \
--exp-dir lstm_transducer_stateless2/exp \
--max-duration 600 \
--decoding-method modified_beam_search_rnnlm_LODR \
--decoding-method modified_beam_search_LODR \
--beam 4 \
--rnn-lm-scale 0.3 \
--rnn-lm-exp-dir $lm_repo/exp \
--rnn-lm-epoch 88 \
--use-shallow-fusion 1 \
--lm-type rnn \
--lm-exp-dir $lm_repo/exp \
--lm-scale 0.4 \
--lm-epoch 88 \
--rnn-lm-avg 1 \
--rnn-lm-num-layers 3 \
--rnn-lm-tie-weights 1 \

View File

@ -30,6 +30,15 @@ ln -s pretrained.pt epoch-99.pt
ls -lh *.pt
popd
log "Test exporting to ONNX format"
./pruned_transducer_stateless7/export.py \
--exp-dir $repo/exp \
--use-averaged-model false \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--epoch 99 \
--avg 1 \
--onnx 1
log "Export to torchscript model"
./pruned_transducer_stateless7/export.py \
--exp-dir $repo/exp \
@ -41,6 +50,27 @@ log "Export to torchscript model"
ls -lh $repo/exp/*.pt
log "Decode with ONNX models"
./pruned_transducer_stateless7/onnx_check.py \
--jit-filename $repo/exp/cpu_jit.pt \
--onnx-encoder-filename $repo/exp/encoder.onnx \
--onnx-decoder-filename $repo/exp/decoder.onnx \
--onnx-joiner-filename $repo/exp/joiner.onnx \
--onnx-joiner-encoder-proj-filename $repo/exp/joiner_encoder_proj.onnx \
--onnx-joiner-decoder-proj-filename $repo/exp/joiner_decoder_proj.onnx
./pruned_transducer_stateless7/onnx_pretrained.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--encoder-model-filename $repo/exp/encoder.onnx \
--decoder-model-filename $repo/exp/decoder.onnx \
--joiner-model-filename $repo/exp/joiner.onnx \
--joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
--joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
log "Decode with models exported by torch.jit.script()"
./pruned_transducer_stateless7/jit_pretrained.py \

View File

@ -13,7 +13,6 @@ cd egs/librispeech/ASR
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-ctc-2022-12-01
log "Downloading pre-trained model from $repo_url"
git lfs install
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
@ -23,7 +22,12 @@ soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp
git lfs pull --include "data/*"
git lfs pull --include "data/lang_bpe_500/HLG.pt"
git lfs pull --include "data/lang_bpe_500/L.pt"
git lfs pull --include "data/lang_bpe_500/LG.pt"
git lfs pull --include "data/lang_bpe_500/Linv.pt"
git lfs pull --include "data/lang_bpe_500/bpe.model"
git lfs pull --include "data/lm/G_4_gram.pt"
git lfs pull --include "exp/cpu_jit.pt"
git lfs pull --include "exp/pretrained.pt"
ln -s pretrained.pt epoch-99.pt
@ -144,4 +148,4 @@ if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" ==
done
rm pruned_transducer_stateless7_ctc/exp/*.pt
fi
fi

View File

@ -0,0 +1,148 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
cd egs/librispeech/ASR
repo_url=https://huggingface.co/yfyeung/icefall-asr-librispeech-pruned_transducer_stateless7_ctc_bs-2022-12-14
log "Downloading pre-trained model from $repo_url"
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
log "Display test files"
tree $repo/
soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp
git lfs pull --include "data/lang_bpe_500/HLG.pt"
git lfs pull --include "data/lang_bpe_500/L.pt"
git lfs pull --include "data/lang_bpe_500/LG.pt"
git lfs pull --include "data/lang_bpe_500/Linv.pt"
git lfs pull --include "data/lang_bpe_500/bpe.model"
git lfs pull --include "exp/cpu_jit.pt"
git lfs pull --include "exp/pretrained.pt"
ln -s pretrained.pt epoch-99.pt
ls -lh *.pt
popd
log "Export to torchscript model"
./pruned_transducer_stateless7_ctc_bs/export.py \
--exp-dir $repo/exp \
--use-averaged-model false \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--epoch 99 \
--avg 1 \
--jit 1
ls -lh $repo/exp/*.pt
log "Decode with models exported by torch.jit.script()"
./pruned_transducer_stateless7_ctc_bs/jit_pretrained.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--nn-model-filename $repo/exp/cpu_jit.pt \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
for m in ctc-decoding 1best; do
./pruned_transducer_stateless7_ctc_bs/jit_pretrained_ctc.py \
--model-filename $repo/exp/cpu_jit.pt \
--words-file $repo/data/lang_bpe_500/words.txt \
--HLG $repo/data/lang_bpe_500/HLG.pt \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--method $m \
--sample-rate 16000 \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
done
for sym in 1 2 3; do
log "Greedy search with --max-sym-per-frame $sym"
./pruned_transducer_stateless7_ctc_bs/pretrained.py \
--method greedy_search \
--max-sym-per-frame $sym \
--checkpoint $repo/exp/pretrained.pt \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
done
for method in modified_beam_search beam_search fast_beam_search; do
log "$method"
./pruned_transducer_stateless7_ctc_bs/pretrained.py \
--method $method \
--beam-size 4 \
--checkpoint $repo/exp/pretrained.pt \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
done
for m in ctc-decoding 1best; do
./pruned_transducer_stateless7_ctc_bs/pretrained_ctc.py \
--checkpoint $repo/exp/pretrained.pt \
--words-file $repo/data/lang_bpe_500/words.txt \
--HLG $repo/data/lang_bpe_500/HLG.pt \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--method $m \
--sample-rate 16000 \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
done
echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
mkdir -p pruned_transducer_stateless7_ctc_bs/exp
ln -s $PWD/$repo/exp/pretrained.pt pruned_transducer_stateless7_ctc_bs/exp/epoch-999.pt
ln -s $PWD/$repo/data/lang_bpe_500 data/
ls -lh data
ls -lh pruned_transducer_stateless7_ctc_bs/exp
log "Decoding test-clean and test-other"
# use a small value for decoding with CPU
max_duration=100
for method in greedy_search fast_beam_search modified_beam_search; do
log "Decoding with $method"
./pruned_transducer_stateless7_ctc_bs/decode.py \
--decoding-method $method \
--epoch 999 \
--avg 1 \
--use-averaged-model 0 \
--max-duration $max_duration \
--exp-dir pruned_transducer_stateless7_ctc_bs/exp
done
for m in ctc-decoding 1best; do
./pruned_transducer_stateless7_ctc_bs/ctc_decode.py \
--epoch 999 \
--avg 1 \
--exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
--max-duration $max_duration \
--use-averaged-model 0 \
--decoding-method $m \
--hlg-scale 0.6
done
rm pruned_transducer_stateless7_ctc_bs/exp/*.pt
fi

View File

@ -0,0 +1,148 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
cd egs/librispeech/ASR
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
log "Downloading pre-trained model from $repo_url"
git lfs install
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
log "Display test files"
tree $repo/
soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp
git lfs pull --include "data/lang_bpe_500/bpe.model"
git lfs pull --include "exp/cpu_jit.pt"
git lfs pull --include "exp/pretrained.pt"
git lfs pull --include "exp/encoder_jit_trace.pt"
git lfs pull --include "exp/decoder_jit_trace.pt"
git lfs pull --include "exp/joiner_jit_trace.pt"
ln -s pretrained.pt epoch-99.pt
ls -lh *.pt
popd
log "Export to torchscript model"
./pruned_transducer_stateless7_streaming/export.py \
--exp-dir $repo/exp \
--use-averaged-model false \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--decode-chunk-len 32 \
--epoch 99 \
--avg 1 \
--jit 1
ls -lh $repo/exp/*.pt
log "Decode with models exported by torch.jit.script()"
./pruned_transducer_stateless7_streaming/jit_pretrained.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--nn-model-filename $repo/exp/cpu_jit.pt \
--decode-chunk-len 32 \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
log "Export to torchscript model by torch.jit.trace()"
./pruned_transducer_stateless7_streaming/jit_trace_export.py \
--exp-dir $repo/exp \
--use-averaged-model false \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--decode-chunk-len 32 \
--epoch 99 \
--avg 1
log "Decode with models exported by torch.jit.trace()"
./pruned_transducer_stateless7_streaming/jit_trace_pretrained.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--encoder-model-filename $repo/exp/encoder_jit_trace.pt \
--decoder-model-filename $repo/exp/decoder_jit_trace.pt \
--joiner-model-filename $repo/exp/joiner_jit_trace.pt \
--decode-chunk-len 32 \
$repo/test_wavs/1089-134686-0001.wav
for sym in 1 2 3; do
log "Greedy search with --max-sym-per-frame $sym"
./pruned_transducer_stateless7_streaming/pretrained.py \
--method greedy_search \
--max-sym-per-frame $sym \
--checkpoint $repo/exp/pretrained.pt \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--decode-chunk-len 32 \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
done
for method in modified_beam_search beam_search fast_beam_search; do
log "$method"
./pruned_transducer_stateless7_streaming/pretrained.py \
--method $method \
--beam-size 4 \
--checkpoint $repo/exp/pretrained.pt \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--decode-chunk-len 32 \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
done
echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
mkdir -p pruned_transducer_stateless7_streaming/exp
ln -s $PWD/$repo/exp/pretrained.pt pruned_transducer_stateless7_streaming/exp/epoch-999.pt
ln -s $PWD/$repo/data/lang_bpe_500 data/
ls -lh data
ls -lh pruned_transducer_stateless7_streaming/exp
log "Decoding test-clean and test-other"
# use a small value for decoding with CPU
max_duration=100
num_decode_stream=200
for method in greedy_search fast_beam_search modified_beam_search; do
log "decoding with $method"
./pruned_transducer_stateless7_streaming/decode.py \
--decoding-method $method \
--epoch 999 \
--avg 1 \
--use-averaged-model 0 \
--max-duration $max_duration \
--decode-chunk-len 32 \
--exp-dir pruned_transducer_stateless7_streaming/exp
done
for method in greedy_search fast_beam_search modified_beam_search; do
log "Decoding with $method"
./pruned_transducer_stateless7_streaming/streaming_decode.py \
--decoding-method $method \
--epoch 999 \
--avg 1 \
--use-averaged-model 0 \
--decode-chunk-len 32 \
--num-decode-streams $num_decode_stream \
--exp-dir pruned_transducer_stateless7_streaming/exp
done
rm pruned_transducer_stateless7_streaming/exp/*.pt
fi

View File

@ -0,0 +1,103 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
cd egs/librispeech/ASR
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-mmi-2022-12-08
log "Downloading pre-trained model from $repo_url"
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
log "Display test files"
tree $repo/
soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp
git lfs pull --include "data/lang_bpe_500/3gram.pt"
git lfs pull --include "data/lang_bpe_500/4gram.pt"
git lfs pull --include "data/lang_bpe_500/L.pt"
git lfs pull --include "data/lang_bpe_500/LG.pt"
git lfs pull --include "data/lang_bpe_500/Linv.pt"
git lfs pull --include "data/lang_bpe_500/bpe.model"
git lfs pull --include "exp/cpu_jit.pt"
git lfs pull --include "exp/pretrained.pt"
ln -s pretrained.pt epoch-99.pt
ls -lh *.pt
popd
log "Export to torchscript model"
./zipformer_mmi/export.py \
--exp-dir $repo/exp \
--use-averaged-model false \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--epoch 99 \
--avg 1 \
--jit 1
ls -lh $repo/exp/*.pt
log "Decode with models exported by torch.jit.script()"
./zipformer_mmi/jit_pretrained.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--nn-model-filename $repo/exp/cpu_jit.pt \
--lang-dir $repo/data/lang_bpe_500 \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
for method in 1best nbest nbest-rescoring-LG nbest-rescoring-3-gram nbest-rescoring-4-gram; do
log "$method"
./zipformer_mmi/pretrained.py \
--method $method \
--checkpoint $repo/exp/pretrained.pt \
--lang-dir $repo/data/lang_bpe_500 \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
done
echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
mkdir -p zipformer_mmi/exp
ln -s $PWD/$repo/exp/pretrained.pt zipformer_mmi/exp/epoch-999.pt
ln -s $PWD/$repo/data/lang_bpe_500 data/
ls -lh data
ls -lh zipformer_mmi/exp
log "Decoding test-clean and test-other"
# use a small value for decoding with CPU
max_duration=100
for method in 1best nbest nbest-rescoring-LG nbest-rescoring-3-gram nbest-rescoring-4-gram; do
log "Decoding with $method"
./zipformer_mmi/decode.py \
--decoding-method $method \
--epoch 999 \
--avg 1 \
--use-averaged-model 0 \
--nbest-scale 1.2 \
--hp-scale 1.0 \
--max-duration $max_duration \
--lang-dir $repo/data/lang_bpe_500 \
--exp-dir zipformer_mmi/exp
done
rm zipformer_mmi/exp/*.pt
fi

View File

@ -39,7 +39,7 @@ concurrency:
jobs:
run_librispeech_2022_11_11_zipformer:
if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
if: github.event.label.name == 'onnx' || github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
runs-on: ${{ matrix.os }}
strategy:
matrix:

View File

@ -0,0 +1,167 @@
# Copyright 2022 Zengwei Yao
# See ../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: run-librispeech-2022-12-08-zipformer-mmi
# zipformer
on:
push:
branches:
- master
pull_request:
types: [labeled]
schedule:
# minute (0-59)
# hour (0-23)
# day of the month (1-31)
# month (1-12)
# day of the week (0-6)
# nightly build at 15:50 UTC time every day
- cron: "50 15 * * *"
concurrency:
group: run_librispeech_2022_12_08_zipformer-${{ github.ref }}
cancel-in-progress: true
jobs:
run_librispeech_2022_12_08_zipformer:
if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest]
python-version: [3.8]
fail-fast: false
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
cache: 'pip'
cache-dependency-path: '**/requirements-ci.txt'
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
uses: actions/cache@v2
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/install-kaldifeat.sh
- name: Cache LibriSpeech test-clean and test-other datasets
id: libri-test-clean-and-test-other-data
uses: actions/cache@v2
with:
path: |
~/tmp/download
key: cache-libri-test-clean-and-test-other
- name: Download LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
- name: Prepare manifests for LibriSpeech test-clean and test-other
shell: bash
run: |
.github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
- name: Cache LibriSpeech test-clean and test-other fbank features
id: libri-test-clean-and-test-other-fbank
uses: actions/cache@v2
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
- name: Inference with pre-trained model
shell: bash
env:
GITHUB_EVENT_NAME: ${{ github.event_name }}
GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
run: |
mkdir -p egs/librispeech/ASR/data
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
sudo apt-get -qq install git-lfs tree sox
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
.github/scripts/run-librispeech-zipformer-mmi-2022-12-08.sh
- name: Display decoding results for librispeech zipformer-mmi
if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
shell: bash
run: |
cd egs/librispeech/ASR/
tree ./zipformer_mmi/exp
cd zipformer_mmi
echo "results for zipformer_mmi"
echo "===1best==="
find exp/1best -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/1best -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===nbest==="
find exp/nbest -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/nbest -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===nbest-rescoring-LG==="
find exp/nbest-rescoring-LG -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/nbest-rescoring-LG -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===nbest-rescoring-3-gram==="
find exp/nbest-rescoring-3-gram -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/nbest-rescoring-3-gram -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===nbest-rescoring-4-gram==="
find exp/nbest-rescoring-4-gram -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/nbest-rescoring-4-gram -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
- name: Upload decoding results for librispeech zipformer-mmi
uses: actions/upload-artifact@v2
if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
with:
name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-zipformer_mmi-2022-12-08
path: egs/librispeech/ASR/zipformer_mmi/exp/

View File

@ -0,0 +1,163 @@
# Copyright 2022 Fangjun Kuang (csukuangfj@gmail.com)
# See ../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: run-librispeech-2022-12-15-stateless7-ctc-bs
# zipformer
on:
push:
branches:
- master
pull_request:
types: [labeled]
schedule:
# minute (0-59)
# hour (0-23)
# day of the month (1-31)
# month (1-12)
# day of the week (0-6)
# nightly build at 15:50 UTC time every day
- cron: "50 15 * * *"
jobs:
run_librispeech_2022_12_15_zipformer_ctc_bs:
if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event.label.name == 'blank-skip' || github.event_name == 'push' || github.event_name == 'schedule'
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest]
python-version: [3.8]
fail-fast: false
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
cache: 'pip'
cache-dependency-path: '**/requirements-ci.txt'
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
uses: actions/cache@v2
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/install-kaldifeat.sh
- name: Cache LibriSpeech test-clean and test-other datasets
id: libri-test-clean-and-test-other-data
uses: actions/cache@v2
with:
path: |
~/tmp/download
key: cache-libri-test-clean-and-test-other
- name: Download LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
- name: Prepare manifests for LibriSpeech test-clean and test-other
shell: bash
run: |
.github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
- name: Cache LibriSpeech test-clean and test-other fbank features
id: libri-test-clean-and-test-other-fbank
uses: actions/cache@v2
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
- name: Inference with pre-trained model
shell: bash
env:
GITHUB_EVENT_NAME: ${{ github.event_name }}
GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
run: |
mkdir -p egs/librispeech/ASR/data
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
sudo apt-get -qq install git-lfs tree sox
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2022-12-15.sh
- name: Display decoding results for librispeech pruned_transducer_stateless7_ctc_bs
if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
shell: bash
run: |
cd egs/librispeech/ASR/
tree ./pruned_transducer_stateless7_ctc_bs/exp
cd pruned_transducer_stateless7_ctc_bs
echo "results for pruned_transducer_stateless7_ctc_bs"
echo "===greedy search==="
find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===fast_beam_search==="
find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===modified beam search==="
find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===ctc decoding==="
find exp/ctc-decoding -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/ctc-decoding -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===1best==="
find exp/1best -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/1best -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
- name: Upload decoding results for librispeech pruned_transducer_stateless7_ctc_bs
uses: actions/upload-artifact@v2
if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
with:
name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless7-ctc-bs-2022-12-15
path: egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/exp/

View File

@ -0,0 +1,172 @@
# Copyright 2022 Fangjun Kuang (csukuangfj@gmail.com)
# See ../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: run-librispeech-2022-12-29-stateless7-streaming
# zipformer
on:
push:
branches:
- master
pull_request:
types: [labeled]
schedule:
# minute (0-59)
# hour (0-23)
# day of the month (1-31)
# month (1-12)
# day of the week (0-6)
# nightly build at 15:50 UTC time every day
- cron: "50 15 * * *"
concurrency:
group: run_librispeech_2022_12_29_zipformer_streaming-${{ github.ref }}
cancel-in-progress: true
jobs:
run_librispeech_2022_12_29_zipformer_streaming:
if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event.label.name == 'streaming-zipformer' || github.event_name == 'push' || github.event_name == 'schedule'
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest]
python-version: [3.8]
fail-fast: false
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
cache: 'pip'
cache-dependency-path: '**/requirements-ci.txt'
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
uses: actions/cache@v2
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/install-kaldifeat.sh
- name: Cache LibriSpeech test-clean and test-other datasets
id: libri-test-clean-and-test-other-data
uses: actions/cache@v2
with:
path: |
~/tmp/download
key: cache-libri-test-clean-and-test-other
- name: Download LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
- name: Prepare manifests for LibriSpeech test-clean and test-other
shell: bash
run: |
.github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
- name: Cache LibriSpeech test-clean and test-other fbank features
id: libri-test-clean-and-test-other-fbank
uses: actions/cache@v2
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
- name: Inference with pre-trained model
shell: bash
env:
GITHUB_EVENT_NAME: ${{ github.event_name }}
GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
run: |
mkdir -p egs/librispeech/ASR/data
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
sudo apt-get -qq install git-lfs tree sox
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
.github/scripts/run-librispeech-pruned-transducer-stateless7-streaming-2022-12-29.sh
- name: Display decoding results for librispeech pruned_transducer_stateless7_streaming
if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
shell: bash
run: |
cd egs/librispeech/ASR/
tree ./pruned_transducer_stateless7_streaming/exp
cd pruned_transducer_stateless7_streaming
echo "results for pruned_transducer_stateless7_streaming"
echo "===greedy search==="
find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===fast_beam_search==="
find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===modified beam search==="
find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===streaming greedy search==="
find exp/streaming/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/streaming/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===streaming fast_beam_search==="
find exp/streaming/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/streaming/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===streaming modified beam search==="
find exp/streaming/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/streaming/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
- name: Upload decoding results for librispeech pruned_transducer_stateless7_streaming
uses: actions/upload-artifact@v2
if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
with:
name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless7-streaming-2022-12-29
path: egs/librispeech/ASR/pruned_transducer_stateless7_streaming/exp/

View File

@ -139,9 +139,10 @@ jobs:
cd egs/librispeech/ASR
tree lstm_transducer_stateless2/exp
cd lstm_transducer_stateless2/exp
echo "===modified_beam_search_rnnlm_shallow_fusion==="
find modified_beam_search_rnnlm_shallow_fusion -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find modified_beam_search_rnnlm_shallow_fusion -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===modified_beam_search_lm_shallow_fusion==="
echo "===Using RNNLM==="
find modified_beam_search_lm_shallow_fusion -name "log-*rnn*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find modified_beam_search_lm_shallow_fusion -name "log-*rnn*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
- name: Display decoding results for lstm_transducer_stateless2
if: github.event.label.name == 'LODR'
@ -151,8 +152,8 @@ jobs:
tree lstm_transducer_stateless2/exp
cd lstm_transducer_stateless2/exp
echo "===modified_beam_search_rnnlm_LODR==="
find modified_beam_search_rnnlm_LODR -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find modified_beam_search_rnnlm_LODR -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
find modified_beam_search_LODR -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find modified_beam_search_LODR -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
- name: Upload decoding results for lstm_transducer_stateless2
uses: actions/upload-artifact@v2

View File

@ -113,6 +113,9 @@ jobs:
cd ../pruned_transducer_stateless4
pytest -v -s
cd ../pruned_transducer_stateless7
pytest -v -s
cd ../transducer_stateless
pytest -v -s

.gitignore vendored
View File

@ -33,3 +33,4 @@ node_modules
*.param
*.bin
.DS_Store

docs/README.md Normal file
View File

@ -0,0 +1,24 @@
## Usage
```bash
cd /path/to/icefall/docs
pip install -r requirements.txt
make clean
make html
cd build/html
python3 -m http.server 8000
```
It prints:
```
Serving HTTP on 0.0.0.0 port 8000 (http://0.0.0.0:8000/) ...
```
Open your browser and go to <http://0.0.0.0:8000/> to view the generated
documentation.
Done!
**Hint**: You can change the port number when starting the server.
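For example, to serve the generated documentation on port 8080 instead of 8000 (a minimal sketch; any free port works):
```bash
cd /path/to/icefall/docs/build/html
python3 -m http.server 8080
```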

View File

@ -78,3 +78,12 @@ html_context = {
}
todo_include_todos = True
rst_epilog = """
.. _sherpa-ncnn: https://github.com/k2-fsa/sherpa-ncnn
.. _icefall: https://github.com/k2-fsa/icefall
.. _git-lfs: https://git-lfs.com/
.. _ncnn: https://github.com/tencent/ncnn
.. _LibriSpeech: https://www.openslr.org/12
.. _musan: http://www.openslr.org/17/
"""

docs/source/faqs.rst Normal file
View File

@ -0,0 +1,107 @@
Frequently Asked Questions (FAQs)
=================================
In this section, we collect issues reported by users and post the corresponding
solutions.
OSError: libtorch_hip.so: cannot open shared object file: no such file or directory
-----------------------------------------------------------------------------------
One user installed ``torch`` and ``torchaudio`` using the following command:
.. code-block:: bash
pip install \
torch==1.10.0+cu111 \
torchvision==0.11.0+cu111 \
torchaudio==0.10.0 \
-f https://download.pytorch.org/whl/torch_stable.html
and got the following error when running ``tdnn/train.py``:
.. code-block::
OSError: libtorch_hip.so: cannot open shared object file: no such file or directory
The fix is to specify the CUDA version while installing ``torchaudio``. That
is, change ``torchaudio==0.10.0`` to ``torchaudio==0.10.0+cu111``. Therefore,
the correct command is:
.. code-block:: bash
pip install \
torch==1.10.0+cu111 \
torchvision==0.11.0+cu111 \
torchaudio==0.10.0+cu111 \
-f https://download.pytorch.org/whl/torch_stable.html
AttributeError: module 'distutils' has no attribute 'version'
-------------------------------------------------------------
The error log is:
.. code-block::
Traceback (most recent call last):
File "./tdnn/train.py", line 14, in <module>
from asr_datamodule import YesNoAsrDataModule
File "/home/xxx/code/next-gen-kaldi/icefall/egs/yesno/ASR/tdnn/asr_datamodule.py", line 34, in <module>
from icefall.dataset.datamodule import DataModule
File "/home/xxx/code/next-gen-kaldi/icefall/icefall/__init__.py", line 3, in <module>
from . import (
File "/home/xxx/code/next-gen-kaldi/icefall/icefall/decode.py", line 23, in <module>
from icefall.utils import add_eos, add_sos, get_texts
File "/home/xxx/code/next-gen-kaldi/icefall/icefall/utils.py", line 39, in <module>
from torch.utils.tensorboard import SummaryWriter
File "/home/xxx/tool/miniconda3/envs/yyy/lib/python3.8/site-packages/torch/utils/tensorboard/__init__.py", line 4, in <module>
LooseVersion = distutils.version.LooseVersion
AttributeError: module 'distutils' has no attribute 'version'
The fix is:
.. code-block:: bash
pip uninstall setuptools
pip install setuptools==58.0.4
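After reinstalling ``setuptools``, you can optionally confirm that the import chain from the traceback above works again; this check is not part of the fix itself:
.. code-block:: bash
python3 -c "from torch.utils.tensorboard import SummaryWriter; print('ok')"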
ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory
--------------------------------------------------------------------------------------------
If you are using ``conda`` and encounter the following issue:
.. code-block::
Traceback (most recent call last):
File "/k2-dev/yangyifan/anaconda3/envs/icefall/lib/python3.10/site-packages/k2-1.23.3.dev20230112+cuda11.6.torch1.13.1-py3.10-linux-x86_64.egg/k2/__init__.py", line 24, in <module>
from _k2 import DeterminizeWeightPushingType
ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/k2-dev/yangyifan/icefall/egs/librispeech/ASR/./pruned_transducer_stateless7_ctc_bs/decode.py", line 104, in <module>
import k2
File "/k2-dev/yangyifan/anaconda3/envs/icefall/lib/python3.10/site-packages/k2-1.23.3.dev20230112+cuda11.6.torch1.13.1-py3.10-linux-x86_64.egg/k2/__init__.py", line 30, in <module>
raise ImportError(
ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory
Note: If you're using anaconda and importing k2 on MacOS,
you can probably fix this by setting the environment variable:
export DYLD_LIBRARY_PATH=$CONDA_PREFIX/lib/python3.10/site-packages:$DYLD_LIBRARY_PATH
Please first try to find where ``libpython3.10.so.1.0`` is located.
For instance,
.. code-block:: bash
cd $CONDA_PREFIX/lib
find . -name "libpython*"
If you are able to find it inside ``$CONDA_PREFIX/lib``, please set the
following environment variable:
.. code-block:: bash
export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH
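With the variable exported, importing ``k2`` in the same shell should succeed. The one-liner below is only a quick, optional check:
.. code-block:: bash
python3 -c "import k2; print(k2.__file__)"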

View File

@ -21,7 +21,16 @@ speech recognition recipes using `k2 <https://github.com/k2-fsa/k2>`_.
:caption: Contents:
installation/index
faqs
model-export/index
.. toctree::
:maxdepth: 3
recipes/index
.. toctree::
:maxdepth: 2
contributing/index
huggingface/index

View File

@ -0,0 +1,21 @@
2023-01-11 12:15:38,677 INFO [export-for-ncnn.py:220] device: cpu
2023-01-11 12:15:38,681 INFO [export-for-ncnn.py:229] {'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 50, 'reset_interval': 200, 'valid_interval': 3000, 'feature_dim': 80, 'subsampling_factor': 4, 'decoder_dim': 512, 'joiner_dim': 512, 'model_warm_step': 3000, 'env_info': {'k2-version': '1.23.2', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': 'a34171ed85605b0926eebbd0463d059431f4f74a', 'k2-git-date': 'Wed Dec 14 00:06:38 2022', 'lhotse-version': '1.12.0.dev+missing.version.file', 'torch-version': '1.10.0+cu102', 'torch-cuda-available': False, 'torch-cuda-version': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'fix-stateless3-train-2022-12-27', 'icefall-git-sha1': '530e8a1-dirty', 'icefall-git-date': 'Tue Dec 27 13:59:18 2022', 'icefall-path': '/star-fj/fangjun/open-source/icefall', 'k2-path': '/star-fj/fangjun/open-source/k2/k2/python/k2/__init__.py', 'lhotse-path': '/star-fj/fangjun/open-source/lhotse/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-3-1220120619-7695ff496b-s9n4w', 'IP address': '127.0.0.1'}, 'epoch': 30, 'iter': 0, 'avg': 1, 'exp_dir': PosixPath('icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp'), 'bpe_model': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05//data/lang_bpe_500/bpe.model', 'jit': False, 'context_size': 2, 'use_averaged_model': False, 'encoder_dim': 512, 'nhead': 8, 'dim_feedforward': 2048, 'num_encoder_layers': 12, 'cnn_module_kernel': 31, 'left_context_length': 32, 'chunk_length': 32, 'right_context_length': 8, 'memory_size': 32, 'blank_id': 0, 'vocab_size': 500}
2023-01-11 12:15:38,681 INFO [export-for-ncnn.py:231] About to create model
2023-01-11 12:15:40,053 INFO [checkpoint.py:112] Loading checkpoint from icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/epoch-30.pt
2023-01-11 12:15:40,708 INFO [export-for-ncnn.py:315] Number of model parameters: 75490012
2023-01-11 12:15:41,681 INFO [export-for-ncnn.py:318] Using torch.jit.trace()
2023-01-11 12:15:41,681 INFO [export-for-ncnn.py:320] Exporting encoder
2023-01-11 12:15:41,682 INFO [export-for-ncnn.py:149] chunk_length: 32, right_context_length: 8

View File

@ -0,0 +1,104 @@
Don't Use GPU. has_gpu: 0, config.use_vulkan_compute: 1
num encoder conv layers: 88
num joiner conv layers: 3
num files: 3
Processing ../test_wavs/1089-134686-0001.wav
Processing ../test_wavs/1221-135766-0001.wav
Processing ../test_wavs/1221-135766-0002.wav
Processing ../test_wavs/1089-134686-0001.wav
Processing ../test_wavs/1221-135766-0001.wav
Processing ../test_wavs/1221-135766-0002.wav
----------encoder----------
conv_87 : max = 15.942385 threshold = 15.938493 scale = 7.968131
conv_88 : max = 35.442448 threshold = 15.549335 scale = 8.167552
conv_89 : max = 23.228289 threshold = 8.001738 scale = 15.871552
linear_90 : max = 3.976146 threshold = 1.101789 scale = 115.267128
linear_91 : max = 6.962030 threshold = 5.162033 scale = 24.602713
linear_92 : max = 12.323041 threshold = 3.853959 scale = 32.953129
linear_94 : max = 6.905416 threshold = 4.648006 scale = 27.323545
linear_93 : max = 6.905416 threshold = 5.474093 scale = 23.200188
linear_95 : max = 1.888012 threshold = 1.403563 scale = 90.483986
linear_96 : max = 6.856741 threshold = 5.398679 scale = 23.524273
linear_97 : max = 9.635942 threshold = 2.613655 scale = 48.590950
linear_98 : max = 6.460340 threshold = 5.670146 scale = 22.398010
linear_99 : max = 9.532276 threshold = 2.585537 scale = 49.119396
linear_101 : max = 6.585871 threshold = 5.719224 scale = 22.205809
linear_100 : max = 6.585871 threshold = 5.751382 scale = 22.081648
linear_102 : max = 1.593344 threshold = 1.450581 scale = 87.551147
linear_103 : max = 6.592681 threshold = 5.705824 scale = 22.257959
linear_104 : max = 8.752957 threshold = 1.980955 scale = 64.110489
linear_105 : max = 6.696240 threshold = 5.877193 scale = 21.608953
linear_106 : max = 9.059659 threshold = 2.643138 scale = 48.048950
linear_108 : max = 6.975461 threshold = 4.589567 scale = 27.671457
linear_107 : max = 6.975461 threshold = 6.190381 scale = 20.515701
linear_109 : max = 3.710759 threshold = 2.305635 scale = 55.082436
linear_110 : max = 7.531228 threshold = 5.731162 scale = 22.159557
linear_111 : max = 10.528083 threshold = 2.259322 scale = 56.211544
linear_112 : max = 8.148807 threshold = 5.500842 scale = 23.087374
linear_113 : max = 8.592566 threshold = 1.948851 scale = 65.166611
linear_115 : max = 8.437109 threshold = 5.608947 scale = 22.642395
linear_114 : max = 8.437109 threshold = 6.193942 scale = 20.503904
linear_116 : max = 3.966980 threshold = 3.200896 scale = 39.676392
linear_117 : max = 9.451303 threshold = 6.061664 scale = 20.951344
linear_118 : max = 12.077262 threshold = 3.965800 scale = 32.023804
linear_119 : max = 9.671615 threshold = 4.847613 scale = 26.198460
linear_120 : max = 8.625638 threshold = 3.131427 scale = 40.556595
linear_122 : max = 10.274080 threshold = 4.888716 scale = 25.978189
linear_121 : max = 10.274080 threshold = 5.420480 scale = 23.429659
linear_123 : max = 4.826197 threshold = 3.599617 scale = 35.281532
linear_124 : max = 11.396383 threshold = 7.325849 scale = 17.335875
linear_125 : max = 9.337198 threshold = 3.941410 scale = 32.221970
linear_126 : max = 9.699965 threshold = 4.842878 scale = 26.224073
linear_127 : max = 8.775370 threshold = 3.884215 scale = 32.696438
linear_129 : max = 9.872276 threshold = 4.837319 scale = 26.254213
linear_128 : max = 9.872276 threshold = 7.180057 scale = 17.687883
linear_130 : max = 4.150427 threshold = 3.454298 scale = 36.765789
linear_131 : max = 11.112692 threshold = 7.924847 scale = 16.025545
linear_132 : max = 11.852893 threshold = 3.116593 scale = 40.749626
linear_133 : max = 11.517084 threshold = 5.024665 scale = 25.275314
linear_134 : max = 10.683807 threshold = 3.878618 scale = 32.743618
linear_136 : max = 12.421055 threshold = 6.322729 scale = 20.086264
linear_135 : max = 12.421055 threshold = 5.309880 scale = 23.917679
linear_137 : max = 4.827781 threshold = 3.744595 scale = 33.915554
linear_138 : max = 14.422395 threshold = 7.742882 scale = 16.402161
linear_139 : max = 8.527538 threshold = 3.866123 scale = 32.849449
linear_140 : max = 12.128619 threshold = 4.657793 scale = 27.266134
linear_141 : max = 9.839593 threshold = 3.845993 scale = 33.021378
linear_143 : max = 12.442304 threshold = 7.099039 scale = 17.889746
linear_142 : max = 12.442304 threshold = 5.325038 scale = 23.849592
linear_144 : max = 5.929444 threshold = 5.618206 scale = 22.605080
linear_145 : max = 13.382126 threshold = 9.321095 scale = 13.625010
linear_146 : max = 9.894987 threshold = 3.867645 scale = 32.836517
linear_147 : max = 10.915313 threshold = 4.906028 scale = 25.886522
linear_148 : max = 9.614287 threshold = 3.908151 scale = 32.496181
linear_150 : max = 11.724932 threshold = 4.485588 scale = 28.312899
linear_149 : max = 11.724932 threshold = 5.161146 scale = 24.606939
linear_151 : max = 7.164453 threshold = 5.847355 scale = 21.719223
linear_152 : max = 13.086471 threshold = 5.984121 scale = 21.222834
linear_153 : max = 11.099524 threshold = 3.991601 scale = 31.816805
linear_154 : max = 10.054585 threshold = 4.489706 scale = 28.286930
linear_155 : max = 12.389185 threshold = 3.100321 scale = 40.963501
linear_157 : max = 9.982999 threshold = 5.154796 scale = 24.637253
linear_156 : max = 9.982999 threshold = 8.537706 scale = 14.875190
linear_158 : max = 8.420287 threshold = 6.502287 scale = 19.531588
linear_159 : max = 25.014746 threshold = 9.423280 scale = 13.477261
linear_160 : max = 45.633553 threshold = 5.715335 scale = 22.220921
linear_161 : max = 20.371849 threshold = 5.117830 scale = 24.815203
linear_162 : max = 12.492933 threshold = 3.126283 scale = 40.623318
linear_164 : max = 20.697504 threshold = 4.825712 scale = 26.317358
linear_163 : max = 20.697504 threshold = 5.078367 scale = 25.008038
linear_165 : max = 9.023975 threshold = 6.836278 scale = 18.577358
linear_166 : max = 34.860619 threshold = 7.259792 scale = 17.493614
linear_167 : max = 30.380934 threshold = 5.496160 scale = 23.107042
linear_168 : max = 20.691216 threshold = 4.733317 scale = 26.831076
linear_169 : max = 9.723948 threshold = 3.952728 scale = 32.129707
linear_171 : max = 21.034811 threshold = 5.366547 scale = 23.665123
linear_170 : max = 21.034811 threshold = 5.356277 scale = 23.710501
linear_172 : max = 10.556884 threshold = 5.729481 scale = 22.166058
linear_173 : max = 20.033039 threshold = 10.207264 scale = 12.442120
linear_174 : max = 11.597379 threshold = 2.658676 scale = 47.768131
----------joiner----------
linear_2 : max = 19.293503 threshold = 14.305265 scale = 8.877850
linear_1 : max = 10.812222 threshold = 8.766452 scale = 14.487047
linear_3 : max = 0.999999 threshold = 0.999755 scale = 127.031174
ncnn int8 calibration table create success, best wish for your int8 inference has a low accuracy loss...\(^0^)/...233...

View File

@ -0,0 +1,7 @@
2023-01-11 14:02:12,216 INFO [streaming-ncnn-decode.py:320] {'tokens': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/data/lang_bpe_500/tokens.txt', 'encoder_param_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param', 'encoder_bin_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin', 'decoder_param_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param', 'decoder_bin_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin', 'joiner_param_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param', 'joiner_bin_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin', 'sound_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav'}
T 51 32
2023-01-11 14:02:13,141 INFO [streaming-ncnn-decode.py:328] Constructing Fbank computer
2023-01-11 14:02:13,151 INFO [streaming-ncnn-decode.py:331] Reading sound files: ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav
2023-01-11 14:02:13,176 INFO [streaming-ncnn-decode.py:336] torch.Size([106000])
2023-01-11 14:02:17,581 INFO [streaming-ncnn-decode.py:380] ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav
2023-01-11 14:02:17,581 INFO [streaming-ncnn-decode.py:381] AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS

View File

@ -1,12 +1,771 @@
Export to ncnn
==============
We support exporting LSTM transducer models to `ncnn <https://github.com/tencent/ncnn>`_.
Please refer to :ref:`export-model-for-ncnn` for details.
We support exporting both
`LSTM transducer models <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/lstm_transducer_stateless2>`_
and
`ConvEmformer transducer models <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/conv_emformer_transducer_stateless2>`_
to `ncnn <https://github.com/tencent/ncnn>`_.
We also provide `<https://github.com/k2-fsa/sherpa-ncnn>`_ for
performing speech recognition using ``ncnn`` with exported models.
It has been tested on Linux, macOS, Windows, and Raspberry Pi. The project is
self-contained and can be statically linked to produce a binary containing
everything needed.
It has been tested on Linux, macOS, Windows, ``Android``, and ``Raspberry Pi``.
`sherpa-ncnn`_ is self-contained and can be statically linked to produce
a binary containing everything needed. Please refer
to its documentation for details:
- `<https://k2-fsa.github.io/sherpa/ncnn/index.html>`_
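For reference, building `sherpa-ncnn`_ from source is a standard CMake build along the lines of the sketch below; treat it as a rough outline and follow the linked documentation for the authoritative steps:
.. code-block:: bash
git clone https://github.com/k2-fsa/sherpa-ncnn
cd sherpa-ncnn
mkdir build
cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j4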
Export LSTM transducer models
-----------------------------
Please refer to :ref:`export-lstm-transducer-model-for-ncnn` for details.
Export ConvEmformer transducer models
-------------------------------------
We use the pre-trained model from the following repository as an example:
- `<https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>`_
We will show you step by step how to export it to `ncnn`_ and run it with `sherpa-ncnn`_.
.. hint::
We use ``Ubuntu 18.04``, ``torch 1.10``, and ``Python 3.8`` for testing.
.. caution::
Please use a more recent version of PyTorch. For instance, ``torch 1.8``
may ``not`` work.
1. Download the pre-trained model
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. hint::
You can also refer to `<https://k2-fsa.github.io/sherpa/cpp/pretrained_models/online_transducer.html#icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>`_ to download the pre-trained model.
You have to install `git-lfs`_ before you continue.
.. code-block:: bash
cd egs/librispeech/ASR
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt"
git lfs pull --include "data/lang_bpe_500/bpe.model"
cd ..
.. note::
We download ``exp/pretrained-xxx.pt``, not ``exp/cpu-jit_xxx.pt``.
In the above code, we download the pre-trained model into the directory
``egs/librispeech/ASR/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05``.
2. Install ncnn and pnnx
^^^^^^^^^^^^^^^^^^^^^^^^
.. code-block:: bash
# We put ncnn into $HOME/open-source/ncnn
# You can change it to anywhere you like
cd $HOME
mkdir -p open-source
cd open-source
git clone https://github.com/csukuangfj/ncnn
cd ncnn
git submodule update --recursive --init
# Note: We don't use "python setup.py install" or "pip install ." here
mkdir -p build-wheel
cd build-wheel
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DNCNN_PYTHON=ON \
-DNCNN_BUILD_BENCHMARK=OFF \
-DNCNN_BUILD_EXAMPLES=OFF \
-DNCNN_BUILD_TOOLS=ON \
..
make -j4
cd ..
# Note: $PWD here is $HOME/open-source/ncnn
export PYTHONPATH=$PWD/python:$PYTHONPATH
export PATH=$PWD/tools/pnnx/build/src:$PATH
export PATH=$PWD/build-wheel/tools/quantize:$PATH
# Now build pnnx
cd tools/pnnx
mkdir build
cd build
cmake ..
make -j4
./src/pnnx
Congratulations! You have successfully installed the following components:
- ``pnnx``, which is an executable located in
``$HOME/open-source/ncnn/tools/pnnx/build/src``. We will use
it to convert models exported by ``torch.jit.trace()``.
- ``ncnn2int8``, which is an executable located in
``$HOME/open-source/ncnn/build-wheel/tools/quantize``. We will use
it to quantize our models to ``int8``.
- ``ncnn.cpython-38-x86_64-linux-gnu.so``, which is a Python module located
in ``$HOME/open-source/ncnn/python/ncnn``.
.. note::
I am using ``Python 3.8``, so it
is ``ncnn.cpython-38-x86_64-linux-gnu.so``. If you use a different
version, say, ``Python 3.9``, the name would be
``ncnn.cpython-39-x86_64-linux-gnu.so``.
Also, if you are not using Linux, the file name would also be different.
But that does not matter. As long as you can compile it, it should work.
We have set up ``PYTHONPATH`` so that you can use ``import ncnn`` in your
Python code. We have also set up ``PATH`` so that you can use
``pnnx`` and ``ncnn2int8`` later in your terminal.
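Before continuing, you can optionally verify that everything is visible from the current shell; these commands rely only on the ``PATH`` and ``PYTHONPATH`` settings above:
.. code-block:: bash
which pnnx
which ncnn2int8
python3 -c "import ncnn; print(ncnn.__file__)"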
.. caution::
Please don't use `<https://github.com/tencent/ncnn>`_.
We have made some modifications to the official `ncnn`_.
We will synchronize `<https://github.com/csukuangfj/ncnn>`_ periodically
with the official one.
3. Export the model via torch.jit.trace()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
First, let us rename our pre-trained model:
.. code-block::
cd egs/librispeech/ASR
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp
ln -s pretrained-epoch-30-avg-10-averaged.pt epoch-30.pt
cd ../..
Next, we use the following code to export our model:
.. code-block:: bash
dir=./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/
./conv_emformer_transducer_stateless2/export-for-ncnn.py \
--exp-dir $dir/exp \
--bpe-model $dir/data/lang_bpe_500/bpe.model \
--epoch 30 \
--avg 1 \
--use-averaged-model 0 \
\
--num-encoder-layers 12 \
--chunk-length 32 \
--cnn-module-kernel 31 \
--left-context-length 32 \
--right-context-length 8 \
--memory-size 32 \
--encoder-dim 512
.. hint::
We have renamed our model to ``epoch-30.pt`` so that we can use ``--epoch 30``.
There is only one pre-trained model, so we use ``--avg 1 --use-averaged-model 0``.
If you have trained a model by yourself and if you have all checkpoints
available, please first use ``decode.py`` to tune ``--epoch --avg``
and select the best combination with ``--use-averaged-model 1``.
.. note::
You will see the following log output:
.. literalinclude:: ./code/export-conv-emformer-transducer-for-ncnn-output.txt
The log shows the model has ``75490012`` parameters, i.e., ``~75 M``.
.. code-block::
ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/pretrained-epoch-30-avg-10-averaged.pt
-rw-r--r-- 1 kuangfangjun root 289M Jan 11 12:05 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/pretrained-epoch-30-avg-10-averaged.pt
You can see that the file size of the pre-trained model is ``289 MB``, which
is roughly ``75490012*4/1024/1024 = 287.97 MB``.
After running ``conv_emformer_transducer_stateless2/export-for-ncnn.py``,
we will get the following files:
.. code-block:: bash
ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*pnnx*
-rw-r--r-- 1 kuangfangjun root 1010K Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.pt
-rw-r--r-- 1 kuangfangjun root 283M Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.pt
-rw-r--r-- 1 kuangfangjun root 3.0M Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.pt
.. _conv-emformer-step-3-export-torchscript-model-via-pnnx:
4. Export torchscript model via pnnx
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. hint::
Make sure you have set up the ``PATH`` environment variable. Otherwise,
it will throw an error saying that ``pnnx`` could not be found.
Now, it's time to export our models to `ncnn`_ via ``pnnx``.
.. code-block::
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
pnnx ./encoder_jit_trace-pnnx.pt
pnnx ./decoder_jit_trace-pnnx.pt
pnnx ./joiner_jit_trace-pnnx.pt
It will generate the following files:
.. code-block:: bash
ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*ncnn*{bin,param}
-rw-r--r-- 1 kuangfangjun root 503K Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin
-rw-r--r-- 1 kuangfangjun root 437 Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param
-rw-r--r-- 1 kuangfangjun root 142M Jan 11 12:36 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin
-rw-r--r-- 1 kuangfangjun root 79K Jan 11 12:36 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param
-rw-r--r-- 1 kuangfangjun root 1.5M Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin
-rw-r--r-- 1 kuangfangjun root 488 Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param
There are two types of files:
- ``param``: It is a text file containing the model architectures. You can
use a text editor to view its content.
- ``bin``: It is a binary file containing the model parameters.
We compare the file sizes of the models below before and after converting via ``pnnx``:
.. see https://tableconvert.com/restructuredtext-generator
+----------------------------------+------------+
| File name | File size |
+==================================+============+
| encoder_jit_trace-pnnx.pt | 283 MB |
+----------------------------------+------------+
| decoder_jit_trace-pnnx.pt | 1010 KB |
+----------------------------------+------------+
| joiner_jit_trace-pnnx.pt | 3.0 MB |
+----------------------------------+------------+
| encoder_jit_trace-pnnx.ncnn.bin | 142 MB |
+----------------------------------+------------+
| decoder_jit_trace-pnnx.ncnn.bin | 503 KB |
+----------------------------------+------------+
| joiner_jit_trace-pnnx.ncnn.bin | 1.5 MB |
+----------------------------------+------------+
You can see that the file sizes of the models after conversion are about half
of those before conversion:
- encoder: 283 MB vs 142 MB
- decoder: 1010 KB vs 503 KB
- joiner: 3.0 MB vs 1.5 MB
The reason is that by default ``pnnx`` converts ``float32`` parameters
to ``float16``. A ``float32`` parameter occupies 4 bytes, while a ``float16``
parameter occupies only 2 bytes. Thus, the converted models are roughly half the size.
.. hint::
If you use ``pnnx ./encoder_jit_trace-pnnx.pt fp16=0``, then ``pnnx``
won't convert ``float32`` to ``float16``.
5. Test the exported models in icefall
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. note::
We assume you have set up the environment variable ``PYTHONPATH`` when
building `ncnn`_.
Now we have successfully converted our pre-trained model to `ncnn`_ format.
The generated 6 files are what we need. You can use the following code to
test the converted models:
.. code-block:: bash
./conv_emformer_transducer_stateless2/streaming-ncnn-decode.py \
--tokens ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/data/lang_bpe_500/tokens.txt \
--encoder-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param \
--encoder-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin \
--decoder-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param \
--decoder-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin \
--joiner-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param \
--joiner-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin \
./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav
.. hint::
`ncnn`_ supports only ``batch size == 1``, so ``streaming-ncnn-decode.py`` accepts
only 1 wave file as input.
The output is given below:
.. literalinclude:: ./code/test-stremaing-ncnn-decode-conv-emformer-transducer-libri.txt
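Since each invocation accepts a single wave file, you can decode the remaining test waves with a simple loop. The following is a sketch that merely reuses the command above; ``$repo`` is a shell variable introduced here for brevity:
.. code-block:: bash
repo=./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
for wav in $repo/test_wavs/*.wav; do
./conv_emformer_transducer_stateless2/streaming-ncnn-decode.py \
--tokens $repo/data/lang_bpe_500/tokens.txt \
--encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
--encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
--decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
--decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
--joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
--joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
$wav
done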
Congratulations! You have successfully exported a model from PyTorch to `ncnn`_!
.. _conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn:
6. Modify the exported encoder for sherpa-ncnn
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In order to use the exported models in `sherpa-ncnn`_, we have to modify
``encoder_jit_trace-pnnx.ncnn.param``.
Let us have a look at the first few lines of ``encoder_jit_trace-pnnx.ncnn.param``:
.. code-block::
7767517
1060 1342
Input in0 0 1 in0
**Explanation** of the above three lines:
1. ``7767517``, it is a magic number and should not be changed.
2. ``1060 1342``, the first number ``1060`` specifies the number of layers
in this file, while ``1342`` specifies the number of intermediate outputs
of this file.
3. ``Input in0 0 1 in0``, ``Input`` is the layer type of this layer; ``in0``
is the layer name of this layer; ``0`` means this layer has no input;
``1`` means this layer has one output; ``in0`` is the output name of
this layer.
We need to add 1 extra line and also increment the number of layers.
The result looks like below:
.. code-block:: bash
7767517
1061 1342
SherpaMetaData sherpa_meta_data1 0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512
Input in0 0 1 in0
**Explanation**
1. ``7767517``, it is still the same
2. ``1061 1342``, we have added an extra layer, so we need to update ``1060`` to ``1061``.
We don't need to change ``1342`` since the newly added layer has no inputs or outputs.
3. ``SherpaMetaData sherpa_meta_data1 0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512``
This line is newly added. Its explanation is given below:
- ``SherpaMetaData`` is the type of this layer. Must be ``SherpaMetaData``.
- ``sherpa_meta_data1`` is the name of this layer. Must be ``sherpa_meta_data1``.
- ``0 0`` means this layer has no inputs or outputs. Must be ``0 0``.
- ``0=1``, 0 is the key and 1 is the value. Must be ``0=1``.
- ``1=12``, 1 is the key and 12 is the value of the
parameter ``--num-encoder-layers`` that you provided when running
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
- ``2=32``, 2 is the key and 32 is the value of the
parameter ``--memory-size`` that you provided when running
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
- ``3=31``, 3 is the key and 31 is the value of the
parameter ``--cnn-module-kernel`` that you provided when running
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
- ``4=8``, 4 is the key and 8 is the value of the
parameter ``--left-context-length`` that you provided when running
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
- ``5=32``, 5 is the key and 32 is the value of the
parameter ``--chunk-length`` that you provided when running
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
- ``6=8``, 6 is the key and 8 is the value of the
parameter ``--right-context-length`` that you provided when running
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
- ``7=512``, 7 is the key and 512 is the value of the
parameter ``--encoder-dim`` that you provided when running
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
For ease of reference, we list the key-value pairs that you need to add
in the following table. If your model has a different setting, please
change the values for ``SherpaMetaData`` accordingly. Otherwise, you
will be ``SAD``.
+------+-----------------------------+
| key | value |
+======+=============================+
| 0 | 1 (fixed) |
+------+-----------------------------+
| 1 | ``--num-encoder-layers`` |
+------+-----------------------------+
| 2 | ``--memory-size`` |
+------+-----------------------------+
| 3 | ``--cnn-module-kernel`` |
+------+-----------------------------+
| 4 | ``--left-context-length`` |
+------+-----------------------------+
| 5 | ``--chunk-length`` |
+------+-----------------------------+
| 6 | ``--right-context-length`` |
+------+-----------------------------+
| 7 | ``--encoder-dim`` |
+------+-----------------------------+
4. ``Input in0 0 1 in0``. No need to change it.
.. caution::
When you add a new layer ``SherpaMetaData``, please remember to update the
number of layers. In our case, update ``1060`` to ``1061``. Otherwise,
you will be SAD later.
.. hint::
After adding the new layer ``SherpaMetaData``, you cannot use this model
with ``streaming-ncnn-decode.py`` anymore since ``SherpaMetaData`` is
supported only in `sherpa-ncnn`_.
.. hint::
`ncnn`_ is very flexible. You can add new layers to it just by text-editing
the ``param`` file! You don't need to change the ``bin`` file.
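If you prefer to script the edit instead of using a text editor, the following is a minimal sketch. It assumes the header layout shown above (magic number on line 1, layer and blob counts on line 2, ``Input in0`` on line 3) and uses the ``SherpaMetaData`` values of this tutorial, so adjust them to your own model:
.. code-block:: bash
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
cp encoder_jit_trace-pnnx.ncnn.param encoder_jit_trace-pnnx.ncnn.param.bak
awk 'NR==2 {print $1+1, $2; next}
NR==3 {print "SherpaMetaData sherpa_meta_data1 0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512"}
{print}' encoder_jit_trace-pnnx.ncnn.param.bak > encoder_jit_trace-pnnx.ncnn.param
head -3 encoder_jit_trace-pnnx.ncnn.param  # verify the new header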
Now you can use this model in `sherpa-ncnn`_.
Please refer to the following documentation:
- Linux/macOS/Windows/arm/aarch64: `<https://k2-fsa.github.io/sherpa/ncnn/install/index.html>`_
- Android: `<https://k2-fsa.github.io/sherpa/ncnn/android/index.html>`_
- Python: `<https://k2-fsa.github.io/sherpa/ncnn/python/index.html>`_
We have a list of pre-trained models that have been exported for `sherpa-ncnn`_:
- `<https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html>`_
You can find more usages there.
7. (Optional) int8 quantization with sherpa-ncnn
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This step is optional.
In this step, we describe how to quantize our model with ``int8``.
Change :ref:`conv-emformer-step-3-export-torchscript-model-via-pnnx` to
disable ``fp16`` when using ``pnnx``:
.. code-block::
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
pnnx ./encoder_jit_trace-pnnx.pt fp16=0
pnnx ./decoder_jit_trace-pnnx.pt
pnnx ./joiner_jit_trace-pnnx.pt fp16=0
.. note::
We add ``fp16=0`` when exporting the encoder and joiner. `ncnn`_ does not
support quantizing the decoder model yet. We will update this documentation
once `ncnn`_ supports it (perhaps later in 2023).
It will generate the following files
.. code-block:: bash
ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*_jit_trace-pnnx.ncnn.{param,bin}
-rw-r--r-- 1 kuangfangjun root 503K Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin
-rw-r--r-- 1 kuangfangjun root 437 Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param
-rw-r--r-- 1 kuangfangjun root 283M Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin
-rw-r--r-- 1 kuangfangjun root 79K Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param
-rw-r--r-- 1 kuangfangjun root 3.0M Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin
-rw-r--r-- 1 kuangfangjun root 488 Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param
Let us compare again the file sizes:
+----------------------------------------+------------+
| File name | File size |
+----------------------------------------+------------+
| encoder_jit_trace-pnnx.pt | 283 MB |
+----------------------------------------+------------+
| decoder_jit_trace-pnnx.pt | 1010 KB |
+----------------------------------------+------------+
| joiner_jit_trace-pnnx.pt | 3.0 MB |
+----------------------------------------+------------+
| encoder_jit_trace-pnnx.ncnn.bin (fp16) | 142 MB |
+----------------------------------------+------------+
| decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB |
+----------------------------------------+------------+
| joiner_jit_trace-pnnx.ncnn.bin (fp16) | 1.5 MB |
+----------------------------------------+------------+
| encoder_jit_trace-pnnx.ncnn.bin (fp32) | 283 MB |
+----------------------------------------+------------+
| joiner_jit_trace-pnnx.ncnn.bin (fp32) | 3.0 MB |
+----------------------------------------+------------+
You can see that the file sizes are doubled when we disable ``fp16``.
.. note::
You can again use ``streaming-ncnn-decode.py`` to test the exported models.
Next, follow :ref:`conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn`
to modify ``encoder_jit_trace-pnnx.ncnn.param``.
Change
.. code-block:: bash
7767517
1060 1342
Input in0 0 1 in0
to
.. code-block:: bash
7767517
1061 1342
SherpaMetaData sherpa_meta_data1 0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512
Input in0 0 1 in0
.. caution::
Please follow :ref:`conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn`
to change the values for ``SherpaMetaData`` if your model uses a different setting.
Next, let us compile `sherpa-ncnn`_ since we will quantize our models within
`sherpa-ncnn`_.
.. code-block:: bash
# We will download sherpa-ncnn to $HOME/open-source/
# You can change it to anywhere you like.
cd $HOME
mkdir -p open-source
cd open-source
git clone https://github.com/k2-fsa/sherpa-ncnn
cd sherpa-ncnn
mkdir build
cd build
cmake ..
make -j 4
./bin/generate-int8-scale-table
export PATH=$HOME/open-source/sherpa-ncnn/build/bin:$PATH
The output of the above commands is:
.. code-block:: bash
(py38) kuangfangjun:build$ generate-int8-scale-table
Please provide 10 arg. Currently given: 1
Usage:
generate-int8-scale-table encoder.param encoder.bin decoder.param decoder.bin joiner.param joiner.bin encoder-scale-table.txt joiner-scale-table.txt wave_filenames.txt
Each line in wave_filenames.txt is a path to some 16k Hz mono wave file.
We need to create a file ``wave_filenames.txt`` containing some calibration wave files.
For testing purposes, we use the ``test_wavs`` from the pre-trained model repository
`<https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>`_:
.. code-block:: bash
cd egs/librispeech/ASR
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
cat <<EOF > wave_filenames.txt
../test_wavs/1089-134686-0001.wav
../test_wavs/1221-135766-0001.wav
../test_wavs/1221-135766-0002.wav
EOF
Now we can calculate the scales needed for quantization with the calibration data:
.. code-block:: bash
cd egs/librispeech/ASR
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
generate-int8-scale-table \
./encoder_jit_trace-pnnx.ncnn.param \
./encoder_jit_trace-pnnx.ncnn.bin \
./decoder_jit_trace-pnnx.ncnn.param \
./decoder_jit_trace-pnnx.ncnn.bin \
./joiner_jit_trace-pnnx.ncnn.param \
./joiner_jit_trace-pnnx.ncnn.bin \
./encoder-scale-table.txt \
./joiner-scale-table.txt \
./wave_filenames.txt
The output logs are in the following:
.. literalinclude:: ./code/generate-int-8-scale-table-for-conv-emformer.txt
It generates the following two files:
.. code-block:: bash
$ ls -lh encoder-scale-table.txt joiner-scale-table.txt
-rw-r--r-- 1 kuangfangjun root 955K Jan 11 17:28 encoder-scale-table.txt
-rw-r--r-- 1 kuangfangjun root 18K Jan 11 17:28 joiner-scale-table.txt
.. caution::
For real use, you definitely need more calibration data to compute the scale table.
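For instance, if you have a directory of your own 16 kHz mono wave files, you can build a larger calibration list like this (a sketch; ``/path/to/calibration_wavs`` is a placeholder you need to replace):
.. code-block:: bash
find /path/to/calibration_wavs -name '*.wav' > wave_filenames.txt
wc -l wave_filenames.txt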
Finally, let us use the scale table to quantize our models into ``int8``.
.. code-block:: bash
ncnn2int8
usage: ncnn2int8 [inparam] [inbin] [outparam] [outbin] [calibration table]
First, we quantize the encoder model:
.. code-block:: bash
cd egs/librispeech/ASR
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
ncnn2int8 \
./encoder_jit_trace-pnnx.ncnn.param \
./encoder_jit_trace-pnnx.ncnn.bin \
./encoder_jit_trace-pnnx.ncnn.int8.param \
./encoder_jit_trace-pnnx.ncnn.int8.bin \
./encoder-scale-table.txt
Next, we quantize the joiner model:
.. code-block:: bash
ncnn2int8 \
./joiner_jit_trace-pnnx.ncnn.param \
./joiner_jit_trace-pnnx.ncnn.bin \
./joiner_jit_trace-pnnx.ncnn.int8.param \
./joiner_jit_trace-pnnx.ncnn.int8.bin \
./joiner-scale-table.txt
The above two commands generate the following 4 files:
.. code-block:: bash
-rw-r--r-- 1 kuangfangjun root 99M Jan 11 17:34 encoder_jit_trace-pnnx.ncnn.int8.bin
-rw-r--r-- 1 kuangfangjun root 78K Jan 11 17:34 encoder_jit_trace-pnnx.ncnn.int8.param
-rw-r--r-- 1 kuangfangjun root 774K Jan 11 17:35 joiner_jit_trace-pnnx.ncnn.int8.bin
-rw-r--r-- 1 kuangfangjun root 496 Jan 11 17:35 joiner_jit_trace-pnnx.ncnn.int8.param
Congratulations! You have successfully quantized your model from ``float32`` to ``int8``.
.. caution::
``ncnn.int8.param`` and ``ncnn.int8.bin`` must be used in pairs.
You can replace ``ncnn.param`` and ``ncnn.bin`` with ``ncnn.int8.param``
and ``ncnn.int8.bin`` in `sherpa-ncnn`_ if you like.
For instance, to use only the ``int8`` encoder in ``sherpa-ncnn``, you can
replace the following invocation:
.. code-block::
cd egs/librispeech/ASR
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
sherpa-ncnn \
../data/lang_bpe_500/tokens.txt \
./encoder_jit_trace-pnnx.ncnn.param \
./encoder_jit_trace-pnnx.ncnn.bin \
./decoder_jit_trace-pnnx.ncnn.param \
./decoder_jit_trace-pnnx.ncnn.bin \
./joiner_jit_trace-pnnx.ncnn.param \
./joiner_jit_trace-pnnx.ncnn.bin \
../test_wavs/1089-134686-0001.wav
with
.. code-block::
cd egs/librispeech/ASR
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
sherpa-ncnn \
../data/lang_bpe_500/tokens.txt \
./encoder_jit_trace-pnnx.ncnn.int8.param \
./encoder_jit_trace-pnnx.ncnn.int8.bin \
./decoder_jit_trace-pnnx.ncnn.param \
./decoder_jit_trace-pnnx.ncnn.bin \
./joiner_jit_trace-pnnx.ncnn.param \
./joiner_jit_trace-pnnx.ncnn.bin \
../test_wavs/1089-134686-0001.wav
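Similarly, you can also use the ``int8`` joiner by pointing the joiner arguments at the ``int8`` files; only the decoder has no ``int8`` variant. A sketch:
.. code-block:: bash
sherpa-ncnn \
../data/lang_bpe_500/tokens.txt \
./encoder_jit_trace-pnnx.ncnn.int8.param \
./encoder_jit_trace-pnnx.ncnn.int8.bin \
./decoder_jit_trace-pnnx.ncnn.param \
./decoder_jit_trace-pnnx.ncnn.bin \
./joiner_jit_trace-pnnx.ncnn.int8.param \
./joiner_jit_trace-pnnx.ncnn.int8.bin \
../test_wavs/1089-134686-0001.wav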
The following table compares again the file sizes:
+----------------------------------------+------------+
| File name | File size |
+----------------------------------------+------------+
| encoder_jit_trace-pnnx.pt | 283 MB |
+----------------------------------------+------------+
| decoder_jit_trace-pnnx.pt | 1010 KB |
+----------------------------------------+------------+
| joiner_jit_trace-pnnx.pt | 3.0 MB |
+----------------------------------------+------------+
| encoder_jit_trace-pnnx.ncnn.bin (fp16) | 142 MB |
+----------------------------------------+------------+
| decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB |
+----------------------------------------+------------+
| joiner_jit_trace-pnnx.ncnn.bin (fp16) | 1.5 MB |
+----------------------------------------+------------+
| encoder_jit_trace-pnnx.ncnn.bin (fp32) | 283 MB |
+----------------------------------------+------------+
| joiner_jit_trace-pnnx.ncnn.bin (fp32) | 3.0 MB |
+----------------------------------------+------------+
| encoder_jit_trace-pnnx.ncnn.int8.bin | 99 MB |
+----------------------------------------+------------+
| joiner_jit_trace-pnnx.ncnn.int8.bin | 774 KB |
+----------------------------------------+------------+
You can see that the file sizes of the model after ``int8`` quantization
are much smaller.
.. hint::
Currently, only linear layers and convolutional layers are quantized
with ``int8``, so you don't see an exact ``4x`` reduction in file sizes.
.. note::
You need to test the recognition accuracy after ``int8`` quantization.
You can find the speed comparison at `<https://github.com/k2-fsa/sherpa-ncnn/issues/44>`_.
That's it! Have fun with `sherpa-ncnn`_!
View File
@@ -1,7 +1,7 @@
.. _export-model-with-torch-jit-script:
Export model with torch.jit.script()
===================================
====================================
In this section, we describe how to export a model via
``torch.jit.script()``.
View File
@@ -703,7 +703,7 @@ It will show you the following message:
HLG decoding
^^^^^^^^^^^^
~~~~~~~~~~~~
.. code-block:: bash
View File
@@ -0,0 +1,10 @@
Non Streaming ASR
=================
.. toctree::
:maxdepth: 2
aishell/index
librispeech/index
timit/index
yesno/index
View File
@@ -888,7 +888,7 @@ It will show you the following message:
CTC decoding
^^^^^^^^^^^^
~~~~~~~~~~~~
.. code-block:: bash
@@ -926,7 +926,7 @@ Its output is:
YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION
HLG decoding
^^^^^^^^^^^^
~~~~~~~~~~~~
.. code-block:: bash
@@ -966,7 +966,7 @@ The output is:
HLG decoding + n-gram LM rescoring
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
@@ -1012,7 +1012,7 @@ The output is:
HLG decoding + n-gram LM rescoring + attention decoder rescoring
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
View File
@@ -0,0 +1,223 @@
Distillation with HuBERT
========================
This tutorial shows you how to perform knowledge distillation in `icefall`_
with the `LibriSpeech`_ dataset. The distillation method
used here is called "Multi Vector Quantization Knowledge Distillation" (MVQ-KD).
Please have a look at our paper `Predicting Multi-Codebook Vector Quantization Indexes for Knowledge Distillation <https://arxiv.org/abs/2211.00508>`_
for more details about MVQ-KD.
.. note::
This tutorial is based on recipe
`pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`_.
Currently, we only implement MVQ-KD in this recipe. However, MVQ-KD is theoretically applicable to all recipes
with only minor changes needed. Feel free to try out MVQ-KD in different recipes. If you
encounter any problems, please open an issue here `icefall <https://github.com/k2-fsa/icefall/issues>`_.
.. note::
We assume you have read the page :ref:`install icefall` and have setup
the environment for `icefall`_.
.. HINT::
We recommend that you use a GPU or several GPUs to run this recipe.
Data preparation
----------------
We first prepare necessary training data for `LibriSpeech`_.
This is the same as in :ref:`non_streaming_librispeech_pruned_transducer_stateless`.
.. hint::
The data preparation is the same as for other LibriSpeech recipes; if you have
finished this step, you can skip to :ref:`codebook_index_preparation` directly.
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./prepare.sh
The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
All you need to do is to run it.
The data preparation contains several stages. You can use the following two
options:
- ``--stage``
- ``--stop-stage``
to control which stage(s) should be run. By default, all stages are executed.
For example,
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./prepare.sh --stage 0 --stop-stage 0 # run only stage 0
$ ./prepare.sh --stage 2 --stop-stage 5 # run from stage 2 to stage 5
.. HINT::
If you have pre-downloaded the `LibriSpeech`_
dataset and the `musan`_ dataset, say,
they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
``./prepare.sh`` won't re-download them.
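For example, the edit can be done with a one-liner. This is a sketch that assumes ``dl_dir`` is assigned on a single line near the top of ``./prepare.sh``, so please double-check the result:
.. code-block:: bash
sed -i 's|^dl_dir=.*|dl_dir=/tmp|' ./prepare.sh
grep '^dl_dir=' ./prepare.sh  # confirm the change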
.. NOTE::
All files generated by ``./prepare.sh``, e.g., features, lexicon, etc.,
are saved in the ``./data`` directory.
We provide the following YouTube video showing how to run ``./prepare.sh``.
.. note::
To get the latest news about `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
`<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
.. youtube:: ofEIoJL-mGM
.. _codebook_index_preparation:
Codebook index preparation
--------------------------
Here, we prepare necessary data for MVQ-KD. This requires the generation
of codebook indexes (please read our `paper <https://arxiv.org/abs/2211.00508>`_
if you are interested in the details). In this tutorial, we use the pre-computed
codebook indexes for convenience. The only thing you need to do is to
run `./distillation_with_hubert.sh <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/distillation_with_hubert.sh>`_.
.. note::
There are 5 stages in total; the first and second stages will be skipped automatically
if you choose to download the codebook indexes prepared by `icefall`_.
Of course, you can extract and compute the codebook indexes by yourself. This
requires you to download a HuBERT-XL model, and extracting the codebook indexes
can take a while.
As usual, you can control the stages you want to run by specifying the following
two options:
- ``--stage``
- ``--stop-stage``
For example,
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./distillation_with_hubert.sh --stage 0 --stop-stage 0 # run only stage 0
$ ./distillation_with_hubert.sh --stage 2 --stop-stage 4 # run from stage 2 to stage 4
Here are a few options in `./distillation_with_hubert.sh <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/distillation_with_hubert.sh>`_
you need to know before you proceed.
- ``--full_libri`` If True, use the full 960 h of training data. Otherwise, only ``train-clean-100`` will be used.
- ``--use_extracted_codebook`` If True, the first two stages will be skipped and the codebook
indexes uploaded by us will be downloaded.
Since we are using the pre-computed codebook indexes, we set
``use_extracted_codebook=True``. If you want to do full `LibriSpeech`_
experiments, please set ``full_libri=True``.
The following command downloads the pre-computed codebook indexes
and prepares MVQ-augmented training manifests.
.. code-block:: bash
$ ./distillation_with_hubert.sh --stage 2 --stop-stage 2 # run only stage 2
Please see the
following screenshot for the output of an example execution.
.. figure:: ./images/distillation_codebook.png
:width: 800
:alt: Downloading codebook indexes and preparing training manifest.
:align: center
Downloading codebook indexes and preparing training manifest.
.. hint::
The codebook indexes we prepared for you in this tutorial
are extracted from the 36th layer of a fine-tuned HuBERT-XL model
with 8 codebooks. If you want to try other configurations, please
set ``use_extracted_codebook=False`` and set ``embedding_layer`` and
``num_codebooks`` by yourself.
Now, you should see the following files under the directory ``./data/vq_fbank_layer36_cb8``.
.. figure:: ./images/distillation_directory.png
:width: 800
:alt: MVQ-augmented training manifests
:align: center
MVQ-augmented training manifests.
Voila! You are ready to perform knowledge distillation training now!
Training
--------
To perform training, please run stage 3 of ``./distillation_with_hubert.sh`` by executing the following command.
.. code-block:: bash
$ ./distillation_with_hubert.sh --stage 3 --stop-stage 3 # run MVQ training
Here is the code snippet for training:
.. code-block:: bash
WORLD_SIZE=$(echo ${CUDA_VISIBLE_DEVICES} | awk '{n=split($1, _, ","); print n}')
./pruned_transducer_stateless6/train.py \
--manifest-dir ./data/vq_fbank_layer36_cb8 \
--master-port 12359 \
--full-libri $full_libri \
--spec-aug-time-warp-factor -1 \
--max-duration 300 \
--world-size ${WORLD_SIZE} \
--num-epochs 30 \
--exp-dir $exp_dir \
--enable-distillation True \
--codebook-loss-scale 0.01
There are a few arguments in the above training command
that you should pay attention to.
- ``--enable-distillation`` If True, knowledge distillation training is enabled.
- ``--codebook-loss-scale`` The scale of the knowledge distillation loss.
- ``--manifest-dir`` The path to the MVQ-augmented manifest.
Decoding
--------
After training is finished, you can test the performance using
the following command.
.. code-block:: bash
export CUDA_VISIBLE_DEVICES=0
./pruned_transducer_stateless6/decode.py \
--decoding-method "modified_beam_search" \
--epoch 30 \
--avg 10 \
--max-duration 200 \
--exp-dir $exp_dir \
--enable-distillation True
You should get similar results as `here <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS-100hours.md#distillation-with-hubert>`_.
That's all! Feel free to experiment with your own setups and report your results.
If you encounter any problems during training, please open up an issue `here <https://github.com/k2-fsa/icefall/issues>`_.
View File
@@ -0,0 +1,12 @@
LibriSpeech
===========
.. toctree::
:maxdepth: 1
tdnn_lstm_ctc
conformer_ctc
pruned_transducer_stateless
zipformer_mmi
zipformer_ctc_blankskip
distillation
View File
@@ -0,0 +1,548 @@
.. _non_streaming_librispeech_pruned_transducer_stateless:
Pruned transducer statelessX
============================
This tutorial shows you how to run a conformer transducer model
with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.
.. Note::
The tutorial is suitable for `pruned_transducer_stateless <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless>`_,
`pruned_transducer_stateless2 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless2>`_,
`pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`_,
`pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless5>`_,
We will take pruned_transducer_stateless4 as an example in this tutorial.
.. HINT::
We assume you have read the page :ref:`install icefall` and have set up
the environment for ``icefall``.
.. HINT::
We recommend that you use a GPU or several GPUs to run this recipe.
.. hint::
Please scroll down to the bottom of this page to find download links
for pretrained models if you don't want to train a model from scratch.
We use pruned RNN-T to compute the loss.
.. note::
You can find the paper about pruned RNN-T at the following address:
`<https://arxiv.org/abs/2206.13236>`_
The transducer model consists of 3 parts:
- Encoder, a.k.a, the transcription network. We use a Conformer model (the reworked version by Daniel Povey)
- Decoder, a.k.a, the prediction network. We use a stateless model consisting of
``nn.Embedding`` and ``nn.Conv1d``
- Joiner, a.k.a, the joint network.
.. caution::
Contrary to the conventional RNN-T models, we use a stateless decoder.
That is, it has no recurrent connections.
Data preparation
----------------
.. hint::
The data preparation is the same as for other LibriSpeech recipes; if you have
finished this step, you can skip to ``Training`` directly.
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./prepare.sh
The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
All you need to do is to run it.
The data preparation contains several stages. You can use the following two
options:
- ``--stage``
- ``--stop-stage``
to control which stage(s) should be run. By default, all stages are executed.
For example,
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./prepare.sh --stage 0 --stop-stage 0
means to run only stage 0.
To run stage 2 to stage 5, use:
.. code-block:: bash
$ ./prepare.sh --stage 2 --stop-stage 5
.. HINT::
If you have pre-downloaded the `LibriSpeech <https://www.openslr.org/12>`_
dataset and the `musan <http://www.openslr.org/17/>`_ dataset, say,
they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
``./prepare.sh`` won't re-download them.
.. NOTE::
All files generated by ``./prepare.sh``, e.g., features, lexicon, etc.,
are saved in the ``./data`` directory.
We provide the following YouTube video showing how to run ``./prepare.sh``.
.. note::
To get the latest news about `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
`<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
.. youtube:: ofEIoJL-mGM
Training
--------
Configurable options
~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./pruned_transducer_stateless4/train.py --help
shows you the training options that can be passed from the commandline.
The following options are used quite often:
- ``--exp-dir``
The directory to save checkpoints, training logs and tensorboard.
- ``--full-libri``
If it's True, the training part uses all the training data, i.e.,
960 hours. Otherwise, the training part uses only the subset
``train-clean-100``, which has 100 hours of training data.
.. CAUTION::
The training set is perturbed by speed with two factors: 0.9 and 1.1.
If ``--full-libri`` is True, each epoch actually processes
``3x960 == 2880`` hours of data.
- ``--num-epochs``
It is the number of epochs to train. For instance,
``./pruned_transducer_stateless4/train.py --num-epochs 30`` trains for 30 epochs
and generates ``epoch-1.pt``, ``epoch-2.pt``, ..., ``epoch-30.pt``
in the folder ``./pruned_transducer_stateless4/exp``.
- ``--start-epoch``
It's used to resume training.
``./pruned_transducer_stateless4/train.py --start-epoch 10`` loads the
checkpoint ``./pruned_transducer_stateless4/exp/epoch-9.pt`` and starts
training from epoch 10, based on the state from epoch 9.
- ``--world-size``
It is used for multi-GPU single-machine DDP training.
- (a) If it is 1, then no DDP training is used.
- (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
The following shows some use cases with it.
**Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
GPU 2 for training. You can do the following:
.. code-block:: bash
$ cd egs/librispeech/ASR
$ export CUDA_VISIBLE_DEVICES="0,2"
$ ./pruned_transducer_stateless4/train.py --world-size 2
**Use case 2**: You have 4 GPUs and you want to use all of them
for training. You can do the following:
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./pruned_transducer_stateless4/train.py --world-size 4
**Use case 3**: You have 4 GPUs but you only want to use GPU 3
for training. You can do the following:
.. code-block:: bash
$ cd egs/librispeech/ASR
$ export CUDA_VISIBLE_DEVICES="3"
$ ./pruned_transducer_stateless4/train.py --world-size 1
.. caution::
Only multi-GPU single-machine DDP training is implemented at present.
Multi-GPU multi-machine DDP training will be added later.
- ``--max-duration``
It specifies the number of seconds over all utterances in a
batch, before **padding**.
If you encounter CUDA OOM, please reduce it.
.. HINT::
Due to padding, the number of seconds of all utterances in a
batch will usually be larger than ``--max-duration``.
A larger value for ``--max-duration`` may cause OOM during training,
while a smaller value may increase the training time. You have to
tune it.
- ``--use-fp16``
If it is True, the model is trained with half precision. From our experiments,
half precision lets you use a roughly two times larger ``--max-duration``,
which gives an almost 2x speed-up.
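For example, if ``--max-duration 300`` works for you in full precision, the following is a reasonable starting point with half precision (a sketch; tune ``--max-duration`` for your own GPUs):
.. code-block:: bash
./pruned_transducer_stateless4/train.py \
--world-size 4 \
--use-fp16 1 \
--max-duration 600 \
--exp-dir pruned_transducer_stateless4/exp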
Pre-configured options
~~~~~~~~~~~~~~~~~~~~~~
There are some training options, e.g., number of encoder layers,
encoder dimension, decoder dimension, number of warmup steps etc,
that are not passed from the commandline.
They are pre-configured by the function ``get_params()`` in
`pruned_transducer_stateless4/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless4/train.py>`_
You don't need to change these pre-configured parameters. If you really need to change
them, please modify ``./pruned_transducer_stateless4/train.py`` directly.
.. NOTE::
The options for `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless5/train.py>`_ are a little different from
other recipes. It allows you to configure ``--num-encoder-layers``, ``--dim-feedforward``, ``--nhead``, ``--encoder-dim``, ``--decoder-dim``, ``--joiner-dim`` from the commandline, so that you can train models of different sizes with pruned_transducer_stateless5.
Training logs
~~~~~~~~~~~~~
Training logs and checkpoints are saved in ``--exp-dir`` (e.g., ``pruned_transducer_stateless4/exp``).
You will find the following files in that directory:
- ``epoch-1.pt``, ``epoch-2.pt``, ...
These are checkpoint files saved at the end of each epoch, containing model
``state_dict`` and optimizer ``state_dict``.
To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
.. code-block:: bash
$ ./pruned_transducer_stateless4/train.py --start-epoch 11
- ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ...
These are checkpoint files saved every ``--save-every-n`` batches,
containing model ``state_dict`` and optimizer ``state_dict``.
To resume training from some checkpoint, say ``checkpoint-436000``, you can use:
.. code-block:: bash
$ ./pruned_transducer_stateless4/train.py --start-batch 436000
- ``tensorboard/``
This folder contains tensorBoard logs. Training loss, validation loss, learning
rate, etc, are recorded in these logs. You can visualize them by:
.. code-block:: bash
$ cd pruned_transducer_stateless4/exp/tensorboard
$ tensorboard dev upload --logdir . --description "pruned transducer training for LibriSpeech with icefall"
It will print something like below:
.. code-block::
TensorFlow installation not found - running with reduced feature set.
Upload started and will continue reading any new data as it's added to the logdir.
To stop uploading, press Ctrl-C.
New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/QOGSPBgsR8KzcRMmie9JGw/
[2022-11-20T15:50:50] Started scanning logdir.
Uploading 4468 scalars...
[2022-11-20T15:53:02] Total uploaded: 210171 scalars, 0 tensors, 0 binary objects
Listening for new data in logdir...
Note there is a URL in the above output. Click it and you will see
the following screenshot:
.. figure:: images/librispeech-pruned-transducer-tensorboard-log.jpg
:width: 600
:alt: TensorBoard screenshot
:align: center
:target: https://tensorboard.dev/experiment/QOGSPBgsR8KzcRMmie9JGw/
TensorBoard screenshot.
.. hint::
If you don't have access to google, you can use the following command
to view the tensorboard log locally:
.. code-block:: bash
cd pruned_transducer_stateless4/exp/tensorboard
tensorboard --logdir . --port 6008
It will print the following message:
.. code-block::
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.8.0 at http://localhost:6008/ (Press CTRL+C to quit)
Now start your browser and go to `<http://localhost:6008>`_ to view the tensorboard
logs.
- ``log/log-train-xxxx``
It is the detailed training log in text format, same as the one
you saw printed to the console during training.
Usage example
~~~~~~~~~~~~~
You can use the following command to start the training using 6 GPUs:
.. code-block:: bash
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5"
./pruned_transducer_stateless4/train.py \
--world-size 6 \
--num-epochs 30 \
--start-epoch 1 \
--exp-dir pruned_transducer_stateless4/exp \
--full-libri 1 \
--max-duration 300
Decoding
--------
The decoding part uses checkpoints saved by the training part, so you have
to run the training part first.
.. hint::
There are two kinds of checkpoints:
- (1) ``epoch-1.pt``, ``epoch-2.pt``, ..., which are saved at the end
of each epoch. You can pass ``--epoch`` to
``pruned_transducer_stateless4/decode.py`` to use them.
- (2) ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ..., which are saved
every ``--save-every-n`` batches. You can pass ``--iter`` to
``pruned_transducer_stateless4/decode.py`` to use them.
We suggest that you try both types of checkpoints and choose the one
that produces the lowest WERs.
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./pruned_transducer_stateless4/decode.py --help
shows the options for decoding.
The following shows two examples (for two types of checkpoints):
.. code-block:: bash
for m in greedy_search fast_beam_search modified_beam_search; do
for epoch in 25 20; do
for avg in 7 5 3 1; do
./pruned_transducer_stateless4/decode.py \
--epoch $epoch \
--avg $avg \
--exp-dir pruned_transducer_stateless4/exp \
--max-duration 600 \
--decoding-method $m
done
done
done
.. code-block:: bash
for m in greedy_search fast_beam_search modified_beam_search; do
for iter in 474000; do
for avg in 8 10 12 14 16 18; do
./pruned_transducer_stateless4/decode.py \
--iter $iter \
--avg $avg \
--exp-dir pruned_transducer_stateless4/exp \
--max-duration 600 \
--decoding-method $m
done
done
done
.. Note::
The supported decoding methods are as follows:
- ``greedy_search`` : It takes the symbol with largest posterior probability
of each frame as the decoding result.
- ``beam_search`` : It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf and
`espnet/nets/beam_search_transducer.py <https://github.com/espnet/espnet/blob/master/espnet/nets/beam_search_transducer.py#L247>`_
is used as a reference. Basically, it keeps the top-k states for each frame and expands the kept states with their own contexts to the
next frame.
- ``modified_beam_search`` : It implements the same algorithm as ``beam_search`` above, but it
runs in batch mode with ``--max-sym-per-frame=1`` being hardcoded.
- ``fast_beam_search`` : It implements graph composition between the output ``log_probs`` and
given ``FSAs``. It is hard to describe the details in a few lines of text; you can read
our paper in https://arxiv.org/pdf/2211.00484.pdf or our `rnnt decode code in k2 <https://github.com/k2-fsa/k2/blob/master/k2/csrc/rnnt_decode.h>`_. ``fast_beam_search`` can decode with ``FSAs`` on GPU efficiently.
- ``fast_beam_search_LG`` : The same as ``fast_beam_search`` above, except that ``fast_beam_search`` uses
a trivial graph that has only one state, while ``fast_beam_search_LG`` uses an LG graph
(with an n-gram LM).
- ``fast_beam_search_nbest`` : It produces the decoding results as follows:
- (1) Use ``fast_beam_search`` to get a lattice
- (2) Select ``num_paths`` paths from the lattice using ``k2.random_paths()``
- (3) Unique the selected paths
- (4) Intersect the selected paths with the lattice and compute the
shortest path from the intersection result
- (5) The path with the largest score is used as the decoding output.
- ``fast_beam_search_nbest_LG`` : It implements the same logic as ``fast_beam_search_nbest``; the
only difference is that it uses ``fast_beam_search_LG`` to generate the lattice.
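For instance, the following sketch tries ``fast_beam_search_nbest_LG`` with the checkpoints from epoch 25, reusing the flags shown above. Depending on the method, additional options (e.g., the directory containing the LG graph) may be required; please check ``./pruned_transducer_stateless4/decode.py --help``:
.. code-block:: bash
./pruned_transducer_stateless4/decode.py \
--epoch 25 \
--avg 3 \
--exp-dir pruned_transducer_stateless4/exp \
--max-duration 600 \
--decoding-method fast_beam_search_nbest_LG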
Export Model
------------
`pruned_transducer_stateless4/export.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless4/export.py>`_ supports exporting checkpoints from ``pruned_transducer_stateless4/exp`` in the following ways.
Export ``model.state_dict()``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Checkpoints saved by ``pruned_transducer_stateless4/train.py`` also include
``optimizer.state_dict()``. It is useful for resuming training. But after training,
we are interested only in ``model.state_dict()``. You can use the following
command to extract ``model.state_dict()``.
.. code-block:: bash
# Assume that --epoch 25 --avg 3 produces the smallest WER
# (You can get such information after running ./pruned_transducer_stateless4/decode.py)
epoch=25
avg=3
./pruned_transducer_stateless4/export.py \
--exp-dir ./pruned_transducer_stateless4/exp \
--bpe-model data/lang_bpe_500/bpe.model \
--epoch $epoch \
--avg $avg
It will generate a file ``./pruned_transducer_stateless4/exp/pretrained.pt``.
.. hint::
To use the generated ``pretrained.pt`` for ``pruned_transducer_stateless4/decode.py``,
you can run:
.. code-block:: bash
cd pruned_transducer_stateless4/exp
ln -s pretrained.pt epoch-999.pt
And then pass ``--epoch 999 --avg 1 --use-averaged-model 0`` to
``./pruned_transducer_stateless4/decode.py``.
To use the exported model with ``./pruned_transducer_stateless4/pretrained.py``, you
can run:
.. code-block:: bash
./pruned_transducer_stateless4/pretrained.py \
--checkpoint ./pruned_transducer_stateless4/exp/pretrained.pt \
--bpe-model ./data/lang_bpe_500/bpe.model \
--method greedy_search \
/path/to/foo.wav \
/path/to/bar.wav
Export model using ``torch.jit.script()``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
./pruned_transducer_stateless4/export.py \
--exp-dir ./pruned_transducer_stateless4/exp \
--bpe-model data/lang_bpe_500/bpe.model \
--epoch 25 \
--avg 3 \
--jit 1
It will generate a file ``cpu_jit.pt`` in the given ``exp_dir``. You can later
load it by ``torch.jit.load("cpu_jit.pt")``.
Note ``cpu`` in the name ``cpu_jit.pt`` means the parameters when loaded into Python
are on CPU. You can use ``to("cuda")`` to move them to a CUDA device.
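A quick way to confirm that the exported file can be loaded back (a sketch):
.. code-block:: bash
python3 -c "import torch; m = torch.jit.load('pruned_transducer_stateless4/exp/cpu_jit.pt'); print(type(m))"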
.. NOTE::
You will need this ``cpu_jit.pt`` when deploying with Sherpa framework.
Download pretrained models
--------------------------
If you don't want to train from scratch, you can download the pretrained models
by visiting the following links:
- `pruned_transducer_stateless <https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless-2022-03-12>`_
- `pruned_transducer_stateless2 <https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless2-2022-04-29>`_
- `pruned_transducer_stateless4 <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless4-2022-06-03>`_
- `pruned_transducer_stateless5 <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless5-2022-07-07>`_
See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
for the details of the above pretrained models.
Deploy with Sherpa
------------------
Please see `<https://k2-fsa.github.io/sherpa/python/offline_asr/conformer/librispeech.html#>`_
for how to deploy the models in ``sherpa``.
View File
@@ -0,0 +1,453 @@
Zipformer CTC Blank Skip
========================
.. hint::
Please scroll down to the bottom of this page to find download links
for pretrained models if you don't want to train a model from scratch.
This tutorial shows you how to train a Zipformer model based on the guidance from
a co-trained CTC model using `blank skip method <https://arxiv.org/pdf/2210.16481.pdf>`_
with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.
.. note::
We use both the CTC and RNN-T losses for training. During the forward pass, the encoder output
is first used to calculate the CTC posterior probability; then, for each output frame,
if its blank posterior is bigger than some threshold, the frame is simply discarded
from the encoder output. To prevent information loss, we also put a convolution module
similar to the one used in conformer (referred to as “LConv”) before the frame reduction.
Data preparation
----------------
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./prepare.sh
The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
All you need to do is to run it.
.. note::
We encourage you to read ``./prepare.sh``.
The data preparation contains several stages. You can use the following two
options:
- ``--stage``
- ``--stop-stage``
to control which stage(s) should be run. By default, all stages are executed.
For example,
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./prepare.sh --stage 0 --stop-stage 0
means to run only stage 0.
To run stage 2 to stage 5, use:
.. code-block:: bash
$ ./prepare.sh --stage 2 --stop-stage 5
.. hint::
If you have pre-downloaded the `LibriSpeech <https://www.openslr.org/12>`_
dataset and the `musan <http://www.openslr.org/17/>`_ dataset, say,
they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
``./prepare.sh`` won't re-download them.
.. note::
All files generated by ``./prepare.sh``, e.g., features, lexicon, etc.,
are saved in the ``./data`` directory.
We provide the following YouTube video showing how to run ``./prepare.sh``.
.. note::
To get the latest news about `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
`<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
.. youtube:: ofEIoJL-mGM
Training
--------
For stability, the blank skip method is not used until the model has warmed up.
Configurable options
~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./pruned_transducer_stateless7_ctc_bs/train.py --help
shows you the training options that can be passed from the commandline.
The following options are used quite often:
- ``--full-libri``
If it's True, the training part uses all the training data, i.e.,
960 hours. Otherwise, the training part uses only the subset
``train-clean-100``, which has 100 hours of training data.
.. CAUTION::
The training set is perturbed by speed with two factors: 0.9 and 1.1.
If ``--full-libri`` is True, each epoch actually processes
``3x960 == 2880`` hours of data.
- ``--num-epochs``
It is the number of epochs to train. For instance,
``./pruned_transducer_stateless7_ctc_bs/train.py --num-epochs 30`` trains for 30 epochs
and generates ``epoch-1.pt``, ``epoch-2.pt``, ..., ``epoch-30.pt``
in the folder ``./pruned_transducer_stateless7_ctc_bs/exp``.
- ``--start-epoch``
It's used to resume training.
``./pruned_transducer_stateless7_ctc_bs/train.py --start-epoch 10`` loads the
checkpoint ``./pruned_transducer_stateless7_ctc_bs/exp/epoch-9.pt`` and starts
training from epoch 10, based on the state from epoch 9.
- ``--world-size``
It is used for multi-GPU single-machine DDP training.
- (a) If it is 1, then no DDP training is used.
- (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
The following shows some use cases with it.
**Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
GPU 2 for training. You can do the following:
.. code-block:: bash
$ cd egs/librispeech/ASR
$ export CUDA_VISIBLE_DEVICES="0,2"
$ ./pruned_transducer_stateless7_ctc_bs/train.py --world-size 2
**Use case 2**: You have 4 GPUs and you want to use all of them
for training. You can do the following:
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./pruned_transducer_stateless7_ctc_bs/train.py --world-size 4
**Use case 3**: You have 4 GPUs but you only want to use GPU 3
for training. You can do the following:
.. code-block:: bash
$ cd egs/librispeech/ASR
$ export CUDA_VISIBLE_DEVICES="3"
$ ./pruned_transducer_stateless7_ctc_bs/train.py --world-size 1
.. caution::
Only multi-GPU single-machine DDP training is implemented at present.
Multi-GPU multi-machine DDP training will be added later.
- ``--max-duration``
It specifies the number of seconds over all utterances in a
batch, before **padding**.
If you encounter CUDA OOM, please reduce it.
.. HINT::
Due to padding, the number of seconds of all utterances in a
batch will usually be larger than ``--max-duration``.
A larger value for ``--max-duration`` may cause OOM during training,
while a smaller value may increase the training time. You have to
tune it.
Pre-configured options
~~~~~~~~~~~~~~~~~~~~~~
There are some training options, e.g., weight decay,
number of warmup steps, results dir, etc,
that are not passed from the commandline.
They are pre-configured by the function ``get_params()`` in
`pruned_transducer_stateless7_ctc_bs/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/train.py>`_
You don't need to change these pre-configured parameters. If you really need to change
them, please modify ``./pruned_transducer_stateless7_ctc_bs/train.py`` directly.
Training logs
~~~~~~~~~~~~~
Training logs and checkpoints are saved in ``pruned_transducer_stateless7_ctc_bs/exp``.
You will find the following files in that directory:
- ``epoch-1.pt``, ``epoch-2.pt``, ...
These are checkpoint files saved at the end of each epoch, containing model
``state_dict`` and optimizer ``state_dict``.
To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
.. code-block:: bash
$ ./pruned_transducer_stateless7_ctc_bs/train.py --start-epoch 11
- ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ...
These are checkpoint files saved every ``--save-every-n`` batches,
containing model ``state_dict`` and optimizer ``state_dict``.
To resume training from some checkpoint, say ``checkpoint-436000``, you can use:
.. code-block:: bash
$ ./pruned_transducer_stateless7_ctc_bs/train.py --start-batch 436000
- ``tensorboard/``
This folder contains tensorBoard logs. Training loss, validation loss, learning
rate, etc, are recorded in these logs. You can visualize them by:
.. code-block:: bash
$ cd pruned_transducer_stateless7_ctc_bs/exp/tensorboard
$ tensorboard dev upload --logdir . --description "Zipformer-CTC co-training using blank skip for LibriSpeech with icefall"
It will print something like below:
.. code-block::
TensorFlow installation not found - running with reduced feature set.
Upload started and will continue reading any new data as it's added to the logdir.
To stop uploading, press Ctrl-C.
New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/xyOZUKpEQm62HBIlUD4uPA/
Note there is a URL in the above output. Click it and you will see
tensorboard.
.. hint::
If you don't have access to google, you can use the following command
to view the tensorboard log locally:
.. code-block:: bash
cd pruned_transducer_stateless7_ctc_bs/exp/tensorboard
tensorboard --logdir . --port 6008
It will print the following message:
.. code-block::
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.8.0 at http://localhost:6008/ (Press CTRL+C to quit)
Now start your browser and go to `<http://localhost:6008>`_ to view the tensorboard
logs.
- ``log/log-train-xxxx``
It is the detailed training log in text format, same as the one
you saw printed to the console during training.
Usage example
~~~~~~~~~~~~~
You can use the following command to start the training using 4 GPUs:
.. code-block:: bash
export CUDA_VISIBLE_DEVICES="0,1,2,3"
./pruned_transducer_stateless7_ctc_bs/train.py \
--world-size 4 \
--num-epochs 30 \
--start-epoch 1 \
--full-libri 1 \
--exp-dir pruned_transducer_stateless7_ctc_bs/exp \
--max-duration 600 \
--use-fp16 1
Decoding
--------
The decoding part uses checkpoints saved by the training part, so you have
to run the training part first.
.. hint::
There are two kinds of checkpoints:
- (1) ``epoch-1.pt``, ``epoch-2.pt``, ..., which are saved at the end
of each epoch. You can pass ``--epoch`` to
``pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py`` to use them.
- (2) ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ..., which are saved
every ``--save-every-n`` batches. You can pass ``--iter`` to
``pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py`` to use them.
We suggest that you try both types of checkpoints and choose the one
that produces the lowest WERs.
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py --help
shows the options for decoding.
The following shows the example using ``epoch-*.pt``:
.. code-block:: bash
for m in greedy_search fast_beam_search modified_beam_search; do
./pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py \
--epoch 30 \
--avg 13 \
--exp-dir pruned_transducer_stateless7_ctc_bs/exp \
--max-duration 600 \
--decoding-method $m
done
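The hint above also mentions checkpoints saved every ``--save-every-n`` batches. The following is a minimal sketch of decoding with ``--iter`` (the iteration number 436000 and the ``--avg`` value are placeholders; use values that actually exist in your ``exp`` directory):
.. code-block:: bash
for m in greedy_search fast_beam_search modified_beam_search; do
./pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py \
--iter 436000 \
--avg 8 \
--exp-dir pruned_transducer_stateless7_ctc_bs/exp \
--max-duration 600 \
--decoding-method $m
done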
To test CTC branch, you can use the following command:
.. code-block:: bash
for m in ctc-decoding 1best; do
./pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py \
--epoch 30 \
--avg 13 \
--exp-dir pruned_transducer_stateless7_ctc_bs/exp \
--max-duration 600 \
--decoding-method $m
done
Export models
-------------
`pruned_transducer_stateless7_ctc_bs/export.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/export.py>`_ supports exporting checkpoints from ``pruned_transducer_stateless7_ctc_bs/exp`` in the following ways.
Export ``model.state_dict()``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Checkpoints saved by ``pruned_transducer_stateless7_ctc_bs/train.py`` also include
``optimizer.state_dict()``. It is useful for resuming training. But after training,
we are interested only in ``model.state_dict()``. You can use the following
command to extract ``model.state_dict()``.
.. code-block:: bash
./pruned_transducer_stateless7_ctc_bs/export.py \
--exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
--bpe-model data/lang_bpe_500/bpe.model \
--epoch 30 \
--avg 13 \
--jit 0
It will generate a file ``./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt``.
.. hint::
To use the generated ``pretrained.pt`` for ``pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py``,
you can run:
.. code-block:: bash
cd pruned_transducer_stateless7_ctc_bs/exp
ln -s pretrained.pt epoch-9999.pt
And then pass ``--epoch 9999 --avg 1 --use-averaged-model 0`` to
``./pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py``.
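For instance, a minimal sketch of such a command (the decoding method and ``--max-duration`` value are just examples) is:
.. code-block:: bash
./pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py \
--epoch 9999 \
--avg 1 \
--use-averaged-model 0 \
--exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
--max-duration 600 \
--decoding-method greedy_search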
To use the exported model with ``./pruned_transducer_stateless7_ctc_bs/pretrained.py``, you
can run:
.. code-block:: bash
./pruned_transducer_stateless7_ctc_bs/pretrained.py \
--checkpoint ./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt \
--bpe-model ./data/lang_bpe_500/bpe.model \
--method greedy_search \
/path/to/foo.wav \
/path/to/bar.wav
To test CTC branch using the exported model with ``./pruned_transducer_stateless7_ctc_bs/pretrained_ctc.py``:
.. code-block:: bash
./pruned_transducer_stateless7_ctc_bs/pretrained_ctc.py \
--checkpoint ./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt \
--bpe-model data/lang_bpe_500/bpe.model \
--method ctc-decoding \
--sample-rate 16000 \
/path/to/foo.wav \
/path/to/bar.wav
Export model using ``torch.jit.script()``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
./pruned_transducer_stateless7_ctc_bs/export.py \
--exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
--bpe-model data/lang_bpe_500/bpe.model \
--epoch 30 \
--avg 13 \
--jit 1
It will generate a file ``cpu_jit.pt`` in the given ``exp_dir``. You can later
load it by ``torch.jit.load("cpu_jit.pt")``.
Note ``cpu`` in the name ``cpu_jit.pt`` means the parameters when loaded into Python
are on CPU. You can use ``to("cuda")`` to move them to a CUDA device.
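The following is a minimal sketch (run from ``egs/librispeech/ASR``, assuming the exported file exists and ``torch`` is installed) showing how to load the torchscript model and move it to a CUDA device:
.. code-block:: bash
python3 - <<'EOF'
import torch

# load the exported torchscript model; parameters are on CPU at this point
model = torch.jit.load("pruned_transducer_stateless7_ctc_bs/exp/cpu_jit.pt")
model.eval()

# move the parameters to a CUDA device if one is available
if torch.cuda.is_available():
    model.to("cuda")

print(next(model.parameters()).device)
EOF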
To use the generated files with ``./pruned_transducer_stateless7_ctc_bs/jit_pretrained.py``:
.. code-block:: bash
./pruned_transducer_stateless7_ctc_bs/jit_pretrained.py \
--nn-model-filename ./pruned_transducer_stateless7_ctc_bs/exp/cpu_jit.pt \
/path/to/foo.wav \
/path/to/bar.wav
To test CTC branch using the generated files with ``./pruned_transducer_stateless7_ctc_bs/jit_pretrained_ctc.py``:
.. code-block:: bash
./pruned_transducer_stateless7_ctc_bs/jit_pretrained_ctc.py \
--model-filename ./pruned_transducer_stateless7_ctc_bs/exp/cpu_jit.pt \
--bpe-model data/lang_bpe_500/bpe.model \
--method ctc-decoding \
--sample-rate 16000 \
/path/to/foo.wav \
/path/to/bar.wav
Download pretrained models
--------------------------
If you don't want to train from scratch, you can download the pretrained models
by visiting the following links:
- `<https://huggingface.co/yfyeung/icefall-asr-librispeech-pruned_transducer_stateless7_ctc_bs-2022-12-14>`_
See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
for the details of the above pretrained models.

View File

@@ -0,0 +1,422 @@
Zipformer MMI
===============
.. hint::
Please scroll down to the bottom of this page to find download links
for pretrained models if you don't want to train a model from scratch.
This tutorial shows you how to train a Zipformer MMI model
with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.
We use LF-MMI to compute the loss.
.. note::
You can find the document about LF-MMI training at the following address:
`<https://github.com/k2-fsa/next-gen-kaldi-wechat/blob/master/pdf/LF-MMI-training-and-decoding-in-k2-Part-I.pdf>`_
Data preparation
----------------
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./prepare.sh
The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
All you need to do is to run it.
.. note::
We encourage you to read ``./prepare.sh``.
The data preparation contains several stages. You can use the following two
options:
- ``--stage``
- ``--stop-stage``
to control which stage(s) should be run. By default, all stages are executed.
For example,
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./prepare.sh --stage 0 --stop-stage 0
means to run only stage 0.
To run stage 2 to stage 5, use:
.. code-block:: bash
$ ./prepare.sh --stage 2 --stop-stage 5
.. hint::
If you have pre-downloaded the `LibriSpeech <https://www.openslr.org/12>`_
dataset and the `musan <http://www.openslr.org/17/>`_ dataset, say,
they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
``./prepare.sh`` won't re-download them.
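For example, a minimal sketch of what this looks like (assuming the data really is under ``/tmp``):
.. code-block:: bash
# the pre-downloaded datasets
ls /tmp/LibriSpeech /tmp/musan
# edit ./prepare.sh and set:  dl_dir=/tmp
./prepare.sh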
.. note::
All generated files by ``./prepare.sh``, e.g., features, lexicon, etc,
are saved in ``./data`` directory.
We provide the following YouTube video showing how to run ``./prepare.sh``.
.. note::
To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
`<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
.. youtube:: ofEIoJL-mGM
Training
--------
For stability, it uses CTC loss for model warm-up and then switches to MMI loss.
Configurable options
~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./zipformer_mmi/train.py --help
shows you the training options that can be passed from the commandline.
The following options are used quite often:
- ``--full-libri``
If it's True, the training part uses all the training data, i.e.,
960 hours. Otherwise, the training part uses only the subset
``train-clean-100``, which has 100 hours of training data.
.. CAUTION::
The training set is perturbed by speed with two factors: 0.9 and 1.1.
If ``--full-libri`` is True, each epoch actually processes
``3x960 == 2880`` hours of data.
- ``--num-epochs``
It is the number of epochs to train. For instance,
``./zipformer_mmi/train.py --num-epochs 30`` trains for 30 epochs
and generates ``epoch-1.pt``, ``epoch-2.pt``, ..., ``epoch-30.pt``
in the folder ``./zipformer_mmi/exp``.
- ``--start-epoch``
It's used to resume training.
``./zipformer_mmi/train.py --start-epoch 10`` loads the
checkpoint ``./zipformer_mmi/exp/epoch-9.pt`` and starts
training from epoch 10, based on the state from epoch 9.
- ``--world-size``
It is used for multi-GPU single-machine DDP training.
- (a) If it is 1, then no DDP training is used.
- (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
The following shows some use cases with it.
**Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
GPU 2 for training. You can do the following:
.. code-block:: bash
$ cd egs/librispeech/ASR
$ export CUDA_VISIBLE_DEVICES="0,2"
$ ./zipformer_mmi/train.py --world-size 2
**Use case 2**: You have 4 GPUs and you want to use all of them
for training. You can do the following:
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./zipformer_mmi/train.py --world-size 4
**Use case 3**: You have 4 GPUs but you only want to use GPU 3
for training. You can do the following:
.. code-block:: bash
$ cd egs/librispeech/ASR
$ export CUDA_VISIBLE_DEVICES="3"
$ ./zipformer_mmi/train.py --world-size 1
.. caution::
Only multi-GPU single-machine DDP training is implemented at present.
Multi-GPU multi-machine DDP training will be added later.
- ``--max-duration``
It specifies the number of seconds over all utterances in a
batch, before **padding**.
If you encounter CUDA OOM, please reduce it.
.. HINT::
Due to padding, the number of seconds of all utterances in a
batch will usually be larger than ``--max-duration``.
A larger value for ``--max-duration`` may cause OOM during training,
while a smaller value may increase the training time. You have to
tune it.
Pre-configured options
~~~~~~~~~~~~~~~~~~~~~~
There are some training options, e.g., weight decay,
number of warmup steps, results dir, etc,
that are not passed from the commandline.
They are pre-configured by the function ``get_params()`` in
`zipformer_mmi/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/zipformer_mmi/train.py>`_
You don't need to change these pre-configured parameters. If you really need to change
them, please modify ``./zipformer_mmi/train.py`` directly.
Training logs
~~~~~~~~~~~~~
Training logs and checkpoints are saved in ``zipformer_mmi/exp``.
You will find the following files in that directory:
- ``epoch-1.pt``, ``epoch-2.pt``, ...
These are checkpoint files saved at the end of each epoch, containing model
``state_dict`` and optimizer ``state_dict``.
To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
.. code-block:: bash
$ ./zipformer_mmi/train.py --start-epoch 11
- ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ...
These are checkpoint files saved every ``--save-every-n`` batches,
containing model ``state_dict`` and optimizer ``state_dict``.
To resume training from some checkpoint, say ``checkpoint-436000``, you can use:
.. code-block:: bash
$ ./zipformer_mmi/train.py --start-batch 436000
- ``tensorboard/``
This folder contains TensorBoard logs. Training loss, validation loss, learning
rate, etc., are recorded in these logs. You can visualize them by:
.. code-block:: bash
$ cd zipformer_mmi/exp/tensorboard
$ tensorboard dev upload --logdir . --description "Zipformer MMI training for LibriSpeech with icefall"
It will print something like below:
.. code-block::
TensorFlow installation not found - running with reduced feature set.
Upload started and will continue reading any new data as it's added to the logdir.
To stop uploading, press Ctrl-C.
New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/xyOZUKpEQm62HBIlUD4uPA/
Note there is a URL in the above output. Click it and you will see
tensorboard.
.. hint::
If you don't have access to google, you can use the following command
to view the tensorboard log locally:
.. code-block:: bash
cd zipformer_mmi/exp/tensorboard
tensorboard --logdir . --port 6008
It will print the following message:
.. code-block::
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.8.0 at http://localhost:6008/ (Press CTRL+C to quit)
Now start your browser and go to `<http://localhost:6008>`_ to view the tensorboard
logs.
- ``log/log-train-xxxx``
It is the detailed training log in text format, same as the one
you saw printed to the console during training.
Usage example
~~~~~~~~~~~~~
You can use the following command to start the training using 4 GPUs:
.. code-block:: bash
export CUDA_VISIBLE_DEVICES="0,1,2,3"
./zipformer_mmi/train.py \
--world-size 4 \
--num-epochs 30 \
--start-epoch 1 \
--full-libri 1 \
--exp-dir zipformer_mmi/exp \
--max-duration 500 \
--use-fp16 1 \
--num-workers 2
Decoding
--------
The decoding part uses checkpoints saved by the training part, so you have
to run the training part first.
.. hint::
There are two kinds of checkpoints:
- (1) ``epoch-1.pt``, ``epoch-2.pt``, ..., which are saved at the end
of each epoch. You can pass ``--epoch`` to
``zipformer_mmi/decode.py`` to use them.
- (2) ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ..., which are saved
every ``--save-every-n`` batches. You can pass ``--iter`` to
``zipformer_mmi/decode.py`` to use them.
We suggest that you try both types of checkpoints and choose the one
that produces the lowest WERs.
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./zipformer_mmi/decode.py --help
shows the options for decoding.
The following shows the example using ``epoch-*.pt``:
.. code-block:: bash
for m in nbest nbest-rescoring-LG nbest-rescoring-3-gram nbest-rescoring-4-gram; do
./zipformer_mmi/decode.py \
--epoch 30 \
--avg 10 \
--exp-dir ./zipformer_mmi/exp/ \
--max-duration 100 \
--lang-dir data/lang_bpe_500 \
--nbest-scale 1.2 \
--hp-scale 1.0 \
--decoding-method $m
done
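As mentioned in the hint above, you can also decode from the batch-level checkpoints by passing ``--iter``. A minimal sketch (the iteration number and ``--avg`` value are placeholders; pick ones that exist in your ``exp`` directory):
.. code-block:: bash
for m in nbest nbest-rescoring-LG nbest-rescoring-3-gram nbest-rescoring-4-gram; do
./zipformer_mmi/decode.py \
--iter 436000 \
--avg 8 \
--exp-dir ./zipformer_mmi/exp/ \
--max-duration 100 \
--lang-dir data/lang_bpe_500 \
--nbest-scale 1.2 \
--hp-scale 1.0 \
--decoding-method $m
done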
Export models
-------------
`zipformer_mmi/export.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/zipformer_mmi/export.py>`_ supports exporting checkpoints from ``zipformer_mmi/exp`` in the following ways.
Export ``model.state_dict()``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Checkpoints saved by ``zipformer_mmi/train.py`` also include
``optimizer.state_dict()``. It is useful for resuming training. But after training,
we are interested only in ``model.state_dict()``. You can use the following
command to extract ``model.state_dict()``.
.. code-block:: bash
./zipformer_mmi/export.py \
--exp-dir ./zipformer_mmi/exp \
--bpe-model data/lang_bpe_500/bpe.model \
--epoch 30 \
--avg 9 \
--jit 0
It will generate a file ``./zipformer_mmi/exp/pretrained.pt``.
.. hint::
To use the generated ``pretrained.pt`` for ``zipformer_mmi/decode.py``,
you can run:
.. code-block:: bash
cd zipformer_mmi/exp
ln -s pretrained.pt epoch-9999.pt
And then pass ``--epoch 9999 --avg 1 --use-averaged-model 0`` to
``./zipformer_mmi/decode.py``.
To use the exported model with ``./zipformer_mmi/pretrained.py``, you
can run:
.. code-block:: bash
./zipformer_mmi/pretrained.py \
--checkpoint ./zipformer_mmi/exp/pretrained.pt \
--bpe-model ./data/lang_bpe_500/bpe.model \
--method 1best \
/path/to/foo.wav \
/path/to/bar.wav
Export model using ``torch.jit.script()``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
./zipformer_mmi/export.py \
--exp-dir ./zipformer_mmi/exp \
--bpe-model data/lang_bpe_500/bpe.model \
--epoch 30 \
--avg 9 \
--jit 1
It will generate a file ``cpu_jit.pt`` in the given ``exp_dir``. You can later
load it by ``torch.jit.load("cpu_jit.pt")``.
Note ``cpu`` in the name ``cpu_jit.pt`` means the parameters when loaded into Python
are on CPU. You can use ``to("cuda")`` to move them to a CUDA device.
To use the generated files with ``./zipformer_mmi/jit_pretrained.py``:
.. code-block:: bash
./zipformer_mmi/jit_pretrained.py \
--nn-model-filename ./zipformer_mmi/exp/cpu_jit.pt \
--bpe-model ./data/lang_bpe_500/bpe.model \
--method 1best \
/path/to/foo.wav \
/path/to/bar.wav
Download pretrained models
--------------------------
If you don't want to train from scratch, you can download the pretrained models
by visiting the following links:
- `<https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-mmi-2022-12-08>`_
See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
for the details of the above pretrained models.

View File

@@ -0,0 +1,12 @@
Streaming ASR
=============
.. toctree::
:maxdepth: 1
introduction
.. toctree::
:maxdepth: 2
librispeech/index

View File

@@ -0,0 +1,52 @@
Introduction
============
This page shows you how we implement streaming **X-former transducer** models for ASR.
.. HINT::
X-former transducer here means the encoder of the transducer model uses Multi-Head Attention,
like `Conformer <https://arxiv.org/pdf/2005.08100.pdf>`_, `EmFormer <https://arxiv.org/pdf/2010.10759.pdf>`_ etc.
Currently we have implemented two types of streaming models: one uses Conformer as the encoder, the other uses Emformer as the encoder.
Streaming Conformer
-------------------
The main idea of training a streaming model is to make the model see only limited context
at training time; we achieve this by applying a mask to the output of self-attention.
In icefall, we implement the streaming conformer in the same way as `WeNet <https://arxiv.org/pdf/2012.05481.pdf>`_ does.
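To make the masking idea concrete, the following is a minimal sketch (not icefall's actual implementation; the function name and the tensor layout are only for illustration) of how a chunk-based attention mask can be built:
.. code-block:: bash
python3 - <<'EOF'
import torch

def chunk_attention_mask(seq_len, chunk_size, num_left_chunks):
    # mask[i, j] is True if frame i may attend to frame j:
    # everything inside the current chunk plus num_left_chunks chunks to the left.
    idx = torch.arange(seq_len)
    chunk_of = idx // chunk_size
    right = (chunk_of + 1) * chunk_size                      # exclusive right boundary
    left = torch.clamp(chunk_of - num_left_chunks, min=0) * chunk_size
    j = idx.unsqueeze(0)                                     # shape (1, seq_len)
    return (j >= left.unsqueeze(1)) & (j < right.unsqueeze(1))

# 8 frames, chunks of 2 frames, 1 left chunk of context
print(chunk_attention_mask(8, chunk_size=2, num_left_chunks=1).int())
EOF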
.. NOTE::
The conformer-transducer recipes for the LibriSpeech dataset, e.g., `pruned_transducer_stateless <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless>`_,
`pruned_transducer_stateless2 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless2>`_,
`pruned_transducer_stateless3 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless3>`_,
`pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`_,
`pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless5>`_
all support streaming.
.. NOTE::
Training a streaming conformer model in ``icefall`` is almost the same as training a
non-streaming model; all you need to do is pass several extra arguments.
See :doc:`Pruned transducer statelessX <librispeech/pruned_transducer_stateless>` for more details.
.. HINT::
If you want to adapt a non-streaming conformer model to be streaming, please refer
to `this pull request <https://github.com/k2-fsa/icefall/pull/454>`_.
Streaming Emformer
------------------
The Emformer model proposed `here <https://arxiv.org/pdf/2010.10759.pdf>`_ uses more
complicated techniques. It has a memory bank component to memorize history information;
what's more, it also introduces right context at training time by hard-copying part of
the input features.
We have three variants of Emformer models in ``icefall``.
- ``pruned_stateless_emformer_rnnt2`` using Emformer from torchaudio, see `LibriSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2>`_.
- ``conv_emformer_transducer_stateless`` using ConvEmformer implemented by ourselves. Different from the Emformer in torchaudio,
ConvEmformer has a convolution in each layer and uses the mechanisms in our reworked conformer model.
See `LibriSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/conv_emformer_transducer_stateless>`_.
- ``conv_emformer_transducer_stateless2`` using ConvEmformer implemented by ourselves. The only difference from the above one is that
it uses a simplified memory bank. See `LibriSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/conv_emformer_transducer_stateless2>`_.

Binary file not shown.

View File

@@ -4,6 +4,8 @@ LibriSpeech
.. toctree::
:maxdepth: 1
tdnn_lstm_ctc
conformer_ctc
pruned_transducer_stateless
lstm_pruned_stateless_transducer
zipformer_transducer

View File

@@ -515,10 +515,10 @@ To use the generated files with ``./lstm_transducer_stateless2/jit_pretrained``:
Please see `<https://k2-fsa.github.io/sherpa/python/streaming_asr/lstm/english/server.html>`_
for how to use the exported models in ``sherpa``.
.. _export-model-for-ncnn:
.. _export-lstm-transducer-model-for-ncnn:
Export model for ncnn
~~~~~~~~~~~~~~~~~~~~~
Export LSTM transducer models for ncnn
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
We support exporting pretrained LSTM transducer models to
`ncnn <https://github.com/tencent/ncnn>`_ using
@@ -531,16 +531,36 @@ First, let us install a modified version of ``ncnn``:
git clone https://github.com/csukuangfj/ncnn
cd ncnn
git submodule update --recursive --init
python3 setup.py bdist_wheel
ls -lh dist/
pip install ./dist/*.whl
# Note: We don't use "python setup.py install" or "pip install ." here
mkdir -p build-wheel
cd build-wheel
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DNCNN_PYTHON=ON \
-DNCNN_BUILD_BENCHMARK=OFF \
-DNCNN_BUILD_EXAMPLES=OFF \
-DNCNN_BUILD_TOOLS=ON \
..
make -j4
cd ..
# Note: $PWD here is /path/to/ncnn
export PYTHONPATH=$PWD/python:$PYTHONPATH
export PATH=$PWD/tools/pnnx/build/src:$PATH
export PATH=$PWD/build-wheel/tools/quantize:$PATH
# now build pnnx
cd tools/pnnx
mkdir build
cd build
cmake ..
make -j4
export PATH=$PWD/src:$PATH
./src/pnnx
@@ -549,6 +569,9 @@ First, let us install a modified version of ``ncnn``:
We assume that you have added the path to the binary ``pnnx`` to the
environment variable ``PATH``.
We also assume that you have added ``build/tools/quantize`` to the environment
variable ``PATH`` so that you are able to use ``ncnn2int8`` later.
Second, let us export the model using ``torch.jit.trace()`` that is suitable
for ``pnnx``:
@@ -634,3 +657,6 @@ by visiting the following links:
You can find more usages of the pretrained models in
`<https://k2-fsa.github.io/sherpa/python/streaming_asr/lstm/index.html>`_
Export ConvEmformer transducer models for ncnn
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -0,0 +1,735 @@
Pruned transducer statelessX
============================
This tutorial shows you how to run a **streaming** conformer transducer model
with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.
.. Note::
The tutorial is suitable for `pruned_transducer_stateless <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless>`_,
`pruned_transducer_stateless2 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless2>`_,
`pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`_,
`pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless5>`_,
We will take pruned_transducer_stateless4 as an example in this tutorial.
.. HINT::
We assume you have read the page :ref:`install icefall` and have setup
the environment for ``icefall``.
.. HINT::
We recommend you to use a GPU or several GPUs to run this recipe.
.. hint::
Please scroll down to the bottom of this page to find download links
for pretrained models if you don't want to train a model from scratch.
We use pruned RNN-T to compute the loss.
.. note::
You can find the paper about pruned RNN-T at the following address:
`<https://arxiv.org/abs/2206.13236>`_
The transducer model consists of 3 parts:
- Encoder, a.k.a, the transcription network. We use a Conformer model (the reworked version by Daniel Povey)
- Decoder, a.k.a, the prediction network. We use a stateless model consisting of
``nn.Embedding`` and ``nn.Conv1d``
- Joiner, a.k.a, the joint network.
.. caution::
Contrary to the conventional RNN-T models, we use a stateless decoder.
That is, it has no recurrent connections.
Data preparation
----------------
.. hint::
The data preparation is the same as for other LibriSpeech recipes; if you have finished
this step, you can skip to ``Training`` directly.
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./prepare.sh
The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
All you need to do is to run it.
The data preparation contains several stages. You can use the following two
options:
- ``--stage``
- ``--stop-stage``
to control which stage(s) should be run. By default, all stages are executed.
For example,
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./prepare.sh --stage 0 --stop-stage 0
means to run only stage 0.
To run stage 2 to stage 5, use:
.. code-block:: bash
$ ./prepare.sh --stage 2 --stop-stage 5
.. HINT::
If you have pre-downloaded the `LibriSpeech <https://www.openslr.org/12>`_
dataset and the `musan <http://www.openslr.org/17/>`_ dataset, say,
they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
``./prepare.sh`` won't re-download them.
.. NOTE::
All generated files by ``./prepare.sh``, e.g., features, lexicon, etc,
are saved in ``./data`` directory.
We provide the following YouTube video showing how to run ``./prepare.sh``.
.. note::
To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
`<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
.. youtube:: ofEIoJL-mGM
Training
--------
.. NOTE::
We put the streaming and non-streaming model in one recipe; to train a streaming model you only
need to add **4** extra options compared with training a non-streaming model. These options are
``--dynamic-chunk-training``, ``--num-left-chunks``, ``--causal-convolution``, and ``--short-chunk-size``.
You can see the configurable options below for their meanings or read https://arxiv.org/pdf/2012.05481.pdf for more details.
Configurable options
~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./pruned_transducer_stateless4/train.py --help
shows you the training options that can be passed from the commandline.
The following options are used quite often:
- ``--exp-dir``
The directory to save checkpoints, training logs and tensorboard.
- ``--full-libri``
If it's True, the training part uses all the training data, i.e.,
960 hours. Otherwise, the training part uses only the subset
``train-clean-100``, which has 100 hours of training data.
.. CAUTION::
The training set is perturbed by speed with two factors: 0.9 and 1.1.
If ``--full-libri`` is True, each epoch actually processes
``3x960 == 2880`` hours of data.
- ``--num-epochs``
It is the number of epochs to train. For instance,
``./pruned_transducer_stateless4/train.py --num-epochs 30`` trains for 30 epochs
and generates ``epoch-1.pt``, ``epoch-2.pt``, ..., ``epoch-30.pt``
in the folder ``./pruned_transducer_stateless4/exp``.
- ``--start-epoch``
It's used to resume training.
``./pruned_transducer_stateless4/train.py --start-epoch 10`` loads the
checkpoint ``./pruned_transducer_stateless4/exp/epoch-9.pt`` and starts
training from epoch 10, based on the state from epoch 9.
- ``--world-size``
It is used for multi-GPU single-machine DDP training.
- (a) If it is 1, then no DDP training is used.
- (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
The following shows some use cases with it.
**Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
GPU 2 for training. You can do the following:
.. code-block:: bash
$ cd egs/librispeech/ASR
$ export CUDA_VISIBLE_DEVICES="0,2"
$ ./pruned_transducer_stateless4/train.py --world-size 2
**Use case 2**: You have 4 GPUs and you want to use all of them
for training. You can do the following:
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./pruned_transducer_stateless4/train.py --world-size 4
**Use case 3**: You have 4 GPUs but you only want to use GPU 3
for training. You can do the following:
.. code-block:: bash
$ cd egs/librispeech/ASR
$ export CUDA_VISIBLE_DEVICES="3"
$ ./pruned_transducer_stateless4/train.py --world-size 1
.. caution::
Only multi-GPU single-machine DDP training is implemented at present.
Multi-GPU multi-machine DDP training will be added later.
- ``--max-duration``
It specifies the number of seconds over all utterances in a
batch, before **padding**.
If you encounter CUDA OOM, please reduce it.
.. HINT::
Due to padding, the number of seconds of all utterances in a
batch will usually be larger than ``--max-duration``.
A larger value for ``--max-duration`` may cause OOM during training,
while a smaller value may increase the training time. You have to
tune it.
- ``--use-fp16``
If it is True, the model is trained with half precision. From our experiments, half
precision lets you use a roughly two times larger ``--max-duration``, giving an
almost 2x speed-up.
- ``--dynamic-chunk-training``
The flag that indicates whether to train a streaming model. It **MUST** be True
if you want to train a streaming model.
- ``--short-chunk-size``
When training a streaming attention model with chunk masking, the chunk size is either
the maximum sequence length of the current batch or uniformly sampled from
(1, short_chunk_size). The default value is 25; you don't have to change it most of the time.
- ``--num-left-chunks``
It indicates how many chunks of left context can be seen when calculating attention.
The default value is 4; you don't have to change it most of the time.
- ``--causal-convolution``
Whether to use causal convolution in the conformer encoder layers. This **MUST** be True
when training a streaming model. A combined sketch using all four streaming-related options is shown right after this list.
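The following is a minimal sketch that puts the four streaming-related options together (all flags appear above; the values other than the flags themselves are only for illustration, not a recommended configuration):
.. code-block:: bash
./pruned_transducer_stateless4/train.py \
--world-size 4 \
--num-epochs 30 \
--start-epoch 1 \
--exp-dir pruned_transducer_stateless4/exp \
--full-libri 1 \
--max-duration 300 \
--dynamic-chunk-training 1 \
--causal-convolution 1 \
--short-chunk-size 25 \
--num-left-chunks 4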
Pre-configured options
~~~~~~~~~~~~~~~~~~~~~~
There are some training options, e.g., number of encoder layers,
encoder dimension, decoder dimension, number of warmup steps etc,
that are not passed from the commandline.
They are pre-configured by the function ``get_params()`` in
`pruned_transducer_stateless4/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless4/train.py>`_
You don't need to change these pre-configured parameters. If you really need to change
them, please modify ``./pruned_transducer_stateless4/train.py`` directly.
.. NOTE::
The options for `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless5/train.py>`_ are a little different from
other recipes. It allows you to configure ``--num-encoder-layers``, ``--dim-feedforward``, ``--nhead``, ``--encoder-dim``, ``--decoder-dim``, ``--joiner-dim`` from the commandline, so that you can train models of different sizes with pruned_transducer_stateless5.
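As a minimal sketch (the values below are placeholders only, not a recommended configuration), training a custom-sized model with pruned_transducer_stateless5 could look like:
.. code-block:: bash
./pruned_transducer_stateless5/train.py \
--world-size 4 \
--num-epochs 30 \
--full-libri 1 \
--exp-dir pruned_transducer_stateless5/exp \
--max-duration 300 \
--num-encoder-layers 18 \
--dim-feedforward 1024 \
--nhead 4 \
--encoder-dim 256 \
--decoder-dim 512 \
--joiner-dim 512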
Training logs
~~~~~~~~~~~~~
Training logs and checkpoints are saved in ``--exp-dir`` (e.g., ``pruned_transducer_stateless4/exp``).
You will find the following files in that directory:
- ``epoch-1.pt``, ``epoch-2.pt``, ...
These are checkpoint files saved at the end of each epoch, containing model
``state_dict`` and optimizer ``state_dict``.
To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
.. code-block:: bash
$ ./pruned_transducer_stateless4/train.py --start-epoch 11
- ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ...
These are checkpoint files saved every ``--save-every-n`` batches,
containing model ``state_dict`` and optimizer ``state_dict``.
To resume training from some checkpoint, say ``checkpoint-436000``, you can use:
.. code-block:: bash
$ ./pruned_transducer_stateless4/train.py --start-batch 436000
- ``tensorboard/``
This folder contains TensorBoard logs. Training loss, validation loss, learning
rate, etc., are recorded in these logs. You can visualize them by:
.. code-block:: bash
$ cd pruned_transducer_stateless4/exp/tensorboard
$ tensorboard dev upload --logdir . --description "pruned transducer training for LibriSpeech with icefall"
It will print something like below:
.. code-block::
TensorFlow installation not found - running with reduced feature set.
Upload started and will continue reading any new data as it's added to the logdir.
To stop uploading, press Ctrl-C.
New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/97VKXf80Ru61CnP2ALWZZg/
[2022-11-20T15:50:50] Started scanning logdir.
Uploading 4468 scalars...
[2022-11-20T15:53:02] Total uploaded: 210171 scalars, 0 tensors, 0 binary objects
Listening for new data in logdir...
Note there is a URL in the above output. Click it and you will see
the following screenshot:
.. figure:: images/streaming-librispeech-pruned-transducer-tensorboard-log.jpg
:width: 600
:alt: TensorBoard screenshot
:align: center
:target: https://tensorboard.dev/experiment/97VKXf80Ru61CnP2ALWZZg/
TensorBoard screenshot.
.. hint::
If you don't have access to google, you can use the following command
to view the tensorboard log locally:
.. code-block:: bash
cd pruned_transducer_stateless4/exp/tensorboard
tensorboard --logdir . --port 6008
It will print the following message:
.. code-block::
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.8.0 at http://localhost:6008/ (Press CTRL+C to quit)
Now start your browser and go to `<http://localhost:6008>`_ to view the tensorboard
logs.
- ``log/log-train-xxxx``
It is the detailed training log in text format, same as the one
you saw printed to the console during training.
Usage example
~~~~~~~~~~~~~
You can use the following command to start the training using 4 GPUs:
.. code-block:: bash
export CUDA_VISIBLE_DEVICES="0,1,2,3"
./pruned_transducer_stateless4/train.py \
--world-size 4 \
--dynamic-chunk-training 1 \
--causal-convolution 1 \
--num-epochs 30 \
--start-epoch 1 \
--exp-dir pruned_transducer_stateless4/exp \
--full-libri 1 \
--max-duration 300
.. NOTE::
Comparing with training a non-streaming model, you only need to add two extra options,
``--dynamic-chunk-training 1`` and ``--causal-convolution 1`` .
Decoding
--------
The decoding part uses checkpoints saved by the training part, so you have
to run the training part first.
.. hint::
There are two kinds of checkpoints:
- (1) ``epoch-1.pt``, ``epoch-2.pt``, ..., which are saved at the end
of each epoch. You can pass ``--epoch`` to
``pruned_transducer_stateless4/decode.py`` to use them.
- (2) ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ..., which are saved
every ``--save-every-n`` batches. You can pass ``--iter`` to
``pruned_transducer_stateless4/decode.py`` to use them.
We suggest that you try both types of checkpoints and choose the one
that produces the lowest WERs.
.. tip::
To decode a streaming model, you can use either ``simulate streaming decoding`` in ``decode.py`` or
``real streaming decoding`` in ``streaming_decode.py``. The difference is that ``decode.py`` processes
all the acoustic frames at once with masking (i.e., the same as training), while ``streaming_decode.py``
processes the acoustic frames chunk by chunk (so it can only see limited context).
.. NOTE::
``simulate streaming decoding`` in ``decode.py`` and ``real streaming decoding`` in ``streaming_decode.py`` should
produce almost the same results given the same ``--decode-chunk-size`` and ``--left-context``.
Simulate streaming decoding
~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./pruned_transducer_stateless4/decode.py --help
shows the options for decoding.
The following options are important for streaming models:
``--simulate-streaming``
If you want to decode a streaming model with ``decode.py``, you **MUST** set
``--simulate-streaming`` to ``True``. ``simulate`` here means the acoustic frames
are not processed frame by frame (or chunk by chunk), instead, the whole sequence
is processed at one time with masking (the same as training).
``--causal-convolution``
If True, the convolution module in the encoder layers will be causal convolution.
This **MUST** be True when decoding with a streaming model.
``--decode-chunk-size``
For streaming models, we will calculate the chunk-wise attention, ``--decode-chunk-size``
indicates the chunk length (in frames after subsampling) for chunk-wise attention.
For ``simulate streaming decoding`` the ``decode-chunk-size`` is used to generate
the attention mask.
``--left-context``
``--left-context`` indicates how many left context frames (after subsampling) can be seen
for the current chunk when calculating chunk-wise attention. Normally, ``left-context`` should equal
``decode-chunk-size * num-left-chunks``, where ``num-left-chunks`` is the option used
to train this model. For ``simulate streaming decoding`` the ``left-context`` is used to generate
the attention mask.
The following shows two examples (for the two types of checkpoints):
.. code-block:: bash
for m in greedy_search fast_beam_search modified_beam_search; do
for epoch in 25 20; do
for avg in 7 5 3 1; do
./pruned_transducer_stateless4/decode.py \
--epoch $epoch \
--avg $avg \
--simulate-streaming 1 \
--causal-convolution 1 \
--decode-chunk-size 16 \
--left-context 64 \
--exp-dir pruned_transducer_stateless4/exp \
--max-duration 600 \
--decoding-method $m
done
done
done
.. code-block:: bash
for m in greedy_search fast_beam_search modified_beam_search; do
for iter in 474000; do
for avg in 8 10 12 14 16 18; do
./pruned_transducer_stateless4/decode.py \
--iter $iter \
--avg $avg \
--simulate-streaming 1 \
--causal-convolution 1 \
--decode-chunk-size 16 \
--left-context 64 \
--exp-dir pruned_transducer_stateless4/exp \
--max-duration 600 \
--decoding-method $m
done
done
done
Real streaming decoding
~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./pruned_transducer_stateless4/streaming_decode.py --help
shows the options for decoding.
The following options are important for streaming models:
``--decode-chunk-size``
For streaming models, we will calculate the chunk-wise attention, ``--decode-chunk-size``
indicates the chunk length (in frames after subsampling) for chunk-wise attention.
For ``real streaming decoding``, we will process ``decode-chunk-size`` acoustic frames at each time.
``--left-context``
``--left-context`` indicates how many left context frames (after subsampling) can be seen
for the current chunk when calculating chunk-wise attention. Normally, ``left-context`` should equal
``decode-chunk-size * num-left-chunks``, where ``num-left-chunks`` is the option used
to train this model.
``--num-decode-streams``
The number of decoding streams that can be run in parallel (very similar to the ``batch size``).
For ``real streaming decoding``, the batches are packed dynamically. For example, if
``num-decode-streams`` equals 10, sequences 1 to 10 are decoded first; once, say,
sequences 1 and 2 finish, sequences 3 to 12 are processed in parallel in a batch.
.. NOTE::
We also tried adding ``--right-context`` in real streaming decoding, but it does not seem to benefit
performance for all models; the reason might be a mismatch between training and decoding. You
can try decoding with ``--right-context`` to see if it helps. The default value is 0.
The following shows two examples (for the two types of checkpoints):
.. code-block:: bash
for m in greedy_search fast_beam_search modified_beam_search; do
for epoch in 25 20; do
for avg in 7 5 3 1; do
./pruned_transducer_stateless4/streaming_decode.py \
--epoch $epoch \
--avg $avg \
--decode-chunk-size 16 \
--left-context 64 \
--num-decode-streams 100 \
--exp-dir pruned_transducer_stateless4/exp \
--max-duration 600 \
--decoding-method $m
done
done
done
.. code-block:: bash
for m in greedy_search fast_beam_search modified_beam_search; do
for iter in 474000; do
for avg in 8 10 12 14 16 18; do
./pruned_transducer_stateless4/streaming_decode.py \
--iter $iter \
--avg $avg \
--decode-chunk-size 16 \
--left-context 64 \
--num-decode-streams 100 \
--exp-dir pruned_transducer_stateless4/exp \
--max-duration 600 \
--decoding-method $m
done
done
done
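If you want to try the ``--right-context`` option mentioned in the note above, a minimal sketch (the value 8 and the epoch/avg numbers are just placeholders) is:
.. code-block:: bash
./pruned_transducer_stateless4/streaming_decode.py \
--epoch 25 \
--avg 3 \
--decode-chunk-size 16 \
--left-context 64 \
--right-context 8 \
--num-decode-streams 100 \
--exp-dir pruned_transducer_stateless4/exp \
--decoding-method greedy_search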
.. tip::
Supported decoding methods are as follows:
- ``greedy_search`` : It takes the symbol with largest posterior probability
of each frame as the decoding result.
- ``beam_search`` : It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf and
`espnet/nets/beam_search_transducer.py <https://github.com/espnet/espnet/blob/master/espnet/nets/beam_search_transducer.py#L247>`_
is used as a reference. Basically, it keeps the top-k states for each frame and expands the kept states with their own contexts to the
next frame.
- ``modified_beam_search`` : It implements the same algorithm as ``beam_search`` above, but it
runs in batch mode with ``--max-sym-per-frame=1`` being hardcoded.
- ``fast_beam_search`` : It implements graph composition between the output ``log_probs`` and
given ``FSAs``. It is hard to describe the details in a few lines of text; you can read
our paper at https://arxiv.org/pdf/2211.00484.pdf or our `rnnt decode code in k2 <https://github.com/k2-fsa/k2/blob/master/k2/csrc/rnnt_decode.h>`_. ``fast_beam_search`` can decode with ``FSAs`` on GPU efficiently.
- ``fast_beam_search_LG`` : The same as ``fast_beam_search`` above, except that ``fast_beam_search`` uses
a trivial graph that has only one state, while ``fast_beam_search_LG`` uses an LG graph
(with an N-gram LM).
- ``fast_beam_search_nbest`` : It produces the decoding results as follows:
- (1) Use ``fast_beam_search`` to get a lattice
- (2) Select ``num_paths`` paths from the lattice using ``k2.random_paths()``
- (3) Unique the selected paths
- (4) Intersect the selected paths with the lattice and compute the
shortest path from the intersection result
- (5) The path with the largest score is used as the decoding output.
- ``fast_beam_search_nbest_LG`` : It implements the same logic as ``fast_beam_search_nbest``; the
only difference is that it uses ``fast_beam_search_LG`` to generate the lattice.
.. NOTE::
The decoding methods supported in ``streaming_decode.py`` might be fewer than those in ``decode.py``;
if needed, you can implement them by yourself or file an issue in `icefall <https://github.com/k2-fsa/icefall/issues>`_.
Export Model
------------
`pruned_transducer_stateless4/export.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless4/export.py>`_ supports exporting checkpoints from ``pruned_transducer_stateless4/exp`` in the following ways.
Export ``model.state_dict()``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Checkpoints saved by ``pruned_transducer_stateless4/train.py`` also include
``optimizer.state_dict()``. It is useful for resuming training. But after training,
we are interested only in ``model.state_dict()``. You can use the following
command to extract ``model.state_dict()``.
.. code-block:: bash
# Assume that --epoch 25 --avg 3 produces the smallest WER
# (You can get such information after running ./pruned_transducer_stateless4/decode.py)
epoch=25
avg=3
./pruned_transducer_stateless4/export.py \
--exp-dir ./pruned_transducer_stateless4/exp \
--streaming-model 1 \
--causal-convolution 1 \
--bpe-model data/lang_bpe_500/bpe.model \
--epoch $epoch \
--avg $avg
.. caution::
``--streaming-model`` and ``--causal-convolution`` must be True to export
a streaming model.
It will generate a file ``./pruned_transducer_stateless4/exp/pretrained.pt``.
.. hint::
To use the generated ``pretrained.pt`` for ``pruned_transducer_stateless4/decode.py``,
you can run:
.. code-block:: bash
cd pruned_transducer_stateless4/exp
ln -s pretrained.pt epoch-999.pt
And then pass ``--epoch 999 --avg 1 --use-averaged-model 0`` to
``./pruned_transducer_stateless4/decode.py``.
To use the exported model with ``./pruned_transducer_stateless4/pretrained.py``, you
can run:
.. code-block:: bash
./pruned_transducer_stateless4/pretrained.py \
--checkpoint ./pruned_transducer_stateless4/exp/pretrained.pt \
--simulate-streaming 1 \
--causal-convolution 1 \
--bpe-model ./data/lang_bpe_500/bpe.model \
--method greedy_search \
/path/to/foo.wav \
/path/to/bar.wav
Export model using ``torch.jit.script()``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
./pruned_transducer_stateless4/export.py \
--exp-dir ./pruned_transducer_stateless4/exp \
--streaming-model 1 \
--causal-convolution 1 \
--bpe-model data/lang_bpe_500/bpe.model \
--epoch 25 \
--avg 3 \
--jit 1
.. caution::
``--streaming-model`` and ``--causal-convolution`` must be True to export
a streaming model.
It will generate a file ``cpu_jit.pt`` in the given ``exp_dir``. You can later
load it by ``torch.jit.load("cpu_jit.pt")``.
Note ``cpu`` in the name ``cpu_jit.pt`` means the parameters when loaded into Python
are on CPU. You can use ``to("cuda")`` to move them to a CUDA device.
.. NOTE::
You will need this ``cpu_jit.pt`` when deploying with Sherpa framework.
Download pretrained models
--------------------------
If you don't want to train from scratch, you can download the pretrained models
by visiting the following links:
- `pruned_transducer_stateless <https://huggingface.co/pkufool/icefall_librispeech_streaming_pruned_transducer_stateless_20220625>`_
- `pruned_transducer_stateless2 <https://huggingface.co/pkufool/icefall_librispeech_streaming_pruned_transducer_stateless2_20220625>`_
- `pruned_transducer_stateless4 <https://huggingface.co/pkufool/icefall_librispeech_streaming_pruned_transducer_stateless4_20220625>`_
- `pruned_transducer_stateless5 <https://huggingface.co/pkufool/icefall_librispeech_streaming_pruned_transducer_stateless5_20220729>`_
See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
for the details of the above pretrained models.
Deploy with Sherpa
------------------
Please see `<https://k2-fsa.github.io/sherpa/python/streaming_asr/conformer/index.html#>`_
for how to deploy the models in ``sherpa``.

View File

@@ -0,0 +1,654 @@
Zipformer Transducer
====================
This tutorial shows you how to run a **streaming** zipformer transducer model
with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.
.. Note::
The tutorial is suitable for `pruned_transducer_stateless7_streaming <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming>`_,
.. HINT::
We assume you have read the page :ref:`install icefall` and have setup
the environment for ``icefall``.
.. HINT::
We recommend you to use a GPU or several GPUs to run this recipe.
.. hint::
Please scroll down to the bottom of this page to find download links
for pretrained models if you don't want to train a model from scratch.
We use pruned RNN-T to compute the loss.
.. note::
You can find the paper about pruned RNN-T at the following address:
`<https://arxiv.org/abs/2206.13236>`_
The transducer model consists of 3 parts:
- Encoder, a.k.a, the transcription network. We use a Zipformer model (proposed by Daniel Povey)
- Decoder, a.k.a, the prediction network. We use a stateless model consisting of
``nn.Embedding`` and ``nn.Conv1d``
- Joiner, a.k.a, the joint network.
.. caution::
Contrary to the conventional RNN-T models, we use a stateless decoder.
That is, it has no recurrent connections.
Data preparation
----------------
.. hint::
The data preparation is the same as for other LibriSpeech recipes; if you have finished
this step, you can skip to ``Training`` directly.
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./prepare.sh
The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
All you need to do is to run it.
The data preparation contains several stages. You can use the following two
options:
- ``--stage``
- ``--stop-stage``
to control which stage(s) should be run. By default, all stages are executed.
For example,
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./prepare.sh --stage 0 --stop-stage 0
means to run only stage 0.
To run stage 2 to stage 5, use:
.. code-block:: bash
$ ./prepare.sh --stage 2 --stop-stage 5
.. HINT::
If you have pre-downloaded the `LibriSpeech <https://www.openslr.org/12>`_
dataset and the `musan <http://www.openslr.org/17/>`_ dataset, say,
they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
``./prepare.sh`` won't re-download them.
.. NOTE::
All generated files by ``./prepare.sh``, e.g., features, lexicon, etc,
are saved in ``./data`` directory.
We provide the following YouTube video showing how to run ``./prepare.sh``.
.. note::
To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
`<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
.. youtube:: ofEIoJL-mGM
Training
--------
Configurable options
~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./pruned_transducer_stateless7_streaming/train.py --help
shows you the training options that can be passed from the commandline.
The following options are used quite often:
- ``--exp-dir``
The directory to save checkpoints, training logs and tensorboard.
- ``--full-libri``
If it's True, the training part uses all the training data, i.e.,
960 hours. Otherwise, the training part uses only the subset
``train-clean-100``, which has 100 hours of training data.
.. CAUTION::
The training set is perturbed by speed with two factors: 0.9 and 1.1.
If ``--full-libri`` is True, each epoch actually processes
``3x960 == 2880`` hours of data.
- ``--num-epochs``
It is the number of epochs to train. For instance,
``./pruned_transducer_stateless7_streaming/train.py --num-epochs 30`` trains for 30 epochs
and generates ``epoch-1.pt``, ``epoch-2.pt``, ..., ``epoch-30.pt``
in the folder ``./pruned_transducer_stateless7_streaming/exp``.
- ``--start-epoch``
It's used to resume training.
``./pruned_transducer_stateless7_streaming/train.py --start-epoch 10`` loads the
checkpoint ``./pruned_transducer_stateless7_streaming/exp/epoch-9.pt`` and starts
training from epoch 10, based on the state from epoch 9.
- ``--world-size``
It is used for multi-GPU single-machine DDP training.
- (a) If it is 1, then no DDP training is used.
- (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
The following shows some use cases with it.
**Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
GPU 2 for training. You can do the following:
.. code-block:: bash
$ cd egs/librispeech/ASR
$ export CUDA_VISIBLE_DEVICES="0,2"
$ ./pruned_transducer_stateless7_streaming/train.py --world-size 2
**Use case 2**: You have 4 GPUs and you want to use all of them
for training. You can do the following:
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./pruned_transducer_stateless7_streaming/train.py --world-size 4
**Use case 3**: You have 4 GPUs but you only want to use GPU 3
for training. You can do the following:
.. code-block:: bash
$ cd egs/librispeech/ASR
$ export CUDA_VISIBLE_DEVICES="3"
$ ./pruned_transducer_stateless7_streaming/train.py --world-size 1
.. caution::
Only multi-GPU single-machine DDP training is implemented at present.
Multi-GPU multi-machine DDP training will be added later.
- ``--max-duration``
It specifies the number of seconds over all utterances in a
batch, before **padding**.
If you encounter CUDA OOM, please reduce it.
.. HINT::
Due to padding, the number of seconds of all utterances in a
batch will usually be larger than ``--max-duration``.
A larger value for ``--max-duration`` may cause OOM during training,
while a smaller value may increase the training time. You have to
tune it.
- ``--use-fp16``
If it is True, the model is trained with half precision. From our experiments, half
precision lets you use a roughly two times larger ``--max-duration``, giving an
almost 2x speed-up.
We recommend using ``--use-fp16 True``.
- ``--short-chunk-size``
When training a streaming attention model with chunk masking, the chunk size is either
the maximum sequence length of the current batch or uniformly sampled from
(1, short_chunk_size). The default value is 50; you don't have to change it most of the time.
- ``--num-left-chunks``
It indicates how many chunks of left context can be seen when calculating attention.
The default value is 4; you don't have to change it most of the time.
- ``--decode-chunk-len``
The chunk size for decoding (in frames before subsampling). It is used for validation.
The default value is 32 (i.e., 320ms).
Pre-configured options
~~~~~~~~~~~~~~~~~~~~~~
There are some training options, e.g., number of encoder layers,
encoder dimension, decoder dimension, number of warmup steps etc,
that are not passed from the commandline.
They are pre-configured by the function ``get_params()`` in
`pruned_transducer_stateless7_streaming/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/train.py>`_
You don't need to change these pre-configured parameters. If you really need to change
them, please modify ``./pruned_transducer_stateless7_streaming/train.py`` directly.
Training logs
~~~~~~~~~~~~~
Training logs and checkpoints are saved in ``--exp-dir`` (e.g., ``pruned_transducer_stateless7_streaming/exp``).
You will find the following files in that directory:
- ``epoch-1.pt``, ``epoch-2.pt``, ...
These are checkpoint files saved at the end of each epoch, containing model
``state_dict`` and optimizer ``state_dict``.
To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
.. code-block:: bash
$ ./pruned_transducer_stateless7_streaming/train.py --start-epoch 11
- ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ...
These are checkpoint files saved every ``--save-every-n`` batches,
containing model ``state_dict`` and optimizer ``state_dict``.
To resume training from some checkpoint, say ``checkpoint-436000``, you can use:
.. code-block:: bash
$ ./pruned_transducer_stateless7_streaming/train.py --start-batch 436000
- ``tensorboard/``
This folder contains TensorBoard logs. Training loss, validation loss, learning
rate, etc., are recorded in these logs. You can visualize them by:
.. code-block:: bash
$ cd pruned_transducer_stateless7_streaming/exp/tensorboard
$ tensorboard dev upload --logdir . --description "pruned transducer training for LibriSpeech with icefall"
.. hint::
If you don't have access to google, you can use the following command
to view the tensorboard log locally:
.. code-block:: bash
cd pruned_transducer_stateless7_streaming/exp/tensorboard
tensorboard --logdir . --port 6008
It will print the following message:
.. code-block::
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.8.0 at http://localhost:6008/ (Press CTRL+C to quit)
Now start your browser and go to `<http://localhost:6008>`_ to view the tensorboard
logs.
- ``log/log-train-xxxx``
It is the detailed training log in text format, same as the one
you saw printed to the console during training.
Usage example
~~~~~~~~~~~~~
You can use the following command to start the training using 4 GPUs:
.. code-block:: bash
export CUDA_VISIBLE_DEVICES="0,1,2,3"
./pruned_transducer_stateless7_streaming/train.py \
--world-size 4 \
--num-epochs 30 \
--start-epoch 1 \
--use-fp16 1 \
--exp-dir pruned_transducer_stateless7_streaming/exp \
--full-libri 1 \
--max-duration 550
Decoding
--------
The decoding part uses checkpoints saved by the training part, so you have
to run the training part first.
.. hint::
There are two kinds of checkpoints:
- (1) ``epoch-1.pt``, ``epoch-2.pt``, ..., which are saved at the end
of each epoch. You can pass ``--epoch`` to
``pruned_transducer_stateless7_streaming/decode.py`` to use them.
- (2) ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ..., which are saved
every ``--save-every-n`` batches. You can pass ``--iter`` to
``pruned_transducer_stateless7_streaming/decode.py`` to use them.
We suggest that you try both types of checkpoints and choose the one
that produces the lowest WERs.
.. tip::
To decode a streaming model, you can use either ``simulate streaming decoding`` in ``decode.py`` or
``real chunk-wise streaming decoding`` in ``streaming_decode.py``. The difference is that ``decode.py``
processes all the acoustic frames at once with masking (i.e., the same as training), while
``streaming_decode.py`` processes the acoustic frames chunk by chunk.
.. NOTE::
``simulate streaming decoding`` in ``decode.py`` and ``real chunk-wise streaming decoding`` in ``streaming_decode.py`` should
produce almost the same results given the same ``--decode-chunk-len``.
Simulate streaming decoding
~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./pruned_transducer_stateless7_streaming/decode.py --help
shows the options for decoding.
The following options are important for streaming models:
``--decode-chunk-len``
It is the same as in ``train.py``, which specifies the chunk size for decoding (in frames before subsampling).
The default value is 32 (i.e., 320ms).
The following shows two examples (for the two types of checkpoints):
.. code-block:: bash
for m in greedy_search fast_beam_search modified_beam_search; do
for epoch in 30; do
for avg in 12 11 10 9 8; do
./pruned_transducer_stateless7_streaming/decode.py \
--epoch $epoch \
--avg $avg \
--decode-chunk-len 32 \
--exp-dir pruned_transducer_stateless7_streaming/exp \
--max-duration 600 \
--decoding-method $m
done
done
done
.. code-block:: bash
for m in greedy_search fast_beam_search modified_beam_search; do
for iter in 474000; do
for avg in 8 10 12 14 16 18; do
./pruned_transducer_stateless7_streaming/decode.py \
--iter $iter \
--avg $avg \
--decode-chunk-len 32 \
--exp-dir pruned_transducer_stateless7_streaming/exp \
--max-duration 600 \
--decoding-method $m
done
done
done
Real streaming decoding
~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./pruned_transducer_stateless7_streaming/streaming_decode.py --help
shows the options for decoding.
The following options are important for streaming models:
``--decode-chunk-len``
It is the same as in ``train.py``; it specifies the chunk size for decoding (in frames before subsampling).
The default value is 32 (i.e., 320 ms).
For real streaming decoding, we process ``--decode-chunk-len`` acoustic frames at a time.
``--num-decode-streams``
The number of decoding streams that can be run in parallel (very similar to the batch size).
For real streaming decoding, batches are packed dynamically. For example, if ``--num-decode-streams``
is 10, sequences 1 to 10 are decoded first; once, say, sequences 1 and 2 finish, sequences 3 to 12
are processed in parallel in the next batch, as illustrated by the sketch below.
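The following is a minimal, self-contained Python sketch of this dynamic packing idea. It is only
an illustration, not the actual implementation in ``streaming_decode.py``; the stream objects and the
per-chunk "decode" step are hypothetical placeholders.

.. code-block:: python

    from collections import deque

    def decode_all(sequences, num_decode_streams=10):
        """Toy illustration of dynamic stream packing."""
        pending = deque(sequences)  # sequences waiting to be decoded
        active = []                 # streams currently packed in a batch
        while pending or active:
            # Refill the batch up to num_decode_streams streams.
            while pending and len(active) < num_decode_streams:
                active.append(pending.popleft())
            # "Decode" one chunk for every active stream (placeholder step).
            for stream in active:
                stream["chunks_left"] -= 1
            # Drop finished streams; new ones are packed in on the next loop.
            active = [s for s in active if s["chunks_left"] > 0]

    decode_all([{"id": i, "chunks_left": i + 1} for i in range(20)])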
The following shows two examples (for the two types of checkpoints):
.. code-block:: bash
for m in greedy_search fast_beam_search modified_beam_search; do
for epoch in 30; do
for avg in 12 11 10 9 8; do
./pruned_transducer_stateless7_streaming/streaming_decode.py \
--epoch $epoch \
--avg $avg \
--decode-chunk-len 32 \
--num-decode-streams 100 \
--exp-dir pruned_transducer_stateless7_streaming/exp \
--decoding-method $m
done
done
done
.. code-block:: bash
for m in greedy_search fast_beam_search modified_beam_search; do
for iter in 474000; do
for avg in 8 10 12 14 16 18; do
./pruned_transducer_stateless7_streaming/streaming_decode.py \
--iter $iter \
--avg $avg \
--decode-chunk-len 16 \
--num-decode-streams 100 \
--exp-dir pruned_transducer_stateless7_streaming/exp \
--decoding-method $m
done
done
done
.. tip::
Supported decoding methods are as follows:
- ``greedy_search`` : It takes the symbol with the largest posterior probability
at each frame as the decoding result.
- ``beam_search`` : It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf and
`espnet/nets/beam_search_transducer.py <https://github.com/espnet/espnet/blob/master/espnet/nets/beam_search_transducer.py#L247>`_
is used as a reference. Basically, it keeps the top-k hypotheses at each frame and expands each of
them with its own context to the next frame.
- ``modified_beam_search`` : It implements the same algorithm as ``beam_search`` above, but it
runs in batch mode with ``--max-sym-per-frame=1`` hardcoded.
- ``fast_beam_search`` : It implements graph composition between the output ``log_probs`` and
the given ``FSAs``. The details are hard to summarize in a few lines; see our paper at
https://arxiv.org/pdf/2211.00484.pdf or our `rnnt decode code in k2 <https://github.com/k2-fsa/k2/blob/master/k2/csrc/rnnt_decode.h>`_. ``fast_beam_search`` can decode with ``FSAs`` on GPU efficiently.
- ``fast_beam_search_LG`` : The same as ``fast_beam_search`` above, except that ``fast_beam_search`` uses
a trivial graph that has only one state, while ``fast_beam_search_LG`` uses an LG graph
(with an N-gram LM); a minimal sketch of the two graph types follows this list.
- ``fast_beam_search_nbest`` : It produces the decoding results as follows:
- (1) Use ``fast_beam_search`` to get a lattice
- (2) Select ``num_paths`` paths from the lattice using ``k2.random_paths()``
- (3) Unique the selected paths
- (4) Intersect the selected paths with the lattice and compute the
shortest path from the intersection result
- (5) The path with the largest score is used as the decoding output.
- ``fast_beam_search_nbest_LG`` : It implements the same logic as ``fast_beam_search_nbest``; the
only difference is that it uses ``fast_beam_search_LG`` to generate the lattice.
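Below is a minimal sketch, assuming ``k2`` is installed, of the two kinds of decoding graphs mentioned
above. The vocabulary size and the ``LG.pt`` path are illustrative placeholders, not values taken from
this recipe.

.. code-block:: python

    import k2
    import torch

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vocab_size = 500  # e.g., the size of the BPE vocabulary (placeholder)

    # fast_beam_search: a trivial graph with a single decoding state.
    decoding_graph = k2.trivial_graph(vocab_size - 1, device=device)

    # fast_beam_search_LG: an LG graph that embeds an N-gram LM
    # (the path below is an assumption; adjust it to your lang dir).
    # decoding_graph = k2.Fsa.from_dict(
    #     torch.load("data/lang_bpe_500/LG.pt", map_location=device)
    # )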
.. NOTE::
``streaming_decode.py`` might support fewer decoding methods than ``decode.py``. If needed,
you can implement them yourself or file an issue in `icefall <https://github.com/k2-fsa/icefall/issues>`_ .
Export Model
------------
Currently it supports exporting checkpoints from ``pruned_transducer_stateless7_streaming/exp`` in the following ways.
Export ``model.state_dict()``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Checkpoints saved by ``pruned_transducer_stateless7_streaming/train.py`` also include
``optimizer.state_dict()``. It is useful for resuming training. But after training,
we are interested only in ``model.state_dict()``. You can use the following
command to extract ``model.state_dict()``.
.. code-block:: bash
# Assume that --epoch 30 --avg 9 produces the smallest WER
# (You can get such information after running ./pruned_transducer_stateless7_streaming/decode.py)
epoch=30
avg=9
./pruned_transducer_stateless7_streaming/export.py \
--exp-dir ./pruned_transducer_stateless7_streaming/exp \
--bpe-model data/lang_bpe_500/bpe.model \
--epoch $epoch \
--avg $avg \
--use-averaged-model=True \
--decode-chunk-len 32
It will generate a file ``./pruned_transducer_stateless7_streaming/exp/pretrained.pt``.
.. hint::
To use the generated ``pretrained.pt`` for ``pruned_transducer_stateless7_streaming/decode.py``,
you can run:
.. code-block:: bash
cd pruned_transducer_stateless7_streaming/exp
ln -s pretrained.pt epoch-999.pt
And then pass ``--epoch 999 --avg 1 --use-averaged-model 0`` to
``./pruned_transducer_stateless7_streaming/decode.py``.
To use the exported model with ``./pruned_transducer_stateless7_streaming/pretrained.py``, you
can run:
.. code-block:: bash
./pruned_transducer_stateless7_streaming/pretrained.py \
--checkpoint ./pruned_transducer_stateless7_streaming/exp/pretrained.pt \
--bpe-model ./data/lang_bpe_500/bpe.model \
--method greedy_search \
--decode-chunk-len 32 \
/path/to/foo.wav \
/path/to/bar.wav
Export model using ``torch.jit.script()``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
./pruned_transducer_stateless7_streaming/export.py \
--exp-dir ./pruned_transducer_stateless7_streaming/exp \
--bpe-model data/lang_bpe_500/bpe.model \
--epoch 30 \
--avg 9 \
--decode-chunk-len 32 \
--jit 1
.. caution::
``--decode-chunk-len`` is required to export a ScriptModule.
It will generate a file ``cpu_jit.pt`` in the given ``exp_dir``. You can later
load it by ``torch.jit.load("cpu_jit.pt")``.
Note that ``cpu`` in the name ``cpu_jit.pt`` means the parameters are on CPU when the model is
loaded into Python. You can use ``to("cuda")`` to move them to a CUDA device.
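For example, the following is a minimal sketch of loading the exported model in Python (the path is
whatever ``--exp-dir`` you used during export):

.. code-block:: python

    import torch

    # Load the TorchScript model exported by export.py with --jit 1.
    model = torch.jit.load(
        "pruned_transducer_stateless7_streaming/exp/cpu_jit.pt"
    )
    model.eval()

    # Optionally move the parameters to a CUDA device.
    if torch.cuda.is_available():
        model = model.to("cuda")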
Export model using ``torch.jit.trace()``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
epoch=30
avg=9
./pruned_transducer_stateless7_streaming/jit_trace_export.py \
--bpe-model data/lang_bpe_500/bpe.model \
--use-averaged-model=True \
--decode-chunk-len 32 \
--exp-dir ./pruned_transducer_stateless7_streaming/exp \
--epoch $epoch \
--avg $avg
.. caution::
``--decode-chunk-len`` is required to export a ScriptModule.
It will generate 3 files:
- ``./pruned_transducer_stateless7_streaming/exp/encoder_jit_trace.pt``
- ``./pruned_transducer_stateless7_streaming/exp/decoder_jit_trace.pt``
- ``./pruned_transducer_stateless7_streaming/exp/joiner_jit_trace.pt``
To use the generated files with ``./pruned_transducer_stateless7_streaming/jit_trace_pretrained.py``:
.. code-block:: bash
./pruned_transducer_stateless7_streaming/jit_trace_pretrained.py \
--encoder-model-filename ./pruned_transducer_stateless7_streaming/exp/encoder_jit_trace.pt \
--decoder-model-filename ./pruned_transducer_stateless7_streaming/exp/decoder_jit_trace.pt \
--joiner-model-filename ./pruned_transducer_stateless7_streaming/exp/joiner_jit_trace.pt \
--bpe-model ./data/lang_bpe_500/bpe.model \
--decode-chunk-len 32 \
/path/to/foo.wav
Download pretrained models
--------------------------
If you don't want to train from scratch, you can download the pretrained models
by visiting the following links:
- `pruned_transducer_stateless7_streaming <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29>`_
See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
for the details of the above pretrained models.
Deploy with Sherpa
------------------
Please see `<https://k2-fsa.github.io/sherpa/python/streaming_asr/conformer/index.html#>`_
for how to deploy the models in ``sherpa``.

View File

@ -13,7 +13,5 @@ We may add recipes for other tasks as well in the future.
:maxdepth: 2
:caption: Table of Contents
aishell/index
librispeech/index
timit/index
yesno/index
Non-streaming-ASR/index
Streaming-ASR/index

View File

@ -0,0 +1,38 @@
# Introduction
This recipe trains multi-domain ASR models for AliMeeting. By multi-domain, we mean that
we train a single model on close-talk and far-field conditions. This recipe optionally
uses [GSS](https://github.com/desh2608/gss)-based enhancement for the far-field array microphones.
We pool data in the following 4 ways and train a single model on the pooled data:
(i) individual headset microphone (IHM)
(ii) IHM with simulated reverb
(iii) Single distant microphone (SDM)
(iv) GSS-enhanced array microphones
This is different from `alimeeting/ASR` since that recipe trains a model only on the
far-field audio. Additionally, we use text normalization here similar to the original
M2MeT challenge, so the results should be more comparable to those from Table 4 of
the [paper](https://arxiv.org/abs/2110.07393).
The following additional packages need to be installed to run this recipe:
* `pip install jieba`
* `pip install paddlepaddle`
* `pip install git+https://github.com/desh2608/gss.git`
[./RESULTS.md](./RESULTS.md) contains the latest results.
## Performance Record
### pruned_transducer_stateless7
The following are decoded using `modified_beam_search`:
| Evaluation set | eval WER | test WER |
|--------------------------|------------|---------|
| IHM | 9.58 | 11.53 |
| SDM | 23.37 | 25.85 |
| MDM (GSS-enhanced) | 11.82 | 14.22 |
See [RESULTS](/egs/alimeeting/ASR_v2/RESULTS.md) for details.

View File

@ -0,0 +1,90 @@
## Results (CER)
#### 2022-12-09
#### Zipformer (pruned_transducer_stateless7)
Zipformer encoder + non-recurrent decoder. The decoder
contains only an embedding layer, a Conv1d (with kernel size 2) and a linear
layer (to transform tensor dimensions).
All the results below are using a single model that is trained by combining the following
data: IHM, IHM+reverb, SDM, and GSS-enhanced MDM. Speed perturbation and MUSAN noise
augmentation are applied on top of the pooled data.
**WERs for IHM:**
| | eval | test | comment |
|---------------------------|------------|------------|------------------------------------------|
| greedy search | 10.13 | 12.21 | --epoch 15 --avg 8 --max-duration 500 |
| modified beam search | 9.58 | 11.53 | --epoch 15 --avg 8 --max-duration 500 --beam-size 4 |
| fast beam search | 9.92 | 12.07 | --epoch 15 --avg 8 --max-duration 500 --beam-size 4 --max-contexts 4 --max-states 8 |
**WERs for SDM:**
| | eval | test | comment |
|---------------------------|------------|------------|------------------------------------------|
| greedy search | 23.70 | 26.41 | --epoch 15 --avg 8 --max-duration 500 |
| modified beam search | 23.37 | 25.85 | --epoch 15 --avg 8 --max-duration 500 --beam-size 4 |
| fast beam search | 23.60 | 26.38 | --epoch 15 --avg 8 --max-duration 500 --beam-size 4 --max-contexts 4 --max-states 8 |
**WERs for GSS-enhanced MDM:**
| | eval | test | comment |
|---------------------------|------------|------------|------------------------------------------|
| greedy search | 12.24 | 14.99 | --epoch 15 --avg 8 --max-duration 500 |
| modified beam search | 11.82 | 14.22 | --epoch 15 --avg 8 --max-duration 500 --beam-size 4 |
| fast beam search | 12.30 | 14.98 | --epoch 15 --avg 8 --max-duration 500 --beam-size 4 --max-contexts 4 --max-states 8 |
The training command for reproducing is given below:
```
export CUDA_VISIBLE_DEVICES="0,1,2,3"
./pruned_transducer_stateless7/train.py \
--world-size 4 \
--num-epochs 15 \
--exp-dir pruned_transducer_stateless7/exp \
--max-duration 300 \
--max-cuts 100 \
--prune-range 5 \
--lr-factor 5 \
--lm-scale 0.25 \
--use-fp16 True
```
The decoding command is:
```
# greedy search
./pruned_transducer_stateless7/decode.py \
--epoch 15 \
--avg 8 \
--exp-dir ./pruned_transducer_stateless7/exp \
--max-duration 500 \
--decoding-method greedy_search
# modified beam search
./pruned_transducer_stateless7/decode.py \
--epoch 15 \
--avg 8 \
--exp-dir ./pruned_transducer_stateless7/exp \
--max-duration 500 \
--decoding-method modified_beam_search \
--beam-size 4
# fast beam search
./pruned_transducer_stateless7/decode.py \
--epoch 15 \
--avg 8 \
  --exp-dir ./pruned_transducer_stateless7/exp \
--max-duration 500 \
--decoding-method fast_beam_search \
--beam 4 \
--max-contexts 4 \
--max-states 8
```
Pretrained model is available at <https://huggingface.co/desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7>
The tensorboard training log can be found at
<https://tensorboard.dev/experiment/EzmVahMMTb2YfKWXwQ2dyQ/#scalars>

View File

View File

@ -0,0 +1,193 @@
#!/usr/bin/env python3
# Copyright 2022 Johns Hopkins University (authors: Desh Raj)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file computes fbank features of the AliMeeting dataset.
For the training data, we prepare IHM, reverberated IHM, SDM, and GSS-enhanced
audios. For the test data, we separately prepare IHM, SDM, and GSS-enhanced
parts (which are the 3 evaluation settings).
It looks for manifests in the directory data/manifests.
The generated fbank features are saved in data/fbank.
"""
import logging
from pathlib import Path
import torch
import torch.multiprocessing
from lhotse import CutSet, LilcomChunkyWriter
from lhotse.features.kaldifeat import (
KaldifeatFbank,
KaldifeatFbankConfig,
KaldifeatFrameOptions,
KaldifeatMelOptions,
)
from lhotse.recipes.utils import read_manifests_if_cached
# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slows things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
torch.multiprocessing.set_sharing_strategy("file_system")
def compute_fbank_alimeeting():
src_dir = Path("data/manifests")
output_dir = Path("data/fbank")
sampling_rate = 16000
num_mel_bins = 80
extractor = KaldifeatFbank(
KaldifeatFbankConfig(
frame_opts=KaldifeatFrameOptions(sampling_rate=sampling_rate),
mel_opts=KaldifeatMelOptions(num_bins=num_mel_bins),
device="cuda",
)
)
logging.info("Reading manifests")
manifests_ihm = read_manifests_if_cached(
dataset_parts=["train", "eval", "test"],
output_dir=src_dir,
prefix="alimeeting-ihm",
suffix="jsonl.gz",
)
manifests_sdm = read_manifests_if_cached(
dataset_parts=["train", "eval", "test"],
output_dir=src_dir,
prefix="alimeeting-sdm",
suffix="jsonl.gz",
)
# For GSS we already have cuts so we read them directly.
manifests_gss = read_manifests_if_cached(
dataset_parts=["train", "eval", "test"],
output_dir=src_dir,
prefix="alimeeting-gss",
suffix="jsonl.gz",
)
def _extract_feats(cuts: CutSet, storage_path: Path, manifest_path: Path) -> None:
cuts = cuts + cuts.perturb_speed(0.9) + cuts.perturb_speed(1.1)
_ = cuts.compute_and_store_features_batch(
extractor=extractor,
storage_path=storage_path,
manifest_path=manifest_path,
batch_duration=5000,
num_workers=8,
storage_type=LilcomChunkyWriter,
)
logging.info(
"Preparing training cuts: IHM + reverberated IHM + SDM + GSS (optional)"
)
logging.info("Processing train split IHM")
cuts_ihm = (
CutSet.from_manifests(**manifests_ihm["train"])
.trim_to_supervisions(keep_overlapping=False, keep_all_channels=False)
.modify_ids(lambda x: x + "-ihm")
)
_extract_feats(
cuts_ihm,
output_dir / "feats_train_ihm",
src_dir / "cuts_train_ihm.jsonl.gz",
)
logging.info("Processing train split IHM + reverberated IHM")
cuts_ihm_rvb = cuts_ihm.reverb_rir()
_extract_feats(
cuts_ihm_rvb,
output_dir / "feats_train_ihm_rvb",
src_dir / "cuts_train_ihm_rvb.jsonl.gz",
)
logging.info("Processing train split SDM")
cuts_sdm = (
CutSet.from_manifests(**manifests_sdm["train"])
.trim_to_supervisions(keep_overlapping=False)
.modify_ids(lambda x: x + "-sdm")
)
_extract_feats(
cuts_sdm,
output_dir / "feats_train_sdm",
src_dir / "cuts_train_sdm.jsonl.gz",
)
logging.info("Processing train split GSS")
cuts_gss = (
CutSet.from_manifests(**manifests_gss["train"])
.trim_to_supervisions(keep_overlapping=False)
.modify_ids(lambda x: x + "-gss")
)
_extract_feats(
cuts_gss,
output_dir / "feats_train_gss",
src_dir / "cuts_train_gss.jsonl.gz",
)
logging.info("Preparing test cuts: IHM, SDM, GSS (optional)")
for split in ["eval", "test"]:
logging.info(f"Processing {split} IHM")
cuts_ihm = (
CutSet.from_manifests(**manifests_ihm[split])
.trim_to_supervisions(keep_overlapping=False, keep_all_channels=False)
.compute_and_store_features_batch(
extractor=extractor,
storage_path=output_dir / f"feats_{split}_ihm",
manifest_path=src_dir / f"cuts_{split}_ihm.jsonl.gz",
batch_duration=500,
num_workers=4,
storage_type=LilcomChunkyWriter,
)
)
logging.info(f"Processing {split} SDM")
cuts_sdm = (
CutSet.from_manifests(**manifests_sdm[split])
.trim_to_supervisions(keep_overlapping=False)
.compute_and_store_features_batch(
extractor=extractor,
storage_path=output_dir / f"feats_{split}_sdm",
manifest_path=src_dir / f"cuts_{split}_sdm.jsonl.gz",
batch_duration=500,
num_workers=4,
storage_type=LilcomChunkyWriter,
)
)
logging.info(f"Processing {split} GSS")
cuts_gss = (
CutSet.from_manifests(**manifests_gss[split])
.trim_to_supervisions(keep_overlapping=False)
.compute_and_store_features_batch(
extractor=extractor,
storage_path=output_dir / f"feats_{split}_gss",
manifest_path=src_dir / f"cuts_{split}_gss.jsonl.gz",
batch_duration=500,
num_workers=4,
storage_type=LilcomChunkyWriter,
)
)
if __name__ == "__main__":
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO)
compute_fbank_alimeeting()

View File

@ -0,0 +1 @@
../../../librispeech/ASR/local/compute_fbank_musan.py

View File

@ -0,0 +1,158 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Data preparation for AliMeeting GSS-enhanced dataset.
import logging
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from lhotse import Recording, RecordingSet, SupervisionSet
from lhotse.qa import fix_manifests
from lhotse.recipes.utils import read_manifests_if_cached
from lhotse.utils import fastcopy
from tqdm import tqdm
logging.basicConfig(
format="%(asctime)s %(levelname)-8s %(message)s",
level=logging.INFO,
datefmt="%Y-%m-%d %H:%M:%S",
)
def get_args():
import argparse
parser = argparse.ArgumentParser(description="AliMeeting GSS-enhanced dataset preparation.")
parser.add_argument(
"manifests_dir",
type=Path,
help="Path to directory containing AliMeeting manifests.",
)
parser.add_argument(
"enhanced_dir",
type=Path,
help="Path to enhanced data directory.",
)
parser.add_argument(
"--num-jobs",
"-j",
type=int,
default=1,
help="Number of parallel jobs to run.",
)
parser.add_argument(
"--min-segment-duration",
"-d",
type=float,
default=0.0,
help="Minimum duration of a segment in seconds.",
)
return parser.parse_args()
def find_recording_and_create_new_supervision(enhanced_dir, supervision):
"""
Given a supervision (corresponding to an original AliMeeting recording), this function finds the
enhanced recording corresponding to the supervision, and returns this recording and
a new supervision whose start and end times are adjusted to match the enhanced recording.
"""
file_name = Path(
f"{supervision.recording_id}-{supervision.speaker}-{int(100*supervision.start):06d}_{int(100*supervision.end):06d}.flac"
)
save_path = enhanced_dir / f"{supervision.recording_id}" / file_name
if save_path.exists():
recording = Recording.from_file(save_path)
if recording.duration == 0:
logging.warning(f"Skipping {save_path} which has duration 0 seconds.")
return None
# The old supervision is w.r.t. the original recording; we create a new
# supervision w.r.t. the enhanced segment
new_supervision = fastcopy(
supervision,
recording_id=recording.id,
start=0,
duration=recording.duration,
)
return recording, new_supervision
else:
logging.warning(f"{save_path} does not exist.")
return None
def main(args):
# Get arguments
manifests_dir = args.manifests_dir
enhanced_dir = args.enhanced_dir
# Load manifests from cache if they exist (saves time)
manifests = read_manifests_if_cached(
dataset_parts=["train", "eval", "test"],
output_dir=manifests_dir,
prefix="alimeeting-sdm",
suffix="jsonl.gz",
)
if not manifests:
raise ValueError(
"AliMeeting SDM manifests not found in {}".format(manifests_dir)
)
with ThreadPoolExecutor(args.num_jobs) as ex:
for part in ["train", "eval", "test"]:
logging.info(f"Processing {part}...")
supervisions_orig = manifests[part]["supervisions"].filter(
lambda s: s.duration >= args.min_segment_duration
)
futures = []
for supervision in tqdm(
supervisions_orig,
desc="Distributing tasks",
):
futures.append(
ex.submit(
find_recording_and_create_new_supervision,
enhanced_dir,
supervision,
)
)
recordings = []
supervisions = []
for future in tqdm(
futures,
total=len(futures),
desc="Processing tasks",
):
result = future.result()
if result is not None:
recording, new_supervision = result
recordings.append(recording)
supervisions.append(new_supervision)
# Remove duplicates from the recordings
recordings_nodup = {}
for recording in recordings:
if recording.id not in recordings_nodup:
recordings_nodup[recording.id] = recording
else:
logging.warning("Recording {} is duplicated.".format(recording.id))
recordings = RecordingSet.from_recordings(recordings_nodup.values())
supervisions = SupervisionSet.from_segments(supervisions)
recordings, supervisions = fix_manifests(
recordings=recordings, supervisions=supervisions
)
logging.info(f"Writing {part} enhanced manifests")
recordings.to_file(
manifests_dir / f"alimeeting-gss_recordings_{part}.jsonl.gz"
)
supervisions.to_file(
manifests_dir / f"alimeeting-gss_supervisions_{part}.jsonl.gz"
)
if __name__ == "__main__":
args = get_args()
main(args)

View File

@ -0,0 +1,98 @@
#!/bin/bash
# This script is used to run GSS-based enhancement on AliMeeting data.
set -euo pipefail
nj=4
stage=0
. shared/parse_options.sh || exit 1
if [ $# != 2 ]; then
echo "Wrong #arguments ($#, expected 2)"
echo "Usage: local/prepare_alimeeting_gss.sh [options] <data-dir> <exp-dir>"
echo "e.g. local/prepare_alimeeting_gss.sh data/manifests exp/ami_gss"
echo "main options (for others, see top of script file)"
echo " --nj <nj> # number of parallel jobs"
echo " --stage <stage> # stage to start running from"
exit 1;
fi
DATA_DIR=$1
EXP_DIR=$2
mkdir -p $EXP_DIR
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
if [ $stage -le 1 ]; then
log "Stage 1: Prepare cut sets"
for part in train eval test; do
lhotse cut simple \
-r $DATA_DIR/alimeeting-mdm_recordings_${part}.jsonl.gz \
-s $DATA_DIR/alimeeting-mdm_supervisions_${part}.jsonl.gz \
$EXP_DIR/cuts_${part}.jsonl.gz
done
fi
if [ $stage -le 2 ]; then
log "Stage 2: Trim cuts to supervisions (1 cut per supervision segment)"
for part in train eval test; do
lhotse cut trim-to-supervisions --discard-overlapping \
$EXP_DIR/cuts_${part}.jsonl.gz $EXP_DIR/cuts_per_segment_${part}.jsonl.gz
done
fi
if [ $stage -le 3 ]; then
log "Stage 3: Split manifests for multi-GPU processing (optional)"
for part in train eval test; do
gss utils split $nj $EXP_DIR/cuts_per_segment_${part}.jsonl.gz \
$EXP_DIR/cuts_per_segment_${part}_split$nj
done
fi
if [ $stage -le 4 ]; then
log "Stage 4: Enhance train segments using GSS (requires GPU)"
# for train, we use smaller context and larger batches to speed-up processing
for JOB in $(seq $nj); do
gss enhance cuts $EXP_DIR/cuts_train.jsonl.gz \
$EXP_DIR/cuts_per_segment_train_split$nj/cuts_per_segment_train.${JOB}.jsonl.gz $EXP_DIR/enhanced \
--bss-iterations 10 \
--context-duration 5.0 \
--use-garbage-class \
--channels 0,1,2,3,4,5,6,7 \
--min-segment-length 0.05 \
--max-segment-length 25.0 \
--max-batch-duration 60.0 \
--num-buckets 4 \
--num-workers 4
done
fi
if [ $stage -le 5 ]; then
log "Stage 5: Enhance eval/test segments using GSS (using GPU)"
# for eval/test, we use larger context and smaller batches to get better quality
for part in eval test; do
for JOB in $(seq $nj); do
gss enhance cuts $EXP_DIR/cuts_${part}.jsonl.gz \
$EXP_DIR/cuts_per_segment_${part}_split$nj/cuts_per_segment_${part}.${JOB}.jsonl.gz \
$EXP_DIR/enhanced \
--bss-iterations 10 \
--context-duration 15.0 \
--use-garbage-class \
--channels 0,1,2,3,4,5,6,7 \
--min-segment-length 0.05 \
--max-segment-length 16.0 \
--max-batch-duration 45.0 \
--num-buckets 4 \
--num-workers 4
done
done
fi
if [ $stage -le 6 ]; then
log "Stage 6: Prepare manifests for GSS-enhanced data"
python local/prepare_alimeeting_enhanced.py $DATA_DIR $EXP_DIR/enhanced -j $nj --min-segment-duration 0.05
fi

View File

@ -0,0 +1 @@
../../ASR/local/prepare_char.py

View File

@ -0,0 +1 @@
../../ASR/local/prepare_words.py

View File

@ -0,0 +1 @@
../../ASR/local/text2segments.py

View File

@ -0,0 +1 @@
../../ASR/local/text2token.py

125
egs/alimeeting/ASR_v2/prepare.sh Executable file
View File

@ -0,0 +1,125 @@
#!/usr/bin/env bash
set -eou pipefail
stage=-1
stop_stage=100
use_gss=true # Use GSS-based enhancement with MDM setting
# We assume dl_dir (download dir) contains the following
# directories and files. If not, they will be downloaded
# by this script automatically.
#
# - $dl_dir/alimeeting
# This directory contains the following files downloaded from
# https://openslr.org/62/
#
# - Train_Ali_far.tar.gz
# - Train_Ali_near.tar.gz
# - Test_Ali.tar.gz
# - Eval_Ali.tar.gz
#
# - $dl_dir/musan
# This directory contains the following directories downloaded from
# http://www.openslr.org/17/
#
# - music
# - noise
# - speech
dl_dir=$PWD/download
. shared/parse_options.sh || exit 1
# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
log "dl_dir: $dl_dir"
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
log "Stage 0: Download data"
if [ ! -f $dl_dir/alimeeting/Train_Ali_far.tar.gz ]; then
lhotse download ali-meeting $dl_dir/alimeeting
fi
fi
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
log "Stage 1: Prepare alimeeting manifest"
# We assume that you have downloaded the alimeeting corpus
# to $dl_dir/alimeeting
for part in ihm sdm mdm; do
mkdir -p data/manifests/alimeeting
lhotse prepare ali-meeting --mic $part --save-mono --normalize-text m2met \
$dl_dir/alimeeting data/manifests
done
fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
log "Stage 2: Prepare musan manifest"
# We assume that you have downloaded the musan corpus
# to data/musan
mkdir -p data/manifests
lhotse prepare musan $dl_dir/musan data/manifests
fi
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ] && [ $use_gss = true ]; then
log "Stage 3: Apply GSS enhancement on MDM data (this stage requires a GPU)"
# We assume that you have installed the GSS package: https://github.com/desh2608/gss
local/prepare_alimeeting_gss.sh data/manifests exp/alimeeting_gss
fi
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
log "Stage 4: Compute fbank for musan"
mkdir -p data/fbank
python local/compute_fbank_musan.py
fi
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
log "Stage 5: Compute fbank for alimeeting"
mkdir -p data/fbank
python local/compute_fbank_alimeeting.py
log "Combine features from train splits"
lhotse combine data/manifests/cuts_train_{ihm,ihm_rvb,sdm,gss}.jsonl.gz - | shuf |\
gzip -c > data/manifests/cuts_train_all.jsonl.gz
fi
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
log "Stage 6: Prepare char based lang"
lang_char_dir=data/lang_char
mkdir -p $lang_char_dir
# Prepare text.
# Note: in Linux, you can install jq with the following command:
# wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
gunzip -c data/manifests/alimeeting-sdm_supervisions_train.jsonl.gz \
| jq ".text" | sed 's/"//g' \
| ./local/text2token.py -t "char" > $lang_char_dir/text
# Prepare words segments
python ./local/text2segments.py \
--input $lang_char_dir/text \
--output $lang_char_dir/text_words_segmentation
cat $lang_char_dir/text_words_segmentation | sed "s/ /\n/g" \
| sort -u | sed "/^$/d" \
| uniq > $lang_char_dir/words_no_ids.txt
# Prepare words.txt
if [ ! -f $lang_char_dir/words.txt ]; then
./local/prepare_words.py \
--input-file $lang_char_dir/words_no_ids.txt \
--output-file $lang_char_dir/words.txt
fi
if [ ! -f $lang_char_dir/L_disambig.pt ]; then
./local/prepare_char.py
fi
fi

View File

@ -0,0 +1,419 @@
# Copyright 2021 Piotr Żelasko
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import re
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, Optional
import torch
from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy
from lhotse.cut import Cut
from lhotse.dataset import (
CutConcatenate,
CutMix,
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
PrecomputedFeatures,
SpecAugment,
)
from lhotse.dataset.input_strategies import OnTheFlyFeatures
from lhotse.utils import fix_random_seed
from torch.utils.data import DataLoader
from tqdm import tqdm
from icefall.utils import str2bool
class _SeedWorkers:
def __init__(self, seed: int):
self.seed = seed
def __call__(self, worker_id: int):
fix_random_seed(self.seed + worker_id)
class AlimeetingAsrDataModule:
"""
DataModule for k2 ASR experiments.
It assumes there is always one train and valid dataloader,
but there can be multiple test dataloaders (e.g. LibriSpeech test-clean
and test-other).
It contains all the common data pipeline modules used in ASR
experiments, e.g.:
- dynamic batch size,
- bucketing samplers,
- cut concatenation,
- augmentation,
- on-the-fly feature extraction
This class should be derived for specific corpora used in ASR tasks.
"""
def __init__(self, args: argparse.Namespace):
self.args = args
@classmethod
def add_arguments(cls, parser: argparse.ArgumentParser):
group = parser.add_argument_group(
title="ASR data related options",
description=(
"These options are used for the preparation of "
"PyTorch DataLoaders from Lhotse CutSet's -- they control the "
"effective batch sizes, sampling strategies, applied data "
"augmentations, etc."
),
)
group.add_argument(
"--manifest-dir",
type=Path,
default=Path("data/manifests"),
help="Path to directory with train/valid/test cuts.",
)
group.add_argument(
"--enable-musan",
type=str2bool,
default=True,
help=(
"When enabled, select noise from MUSAN and mix it "
"with training dataset. "
),
)
group.add_argument(
"--concatenate-cuts",
type=str2bool,
default=False,
help=(
"When enabled, utterances (cuts) will be concatenated "
"to minimize the amount of padding."
),
)
group.add_argument(
"--duration-factor",
type=float,
default=1.0,
help=(
"Determines the maximum duration of a concatenated cut "
"relative to the duration of the longest cut in a batch."
),
)
group.add_argument(
"--gap",
type=float,
default=1.0,
help=(
"The amount of padding (in seconds) inserted between "
"concatenated cuts. This padding is filled with noise when "
"noise augmentation is used."
),
)
group.add_argument(
"--max-duration",
type=int,
default=100.0,
help=(
"Maximum pooled recordings duration (seconds) in a "
"single batch. You can reduce it if it causes CUDA OOM."
),
)
group.add_argument(
"--max-cuts", type=int, default=None, help="Maximum cuts in a single batch."
)
group.add_argument(
"--num-buckets",
type=int,
default=50,
help=(
"The number of buckets for the BucketingSampler"
"(you might want to increase it for larger datasets)."
),
)
group.add_argument(
"--on-the-fly-feats",
type=str2bool,
default=False,
help=(
"When enabled, use on-the-fly cut mixing and feature "
"extraction. Will drop existing precomputed feature manifests "
"if available."
),
)
group.add_argument(
"--shuffle",
type=str2bool,
default=True,
help=(
"When enabled (=default), the examples will be "
"shuffled for each epoch."
),
)
group.add_argument(
"--num-workers",
type=int,
default=8,
help=(
"The number of training dataloader workers that " "collect the batches."
),
)
group.add_argument(
"--enable-spec-aug",
type=str2bool,
default=True,
help="When enabled, use SpecAugment for training dataset.",
)
group.add_argument(
"--spec-aug-time-warp-factor",
type=int,
default=80,
help=(
"Used only when --enable-spec-aug is True. "
"It specifies the factor for time warping in SpecAugment. "
"Larger values mean more warping. "
"A value less than 1 means to disable time warp."
),
)
def train_dataloaders(
self,
cuts_train: CutSet,
sampler_state_dict: Optional[Dict[str, Any]] = None,
) -> DataLoader:
"""
Args:
cuts_train:
CutSet for training.
sampler_state_dict:
The state dict for the training sampler.
"""
logging.info("About to get Musan cuts")
transforms = []
if self.args.enable_musan:
logging.info("Enable MUSAN")
cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz")
transforms.append(
CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)
)
else:
logging.info("Disable MUSAN")
if self.args.concatenate_cuts:
logging.info(
"Using cut concatenation with duration factor "
f"{self.args.duration_factor} and gap {self.args.gap}."
)
# Cut concatenation should be the first transform in the list,
# so that if we e.g. mix noise in, it will fill the gaps between
# different utterances.
transforms = [
CutConcatenate(
duration_factor=self.args.duration_factor, gap=self.args.gap
)
] + transforms
input_transforms = []
if self.args.enable_spec_aug:
logging.info("Enable SpecAugment")
logging.info(f"Time warp factor: {self.args.spec_aug_time_warp_factor}")
input_transforms.append(
SpecAugment(
time_warp_factor=self.args.spec_aug_time_warp_factor,
num_frame_masks=2,
features_mask_size=27,
num_feature_masks=2,
frames_mask_size=100,
)
)
else:
logging.info("Disable SpecAugment")
logging.info("About to create train dataset")
if self.args.on_the_fly_feats:
train = K2SpeechRecognitionDataset(
cut_transforms=transforms,
input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
input_transforms=input_transforms,
)
else:
train = K2SpeechRecognitionDataset(
cut_transforms=transforms,
input_transforms=input_transforms,
)
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
max_cuts=self.args.max_cuts,
shuffle=False,
num_buckets=self.args.num_buckets,
drop_last=True,
)
logging.info("About to create train dataloader")
if sampler_state_dict is not None:
logging.info("Loading sampler state dict")
train_sampler.load_state_dict(sampler_state_dict)
# 'seed' is derived from the current random state, which will have
# previously been set in the main process.
seed = torch.randint(0, 100000, ()).item()
worker_init_fn = _SeedWorkers(seed)
train_dl = DataLoader(
train,
sampler=train_sampler,
batch_size=None,
num_workers=self.args.num_workers,
persistent_workers=False,
worker_init_fn=worker_init_fn,
)
return train_dl
def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
transforms = []
if self.args.concatenate_cuts:
transforms = [
CutConcatenate(
duration_factor=self.args.duration_factor, gap=self.args.gap
)
] + transforms
logging.info("About to create dev dataset")
if self.args.on_the_fly_feats:
validate = K2SpeechRecognitionDataset(
cut_transforms=transforms,
input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
)
else:
validate = K2SpeechRecognitionDataset(
cut_transforms=transforms,
)
valid_sampler = DynamicBucketingSampler(
cuts_valid,
max_duration=self.args.max_duration,
shuffle=False,
)
logging.info("About to create dev dataloader")
valid_dl = DataLoader(
validate,
sampler=valid_sampler,
batch_size=None,
num_workers=2,
persistent_workers=False,
)
return valid_dl
def test_dataloaders(self, cuts: CutSet) -> DataLoader:
logging.debug("About to create test dataset")
test = K2SpeechRecognitionDataset(
input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
if self.args.on_the_fly_feats
else PrecomputedFeatures(),
return_cuts=True,
)
sampler = DynamicBucketingSampler(
cuts, max_duration=self.args.max_duration, shuffle=False
)
logging.debug("About to create test dataloader")
test_dl = DataLoader(
test,
batch_size=None,
sampler=sampler,
num_workers=self.args.num_workers,
)
return test_dl
def remove_short_cuts(self, cut: Cut) -> bool:
"""
See: https://github.com/k2-fsa/icefall/issues/500
Basically, the zipformer model subsamples the input using the following formula:
num_out_frames = ((num_in_frames - 7)//2 + 1)//2
For num_out_frames to be at least 1, num_in_frames must be at least 9.
"""
return cut.duration >= 0.09
@lru_cache()
def train_cuts(self, sp: Optional[Any] = None) -> CutSet:
logging.info("About to get AMI train cuts")
def _remove_short_and_long_utt(c: Cut):
if c.duration < 0.1 or c.duration > 25.0:
return False
# In pruned RNN-T, we require that T >= S
# where T is the number of feature frames after subsampling
# and S is the number of tokens in the utterance
# In ./zipformer.py, the conv module uses the following expression
# for subsampling
T = ((c.num_frames - 7) // 2 + 1) // 2
tokens = c.supervisions[0].text
return T >= len(tokens)
cuts_train = load_manifest_lazy(
self.args.manifest_dir / "cuts_train_all.jsonl.gz"
)
return cuts_train.filter(_remove_short_and_long_utt)
@lru_cache()
def eval_ihm_cuts(self) -> CutSet:
logging.info("About to get AliMeeting IHM eval cuts")
cs = load_manifest_lazy(self.args.manifest_dir / "cuts_eval_ihm.jsonl.gz")
return cs.filter(self.remove_short_cuts)
@lru_cache()
def eval_sdm_cuts(self) -> CutSet:
logging.info("About to get AliMeeting SDM eval cuts")
cs = load_manifest_lazy(self.args.manifest_dir / "cuts_eval_sdm.jsonl.gz")
return cs.filter(self.remove_short_cuts)
@lru_cache()
def eval_gss_cuts(self) -> CutSet:
if not (self.args.manifest_dir / "cuts_eval_gss.jsonl.gz").exists():
logging.info("No GSS dev cuts found")
return None
logging.info("About to get AliMeeting GSS-enhanced eval cuts")
cs = load_manifest_lazy(self.args.manifest_dir / "cuts_eval_gss.jsonl.gz")
return cs.filter(self.remove_short_cuts)
@lru_cache()
def test_ihm_cuts(self) -> CutSet:
logging.info("About to get AliMeeting IHM test cuts")
cs = load_manifest_lazy(self.args.manifest_dir / "cuts_test_ihm.jsonl.gz")
return cs.filter(self.remove_short_cuts)
@lru_cache()
def test_sdm_cuts(self) -> CutSet:
logging.info("About to get AliMeeting SDM test cuts")
cs = load_manifest_lazy(self.args.manifest_dir / "cuts_test_sdm.jsonl.gz")
return cs.filter(self.remove_short_cuts)
@lru_cache()
def test_gss_cuts(self) -> CutSet:
if not (self.args.manifest_dir / "cuts_test_gss.jsonl.gz").exists():
logging.info("No GSS test cuts found")
return None
logging.info("About to get AliMeeting GSS-enhanced test cuts")
cs = load_manifest_lazy(self.args.manifest_dir / "cuts_test_gss.jsonl.gz")
return cs.filter(self.remove_short_cuts)

View File

@ -0,0 +1 @@
../../../librispeech/ASR/pruned_transducer_stateless7/beam_search.py

View File

@ -0,0 +1,698 @@
#!/usr/bin/env python3
#
# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Usage:
(1) greedy search
./pruned_transducer_stateless7/decode.py \
--epoch 15 \
--avg 8 \
--exp-dir ./pruned_transducer_stateless7/exp \
--max-duration 500 \
--decoding-method greedy_search
(2) modified beam search
./pruned_transducer_stateless7/decode.py \
--epoch 15 \
--avg 8 \
--exp-dir ./pruned_transducer_stateless7/exp \
--max-duration 500 \
--decoding-method modified_beam_search \
--beam-size 4
(3) fast beam search
./pruned_transducer_stateless7/decode.py \
--epoch 15 \
--avg 8 \
--exp-dir ./pruned_transducer_stateless7/exp \
--max-duration 500 \
--decoding-method fast_beam_search \
--beam 4 \
--max-contexts 4 \
--max-states 8
"""
import argparse
import logging
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import k2
import sentencepiece as spm
import torch
import torch.nn as nn
from asr_datamodule import AlimeetingAsrDataModule
from beam_search import (
beam_search,
fast_beam_search_nbest_LG,
fast_beam_search_one_best,
greedy_search,
greedy_search_batch,
modified_beam_search,
)
from train import add_model_arguments, get_params, get_transducer_model
from icefall import NgramLm
from icefall.checkpoint import (
average_checkpoints,
average_checkpoints_with_averaged_model,
find_checkpoints,
load_checkpoint,
)
from icefall.lexicon import Lexicon
from icefall.utils import (
AttributeDict,
setup_logger,
store_transcripts,
str2bool,
write_error_stats,
)
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--epoch",
type=int,
default=30,
help="""It specifies the checkpoint to use for decoding.
Note: Epoch counts from 0.
You can specify --avg to use more checkpoints for model averaging.""",
)
parser.add_argument(
"--iter",
type=int,
default=0,
help="""If positive, --epoch is ignored and it
will use the checkpoint exp_dir/checkpoint-iter.pt.
You can specify --avg to use more checkpoints for model averaging.
""",
)
parser.add_argument(
"--avg",
type=int,
default=10,
help="Number of checkpoints to average. Automatically select "
"consecutive checkpoints before the checkpoint specified by "
"'--epoch' and '--iter'",
)
parser.add_argument(
"--use-averaged-model",
type=str2bool,
default=True,
help="Whether to load averaged model. Currently it only supports "
"using --epoch. If True, it would decode with the averaged model "
"over the epoch range from `epoch-avg` (excluded) to `epoch`."
"Actually only the models with epoch number of `epoch-avg` and "
"`epoch` are loaded for averaging. ",
)
parser.add_argument(
"--exp-dir",
type=str,
default="pruned_transducer_stateless2/exp",
help="The experiment dir",
)
parser.add_argument(
"--lang-dir",
type=str,
default="data/lang_char",
help="""The lang dir
It contains language related input files such as
"lexicon.txt"
""",
)
parser.add_argument(
"--decoding-method",
type=str,
default="greedy_search",
help="""Possible values are:
- greedy_search
- beam_search
- modified_beam_search
- fast_beam_search
- fast_beam_search_nbest
- fast_beam_search_nbest_oracle
- fast_beam_search_nbest_LG
If you use fast_beam_search_nbest_LG, you have to specify
`--lang-dir`, which should contain `LG.pt`.
""",
)
parser.add_argument(
"--beam-size",
type=int,
default=4,
help="""An interger indicating how many candidates we will keep for each
frame. Used only when --decoding-method is beam_search or
modified_beam_search.""",
)
parser.add_argument(
"--beam",
type=float,
default=4,
help="""A floating point value to calculate the cutoff score during beam
search (i.e., `cutoff = max-score - beam`), which is the same as the
`beam` in Kaldi.
Used only when --decoding-method is fast_beam_search""",
)
parser.add_argument(
"--ngram-lm-scale",
type=float,
default=0.01,
help="""
Used only when --decoding_method is fast_beam_search_nbest_LG.
It specifies the scale for n-gram LM scores.
""",
)
parser.add_argument(
"--max-contexts",
type=int,
default=8,
help="""Used only when --decoding-method is
fast_beam_search, fast_beam_search_nbest, fast_beam_search_nbest_LG,
and fast_beam_search_nbest_oracle""",
)
parser.add_argument(
"--max-states",
type=int,
default=64,
help="""Used only when --decoding-method is
fast_beam_search, fast_beam_search_nbest, fast_beam_search_nbest_LG,
and fast_beam_search_nbest_oracle""",
)
parser.add_argument(
"--context-size",
type=int,
default=2,
help="The context size in the decoder. 1 means bigram; " "2 means tri-gram",
)
parser.add_argument(
"--max-sym-per-frame",
type=int,
default=1,
help="""Maximum number of symbols per frame.
Used only when --decoding_method is greedy_search""",
)
parser.add_argument(
"--num-paths",
type=int,
default=200,
help="""Number of paths for nbest decoding.
Used only when the decoding method is fast_beam_search_nbest,
fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""",
)
parser.add_argument(
"--nbest-scale",
type=float,
default=0.5,
help="""Scale applied to lattice scores when computing nbest paths.
Used only when the decoding method is fast_beam_search_nbest,
fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""",
)
add_model_arguments(parser)
return parser
def decode_one_batch(
params: AttributeDict,
model: nn.Module,
lexicon: Lexicon,
batch: dict,
decoding_graph: Optional[k2.Fsa] = None,
) -> Dict[str, List[List[str]]]:
"""Decode one batch and return the result in a dict. The dict has the
following format:
- key: It indicates the setting used for decoding. For example,
if greedy_search is used, it would be "greedy_search"
If beam search with a beam size of 7 is used, it would be
"beam_7"
- value: It contains the decoding result. `len(value)` equals to
batch size. `value[i]` is the decoding result for the i-th
utterance in the given batch.
Args:
params:
It's the return value of :func:`get_params`.
model:
The neural model.
batch:
It is the return value from iterating
`lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
for the format of the `batch`.
decoding_graph:
The decoding graph. Can be either a `k2.trivial_graph` or an HLG. Used
only when --decoding_method is fast_beam_search.
Returns:
Return the decoding result. See above description for the format of
the returned dict.
"""
device = model.device
feature = batch["inputs"]
assert feature.ndim == 3
feature = feature.to(device)
# at entry, feature is (N, T, C)
supervisions = batch["supervisions"]
feature_lens = supervisions["num_frames"].to(device)
encoder_out, encoder_out_lens = model.encoder(x=feature, x_lens=feature_lens)
hyps = []
if params.decoding_method == "fast_beam_search":
hyp_tokens = fast_beam_search_one_best(
model=model,
decoding_graph=decoding_graph,
encoder_out=encoder_out,
encoder_out_lens=encoder_out_lens,
beam=params.beam,
max_contexts=params.max_contexts,
max_states=params.max_states,
)
for i in range(encoder_out.size(0)):
hyps.append([lexicon.token_table[idx] for idx in hyp_tokens[i]])
elif params.decoding_method == "fast_beam_search_nbest_LG":
hyp_tokens = fast_beam_search_nbest_LG(
model=model,
decoding_graph=decoding_graph,
encoder_out=encoder_out,
encoder_out_lens=encoder_out_lens,
beam=params.beam,
max_contexts=params.max_contexts,
max_states=params.max_states,
num_paths=params.num_paths,
nbest_scale=params.nbest_scale,
)
for i in range(encoder_out.size(0)):
hyps.append([lexicon.token_table[idx] for idx in hyp_tokens[i]])
elif params.decoding_method == "greedy_search" and params.max_sym_per_frame == 1:
hyp_tokens = greedy_search_batch(
model=model,
encoder_out=encoder_out,
encoder_out_lens=encoder_out_lens,
)
for i in range(encoder_out.size(0)):
hyps.append([lexicon.token_table[idx] for idx in hyp_tokens[i]])
elif params.decoding_method == "modified_beam_search":
hyp_tokens = modified_beam_search(
model=model,
encoder_out=encoder_out,
encoder_out_lens=encoder_out_lens,
beam=params.beam_size,
)
for i in range(encoder_out.size(0)):
hyps.append([lexicon.token_table[idx] for idx in hyp_tokens[i]])
else:
batch_size = encoder_out.size(0)
for i in range(batch_size):
# fmt: off
encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
# fmt: on
if params.decoding_method == "greedy_search":
hyp = greedy_search(
model=model,
encoder_out=encoder_out_i,
max_sym_per_frame=params.max_sym_per_frame,
)
elif params.decoding_method == "beam_search":
hyp = beam_search(
model=model,
encoder_out=encoder_out_i,
beam=params.beam_size,
)
else:
raise ValueError(
f"Unsupported decoding method: {params.decoding_method}"
)
hyps.append([lexicon.token_table[idx] for idx in hyp])
if params.decoding_method == "greedy_search":
return {"greedy_search": hyps}
elif params.decoding_method == "fast_beam_search":
return {
(
f"beam_{params.beam}_"
f"max_contexts_{params.max_contexts}_"
f"max_states_{params.max_states}"
): hyps
}
elif "fast_beam_search" in params.decoding_method:
key = f"beam_{params.beam}_"
key += f"max_contexts_{params.max_contexts}_"
key += f"max_states_{params.max_states}"
if "nbest" in params.decoding_method:
key += f"_num_paths_{params.num_paths}_"
key += f"nbest_scale_{params.nbest_scale}"
if "LG" in params.decoding_method:
key += f"_ngram_lm_scale_{params.ngram_lm_scale}"
return {key: hyps}
else:
return {f"beam_size_{params.beam_size}": hyps}
def decode_dataset(
dl: torch.utils.data.DataLoader,
params: AttributeDict,
model: nn.Module,
lexicon: Lexicon,
decoding_graph: Optional[k2.Fsa] = None,
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
dl:
PyTorch's dataloader containing the dataset to decode.
params:
It is returned by :func:`get_params`.
model:
The neural model.
decoding_graph:
The decoding graph. Can be either a `k2.trivial_graph` or an HLG. Used
only when --decoding_method is fast_beam_search.
Returns:
Return a dict, whose key may be "greedy_search" if greedy search
is used, or it may be "beam_7" if a beam size of 7 is used.
Its value is a list of tuples. Each tuple contains three elements:
the cut id, the reference transcript, and the predicted result.
"""
num_cuts = 0
try:
num_batches = len(dl)
except TypeError:
num_batches = "?"
if params.decoding_method == "greedy_search":
log_interval = 100
else:
log_interval = 2
results = defaultdict(list)
for batch_idx, batch in enumerate(dl):
texts = batch["supervisions"]["text"]
texts = [list(str(text).replace(" ", "")) for text in texts]
cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
hyps_dict = decode_one_batch(
params=params,
model=model,
lexicon=lexicon,
decoding_graph=decoding_graph,
batch=batch,
)
for name, hyps in hyps_dict.items():
this_batch = []
assert len(hyps) == len(texts)
for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
this_batch.append((cut_id, ref_text, hyp_words))
results[name].extend(this_batch)
num_cuts += len(texts)
if batch_idx % log_interval == 0:
batch_str = f"{batch_idx}/{num_batches}"
logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
return results
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
test_set_wers = dict()
for key, results in results_dict.items():
recog_path = (
params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
)
results = sorted(results)
store_transcripts(filename=recog_path, texts=results)
logging.info(f"The transcripts are stored in {recog_path}")
# The following prints out WERs, per-word error statistics and aligned
# ref/hyp pairs.
errs_filename = (
params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
)
with open(errs_filename, "w") as f:
wer = write_error_stats(
f, f"{test_set_name}-{key}", results, enable_log=True
)
test_set_wers[key] = wer
logging.info("Wrote detailed error stats to {}".format(errs_filename))
test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
errs_info = (
params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
)
with open(errs_info, "w") as f:
print("settings\tWER", file=f)
for key, val in test_set_wers:
print("{}\t{}".format(key, val), file=f)
s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
note = "\tbest for {}".format(test_set_name)
for key, val in test_set_wers:
s += "{}\t{}{}\n".format(key, val, note)
note = ""
logging.info(s)
@torch.no_grad()
def main():
parser = get_parser()
AlimeetingAsrDataModule.add_arguments(parser)
args = parser.parse_args()
args.exp_dir = Path(args.exp_dir)
params = get_params()
params.update(vars(args))
assert params.decoding_method in (
"greedy_search",
"beam_search",
"fast_beam_search",
"fast_beam_search_nbest_LG",
"modified_beam_search",
)
params.res_dir = params.exp_dir / params.decoding_method
if params.iter > 0:
params.suffix = f"iter-{params.iter}-avg-{params.avg}"
else:
params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
if "fast_beam_search" in params.decoding_method:
params.suffix += f"-beam-{params.beam}"
params.suffix += f"-max-contexts-{params.max_contexts}"
params.suffix += f"-max-states-{params.max_states}"
if "nbest" in params.decoding_method:
params.suffix += f"-nbest-scale-{params.nbest_scale}"
params.suffix += f"-num-paths-{params.num_paths}"
if "LG" in params.decoding_method:
params.suffix += f"-ngram-lm-scale-{params.ngram_lm_scale}"
elif "beam_search" in params.decoding_method:
params.suffix += f"-{params.decoding_method}-beam-size-{params.beam_size}"
else:
params.suffix += f"-context-{params.context_size}"
params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
logging.info("Decoding started")
device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda", 0)
logging.info(f"Device: {device}")
lexicon = Lexicon(params.lang_dir)
params.blank_id = lexicon.token_table["<blk>"]
params.vocab_size = max(lexicon.tokens) + 1
logging.info(params)
logging.info("About to create model")
model = get_transducer_model(params)
if not params.use_averaged_model:
if params.iter > 0:
filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
: params.avg
]
if len(filenames) == 0:
raise ValueError(
f"No checkpoints found for"
f" --iter {params.iter}, --avg {params.avg}"
)
elif len(filenames) < params.avg:
raise ValueError(
f"Not enough checkpoints ({len(filenames)}) found for"
f" --iter {params.iter}, --avg {params.avg}"
)
logging.info(f"averaging {filenames}")
model.to(device)
model.load_state_dict(average_checkpoints(filenames, device=device))
elif params.avg == 1:
load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
else:
start = params.epoch - params.avg + 1
filenames = []
for i in range(start, params.epoch + 1):
if i >= 1:
filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
logging.info(f"averaging {filenames}")
model.to(device)
model.load_state_dict(average_checkpoints(filenames, device=device))
else:
if params.iter > 0:
filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
: params.avg + 1
]
if len(filenames) == 0:
raise ValueError(
f"No checkpoints found for"
f" --iter {params.iter}, --avg {params.avg}"
)
elif len(filenames) < params.avg + 1:
raise ValueError(
f"Not enough checkpoints ({len(filenames)}) found for"
f" --iter {params.iter}, --avg {params.avg}"
)
filename_start = filenames[-1]
filename_end = filenames[0]
logging.info(
"Calculating the averaged model over iteration checkpoints"
f" from {filename_start} (excluded) to {filename_end}"
)
model.to(device)
model.load_state_dict(
average_checkpoints_with_averaged_model(
filename_start=filename_start,
filename_end=filename_end,
device=device,
)
)
else:
assert params.avg > 0, params.avg
start = params.epoch - params.avg
assert start >= 1, start
filename_start = f"{params.exp_dir}/epoch-{start}.pt"
filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
logging.info(
f"Calculating the averaged model over epoch range from "
f"{start} (excluded) to {params.epoch}"
)
model.to(device)
model.load_state_dict(
average_checkpoints_with_averaged_model(
filename_start=filename_start,
filename_end=filename_end,
device=device,
)
)
model.to(device)
model.eval()
model.device = device
if "fast_beam_search" in params.decoding_method:
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
else:
decoding_graph = None
num_param = sum([p.numel() for p in model.parameters()])
logging.info(f"Number of model parameters: {num_param}")
alimeeting = AlimeetingAsrDataModule(args)
eval_ihm_cuts = alimeeting.eval_ihm_cuts()
test_ihm_cuts = alimeeting.test_ihm_cuts()
eval_sdm_cuts = alimeeting.eval_sdm_cuts()
test_sdm_cuts = alimeeting.test_sdm_cuts()
eval_gss_cuts = alimeeting.eval_gss_cuts()
test_gss_cuts = alimeeting.test_gss_cuts()
eval_ihm_dl = alimeeting.test_dataloaders(eval_ihm_cuts)
test_ihm_dl = alimeeting.test_dataloaders(test_ihm_cuts)
eval_sdm_dl = alimeeting.test_dataloaders(eval_sdm_cuts)
test_sdm_dl = alimeeting.test_dataloaders(test_sdm_cuts)
if eval_gss_cuts is not None:
eval_gss_dl = alimeeting.test_dataloaders(eval_gss_cuts)
if test_gss_cuts is not None:
test_gss_dl = alimeeting.test_dataloaders(test_gss_cuts)
test_sets = {
"eval_ihm": (eval_ihm_dl, eval_ihm_cuts),
"test_ihm": (test_ihm_dl, test_ihm_cuts),
"eval_sdm": (eval_sdm_dl, eval_sdm_cuts),
"test_sdm": (test_sdm_dl, test_sdm_cuts),
}
if eval_gss_cuts is not None:
test_sets["eval_gss"] = (eval_gss_dl, eval_gss_cuts)
if test_gss_cuts is not None:
test_sets["test_gss"] = (test_gss_dl, test_gss_cuts)
for test_set in test_sets:
logging.info(f"Decoding {test_set}")
dl, cuts = test_sets[test_set]
results_dict = decode_dataset(
dl=dl,
params=params,
model=model,
lexicon=lexicon,
decoding_graph=decoding_graph,
)
save_results(
params=params,
test_set_name=test_set,
results_dict=results_dict,
)
logging.info("Done!")
if __name__ == "__main__":
main()

View File

@ -0,0 +1 @@
../../../librispeech/ASR/pruned_transducer_stateless7/decoder.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/pruned_transducer_stateless7/encoder_interface.py

View File

@ -0,0 +1,320 @@
#!/usr/bin/env python3
#
# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script converts several saved checkpoints
# to a single one using model averaging.
"""
Usage:
(1) Export to torchscript model using torch.jit.script()
./pruned_transducer_stateless7/export.py \
--exp-dir ./pruned_transducer_stateless7/exp \
--bpe-model data/lang_bpe_500/bpe.model \
--epoch 30 \
--avg 9 \
--jit 1
It will generate a file `cpu_jit.pt` in the given `exp_dir`. You can later
load it by `torch.jit.load("cpu_jit.pt")`.
Note: `cpu` in the name `cpu_jit.pt` means the parameters are on CPU when the model
is loaded into Python. You can use `to("cuda")` to move them to a CUDA device.
Check
https://github.com/k2-fsa/sherpa
for how to use the exported models outside of icefall.
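For example, a minimal sketch of loading the exported model back in Python (the
file name is the one generated above; moving to CUDA is optional):
model = torch.jit.load("cpu_jit.pt")
model = model.to("cuda")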
(2) Export `model.state_dict()`
./pruned_transducer_stateless7/export.py \
--exp-dir ./pruned_transducer_stateless7/exp \
--bpe-model data/lang_bpe_500/bpe.model \
--epoch 20 \
--avg 10
It will generate a file `pretrained.pt` in the given `exp_dir`. You can later
load it by `icefall.checkpoint.load_checkpoint()`.
To use the generated file with `pruned_transducer_stateless7/decode.py`,
you can do:
cd /path/to/exp_dir
ln -s pretrained.pt epoch-9999.pt
cd /path/to/egs/librispeech/ASR
./pruned_transducer_stateless7/decode.py \
--exp-dir ./pruned_transducer_stateless7/exp \
--epoch 9999 \
--avg 1 \
--max-duration 600 \
--decoding-method greedy_search \
--bpe-model data/lang_bpe_500/bpe.model
Check ./pretrained.py for its usage.
Note: If you don't want to train a model from scratch, we have
provided one for you. You can get it at
https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11
with the following commands:
sudo apt-get install git-lfs
git lfs install
git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11
# You will find the pre-trained model in icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11/exp
"""
import argparse
import logging
from pathlib import Path
import sentencepiece as spm
import torch
import torch.nn as nn
from scaling_converter import convert_scaled_to_non_scaled
from train import add_model_arguments, get_params, get_transducer_model
from icefall.checkpoint import (
average_checkpoints,
average_checkpoints_with_averaged_model,
find_checkpoints,
load_checkpoint,
)
from icefall.lexicon import Lexicon
from icefall.utils import str2bool
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--epoch",
type=int,
default=15,
help="""It specifies the checkpoint to use for decoding.
Note: Epoch counts from 1.
You can specify --avg to use more checkpoints for model averaging.""",
)
parser.add_argument(
"--iter",
type=int,
default=0,
help="""If positive, --epoch is ignored and it
will use the checkpoint exp_dir/checkpoint-iter.pt.
You can specify --avg to use more checkpoints for model averaging.
""",
)
parser.add_argument(
"--avg",
type=int,
default=8,
help="Number of checkpoints to average. Automatically select "
"consecutive checkpoints before the checkpoint specified by "
"'--epoch' and '--iter'",
)
parser.add_argument(
"--use-averaged-model",
type=str2bool,
default=True,
help="Whether to load averaged model. Currently it only supports "
"using --epoch. If True, it would decode with the averaged model "
"over the epoch range from `epoch-avg` (excluded) to `epoch`."
"Actually only the models with epoch number of `epoch-avg` and "
"`epoch` are loaded for averaging. ",
)
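# Rough sketch of the idea behind --use-averaged-model (see
# average_checkpoints_with_averaged_model): each checkpoint also stores a running
# average of the model, so the average over the range (start, end] can be
# recovered from just the two checkpoints at `start` and `end`, roughly
# (w_end * avg_end - w_start * avg_start) / (w_end - w_start), where the weights
# are based on the number of processed batches.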
parser.add_argument(
"--exp-dir",
type=str,
default="pruned_transducer_stateless7/exp",
help="""It specifies the directory where all training related
files, e.g., checkpoints, log, etc, are saved
""",
)
parser.add_argument(
"--lang-dir",
type=str,
default="data/lang_char",
help="The lang dir",
)
parser.add_argument(
"--jit",
type=str2bool,
default=False,
help="""True to save a model after applying torch.jit.script.
It will generate a file named cpu_jit.pt
Check ./jit_pretrained.py for how to use it.
""",
)
parser.add_argument(
"--context-size",
type=int,
default=2,
help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
)
add_model_arguments(parser)
return parser
@torch.no_grad()
def main():
args = get_parser().parse_args()
args.exp_dir = Path(args.exp_dir)
params = get_params()
params.update(vars(args))
device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda", 0)
logging.info(f"device: {device}")
lexicon = Lexicon(params.lang_dir)
params.blank_id = 0
params.vocab_size = max(lexicon.tokens) + 1
logging.info(params)
logging.info("About to create model")
model = get_transducer_model(params)
model.to(device)
if not params.use_averaged_model:
if params.iter > 0:
filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
: params.avg
]
if len(filenames) == 0:
raise ValueError(
f"No checkpoints found for"
f" --iter {params.iter}, --avg {params.avg}"
)
elif len(filenames) < params.avg:
raise ValueError(
f"Not enough checkpoints ({len(filenames)}) found for"
f" --iter {params.iter}, --avg {params.avg}"
)
logging.info(f"averaging {filenames}")
model.to(device)
model.load_state_dict(average_checkpoints(filenames, device=device))
elif params.avg == 1:
load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
else:
start = params.epoch - params.avg + 1
filenames = []
for i in range(start, params.epoch + 1):
if i >= 1:
filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
logging.info(f"averaging {filenames}")
model.to(device)
model.load_state_dict(average_checkpoints(filenames, device=device))
else:
if params.iter > 0:
filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
: params.avg + 1
]
if len(filenames) == 0:
raise ValueError(
f"No checkpoints found for"
f" --iter {params.iter}, --avg {params.avg}"
)
elif len(filenames) < params.avg + 1:
raise ValueError(
f"Not enough checkpoints ({len(filenames)}) found for"
f" --iter {params.iter}, --avg {params.avg}"
)
filename_start = filenames[-1]
filename_end = filenames[0]
logging.info(
"Calculating the averaged model over iteration checkpoints"
f" from {filename_start} (excluded) to {filename_end}"
)
model.to(device)
model.load_state_dict(
average_checkpoints_with_averaged_model(
filename_start=filename_start,
filename_end=filename_end,
device=device,
)
)
else:
assert params.avg > 0, params.avg
start = params.epoch - params.avg
assert start >= 1, start
filename_start = f"{params.exp_dir}/epoch-{start}.pt"
filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
logging.info(
f"Calculating the averaged model over epoch range from "
f"{start} (excluded) to {params.epoch}"
)
model.to(device)
model.load_state_dict(
average_checkpoints_with_averaged_model(
filename_start=filename_start,
filename_end=filename_end,
device=device,
)
)
model.to("cpu")
model.eval()
if params.jit is True:
convert_scaled_to_non_scaled(model, inplace=True)
logging.info("Using torch.jit.script()")
# We won't use the forward() method of the model in C++, so just ignore
# it here.
# Otherwise, one of its arguments is a ragged tensor and is not
# torch scriptable.
model.__class__.forward = torch.jit.ignore(model.__class__.forward)
logging.info("Using torch.jit.script")
model = torch.jit.script(model)
filename = params.exp_dir / "cpu_jit.pt"
model.save(str(filename))
logging.info(f"Saved to {filename}")
else:
logging.info("Not using torchscript. Export model.state_dict()")
# Save it using a format so that it can be loaded
# by :func:`load_checkpoint`
filename = params.exp_dir / "pretrained.pt"
torch.save({"model": model.state_dict()}, str(filename))
logging.info(f"Saved to {filename}")
if __name__ == "__main__":
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO)
main()

View File

@ -0,0 +1 @@
../../../librispeech/ASR/pruned_transducer_stateless7/jit_pretrained.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/pruned_transducer_stateless7/joiner.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/pruned_transducer_stateless7/model.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/pruned_transducer_stateless7/optim.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/pruned_transducer_stateless7/pretrained.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/pruned_transducer_stateless7/scaling.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/pruned_transducer_stateless7/scaling_converter.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/pruned_transducer_stateless7/test_model.py

File diff suppressed because it is too large

View File

@ -0,0 +1 @@
../../../librispeech/ASR/pruned_transducer_stateless7/zipformer.py

View File

@ -0,0 +1 @@
../../../egs/aishell/ASR/shared

View File

@ -1 +1,2 @@
log-*
.DS_Store

View File

@ -1 +1,2 @@
log-*
.DS_Store

View File

@ -19,18 +19,36 @@ The following table lists the differences among them.
| `pruned_transducer_stateless` | Conformer | Embedding + Conv1d | Using k2 pruned RNN-T loss |
| `pruned_transducer_stateless2` | Conformer(modified) | Embedding + Conv1d | Using k2 pruned RNN-T loss |
| `pruned_transducer_stateless3` | Conformer(modified) | Embedding + Conv1d | Using k2 pruned RNN-T loss + using GigaSpeech as extra training data |
| `pruned_transducer_stateless4` | Conformer(modified) | Embedding + Conv1d | same as pruned_transducer_stateless2 + save averaged models periodically during training |
| `pruned_transducer_stateless4` | Conformer(modified) | Embedding + Conv1d | same as pruned_transducer_stateless2 + save averaged models periodically during training + delay penalty |
| `pruned_transducer_stateless5` | Conformer(modified) | Embedding + Conv1d | same as pruned_transducer_stateless4 + more layers + random combiner|
| `pruned_transducer_stateless6` | Conformer(modified) | Embedding + Conv1d | same as pruned_transducer_stateless4 + distillation with hubert|
| `pruned_transducer_stateless7` | Zipformer | Embedding + Conv1d | First experiment with Zipformer from Dan|
| `pruned_transducer_stateless7_ctc` | Zipformer | Embedding + Conv1d | Same as pruned_transducer_stateless7, but with extra CTC head|
| `pruned_transducer_stateless7_ctc_bs` | Zipformer | Embedding + Conv1d | pruned_transducer_stateless7_ctc + blank skip |
| `pruned_transducer_stateless7_streaming` | Streaming Zipformer | Embedding + Conv1d | streaming version of pruned_transducer_stateless7 |
| `pruned_transducer_stateless8` | Zipformer | Embedding + Conv1d | Same as pruned_transducer_stateless7, but using extra data from GigaSpeech|
| `pruned_stateless_emformer_rnnt2` | Emformer(from torchaudio) | Embedding + Conv1d | Using Emformer from torchaudio for streaming ASR|
| `conv_emformer_transducer_stateless` | ConvEmformer | Embedding + Conv1d | Using ConvEmformer for streaming ASR + mechanisms in reworked model |
| `conv_emformer_transducer_stateless2` | ConvEmformer | Embedding + Conv1d | Using ConvEmformer with simplified memory for streaming ASR + mechanisms in reworked model |
| `lstm_transducer_stateless` | LSTM | Embedding + Conv1d | Using LSTM with mechanisms in reworked model |
| `lstm_transducer_stateless2` | LSTM | Embedding + Conv1d | Using LSTM with mechanisms in reworked model + gigaspeech (multi-dataset setup) |
| `lstm_transducer_stateless3` | LSTM | Embedding + Conv1d | Using LSTM with mechanisms in reworked model + gradient filter + delay penalty |
The decoder in `transducer_stateless` is modified from the paper
[Rnn-Transducer with Stateless Prediction Network](https://ieeexplore.ieee.org/document/9054419/).
We place an additional Conv1d layer right after the input embedding layer.
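To make this concrete, here is a minimal sketch (not the icefall implementation; the class name, dimensions, and example values are illustrative) of such a stateless decoder:

```python
import torch
import torch.nn as nn


class StatelessDecoder(nn.Module):
    """An embedding over the last `context_size` output tokens followed by a
    Conv1d, used in place of a recurrent prediction network."""

    def __init__(self, vocab_size: int, embed_dim: int, context_size: int = 2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv = nn.Conv1d(embed_dim, embed_dim, kernel_size=context_size)

    def forward(self, y: torch.Tensor) -> torch.Tensor:
        # y: (N, context_size) ids of the most recent output tokens
        emb = self.embedding(y).permute(0, 2, 1)  # (N, embed_dim, context_size)
        return self.conv(emb).squeeze(-1)         # (N, embed_dim)


decoder = StatelessDecoder(vocab_size=500, embed_dim=512)
print(decoder(torch.tensor([[3, 7]])).shape)  # torch.Size([1, 512])
```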
# CTC
| | Encoder | Comment |
|------------------------------|--------------------|------------------------------|
| `conformer-ctc` | Conformer | Use auxiliary attention head |
| `conformer-ctc2` | Reworked Conformer | Use auxiliary attention head |
| `conformer-ctc3` | Reworked Conformer | Streaming version + delay penalty |
# MMI
| | Encoder | Comment |
|------------------------------|-----------|---------------------------------------------------|
| `conformer-mmi` | Conformer | |
| `zipformer-mmi` | Zipformer | CTC warmup + use HP as decoding graph for decoding |

View File

@ -1,5 +1,140 @@
## Results
### Streaming Zipformer-Transducer (Pruned Stateless Transducer + Streaming Zipformer)
#### [pruned_transducer_stateless7_streaming](./pruned_transducer_stateless7_streaming)
See <https://github.com/k2-fsa/icefall/pull/787> for more details.
You can find a pretrained model, training logs, decoding logs, and decoding
results at:
<https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29>
Number of model parameters: 70369391, i.e., 70.37 M
##### training on full librispeech
The WERs are:
| decoding method | chunk size | test-clean | test-other | comment | decoding mode |
|----------------------|------------|------------|------------|---------------------|----------------------|
| greedy search | 320ms | 3.15 | 8.09 | --epoch 30 --avg 9 | simulated streaming |
| greedy search | 320ms | 3.17 | 8.24 | --epoch 30 --avg 9 | chunk-wise |
| fast beam search | 320ms | 3.2 | 8.04 | --epoch 30 --avg 9 | simulated streaming |
| fast beam search | 320ms | 3.36 | 8.19 | --epoch 30 --avg 9 | chunk-wise |
| modified beam search | 320ms | 3.11 | 7.93 | --epoch 30 --avg 9 | simulated streaming |
| modified beam search | 320ms | 3.12 | 8.11 | --epoch 30 --avg 9 | chunk-wise |
| greedy search | 640ms | 2.97 | 7.5 | --epoch 30 --avg 9 | simulated streaming |
| greedy search | 640ms | 2.98 | 7.67 | --epoch 30 --avg 9 | chunk-wise |
| fast beam search | 640ms | 3.02 | 7.47 | --epoch 30 --avg 9 | simulated streaming |
| fast beam search | 640ms | 2.96 | 7.61 | --epoch 30 --avg 9 | chunk-wise |
| modified beam search | 640ms | 2.94 | 7.36 | --epoch 30 --avg 9 | simulated streaming |
| modified beam search | 640ms | 2.95 | 7.53 | --epoch 30 --avg 9 | chunk-wise |
Note: `simulated streaming` indicates feeding the full utterance during decoding using `decode.py`,
while `chunk-wise` indicates feeding a fixed number of frames at a time using `streaming_decode.py`.
The training command is:
```bash
./pruned_transducer_stateless7_streaming/train.py \
--world-size 4 \
--num-epochs 30 \
--start-epoch 1 \
--use-fp16 1 \
--exp-dir pruned_transducer_stateless7_streaming/exp \
--full-libri 1 \
--max-duration 750 \
--master-port 12345
```
The tensorboard log can be found at
<https://tensorboard.dev/experiment/A46UpqEWQWS7oDi5VcQ8rg/>
The simulated streaming decoding command (e.g., chunk-size=320ms) is:
```bash
for m in greedy_search fast_beam_search modified_beam_search; do
./pruned_transducer_stateless7_streaming/decode.py \
--epoch 30 \
--avg 9 \
--exp-dir ./pruned_transducer_stateless7_streaming/exp \
--max-duration 600 \
--decode-chunk-len 32 \
--decoding-method $m
done
```
The streaming chunk-size decoding command (e.g., chunk-size=320ms) is:
```bash
for m in greedy_search modified_beam_search fast_beam_search; do
./pruned_transducer_stateless7_streaming/streaming_decode.py \
--epoch 30 \
--avg 9 \
--exp-dir ./pruned_transducer_stateless7_streaming/exp \
--decoding-method $m \
--decode-chunk-len 32 \
--num-decode-streams 2000
done
```
### zipformer_mmi (zipformer with mmi loss)
See <https://github.com/k2-fsa/icefall/pull/746> for more details.
[zipformer_mmi](./zipformer_mmi)
The tensorboard log can be found at
<https://tensorboard.dev/experiment/xyOZUKpEQm62HBIlUD4uPA/>
You can find a pretrained model, training logs, decoding logs, and decoding
results at:
<https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-mmi-2022-12-08>
Number of model parameters: 69136519, i.e., 69.14 M
| | test-clean | test-other | comment |
|--------------------------|------------|-------------|---------------------|
| 1best | 2.54 | 5.65 | --epoch 30 --avg 10 |
| nbest | 2.54 | 5.66 | --epoch 30 --avg 10 |
| nbest-rescoring-LG | 2.49 | 5.42 | --epoch 30 --avg 10 |
| nbest-rescoring-3-gram | 2.52 | 5.62 | --epoch 30 --avg 10 |
| nbest-rescoring-4-gram | 2.5 | 5.51 | --epoch 30 --avg 10 |
The training commands are:
```bash
export CUDA_VISIBLE_DEVICES="0,1,2,3"
./zipformer_mmi/train.py \
--world-size 4 \
--master-port 12345 \
--num-epochs 30 \
--start-epoch 1 \
--lang-dir data/lang_bpe_500 \
--max-duration 500 \
--full-libri 1 \
--use-fp16 1 \
--exp-dir zipformer_mmi/exp
```
The decoding commands are:
```bash
export CUDA_VISIBLE_DEVICES="5"
for m in nbest nbest-rescoring-LG nbest-rescoring-3-gram nbest-rescoring-4-gram; do
./zipformer_mmi/decode.py \
--epoch 30 \
--avg 10 \
--exp-dir ./zipformer_mmi/exp/ \
--max-duration 100 \
--lang-dir data/lang_bpe_500 \
--nbest-scale 1.2 \
--hp-scale 1.0 \
--decoding-method $m
done
```
### pruned_transducer_stateless7_ctc (zipformer with transducer loss and ctc loss)
See <https://github.com/k2-fsa/icefall/pull/683> for more details.
@ -261,9 +396,13 @@ Number of model parameters: 70369391, i.e., 70.37 M
| | test-clean | test-other | comment |
|----------------------|------------|-------------|----------------------------------------|
| greedy search | 2.17 | 5.23 | --epoch 39 --avg 6 --max-duration 600 |
| modified beam search | 2.15 | 5.20 | --epoch 39 --avg 6 --max-duration 600 |
| fast beam search | 2.15 | 5.22 | --epoch 39 --avg 6 --max-duration 600 |
| greedy search | 2.17 | 5.23 | --epoch 30 --avg 9 --max-duration 600 |
| modified beam search | 2.15 | 5.20 | --epoch 30 --avg 9 --max-duration 600 |
| modified beam search + RNNLM shallow fusion | 1.99 | 4.73 | --epoch 30 --avg 9 --max-duration 600 |
| modified beam search + TransformerLM shallow fusion | 1.94 | 4.73 | --epoch 30 --avg 9 --max-duration 600 |
| modified beam search + RNNLM + LODR | 1.91 | 4.57 | --epoch 30 --avg 9 --max-duration 600 |
| modified beam search + TransformerLM + LODR | 1.91 | 4.51 | --epoch 30 --avg 9 --max-duration 600 |
| fast beam search | 2.15 | 5.22 | --epoch 30 --avg 9 --max-duration 600 |
The training commands are:
```bash
@ -401,7 +540,9 @@ The WERs are:
| greedy search (max sym per frame 1) | 2.78 | 7.36 | --iter 468000 --avg 16 |
| modified_beam_search | 2.73 | 7.15 | --iter 468000 --avg 16 |
| modified_beam_search + RNNLM shallow fusion | 2.42 | 6.46 | --iter 468000 --avg 16 |
| modified_beam_search + RNNLM shallow fusion | 2.28 | 5.94 | --iter 468000 --avg 16 |
| modified_beam_search + TransformerLM shallow fusion | 2.37 | 6.48 | --iter 468000 --avg 16 |
| modified_beam_search + RNNLM + LODR | 2.24 | 5.89 | --iter 468000 --avg 16 |
| modified_beam_search + TransformerLM + LODR | 2.19 | 5.90 | --iter 468000 --avg 16 |
| fast_beam_search | 2.76 | 7.31 | --iter 468000 --avg 16 |
| greedy search (max sym per frame 1) | 2.77 | 7.35 | --iter 472000 --avg 18 |
| modified_beam_search | 2.75 | 7.08 | --iter 472000 --avg 18 |
@ -456,9 +597,12 @@ for m in greedy_search fast_beam_search modified_beam_search; do
done
```
To decode with RNNLM shallow fusion, use the following decoding command. A well-trained RNNLM
can be found here: <https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm/tree/main>
You may also decode using shallow fusion with an external neural network LM. To do so, you need to
download a well-trained NN LM:
RNN LM: <https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm/tree/main>
Transformer LM: <https://huggingface.co/marcoyang/icefall-librispeech-transformer-lm/tree/main>
```bash
for iter in 472000; do
for avg in 8 10 12 14 16 18; do
./lstm_transducer_stateless2/decode.py \
@ -466,23 +610,24 @@ for iter in 472000; do
--avg $avg \
--exp-dir ./lstm_transducer_stateless2/exp \
--max-duration 600 \
--decoding-method modified_beam_search_rnnlm_shallow_fusion \
--beam 4 \
--rnn-lm-scale 0.3 \
--rnn-lm-exp-dir /path/to/RNNLM \
--rnn-lm-epoch 99 \
--rnn-lm-avg 1 \
--rnn-lm-num-layers 3 \
--rnn-lm-tie-weights 1
--decoding-method modified_beam_search_lm_shallow_fusion \
--use-shallow-fusion 1 \
--lm-type rnn \
--lm-exp-dir /path/to/RNNLM/exp \
--lm-epoch 99 \
--lm-scale 0.3 \
--lm-avg 1
done
done
```
You may also decode using LODR + RNNLM shallow fusion. This decoding method is proposed in <https://arxiv.org/pdf/2203.16776.pdf>.
You may also decode using LODR + LM shallow fusion. This decoding method is proposed in <https://arxiv.org/pdf/2203.16776.pdf>.
It subtracts the internal language model score during shallow fusion, which is approximated by a bi-gram model. The bi-gram can be
generated by `generate-lm.sh`, or you may download it from <https://huggingface.co/marcoyang/librispeech_bigram>.
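For intuition, here is a minimal sketch (not the icefall code; the variable names and log-probabilities below are made up) of how LODR combines the per-token scores, with the bi-gram term carrying a negative scale so that it is subtracted:

```python
# Example per-token log-probabilities (illustrative values only).
rnnt_logprob = -1.2     # transducer score for a candidate token
nnlm_logprob = -2.0     # external RNN/Transformer LM score (shallow fusion)
bigram_logprob = -1.5   # token-level bi-gram score, a proxy for the internal LM

lm_scale = 0.4          # cf. --lm-scale
ngram_lm_scale = -0.16  # cf. --ngram-lm-scale; negative, so the bi-gram is subtracted

total = rnnt_logprob + lm_scale * nnlm_logprob + ngram_lm_scale * bigram_logprob
print(total)  # candidates are ranked in the beam by this combined score
```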
The decoding command is as follows:
```bash
for iter in 472000; do
for avg in 8 10 12 14 16 18; do
./lstm_transducer_stateless2/decode.py \
@ -490,18 +635,22 @@ for iter in 472000; do
--avg $avg \
--exp-dir ./lstm_transducer_stateless2/exp \
--max-duration 600 \
--decoding-method modified_beam_search_rnnlm_LODR \
--decoding-method modified_beam_search_LODR \
--beam 4 \
--rnn-lm-scale 0.4 \
--rnn-lm-exp-dir /path/to/RNNLM \
--rnn-lm-epoch 99 \
--rnn-lm-avg 1 \
--rnn-lm-num-layers 3 \
--rnn-lm-tie-weights 1 \
--token-ngram 2 \
--max-contexts 4 \
--use-shallow-fusion 1 \
--lm-type rnn \
--lm-exp-dir /path/to/RNNLM/exp \
--lm-epoch 99 \
--lm-scale 0.4 \
--lm-avg 1 \
--tokens-ngram 2 \
--ngram-lm-scale -0.16
done
done
```
Note that you can also set `--lm-type transformer` to use a Transformer LM during LODR, but it will be slower
because it has not been optimized. The pre-trained Transformer LM is available at <https://huggingface.co/marcoyang/icefall-librispeech-transformer-lm/tree/main>
Pretrained models, training logs, decoding logs, and decoding results
are available at
@ -1660,6 +1809,9 @@ layers (24 v.s 12) but a narrower model (1536 feedforward dim and 384 encoder di
| greedy search (max sym per frame 1) | 2.54 | 5.72 | --epoch 30 --avg 10 --max-duration 600 |
| modified beam search | 2.47 | 5.71 | --epoch 30 --avg 10 --max-duration 600 |
| modified beam search + RNNLM shallow fusion | 2.27 | 5.24 | --epoch 30 --avg 10 --max-duration 600 |
| modified beam search + RNNLM + LODR | 2.23 | 5.17 | --epoch 30 --avg 10 --max-duration 600 |
| modified beam search + TransformerLM shallow fusion | 2.27 | 5.26 | --epoch 30 --avg 10 --max-duration 600 |
| modified beam search + TransformerLM + LODR | 2.22 | 5.11 | --epoch 30 --avg 10 --max-duration 600 |
| fast beam search | 2.5 | 5.72 | --epoch 30 --avg 10 --max-duration 600 |
```bash
@ -2023,7 +2175,8 @@ subset so that the gigaspeech dataloader never exhausts.
| greedy search (max sym per frame 1) | 2.03 | 4.70 | --iter 1224000 --avg 14 --max-duration 600 |
| modified beam search | 2.00 | 4.63 | --iter 1224000 --avg 14 --max-duration 600 |
| modified beam search + rnnlm shallow fusion | 1.94 | 4.2 | --iter 1224000 --avg 14 --max-duration 600 |
| modified beam search + LODR | 1.83 | 4.03 | --iter 1224000 --avg 14 --max-duration 600 |
| modified beam search + rnnlm + LODR | 1.77 | 3.99 | --iter 1224000 --avg 14 --max-duration 600 |
| modified beam search + TransformerLM + LODR | 1.75 | 3.94 | --iter 1224000 --avg 14 --max-duration 600 |
| fast beam search | 2.10 | 4.68 | --iter 1224000 --avg 14 --max-duration 600 |
The training commands are:
@ -2069,8 +2222,10 @@ for iter in 1224000; do
done
done
```
You may also decode using shallow fusion with external RNNLM. To do so you need to
download a well-trained RNNLM from this link <https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm/tree/main>
You may also decode using shallow fusion with an external neural network LM. To do so, you need to
download a well-trained NN LM:
RNN LM: <https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm/tree/main>
Transformer LM: <https://huggingface.co/marcoyang/icefall-librispeech-transformer-lm/tree/main>
```bash
rnn_lm_scale=0.3

View File

@ -44,7 +44,8 @@ class LabelSmoothingLoss(torch.nn.Module):
mean of the output is taken. (3) "sum": the output will be summed.
"""
super().__init__()
assert 0.0 <= label_smoothing < 1.0
assert 0.0 <= label_smoothing < 1.0, f"{label_smoothing}"
assert reduction in ("none", "sum", "mean"), reduction
self.ignore_index = ignore_index
self.label_smoothing = label_smoothing
self.reduction = reduction

View File

@ -24,10 +24,9 @@ from scaling import (
ScaledConv2d,
ScaledLinear,
)
from torch import nn
class Conv2dSubsampling(nn.Module):
class Conv2dSubsampling(torch.nn.Module):
"""Convolutional 2D subsampling (to 1/4 length).
Convert an input of shape (N, T, idim) to an output
@ -61,7 +60,7 @@ class Conv2dSubsampling(nn.Module):
assert in_channels >= 7
super().__init__()
self.conv = nn.Sequential(
self.conv = torch.nn.Sequential(
ScaledConv2d(
in_channels=1,
out_channels=layer1_channels,

View File

@ -291,7 +291,10 @@ def main():
batch_size = nnet_output.shape[0]
supervision_segments = torch.tensor(
[[i, 0, nnet_output.shape[1]] for i in range(batch_size)],
[
[i, 0, feature_lengths[i] // params.subsampling_factor]
for i in range(batch_size)
],
dtype=torch.int32,
)

View File

@ -339,7 +339,10 @@ def main():
batch_size = nnet_output.shape[0]
supervision_segments = torch.tensor(
[[i, 0, nnet_output.shape[1]] for i in range(batch_size)],
[
[i, 0, feature_lengths[i] // params.subsampling_factor]
for i in range(batch_size)
],
dtype=torch.int32,
)

View File

@ -660,14 +660,22 @@ def main():
# we need cut ids to display recognition results.
args.return_cuts = True
librispeech = LibriSpeechAsrDataModule(args)
test_clean_cuts = librispeech.test_clean_cuts()
test_other_cuts = librispeech.test_other_cuts()
test_clean_dl = librispeech.test_dataloaders(test_clean_cuts)
test_other_dl = librispeech.test_dataloaders(test_other_cuts)
# CAUTION: `test_sets` is for displaying only.
# If you want to skip test-clean, you have to skip
# it inside the for loop. That is, use
#
# if test_set == 'test-clean': continue
#
test_sets = ["test-clean", "test-other"]
for test_set, test_dl in zip(test_sets, librispeech.test_dataloaders()):
test_dls = [test_clean_dl, test_other_dl]
for test_set, test_dl in zip(test_sets, test_dls):
results_dict = decode_dataset(
dl=test_dl,
params=params,

View File

@ -30,6 +30,8 @@ import torch.multiprocessing as mp
import torch.nn as nn
from asr_datamodule import LibriSpeechAsrDataModule
from conformer import Conformer
from lhotse.cut import Cut
from lhotse.dataset.sampling.base import CutSampler
from lhotse.utils import fix_random_seed
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.utils import clip_grad_norm_
@ -100,6 +102,41 @@ def get_parser():
""",
)
parser.add_argument(
"--exp-dir",
type=str,
default="conformer_mmi/exp-attn",
help="""The experiment dir.
It specifies the directory where all training related
files, e.g., checkpoints, log, etc, are saved
""",
)
parser.add_argument(
"--lang-dir",
type=str,
default="data/lang_bpe_500",
help="""The lang dir
It contains language related input files such as
"lexicon.txt"
""",
)
parser.add_argument(
"--seed",
type=int,
default=42,
help="The seed for random generators intended for reproducibility",
)
parser.add_argument(
"--use-pruned-intersect",
type=str2bool,
default=False,
help="""Whether to use `intersect_dense_pruned` to get denominator
lattice.""",
)
return parser
@ -114,12 +151,6 @@ def get_params() -> AttributeDict:
Explanation of options saved in `params`:
- exp_dir: It specifies the directory where all training related
files, e.g., checkpoints, log, etc, are saved
- lang_dir: It contains language related input files such as
"lexicon.txt"
- best_train_loss: Best training loss so far. It is used to select
the model that has the lowest training loss. It is
updated during the training.
@ -164,8 +195,6 @@ def get_params() -> AttributeDict:
"""
params = AttributeDict(
{
"exp_dir": Path("conformer_mmi/exp_500_with_attention"),
"lang_dir": Path("data/lang_bpe_500"),
"best_train_loss": float("inf"),
"best_valid_loss": float("inf"),
"best_train_epoch": -1,
@ -184,15 +213,12 @@ def get_params() -> AttributeDict:
"beam_size": 6, # will change it to 8 after some batches (see code)
"reduction": "sum",
"use_double_scores": True,
# "att_rate": 0.0,
# "num_decoder_layers": 0,
"att_rate": 0.7,
"num_decoder_layers": 6,
# parameters for Noam
"weight_decay": 1e-6,
"lr_factor": 5.0,
"warm_step": 80000,
"use_pruned_intersect": False,
"den_scale": 1.0,
# use alignments before this number of batches
"use_ali_until": 13000,
@ -661,7 +687,7 @@ def run(rank, world_size, args):
params = get_params()
params.update(vars(args))
fix_random_seed(42)
fix_random_seed(params.seed)
if world_size > 1:
setup_dist(rank, world_size, params.master_port)
@ -745,8 +771,29 @@ def run(rank, world_size, args):
valid_ali = None
librispeech = LibriSpeechAsrDataModule(args)
train_dl = librispeech.train_dataloaders()
valid_dl = librispeech.valid_dataloaders()
train_cuts = librispeech.train_clean_100_cuts()
if params.full_libri:
train_cuts += librispeech.train_clean_360_cuts()
train_cuts += librispeech.train_other_500_cuts()
def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 20 seconds
#
# Caution: There is a reason to select 20.0 here. Please see
# ../local/display_manifest_statistics.py
#
# You should use ../local/display_manifest_statistics.py to get
# an utterance duration distribution for your dataset to select
# the threshold
return 1.0 <= c.duration <= 20.0
train_cuts = train_cuts.filter(remove_short_and_long_utt)
train_dl = librispeech.train_dataloaders(train_cuts)
valid_cuts = librispeech.dev_clean_cuts()
valid_cuts += librispeech.dev_other_cuts()
valid_dl = librispeech.valid_dataloaders(valid_cuts)
for epoch in range(params.start_epoch, params.num_epochs):
train_dl.sampler.set_epoch(epoch)
@ -796,6 +843,7 @@ def main():
parser = get_parser()
LibriSpeechAsrDataModule.add_arguments(parser)
args = parser.parse_args()
args.exp_dir = Path(args.exp_dir)
world_size = args.world_size
assert world_size >= 1

View File

@ -30,6 +30,8 @@ import torch.multiprocessing as mp
import torch.nn as nn
from asr_datamodule import LibriSpeechAsrDataModule
from conformer import Conformer
from lhotse.cut import Cut
from lhotse.dataset.sampling.base import CutSampler
from lhotse.utils import fix_random_seed
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.utils import clip_grad_norm_
@ -100,6 +102,26 @@ def get_parser():
""",
)
parser.add_argument(
"--exp-dir",
type=str,
default="conformer_mmi/exp",
help="""The experiment dir.
It specifies the directory where all training related
files, e.g., checkpoints, log, etc, are saved
""",
)
parser.add_argument(
"--lang-dir",
type=str,
default="data/lang_bpe_500",
help="""The lang dir
It contains language related input files such as
"lexicon.txt"
""",
)
parser.add_argument(
"--seed",
type=int,
@ -107,6 +129,14 @@ def get_parser():
help="The seed for random generators intended for reproducibility",
)
parser.add_argument(
"--use-pruned-intersect",
type=str2bool,
default=False,
help="""Whether to use `intersect_dense_pruned` to get denominator
lattice.""",
)
return parser
@ -121,12 +151,6 @@ def get_params() -> AttributeDict:
Explanation of options saved in `params`:
- exp_dir: It specifies the directory where all training related
files, e.g., checkpoints, log, etc, are saved
- lang_dir: It contains language related input files such as
"lexicon.txt"
- best_train_loss: Best training loss so far. It is used to select
the model that has the lowest training loss. It is
updated during the training.
@ -171,8 +195,6 @@ def get_params() -> AttributeDict:
"""
params = AttributeDict(
{
"exp_dir": Path("conformer_mmi/exp_500"),
"lang_dir": Path("data/lang_bpe_500"),
"best_train_loss": float("inf"),
"best_valid_loss": float("inf"),
"best_train_epoch": -1,
@ -193,13 +215,10 @@ def get_params() -> AttributeDict:
"use_double_scores": True,
"att_rate": 0.0,
"num_decoder_layers": 0,
# "att_rate": 0.7,
# "num_decoder_layers": 6,
# parameters for Noam
"weight_decay": 1e-6,
"lr_factor": 5.0,
"warm_step": 80000,
"use_pruned_intersect": False,
"den_scale": 1.0,
# use alignments before this number of batches
"use_ali_until": 13000,
@ -752,8 +771,29 @@ def run(rank, world_size, args):
valid_ali = None
librispeech = LibriSpeechAsrDataModule(args)
train_dl = librispeech.train_dataloaders()
valid_dl = librispeech.valid_dataloaders()
train_cuts = librispeech.train_clean_100_cuts()
if params.full_libri:
train_cuts += librispeech.train_clean_360_cuts()
train_cuts += librispeech.train_other_500_cuts()
def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 20 seconds
#
# Caution: There is a reason to select 20.0 here. Please see
# ../local/display_manifest_statistics.py
#
# You should use ../local/display_manifest_statistics.py to get
# an utterance duration distribution for your dataset to select
# the threshold
return 1.0 <= c.duration <= 20.0
train_cuts = train_cuts.filter(remove_short_and_long_utt)
train_dl = librispeech.train_dataloaders(train_cuts)
valid_cuts = librispeech.dev_clean_cuts()
valid_cuts += librispeech.dev_other_cuts()
valid_dl = librispeech.valid_dataloaders(valid_cuts)
for epoch in range(params.start_epoch, params.num_epochs):
fix_random_seed(params.seed + epoch)
@ -804,6 +844,7 @@ def main():
parser = get_parser()
LibriSpeechAsrDataModule.add_arguments(parser)
args = parser.parse_args()
args.exp_dir = Path(args.exp_dir)
world_size = args.world_size
assert world_size >= 1

View File

@ -1435,7 +1435,7 @@ class EmformerEncoder(nn.Module):
self,
x: torch.Tensor,
states: List[torch.Tensor],
) -> Tuple[torch.Tensor, List[torch.Tensor],]:
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
"""Forward pass for streaming inference.
B: batch size;
@ -1512,24 +1512,6 @@ class EmformerEncoder(nn.Module):
)
return states
attn_caches = [
[
torch.zeros(self.memory_size, self.d_model, device=device),
torch.zeros(self.left_context_length, self.d_model, device=device),
torch.zeros(self.left_context_length, self.d_model, device=device),
]
for _ in range(self.num_encoder_layers)
]
conv_caches = [
torch.zeros(self.d_model, self.cnn_module_kernel - 1, device=device)
for _ in range(self.num_encoder_layers)
]
states: Tuple[List[List[torch.Tensor]], List[torch.Tensor]] = (
attn_caches,
conv_caches,
)
return states
class Emformer(EncoderInterface):
def __init__(
@ -1640,7 +1622,7 @@ class Emformer(EncoderInterface):
self,
x: torch.Tensor,
states: List[torch.Tensor],
) -> Tuple[torch.Tensor, List[torch.Tensor],]:
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
"""Forward pass for streaming inference.
B: batch size;

Some files were not shown because too many files have changed in this diff