resolve conflict

marcoyang 2023-06-19 12:31:35 +08:00
commit ad24b4ad9e
587 changed files with 87456 additions and 6558 deletions

View File

@ -13,6 +13,7 @@ per-file-ignores =
egs/librispeech/ASR/conv_emformer_transducer_stateless*/*.py: E501, E203
egs/librispeech/ASR/conformer_ctc*/*py: E501,
egs/librispeech/ASR/zipformer_mmi/*.py: E501, E203
+egs/librispeech/ASR/zipformer/*.py: E501, E203
egs/librispeech/ASR/RESULTS.md: E999,
# invalid escape sequence (caused by TeX formulas), W605

View File

@ -15,5 +15,5 @@ mkdir -p data
cd data
[ ! -e fbank ] && ln -s ~/tmp/fbank-libri fbank
cd ..
-./local/compute_fbank_librispeech.py
+./local/compute_fbank_librispeech.py --dataset 'test-clean test-other'
ls -lh data/fbank/
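The --dataset flag added above restricts fbank computation to the named splits instead of processing all of LibriSpeech, which keeps the CI job fast. The same pattern works for other splits, e.g. (illustrative only, not part of this commit):
./local/compute_fbank_librispeech.py --dataset 'dev-clean dev-other'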

View File

@ -25,7 +25,6 @@ repo=$(basename $repo_url)
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp

View File

@ -18,7 +18,6 @@ repo=$(basename $repo_url)
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp

View File

@ -1,79 +0,0 @@
#!/usr/bin/env bash
#
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
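# A call such as `log "hello"` prints a line like the following
# (timestamp, file name, and line number are illustrative):
#   2023-06-19 12:31:35 (run-test.sh:42:main) hello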
cd egs/librispeech/ASR
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
log "Downloading pre-trained model from $repo_url"
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
pushd $repo
git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt"
git lfs pull --include "data/lang_bpe_500/bpe.model"
cd exp
ln -s pretrained-epoch-30-avg-10-averaged.pt epoch-99.pt
popd
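# Note: because of GIT_LFS_SKIP_SMUDGE=1, only the files fetched via
# `git lfs pull --include` above are downloaded; everything else in the clone
# stays a lightweight LFS pointer (`git lfs ls-files` marks downloaded files
# with '*' if you want to verify).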
log "Display test files"
tree $repo/
soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
log "Install ncnn and pnnx"
# We are using a modified ncnn here; we will try to merge it into the
# official ncnn repo.
git clone https://github.com/csukuangfj/ncnn
pushd ncnn
git submodule init
git submodule update python/pybind11
python3 setup.py bdist_wheel
ls -lh dist/
pip install dist/*.whl
cd tools/pnnx
mkdir build
cd build
cmake -D Python3_EXECUTABLE=/opt/hostedtoolcache/Python/3.8.14/x64/bin/python3 ..
make -j4 pnnx
./src/pnnx || echo "pass"
popd
log "Test exporting to pnnx format"
./conv_emformer_transducer_stateless2/export-for-ncnn.py \
--exp-dir $repo/exp \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--epoch 99 \
--avg 1 \
--use-averaged-model 0 \
\
--num-encoder-layers 12 \
--chunk-length 32 \
--cnn-module-kernel 31 \
--left-context-length 32 \
--right-context-length 8 \
--memory-size 32
./ncnn/tools/pnnx/build/src/pnnx $repo/exp/encoder_jit_trace-pnnx.pt
./ncnn/tools/pnnx/build/src/pnnx $repo/exp/decoder_jit_trace-pnnx.pt
./ncnn/tools/pnnx/build/src/pnnx $repo/exp/joiner_jit_trace-pnnx.pt
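# Each pnnx run above converts a TorchScript model into an ncnn graph/weight
# pair (*.ncnn.param / *.ncnn.bin); the decoding script below loads exactly
# those files.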
./conv_emformer_transducer_stateless2/streaming-ncnn-decode.py \
--tokens $repo/data/lang_bpe_500/tokens.txt \
--encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
--encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
--decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
--decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
--joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
--joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
$repo/test_wavs/1089-134686-0001.wav

View File

@ -20,7 +20,6 @@ abs_repo=$(realpath $repo)
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp
@ -28,63 +27,6 @@ ln -s pretrained-iter-468000-avg-16.pt pretrained.pt
ln -s pretrained-iter-468000-avg-16.pt epoch-99.pt
popd
log "Install ncnn and pnnx"
# We are using a modified ncnn here; we will try to merge it into the
# official ncnn repo.
git clone https://github.com/csukuangfj/ncnn
pushd ncnn
git submodule init
git submodule update python/pybind11
python3 setup.py bdist_wheel
ls -lh dist/
pip install dist/*.whl
cd tools/pnnx
mkdir build
cd build
cmake ..
make -j4 pnnx
./src/pnnx || echo "pass"
popd
log "Test exporting to pnnx format"
./lstm_transducer_stateless2/export.py \
--exp-dir $repo/exp \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--epoch 99 \
--avg 1 \
--use-averaged-model 0 \
--pnnx 1
./ncnn/tools/pnnx/build/src/pnnx $repo/exp/encoder_jit_trace-pnnx.pt
./ncnn/tools/pnnx/build/src/pnnx $repo/exp/decoder_jit_trace-pnnx.pt
./ncnn/tools/pnnx/build/src/pnnx $repo/exp/joiner_jit_trace-pnnx.pt
./lstm_transducer_stateless2/ncnn-decode.py \
--bpe-model-filename $repo/data/lang_bpe_500/bpe.model \
--encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
--encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
--decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
--decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
--joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
--joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
$repo/test_wavs/1089-134686-0001.wav
./lstm_transducer_stateless2/streaming-ncnn-decode.py \
--bpe-model-filename $repo/data/lang_bpe_500/bpe.model \
--encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
--encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
--decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
--decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
--joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
--joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
$repo/test_wavs/1089-134686-0001.wav
log "Test exporting with torch.jit.trace()"
./lstm_transducer_stateless2/export.py \
@ -106,47 +48,6 @@ log "Decode with models exported by torch.jit.trace()"
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
log "Test exporting to ONNX"
./lstm_transducer_stateless2/export.py \
--exp-dir $repo/exp \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--epoch 99 \
--avg 1 \
--use-averaged-model 0 \
--onnx 1
log "Decode with ONNX models "
./lstm_transducer_stateless2/streaming-onnx-decode.py \
--bpe-model-filename $repo/data/lang_bpe_500/bpe.model \
--encoder-model-filename $repo/exp/encoder.onnx \
--decoder-model-filename $repo/exp/decoder.onnx \
--joiner-model-filename $repo/exp/joiner.onnx \
--joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
--joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
$repo/test_wavs/1089-134686-0001.wav
./lstm_transducer_stateless2/streaming-onnx-decode.py \
--bpe-model-filename $repo/data/lang_bpe_500/bpe.model \
--encoder-model-filename $repo/exp/encoder.onnx \
--decoder-model-filename $repo/exp/decoder.onnx \
--joiner-model-filename $repo/exp/joiner.onnx \
--joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
--joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
$repo/test_wavs/1221-135766-0001.wav
./lstm_transducer_stateless2/streaming-onnx-decode.py \
--bpe-model-filename $repo/data/lang_bpe_500/bpe.model \
--encoder-model-filename $repo/exp/encoder.onnx \
--decoder-model-filename $repo/exp/decoder.onnx \
--joiner-model-filename $repo/exp/joiner.onnx \
--joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
--joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
$repo/test_wavs/1221-135766-0002.wav
for sym in 1 2 3; do
log "Greedy search with --max-sym-per-frame $sym"

View File

@ -19,7 +19,6 @@ repo=$(basename $repo_url)
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
for sym in 1 2 3; do

View File

@ -23,7 +23,6 @@ popd
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp

View File

@ -22,7 +22,6 @@ popd
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp

View File

@ -19,7 +19,6 @@ repo=$(basename $repo_url)
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp
@ -27,14 +26,6 @@ ln -s pretrained-iter-1224000-avg-14.pt pretrained.pt
ln -s pretrained-iter-1224000-avg-14.pt epoch-99.pt
popd
log "Test exporting to ONNX format"
./pruned_transducer_stateless3/export.py \
--exp-dir $repo/exp \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--epoch 99 \
--avg 1 \
--onnx 1
log "Export to torchscript model"
./pruned_transducer_stateless3/export.py \
@ -51,30 +42,8 @@ log "Export to torchscript model"
--avg 1 \
--jit-trace 1
ls -lh $repo/exp/*.onnx
ls -lh $repo/exp/*.pt
log "Decode with ONNX models"
./pruned_transducer_stateless3/onnx_check.py \
--jit-filename $repo/exp/cpu_jit.pt \
--onnx-encoder-filename $repo/exp/encoder.onnx \
--onnx-decoder-filename $repo/exp/decoder.onnx \
--onnx-joiner-filename $repo/exp/joiner.onnx \
--onnx-joiner-encoder-proj-filename $repo/exp/joiner_encoder_proj.onnx \
--onnx-joiner-decoder-proj-filename $repo/exp/joiner_decoder_proj.onnx
./pruned_transducer_stateless3/onnx_pretrained.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--encoder-model-filename $repo/exp/encoder.onnx \
--decoder-model-filename $repo/exp/decoder.onnx \
--joiner-model-filename $repo/exp/joiner.onnx \
--joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
--joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
log "Decode with models exported by torch.jit.trace()"
./pruned_transducer_stateless3/jit_pretrained.py \

View File

@ -19,7 +19,6 @@ repo=$(basename $repo_url)
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp

View File

@ -19,7 +19,6 @@ repo=$(basename $repo_url)
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp
@ -30,15 +29,6 @@ ln -s pretrained.pt epoch-99.pt
ls -lh *.pt
popd
log "Test exporting to ONNX format"
./pruned_transducer_stateless7/export.py \
--exp-dir $repo/exp \
--use-averaged-model false \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--epoch 99 \
--avg 1 \
--onnx 1
log "Export to torchscript model"
./pruned_transducer_stateless7/export.py \
--exp-dir $repo/exp \
@ -50,27 +40,6 @@ log "Export to torchscript model"
ls -lh $repo/exp/*.pt
log "Decode with ONNX models"
./pruned_transducer_stateless7/onnx_check.py \
--jit-filename $repo/exp/cpu_jit.pt \
--onnx-encoder-filename $repo/exp/encoder.onnx \
--onnx-decoder-filename $repo/exp/decoder.onnx \
--onnx-joiner-filename $repo/exp/joiner.onnx \
--onnx-joiner-encoder-proj-filename $repo/exp/joiner_encoder_proj.onnx \
--onnx-joiner-decoder-proj-filename $repo/exp/joiner_decoder_proj.onnx
./pruned_transducer_stateless7/onnx_pretrained.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--encoder-model-filename $repo/exp/encoder.onnx \
--decoder-model-filename $repo/exp/decoder.onnx \
--joiner-model-filename $repo/exp/joiner.onnx \
--joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
--joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
log "Decode with models exported by torch.jit.script()"
./pruned_transducer_stateless7/jit_pretrained.py \

View File

@ -18,7 +18,6 @@ repo=$(basename $repo_url)
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp
@ -148,4 +147,4 @@ if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" ==
done
rm pruned_transducer_stateless7_ctc/exp/*.pt
fi
fi

View File

@ -10,7 +10,7 @@ log() {
cd egs/librispeech/ASR
-repo_url=https://huggingface.co/yfyeung/icefall-asr-librispeech-pruned_transducer_stateless7_ctc_bs-2022-12-14
+repo_url=https://huggingface.co/yfyeung/icefall-asr-librispeech-pruned_transducer_stateless7_ctc_bs-2023-01-29
log "Downloading pre-trained model from $repo_url"
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
@ -18,7 +18,6 @@ repo=$(basename $repo_url)
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp

View File

@ -19,16 +19,16 @@ repo=$(basename $repo_url)
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
-pushd $repo/exp
+pushd $repo
git lfs pull --include "data/lang_bpe_500/bpe.model"
git lfs pull --include "exp/cpu_jit.pt"
git lfs pull --include "exp/pretrained.pt"
git lfs pull --include "exp/encoder_jit_trace.pt"
git lfs pull --include "exp/decoder_jit_trace.pt"
git lfs pull --include "exp/joiner_jit_trace.pt"
cd exp
ln -s pretrained.pt epoch-99.pt
ls -lh *.pt
popd

View File

@ -19,7 +19,6 @@ repo=$(basename $repo_url)
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp

View File

@ -19,7 +19,6 @@ repo=$(basename $repo_url)
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp

View File

@ -0,0 +1,115 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
cd egs/librispeech/ASR
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17
log "Downloading pre-trained model from $repo_url"
git lfs install
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
log "Display test files"
tree $repo/
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp
git lfs pull --include "data/lang_bpe_500/bpe.model"
git lfs pull --include "exp/jit_script_chunk_16_left_128.pt"
git lfs pull --include "exp/pretrained.pt"
ln -s pretrained.pt epoch-99.pt
ls -lh *.pt
popd
log "Export to torchscript model"
./zipformer/export.py \
--exp-dir $repo/exp \
--use-averaged-model false \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--causal 1 \
--chunk-size 16 \
--left-context-frames 128 \
--epoch 99 \
--avg 1 \
--jit 1
ls -lh $repo/exp/*.pt
log "Decode with models exported by torch.jit.script()"
./zipformer/jit_pretrained_streaming.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--nn-model-filename $repo/exp/jit_script_chunk_16_left_128.pt \
$repo/test_wavs/1089-134686-0001.wav
for method in greedy_search modified_beam_search fast_beam_search; do
log "$method"
./zipformer/pretrained.py \
--causal 1 \
--chunk-size 16 \
--left-context-frames 128 \
--method $method \
--beam-size 4 \
--checkpoint $repo/exp/pretrained.pt \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
done
echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
mkdir -p zipformer/exp
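# Expose the downloaded checkpoint under the file name decode.py expects:
# with --epoch 999 --avg 1 below, it loads exactly zipformer/exp/epoch-999.pt.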
ln -s $PWD/$repo/exp/pretrained.pt zipformer/exp/epoch-999.pt
ln -s $PWD/$repo/data/lang_bpe_500 data/
ls -lh data
ls -lh zipformer/exp
log "Decoding test-clean and test-other"
# use a small max_duration when decoding with CPU
max_duration=100
for method in greedy_search fast_beam_search modified_beam_search; do
log "Simulated streaming decoding with $method"
./zipformer/decode.py \
--causal 1 \
--chunk-size 16 \
--left-context-frames 128 \
--decoding-method $method \
--epoch 999 \
--avg 1 \
--use-averaged-model 0 \
--max-duration $max_duration \
--exp-dir zipformer/exp
done
for method in greedy_search fast_beam_search modified_beam_search; do
log "Chunk-wise streaming decoding with $method"
./zipformer/streaming_decode.py \
--causal 1 \
--chunk-size 16 \
--left-context-frames 128 \
--decoding-method $method \
--epoch 999 \
--avg 1 \
--use-averaged-model 0 \
--max-duration $max_duration \
--exp-dir zipformer/exp
done
rm zipformer/exp/*.pt
fi
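The x"${VAR}" == x"literal" pattern in the test above is a defensive shell idiom: prefixing both sides keeps the comparison well-formed even when a variable is empty or starts with a dash. Inside bash's [[ ]] the prefix is redundant, so an equivalent plain form would be (a sketch only, not a proposed change to the script):
if [[ "${GITHUB_EVENT_NAME}" == "schedule" || "${GITHUB_EVENT_LABEL_NAME}" == "run-decode" ]]; then
  echo "run full decoding"
fi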

View File

@ -19,7 +19,6 @@ repo=$(basename $repo_url)
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
for sym in 1 2 3; do

View File

@ -0,0 +1,93 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
cd egs/librispeech/ASR
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
log "Downloading pre-trained model from $repo_url"
git lfs install
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
log "Display test files"
tree $repo/
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp
git lfs pull --include "data/lang_bpe_500/bpe.model"
git lfs pull --include "exp/jit_script.pt"
git lfs pull --include "exp/pretrained.pt"
ln -s pretrained.pt epoch-99.pt
ls -lh *.pt
popd
log "Export to torchscript model"
./zipformer/export.py \
--exp-dir $repo/exp \
--use-averaged-model false \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--epoch 99 \
--avg 1 \
--jit 1
ls -lh $repo/exp/*.pt
log "Decode with models exported by torch.jit.script()"
./zipformer/jit_pretrained.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--nn-model-filename $repo/exp/jit_script.pt \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
for method in greedy_search modified_beam_search fast_beam_search; do
log "$method"
./zipformer/pretrained.py \
--method $method \
--beam-size 4 \
--checkpoint $repo/exp/pretrained.pt \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
done
echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
mkdir -p zipformer/exp
ln -s $PWD/$repo/exp/pretrained.pt zipformer/exp/epoch-999.pt
ln -s $PWD/$repo/data/lang_bpe_500 data/
ls -lh data
ls -lh zipformer/exp
log "Decoding test-clean and test-other"
# use a small max_duration when decoding with CPU
max_duration=100
for method in greedy_search fast_beam_search modified_beam_search; do
log "Decoding with $method"
./zipformer/decode.py \
--decoding-method $method \
--epoch 999 \
--avg 1 \
--use-averaged-model 0 \
--max-duration $max_duration \
--exp-dir zipformer/exp
done
rm zipformer/exp/*.pt
fi

View File

@ -0,0 +1,117 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
cd egs/librispeech/ASR
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-transducer-ctc-2023-06-13
log "Downloading pre-trained model from $repo_url"
git lfs install
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
log "Display test files"
tree $repo/
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp
git lfs pull --include "data/lang_bpe_500/bpe.model"
git lfs pull --include "data/lang_bpe_500/HLG.pt"
git lfs pull --include "data/lang_bpe_500/L.pt"
git lfs pull --include "data/lang_bpe_500/LG.pt"
git lfs pull --include "data/lang_bpe_500/Linv.pt"
git lfs pull --include "data/lm/G_4_gram.pt"
git lfs pull --include "exp/jit_script.pt"
git lfs pull --include "exp/pretrained.pt"
ln -s pretrained.pt epoch-99.pt
ls -lh *.pt
popd
log "Export to torchscript model"
./zipformer/export.py \
--exp-dir $repo/exp \
--use-transducer 1 \
--use-ctc 1 \
--use-averaged-model false \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--epoch 99 \
--avg 1 \
--jit 1
ls -lh $repo/exp/*.pt
log "Decode with models exported by torch.jit.script()"
for method in ctc-decoding 1best; do
./zipformer/jit_pretrained_ctc.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--model-filename $repo/exp/jit_script.pt \
--HLG $repo/data/lang_bpe_500/HLG.pt \
--words-file $repo/data/lang_bpe_500/words.txt \
--G $repo/data/lm/G_4_gram.pt \
--method $method \
--sample-rate 16000 \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
done
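# ctc-decoding above searches a CTC topology built from the BPE tokens alone,
# while 1best searches the HLG graph pulled earlier (hence the --HLG and
# --words-file arguments).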
for method in ctc-decoding 1best; do
log "$method"
./zipformer/pretrained_ctc.py \
--use-transducer 1 \
--use-ctc 1 \
--method $method \
--checkpoint $repo/exp/pretrained.pt \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--words-file $repo/data/lang_bpe_500/words.txt \
--HLG $repo/data/lang_bpe_500/HLG.pt \
--G $repo/data/lm/G_4_gram.pt \
--words-file $repo/data/lang_bpe_500/words.txt \
--sample-rate 16000 \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
done
echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
mkdir -p zipformer/exp
ln -s $PWD/$repo/exp/pretrained.pt zipformer/exp/epoch-999.pt
ln -s $PWD/$repo/data/lang_bpe_500 data/
ls -lh data
ls -lh zipformer/exp
log "Decoding test-clean and test-other"
# use a small max_duration when decoding with CPU
max_duration=100
for method in ctc-decoding 1best; do
log "Decoding with $method"
./zipformer/ctc_decode.py \
--use-transducer 1 \
--use-ctc 1 \
--decoding-method $method \
--nbest-scale 1.0 \
--hlg-scale 0.6 \
--epoch 999 \
--avg 1 \
--use-averaged-model 0 \
--max-duration $max_duration \
--exp-dir zipformer/exp
done
rm zipformer/exp/*.pt
fi

View File

@ -18,7 +18,6 @@ repo=$(basename $repo_url)
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp

View File

@ -19,7 +19,6 @@ repo=$(basename $repo_url)
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.flac
ls -lh $repo/test_wavs/*.flac
log "CTC decoding"

View File

@ -19,7 +19,6 @@ repo=$(basename $repo_url)
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
for sym in 1 2 3; do

View File

@ -19,7 +19,6 @@ repo=$(basename $repo_url)
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
for sym in 1 2 3; do

View File

@ -19,7 +19,6 @@ repo=$(basename $repo_url)
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
for sym in 1 2 3; do

View File

@ -19,7 +19,6 @@ repo=$(basename $repo_url)
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
for sym in 1 2 3; do

View File

@ -19,7 +19,6 @@ repo=$(basename $repo_url)
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
for sym in 1 2 3; do

View File

@ -19,7 +19,6 @@ repo=$(basename $repo_url)
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
log "Beam search decoding"

View File

@ -20,7 +20,6 @@ repo=$(basename $repo_url)
log "Display test files"
tree $repo/
-soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp

.github/scripts/test-ncnn-export.sh vendored Executable file
View File

@ -0,0 +1,234 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
pushd egs/librispeech/ASR
log "Install ncnn and pnnx"
# We are using a modified ncnn here; we will try to merge it into the
# official ncnn repo.
git clone https://github.com/csukuangfj/ncnn
pushd ncnn
git submodule init
git submodule update python/pybind11
python3 setup.py bdist_wheel
ls -lh dist/
pip install dist/*.whl
cd tools/pnnx
mkdir build
cd build
echo "which python3"
which python3
#/opt/hostedtoolcache/Python/3.8.16/x64/bin/python3
cmake -D Python3_EXECUTABLE=$(which python3) ..
make -j4 pnnx
./src/pnnx || echo "pass"
popd
export PATH=$PWD/ncnn/tools/pnnx/build/src:$PATH
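# With the pnnx build directory on PATH, the bare `pnnx` invocations below
# resolve without spelling out the full ./ncnn/tools/pnnx/build/src path.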
log "=========================================================================="
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
pushd $repo
git lfs pull --include "data/lang_bpe_500/bpe.model"
git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt"
cd exp
ln -s pretrained-epoch-30-avg-10-averaged.pt epoch-99.pt
popd
log "Export via torch.jit.trace()"
./conv_emformer_transducer_stateless2/export-for-ncnn.py \
--exp-dir $repo/exp \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--epoch 99 \
--avg 1 \
--use-averaged-model 0 \
\
--num-encoder-layers 12 \
--chunk-length 32 \
--cnn-module-kernel 31 \
--left-context-length 32 \
--right-context-length 8 \
--memory-size 32
pnnx $repo/exp/encoder_jit_trace-pnnx.pt
pnnx $repo/exp/decoder_jit_trace-pnnx.pt
pnnx $repo/exp/joiner_jit_trace-pnnx.pt
python3 ./conv_emformer_transducer_stateless2/streaming-ncnn-decode.py \
--tokens $repo/data/lang_bpe_500/tokens.txt \
--encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
--encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
--decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
--decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
--joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
--joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
$repo/test_wavs/1089-134686-0001.wav
rm -rf $repo
log "--------------------------------------------------------------------------"
log "=========================================================================="
repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
pushd $repo
git lfs pull --include "data/lang_bpe_500/bpe.model"
git lfs pull --include "exp/pretrained-iter-468000-avg-16.pt"
cd exp
ln -s pretrained-iter-468000-avg-16.pt epoch-99.pt
popd
log "Export via torch.jit.trace()"
./lstm_transducer_stateless2/export-for-ncnn.py \
--exp-dir $repo/exp \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--epoch 99 \
--avg 1 \
--use-averaged-model 0
pnnx $repo/exp/encoder_jit_trace-pnnx.pt
pnnx $repo/exp/decoder_jit_trace-pnnx.pt
pnnx $repo/exp/joiner_jit_trace-pnnx.pt
python3 ./lstm_transducer_stateless2/streaming-ncnn-decode.py \
--tokens $repo/data/lang_bpe_500/tokens.txt \
--encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
--encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
--decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
--decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
--joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
--joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
$repo/test_wavs/1089-134686-0001.wav
python3 ./lstm_transducer_stateless2/ncnn-decode.py \
--tokens $repo/data/lang_bpe_500/tokens.txt \
--encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
--encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
--decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
--decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
--joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
--joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
$repo/test_wavs/1089-134686-0001.wav
rm -rf $repo
log "--------------------------------------------------------------------------"
log "=========================================================================="
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
pushd $repo
git lfs pull --include "data/lang_bpe_500/bpe.model"
git lfs pull --include "exp/pretrained.pt"
cd exp
ln -s pretrained.pt epoch-99.pt
popd
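# The architecture flags passed below must match the configuration this
# checkpoint was trained with; otherwise the state dict will not load.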
./pruned_transducer_stateless7_streaming/export-for-ncnn.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--exp-dir $repo/exp \
--use-averaged-model 0 \
--epoch 99 \
--avg 1 \
\
--decode-chunk-len 32 \
--num-encoder-layers "2,4,3,2,4" \
--feedforward-dims "1024,1024,2048,2048,1024" \
--nhead "8,8,8,8,8" \
--encoder-dims "384,384,384,384,384" \
--attention-dims "192,192,192,192,192" \
--encoder-unmasked-dims "256,256,256,256,256" \
--zipformer-downsampling-factors "1,2,4,8,2" \
--cnn-module-kernels "31,31,31,31,31" \
--decoder-dim 512 \
--joiner-dim 512
pnnx $repo/exp/encoder_jit_trace-pnnx.pt
pnnx $repo/exp/decoder_jit_trace-pnnx.pt
pnnx $repo/exp/joiner_jit_trace-pnnx.pt
python3 ./pruned_transducer_stateless7_streaming/streaming-ncnn-decode.py \
--tokens $repo/data/lang_bpe_500/tokens.txt \
--encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
--encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
--decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
--decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
--joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
--joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
$repo/test_wavs/1089-134686-0001.wav
rm -rf $repo
log "--------------------------------------------------------------------------"
log "=========================================================================="
repo_url=https://huggingface.co/pfluo/k2fsa-zipformer-chinese-english-mixed
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
pushd $repo
git lfs pull --include "data/lang_char_bpe/L.pt"
git lfs pull --include "data/lang_char_bpe/L_disambig.pt"
git lfs pull --include "data/lang_char_bpe/Linv.pt"
git lfs pull --include "exp/pretrained.pt"
cd exp
ln -s pretrained.pt epoch-99.pt
popd
./pruned_transducer_stateless7_streaming/export-for-ncnn-zh.py \
--lang-dir $repo/data/lang_char_bpe \
--exp-dir $repo/exp \
--use-averaged-model 0 \
--epoch 99 \
--avg 1 \
--decode-chunk-len 32 \
--num-encoder-layers "2,4,3,2,4" \
--feedforward-dims "1024,1024,1536,1536,1024" \
--nhead "8,8,8,8,8" \
--encoder-dims "384,384,384,384,384" \
--attention-dims "192,192,192,192,192" \
--encoder-unmasked-dims "256,256,256,256,256" \
--zipformer-downsampling-factors "1,2,4,8,2" \
--cnn-module-kernels "31,31,31,31,31" \
--decoder-dim 512 \
--joiner-dim 512
pnnx $repo/exp/encoder_jit_trace-pnnx.pt
pnnx $repo/exp/decoder_jit_trace-pnnx.pt
pnnx $repo/exp/joiner_jit_trace-pnnx.pt
python3 ./pruned_transducer_stateless7_streaming/streaming-ncnn-decode.py \
--tokens $repo/data/lang_char_bpe/tokens.txt \
--encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
--encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
--decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
--decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
--joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
--joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
$repo/test_wavs/0.wav
rm -rf $repo
log "--------------------------------------------------------------------------"

.github/scripts/test-onnx-export.sh vendored Executable file
View File

@ -0,0 +1,351 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
cd egs/librispeech/ASR
log "=========================================================================="
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
log "Downloading pre-trained model from $repo_url"
git lfs install
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
pushd $repo
git lfs pull --include "data/lang_bpe_500/bpe.model"
git lfs pull --include "exp/pretrained.pt"
cd exp
ln -s pretrained.pt epoch-99.pt
popd
log "Export via torch.jit.trace()"
./pruned_transducer_stateless7_streaming/jit_trace_export.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--use-averaged-model 0 \
--epoch 99 \
--avg 1 \
--decode-chunk-len 32 \
--exp-dir $repo/exp/
log "Test exporting to ONNX format"
./pruned_transducer_stateless7_streaming/export-onnx.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--use-averaged-model 0 \
--epoch 99 \
--avg 1 \
--decode-chunk-len 32 \
--exp-dir $repo/exp/
ls -lh $repo/exp
log "Run onnx_check.py"
./pruned_transducer_stateless7_streaming/onnx_check.py \
--jit-encoder-filename $repo/exp/encoder_jit_trace.pt \
--jit-decoder-filename $repo/exp/decoder_jit_trace.pt \
--jit-joiner-filename $repo/exp/joiner_jit_trace.pt \
--onnx-encoder-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
--onnx-decoder-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
--onnx-joiner-filename $repo/exp/joiner-epoch-99-avg-1.onnx
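# onnx_check.py runs the torch.jit.trace() models and the exported ONNX models
# on the same inputs and verifies that their outputs agree within tolerance.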
log "Run onnx_pretrained.py"
./pruned_transducer_stateless7_streaming/onnx_pretrained.py \
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
--tokens $repo/data/lang_bpe_500/tokens.txt \
$repo/test_wavs/1089-134686-0001.wav
rm -rf $repo
log "--------------------------------------------------------------------------"
log "=========================================================================="
repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13
log "Downloading pre-trained model from $repo_url"
git lfs install
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
pushd $repo
git lfs pull --include "data/lang_bpe_500/bpe.model"
git lfs pull --include "exp/pretrained-iter-1224000-avg-14.pt"
cd exp
ln -s pretrained-iter-1224000-avg-14.pt epoch-9999.pt
popd
log "Export via torch.jit.script()"
./pruned_transducer_stateless3/export.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--epoch 9999 \
--avg 1 \
--exp-dir $repo/exp/ \
--jit 1
log "Test exporting to ONNX format"
./pruned_transducer_stateless3/export-onnx.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--epoch 9999 \
--avg 1 \
--exp-dir $repo/exp/
ls -lh $repo/exp
log "Run onnx_check.py"
./pruned_transducer_stateless3/onnx_check.py \
--jit-filename $repo/exp/cpu_jit.pt \
--onnx-encoder-filename $repo/exp/encoder-epoch-9999-avg-1.onnx \
--onnx-decoder-filename $repo/exp/decoder-epoch-9999-avg-1.onnx \
--onnx-joiner-filename $repo/exp/joiner-epoch-9999-avg-1.onnx
log "Run onnx_pretrained.py"
./pruned_transducer_stateless3/onnx_pretrained.py \
--encoder-model-filename $repo/exp/encoder-epoch-9999-avg-1.onnx \
--decoder-model-filename $repo/exp/decoder-epoch-9999-avg-1.onnx \
--joiner-model-filename $repo/exp/joiner-epoch-9999-avg-1.onnx \
--tokens $repo/data/lang_bpe_500/tokens.txt \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
rm -rf $repo
log "--------------------------------------------------------------------------"
log "=========================================================================="
repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless5-2022-05-13
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
pushd $repo
git lfs pull --include "data/lang_bpe_500/bpe.model"
git lfs pull --include "exp/pretrained-epoch-39-avg-7.pt"
cd exp
ln -s pretrained-epoch-39-avg-7.pt epoch-99.pt
popd
log "Export via torch.jit.script()"
./pruned_transducer_stateless5/export.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--epoch 99 \
--avg 1 \
--use-averaged-model 0 \
--exp-dir $repo/exp \
--num-encoder-layers 18 \
--dim-feedforward 2048 \
--nhead 8 \
--encoder-dim 512 \
--decoder-dim 512 \
--joiner-dim 512 \
--jit 1
log "Test exporting to ONNX format"
./pruned_transducer_stateless5/export-onnx.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--epoch 99 \
--avg 1 \
--use-averaged-model 0 \
--exp-dir $repo/exp \
--num-encoder-layers 18 \
--dim-feedforward 2048 \
--nhead 8 \
--encoder-dim 512 \
--decoder-dim 512 \
--joiner-dim 512
ls -lh $repo/exp
log "Run onnx_check.py"
./pruned_transducer_stateless5/onnx_check.py \
--jit-filename $repo/exp/cpu_jit.pt \
--onnx-encoder-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
--onnx-decoder-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
--onnx-joiner-filename $repo/exp/joiner-epoch-99-avg-1.onnx
log "Run onnx_pretrained.py"
./pruned_transducer_stateless5/onnx_pretrained.py \
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
--tokens $repo/data/lang_bpe_500/tokens.txt \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
rm -rf $repo
log "--------------------------------------------------------------------------"
log "=========================================================================="
repo_url=
rm -rf $repo
log "--------------------------------------------------------------------------"
repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
pushd $repo
git lfs pull --include "data/lang_bpe_500/bpe.model"
git lfs pull --include "exp/pretrained.pt"
cd exp
ln -s pretrained.pt epoch-99.pt
popd
log "Export via torch.jit.script()"
./pruned_transducer_stateless7/export.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--use-averaged-model 0 \
--epoch 99 \
--avg 1 \
--exp-dir $repo/exp \
--feedforward-dims "1024,1024,2048,2048,1024" \
--jit 1
log "Test exporting to ONNX format"
./pruned_transducer_stateless7/export-onnx.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--use-averaged-model 0 \
--epoch 99 \
--avg 1 \
--exp-dir $repo/exp \
--feedforward-dims "1024,1024,2048,2048,1024"
ls -lh $repo/exp
log "Run onnx_check.py"
./pruned_transducer_stateless7/onnx_check.py \
--jit-filename $repo/exp/cpu_jit.pt \
--onnx-encoder-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
--onnx-decoder-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
--onnx-joiner-filename $repo/exp/joiner-epoch-99-avg-1.onnx
log "Run onnx_pretrained.py"
./pruned_transducer_stateless7/onnx_pretrained.py \
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
--tokens $repo/data/lang_bpe_500/tokens.txt \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
log "=========================================================================="
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
pushd $repo
git lfs pull --include "data/lang_bpe_500/bpe.model"
git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt"
cd exp
ln -s pretrained-epoch-30-avg-10-averaged.pt epoch-99.pt
popd
log "Test exporting to ONNX format"
./conv_emformer_transducer_stateless2/export-onnx.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--use-averaged-model 0 \
--epoch 99 \
--avg 1 \
--exp-dir $repo/exp \
--num-encoder-layers 12 \
--chunk-length 32 \
--cnn-module-kernel 31 \
--left-context-length 32 \
--right-context-length 8 \
--memory-size 32
log "Run onnx_pretrained.py"
./conv_emformer_transducer_stateless2/onnx_pretrained.py \
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
--tokens $repo/data/lang_bpe_500/tokens.txt \
$repo/test_wavs/1221-135766-0001.wav
rm -rf $repo
log "--------------------------------------------------------------------------"
log "=========================================================================="
repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
pushd $repo
git lfs pull --include "data/lang_bpe_500/bpe.model"
git lfs pull --include "exp/pretrained-iter-468000-avg-16.pt"
cd exp
ln -s pretrained-iter-468000-avg-16.pt epoch-99.pt
popd
log "Export via torch.jit.trace()"
./lstm_transducer_stateless2/export.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--use-averaged-model 0 \
--epoch 99 \
--avg 1 \
--exp-dir $repo/exp/ \
--jit-trace 1
log "Test exporting to ONNX format"
./lstm_transducer_stateless2/export-onnx.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--use-averaged-model 0 \
--epoch 99 \
--avg 1 \
--exp-dir $repo/exp
ls -lh $repo/exp
log "Run onnx_check.py"
./lstm_transducer_stateless2/onnx_check.py \
--jit-encoder-filename $repo/exp/encoder_jit_trace.pt \
--jit-decoder-filename $repo/exp/decoder_jit_trace.pt \
--jit-joiner-filename $repo/exp/joiner_jit_trace.pt \
--onnx-encoder-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
--onnx-decoder-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
--onnx-joiner-filename $repo/exp/joiner-epoch-99-avg-1.onnx
log "Run onnx_pretrained.py"
./lstm_transducer_stateless2/onnx_pretrained.py \
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
--tokens $repo/data/lang_bpe_500/tokens.txt \
$repo/test_wavs/1221-135766-0001.wav
rm -rf $repo
log "--------------------------------------------------------------------------"

View File

@ -65,7 +65,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
-pip install --no-binary protobuf protobuf
+pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -73,7 +73,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
-key: cache-tmp-${{ matrix.python-version }}-2022-09-25
+key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -87,7 +87,7 @@ jobs:
GITHUB_EVENT_NAME: ${{ github.event_name }}
GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
run: |
-sudo apt-get -qq install git-lfs tree sox
+sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -64,7 +64,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
-pip install --no-binary protobuf protobuf
+pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -72,7 +72,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
-key: cache-tmp-${{ matrix.python-version }}-2022-09-25
+key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'

View File

@ -64,7 +64,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
-pip install --no-binary protobuf protobuf
+pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -72,7 +72,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
-key: cache-tmp-${{ matrix.python-version }}-2022-09-25
+key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -123,7 +123,7 @@ jobs:
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
-sudo apt-get -qq install git-lfs tree sox
+sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -64,7 +64,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
-pip install --no-binary protobuf protobuf
+pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -72,7 +72,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
-key: cache-tmp-${{ matrix.python-version }}-2022-09-25
+key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -123,7 +123,7 @@ jobs:
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
-sudo apt-get -qq install git-lfs tree sox
+sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -64,7 +64,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
-pip install --no-binary protobuf protobuf
+pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -72,7 +72,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
-key: cache-tmp-${{ matrix.python-version }}-2022-09-25
+key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -123,7 +123,7 @@ jobs:
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
-sudo apt-get -qq install git-lfs tree sox
+sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -39,7 +39,7 @@ concurrency:
jobs:
run_librispeech_2022_11_11_zipformer:
-if: github.event.label.name == 'onnx' || github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
runs-on: ${{ matrix.os }}
strategy:
matrix:
@ -64,7 +64,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
-pip install --no-binary protobuf protobuf
+pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -72,7 +72,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
-key: cache-tmp-${{ matrix.python-version }}-2022-09-25
+key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -123,7 +123,7 @@ jobs:
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
-sudo apt-get -qq install git-lfs tree sox
+sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -64,7 +64,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
-pip install --no-binary protobuf protobuf
+pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -72,7 +72,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
-key: cache-tmp-${{ matrix.python-version }}-2022-09-25
+key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -123,7 +123,7 @@ jobs:
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
-sudo apt-get -qq install git-lfs tree sox
+sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -60,7 +60,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
-pip install --no-binary protobuf protobuf
+pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -68,7 +68,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
-key: cache-tmp-${{ matrix.python-version }}-2022-09-25
+key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -119,7 +119,7 @@ jobs:
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
-sudo apt-get -qq install git-lfs tree sox
+sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -64,7 +64,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
-pip install --no-binary protobuf protobuf
+pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -72,7 +72,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
-key: cache-tmp-${{ matrix.python-version }}-2022-09-25
+key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -123,7 +123,7 @@ jobs:
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
-sudo apt-get -qq install git-lfs tree sox
+sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -35,7 +35,7 @@ on:
jobs:
run_librispeech_2022_12_15_zipformer_ctc_bs:
-if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event.label.name == 'blank-skip' || github.event_name == 'push' || github.event_name == 'schedule'
+if: github.event.label.name == 'run-decode' || github.event.label.name == 'blank-skip' || github.event_name == 'push' || github.event_name == 'schedule'
runs-on: ${{ matrix.os }}
strategy:
matrix:
@ -60,7 +60,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
-pip install --no-binary protobuf protobuf
+pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -68,7 +68,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
-key: cache-tmp-${{ matrix.python-version }}-2022-09-25
+key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -119,7 +119,7 @@ jobs:
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
-sudo apt-get -qq install git-lfs tree sox
+sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -64,7 +64,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
-pip install --no-binary protobuf protobuf
+pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -72,7 +72,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
-key: cache-tmp-${{ matrix.python-version }}-2022-09-25
+key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -123,7 +123,7 @@ jobs:
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
-sudo apt-get -qq install git-lfs tree sox
+sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -64,7 +64,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
-pip install --no-binary protobuf protobuf
+pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -72,7 +72,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
-key: cache-tmp-${{ matrix.python-version }}-2022-09-25
+key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -123,7 +123,7 @@ jobs:
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
-sudo apt-get -qq install git-lfs tree sox
+sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -22,7 +22,7 @@ concurrency:
jobs:
run_librispeech_lstm_transducer_stateless2_2022_09_03:
-if: github.event.label.name == 'ready' || github.event.label.name == 'LODR' || github.event.label.name == 'shallow-fusion' || github.event.label.name == 'ncnn' || github.event.label.name == 'onnx' || github.event_name == 'push' || github.event_name == 'schedule'
+if: github.event.label.name == 'ready' || github.event.label.name == 'LODR' || github.event.label.name == 'shallow-fusion' || github.event_name == 'push' || github.event_name == 'schedule'
runs-on: ${{ matrix.os }}
strategy:
matrix:
@ -47,7 +47,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
-pip install --no-binary protobuf protobuf
+pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -55,7 +55,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
-key: cache-tmp-${{ matrix.python-version }}-2022-09-25
+key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -106,7 +106,7 @@ jobs:
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
-sudo apt-get -qq install git-lfs tree sox
+sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -39,7 +39,7 @@ concurrency:
jobs:
run_librispeech_pruned_transducer_stateless3_2022_05_13:
-if: github.event.label.name == 'onnx' || github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
runs-on: ${{ matrix.os }}
strategy:
matrix:
@ -64,7 +64,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
-pip install --no-binary protobuf protobuf
+pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -72,7 +72,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
-key: cache-tmp-${{ matrix.python-version }}-2022-09-25
+key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -123,7 +123,7 @@ jobs:
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
-sudo apt-get -qq install git-lfs tree sox
+sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -64,7 +64,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
-pip install --no-binary protobuf protobuf
+pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -72,7 +72,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
-key: cache-tmp-${{ matrix.python-version }}-2022-09-25
+key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -123,7 +123,7 @@ jobs:
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
-sudo apt-get -qq install git-lfs tree sox
+sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -0,0 +1,174 @@
# Copyright 2022 Fangjun Kuang (csukuangfj@gmail.com)
# See ../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: run-librispeech-streaming-zipformer-2023-05-18
# zipformer
on:
push:
branches:
- master
pull_request:
types: [labeled]
schedule:
# minute (0-59)
# hour (0-23)
# day of the month (1-31)
# month (1-12)
# day of the week (0-6)
# nightly build at 15:50 UTC time every day
- cron: "50 15 * * *"
concurrency:
group: run_librispeech_2023_05_18_streaming_zipformer-${{ github.ref }}
cancel-in-progress: true
jobs:
run_librispeech_2023_05_18_streaming_zipformer:
if: github.event.label.name == 'zipformer' || github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest]
python-version: [3.8]
fail-fast: false
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
cache: 'pip'
cache-dependency-path: '**/requirements-ci.txt'
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
uses: actions/cache@v2
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/install-kaldifeat.sh
- name: Cache LibriSpeech test-clean and test-other datasets
id: libri-test-clean-and-test-other-data
uses: actions/cache@v2
with:
path: |
~/tmp/download
key: cache-libri-test-clean-and-test-other
- name: Download LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
- name: Prepare manifests for LibriSpeech test-clean and test-other
shell: bash
run: |
.github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
- name: Cache LibriSpeech test-clean and test-other fbank features
id: libri-test-clean-and-test-other-fbank
uses: actions/cache@v2
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
- name: Inference with pre-trained model
shell: bash
env:
GITHUB_EVENT_NAME: ${{ github.event_name }}
GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
run: |
mkdir -p egs/librispeech/ASR/data
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
.github/scripts/run-librispeech-streaming-zipformer-2023-05-18.sh
- name: Display decoding results for librispeech zipformer
if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
shell: bash
run: |
cd egs/librispeech/ASR/
tree ./zipformer/exp
cd zipformer
echo "results for zipformer, simulated streaming decoding"
echo "===greedy search==="
find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===fast_beam_search==="
find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===modified beam search==="
find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "results for zipformer, chunk-wise streaming decoding"
echo "===greedy search==="
find exp/streaming/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/streaming/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===fast_beam_search==="
find exp/streaming/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/streaming/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===modified beam search==="
find exp/streaming/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/streaming/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
- name: Upload decoding results for librispeech zipformer
uses: actions/upload-artifact@v2
if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
with:
name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-zipformer-2022-11-11
path: egs/librispeech/ASR/zipformer/exp/

View File

@ -64,7 +64,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -72,7 +72,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -123,7 +123,7 @@ jobs:
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
sudo apt-get -qq install git-lfs tree sox
sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -0,0 +1,159 @@
# Copyright 2022 Fangjun Kuang (csukuangfj@gmail.com)
# See ../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: run-librispeech-zipformer-2023-05-18
# zipformer
on:
push:
branches:
- master
pull_request:
types: [labeled]
schedule:
# minute (0-59)
# hour (0-23)
# day of the month (1-31)
# month (1-12)
# day of the week (0-6)
# nightly build at 15:50 UTC time every day
- cron: "50 15 * * *"
concurrency:
group: run_librispeech_2023_05_18_zipformer-${{ github.ref }}
cancel-in-progress: true
jobs:
run_librispeech_2023_05_18_zipformer:
if: github.event.label.name == 'zipformer' || github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest]
python-version: [3.8]
fail-fast: false
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
cache: 'pip'
cache-dependency-path: '**/requirements-ci.txt'
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
uses: actions/cache@v2
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/install-kaldifeat.sh
- name: Cache LibriSpeech test-clean and test-other datasets
id: libri-test-clean-and-test-other-data
uses: actions/cache@v2
with:
path: |
~/tmp/download
key: cache-libri-test-clean-and-test-other
- name: Download LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
- name: Prepare manifests for LibriSpeech test-clean and test-other
shell: bash
run: |
.github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
- name: Cache LibriSpeech test-clean and test-other fbank features
id: libri-test-clean-and-test-other-fbank
uses: actions/cache@v2
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
- name: Inference with pre-trained model
shell: bash
env:
GITHUB_EVENT_NAME: ${{ github.event_name }}
GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
run: |
mkdir -p egs/librispeech/ASR/data
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
.github/scripts/run-librispeech-zipformer-2023-05-18.sh
- name: Display decoding results for librispeech zipformer
if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
shell: bash
run: |
cd egs/librispeech/ASR/
tree ./zipformer/exp
cd zipformer
echo "results for zipformer"
echo "===greedy search==="
find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===fast_beam_search==="
find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===modified beam search==="
find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
- name: Upload decoding results for librispeech zipformer
uses: actions/upload-artifact@v2
if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
with:
name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-zipformer-2022-11-11
path: egs/librispeech/ASR/zipformer/exp/

View File

@ -0,0 +1,155 @@
# Copyright 2022 Fangjun Kuang (csukuangfj@gmail.com)
# See ../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: run-librispeech-zipformer-ctc-2023-06-14
# zipformer
on:
push:
branches:
- master
pull_request:
types: [labeled]
schedule:
# minute (0-59)
# hour (0-23)
# day of the month (1-31)
# month (1-12)
# day of the week (0-6)
# nightly build at 15:50 UTC time every day
- cron: "50 15 * * *"
concurrency:
group: run_librispeech_2023_06_14_zipformer-ctc-${{ github.ref }}
cancel-in-progress: true
jobs:
run_librispeech_2023_06_14_zipformer_ctc:
if: github.event.label.name == 'zipformer' || github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest]
python-version: [3.8]
fail-fast: false
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
cache: 'pip'
cache-dependency-path: '**/requirements-ci.txt'
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
uses: actions/cache@v2
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/install-kaldifeat.sh
- name: Cache LibriSpeech test-clean and test-other datasets
id: libri-test-clean-and-test-other-data
uses: actions/cache@v2
with:
path: |
~/tmp/download
key: cache-libri-test-clean-and-test-other
- name: Download LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
- name: Prepare manifests for LibriSpeech test-clean and test-other
shell: bash
run: |
.github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
- name: Cache LibriSpeech test-clean and test-other fbank features
id: libri-test-clean-and-test-other-fbank
uses: actions/cache@v2
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
- name: Inference with pre-trained model
shell: bash
env:
GITHUB_EVENT_NAME: ${{ github.event_name }}
GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
run: |
mkdir -p egs/librispeech/ASR/data
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
.github/scripts/run-librispeech-zipformer-ctc-2023-06-14.sh
- name: Display decoding results for librispeech zipformer
if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
shell: bash
run: |
cd egs/librispeech/ASR/
tree ./zipformer/exp
cd zipformer
echo "results for zipformer"
echo "===ctc-decoding==="
find exp/ctc-decoding -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/ctc-decoding -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===1best==="
find exp/1best -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find exp/1best -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
- name: Upload decoding results for librispeech zipformer
uses: actions/upload-artifact@v2
if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
with:
name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-zipformer-2022-11-11
path: egs/librispeech/ASR/zipformer/exp/

View File

@ -54,7 +54,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -62,7 +62,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -73,7 +73,7 @@ jobs:
- name: Inference with pre-trained model
shell: bash
run: |
sudo apt-get -qq install git-lfs tree sox
sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -63,7 +63,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -71,7 +71,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -122,7 +122,7 @@ jobs:
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
sudo apt-get -qq install git-lfs tree sox
sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -63,7 +63,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -71,7 +71,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -122,7 +122,7 @@ jobs:
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
sudo apt-get -qq install git-lfs tree sox
sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -54,7 +54,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -62,7 +62,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -73,7 +73,7 @@ jobs:
- name: Inference with pre-trained model
shell: bash
run: |
sudo apt-get -qq install git-lfs tree sox
sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -54,7 +54,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -62,7 +62,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -73,7 +73,7 @@ jobs:
- name: Inference with pre-trained model
shell: bash
run: |
sudo apt-get -qq install git-lfs tree sox
sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -63,7 +63,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -71,7 +71,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -122,7 +122,7 @@ jobs:
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
sudo apt-get -qq install git-lfs tree sox
sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -54,7 +54,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -62,7 +62,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -73,7 +73,7 @@ jobs:
- name: Inference with pre-trained model
shell: bash
run: |
sudo apt-get -qq install git-lfs tree sox
sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -47,7 +47,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | grep -v kaldifst | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
pip install --no-binary protobuf protobuf==3.20.*
- name: Prepare data
shell: bash

View File

@ -54,7 +54,7 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -62,7 +62,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -76,7 +76,7 @@ jobs:
GITHUB_EVENT_NAME: ${{ github.event_name }}
GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
run: |
sudo apt-get -qq install git-lfs tree sox
sudo apt-get -qq install git-lfs tree
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

View File

@ -35,7 +35,7 @@ jobs:
matrix:
# os: [ubuntu-18.04, macos-10.15]
# TODO: enable macOS for CPU testing
os: [ubuntu-18.04]
os: [ubuntu-latest]
python-version: [3.8]
fail-fast: false
@ -67,7 +67,9 @@ jobs:
run: |
grep -v '^#' ./requirements-ci.txt | grep -v kaldifst | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
pip install --no-binary protobuf protobuf==3.20.*
pip install --no-deps --force-reinstall https://huggingface.co/csukuangfj/k2/resolve/main/cpu/k2-1.24.3.dev20230508+cpu.torch1.13.1-cp38-cp38-linux_x86_64.whl
- name: Run yesno recipe
shell: bash

View File

@ -1,4 +1,4 @@
name: run-librispeech-conv-emformer-transducer-stateless2-2022-12-05
name: test-ncnn-export
on:
push:
@ -16,15 +16,18 @@ on:
# nightly build at 15:50 UTC time every day
- cron: "50 15 * * *"
concurrency:
group: test_ncnn_export-${{ github.ref }}
cancel-in-progress: true
jobs:
run_librispeech_conv_emformer_transducer_stateless2_2022_12_05:
test_ncnn_export:
if: github.event.label.name == 'ready' || github.event.label.name == 'ncnn' || github.event_name == 'push' || github.event_name == 'schedule'
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest]
python-version: [3.8]
fail-fast: false
steps:
@ -41,9 +44,9 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | grep -v kaldifst | xargs -n 1 -L 1 pip install
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
@ -51,7 +54,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
@ -59,19 +62,14 @@ jobs:
run: |
.github/scripts/install-kaldifeat.sh
- name: Inference with pre-trained model
- name: Test ncnn export
shell: bash
env:
GITHUB_EVENT_NAME: ${{ github.event_name }}
GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
run: |
mkdir -p egs/librispeech/ASR/data
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
sudo apt-get -qq install git-lfs tree sox
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
.github/scripts/run-librispeech-conv-emformer-transducer-stateless2-2022-12-05.sh
.github/scripts/test-ncnn-export.sh

75
.github/workflows/test-onnx-export.yml vendored Normal file
View File

@ -0,0 +1,75 @@
name: test-onnx-export
on:
push:
branches:
- master
pull_request:
types: [labeled]
schedule:
# minute (0-59)
# hour (0-23)
# day of the month (1-31)
# month (1-12)
# day of the week (0-6)
# nightly build at 15:50 UTC time every day
- cron: "50 15 * * *"
concurrency:
group: test_onnx_export-${{ github.ref }}
cancel-in-progress: true
jobs:
test_onnx_export:
if: github.event.label.name == 'ready' || github.event.label.name == 'onnx' || github.event_name == 'push' || github.event_name == 'schedule'
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest]
python-version: [3.8]
fail-fast: false
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
cache: 'pip'
cache-dependency-path: '**/requirements-ci.txt'
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf==3.20.*
- name: Cache kaldifeat
id: my-cache
uses: actions/cache@v2
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}-2023-05-22
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/install-kaldifeat.sh
- name: Test ONNX export
shell: bash
env:
GITHUB_EVENT_NAME: ${{ github.event_name }}
GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
run: |
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
.github/scripts/test-onnx-export.sh

View File

@ -56,7 +56,7 @@ jobs:
run: |
sudo apt update
sudo apt install -q -y libsndfile1-dev libsndfile1 ffmpeg
sudo apt install -q -y --fix-missing sox libsox-dev libsox-fmt-all
sudo apt install -q -y --fix-missing libsox-dev libsox-fmt-all
- name: Install Python dependencies
run: |
@ -70,7 +70,7 @@ jobs:
pip install git+https://github.com/lhotse-speech/lhotse
# icefall requirements
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
pip install --no-binary protobuf protobuf==3.20.*
pip install kaldifst
pip install onnxruntime
@ -113,14 +113,15 @@ jobs:
cd ../pruned_transducer_stateless4
pytest -v -s
echo $PYTHONPATH
cd ../pruned_transducer_stateless7
pytest -v -s
cd ../transducer_stateless
pytest -v -s
cd ../transducer
pytest -v -s
# cd ../transducer
# pytest -v -s
cd ../transducer_stateless2
pytest -v -s
@ -157,8 +158,8 @@ jobs:
cd ../transducer_stateless
pytest -v -s
cd ../transducer
pytest -v -s
# cd ../transducer
# pytest -v -s
cd ../transducer_stateless2
pytest -v -s

View File

@ -26,7 +26,7 @@ repos:
# E121,E123,E126,E226,E24,E704,W503,W504
- repo: https://github.com/pycqa/isort
rev: 5.10.1
rev: 5.11.5
hooks:
- id: isort
args: ["--profile=black"]

View File

@ -1,13 +1,4 @@
Legal Notices
NOTE (this is not from the Apache License): The copyright model is that
authors (or their employers, if noted in individual files) own their
individual contributions. The authors' contributions can be discerned
from the git history.
-------------------------------------------------------------------------
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

161
README.md
View File

@ -28,14 +28,15 @@ We provide the following recipes:
- [yesno][yesno]
- [LibriSpeech][librispeech]
- [GigaSpeech][gigaspeech]
- [Aishell][aishell]
- [Aishell2][aishell2]
- [Aishell4][aishell4]
- [TIMIT][timit]
- [TED-LIUM3][tedlium3]
- [GigaSpeech][gigaspeech]
- [Aidatatang_200zh][aidatatang_200zh]
- [WenetSpeech][wenetspeech]
- [Alimeeting][alimeeting]
- [Aishell4][aishell4]
- [TAL_CSASR][tal_csasr]
### yesno
@ -46,9 +47,7 @@ Training takes less than 30 seconds and gives you the following WER:
```
[test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
```
We do provide a Colab notebook for this recipe.
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tIjjzaJc3IvGyKiMCDWO-TSnBgkcuN3B?usp=sharing)
We provide a Colab notebook for this recipe: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tIjjzaJc3IvGyKiMCDWO-TSnBgkcuN3B?usp=sharing)
### LibriSpeech
@ -56,12 +55,13 @@ We do provide a Colab notebook for this recipe.
Please see <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>
for the **latest** results.
We provide 4 models for this recipe:
We provide 5 models for this recipe:
- [conformer CTC model][LibriSpeech_conformer_ctc]
- [TDNN LSTM CTC model][LibriSpeech_tdnn_lstm_ctc]
- [Transducer: Conformer encoder + LSTM decoder][LibriSpeech_transducer]
- [Transducer: Conformer encoder + Embedding decoder][LibriSpeech_transducer_stateless]
- [Transducer: Zipformer encoder + Embedding decoder][LibriSpeech_zipformer]
#### Conformer CTC Model
@ -82,7 +82,7 @@ The WER for this model is:
|-----|------------|------------|
| WER | 6.59 | 17.69 |
We provide a Colab notebook to run a pre-trained TDNN LSTM CTC model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1-iSfQMp2So-We_Uu49N4AAcMInB72u9z?usp=sharing)
We provide a Colab notebook to run a pre-trained TDNN LSTM CTC model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1-iSfQMp2So-We_Uu49N4AAcMInB72u9z?usp=sharing)
#### Transducer: Conformer encoder + LSTM decoder
@ -116,21 +116,58 @@ We provide a Colab notebook to run a pre-trained transducer conformer + stateles
#### k2 pruned RNN-T
| | test-clean | test-other |
|-----|------------|------------|
| WER | 2.57 | 5.95 |
| Encoder | Params | test-clean | test-other |
|-----------------|--------|------------|------------|
| zipformer | 65.5M | 2.21 | 4.91 |
| zipformer-small | 23.2M | 2.46 | 5.83 |
| zipformer-large | 148.4M | 2.11 | 4.77 |
Note: No auxiliary losses are used in the training and no LMs are used
in the decoding.
#### k2 pruned RNN-T + GigaSpeech
| | test-clean | test-other |
|-----|------------|------------|
| WER | 2.00 | 4.63 |
| WER | 1.78 | 4.08 |
Note: No auxiliary losses are used in the training and no LMs are used
in the decoding.
#### k2 pruned RNN-T + GigaSpeech + CommonVoice
| | test-clean | test-other |
|-----|------------|------------|
| WER | 1.90 | 3.98 |
Note: No auxiliary losses are used in the training and no LMs are used
in the decoding.
### GigaSpeech
We provide two models for this recipe: [Conformer CTC model][GigaSpeech_conformer_ctc]
and [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][GigaSpeech_pruned_transducer_stateless2].
#### Conformer CTC
| | Dev | Test |
|-----|-------|-------|
| WER | 10.47 | 10.58 |
#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss
| | Dev | Test |
|----------------------|-------|-------|
| greedy search | 10.51 | 10.73 |
| fast beam search | 10.50 | 10.69 |
| modified beam search | 10.40 | 10.51 |
### Aishell
We provide two models for this recipe: [conformer CTC model][Aishell_conformer_ctc]
and [TDNN LSTM CTC model][Aishell_tdnn_lstm_ctc].
We provide three models for this recipe: [conformer CTC model][Aishell_conformer_ctc],
[TDNN LSTM CTC model][Aishell_tdnn_lstm_ctc], and [Transducer Stateless Model][Aishell_pruned_transducer_stateless7].
#### Conformer CTC Model
@ -140,20 +177,6 @@ The best CER we currently have is:
|-----|------|
| CER | 4.26 |
We provide a Colab notebook to run a pre-trained conformer CTC model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1WnG17io5HEZ0Gn_cnh_VzK5QYOoiiklC?usp=sharing)
#### Transducer Stateless Model
The best CER we currently have is:
| | test |
|-----|------|
| CER | 4.68 |
We provide a Colab notebook to run a pre-trained TransducerStateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/14XaT2MhnBkK-3_RqqWq3K90Xlbin-GZC?usp=sharing)
#### TDNN LSTM CTC Model
The CER for this model is:
@ -164,6 +187,46 @@ The CER for this model is:
We provide a Colab notebook to run a pre-trained TDNN LSTM CTC model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1jbyzYq3ytm6j2nlEt-diQm-6QVWyDDEa?usp=sharing)
#### Transducer Stateless Model
The best CER we currently have is:
| | test |
|-----|------|
| CER | 4.38 |
We provide a Colab notebook to run a pre-trained TransducerStateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/14XaT2MhnBkK-3_RqqWq3K90Xlbin-GZC?usp=sharing)
### Aishell2
We provide one model for this recipe: [Transducer Stateless Model][Aishell2_pruned_transducer_stateless5].
#### Transducer Stateless Model
The best WER we currently have is:
| | dev-ios | test-ios |
|-----|------------|------------|
| WER | 5.32 | 5.56 |
### Aishell4
We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][Aishell4_pruned_transducer_stateless5].
#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss (trained with all subsets)
The best CER we currently have is:
| | test |
|-----|------------|
| CER | 29.08 |
We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1z3lkURVv9M7uTiIgf3Np9IntMHEknaks?usp=sharing)
### TIMIT
We provide two models for this recipe: [TDNN LSTM CTC model][TIMIT_tdnn_lstm_ctc]
@ -187,7 +250,8 @@ The PER for this model is:
|--|--|
|PER| 17.66% |
We provide a Colab notebook to run a pre-trained TDNN LiGRU CTC model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/11IT-k4HQIgQngXz1uvWsEYktjqQt7Tmb?usp=sharing)
We provide a Colab notebook to run a pre-trained TDNN LiGRU CTC model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1z3lkURVv9M7uTiIgf3Np9IntMHEknaks?usp=sharing)
### TED-LIUM3
@ -215,24 +279,6 @@ The best WER using modified beam search with beam size 4 is:
We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1je_1zGrOkGVVd4WLzgkXRHxl-I27yWtz?usp=sharing)
### GigaSpeech
We provide two models for this recipe: [Conformer CTC model][GigaSpeech_conformer_ctc]
and [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][GigaSpeech_pruned_transducer_stateless2].
#### Conformer CTC
| | Dev | Test |
|-----|-------|-------|
| WER | 10.47 | 10.58 |
#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss
| | Dev | Test |
|----------------------|-------|-------|
| greedy search | 10.51 | 10.73 |
| fast beam search | 10.50 | 10.69 |
| modified beam search | 10.40 | 10.51 |
### Aidatatang_200zh
@ -248,6 +294,7 @@ We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder
We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wNSnSj3T5oOctbh5IGCa393gKOoQw2GH?usp=sharing)
### WenetSpeech
We provide some models for this recipe: [Pruned stateless RNN-T_2: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][WenetSpeech_pruned_transducer_stateless2] and [Pruned stateless RNN-T_5: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][WenetSpeech_pruned_transducer_stateless5].
@ -284,20 +331,6 @@ We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder
We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tKr3f0mL17uO_ljdHGKtR7HOmthYHwJG?usp=sharing)
### Aishell4
We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][Aishell4_pruned_transducer_stateless5].
#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss (trained with all subsets)
The best CER(%) results:
| | test |
|----------------------|--------|
| greedy search | 29.89 |
| fast beam search | 28.91 |
| modified beam search | 29.08 |
We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1z3lkURVv9M7uTiIgf3Np9IntMHEknaks?usp=sharing)
### TAL_CSASR
@ -331,8 +364,12 @@ Please see: [![Open In Colab](https://colab.research.google.com/assets/colab-bad
[LibriSpeech_conformer_ctc]: egs/librispeech/ASR/conformer_ctc
[LibriSpeech_transducer]: egs/librispeech/ASR/transducer
[LibriSpeech_transducer_stateless]: egs/librispeech/ASR/transducer_stateless
[LibriSpeech_zipformer]: egs/librispeech/ASR/zipformer
[Aishell_tdnn_lstm_ctc]: egs/aishell/ASR/tdnn_lstm_ctc
[Aishell_conformer_ctc]: egs/aishell/ASR/conformer_ctc
[Aishell_pruned_transducer_stateless7]: egs/aishell/ASR/pruned_transducer_stateless7_bbpe
[Aishell2_pruned_transducer_stateless5]: egs/aishell2/ASR/pruned_transducer_stateless5
[Aishell4_pruned_transducer_stateless5]: egs/aishell4/ASR/pruned_transducer_stateless5
[TIMIT_tdnn_lstm_ctc]: egs/timit/ASR/tdnn_lstm_ctc
[TIMIT_tdnn_ligru_ctc]: egs/timit/ASR/tdnn_ligru_ctc
[TED-LIUM3_transducer_stateless]: egs/tedlium3/ASR/transducer_stateless
@ -343,17 +380,17 @@ Please see: [![Open In Colab](https://colab.research.google.com/assets/colab-bad
[WenetSpeech_pruned_transducer_stateless2]: egs/wenetspeech/ASR/pruned_transducer_stateless2
[WenetSpeech_pruned_transducer_stateless5]: egs/wenetspeech/ASR/pruned_transducer_stateless5
[Alimeeting_pruned_transducer_stateless2]: egs/alimeeting/ASR/pruned_transducer_stateless2
[Aishell4_pruned_transducer_stateless5]: egs/aishell4/ASR/pruned_transducer_stateless5
[TAL_CSASR_pruned_transducer_stateless5]: egs/tal_csasr/ASR/pruned_transducer_stateless5
[yesno]: egs/yesno/ASR
[librispeech]: egs/librispeech/ASR
[aishell]: egs/aishell/ASR
[aishell2]: egs/aishell2/ASR
[aishell4]: egs/aishell4/ASR
[timit]: egs/timit/ASR
[tedlium3]: egs/tedlium3/ASR
[gigaspeech]: egs/gigaspeech/ASR
[aidatatang_200zh]: egs/aidatatang_200zh/ASR
[wenetspeech]: egs/wenetspeech/ASR
[alimeeting]: egs/alimeeting/ASR
[aishell4]: egs/aishell4/ASR
[tal_csasr]: egs/tal_csasr/ASR
[k2]: https://github.com/k2-fsa/k2

View File

@ -78,3 +78,15 @@ html_context = {
}
todo_include_todos = True
rst_epilog = """
.. _sherpa-ncnn: https://github.com/k2-fsa/sherpa-ncnn
.. _sherpa-onnx: https://github.com/k2-fsa/sherpa-onnx
.. _icefall: https://github.com/k2-fsa/icefall
.. _git-lfs: https://git-lfs.com/
.. _ncnn: https://github.com/tencent/ncnn
.. _LibriSpeech: https://www.openslr.org/12
.. _musan: http://www.openslr.org/17/
.. _ONNX: https://github.com/onnx/onnx
.. _onnxruntime: https://github.com/microsoft/onnxruntime
"""

107
docs/source/faqs.rst Normal file
View File

@ -0,0 +1,107 @@
Frequently Asked Questions (FAQs)
=================================
In this section, we collect issues reported by users and post the corresponding
solutions.
OSError: libtorch_hip.so: cannot open shared object file: no such file or directory
-----------------------------------------------------------------------------------
One user used the following command to install ``torch`` and ``torchaudio``:
.. code-block:: bash
pip install \
torch==1.10.0+cu111 \
torchvision==0.11.0+cu111 \
torchaudio==0.10.0 \
-f https://download.pytorch.org/whl/torch_stable.html
and it throws the following error when running ``tdnn/train.py``:
.. code-block::
OSError: libtorch_hip.so: cannot open shared object file: no such file or directory
The fix is to specify the CUDA version while installing ``torchaudio``. That
is, change ``torchaudio==0.10.0`` to ``torchaudio==0.10.0+cu111``. Therefore,
the correct command is:
.. code-block:: bash
pip install \
torch==1.10.0+cu111 \
torchvision==0.11.0+cu111 \
torchaudio==0.10.0+cu111 \
-f https://download.pytorch.org/whl/torch_stable.html
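After reinstalling, you can check that the CUDA builds were actually picked up; both version strings should end with ``+cu111``:
.. code-block:: bash
# both should print a version ending in +cu111
python3 -c "import torch; print(torch.__version__)"
python3 -c "import torchaudio; print(torchaudio.__version__)"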
AttributeError: module 'distutils' has no attribute 'version'
-------------------------------------------------------------
The error log is:
.. code-block::
Traceback (most recent call last):
File "./tdnn/train.py", line 14, in <module>
from asr_datamodule import YesNoAsrDataModule
File "/home/xxx/code/next-gen-kaldi/icefall/egs/yesno/ASR/tdnn/asr_datamodule.py", line 34, in <module>
from icefall.dataset.datamodule import DataModule
File "/home/xxx/code/next-gen-kaldi/icefall/icefall/__init__.py", line 3, in <module>
from . import (
File "/home/xxx/code/next-gen-kaldi/icefall/icefall/decode.py", line 23, in <module>
from icefall.utils import add_eos, add_sos, get_texts
File "/home/xxx/code/next-gen-kaldi/icefall/icefall/utils.py", line 39, in <module>
from torch.utils.tensorboard import SummaryWriter
File "/home/xxx/tool/miniconda3/envs/yyy/lib/python3.8/site-packages/torch/utils/tensorboard/__init__.py", line 4, in <module>
LooseVersion = distutils.version.LooseVersion
AttributeError: module 'distutils' has no attribute 'version'
The fix is:
.. code-block:: bash
pip uninstall setuptools
pip install setuptools==58.0.4
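Before retrying ``tdnn/train.py``, you can confirm that the downgrade took effect:
.. code-block:: bash
# should print 58.0.4
python3 -c "import setuptools; print(setuptools.__version__)"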
ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory
--------------------------------------------------------------------------------------------
If you are using ``conda`` and encounter the following issue:
.. code-block::
Traceback (most recent call last):
File "/k2-dev/yangyifan/anaconda3/envs/icefall/lib/python3.10/site-packages/k2-1.23.3.dev20230112+cuda11.6.torch1.13.1-py3.10-linux-x86_64.egg/k2/__init__.py", line 24, in <module>
from _k2 import DeterminizeWeightPushingType
ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/k2-dev/yangyifan/icefall/egs/librispeech/ASR/./pruned_transducer_stateless7_ctc_bs/decode.py", line 104, in <module>
import k2
File "/k2-dev/yangyifan/anaconda3/envs/icefall/lib/python3.10/site-packages/k2-1.23.3.dev20230112+cuda11.6.torch1.13.1-py3.10-linux-x86_64.egg/k2/__init__.py", line 30, in <module>
raise ImportError(
ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory
Note: If you're using anaconda and importing k2 on MacOS,
you can probably fix this by setting the environment variable:
export DYLD_LIBRARY_PATH=$CONDA_PREFIX/lib/python3.10/site-packages:$DYLD_LIBRARY_PATH
Please first try to find where ``libpython3.10.so.1.0`` is located.
For instance,
.. code-block:: bash
cd $CONDA_PREFIX/lib
find . -name "libpython*"
If you are able to find it inside ``$CONDA_PREFIX/lib``, please set the
following environment variable:
.. code-block:: bash
export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH
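With the variable set, importing ``k2`` in the same shell should no longer fail; a quick check:
.. code-block:: bash
# no ImportError means the shared library is now found
python3 -c "import k2; print(k2.__file__)"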

View File

@ -21,6 +21,7 @@ speech recognition recipes using `k2 <https://github.com/k2-fsa/k2>`_.
:caption: Contents:
installation/index
faqs
model-export/index
.. toctree::

View File

@ -3,64 +3,91 @@
Installation
============
- |os|
- |device|
- |python_versions|
- |torch_versions|
- |k2_versions|
.. |os| image:: ./images/os-Linux_macOS-ff69b4.svg
:alt: Supported operating systems
.. |device| image:: ./images/device-CPU_CUDA-orange.svg
:alt: Supported devices
.. |python_versions| image:: ./images/python-gt-v3.6-blue.svg
:alt: Supported python versions
.. |torch_versions| image:: ./images/torch-gt-v1.6.0-green.svg
:alt: Supported PyTorch versions
.. |k2_versions| image:: ./images/k2-gt-v1.9-blueviolet.svg
:alt: Supported k2 versions
``icefall`` depends on `k2 <https://github.com/k2-fsa/k2>`_ and
`lhotse <https://github.com/lhotse-speech/lhotse>`_.
We recommend you to use the following steps to install the dependencies.
We recommend that you use the following steps to install the dependencies.
- (0) Install PyTorch and torchaudio
- (1) Install k2
- (2) Install lhotse
- (0) Install CUDA toolkit and cuDNN
- (1) Install PyTorch and torchaudio
- (2) Install k2
- (3) Install lhotse
.. caution::
99% of users who have issues with the installation are using conda.
.. caution::
99% of users who have issues with the installation are using conda.
.. caution::
99% of users who have issues with the installation are using conda.
.. hint::
We suggest that you use ``pip install`` to install PyTorch.
You can use the following command to create a virtual environment in Python:
.. code-block:: bash
python3 -m venv ./my_env
source ./my_env/bin/activate
.. caution::
Installation order matters.
(0) Install PyTorch and torchaudio
(0) Install CUDA toolkit and cuDNN
----------------------------------
Please refer to
`<https://k2-fsa.github.io/k2/installation/cuda-cudnn.html>`_
to install CUDA and cuDNN.
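Once they are installed, a quick sanity check (assuming ``nvcc`` is on your ``PATH``) is:
.. code-block:: bash
# prints the CUDA compiler version, e.g. "release 11.1"
nvcc --version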
(1) Install PyTorch and torchaudio
----------------------------------
Please refer to `<https://pytorch.org/>`_ to install PyTorch
and torchaudio.
.. hint::
(1) Install k2
You can also go to `<https://download.pytorch.org/whl/torch_stable.html>`_
to download pre-compiled wheels and install them.
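For example, a CUDA 11.1 setup using the versions from the FAQs section would be installed like this (a sketch; adjust the versions to match your CUDA toolkit):
.. code-block:: bash
# install torch and torchaudio together, with matching +cu111 builds
pip install \
torch==1.10.0+cu111 \
torchaudio==0.10.0+cu111 \
-f https://download.pytorch.org/whl/torch_stable.html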
.. caution::
Please install torch and torchaudio at the same time.
(2) Install k2
--------------
Please refer to `<https://k2-fsa.github.io/k2/installation/index.html>`_
to install ``k2``.
.. CAUTION::
.. caution::
You need to install ``k2`` with a version at least **v1.9**.
Please don't change your installed PyTorch after you have installed k2.
.. HINT::
.. note::
If you have already installed PyTorch and don't want to replace it,
please install a version of ``k2`` that is compiled against the version
of PyTorch you are using.
We suggest that you install k2 from source by following
`<https://k2-fsa.github.io/k2/installation/from_source.html>`_
or
`<https://k2-fsa.github.io/k2/installation/for_developers.html>`_.
(2) Install lhotse
.. hint::
Please always install the latest version of k2.
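To see which k2 you ended up with, and the PyTorch/CUDA it was built against, you can use k2's version helper:
.. code-block:: bash
python3 -m k2.version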
(3) Install lhotse
------------------
Please refer to `<https://lhotse.readthedocs.io/en/latest/getting-started.html#installation>`_
@ -75,8 +102,7 @@ to install ``lhotse``.
to install the latest version of lhotse.
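For reference, the CI workflows in this repository install the latest lhotse with:
.. code-block:: bash
pip install git+https://github.com/lhotse-speech/lhotse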
(3) Download icefall
(4) Download icefall
--------------------
``icefall`` is a collection of Python scripts; all you need to do is download it
@ -338,44 +364,42 @@ The log of running ``./prepare.sh`` is:
.. code-block::
2021-08-23 19:27:26 (prepare.sh:24:main) dl_dir: /tmp/icefall/egs/yesno/ASR/download
2021-08-23 19:27:26 (prepare.sh:27:main) stage 0: Download data
Downloading waves_yesno.tar.gz: 4.49MB [00:03, 1.39MB/s]
2021-08-23 19:27:30 (prepare.sh:36:main) Stage 1: Prepare yesno manifest
2021-08-23 19:27:31 (prepare.sh:42:main) Stage 2: Compute fbank for yesno
2021-08-23 19:27:32,803 INFO [compute_fbank_yesno.py:52] Processing train
Extracting and storing features: 100%|_______________________________________________________________| 90/90 [00:01<00:00, 80.57it/s]
2021-08-23 19:27:34,085 INFO [compute_fbank_yesno.py:52] Processing test
Extracting and storing features: 100%|______________________________________________________________| 30/30 [00:00<00:00, 248.21it/s]
2021-08-23 19:27:34 (prepare.sh:48:main) Stage 3: Prepare lang
2021-08-23 19:27:35 (prepare.sh:63:main) Stage 4: Prepare G
/tmp/pip-install-fcordre9/kaldilm_6899d26f2d684ad48f21025950cd2866/kaldilm/csrc/arpa_file_parser.cc:void kaldilm::ArpaFileParser::Rea
d(std::istream&):79
[I] Reading \data\ section.
/tmp/pip-install-fcordre9/kaldilm_6899d26f2d684ad48f21025950cd2866/kaldilm/csrc/arpa_file_parser.cc:void kaldilm::ArpaFileParser::Rea
d(std::istream&):140
[I] Reading \1-grams: section.
2021-08-23 19:27:35 (prepare.sh:89:main) Stage 5: Compile HLG
2021-08-23 19:27:35,928 INFO [compile_hlg.py:120] Processing data/lang_phone
2021-08-23 19:27:35,929 INFO [lexicon.py:116] Converting L.pt to Linv.pt
2021-08-23 19:27:35,931 INFO [compile_hlg.py:48] Building ctc_topo. max_token_id: 3
2021-08-23 19:27:35,932 INFO [compile_hlg.py:52] Loading G.fst.txt
2021-08-23 19:27:35,932 INFO [compile_hlg.py:62] Intersecting L and G
2021-08-23 19:27:35,933 INFO [compile_hlg.py:64] LG shape: (4, None)
2021-08-23 19:27:35,933 INFO [compile_hlg.py:66] Connecting LG
2021-08-23 19:27:35,933 INFO [compile_hlg.py:68] LG shape after k2.connect: (4, None)
2021-08-23 19:27:35,933 INFO [compile_hlg.py:70] <class 'torch.Tensor'>
2021-08-23 19:27:35,933 INFO [compile_hlg.py:71] Determinizing LG
2021-08-23 19:27:35,934 INFO [compile_hlg.py:74] <class '_k2.RaggedInt'>
2021-08-23 19:27:35,934 INFO [compile_hlg.py:76] Connecting LG after k2.determinize
2021-08-23 19:27:35,934 INFO [compile_hlg.py:79] Removing disambiguation symbols on LG
2021-08-23 19:27:35,934 INFO [compile_hlg.py:87] LG shape after k2.remove_epsilon: (6, None)
2021-08-23 19:27:35,935 INFO [compile_hlg.py:92] Arc sorting LG
2021-08-23 19:27:35,935 INFO [compile_hlg.py:95] Composing H and LG
2021-08-23 19:27:35,935 INFO [compile_hlg.py:102] Connecting LG
2021-08-23 19:27:35,935 INFO [compile_hlg.py:105] Arc sorting LG
2021-08-23 19:27:35,936 INFO [compile_hlg.py:107] HLG.shape: (8, None)
2021-08-23 19:27:35,936 INFO [compile_hlg.py:123] Saving HLG.pt to data/lang_phone
2023-05-12 17:55:21 (prepare.sh:27:main) dl_dir: /tmp/icefall/egs/yesno/ASR/download
2023-05-12 17:55:21 (prepare.sh:30:main) Stage 0: Download data
/tmp/icefall/egs/yesno/ASR/download/waves_yesno.tar.gz: 100%|_______________________________________________________________| 4.70M/4.70M [06:54<00:00, 11.4kB/s]
2023-05-12 18:02:19 (prepare.sh:39:main) Stage 1: Prepare yesno manifest
2023-05-12 18:02:21 (prepare.sh:45:main) Stage 2: Compute fbank for yesno
2023-05-12 18:02:23,199 INFO [compute_fbank_yesno.py:65] Processing train
Extracting and storing features: 100%|_______________________________________________________________| 90/90 [00:00<00:00, 212.60it/s]
2023-05-12 18:02:23,640 INFO [compute_fbank_yesno.py:65] Processing test
Extracting and storing features: 100%|_______________________________________________________________| 30/30 [00:00<00:00, 304.53it/s]
2023-05-12 18:02:24 (prepare.sh:51:main) Stage 3: Prepare lang
2023-05-12 18:02:26 (prepare.sh:66:main) Stage 4: Prepare G
/project/kaldilm/csrc/arpa_file_parser.cc:void kaldilm::ArpaFileParser::Read(std::istream&):79
[I] Reading \data\ section.
/project/kaldilm/csrc/arpa_file_parser.cc:void kaldilm::ArpaFileParser::Read(std::istream&):140
[I] Reading \1-grams: section.
2023-05-12 18:02:26 (prepare.sh:92:main) Stage 5: Compile HLG
2023-05-12 18:02:28,581 INFO [compile_hlg.py:124] Processing data/lang_phone
2023-05-12 18:02:28,582 INFO [lexicon.py:171] Converting L.pt to Linv.pt
2023-05-12 18:02:28,609 INFO [compile_hlg.py:48] Building ctc_topo. max_token_id: 3
2023-05-12 18:02:28,610 INFO [compile_hlg.py:52] Loading G.fst.txt
2023-05-12 18:02:28,611 INFO [compile_hlg.py:62] Intersecting L and G
2023-05-12 18:02:28,613 INFO [compile_hlg.py:64] LG shape: (4, None)
2023-05-12 18:02:28,613 INFO [compile_hlg.py:66] Connecting LG
2023-05-12 18:02:28,614 INFO [compile_hlg.py:68] LG shape after k2.connect: (4, None)
2023-05-12 18:02:28,614 INFO [compile_hlg.py:70] <class 'torch.Tensor'>
2023-05-12 18:02:28,614 INFO [compile_hlg.py:71] Determinizing LG
2023-05-12 18:02:28,615 INFO [compile_hlg.py:74] <class '_k2.ragged.RaggedTensor'>
2023-05-12 18:02:28,615 INFO [compile_hlg.py:76] Connecting LG after k2.determinize
2023-05-12 18:02:28,615 INFO [compile_hlg.py:79] Removing disambiguation symbols on LG
2023-05-12 18:02:28,616 INFO [compile_hlg.py:91] LG shape after k2.remove_epsilon: (6, None)
2023-05-12 18:02:28,617 INFO [compile_hlg.py:96] Arc sorting LG
2023-05-12 18:02:28,617 INFO [compile_hlg.py:99] Composing H and LG
2023-05-12 18:02:28,619 INFO [compile_hlg.py:106] Connecting LG
2023-05-12 18:02:28,619 INFO [compile_hlg.py:109] Arc sorting LG
2023-05-12 18:02:28,619 INFO [compile_hlg.py:111] HLG.shape: (8, None)
2023-05-12 18:02:28,619 INFO [compile_hlg.py:127] Saving HLG.pt to data/lang_phone
Training
@ -408,49 +432,53 @@ The training log is given below:
.. code-block::
2021-08-23 19:30:31,072 INFO [train.py:465] Training started
2021-08-23 19:30:31,072 INFO [train.py:466] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01,
'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0, 'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, '
best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'valid_interval': 10, 'beam_size': 10, 'reduction': 'sum', 'use_doub
le_scores': True, 'world_size': 1, 'master_port': 12354, 'tensorboard': True, 'num_epochs': 15, 'feature_dir': PosixPath('data/fbank'
), 'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0
, 'on_the_fly_feats': False, 'shuffle': True, 'return_cuts': True, 'num_workers': 2}
2021-08-23 19:30:31,074 INFO [lexicon.py:113] Loading pre-compiled data/lang_phone/Linv.pt
2021-08-23 19:30:31,098 INFO [asr_datamodule.py:146] About to get train cuts
2021-08-23 19:30:31,098 INFO [asr_datamodule.py:240] About to get train cuts
2021-08-23 19:30:31,102 INFO [asr_datamodule.py:149] About to create train dataset
2021-08-23 19:30:31,102 INFO [asr_datamodule.py:200] Using SingleCutSampler.
2021-08-23 19:30:31,102 INFO [asr_datamodule.py:206] About to create train dataloader
2021-08-23 19:30:31,102 INFO [asr_datamodule.py:219] About to get test cuts
2021-08-23 19:30:31,102 INFO [asr_datamodule.py:246] About to get test cuts
2021-08-23 19:30:31,357 INFO [train.py:416] Epoch 0, batch 0, batch avg loss 1.0789, total avg loss: 1.0789, batch size: 4
2021-08-23 19:30:31,848 INFO [train.py:416] Epoch 0, batch 10, batch avg loss 0.5356, total avg loss: 0.7556, batch size: 4
2021-08-23 19:30:32,301 INFO [train.py:432] Epoch 0, valid loss 0.9972, best valid loss: 0.9972 best valid epoch: 0
2021-08-23 19:30:32,805 INFO [train.py:416] Epoch 0, batch 20, batch avg loss 0.2436, total avg loss: 0.5717, batch size: 3
2021-08-23 19:30:33,109 INFO [train.py:432] Epoch 0, valid loss 0.4167, best valid loss: 0.4167 best valid epoch: 0
2021-08-23 19:30:33,121 INFO [checkpoint.py:62] Saving checkpoint to tdnn/exp/epoch-0.pt
2021-08-23 19:30:33,325 INFO [train.py:416] Epoch 1, batch 0, batch avg loss 0.2214, total avg loss: 0.2214, batch size: 5
2021-08-23 19:30:33,798 INFO [train.py:416] Epoch 1, batch 10, batch avg loss 0.0781, total avg loss: 0.1343, batch size: 5
2021-08-23 19:30:34,065 INFO [train.py:432] Epoch 1, valid loss 0.0859, best valid loss: 0.0859 best valid epoch: 1
2021-08-23 19:30:34,556 INFO [train.py:416] Epoch 1, batch 20, batch avg loss 0.0421, total avg loss: 0.0975, batch size: 3
2021-08-23 19:30:34,810 INFO [train.py:432] Epoch 1, valid loss 0.0431, best valid loss: 0.0431 best valid epoch: 1
2021-08-23 19:30:34,824 INFO [checkpoint.py:62] Saving checkpoint to tdnn/exp/epoch-1.pt
2023-05-12 18:04:59,759 INFO [train.py:481] Training started
2023-05-12 18:04:59,759 INFO [train.py:482] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01, 'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0,
'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'reset_interval': 20, 'valid_interval': 10, 'beam_size': 10,
'reduction': 'sum', 'use_double_scores': True, 'world_size': 1, 'master_port': 12354, 'tensorboard': True, 'num_epochs': 15, 'seed': 42, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0,
'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True, 'num_workers': 2,
'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '3b7f09fa35e72589914f67089c0da9f196a92ca4', 'k2-git-date': 'Mon May 8 22:58:45 2023',
'lhotse-version': '1.15.0.dev+git.6fcfced.clean', 'torch-version': '2.0.0+cu118', 'torch-cuda-available': False, 'torch-cuda-version': '11.8', 'python-version': '3.1', 'icefall-git-branch': 'master',
'icefall-git-sha1': '30bde4b-clean', 'icefall-git-date': 'Thu May 11 17:37:47 2023', 'icefall-path': '/tmp/icefall',
'k2-path': 'tmp/lib/python3.10/site-packages/k2-1.24.3.dev20230512+cuda11.8.torch2.0.0-py3.10-linux-x86_64.egg/k2/__init__.py',
'lhotse-path': 'tmp/lib/python3.10/site-packages/lhotse/__init__.py', 'hostname': 'host', 'IP address': '0.0.0.0'}}
2023-05-12 18:04:59,761 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
2023-05-12 18:04:59,764 INFO [train.py:495] device: cpu
2023-05-12 18:04:59,791 INFO [asr_datamodule.py:146] About to get train cuts
2023-05-12 18:04:59,791 INFO [asr_datamodule.py:244] About to get train cuts
2023-05-12 18:04:59,852 INFO [asr_datamodule.py:149] About to create train dataset
2023-05-12 18:04:59,852 INFO [asr_datamodule.py:199] Using SingleCutSampler.
2023-05-12 18:04:59,852 INFO [asr_datamodule.py:205] About to create train dataloader
2023-05-12 18:04:59,853 INFO [asr_datamodule.py:218] About to get test cuts
2023-05-12 18:04:59,853 INFO [asr_datamodule.py:252] About to get test cuts
2023-05-12 18:04:59,986 INFO [train.py:422] Epoch 0, batch 0, loss[loss=1.065, over 2436.00 frames. ], tot_loss[loss=1.065, over 2436.00 frames. ], batch size: 4
2023-05-12 18:05:00,352 INFO [train.py:422] Epoch 0, batch 10, loss[loss=0.4561, over 2828.00 frames. ], tot_loss[loss=0.7076, over 22192.90 frames. ], batch size: 4
2023-05-12 18:05:00,691 INFO [train.py:444] Epoch 0, validation loss=0.9002, over 18067.00 frames.
2023-05-12 18:05:00,996 INFO [train.py:422] Epoch 0, batch 20, loss[loss=0.2555, over 2695.00 frames. ], tot_loss[loss=0.484, over 34971.47 frames. ], batch size: 5
2023-05-12 18:05:01,217 INFO [train.py:444] Epoch 0, validation loss=0.4688, over 18067.00 frames.
2023-05-12 18:05:01,251 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-0.pt
2023-05-12 18:05:01,389 INFO [train.py:422] Epoch 1, batch 0, loss[loss=0.2532, over 2436.00 frames. ], tot_loss[loss=0.2532, over 2436.00 frames. ], batch size: 4
2023-05-12 18:05:01,637 INFO [train.py:422] Epoch 1, batch 10, loss[loss=0.1139, over 2828.00 frames. ], tot_loss[loss=0.1592, over 22192.90 frames. ], batch size: 4
2023-05-12 18:05:01,859 INFO [train.py:444] Epoch 1, validation loss=0.1629, over 18067.00 frames.
2023-05-12 18:05:02,094 INFO [train.py:422] Epoch 1, batch 20, loss[loss=0.0767, over 2695.00 frames. ], tot_loss[loss=0.118, over 34971.47 frames. ], batch size: 5
2023-05-12 18:05:02,350 INFO [train.py:444] Epoch 1, validation loss=0.06778, over 18067.00 frames.
2023-05-12 18:05:02,395 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-1.pt
... ...
2021-08-23 19:30:49,657 INFO [train.py:416] Epoch 13, batch 0, batch avg loss 0.0109, total avg loss: 0.0109, batch size: 5
2021-08-23 19:30:49,984 INFO [train.py:416] Epoch 13, batch 10, batch avg loss 0.0093, total avg loss: 0.0096, batch size: 4
2021-08-23 19:30:50,239 INFO [train.py:432] Epoch 13, valid loss 0.0104, best valid loss: 0.0101 best valid epoch: 12
2021-08-23 19:30:50,569 INFO [train.py:416] Epoch 13, batch 20, batch avg loss 0.0092, total avg loss: 0.0096, batch size: 2
2021-08-23 19:30:50,819 INFO [train.py:432] Epoch 13, valid loss 0.0101, best valid loss: 0.0101 best valid epoch: 13
2021-08-23 19:30:50,835 INFO [checkpoint.py:62] Saving checkpoint to tdnn/exp/epoch-13.pt
2021-08-23 19:30:51,024 INFO [train.py:416] Epoch 14, batch 0, batch avg loss 0.0105, total avg loss: 0.0105, batch size: 5
2021-08-23 19:30:51,317 INFO [train.py:416] Epoch 14, batch 10, batch avg loss 0.0099, total avg loss: 0.0097, batch size: 4
2021-08-23 19:30:51,552 INFO [train.py:432] Epoch 14, valid loss 0.0108, best valid loss: 0.0101 best valid epoch: 13
2021-08-23 19:30:51,869 INFO [train.py:416] Epoch 14, batch 20, batch avg loss 0.0096, total avg loss: 0.0097, batch size: 5
2021-08-23 19:30:52,107 INFO [train.py:432] Epoch 14, valid loss 0.0102, best valid loss: 0.0101 best valid epoch: 13
2021-08-23 19:30:52,126 INFO [checkpoint.py:62] Saving checkpoint to tdnn/exp/epoch-14.pt
2021-08-23 19:30:52,128 INFO [train.py:537] Done!
2023-05-12 18:05:14,789 INFO [train.py:422] Epoch 13, batch 0, loss[loss=0.01056, over 2436.00 frames. ], tot_loss[loss=0.01056, over 2436.00 frames. ], batch size: 4
2023-05-12 18:05:15,016 INFO [train.py:422] Epoch 13, batch 10, loss[loss=0.009022, over 2828.00 frames. ], tot_loss[loss=0.009985, over 22192.90 frames. ], batch size: 4
2023-05-12 18:05:15,271 INFO [train.py:444] Epoch 13, validation loss=0.01088, over 18067.00 frames.
2023-05-12 18:05:15,497 INFO [train.py:422] Epoch 13, batch 20, loss[loss=0.01174, over 2695.00 frames. ], tot_loss[loss=0.01077, over 34971.47 frames. ], batch size: 5
2023-05-12 18:05:15,747 INFO [train.py:444] Epoch 13, validation loss=0.01087, over 18067.00 frames.
2023-05-12 18:05:15,783 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-13.pt
2023-05-12 18:05:15,921 INFO [train.py:422] Epoch 14, batch 0, loss[loss=0.01045, over 2436.00 frames. ], tot_loss[loss=0.01045, over 2436.00 frames. ], batch size: 4
2023-05-12 18:05:16,146 INFO [train.py:422] Epoch 14, batch 10, loss[loss=0.008957, over 2828.00 frames. ], tot_loss[loss=0.009903, over 22192.90 frames. ], batch size: 4
2023-05-12 18:05:16,374 INFO [train.py:444] Epoch 14, validation loss=0.01092, over 18067.00 frames.
2023-05-12 18:05:16,598 INFO [train.py:422] Epoch 14, batch 20, loss[loss=0.01169, over 2695.00 frames. ], tot_loss[loss=0.01065, over 34971.47 frames. ], batch size: 5
2023-05-12 18:05:16,824 INFO [train.py:444] Epoch 14, validation loss=0.01077, over 18067.00 frames.
2023-05-12 18:05:16,862 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-14.pt
2023-05-12 18:05:16,865 INFO [train.py:555] Done!
Decoding
~~~~~~~~
@ -465,22 +493,25 @@ The decoding log is:
.. code-block::
2021-08-23 19:35:30,192 INFO [decode.py:249] Decoding started
2021-08-23 19:35:30,192 INFO [decode.py:250] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lm_dir': PosixPath('data/lm'), 'feature_dim': 23, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'epoch': 14, 'avg': 2, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': True, 'return_cuts': True, 'num_workers': 2}
2021-08-23 19:35:30,193 INFO [lexicon.py:113] Loading pre-compiled data/lang_phone/Linv.pt
2021-08-23 19:35:30,213 INFO [decode.py:259] device: cpu
2021-08-23 19:35:30,217 INFO [decode.py:279] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
/tmp/icefall/icefall/checkpoint.py:146: UserWarning: floor_divide is deprecated, and will be removed in a future version of pytorch.
It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values.
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at /pytorch/aten/src/ATen/native/BinaryOps.cpp:450.)
avg[k] //= n
2021-08-23 19:35:30,220 INFO [asr_datamodule.py:219] About to get test cuts
2021-08-23 19:35:30,220 INFO [asr_datamodule.py:246] About to get test cuts
2021-08-23 19:35:30,409 INFO [decode.py:190] batch 0/8, cuts processed until now is 4
2021-08-23 19:35:30,571 INFO [decode.py:228] The transcripts are stored in tdnn/exp/recogs-test_set.txt
2021-08-23 19:35:30,572 INFO [utils.py:317] [test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
2021-08-23 19:35:30,573 INFO [decode.py:236] Wrote detailed error stats to tdnn/exp/errs-test_set.txt
2021-08-23 19:35:30,573 INFO [decode.py:299] Done!
2023-05-12 18:08:30,482 INFO [decode.py:263] Decoding started
2023-05-12 18:08:30,483 INFO [decode.py:264] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lm_dir': PosixPath('data/lm'), 'feature_dim': 23,
'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'epoch': 14, 'avg': 2, 'export': False, 'feature_dir': PosixPath('data/fbank'),
'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True,
'num_workers': 2, 'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '3b7f09fa35e72589914f67089c0da9f196a92ca4', 'k2-git-date': 'Mon May 8 22:58:45 2023',
'lhotse-version': '1.15.0.dev+git.6fcfced.clean', 'torch-version': '2.0.0+cu118', 'torch-cuda-available': False, 'torch-cuda-version': '11.8', 'python-version': '3.1', 'icefall-git-branch': 'master',
'icefall-git-sha1': '30bde4b-clean', 'icefall-git-date': 'Thu May 11 17:37:47 2023', 'icefall-path': '/tmp/icefall',
'k2-path': '/tmp/lib/python3.10/site-packages/k2-1.24.3.dev20230512+cuda11.8.torch2.0.0-py3.10-linux-x86_64.egg/k2/__init__.py',
'lhotse-path': '/tmp/lib/python3.10/site-packages/lhotse/__init__.py', 'hostname': 'host', 'IP address': '0.0.0.0'}}
2023-05-12 18:08:30,483 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
2023-05-12 18:08:30,487 INFO [decode.py:273] device: cpu
2023-05-12 18:08:30,513 INFO [decode.py:291] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
2023-05-12 18:08:30,521 INFO [asr_datamodule.py:218] About to get test cuts
2023-05-12 18:08:30,521 INFO [asr_datamodule.py:252] About to get test cuts
2023-05-12 18:08:30,675 INFO [decode.py:204] batch 0/?, cuts processed until now is 4
2023-05-12 18:08:30,923 INFO [decode.py:241] The transcripts are stored in tdnn/exp/recogs-test_set.txt
2023-05-12 18:08:30,924 INFO [utils.py:558] [test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
2023-05-12 18:08:30,925 INFO [decode.py:249] Wrote detailed error stats to tdnn/exp/errs-test_set.txt
2023-05-12 18:08:30,925 INFO [decode.py:316] Done!
**Congratulations!** You have successfully set up the environment and have run the first recipe in ``icefall``.

View File

@ -0,0 +1,21 @@
2023-01-11 12:15:38,677 INFO [export-for-ncnn.py:220] device: cpu
2023-01-11 12:15:38,681 INFO [export-for-ncnn.py:229] {'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1,
'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 50, 'reset_interval': 200, 'valid_interval': 3000, 'feature_dim': 80,
'subsampling_factor': 4, 'decoder_dim': 512, 'joiner_dim': 512, 'model_warm_step': 3000, 'env_info': {'k2-version': '1.23.2',
'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': 'a34171ed85605b0926eebbd0463d059431f4f74a', 'k2-git-date': 'Wed Dec 14 00:06:38 2022',
'lhotse-version': '1.12.0.dev+missing.version.file', 'torch-version': '1.10.0+cu102', 'torch-cuda-available': False,
'torch-cuda-version': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'fix-stateless3-train-2022-12-27', 'icefall-git-sha1': '530e8a1-dirty',
'icefall-git-date': 'Tue Dec 27 13:59:18 2022', 'icefall-path': '/star-fj/fangjun/open-source/icefall',
'k2-path': '/star-fj/fangjun/open-source/k2/k2/python/k2/__init__.py', 'lhotse-path': '/star-fj/fangjun/open-source/lhotse/lhotse/__init__.py',
'hostname': 'de-74279-k2-train-3-1220120619-7695ff496b-s9n4w', 'IP address': '127.0.0.1'}, 'epoch': 30, 'iter': 0, 'avg': 1,
'exp_dir': PosixPath('icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp'),
'bpe_model': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05//data/lang_bpe_500/bpe.model', 'jit': False,
'context_size': 2, 'use_averaged_model': False, 'encoder_dim': 512, 'nhead': 8, 'dim_feedforward': 2048, 'num_encoder_layers': 12,
'cnn_module_kernel': 31, 'left_context_length': 32, 'chunk_length': 32, 'right_context_length': 8, 'memory_size': 32, 'blank_id': 0, 'vocab_size': 500}
2023-01-11 12:15:38,681 INFO [export-for-ncnn.py:231] About to create model
2023-01-11 12:15:40,053 INFO [checkpoint.py:112] Loading checkpoint from icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/epoch-30.pt
2023-01-11 12:15:40,708 INFO [export-for-ncnn.py:315] Number of model parameters: 75490012
2023-01-11 12:15:41,681 INFO [export-for-ncnn.py:318] Using torch.jit.trace()
2023-01-11 12:15:41,681 INFO [export-for-ncnn.py:320] Exporting encoder
2023-01-11 12:15:41,682 INFO [export-for-ncnn.py:149] chunk_length: 32, right_context_length: 8

View File

@ -0,0 +1,18 @@
2023-02-17 11:22:42,862 INFO [export-for-ncnn.py:222] device: cpu
2023-02-17 11:22:42,865 INFO [export-for-ncnn.py:231] {'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 50, 'reset_interval': 200, 'valid_interval': 3000, 'feature_dim': 80, 'subsampling_factor': 4, 'dim_feedforward': 2048, 'decoder_dim': 512, 'joiner_dim': 512, 'is_pnnx': False, 'model_warm_step': 3000, 'env_info': {'k2-version': '1.23.4', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '62e404dd3f3a811d73e424199b3408e309c06e1a', 'k2-git-date': 'Mon Jan 30 10:26:16 2023', 'lhotse-version': '1.12.0.dev+missing.version.file', 'torch-version': '1.10.0+cu102', 'torch-cuda-available': False, 'torch-cuda-version': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'master', 'icefall-git-sha1': '6d7a559-dirty', 'icefall-git-date': 'Thu Feb 16 19:47:54 2023', 'icefall-path': '/star-fj/fangjun/open-source/icefall-2', 'k2-path': '/star-fj/fangjun/open-source/k2/k2/python/k2/__init__.py', 'lhotse-path': '/star-fj/fangjun/open-source/lhotse/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-3-1220120619-7695ff496b-s9n4w', 'IP address': '10.177.6.147'}, 'epoch': 99, 'iter': 0, 'avg': 1, 'exp_dir': PosixPath('icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp'), 'bpe_model': './icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/data/lang_bpe_500/bpe.model', 'context_size': 2, 'use_averaged_model': False, 'num_encoder_layers': 12, 'encoder_dim': 512, 'rnn_hidden_size': 1024, 'aux_layer_period': 0, 'blank_id': 0, 'vocab_size': 500}
2023-02-17 11:22:42,865 INFO [export-for-ncnn.py:235] About to create model
2023-02-17 11:22:43,239 INFO [train.py:472] Disable giga
2023-02-17 11:22:43,249 INFO [checkpoint.py:112] Loading checkpoint from icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/epoch-99.pt
2023-02-17 11:22:44,595 INFO [export-for-ncnn.py:324] encoder parameters: 83137520
2023-02-17 11:22:44,596 INFO [export-for-ncnn.py:325] decoder parameters: 257024
2023-02-17 11:22:44,596 INFO [export-for-ncnn.py:326] joiner parameters: 781812
2023-02-17 11:22:44,596 INFO [export-for-ncnn.py:327] total parameters: 84176356
2023-02-17 11:22:44,596 INFO [export-for-ncnn.py:329] Using torch.jit.trace()
2023-02-17 11:22:44,596 INFO [export-for-ncnn.py:331] Exporting encoder
2023-02-17 11:22:48,182 INFO [export-for-ncnn.py:158] Saved to icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.pt
2023-02-17 11:22:48,183 INFO [export-for-ncnn.py:335] Exporting decoder
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/lstm_transducer_stateless2/decoder.py:101: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
need_pad = bool(need_pad)
2023-02-17 11:22:48,259 INFO [export-for-ncnn.py:180] Saved to icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.pt
2023-02-17 11:22:48,259 INFO [export-for-ncnn.py:339] Exporting joiner
2023-02-17 11:22:48,304 INFO [export-for-ncnn.py:207] Saved to icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.pt

View File

@ -0,0 +1,74 @@
2023-02-27 20:23:07,473 INFO [export-for-ncnn.py:246] device: cpu
2023-02-27 20:23:07,477 INFO [export-for-ncnn.py:255] {'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 50, 'reset_interval': 200, 'valid_interval': 3000, 'feature_dim': 80, 'subsampling_factor': 4, 'warm_step': 2000, 'env_info': {'k2-version': '1.23.4', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '62e404dd3f3a811d73e424199b3408e309c06e1a', 'k2-git-date': 'Mon Jan 30 10:26:16 2023', 'lhotse-version': '1.12.0.dev+missing.version.file', 'torch-version': '1.10.0+cu102', 'torch-cuda-available': True, 'torch-cuda-version': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'master', 'icefall-git-sha1': '6d7a559-clean', 'icefall-git-date': 'Thu Feb 16 19:47:54 2023', 'icefall-path': '/star-fj/fangjun/open-source/icefall-2', 'k2-path': '/star-fj/fangjun/open-source/k2/k2/python/k2/__init__.py', 'lhotse-path': '/star-fj/fangjun/open-source/lhotse/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-3-1220120619-7695ff496b-s9n4w', 'IP address': '10.177.6.147'}, 'epoch': 99, 'iter': 0, 'avg': 1, 'exp_dir': PosixPath('icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp'), 'bpe_model': './icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model', 'context_size': 2, 'use_averaged_model': False, 'num_encoder_layers': '2,4,3,2,4', 'feedforward_dims': '1024,1024,2048,2048,1024', 'nhead': '8,8,8,8,8', 'encoder_dims': '384,384,384,384,384', 'attention_dims': '192,192,192,192,192', 'encoder_unmasked_dims': '256,256,256,256,256', 'zipformer_downsampling_factors': '1,2,4,8,2', 'cnn_module_kernels': '31,31,31,31,31', 'decoder_dim': 512, 'joiner_dim': 512, 'short_chunk_size': 50, 'num_left_chunks': 4, 'decode_chunk_len': 32, 'blank_id': 0, 'vocab_size': 500}
2023-02-27 20:23:07,477 INFO [export-for-ncnn.py:257] About to create model
2023-02-27 20:23:08,023 INFO [zipformer2.py:419] At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
2023-02-27 20:23:08,037 INFO [checkpoint.py:112] Loading checkpoint from icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/epoch-99.pt
2023-02-27 20:23:08,655 INFO [export-for-ncnn.py:346] encoder parameters: 68944004
2023-02-27 20:23:08,655 INFO [export-for-ncnn.py:347] decoder parameters: 260096
2023-02-27 20:23:08,655 INFO [export-for-ncnn.py:348] joiner parameters: 716276
2023-02-27 20:23:08,656 INFO [export-for-ncnn.py:349] total parameters: 69920376
2023-02-27 20:23:08,656 INFO [export-for-ncnn.py:351] Using torch.jit.trace()
2023-02-27 20:23:08,656 INFO [export-for-ncnn.py:353] Exporting encoder
2023-02-27 20:23:08,656 INFO [export-for-ncnn.py:174] decode_chunk_len: 32
2023-02-27 20:23:08,656 INFO [export-for-ncnn.py:175] T: 39
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1344: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert cached_len.size(0) == self.num_layers, (
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1348: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert cached_avg.size(0) == self.num_layers, (
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1352: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert cached_key.size(0) == self.num_layers, (
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1356: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert cached_val.size(0) == self.num_layers, (
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1360: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert cached_val2.size(0) == self.num_layers, (
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1364: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert cached_conv1.size(0) == self.num_layers, (
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1368: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert cached_conv2.size(0) == self.num_layers, (
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1373: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert self.left_context_len == cached_key.shape[1], (
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1884: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert self.x_size == x.size(0), (self.x_size, x.size(0))
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:2442: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert cached_key.shape[0] == self.left_context_len, (
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:2449: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert cached_key.shape[0] == cached_val.shape[0], (
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:2469: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert cached_key.shape[0] == left_context_len, (
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:2473: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert cached_val.shape[0] == left_context_len, (
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:2483: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert kv_len == k.shape[0], (kv_len, k.shape)
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:2570: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert list(attn_output.size()) == [bsz * num_heads, seq_len, head_dim // 2]
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:2926: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert cache.shape == (x.size(0), x.size(1), self.lorder), (
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:2652: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert x.shape[0] == self.x_size, (x.shape[0], self.x_size)
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:2653: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert x.shape[2] == self.embed_dim, (x.shape[2], self.embed_dim)
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:2666: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert cached_val.shape[0] == self.left_context_len, (
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1543: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert src.shape[0] == self.in_x_size, (src.shape[0], self.in_x_size)
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1637: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert src.shape[0] == self.in_x_size, (
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1643: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert src.shape[2] == self.in_channels, (src.shape[2], self.in_channels)
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1571: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
if src.shape[0] != self.in_x_size:
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1763: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert src1.shape[:-1] == src2.shape[:-1], (src1.shape, src2.shape)
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1779: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert src1.shape[-1] == self.dim1, (src1.shape[-1], self.dim1)
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1780: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert src2.shape[-1] == self.dim2, (src2.shape[-1], self.dim2)
/star-fj/fangjun/py38/lib/python3.8/site-packages/torch/jit/_trace.py:958: TracerWarning: Encountering a list at the output of the tracer might cause the trace to be incorrect, this is only valid if the container structure does not change based on the module's inputs. Consider using a constant container instead (e.g. for `list`, use a `tuple` instead. for `dict`, use a `NamedTuple` instead). If you absolutely need this and know the side effects, pass strict=False to trace() to allow this behavior.
module._c._create_method_from_trace(
2023-02-27 20:23:19,640 INFO [export-for-ncnn.py:182] Saved to icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/encoder_jit_trace-pnnx.pt
2023-02-27 20:23:19,646 INFO [export-for-ncnn.py:357] Exporting decoder
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/decoder.py:102: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert embedding_out.size(-1) == self.context_size
2023-02-27 20:23:19,686 INFO [export-for-ncnn.py:204] Saved to icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/decoder_jit_trace-pnnx.pt
2023-02-27 20:23:19,686 INFO [export-for-ncnn.py:361] Exporting joiner
2023-02-27 20:23:19,735 INFO [export-for-ncnn.py:231] Saved to icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/joiner_jit_trace-pnnx.pt

View File

@ -0,0 +1,104 @@
Don't Use GPU. has_gpu: 0, config.use_vulkan_compute: 1
num encoder conv layers: 88
num joiner conv layers: 3
num files: 3
Processing ../test_wavs/1089-134686-0001.wav
Processing ../test_wavs/1221-135766-0001.wav
Processing ../test_wavs/1221-135766-0002.wav
Processing ../test_wavs/1089-134686-0001.wav
Processing ../test_wavs/1221-135766-0001.wav
Processing ../test_wavs/1221-135766-0002.wav
----------encoder----------
conv_87 : max = 15.942385 threshold = 15.938493 scale = 7.968131
conv_88 : max = 35.442448 threshold = 15.549335 scale = 8.167552
conv_89 : max = 23.228289 threshold = 8.001738 scale = 15.871552
linear_90 : max = 3.976146 threshold = 1.101789 scale = 115.267128
linear_91 : max = 6.962030 threshold = 5.162033 scale = 24.602713
linear_92 : max = 12.323041 threshold = 3.853959 scale = 32.953129
linear_94 : max = 6.905416 threshold = 4.648006 scale = 27.323545
linear_93 : max = 6.905416 threshold = 5.474093 scale = 23.200188
linear_95 : max = 1.888012 threshold = 1.403563 scale = 90.483986
linear_96 : max = 6.856741 threshold = 5.398679 scale = 23.524273
linear_97 : max = 9.635942 threshold = 2.613655 scale = 48.590950
linear_98 : max = 6.460340 threshold = 5.670146 scale = 22.398010
linear_99 : max = 9.532276 threshold = 2.585537 scale = 49.119396
linear_101 : max = 6.585871 threshold = 5.719224 scale = 22.205809
linear_100 : max = 6.585871 threshold = 5.751382 scale = 22.081648
linear_102 : max = 1.593344 threshold = 1.450581 scale = 87.551147
linear_103 : max = 6.592681 threshold = 5.705824 scale = 22.257959
linear_104 : max = 8.752957 threshold = 1.980955 scale = 64.110489
linear_105 : max = 6.696240 threshold = 5.877193 scale = 21.608953
linear_106 : max = 9.059659 threshold = 2.643138 scale = 48.048950
linear_108 : max = 6.975461 threshold = 4.589567 scale = 27.671457
linear_107 : max = 6.975461 threshold = 6.190381 scale = 20.515701
linear_109 : max = 3.710759 threshold = 2.305635 scale = 55.082436
linear_110 : max = 7.531228 threshold = 5.731162 scale = 22.159557
linear_111 : max = 10.528083 threshold = 2.259322 scale = 56.211544
linear_112 : max = 8.148807 threshold = 5.500842 scale = 23.087374
linear_113 : max = 8.592566 threshold = 1.948851 scale = 65.166611
linear_115 : max = 8.437109 threshold = 5.608947 scale = 22.642395
linear_114 : max = 8.437109 threshold = 6.193942 scale = 20.503904
linear_116 : max = 3.966980 threshold = 3.200896 scale = 39.676392
linear_117 : max = 9.451303 threshold = 6.061664 scale = 20.951344
linear_118 : max = 12.077262 threshold = 3.965800 scale = 32.023804
linear_119 : max = 9.671615 threshold = 4.847613 scale = 26.198460
linear_120 : max = 8.625638 threshold = 3.131427 scale = 40.556595
linear_122 : max = 10.274080 threshold = 4.888716 scale = 25.978189
linear_121 : max = 10.274080 threshold = 5.420480 scale = 23.429659
linear_123 : max = 4.826197 threshold = 3.599617 scale = 35.281532
linear_124 : max = 11.396383 threshold = 7.325849 scale = 17.335875
linear_125 : max = 9.337198 threshold = 3.941410 scale = 32.221970
linear_126 : max = 9.699965 threshold = 4.842878 scale = 26.224073
linear_127 : max = 8.775370 threshold = 3.884215 scale = 32.696438
linear_129 : max = 9.872276 threshold = 4.837319 scale = 26.254213
linear_128 : max = 9.872276 threshold = 7.180057 scale = 17.687883
linear_130 : max = 4.150427 threshold = 3.454298 scale = 36.765789
linear_131 : max = 11.112692 threshold = 7.924847 scale = 16.025545
linear_132 : max = 11.852893 threshold = 3.116593 scale = 40.749626
linear_133 : max = 11.517084 threshold = 5.024665 scale = 25.275314
linear_134 : max = 10.683807 threshold = 3.878618 scale = 32.743618
linear_136 : max = 12.421055 threshold = 6.322729 scale = 20.086264
linear_135 : max = 12.421055 threshold = 5.309880 scale = 23.917679
linear_137 : max = 4.827781 threshold = 3.744595 scale = 33.915554
linear_138 : max = 14.422395 threshold = 7.742882 scale = 16.402161
linear_139 : max = 8.527538 threshold = 3.866123 scale = 32.849449
linear_140 : max = 12.128619 threshold = 4.657793 scale = 27.266134
linear_141 : max = 9.839593 threshold = 3.845993 scale = 33.021378
linear_143 : max = 12.442304 threshold = 7.099039 scale = 17.889746
linear_142 : max = 12.442304 threshold = 5.325038 scale = 23.849592
linear_144 : max = 5.929444 threshold = 5.618206 scale = 22.605080
linear_145 : max = 13.382126 threshold = 9.321095 scale = 13.625010
linear_146 : max = 9.894987 threshold = 3.867645 scale = 32.836517
linear_147 : max = 10.915313 threshold = 4.906028 scale = 25.886522
linear_148 : max = 9.614287 threshold = 3.908151 scale = 32.496181
linear_150 : max = 11.724932 threshold = 4.485588 scale = 28.312899
linear_149 : max = 11.724932 threshold = 5.161146 scale = 24.606939
linear_151 : max = 7.164453 threshold = 5.847355 scale = 21.719223
linear_152 : max = 13.086471 threshold = 5.984121 scale = 21.222834
linear_153 : max = 11.099524 threshold = 3.991601 scale = 31.816805
linear_154 : max = 10.054585 threshold = 4.489706 scale = 28.286930
linear_155 : max = 12.389185 threshold = 3.100321 scale = 40.963501
linear_157 : max = 9.982999 threshold = 5.154796 scale = 24.637253
linear_156 : max = 9.982999 threshold = 8.537706 scale = 14.875190
linear_158 : max = 8.420287 threshold = 6.502287 scale = 19.531588
linear_159 : max = 25.014746 threshold = 9.423280 scale = 13.477261
linear_160 : max = 45.633553 threshold = 5.715335 scale = 22.220921
linear_161 : max = 20.371849 threshold = 5.117830 scale = 24.815203
linear_162 : max = 12.492933 threshold = 3.126283 scale = 40.623318
linear_164 : max = 20.697504 threshold = 4.825712 scale = 26.317358
linear_163 : max = 20.697504 threshold = 5.078367 scale = 25.008038
linear_165 : max = 9.023975 threshold = 6.836278 scale = 18.577358
linear_166 : max = 34.860619 threshold = 7.259792 scale = 17.493614
linear_167 : max = 30.380934 threshold = 5.496160 scale = 23.107042
linear_168 : max = 20.691216 threshold = 4.733317 scale = 26.831076
linear_169 : max = 9.723948 threshold = 3.952728 scale = 32.129707
linear_171 : max = 21.034811 threshold = 5.366547 scale = 23.665123
linear_170 : max = 21.034811 threshold = 5.356277 scale = 23.710501
linear_172 : max = 10.556884 threshold = 5.729481 scale = 22.166058
linear_173 : max = 20.033039 threshold = 10.207264 scale = 12.442120
linear_174 : max = 11.597379 threshold = 2.658676 scale = 47.768131
----------joiner----------
linear_2 : max = 19.293503 threshold = 14.305265 scale = 8.877850
linear_1 : max = 10.812222 threshold = 8.766452 scale = 14.487047
linear_3 : max = 0.999999 threshold = 0.999755 scale = 127.031174
ncnn int8 calibration table create success, best wish for your int8 inference has a low accuracy loss...\(^0^)/...233...

View File

@ -0,0 +1,44 @@
Don't Use GPU. has_gpu: 0, config.use_vulkan_compute: 1
num encoder conv layers: 28
num joiner conv layers: 3
num files: 3
Processing ../test_wavs/1089-134686-0001.wav
Processing ../test_wavs/1221-135766-0001.wav
Processing ../test_wavs/1221-135766-0002.wav
Processing ../test_wavs/1089-134686-0001.wav
Processing ../test_wavs/1221-135766-0001.wav
Processing ../test_wavs/1221-135766-0002.wav
----------encoder----------
conv_15 : max = 15.942385 threshold = 15.930708 scale = 7.972025
conv_16 : max = 44.978855 threshold = 17.031788 scale = 7.456645
conv_17 : max = 17.868437 threshold = 7.830528 scale = 16.218575
linear_18 : max = 3.107259 threshold = 1.194808 scale = 106.293236
linear_19 : max = 6.193777 threshold = 4.634748 scale = 27.401705
linear_20 : max = 9.259933 threshold = 2.606617 scale = 48.722160
linear_21 : max = 5.186600 threshold = 4.790260 scale = 26.512129
linear_22 : max = 9.759041 threshold = 2.265832 scale = 56.050053
linear_23 : max = 3.931209 threshold = 3.099090 scale = 40.979767
linear_24 : max = 10.324160 threshold = 2.215561 scale = 57.321835
linear_25 : max = 3.800708 threshold = 3.599352 scale = 35.284134
linear_26 : max = 10.492444 threshold = 3.153369 scale = 40.274391
linear_27 : max = 3.660161 threshold = 2.720994 scale = 46.674126
linear_28 : max = 9.415265 threshold = 3.174434 scale = 40.007133
linear_29 : max = 4.038418 threshold = 3.118534 scale = 40.724262
linear_30 : max = 10.072084 threshold = 3.936867 scale = 32.259155
linear_31 : max = 4.342712 threshold = 3.599489 scale = 35.282787
linear_32 : max = 11.340535 threshold = 3.120308 scale = 40.701103
linear_33 : max = 3.846987 threshold = 3.630030 scale = 34.985939
linear_34 : max = 10.686298 threshold = 2.204571 scale = 57.607586
linear_35 : max = 4.904821 threshold = 4.575518 scale = 27.756420
linear_36 : max = 11.806659 threshold = 2.585589 scale = 49.118401
linear_37 : max = 6.402340 threshold = 5.047157 scale = 25.162680
linear_38 : max = 11.174589 threshold = 1.923361 scale = 66.030258
linear_39 : max = 16.178576 threshold = 7.556058 scale = 16.807705
linear_40 : max = 12.901954 threshold = 5.301267 scale = 23.956539
linear_41 : max = 14.839805 threshold = 7.597429 scale = 16.716181
linear_42 : max = 10.178945 threshold = 2.651595 scale = 47.895699
----------joiner----------
linear_2 : max = 24.829245 threshold = 16.627592 scale = 7.637907
linear_1 : max = 10.746186 threshold = 5.255032 scale = 24.167313
linear_3 : max = 1.000000 threshold = 0.999756 scale = 127.031013
ncnn int8 calibration table create success, best wish for your int8 inference has a low accuracy loss...\(^0^)/...233...

View File

@ -0,0 +1,7 @@
2023-01-11 14:02:12,216 INFO [streaming-ncnn-decode.py:320] {'tokens': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/data/lang_bpe_500/tokens.txt', 'encoder_param_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param', 'encoder_bin_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin', 'decoder_param_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param', 'decoder_bin_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin', 'joiner_param_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param', 'joiner_bin_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin', 'sound_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav'}
T 51 32
2023-01-11 14:02:13,141 INFO [streaming-ncnn-decode.py:328] Constructing Fbank computer
2023-01-11 14:02:13,151 INFO [streaming-ncnn-decode.py:331] Reading sound files: ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav
2023-01-11 14:02:13,176 INFO [streaming-ncnn-decode.py:336] torch.Size([106000])
2023-01-11 14:02:17,581 INFO [streaming-ncnn-decode.py:380] ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav
2023-01-11 14:02:17,581 INFO [streaming-ncnn-decode.py:381] AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS

View File

@ -0,0 +1,6 @@
2023-02-17 11:37:30,861 INFO [streaming-ncnn-decode.py:255] {'tokens': './icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/data/lang_bpe_500/tokens.txt', 'encoder_param_filename': './icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.ncnn.param', 'encoder_bin_filename': './icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.ncnn.bin', 'decoder_param_filename': './icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.ncnn.param', 'decoder_bin_filename': './icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.ncnn.bin', 'joiner_param_filename': './icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.ncnn.param', 'joiner_bin_filename': './icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.ncnn.bin', 'sound_filename': './icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/test_wavs/1089-134686-0001.wav'}
2023-02-17 11:37:31,425 INFO [streaming-ncnn-decode.py:263] Constructing Fbank computer
2023-02-17 11:37:31,427 INFO [streaming-ncnn-decode.py:266] Reading sound files: ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/test_wavs/1089-134686-0001.wav
2023-02-17 11:37:31,431 INFO [streaming-ncnn-decode.py:271] torch.Size([106000])
2023-02-17 11:37:34,115 INFO [streaming-ncnn-decode.py:342] ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/test_wavs/1089-134686-0001.wav
2023-02-17 11:37:34,115 INFO [streaming-ncnn-decode.py:343] AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS

View File

@ -0,0 +1,7 @@
2023-02-27 20:43:40,283 INFO [streaming-ncnn-decode.py:349] {'tokens': './icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/tokens.txt', 'encoder_param_filename': './icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/encoder_jit_trace-pnnx.ncnn.param', 'encoder_bin_filename': './icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/encoder_jit_trace-pnnx.ncnn.bin', 'decoder_param_filename': './icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/decoder_jit_trace-pnnx.ncnn.param', 'decoder_bin_filename': './icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/decoder_jit_trace-pnnx.ncnn.bin', 'joiner_param_filename': './icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/joiner_jit_trace-pnnx.ncnn.param', 'joiner_bin_filename': './icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/joiner_jit_trace-pnnx.ncnn.bin', 'sound_filename': './icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1089-134686-0001.wav'}
2023-02-27 20:43:41,260 INFO [streaming-ncnn-decode.py:357] Constructing Fbank computer
2023-02-27 20:43:41,264 INFO [streaming-ncnn-decode.py:360] Reading sound files: ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1089-134686-0001.wav
2023-02-27 20:43:41,269 INFO [streaming-ncnn-decode.py:365] torch.Size([106000])
2023-02-27 20:43:41,280 INFO [streaming-ncnn-decode.py:372] number of states: 35
2023-02-27 20:43:45,026 INFO [streaming-ncnn-decode.py:410] ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1089-134686-0001.wav
2023-02-27 20:43:45,026 INFO [streaming-ncnn-decode.py:411] AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS

View File

@ -0,0 +1,753 @@
.. _export_conv_emformer_transducer_models_to_ncnn:
Export ConvEmformer transducer models to ncnn
=============================================
We use the pre-trained model from the following repository as an example:
- `<https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>`_
We will show you step by step how to export it to `ncnn`_ and run it with `sherpa-ncnn`_.
.. hint::
We use ``Ubuntu 18.04``, ``torch 1.13``, and ``Python 3.8`` for testing.
.. caution::
Please use a more recent version of PyTorch. For instance, ``torch 1.8``
may ``not`` work.
1. Download the pre-trained model
---------------------------------
.. hint::
You can also refer to `<https://k2-fsa.github.io/sherpa/cpp/pretrained_models/online_transducer.html#icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>`_ to download the pre-trained model.
You have to install `git-lfs`_ before you continue.
.. code-block:: bash
cd egs/librispeech/ASR
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt"
git lfs pull --include "data/lang_bpe_500/bpe.model"
cd ..
.. note::
We downloaded ``exp/pretrained-xxx.pt``, not ``exp/cpu-jit_xxx.pt``.
In the above code, we downloaded the pre-trained model into the directory
``egs/librispeech/ASR/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05``.
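To confirm that ``git lfs pull`` fetched the actual files rather than tiny
pointer stubs, you can check the file sizes (the ``.pt`` checkpoint should be
a few hundred MB):
.. code-block:: bash
ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/pretrained-epoch-30-avg-10-averaged.pt
ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/data/lang_bpe_500/bpe.model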
.. _export_for_ncnn_install_ncnn_and_pnnx:
2. Install ncnn and pnnx
------------------------
.. code-block:: bash
# We put ncnn into $HOME/open-source/ncnn
# You can change it to anywhere you like
cd $HOME
mkdir -p open-source
cd open-source
git clone https://github.com/csukuangfj/ncnn
cd ncnn
git submodule update --recursive --init
# Note: We don't use "python setup.py install" or "pip install ." here
mkdir -p build-wheel
cd build-wheel
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DNCNN_PYTHON=ON \
-DNCNN_BUILD_BENCHMARK=OFF \
-DNCNN_BUILD_EXAMPLES=OFF \
-DNCNN_BUILD_TOOLS=ON \
..
make -j4
cd ..
# Note: $PWD here is $HOME/open-source/ncnn
export PYTHONPATH=$PWD/python:$PYTHONPATH
export PATH=$PWD/tools/pnnx/build/src:$PATH
export PATH=$PWD/build-wheel/tools/quantize:$PATH
# Now build pnnx
cd tools/pnnx
mkdir build
cd build
cmake ..
make -j4
./src/pnnx
Congratulations! You have successfully installed the following components:
- ``pnnx``, which is an executable located in
``$HOME/open-source/ncnn/tools/pnnx/build/src``. We will use
it to convert models exported by ``torch.jit.trace()``.
- ``ncnn2int8``, which is an executable located in
``$HOME/open-source/ncnn/build-wheel/tools/quantize``. We will use
it to quantize our models to ``int8``.
- ``ncnn.cpython-38-x86_64-linux-gnu.so``, which is a Python module located
in ``$HOME/open-source/ncnn/python/ncnn``.
.. note::
I am using ``Python 3.8``, so it
is ``ncnn.cpython-38-x86_64-linux-gnu.so``. If you use a different
version, say, ``Python 3.9``, the name would be
``ncnn.cpython-39-x86_64-linux-gnu.so``.
If you are not using Linux, the file name will be different as well.
But that does not matter. As long as you can compile it, it should work.
We have set up ``PYTHONPATH`` so that you can use ``import ncnn`` in your
Python code. We have also set up ``PATH`` so that you can use
``pnnx`` and ``ncnn2int8`` later in your terminal.
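To double-check the setup, you can run the following quick sanity check. It is
a minimal sketch; the exact paths printed on your machine will differ:
.. code-block:: bash
# Both executables should be found via PATH
which pnnx
which ncnn2int8
# The Python module should be importable via PYTHONPATH
python3 -c "import ncnn; print(ncnn.__file__)"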
.. caution::
Please don't use `<https://github.com/tencent/ncnn>`_.
We have made some modifications to the official `ncnn`_.
We will synchronize `<https://github.com/csukuangfj/ncnn>`_ periodically
with the official one.
3. Export the model via torch.jit.trace()
-----------------------------------------
First, let us rename our pre-trained model:
.. code-block::
cd egs/librispeech/ASR
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp
ln -s pretrained-epoch-30-avg-10-averaged.pt epoch-30.pt
cd ../..
Next, we use the following code to export our model:
.. code-block:: bash
dir=./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/
./conv_emformer_transducer_stateless2/export-for-ncnn.py \
--exp-dir $dir/exp \
--bpe-model $dir/data/lang_bpe_500/bpe.model \
--epoch 30 \
--avg 1 \
--use-averaged-model 0 \
\
--num-encoder-layers 12 \
--chunk-length 32 \
--cnn-module-kernel 31 \
--left-context-length 32 \
--right-context-length 8 \
--memory-size 32 \
--encoder-dim 512
.. caution::
If your model has different configuration parameters, please change them accordingly.
.. hint::
We have renamed our model to ``epoch-30.pt`` so that we can use ``--epoch 30``.
There is only one pre-trained model, so we use ``--avg 1 --use-averaged-model 0``.
If you have trained a model by yourself and if you have all checkpoints
available, please first use ``decode.py`` to tune ``--epoch --avg``
and select the best combination with ``--use-averaged-model 1``.
.. note::
You will see the following log output:
.. literalinclude:: ./code/export-conv-emformer-transducer-for-ncnn-output.txt
The log shows the model has ``75490012`` parameters, i.e., ``~75 M``.
.. code-block::
ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/pretrained-epoch-30-avg-10-averaged.pt
-rw-r--r-- 1 kuangfangjun root 289M Jan 11 12:05 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/pretrained-epoch-30-avg-10-averaged.pt
You can see that the file size of the pre-trained model is ``289 MB``, which
is roughly equal to ``75490012*4/1024/1024 = 287.97 MB``.
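If you want to double-check this arithmetic, you can do it from the shell:
.. code-block:: bash
# 75490012 float32 parameters, 4 bytes each, converted to MB
python3 -c "print(75490012 * 4 / 1024 / 1024)"
# prints 287.97...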
After running ``conv_emformer_transducer_stateless2/export-for-ncnn.py``,
we will get the following files:
.. code-block:: bash
ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*pnnx*
-rw-r--r-- 1 kuangfangjun root 1010K Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.pt
-rw-r--r-- 1 kuangfangjun root 283M Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.pt
-rw-r--r-- 1 kuangfangjun root 3.0M Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.pt
.. _conv-emformer-step-4-export-torchscript-model-via-pnnx:
4. Export torchscript model via pnnx
------------------------------------
.. hint::
Make sure you have set up the ``PATH`` environment variable. Otherwise,
it will throw an error saying that ``pnnx`` could not be found.
Now, it's time to export our models to `ncnn`_ via ``pnnx``.
.. code-block::
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
pnnx ./encoder_jit_trace-pnnx.pt
pnnx ./decoder_jit_trace-pnnx.pt
pnnx ./joiner_jit_trace-pnnx.pt
It will generate the following files:
.. code-block:: bash
ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*ncnn*{bin,param}
-rw-r--r-- 1 kuangfangjun root 503K Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin
-rw-r--r-- 1 kuangfangjun root 437 Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param
-rw-r--r-- 1 kuangfangjun root 142M Jan 11 12:36 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin
-rw-r--r-- 1 kuangfangjun root 79K Jan 11 12:36 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param
-rw-r--r-- 1 kuangfangjun root 1.5M Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin
-rw-r--r-- 1 kuangfangjun root 488 Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param
There are two types of files:
- ``param``: It is a text file containing the model architectures. You can
use a text editor to view its content.
- ``bin``: It is a binary file containing the model parameters.
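You can verify the two file types from the command line; a quick check,
assuming you are inside the ``exp`` directory of the downloaded model:
.. code-block:: bash
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
file encoder_jit_trace-pnnx.ncnn.param  # reported as ASCII text
file encoder_jit_trace-pnnx.ncnn.bin    # reported as binary data
head -n 3 encoder_jit_trace-pnnx.ncnn.param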
Below, we compare the file sizes of the models before and after conversion via ``pnnx``:
.. see https://tableconvert.com/restructuredtext-generator
+----------------------------------+------------+
| File name | File size |
+==================================+============+
| encoder_jit_trace-pnnx.pt | 283 MB |
+----------------------------------+------------+
| decoder_jit_trace-pnnx.pt | 1010 KB |
+----------------------------------+------------+
| joiner_jit_trace-pnnx.pt | 3.0 MB |
+----------------------------------+------------+
| encoder_jit_trace-pnnx.ncnn.bin | 142 MB |
+----------------------------------+------------+
| decoder_jit_trace-pnnx.ncnn.bin | 503 KB |
+----------------------------------+------------+
| joiner_jit_trace-pnnx.ncnn.bin | 1.5 MB |
+----------------------------------+------------+
You can see that the file sizes of the converted models are about half of
those before conversion:
- encoder: 283 MB vs 142 MB
- decoder: 1010 KB vs 503 KB
- joiner: 3.0 MB vs 1.5 MB
The reason is that by default ``pnnx`` converts ``float32`` parameters
to ``float16``. A ``float32`` parameter occupies 4 bytes, while it is 2 bytes
for ``float16``. Thus, the converted model is roughly half the original size.
.. hint::
If you use ``pnnx ./encoder_jit_trace-pnnx.pt fp16=0``, then ``pnnx``
won't convert ``float32`` to ``float16``.
5. Test the exported models in icefall
--------------------------------------
.. note::
We assume you have set up the environment variable ``PYTHONPATH`` when
building `ncnn`_.
Now we have successfully converted our pre-trained model to `ncnn`_ format.
The generated 6 files are what we need. You can use the following code to
test the converted models:
.. code-block:: bash
./conv_emformer_transducer_stateless2/streaming-ncnn-decode.py \
--tokens ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/data/lang_bpe_500/tokens.txt \
--encoder-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param \
--encoder-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin \
--decoder-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param \
--decoder-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin \
--joiner-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param \
--joiner-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin \
./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav
.. hint::
`ncnn`_ supports only ``batch size == 1``, so ``streaming-ncnn-decode.py`` accepts
only 1 wave file as input.
The output is given below:
.. literalinclude:: ./code/test-streaming-ncnn-decode-conv-emformer-transducer-libri.txt
Congratulations! You have successfully exported a model from PyTorch to `ncnn`_!
.. _conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn:
6. Modify the exported encoder for sherpa-ncnn
----------------------------------------------
In order to use the exported models in `sherpa-ncnn`_, we have to modify
``encoder_jit_trace-pnnx.ncnn.param``.
Let us have a look at the first few lines of ``encoder_jit_trace-pnnx.ncnn.param``:
.. code-block::
7767517
1060 1342
Input in0 0 1 in0
**Explanation** of the above three lines:
1. ``7767517``, it is a magic number and should not be changed.
2. ``1060 1342``, the first number ``1060`` specifies the number of layers
in this file, while ``1342`` specifies the number of intermediate outputs
of this file.
3. ``Input in0 0 1 in0``, ``Input`` is the layer type of this layer; ``in0``
is the layer name of this layer; ``0`` means this layer has no input;
``1`` means this layer has one output; ``in0`` is the output name of
this layer.
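You can print these lines yourself (run from the directory containing the
``param`` file):

.. code-block:: bash

   head -n 3 encoder_jit_trace-pnnx.ncnn.param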
We need to add 1 extra line and also increment the number of layers.
The result looks like below:
.. code-block:: bash
7767517
1061 1342
SherpaMetaData sherpa_meta_data1 0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512
Input in0 0 1 in0
**Explanation**
1. ``7767517``, it is still the same
2. ``1061 1342``, we have added an extra layer, so we need to update ``1060`` to ``1061``.
We don't need to change ``1342`` since the newly added layer has no inputs or outputs.
3. ``SherpaMetaData sherpa_meta_data1 0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512``
This line is newly added. Its explanation is given below:
- ``SherpaMetaData`` is the type of this layer. Must be ``SherpaMetaData``.
- ``sherpa_meta_data1`` is the name of this layer. Must be ``sherpa_meta_data1``.
- ``0 0`` means this layer has no inputs or output. Must be ``0 0``.
- ``0=1``, 0 is the key and 1 is the value. MUST be ``0=1``.
- ``1=12``, 1 is the key and 12 is the value of the
parameter ``--num-encoder-layers`` that you provided when running
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
- ``2=32``, 2 is the key and 32 is the value of the
parameter ``--memory-size`` that you provided when running
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
- ``3=31``, 3 is the key and 31 is the value of the
parameter ``--cnn-module-kernel`` that you provided when running
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
- ``4=8``, 4 is the key and 8 is the value of the
parameter ``--left-context-length`` that you provided when running
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
- ``5=32``, 5 is the key and 32 is the value of the
parameter ``--chunk-length`` that you provided when running
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
- ``6=8``, 6 is the key and 8 is the value of the
parameter ``--right-context-length`` that you provided when running
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
- ``7=512``, 7 is the key and 512 is the value of the
parameter ``--encoder-dim`` that you provided when running
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
For ease of reference, we list the key-value pairs that you need to add
in the following table. If your model has a different setting, please
change the values for ``SherpaMetaData`` accordingly. Otherwise, you
will be ``SAD``.
+------+-----------------------------+
| key | value |
+======+=============================+
| 0 | 1 (fixed) |
+------+-----------------------------+
| 1 | ``--num-encoder-layers`` |
+------+-----------------------------+
| 2 | ``--memory-size`` |
+------+-----------------------------+
| 3 | ``--cnn-module-kernel`` |
+------+-----------------------------+
| 4 | ``--left-context-length`` |
+------+-----------------------------+
| 5 | ``--chunk-length`` |
+------+-----------------------------+
| 6 | ``--right-context-length`` |
+------+-----------------------------+
| 7 | ``--encoder-dim`` |
+------+-----------------------------+
4. ``Input in0 0 1 in0``. No need to change it.
.. caution::
When you add a new layer ``SherpaMetaData``, please remember to update the
number of layers. In our case, update ``1060`` to ``1061``. Otherwise,
you will be SAD later.
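If you prefer to script this edit rather than use a text editor, here is a
minimal sketch. It assumes GNU ``sed``, that the second line starts with
``1060``, and that only one line in the file starts with ``Input``; adjust
the counts and the ``SherpaMetaData`` values to your own model.

.. code-block:: bash

   cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/

   # Bump the layer count on the second line: 1060 -> 1061
   sed -i '2s/^1060/1061/' encoder_jit_trace-pnnx.ncnn.param

   # Insert the SherpaMetaData line right before the Input layer
   sed -i '/^Input/i SherpaMetaData sherpa_meta_data1 0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512' encoder_jit_trace-pnnx.ncnn.param

   # Verify the first three lines
   head -n 3 encoder_jit_trace-pnnx.ncnn.param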
.. hint::
After adding the new layer ``SherpaMetaData``, you cannot use this model
with ``streaming-ncnn-decode.py`` anymore since ``SherpaMetaData`` is
supported only in `sherpa-ncnn`_.
.. hint::
`ncnn`_ is very flexible. You can add new layers to it just by text-editing
the ``param`` file! You don't need to change the ``bin`` file.
Now you can use this model in `sherpa-ncnn`_.
Please refer to the following documentation:
- Linux/macOS/Windows/arm/aarch64: `<https://k2-fsa.github.io/sherpa/ncnn/install/index.html>`_
- ``Android``: `<https://k2-fsa.github.io/sherpa/ncnn/android/index.html>`_
- ``iOS``: `<https://k2-fsa.github.io/sherpa/ncnn/ios/index.html>`_
- Python: `<https://k2-fsa.github.io/sherpa/ncnn/python/index.html>`_
We have a list of pre-trained models that have been exported for `sherpa-ncnn`_:
- `<https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html>`_
You can find more usages there.
7. (Optional) int8 quantization with sherpa-ncnn
------------------------------------------------
This step is optional.
In this step, we describe how to quantize our model with ``int8``.
Change :ref:`conv-emformer-step-4-export-torchscript-model-via-pnnx` to
disable ``fp16`` when using ``pnnx``:
.. code-block::
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
pnnx ./encoder_jit_trace-pnnx.pt fp16=0
pnnx ./decoder_jit_trace-pnnx.pt
pnnx ./joiner_jit_trace-pnnx.pt fp16=0
.. note::
We add ``fp16=0`` when exporting the encoder and joiner. `ncnn`_ does not
support quantizing the decoder model yet. We will update this documentation
once `ncnn`_ supports it (perhaps in 2023).
It will generate the following files:
.. code-block:: bash
ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*_jit_trace-pnnx.ncnn.{param,bin}
-rw-r--r-- 1 kuangfangjun root 503K Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin
-rw-r--r-- 1 kuangfangjun root 437 Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param
-rw-r--r-- 1 kuangfangjun root 283M Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin
-rw-r--r-- 1 kuangfangjun root 79K Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param
-rw-r--r-- 1 kuangfangjun root 3.0M Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin
-rw-r--r-- 1 kuangfangjun root 488 Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param
Let us compare again the file sizes:
+----------------------------------------+------------+
| File name | File size |
+----------------------------------------+------------+
| encoder_jit_trace-pnnx.pt | 283 MB |
+----------------------------------------+------------+
| decoder_jit_trace-pnnx.pt | 1010 KB |
+----------------------------------------+------------+
| joiner_jit_trace-pnnx.pt | 3.0 MB |
+----------------------------------------+------------+
| encoder_jit_trace-pnnx.ncnn.bin (fp16) | 142 MB |
+----------------------------------------+------------+
| decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB |
+----------------------------------------+------------+
| joiner_jit_trace-pnnx.ncnn.bin (fp16) | 1.5 MB |
+----------------------------------------+------------+
| encoder_jit_trace-pnnx.ncnn.bin (fp32) | 283 MB |
+----------------------------------------+------------+
| joiner_jit_trace-pnnx.ncnn.bin (fp32) | 3.0 MB |
+----------------------------------------+------------+
You can see that the file sizes are doubled when we disable ``fp16``.
.. note::
You can again use ``streaming-ncnn-decode.py`` to test the exported models.
Next, follow :ref:`conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn`
to modify ``encoder_jit_trace-pnnx.ncnn.param``.
Change
.. code-block:: bash
7767517
1060 1342
Input in0 0 1 in0
to
.. code-block:: bash
7767517
1061 1342
SherpaMetaData sherpa_meta_data1 0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512
Input in0 0 1 in0
.. caution::
Please follow :ref:`conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn`
to change the values for ``SherpaMetaData`` if your model uses a different setting.
Next, let us compile `sherpa-ncnn`_ since we will quantize our models within
`sherpa-ncnn`_.
.. code-block:: bash
# We will download sherpa-ncnn to $HOME/open-source/
# You can change it to anywhere you like.
cd $HOME
mkdir -p open-source
cd open-source
git clone https://github.com/k2-fsa/sherpa-ncnn
cd sherpa-ncnn
mkdir build
cd build
cmake ..
make -j 4
./bin/generate-int8-scale-table
export PATH=$HOME/open-source/sherpa-ncnn/build/bin:$PATH
The output of the above commands is:
.. code-block:: bash
(py38) kuangfangjun:build$ generate-int8-scale-table
Please provide 10 arg. Currently given: 1
Usage:
generate-int8-scale-table encoder.param encoder.bin decoder.param decoder.bin joiner.param joiner.bin encoder-scale-table.txt joiner-scale-table.txt wave_filenames.txt
Each line in wave_filenames.txt is a path to some 16k Hz mono wave file.
We need to create a file ``wave_filenames.txt``, in which we put
some calibration wave files. For testing purposes, we use the ``test_wavs``
from the pre-trained model repository `<https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>`_:
.. code-block:: bash
cd egs/librispeech/ASR
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
cat <<EOF > wave_filenames.txt
../test_wavs/1089-134686-0001.wav
../test_wavs/1221-135766-0001.wav
../test_wavs/1221-135766-0002.wav
EOF
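You can optionally check that each calibration wave is a 16 kHz mono file, as
the tool expects. A small sketch, assuming ``soxi`` from ``sox`` is installed:

.. code-block:: bash

   # Print the sample rate and channel count of every calibration wave.
   while read -r w; do
     echo "$w: $(soxi -r "$w") Hz, $(soxi -c "$w") channel(s)"
   done < wave_filenames.txt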
Now we can calculate the scales needed for quantization with the calibration data:
.. code-block:: bash
cd egs/librispeech/ASR
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
generate-int8-scale-table \
./encoder_jit_trace-pnnx.ncnn.param \
./encoder_jit_trace-pnnx.ncnn.bin \
./decoder_jit_trace-pnnx.ncnn.param \
./decoder_jit_trace-pnnx.ncnn.bin \
./joiner_jit_trace-pnnx.ncnn.param \
./joiner_jit_trace-pnnx.ncnn.bin \
./encoder-scale-table.txt \
./joiner-scale-table.txt \
./wave_filenames.txt
The output logs are given below:
.. literalinclude:: ./code/generate-int-8-scale-table-for-conv-emformer.txt
It generates the following two files:
.. code-block:: bash
$ ls -lh encoder-scale-table.txt joiner-scale-table.txt
-rw-r--r-- 1 kuangfangjun root 955K Jan 11 17:28 encoder-scale-table.txt
-rw-r--r-- 1 kuangfangjun root 18K Jan 11 17:28 joiner-scale-table.txt
.. caution::
In practice, you need more calibration data to compute an accurate scale table.
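If you want to build a larger calibration list from your own data, something
like the following works (the directory path is only an illustration):

.. code-block:: bash

   # Collect all of your own 16 kHz mono waves into the calibration list.
   find /path/to/your/calibration/wavs -name '*.wav' > wave_filenames.txt
   wc -l wave_filenames.txt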
Finally, let us use the scale table to quantize our models into ``int8``.
.. code-block:: bash
ncnn2int8
usage: ncnn2int8 [inparam] [inbin] [outparam] [outbin] [calibration table]
First, we quantize the encoder model:
.. code-block:: bash
cd egs/librispeech/ASR
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
ncnn2int8 \
./encoder_jit_trace-pnnx.ncnn.param \
./encoder_jit_trace-pnnx.ncnn.bin \
./encoder_jit_trace-pnnx.ncnn.int8.param \
./encoder_jit_trace-pnnx.ncnn.int8.bin \
./encoder-scale-table.txt
Next, we quantize the joiner model:
.. code-block:: bash
ncnn2int8 \
./joiner_jit_trace-pnnx.ncnn.param \
./joiner_jit_trace-pnnx.ncnn.bin \
./joiner_jit_trace-pnnx.ncnn.int8.param \
./joiner_jit_trace-pnnx.ncnn.int8.bin \
./joiner-scale-table.txt
The above two commands generate the following 4 files:
.. code-block:: bash
-rw-r--r-- 1 kuangfangjun root 99M Jan 11 17:34 encoder_jit_trace-pnnx.ncnn.int8.bin
-rw-r--r-- 1 kuangfangjun root 78K Jan 11 17:34 encoder_jit_trace-pnnx.ncnn.int8.param
-rw-r--r-- 1 kuangfangjun root 774K Jan 11 17:35 joiner_jit_trace-pnnx.ncnn.int8.bin
-rw-r--r-- 1 kuangfangjun root 496 Jan 11 17:35 joiner_jit_trace-pnnx.ncnn.int8.param
Congratulations! You have successfully quantized your model from ``float32`` to ``int8``.
.. caution::
``ncnn.int8.param`` and ``ncnn.int8.bin`` must be used in pairs.
You can replace ``ncnn.param`` and ``ncnn.bin`` with ``ncnn.int8.param``
and ``ncnn.int8.bin`` in `sherpa-ncnn`_ if you like.
For instance, to use only the ``int8`` encoder in ``sherpa-ncnn``, you can
replace the following invocation:
.. code-block:: bash
cd egs/librispeech/ASR
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
sherpa-ncnn \
../data/lang_bpe_500/tokens.txt \
./encoder_jit_trace-pnnx.ncnn.param \
./encoder_jit_trace-pnnx.ncnn.bin \
./decoder_jit_trace-pnnx.ncnn.param \
./decoder_jit_trace-pnnx.ncnn.bin \
./joiner_jit_trace-pnnx.ncnn.param \
./joiner_jit_trace-pnnx.ncnn.bin \
../test_wavs/1089-134686-0001.wav
with
.. code-block::
cd egs/librispeech/ASR
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
sherpa-ncnn \
../data/lang_bpe_500/tokens.txt \
./encoder_jit_trace-pnnx.ncnn.int8.param \
./encoder_jit_trace-pnnx.ncnn.int8.bin \
./decoder_jit_trace-pnnx.ncnn.param \
./decoder_jit_trace-pnnx.ncnn.bin \
./joiner_jit_trace-pnnx.ncnn.param \
./joiner_jit_trace-pnnx.ncnn.bin \
../test_wavs/1089-134686-0001.wav
The following table compares again the file sizes:
+----------------------------------------+------------+
| File name | File size |
+----------------------------------------+------------+
| encoder_jit_trace-pnnx.pt | 283 MB |
+----------------------------------------+------------+
| decoder_jit_trace-pnnx.pt | 1010 KB |
+----------------------------------------+------------+
| joiner_jit_trace-pnnx.pt | 3.0 MB |
+----------------------------------------+------------+
| encoder_jit_trace-pnnx.ncnn.bin (fp16) | 142 MB |
+----------------------------------------+------------+
| decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB |
+----------------------------------------+------------+
| joiner_jit_trace-pnnx.ncnn.bin (fp16) | 1.5 MB |
+----------------------------------------+------------+
| encoder_jit_trace-pnnx.ncnn.bin (fp32) | 283 MB |
+----------------------------------------+------------+
| joiner_jit_trace-pnnx.ncnn.bin (fp32) | 3.0 MB |
+----------------------------------------+------------+
| encoder_jit_trace-pnnx.ncnn.int8.bin | 99 MB |
+----------------------------------------+------------+
| joiner_jit_trace-pnnx.ncnn.int8.bin | 774 KB |
+----------------------------------------+------------+
You can see that the file sizes of the model after ``int8`` quantization
are much smaller.
.. hint::
Currently, only linear layers and convolutional layers are quantized
with ``int8``, so you don't see an exact ``4x`` reduction in file sizes.
.. note::
You need to test the recognition accuracy after ``int8`` quantization.
You can find the speed comparison at `<https://github.com/k2-fsa/sherpa-ncnn/issues/44>`_.
That's it! Have fun with `sherpa-ncnn`_!
View File
@ -0,0 +1,644 @@
.. _export_lstm_transducer_models_to_ncnn:
Export LSTM transducer models to ncnn
-------------------------------------
We use the pre-trained model from the following repository as an example:
`<https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03>`_
We will show you step by step how to export it to `ncnn`_ and run it with `sherpa-ncnn`_.
.. hint::
We use ``Ubuntu 18.04``, ``torch 1.13``, and ``Python 3.8`` for testing.
.. caution::
Please use a more recent version of PyTorch. For instance, ``torch 1.8``
may ``not`` work.
1. Download the pre-trained model
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. hint::
You have to install `git-lfs`_ before you continue.
.. code-block:: bash
cd egs/librispeech/ASR
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03
cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03
git lfs pull --include "exp/pretrained-iter-468000-avg-16.pt"
git lfs pull --include "data/lang_bpe_500/bpe.model"
cd ..
.. note::
We downloaded ``exp/pretrained-xxx.pt``, not ``exp/cpu-jit_xxx.pt``.
In the above code, we downloaded the pre-trained model into the directory
``egs/librispeech/ASR/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03``.
2. Install ncnn and pnnx
^^^^^^^^^^^^^^^^^^^^^^^^
Please refer to :ref:`export_for_ncnn_install_ncnn_and_pnnx` .
3. Export the model via torch.jit.trace()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
First, let us rename our pre-trained model:
.. code-block::
cd egs/librispeech/ASR
cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp
ln -s pretrained-iter-468000-avg-16.pt epoch-99.pt
cd ../..
Next, we use the following code to export our model:
.. code-block:: bash
dir=./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03
./lstm_transducer_stateless2/export-for-ncnn.py \
--exp-dir $dir/exp \
--bpe-model $dir/data/lang_bpe_500/bpe.model \
--epoch 99 \
--avg 1 \
--use-averaged-model 0 \
--num-encoder-layers 12 \
--encoder-dim 512 \
--rnn-hidden-size 1024
.. hint::
We have renamed our model to ``epoch-99.pt`` so that we can use ``--epoch 99``.
There is only one pre-trained model, so we use ``--avg 1 --use-averaged-model 0``.
If you have trained a model by yourself and if you have all checkpoints
available, please first use ``decode.py`` to tune ``--epoch --avg``
and select the best combination with ``--use-averaged-model 1``.
.. note::
You will see the following log output:
.. literalinclude:: ./code/export-lstm-transducer-for-ncnn-output.txt
The log shows the model has ``84176356`` parameters, i.e., ``~84 M``.
.. code-block::
ls -lh icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/pretrained-iter-468000-avg-16.pt
-rw-r--r-- 1 kuangfangjun root 324M Feb 17 10:34 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/pretrained-iter-468000-avg-16.pt
You can see that the file size of the pre-trained model is ``324 MB``, which
is roughly equal to ``84176356*4/1024/1024 = 321.107 MB``.
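You can reproduce this back-of-the-envelope computation in the shell; each
``float32`` parameter occupies 4 bytes:

.. code-block:: bash

   python3 -c 'print(84176356 * 4 / 1024 / 1024)'
   # prints about 321.107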
After running ``lstm_transducer_stateless2/export-for-ncnn.py``,
we will get the following files:
.. code-block:: bash
ls -lh icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/*pnnx.pt
-rw-r--r-- 1 kuangfangjun root 1010K Feb 17 11:22 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.pt
-rw-r--r-- 1 kuangfangjun root 318M Feb 17 11:22 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.pt
-rw-r--r-- 1 kuangfangjun root 3.0M Feb 17 11:22 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.pt
.. _lstm-transducer-step-4-export-torchscript-model-via-pnnx:
4. Export torchscript model via pnnx
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. hint::
Make sure you have set up the ``PATH`` environment variable
in :ref:`export_for_ncnn_install_ncnn_and_pnnx`. Otherwise,
it will throw an error saying that ``pnnx`` could not be found.
Now, it's time to export our models to `ncnn`_ via ``pnnx``.
.. code-block::
cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/
pnnx ./encoder_jit_trace-pnnx.pt
pnnx ./decoder_jit_trace-pnnx.pt
pnnx ./joiner_jit_trace-pnnx.pt
It will generate the following files:
.. code-block:: bash
ls -lh icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/*ncnn*{bin,param}
-rw-r--r-- 1 kuangfangjun root 503K Feb 17 11:32 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.ncnn.bin
-rw-r--r-- 1 kuangfangjun root 437 Feb 17 11:32 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.ncnn.param
-rw-r--r-- 1 kuangfangjun root 159M Feb 17 11:32 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.ncnn.bin
-rw-r--r-- 1 kuangfangjun root 21K Feb 17 11:32 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.ncnn.param
-rw-r--r-- 1 kuangfangjun root 1.5M Feb 17 11:33 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.ncnn.bin
-rw-r--r-- 1 kuangfangjun root 488 Feb 17 11:33 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.ncnn.param
There are two types of files:
- ``param``: It is a text file containing the model architectures. You can
use a text editor to view its content.
- ``bin``: It is a binary file containing the model parameters.
The table below compares the file sizes of the models before and after conversion via ``pnnx``:
.. see https://tableconvert.com/restructuredtext-generator
+----------------------------------+------------+
| File name | File size |
+==================================+============+
| encoder_jit_trace-pnnx.pt | 318 MB |
+----------------------------------+------------+
| decoder_jit_trace-pnnx.pt | 1010 KB |
+----------------------------------+------------+
| joiner_jit_trace-pnnx.pt | 3.0 MB |
+----------------------------------+------------+
| encoder_jit_trace-pnnx.ncnn.bin | 159 MB |
+----------------------------------+------------+
| decoder_jit_trace-pnnx.ncnn.bin | 503 KB |
+----------------------------------+------------+
| joiner_jit_trace-pnnx.ncnn.bin | 1.5 MB |
+----------------------------------+------------+
You can see that the file sizes of the models after conversion are about half
those of the models before conversion:
- encoder: 318 MB vs 159 MB
- decoder: 1010 KB vs 503 KB
- joiner: 3.0 MB vs 1.5 MB
The reason is that by default ``pnnx`` converts ``float32`` parameters
to ``float16``. A ``float32`` parameter occupies 4 bytes, while it is 2 bytes
for ``float16``. Thus, the converted model is roughly half the original size.
.. hint::
If you use ``pnnx ./encoder_jit_trace-pnnx.pt fp16=0``, then ``pnnx``
won't convert ``float32`` to ``float16``.
5. Test the exported models in icefall
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. note::
We assume you have set up the environment variable ``PYTHONPATH`` when
building `ncnn`_.
Now we have successfully converted our pre-trained model to `ncnn`_ format.
The generated 6 files are what we need. You can use the following code to
test the converted models:
.. code-block:: bash
python3 ./lstm_transducer_stateless2/streaming-ncnn-decode.py \
--tokens ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/data/lang_bpe_500/tokens.txt \
--encoder-param-filename ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.ncnn.param \
--encoder-bin-filename ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.ncnn.bin \
--decoder-param-filename ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.ncnn.param \
--decoder-bin-filename ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.ncnn.bin \
--joiner-param-filename ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.ncnn.param \
--joiner-bin-filename ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.ncnn.bin \
./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/test_wavs/1089-134686-0001.wav
.. hint::
`ncnn`_ supports only ``batch size == 1``, so ``streaming-ncnn-decode.py`` accepts
only 1 wave file as input.
The output is given below:
.. literalinclude:: ./code/test-streaming-ncnn-decode-lstm-transducer-libri.txt
Congratulations! You have successfully exported a model from PyTorch to `ncnn`_!
.. _lstm-modify-the-exported-encoder-for-sherpa-ncnn:
6. Modify the exported encoder for sherpa-ncnn
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In order to use the exported models in `sherpa-ncnn`_, we have to modify
``encoder_jit_trace-pnnx.ncnn.param``.
Let us have a look at the first few lines of ``encoder_jit_trace-pnnx.ncnn.param``:
.. code-block::
7767517
267 379
Input in0 0 1 in0
**Explanation** of the above three lines:
1. ``7767517``, it is a magic number and should not be changed.
2. ``267 379``, the first number ``267`` specifies the number of layers
in this file, while ``379`` specifies the number of intermediate outputs
of this file.
3. ``Input in0 0 1 in0``, ``Input`` is the layer type of this layer; ``in0``
is the layer name of this layer; ``0`` means this layer has no input;
``1`` means this layer has one output; ``in0`` is the output name of
this layer.
We need to add 1 extra line and also increment the number of layers.
The result looks like below:
.. code-block:: bash
7767517
268 379
SherpaMetaData sherpa_meta_data1 0 0 0=3 1=12 2=512 3=1024
Input in0 0 1 in0
**Explanation**
1. ``7767517``, it is still the same
2. ``268 379``, we have added an extra layer, so we need to update ``267`` to ``268``.
We don't need to change ``379`` since the newly added layer has no inputs or outputs.
3. ``SherpaMetaData sherpa_meta_data1 0 0 0=3 1=12 2=512 3=1024``
This line is newly added. Its explanation is given below:
- ``SherpaMetaData`` is the type of this layer. Must be ``SherpaMetaData``.
- ``sherpa_meta_data1`` is the name of this layer. Must be ``sherpa_meta_data1``.
- ``0 0`` means this layer has no inputs or output. Must be ``0 0``.
- ``0=3``, 0 is the key and 3 is the value. MUST be ``0=3``.
- ``1=12``, 1 is the key and 12 is the value of the
parameter ``--num-encoder-layers`` that you provided when running
``./lstm_transducer_stateless2/export-for-ncnn.py``.
- ``2=512``, 2 is the key and 512 is the value of the
parameter ``--encoder-dim`` that you provided when running
``./lstm_transducer_stateless2/export-for-ncnn.py``.
- ``3=1024``, 3 is the key and 1024 is the value of the
parameter ``--rnn-hidden-size`` that you provided when running
``./lstm_transducer_stateless2/export-for-ncnn.py``.
For ease of reference, we list the key-value pairs that you need to add
in the following table. If your model has a different setting, please
change the values for ``SherpaMetaData`` accordingly. Otherwise, you
will be ``SAD``.
+------+-----------------------------+
| key | value |
+======+=============================+
| 0 | 3 (fixed) |
+------+-----------------------------+
| 1 | ``--num-encoder-layers`` |
+------+-----------------------------+
| 2 | ``--encoder-dim`` |
+------+-----------------------------+
| 3 | ``--rnn-hidden-size`` |
+------+-----------------------------+
4. ``Input in0 0 1 in0``. No need to change it.
.. caution::
When you add a new layer ``SherpaMetaData``, please remember to update the
number of layers. In our case, update ``267`` to ``268``. Otherwise,
you will be SAD later.
.. hint::
After adding the new layer ``SherpaMetaData``, you cannot use this model
with ``streaming-ncnn-decode.py`` anymore since ``SherpaMetaData`` is
supported only in `sherpa-ncnn`_.
.. hint::
`ncnn`_ is very flexible. You can add new layers to it just by text-editing
the ``param`` file! You don't need to change the ``bin`` file.
Now you can use this model in `sherpa-ncnn`_.
Please refer to the following documentation:
- Linux/macOS/Windows/arm/aarch64: `<https://k2-fsa.github.io/sherpa/ncnn/install/index.html>`_
- ``Android``: `<https://k2-fsa.github.io/sherpa/ncnn/android/index.html>`_
- ``iOS``: `<https://k2-fsa.github.io/sherpa/ncnn/ios/index.html>`_
- Python: `<https://k2-fsa.github.io/sherpa/ncnn/python/index.html>`_
We have a list of pre-trained models that have been exported for `sherpa-ncnn`_:
- `<https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html>`_
You can find more usages there.
7. (Optional) int8 quantization with sherpa-ncnn
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This step is optional.
In this step, we describe how to quantize our model with ``int8``.
Change :ref:`lstm-transducer-step-4-export-torchscript-model-via-pnnx` to
disable ``fp16`` when using ``pnnx``:
.. code-block::
cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/
pnnx ./encoder_jit_trace-pnnx.pt fp16=0
pnnx ./decoder_jit_trace-pnnx.pt
pnnx ./joiner_jit_trace-pnnx.pt fp16=0
.. note::
We add ``fp16=0`` when exporting the encoder and joiner. `ncnn`_ does not
support quantizing the decoder model yet. We will update this documentation
once `ncnn`_ supports it (perhaps in 2023).
It will generate the following files:

.. code-block:: bash
ls -lh icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/*_jit_trace-pnnx.ncnn.{param,bin}
-rw-r--r-- 1 kuangfangjun root 503K Feb 17 11:32 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.ncnn.bin
-rw-r--r-- 1 kuangfangjun root 437 Feb 17 11:32 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.ncnn.param
-rw-r--r-- 1 kuangfangjun root 317M Feb 17 11:54 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.ncnn.bin
-rw-r--r-- 1 kuangfangjun root 21K Feb 17 11:54 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.ncnn.param
-rw-r--r-- 1 kuangfangjun root 3.0M Feb 17 11:54 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.ncnn.bin
-rw-r--r-- 1 kuangfangjun root 488 Feb 17 11:54 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.ncnn.param
Let us compare again the file sizes:
+----------------------------------------+------------+
| File name | File size |
+----------------------------------------+------------+
| encoder_jit_trace-pnnx.pt | 318 MB |
+----------------------------------------+------------+
| decoder_jit_trace-pnnx.pt | 1010 KB |
+----------------------------------------+------------+
| joiner_jit_trace-pnnx.pt | 3.0 MB |
+----------------------------------------+------------+
| encoder_jit_trace-pnnx.ncnn.bin (fp16) | 159 MB |
+----------------------------------------+------------+
| decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB |
+----------------------------------------+------------+
| joiner_jit_trace-pnnx.ncnn.bin (fp16) | 1.5 MB |
+----------------------------------------+------------+
| encoder_jit_trace-pnnx.ncnn.bin (fp32) | 317 MB |
+----------------------------------------+------------+
| joiner_jit_trace-pnnx.ncnn.bin (fp32) | 3.0 MB |
+----------------------------------------+------------+
You can see that the file sizes are doubled when we disable ``fp16``.
.. note::
You can again use ``streaming-ncnn-decode.py`` to test the exported models.
Next, follow :ref:`lstm-modify-the-exported-encoder-for-sherpa-ncnn`
to modify ``encoder_jit_trace-pnnx.ncnn.param``.
Change
.. code-block:: bash
7767517
267 379
Input in0 0 1 in0
to
.. code-block:: bash
7767517
268 379
SherpaMetaData sherpa_meta_data1 0 0 0=3 1=12 2=512 3=1024
Input in0 0 1 in0
.. caution::
Please follow :ref:`lstm-modify-the-exported-encoder-for-sherpa-ncnn`
to change the values for ``SherpaMetaData`` if your model uses a different setting.
Next, let us compile `sherpa-ncnn`_ since we will quantize our models within
`sherpa-ncnn`_.
.. code-block:: bash
# We will download sherpa-ncnn to $HOME/open-source/
# You can change it to anywhere you like.
cd $HOME
mkdir -p open-source
cd open-source
git clone https://github.com/k2-fsa/sherpa-ncnn
cd sherpa-ncnn
mkdir build
cd build
cmake ..
make -j 4
./bin/generate-int8-scale-table
export PATH=$HOME/open-source/sherpa-ncnn/build/bin:$PATH
The output of the above commands is:
.. code-block:: bash
(py38) kuangfangjun:build$ generate-int8-scale-table
Please provide 10 arg. Currently given: 1
Usage:
generate-int8-scale-table encoder.param encoder.bin decoder.param decoder.bin joiner.param joiner.bin encoder-scale-table.txt joiner-scale-table.txt wave_filenames.txt
Each line in wave_filenames.txt is a path to some 16k Hz mono wave file.
We need to create a file ``wave_filenames.txt``, in which we put
some calibration wave files. For testing purposes, we use the ``test_wavs``
from the pre-trained model repository
`<https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03>`_:
.. code-block:: bash
cd egs/librispeech/ASR
cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/
cat <<EOF > wave_filenames.txt
../test_wavs/1089-134686-0001.wav
../test_wavs/1221-135766-0001.wav
../test_wavs/1221-135766-0002.wav
EOF
Now we can calculate the scales needed for quantization with the calibration data:
.. code-block:: bash
cd egs/librispeech/ASR
cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/
generate-int8-scale-table \
./encoder_jit_trace-pnnx.ncnn.param \
./encoder_jit_trace-pnnx.ncnn.bin \
./decoder_jit_trace-pnnx.ncnn.param \
./decoder_jit_trace-pnnx.ncnn.bin \
./joiner_jit_trace-pnnx.ncnn.param \
./joiner_jit_trace-pnnx.ncnn.bin \
./encoder-scale-table.txt \
./joiner-scale-table.txt \
./wave_filenames.txt
The output logs are given below:
.. literalinclude:: ./code/generate-int-8-scale-table-for-lstm.txt
It generates the following two files:
.. code-block:: bash
ls -lh encoder-scale-table.txt joiner-scale-table.txt
-rw-r--r-- 1 kuangfangjun root 345K Feb 17 12:13 encoder-scale-table.txt
-rw-r--r-- 1 kuangfangjun root 17K Feb 17 12:13 joiner-scale-table.txt
.. caution::
In practice, you need more calibration data to compute an accurate scale table.
Finally, let us use the scale table to quantize our models into ``int8``.
.. code-block:: bash
ncnn2int8
usage: ncnn2int8 [inparam] [inbin] [outparam] [outbin] [calibration table]
First, we quantize the encoder model:
.. code-block:: bash
cd egs/librispeech/ASR
cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/
ncnn2int8 \
./encoder_jit_trace-pnnx.ncnn.param \
./encoder_jit_trace-pnnx.ncnn.bin \
./encoder_jit_trace-pnnx.ncnn.int8.param \
./encoder_jit_trace-pnnx.ncnn.int8.bin \
./encoder-scale-table.txt
Next, we quantize the joiner model:
.. code-block:: bash
ncnn2int8 \
./joiner_jit_trace-pnnx.ncnn.param \
./joiner_jit_trace-pnnx.ncnn.bin \
./joiner_jit_trace-pnnx.ncnn.int8.param \
./joiner_jit_trace-pnnx.ncnn.int8.bin \
./joiner-scale-table.txt
The above two commands generate the following 4 files:
.. code-block::
-rw-r--r-- 1 kuangfangjun root 218M Feb 17 12:19 encoder_jit_trace-pnnx.ncnn.int8.bin
-rw-r--r-- 1 kuangfangjun root 21K Feb 17 12:19 encoder_jit_trace-pnnx.ncnn.int8.param
-rw-r--r-- 1 kuangfangjun root 774K Feb 17 12:19 joiner_jit_trace-pnnx.ncnn.int8.bin
-rw-r--r-- 1 kuangfangjun root 496 Feb 17 12:19 joiner_jit_trace-pnnx.ncnn.int8.param
Congratulations! You have successfully quantized your model from ``float32`` to ``int8``.
.. caution::
``ncnn.int8.param`` and ``ncnn.int8.bin`` must be used in pairs.
You can replace ``ncnn.param`` and ``ncnn.bin`` with ``ncnn.int8.param``
and ``ncnn.int8.bin`` in `sherpa-ncnn`_ if you like.
For instance, to use only the ``int8`` encoder in ``sherpa-ncnn``, you can
replace the following invocation:
.. code-block::
cd egs/librispeech/ASR
cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/
sherpa-ncnn \
../data/lang_bpe_500/tokens.txt \
./encoder_jit_trace-pnnx.ncnn.param \
./encoder_jit_trace-pnnx.ncnn.bin \
./decoder_jit_trace-pnnx.ncnn.param \
./decoder_jit_trace-pnnx.ncnn.bin \
./joiner_jit_trace-pnnx.ncnn.param \
./joiner_jit_trace-pnnx.ncnn.bin \
../test_wavs/1089-134686-0001.wav
with
.. code-block:: bash
cd egs/librispeech/ASR
cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/
sherpa-ncnn \
../data/lang_bpe_500/tokens.txt \
./encoder_jit_trace-pnnx.ncnn.int8.param \
./encoder_jit_trace-pnnx.ncnn.int8.bin \
./decoder_jit_trace-pnnx.ncnn.param \
./decoder_jit_trace-pnnx.ncnn.bin \
./joiner_jit_trace-pnnx.ncnn.param \
./joiner_jit_trace-pnnx.ncnn.bin \
../test_wavs/1089-134686-0001.wav
The following table compares again the file sizes:
+----------------------------------------+------------+
| File name | File size |
+----------------------------------------+------------+
| encoder_jit_trace-pnnx.pt | 318 MB |
+----------------------------------------+------------+
| decoder_jit_trace-pnnx.pt | 1010 KB |
+----------------------------------------+------------+
| joiner_jit_trace-pnnx.pt | 3.0 MB |
+----------------------------------------+------------+
| encoder_jit_trace-pnnx.ncnn.bin (fp16) | 159 MB |
+----------------------------------------+------------+
| decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB |
+----------------------------------------+------------+
| joiner_jit_trace-pnnx.ncnn.bin (fp16) | 1.5 MB |
+----------------------------------------+------------+
| encoder_jit_trace-pnnx.ncnn.bin (fp32) | 317 MB |
+----------------------------------------+------------+
| joiner_jit_trace-pnnx.ncnn.bin (fp32) | 3.0 MB |
+----------------------------------------+------------+
| encoder_jit_trace-pnnx.ncnn.int8.bin | 218 MB |
+----------------------------------------+------------+
| joiner_jit_trace-pnnx.ncnn.int8.bin | 774 KB |
+----------------------------------------+------------+
You can see that the file size of the joiner model after ``int8`` quantization
is much smaller. However, the size of the encoder model is even larger than
the ``fp16`` counterpart. The reason is that `ncnn`_ currently does not support
quantizing ``LSTM`` layers into ``8-bit``. Please see
`<https://github.com/Tencent/ncnn/issues/4532>`_
.. hint::
Currently, only linear layers and convolutional layers are quantized
with ``int8``, so you don't see an exact ``4x`` reduction in file sizes.
.. note::
You need to test the recognition accuracy after ``int8`` quantization.
That's it! Have fun with `sherpa-ncnn`_!
View File
@ -0,0 +1,388 @@
.. _export_streaming_zipformer_transducer_models_to_ncnn:
Export streaming Zipformer transducer models to ncnn
----------------------------------------------------
We use the pre-trained model from the following repository as an example:
`<https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29>`_
We will show you step by step how to export it to `ncnn`_ and run it with `sherpa-ncnn`_.
.. hint::
We use ``Ubuntu 18.04``, ``torch 1.13``, and ``Python 3.8`` for testing.
.. caution::
Please use a more recent version of PyTorch. For instance, ``torch 1.8``
may ``not`` work.
1. Download the pre-trained model
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. hint::
You have to install `git-lfs`_ before you continue.
.. code-block:: bash
cd egs/librispeech/ASR
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
git lfs pull --include "exp/pretrained.pt"
git lfs pull --include "data/lang_bpe_500/bpe.model"
cd ..
.. note::
We downloaded ``exp/pretrained-xxx.pt``, not ``exp/cpu-jit_xxx.pt``.
In the above code, we downloaded the pre-trained model into the directory
``egs/librispeech/ASR/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29``.
2. Install ncnn and pnnx
^^^^^^^^^^^^^^^^^^^^^^^^
Please refer to :ref:`export_for_ncnn_install_ncnn_and_pnnx` .
3. Export the model via torch.jit.trace()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
First, let us rename our pre-trained model:
.. code-block::
cd egs/librispeech/ASR
cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
ln -s pretrained.pt epoch-99.pt
cd ../..
Next, we use the following code to export our model:
.. code-block:: bash
dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
./pruned_transducer_stateless7_streaming/export-for-ncnn.py \
--bpe-model $dir/data/lang_bpe_500/bpe.model \
--exp-dir $dir/exp \
--use-averaged-model 0 \
--epoch 99 \
--avg 1 \
\
--decode-chunk-len 32 \
--num-left-chunks 4 \
--num-encoder-layers "2,4,3,2,4" \
--feedforward-dims "1024,1024,2048,2048,1024" \
--nhead "8,8,8,8,8" \
--encoder-dims "384,384,384,384,384" \
--attention-dims "192,192,192,192,192" \
--encoder-unmasked-dims "256,256,256,256,256" \
--zipformer-downsampling-factors "1,2,4,8,2" \
--cnn-module-kernels "31,31,31,31,31" \
--decoder-dim 512 \
--joiner-dim 512
.. caution::
If your model has different configuration parameters, please change them accordingly.
.. hint::
We have renamed our model to ``epoch-99.pt`` so that we can use ``--epoch 99``.
There is only one pre-trained model, so we use ``--avg 1 --use-averaged-model 0``.
If you have trained a model by yourself and if you have all checkpoints
available, please first use ``decode.py`` to tune ``--epoch --avg``
and select the best combination with ``--use-averaged-model 1``.
.. note::
You will see the following log output:
.. literalinclude:: ./code/export-zipformer-transducer-for-ncnn-output.txt
The log shows the model has ``69920376`` parameters, i.e., ``~69.9 M``.
.. code-block:: bash
ls -lh icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/pretrained.pt
-rw-r--r-- 1 kuangfangjun root 269M Jan 12 12:53 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/pretrained.pt
You can see that the file size of the pre-trained model is ``269 MB``, which
is roughly equal to ``69920376*4/1024/1024 = 266.725 MB``.
After running ``pruned_transducer_stateless7_streaming/export-for-ncnn.py``,
we will get the following files:
.. code-block:: bash
ls -lh icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/*pnnx.pt
-rw-r--r-- 1 kuangfangjun root 1022K Feb 27 20:23 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/decoder_jit_trace-pnnx.pt
-rw-r--r-- 1 kuangfangjun root 266M Feb 27 20:23 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/encoder_jit_trace-pnnx.pt
-rw-r--r-- 1 kuangfangjun root 2.8M Feb 27 20:23 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/joiner_jit_trace-pnnx.pt
.. _zipformer-transducer-step-4-export-torchscript-model-via-pnnx:
4. Export torchscript model via pnnx
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. hint::
Make sure you have set up the ``PATH`` environment variable
in :ref:`export_for_ncnn_install_ncnn_and_pnnx`. Otherwise,
it will throw an error saying that ``pnnx`` could not be found.
Now, it's time to export our models to `ncnn`_ via ``pnnx``.
.. code-block::
cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/
pnnx ./encoder_jit_trace-pnnx.pt
pnnx ./decoder_jit_trace-pnnx.pt
pnnx ./joiner_jit_trace-pnnx.pt
It will generate the following files:
.. code-block:: bash
ls -lh icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/*ncnn*{bin,param}
-rw-r--r-- 1 kuangfangjun root 509K Feb 27 20:31 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/decoder_jit_trace-pnnx.ncnn.bin
-rw-r--r-- 1 kuangfangjun root 437 Feb 27 20:31 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/decoder_jit_trace-pnnx.ncnn.param
-rw-r--r-- 1 kuangfangjun root 133M Feb 27 20:30 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/encoder_jit_trace-pnnx.ncnn.bin
-rw-r--r-- 1 kuangfangjun root 152K Feb 27 20:30 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/encoder_jit_trace-pnnx.ncnn.param
-rw-r--r-- 1 kuangfangjun root 1.4M Feb 27 20:31 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/joiner_jit_trace-pnnx.ncnn.bin
-rw-r--r-- 1 kuangfangjun root 488 Feb 27 20:31 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/joiner_jit_trace-pnnx.ncnn.param
There are two types of files:
- ``param``: It is a text file containing the model architectures. You can
use a text editor to view its content.
- ``bin``: It is a binary file containing the model parameters.
The table below compares the file sizes of the models before and after conversion via ``pnnx``:
.. see https://tableconvert.com/restructuredtext-generator
+----------------------------------+------------+
| File name | File size |
+==================================+============+
| encoder_jit_trace-pnnx.pt | 266 MB |
+----------------------------------+------------+
| decoder_jit_trace-pnnx.pt | 1022 KB |
+----------------------------------+------------+
| joiner_jit_trace-pnnx.pt | 2.8 MB |
+----------------------------------+------------+
| encoder_jit_trace-pnnx.ncnn.bin | 133 MB |
+----------------------------------+------------+
| decoder_jit_trace-pnnx.ncnn.bin | 509 KB |
+----------------------------------+------------+
| joiner_jit_trace-pnnx.ncnn.bin | 1.4 MB |
+----------------------------------+------------+
You can see that the file sizes of the models after conversion are about half
those of the models before conversion:
- encoder: 266 MB vs 133 MB
- decoder: 1022 KB vs 509 KB
- joiner: 2.8 MB vs 1.4 MB
The reason is that by default ``pnnx`` converts ``float32`` parameters
to ``float16``. A ``float32`` parameter occupies 4 bytes, while it is 2 bytes
for ``float16``. Thus, the converted model is roughly half the original size.
.. hint::
If you use ``pnnx ./encoder_jit_trace-pnnx.pt fp16=0``, then ``pnnx``
won't convert ``float32`` to ``float16``.
5. Test the exported models in icefall
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. note::
We assume you have set up the environment variable ``PYTHONPATH`` when
building `ncnn`_.
Now we have successfully converted our pre-trained model to `ncnn`_ format.
The generated 6 files are what we need. You can use the following code to
test the converted models:
.. code-block:: bash
python3 ./pruned_transducer_stateless7_streaming/streaming-ncnn-decode.py \
--tokens ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/tokens.txt \
--encoder-param-filename ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/encoder_jit_trace-pnnx.ncnn.param \
--encoder-bin-filename ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/encoder_jit_trace-pnnx.ncnn.bin \
--decoder-param-filename ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/decoder_jit_trace-pnnx.ncnn.param \
--decoder-bin-filename ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/decoder_jit_trace-pnnx.ncnn.bin \
--joiner-param-filename ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/joiner_jit_trace-pnnx.ncnn.param \
--joiner-bin-filename ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/joiner_jit_trace-pnnx.ncnn.bin \
./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1089-134686-0001.wav
.. hint::
`ncnn`_ supports only ``batch size == 1``, so ``streaming-ncnn-decode.py`` accepts
only 1 wave file as input.
The output is given below:
.. literalinclude:: ./code/test-streaming-ncnn-decode-zipformer-transducer-libri.txt
Congratulations! You have successfully exported a model from PyTorch to `ncnn`_!
.. _zipformer-modify-the-exported-encoder-for-sherpa-ncnn:
6. Modify the exported encoder for sherpa-ncnn
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In order to use the exported models in `sherpa-ncnn`_, we have to modify
``encoder_jit_trace-pnnx.ncnn.param``.
Let us have a look at the first few lines of ``encoder_jit_trace-pnnx.ncnn.param``:
.. code-block::
7767517
2028 2547
Input in0 0 1 in0
**Explanation** of the above three lines:
1. ``7767517``, it is a magic number and should not be changed.
2. ``2028 2547``, the first number ``2028`` specifies the number of layers
in this file, while ``2547`` specifies the number of intermediate outputs
of this file.
3. ``Input in0 0 1 in0``, ``Input`` is the layer type of this layer; ``in0``
is the layer name of this layer; ``0`` means this layer has no input;
``1`` means this layer has one output; ``in0`` is the output name of
this layer.
We need to add 1 extra line and also increment the number of layers.
The result looks like below:
.. code-block:: bash
7767517
2029 2547
SherpaMetaData sherpa_meta_data1 0 0 0=2 1=32 2=4 3=7 15=1 -23316=5,2,4,3,2,4 -23317=5,384,384,384,384,384 -23318=5,192,192,192,192,192 -23319=5,1,2,4,8,2 -23320=5,31,31,31,31,31
Input in0 0 1 in0
**Explanation**
1. ``7767517``, it is still the same
2. ``2029 2547``, we have added an extra layer, so we need to update ``2028`` to ``2029``.
We don't need to change ``2547`` since the newly added layer has no inputs or outputs.
3. ``SherpaMetaData sherpa_meta_data1 0 0 0=2 1=32 2=4 3=7 15=1 -23316=5,2,4,3,2,4 -23317=5,384,384,384,384,384 -23318=5,192,192,192,192,192 -23319=5,1,2,4,8,2 -23320=5,31,31,31,31,31``
This line is newly added. Its explanation is given below:
- ``SherpaMetaData`` is the type of this layer. Must be ``SherpaMetaData``.
- ``sherpa_meta_data1`` is the name of this layer. Must be ``sherpa_meta_data1``.
- ``0 0`` means this layer has no inputs or output. Must be ``0 0``.
- ``0=2``, 0 is the key and 2 is the value. MUST be ``0=2``.
- ``1=32``, 1 is the key and 32 is the value of the
parameter ``--decode-chunk-len`` that you provided when running
``./pruned_transducer_stateless7_streaming/export-for-ncnn.py``.
- ``2=4``, 2 is the key and 4 is the value of the
parameter ``--num-left-chunks`` that you provided when running
``./pruned_transducer_stateless7_streaming/export-for-ncnn.py``.
- ``3=7``, 3 is the key and 7 is the value for the amount of padding
used in the Conv2DSubsampling layer. It should be 7 for zipformer
if you don't change zipformer.py.
- ``15=1``, attribute 15, this is the model version. Starting from
`sherpa-ncnn`_ v2.0, we require that the model version be >= 1.
- ``-23316=5,2,4,3,2,4``, attribute 16, this is an array attribute.
It is attribute 16 since -23300 - (-23316) = 16 (see the quick check after this list).
The first element of the array is the length of the array, which is 5 in our case.
``2,4,3,2,4`` is the value of ``--num-encoder-layers`` that you provided
when running ``./pruned_transducer_stateless7_streaming/export-for-ncnn.py``.
- ``-23317=5,384,384,384,384,384``, attribute 17.
The first element of the array is the length of the array, which is 5 in our case.
``384,384,384,384,384`` is the value of ``--encoder-dims`` that you provided
when running ``./pruned_transducer_stateless7_streaming/export-for-ncnn.py``.
- ``-23318=5,192,192,192,192,192``, attribute 18.
The first element of the array is the length of the array, which is 5 in our case.
``192,192,192,192,192`` is the value of ``--attention-dims`` that you provided
when running ``./pruned_transducer_stateless7_streaming/export-for-ncnn.py``.
- ``-23319=5,1,2,4,8,2``, attribute 19.
The first element of the array is the length of the array, which is 5 in our case.
``1,2,4,8,2`` is the value of ``--zipformer-downsampling-factors`` that you provided
when running ``./pruned_transducer_stateless7_streaming/export-for-ncnn.py``.
- ``-23320=5,31,31,31,31,31``, attribute 20.
The first element of the array is the length of the array, which is 5 in our case.
``31,31,31,31,31`` is the value of ``--cnn-module-kernels`` that you provided
when running ``./pruned_transducer_stateless7_streaming/export-for-ncnn.py``.
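You can verify the array-attribute numbering rule with a quick shell
computation:

.. code-block:: bash

   # key -23316 encodes array attribute 16
   echo $(( -23300 - (-23316) ))   # prints 16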
For ease of reference, we list the key-value pairs that you need to add
in the following table. If your model has a different setting, please
change the values for ``SherpaMetaData`` accordingly. Otherwise, you
will be ``SAD``.
+----------+--------------------------------------------+
| key | value |
+==========+============================================+
| 0 | 2 (fixed) |
+----------+--------------------------------------------+
| 1 | ``--decode-chunk-len`` |
+----------+--------------------------------------------+
| 2 | ``--num-left-chunks`` |
+----------+--------------------------------------------+
| 3 | 7 (if you don't change code) |
+----------+--------------------------------------------+
| 15 | 1 (The model version) |
+----------+--------------------------------------------+
|-23316 | ``--num-encoder-layers`` |
+----------+--------------------------------------------+
|-23317 | ``--encoder-dims`` |
+----------+--------------------------------------------+
|-23318 | ``--attention-dims`` |
+----------+--------------------------------------------+
|-23319 | ``--zipformer-downsampling-factors`` |
+----------+--------------------------------------------+
|-23320 | ``--cnn-module-kernels`` |
+----------+--------------------------------------------+
4. ``Input in0 0 1 in0``. No need to change it.
.. caution::
When you add a new layer ``SherpaMetaData``, please remember to update the
number of layers. In our case, update ``2028`` to ``2029``. Otherwise,
you will be SAD later.
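This edit can also be scripted; a minimal sketch (assuming GNU ``sed``, a
single line starting with ``Input``, and the exact counts shown above; adjust
the ``SherpaMetaData`` values to your own configuration):

.. code-block:: bash

   cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/

   # Bump the layer count on the second line: 2028 -> 2029
   sed -i '2s/^2028/2029/' encoder_jit_trace-pnnx.ncnn.param

   # Insert the SherpaMetaData line right before the Input layer
   sed -i '/^Input/i SherpaMetaData sherpa_meta_data1 0 0 0=2 1=32 2=4 3=7 15=1 -23316=5,2,4,3,2,4 -23317=5,384,384,384,384,384 -23318=5,192,192,192,192,192 -23319=5,1,2,4,8,2 -23320=5,31,31,31,31,31' encoder_jit_trace-pnnx.ncnn.param

   # Verify the first three lines
   head -n 3 encoder_jit_trace-pnnx.ncnn.param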
.. hint::
After adding the new layer ``SherpaMetaData``, you cannot use this model
with ``streaming-ncnn-decode.py`` anymore since ``SherpaMetaData`` is
supported only in `sherpa-ncnn`_.
.. hint::
`ncnn`_ is very flexible. You can add new layers to it just by text-editing
the ``param`` file! You don't need to change the ``bin`` file.
Now you can use this model in `sherpa-ncnn`_.
Please refer to the following documentation:
- Linux/macOS/Windows/arm/aarch64: `<https://k2-fsa.github.io/sherpa/ncnn/install/index.html>`_
- ``Android``: `<https://k2-fsa.github.io/sherpa/ncnn/android/index.html>`_
- ``iOS``: `<https://k2-fsa.github.io/sherpa/ncnn/ios/index.html>`_
- Python: `<https://k2-fsa.github.io/sherpa/ncnn/python/index.html>`_
We have a list of pre-trained models that have been exported for `sherpa-ncnn`_:
- `<https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html>`_
You can find more usages there.
View File
@ -1,12 +1,37 @@
Export to ncnn
==============
We support exporting the following models
to `ncnn <https://github.com/tencent/ncnn>`_:

- `Zipformer transducer models <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming>`_

- `LSTM transducer models <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/lstm_transducer_stateless2>`_

- `ConvEmformer transducer models <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/conv_emformer_transducer_stateless2>`_

We also provide `sherpa-ncnn`_
for performing speech recognition using `ncnn`_ with exported models.
It has been tested on the following platforms:

- Linux
- macOS
- Windows
- ``Android``
- ``iOS``
- ``Raspberry Pi``
- `爱芯派 <https://wiki.sipeed.com/hardware/zh/>`_ (`MAIX-III AXera-Pi <https://wiki.sipeed.com/hardware/en/maixIII/ax-pi/axpi.html>`_).
- `RV1126 <https://www.rock-chips.com/a/en/products/RV11_Series/2020/0427/1076.html>`_

`sherpa-ncnn`_ is self-contained and can be statically linked to produce
a binary containing everything needed. Please refer
to its documentation for details:

- `<https://k2-fsa.github.io/sherpa/ncnn/index.html>`_

.. toctree::

   export-ncnn-zipformer
   export-ncnn-conv-emformer
   export-ncnn-lstm

View File

@ -1,69 +1,104 @@
Export to ONNX
==============
In this section, we describe how to export models to ONNX.
In this section, we describe how to export models to `ONNX`_.
.. hint::
Only non-streaming conformer transducer models are tested.
Before you continue, please run:
.. code-block:: bash
pip install onnx
When to use it
--------------
In each recipe, there is a file called ``export-onnx.py``, which is used
to export trained models to `ONNX`_.
If you want to use an inference framework that supports ONNX
to run the pretrained model.
There is also a file named ``onnx_pretrained.py``, which you can use
to run the exported `ONNX`_ model in Python with `onnxruntime`_ and decode sound files.
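As a quick sanity check after export, you can also inspect an exported model
directly with `onnxruntime`_. Below is a minimal sketch; the file name is a
placeholder:
.. code-block:: python
# List the inputs/outputs of an exported encoder and print any metadata
# the export script attached to the model.
import onnxruntime

sess = onnxruntime.InferenceSession("encoder-epoch-99-avg-1.onnx")

for i in sess.get_inputs():
    print("input:", i.name, i.shape, i.type)
for o in sess.get_outputs():
    print("output:", o.name, o.shape, o.type)

print(sess.get_modelmeta().custom_metadata_map)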
sherpa-onnx
-----------
We have a separate repository `sherpa-onnx`_ for deploying your exported models
on various platforms such as:
- iOS
- Android
- Raspberry Pi
- Linux/macOS/Windows
How to export
-------------
Please see the documentation of `sherpa-onnx`_ for details:
We use
`<https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless3>`_
as an example in the following.
`<https://k2-fsa.github.io/sherpa/onnx/index.html>`_
Example
-------
In the following, we demonstrate how to export a streaming Zipformer pre-trained
model from
`<https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29>`_
to `ONNX`_.
Download the pre-trained model
------------------------------
.. hint::
We assume you have installed `git-lfs`_.
.. code-block:: bash
cd egs/librispeech/ASR
epoch=14
avg=2
./pruned_transducer_stateless3/export.py \
--exp-dir ./pruned_transducer_stateless3/exp \
--bpe-model data/lang_bpe_500/bpe.model \
--epoch $epoch \
--avg $avg \
--onnx 1
cd egs/librispeech/ASR
It will generate the following files inside ``pruned_transducer_stateless3/exp``:
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
- ``encoder.onnx``
- ``decoder.onnx``
- ``joiner.onnx``
- ``joiner_encoder_proj.onnx``
- ``joiner_decoder_proj.onnx``
pushd $repo
git lfs pull --include "data/lang_bpe_500/bpe.model"
git lfs pull --include "exp/pretrained.pt"
cd exp
ln -s pretrained.pt epoch-99.pt
popd
You can use ``./pruned_transducer_stateless3/onnx_pretrained.py`` to decode
waves with the generated files:
Export the model to ONNX
------------------------
.. code-block:: bash
./pruned_transducer_stateless3/onnx_pretrained.py \
--bpe-model ./data/lang_bpe_500/bpe.model \
--encoder-model-filename ./pruned_transducer_stateless3/exp/encoder.onnx \
--decoder-model-filename ./pruned_transducer_stateless3/exp/decoder.onnx \
--joiner-model-filename ./pruned_transducer_stateless3/exp/joiner.onnx \
--joiner-encoder-proj-model-filename ./pruned_transducer_stateless3/exp/joiner_encoder_proj.onnx \
--joiner-decoder-proj-model-filename ./pruned_transducer_stateless3/exp/joiner_decoder_proj.onnx \
/path/to/foo.wav \
/path/to/bar.wav \
/path/to/baz.wav
./pruned_transducer_stateless7_streaming/export-onnx.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--use-averaged-model 0 \
--epoch 99 \
--avg 1 \
--decode-chunk-len 32 \
--exp-dir $repo/exp/
.. warning::
How to use the exported model
-----------------------------
``export-onnx.py`` from different recipes has different options.
We also provide `<https://github.com/k2-fsa/sherpa-onnx>`_
for performing speech recognition using `onnxruntime <https://github.com/microsoft/onnxruntime>`_
with exported models.
It has been tested on Linux, macOS, and Windows.
In the above example, ``--decode-chunk-len`` is specific to the
streaming Zipformer. Other models won't have such an option.
It will generate the following 3 files in ``$repo/exp``:
- ``encoder-epoch-99-avg-1.onnx``
- ``decoder-epoch-99-avg-1.onnx``
- ``joiner-epoch-99-avg-1.onnx``
Decode sound files with exported ONNX models
--------------------------------------------
.. code-block:: bash
./pruned_transducer_stateless7_streaming/onnx_pretrained.py \
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
--tokens $repo/data/lang_bpe_500/tokens.txt \
$repo/test_wavs/1089-134686-0001.wav

View File

@ -0,0 +1,223 @@
Distillation with HuBERT
========================
This tutorial shows you how to perform knowledge distillation in `icefall`_
with the `LibriSpeech`_ dataset. The distillation method
used here is called "Multi Vector Quantization Knowledge Distillation" (MVQ-KD).
Please have a look at our paper `Predicting Multi-Codebook Vector Quantization Indexes for Knowledge Distillation <https://arxiv.org/abs/2211.00508>`_
for more details about MVQ-KD.
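As a rough illustration of the idea (a toy sketch under our reading of the
paper, not icefall's actual implementation): the teacher's frame-level
embeddings are quantized into several codebook indexes, and the student is
trained to predict those indexes from an intermediate layer via an auxiliary
cross-entropy loss:
.. code-block:: python
# Toy sketch of an MVQ-KD style auxiliary loss; all sizes are made up.
import torch.nn as nn
import torch.nn.functional as F

num_codebooks, codebook_size, dim = 8, 256, 384
head = nn.Linear(dim, num_codebooks * codebook_size)

def codebook_loss(student_hidden, codebook_indexes):
    # student_hidden: (N, T, dim), activations of an intermediate layer.
    # codebook_indexes: (N, T, num_codebooks), teacher-derived targets.
    logits = head(student_hidden).reshape(-1, codebook_size)
    return F.cross_entropy(logits, codebook_indexes.reshape(-1))

# total loss = transducer loss + codebook_loss_scale * codebook_loss(...)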
.. note::
This tutorial is based on recipe
`pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`_.
Currently, we only implement MVQ-KD in this recipe. However, MVQ-KD is theoretically applicable to all recipes
with only minor changes needed. Feel free to try out MVQ-KD in different recipes. If you
encounter any problems, please open an issue in `icefall <https://github.com/k2-fsa/icefall/issues>`_.
.. note::
We assume you have read the page :ref:`install icefall` and have setup
the environment for `icefall`_.
.. HINT::
We recommend that you use a GPU or several GPUs to run this recipe.
Data preparation
----------------
We first prepare the necessary training data for `LibriSpeech`_.
This is the same as in :ref:`non_streaming_librispeech_pruned_transducer_stateless`.
.. hint::
The data preparation is the same as in other LibriSpeech recipes.
If you have finished this step, you can skip directly to :ref:`codebook_index_preparation`.
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./prepare.sh
The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
All you need to do is to run it.
The data preparation contains several stages. You can use the following two
options:
- ``--stage``
- ``--stop-stage``
to control which stage(s) should be run. By default, all stages are executed.
For example,
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./prepare.sh --stage 0 --stop-stage 0 # run only stage 0
$ ./prepare.sh --stage 2 --stop-stage 5 # run from stage 2 to stage 5
.. HINT::
If you have pre-downloaded the `LibriSpeech`_
dataset and the `musan`_ dataset, say,
they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
``./prepare.sh`` won't re-download them.
.. NOTE::
All files generated by ``./prepare.sh``, e.g., features, lexicon, etc.,
are saved in the ``./data`` directory.
We provide the following YouTube video showing how to run ``./prepare.sh``.
.. note::
To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
`<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
.. youtube:: ofEIoJL-mGM
.. _codebook_index_preparation:
Codebook index preparation
--------------------------
Here, we prepare necessary data for MVQ-KD. This requires the generation
of codebook indexes (please read our `paper <https://arxiv.org/abs/2211.00508>`_
if you are interested in the details). In this tutorial, we use the pre-computed
codebook indexes for convenience. The only thing you need to do is to
run `./distillation_with_hubert.sh <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/distillation_with_hubert.sh>`_.
.. note::
There are 5 stages in total; the first and second stages will be skipped automatically
if you choose to download the codebook indexes prepared by `icefall`_.
Of course, you can also extract and compute the codebook indexes yourself. This
requires you to download a HuBERT-XL model, and extracting the codebook
indexes can take a while.
As usual, you can control the stages you want to run by specifying the following
two options:
- ``--stage``
- ``--stop-stage``
For example,
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./distillation_with_hubert.sh --stage 0 --stop-stage 0 # run only stage 0
$ ./distillation_with_hubert.sh --stage 2 --stop-stage 4 # run from stage 2 to stage 4
Here are a few options in `./distillation_with_hubert.sh <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/distillation_with_hubert.sh>`_
you need to know before you proceed.
- ``--full_libri`` If True, use the full 960h of data. Otherwise only ``train-clean-100`` will be used.
- ``--use_extracted_codebook`` If True, the first two stages will be skipped and the codebook
indexes uploaded by us will be downloaded.
Since we are using the pre-computed codebook indexes, we set
``use_extracted_codebook=True``. If you want to do full `LibriSpeech`_
experiments, please set ``full_libri=True``.
The following command downloads the pre-computed codebook indexes
and prepares MVQ-augmented training manifests.
.. code-block:: bash
$ ./distillation_with_hubert.sh --stage 2 --stop-stage 2 # run only stage 2
Please see the
following screenshot for the output of an example execution.
.. figure:: ./images/distillation_codebook.png
:width: 800
:alt: Downloading codebook indexes and preparing training manifest.
:align: center
Downloading codebook indexes and preparing training manifest.
.. hint::
The codebook indexes we prepared for you in this tutorial
are extracted from the 36-th layer of a fine-tuned HuBERT-XL model
with 8 codebooks. If you want to try other configurations, please
set ``use_extracted_codebook=False`` and set ``embedding_layer`` and
``num_codebooks`` by yourself.
Now, you should see the following files under the directory ``./data/vq_fbank_layer36_cb8``.
.. figure:: ./images/distillation_directory.png
:width: 800
:alt: MVQ-augmented training manifests
:align: center
MVQ-augmented training manifests.
Voila! You are ready to perform knowledge distillation training now!
Training
--------
To perform training, please run stage 3 by executing the following command.
.. code-block:: bash
$ ./distillation_with_hubert.sh --stage 3 --stop-stage 3 # run MVQ training
Here is the code snippet for training:
.. code-block:: bash
WORLD_SIZE=$(echo ${CUDA_VISIBLE_DEVICES} | awk '{n=split($1, _, ","); print n}')
./pruned_transducer_stateless6/train.py \
--manifest-dir ./data/vq_fbank_layer36_cb8 \
--master-port 12359 \
--full-libri $full_libri \
--spec-aug-time-warp-factor -1 \
--max-duration 300 \
--world-size ${WORLD_SIZE} \
--num-epochs 30 \
--exp-dir $exp_dir \
--enable-distillation True \
--codebook-loss-scale 0.01
A few arguments in the above training command deserve attention.
- ``--enable-distillation`` If True, knowledge distillation training is enabled.
- ``--codebook-loss-scale`` The scale of the knowledge distillation loss.
- ``--manifest-dir`` The path to the MVQ-augmented manifest.
Decoding
--------
After training finishes, you can test the performance using
the following command.
.. code-block:: bash
export CUDA_VISIBLE_DEVICES=0
./pruned_transducer_stateless6/decode.py \
--decoding-method "modified_beam_search" \
--epoch 30 \
--avg 10 \
--max-duration 200 \
--exp-dir $exp_dir \
--enable-distillation True
You should get similar results as `here <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS-100hours.md#distillation-with-hubert>`_.
That's all! Feel free to experiment with your own setups and report your results.
If you encounter any problems during training, please open up an issue `here <https://github.com/k2-fsa/icefall/issues>`_.

Binary file not shown.

After

Width:  |  Height:  |  Size: 56 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 43 KiB

View File

@ -9,3 +9,4 @@ LibriSpeech
pruned_transducer_stateless
zipformer_mmi
zipformer_ctc_blankskip
distillation

View File

@ -1,3 +1,5 @@
.. _non_streaming_librispeech_pruned_transducer_stateless:
Pruned transducer statelessX
============================

View File

@ -299,11 +299,11 @@ to run the training part first.
- (1) ``epoch-1.pt``, ``epoch-2.pt``, ..., which are saved at the end
of each epoch. You can pass ``--epoch`` to
``pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py`` to use them.
``pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py`` to use them.
- (2) ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ..., which are saved
every ``--save-every-n`` batches. You can pass ``--iter`` to
``pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py`` to use them.
``pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py`` to use them.
We suggest that you try both types of checkpoints and choose the one
that produces the lowest WERs.
@ -311,7 +311,7 @@ to run the training part first.
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py --help
$ ./pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py --help
shows the options for decoding.
@ -320,7 +320,7 @@ The following shows the example using ``epoch-*.pt``:
.. code-block:: bash
for m in greedy_search fast_beam_search modified_beam_search; do
./pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py \
./pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py \
--epoch 30 \
--avg 13 \
--exp-dir pruned_transducer_stateless7_ctc_bs/exp \
@ -333,7 +333,7 @@ To test CTC branch, you can use the following command:
.. code-block:: bash
for m in ctc-decoding 1best; do
./pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py \
./pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py \
--epoch 30 \
--avg 13 \
--exp-dir pruned_transducer_stateless7_ctc_bs/exp \
@ -367,7 +367,7 @@ It will generate a file ``./pruned_transducer_stateless7_ctc_bs/exp/pretrained.p
.. hint::
To use the generated ``pretrained.pt`` for ``pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py``,
To use the generated ``pretrained.pt`` for ``pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py``,
you can run:
.. code-block:: bash
@ -376,7 +376,7 @@ It will generate a file ``./pruned_transducer_stateless7_ctc_bs/exp/pretrained.p
ln -s pretrained.pt epoch-9999.pt
And then pass ``--epoch 9999 --avg 1 --use-averaged-model 0`` to
``./pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py``.
``./pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py``.
To use the exported model with ``./pruned_transducer_stateless7_ctc_bs/pretrained.py``, you
can run:
@ -447,7 +447,8 @@ Download pretrained models
If you don't want to train from scratch, you can download the pretrained models
by visiting the following links:
- `<https://huggingface.co/yfyeung/icefall-asr-librispeech-pruned_transducer_stateless7_ctc_bs-2022-12-14>`_
- trained on LibriSpeech 100h: `<https://huggingface.co/yfyeung/icefall-asr-librispeech-pruned_transducer_stateless7_ctc_bs-2022-12-14>`_
- trained on LibriSpeech 960h: `<https://huggingface.co/yfyeung/icefall-asr-librispeech-pruned_transducer_stateless7_ctc_bs-2023-01-29>`_
See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
for the details of the above pretrained models.

View File

@ -30,8 +30,9 @@ In icefall, we implement the streaming conformer the way just like what `WeNet <
See :doc:`Pruned transducer statelessX <librispeech/pruned_transducer_stateless>` for more details.
.. HINT::
If you want to adapt a non-streaming conformer model to be streaming, please refer
to `this pull request <https://github.com/k2-fsa/icefall/pull/454>`_.
If you want to modify a non-streaming conformer recipe to support both streaming and non-streaming, please refer
to `this pull request <https://github.com/k2-fsa/icefall/pull/454>`_. After adding the code needed for streaming training,
you have to re-train it with the extra arguments mentioned in the docs above to get a streaming model.
Streaming Emformer

View File

@ -515,133 +515,6 @@ To use the generated files with ``./lstm_transducer_stateless2/jit_pretrained``:
Please see `<https://k2-fsa.github.io/sherpa/python/streaming_asr/lstm/english/server.html>`_
for how to use the exported models in ``sherpa``.
.. _export-model-for-ncnn:
Export model for ncnn
~~~~~~~~~~~~~~~~~~~~~
We support exporting pretrained LSTM transducer models to
`ncnn <https://github.com/tencent/ncnn>`_ using
`pnnx <https://github.com/Tencent/ncnn/tree/master/tools/pnnx>`_.
First, let us install a modified version of ``ncnn``:
.. code-block:: bash
git clone https://github.com/csukuangfj/ncnn
cd ncnn
git submodule update --recursive --init
# Note: We don't use "python setup.py install" or "pip install ." here
mkdir -p build-wheel
cd build-wheel
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DNCNN_PYTHON=ON \
-DNCNN_BUILD_BENCHMARK=OFF \
-DNCNN_BUILD_EXAMPLES=OFF \
-DNCNN_BUILD_TOOLS=ON \
..
make -j4
cd ..
# Note: $PWD here is /path/to/ncnn
export PYTHONPATH=$PWD/python:$PYTHONPATH
export PATH=$PWD/tools/pnnx/build/src:$PATH
export PATH=$PWD/build-wheel/tools/quantize:$PATH
# now build pnnx
cd tools/pnnx
mkdir build
cd build
cmake ..
make -j4
./src/pnnx
.. note::
We assume that you have added the path to the binary ``pnnx`` to the
environment variable ``PATH``.
We also assume that you have added ``build/tools/quantize`` to the environment
variable ``PATH`` so that you are able to use ``ncnn2int8`` later.
Second, let us export the model using ``torch.jit.trace()`` that is suitable
for ``pnnx``:
.. code-block:: bash
iter=468000
avg=16
./lstm_transducer_stateless2/export.py \
--exp-dir ./lstm_transducer_stateless2/exp \
--bpe-model data/lang_bpe_500/bpe.model \
--iter $iter \
--avg $avg \
--pnnx 1
It will generate 3 files:
- ``./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.pt``
- ``./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.pt``
- ``./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.pt``
Third, convert torchscript model to ``ncnn`` format:
.. code-block::
pnnx ./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.pt
pnnx ./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.pt
pnnx ./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.pt
It will generate the following files:
- ``./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.param``
- ``./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.bin``
- ``./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.param``
- ``./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.bin``
- ``./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.param``
- ``./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.bin``
To use the above generated files, run:
.. code-block:: bash
./lstm_transducer_stateless2/ncnn-decode.py \
--bpe-model-filename ./data/lang_bpe_500/bpe.model \
--encoder-param-filename ./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.param \
--encoder-bin-filename ./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.bin \
--decoder-param-filename ./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.param \
--decoder-bin-filename ./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.bin \
--joiner-param-filename ./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.param \
--joiner-bin-filename ./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.bin \
/path/to/foo.wav
.. code-block:: bash
./lstm_transducer_stateless2/streaming-ncnn-decode.py \
--bpe-model-filename ./data/lang_bpe_500/bpe.model \
--encoder-param-filename ./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.param \
--encoder-bin-filename ./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.bin \
--decoder-param-filename ./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.param \
--decoder-bin-filename ./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.bin \
--joiner-param-filename ./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.param \
--joiner-bin-filename ./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.bin \
/path/to/foo.wav
To use the above generated files in C++, please see
`<https://github.com/k2-fsa/sherpa-ncnn>`_
It is able to generate a statically linked executable that can be run on Linux, Windows,
macOS, Raspberry Pi, etc., without external dependencies.
Download pretrained models
--------------------------

View File

@ -391,18 +391,14 @@ def save_results(
):
test_set_wers = dict()
for key, results in results_dict.items():
recog_path = (
params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
)
recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
results = sorted(results)
store_transcripts(filename=recog_path, texts=results)
logging.info(f"The transcripts are stored in {recog_path}")
# The following prints out WERs, per-word error statistics and aligned
# ref/hyp pairs.
errs_filename = (
params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
)
errs_filename = params.res_dir / f"errs-{test_set_name}-{params.suffix}.txt"
with open(errs_filename, "w") as f:
wer = write_error_stats(
f, f"{test_set_name}-{key}", results, enable_log=True
@ -412,9 +408,7 @@ def save_results(
logging.info("Wrote detailed error stats to {}".format(errs_filename))
test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
errs_info = (
params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
)
errs_info = params.res_dir / f"wer-summary-{test_set_name}-{params.suffix}.txt"
with open(errs_info, "w") as f:
print("settings\tWER", file=f)
for key, val in test_set_wers:

View File

@ -1,7 +1,7 @@
# Introduction
Please refer to <https://icefall.readthedocs.io/en/latest/recipes/aishell/index.html>
Please refer to <https://icefall.readthedocs.io/en/latest/recipes/Non-streaming-ASR/aishell/index.html>
for how to run models in this recipe.
@ -17,6 +17,7 @@ The following table lists the differences among them.
| `transducer_stateless_modified` | Conformer | Embedding + Conv1d | with modified transducer from `optimized_transducer` |
| `transducer_stateless_modified-2` | Conformer | Embedding + Conv1d | with modified transducer from `optimized_transducer` + extra data |
| `pruned_transducer_stateless3` | Conformer (reworked) | Embedding + Conv1d | pruned RNN-T + reworked model with random combiner + using aidatatang_20zh as extra data|
| `pruned_transducer_stateless7` | Zipformer | Embedding | pruned RNN-T + zipformer encoder + stateless decoder with context-size 1 |
The decoder in `transducer_stateless` is modified from the paper
[Rnn-Transducer with Stateless Prediction Network](https://ieeexplore.ieee.org/document/9054419/).
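For a rough feel of what "stateless" means here (an illustrative sketch, not
the recipes' actual code): the prediction network is just an embedding over the
last ``context-size`` output tokens followed by a 1-D convolution, with no
recurrent state:

```python
import torch
import torch.nn as nn

class ToyStatelessDecoder(nn.Module):
    # Illustrative only; dimensions and names are made up.
    def __init__(self, vocab_size=500, dim=512, context_size=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, dim)
        self.conv = nn.Conv1d(dim, dim, kernel_size=context_size)

    def forward(self, y):
        # y: (N, context_size), the most recent output tokens.
        emb = self.embedding(y).permute(0, 2, 1)  # (N, dim, context_size)
        out = self.conv(emb)                      # (N, dim, 1)
        return out.permute(0, 2, 1)               # (N, 1, dim)
```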

View File

@ -2,6 +2,109 @@
### Aishell training result (Stateless Transducer)
#### Pruned transducer stateless 7
[./pruned_transducer_stateless7](./pruned_transducer_stateless7)
It uses Zipformer with pruned RNN-T loss.
| | test | dev | comment |
|------------------------|------|------|---------------------------------------|
| greedy search | 5.02 | 4.61 | --epoch 42 --avg 6 --max-duration 600 |
| modified beam search | 4.81 | 4.4 | --epoch 42 --avg 6 --max-duration 600 |
| fast beam search | 4.91 | 4.52 | --epoch 42 --avg 6 --max-duration 600 |
Training command is:
```bash
./prepare.sh
export CUDA_VISIBLE_DEVICES="0,1"
./pruned_transducer_stateless7/train.py \
--world-size 2 \
--num-epochs 50 \
--start-epoch 1 \
--use-fp16 1 \
--exp-dir pruned_transducer_stateless7/exp \
--context-size 1 \
--max-duration 300
```
**Caution**: It uses `--context-size=1`.
The tensorboard log is available at
<https://tensorboard.dev/experiment/MHYo3ApfQxaCdYLr38cQOQ>
The decoding command is:
```bash
for m in greedy_search modified_beam_search fast_beam_search ; do
./pruned_transducer_stateless7/decode.py \
--epoch 42 \
--avg 6 \
--exp-dir ./pruned_transducer_stateless7/exp \
--lang-dir data/lang_char \
--max-duration 300 \
--context-size 1 \
--decoding-method $m
done
```
Pretrained models, training logs, decoding logs, and decoding results
are available at
<https://huggingface.co/marcoyang/icefall-asr-aishell-zipformer-pruned-transducer-stateless7-2023-03-21>
#### Pruned transducer stateless 7 (zipformer)
See <https://github.com/k2-fsa/icefall/pull/986>
[./pruned_transducer_stateless7_bbpe](./pruned_transducer_stateless7_bbpe)
**Note**: The modeling units are byte-level BPEs.
The best results I have gotten are:
Vocab size | Greedy search (dev & test) | Modified beam search (dev & test) | Fast beam search (dev & test) | Fast beam search LG (dev & test) | comments
-- | -- | -- | -- | -- | --
500 | 4.31 & 4.59 | 4.25 & 4.54 | 4.27 & 4.55 | 4.07 & 4.38 | --epoch 48 --avg 29
The training command:
```
export CUDA_VISIBLE_DEVICES="4,5,6,7"
./pruned_transducer_stateless7_bbpe/train.py \
--world-size 4 \
--num-epochs 50 \
--start-epoch 1 \
--use-fp16 1 \
--max-duration 800 \
--bpe-model data/lang_bbpe_500/bbpe.model \
--exp-dir pruned_transducer_stateless7_bbpe/exp \
--lr-epochs 6 \
--master-port 12535
```
The decoding command:
```
for m in greedy_search modified_beam_search fast_beam_search fast_beam_search_LG; do
./pruned_transducer_stateless7_bbpe/decode.py \
--epoch 48 \
--avg 29 \
--exp-dir ./pruned_transducer_stateless7_bbpe/exp \
--max-sym-per-frame 1 \
--ngram-lm-scale 0.25 \
--ilme-scale 0.2 \
--bpe-model data/lang_bbpe_500/bbpe.model \
--max-duration 2000 \
--decoding-method $m
done
```
The pretrained model is available at: https://huggingface.co/pkufool/icefall_asr_aishell_pruned_transducer_stateless7_bbpe
#### Pruned transducer stateless 3
See <https://github.com/k2-fsa/icefall/pull/436>
@ -15,6 +118,8 @@ It uses pruned RNN-T.
|------------------------|------|------|---------------------------------------|
| greedy search | 5.39 | 5.09 | --epoch 29 --avg 5 --max-duration 600 |
| modified beam search | 5.05 | 4.79 | --epoch 29 --avg 5 --max-duration 600 |
| modified beam search + RNNLM shallow fusion | 4.73 | 4.53 | --epoch 29 --avg 5 --max-duration 600 |
| modified beam search + LODR | 4.57 | 4.37 | --epoch 29 --avg 5 --max-duration 600 |
| fast beam search | 5.13 | 4.91 | --epoch 29 --avg 5 --max-duration 600 |
Training command is:
@ -73,6 +178,78 @@ for epoch in 29; do
done
```
We provide the option of shallow fusion with an RNN language model. The pre-trained language model is
available at <https://huggingface.co/marcoyang/icefall-aishell-rnn-lm>. To decode with the language model,
please use the following command:
```bash
# download pre-trained model
git lfs install
git clone https://huggingface.co/csukuangfj/icefall-aishell-pruned-transducer-stateless3-2022-06-20
aishell_exp=icefall-aishell-pruned-transducer-stateless3-2022-06-20/
pushd ${aishell_exp}/exp
ln -s pretrained-epoch-29-avg-5-torch-1.10.0.pt epoch-99.pt
popd
# download RNN LM
git lfs install
git clone https://huggingface.co/marcoyang/icefall-aishell-rnn-lm
rnnlm_dir=icefall-aishell-rnn-lm
# RNNLM shallow fusion
for lm_scale in $(seq 0.26 0.02 0.34); do
python ./pruned_transducer_stateless3/decode.py \
--epoch 99 \
--avg 1 \
--lang-dir ${aishell_exp}/data/lang_char \
--exp-dir ${aishell_exp}/exp \
--use-averaged-model False \
--decoding-method modified_beam_search_lm_shallow_fusion \
--use-shallow-fusion 1 \
--lm-type rnn \
--lm-exp-dir ${rnnlm_dir}/exp \
--lm-epoch 99 \
--lm-scale $lm_scale \
--lm-avg 1 \
--rnn-lm-embedding-dim 2048 \
--rnn-lm-hidden-dim 2048 \
--rnn-lm-num-layers 2 \
--lm-vocab-size 4336
done
# RNNLM Low-order density ratio (LODR) with a 2-gram
cp ${rnnlm_dir}/2gram.fst.txt ${aishell_exp}/data/lang_char/2gram.fst.txt
for lm_scale in 0.48; do
for LODR_scale in -0.28; do
python ./pruned_transducer_stateless3/decode.py \
--epoch 99 \
--avg 1 \
--lang-dir ${aishell_exp}/data/lang_char \
--exp-dir ${aishell_exp}/exp \
--use-averaged-model False \
--decoding-method modified_beam_search_LODR \
--use-shallow-fusion 1 \
--lm-type rnn \
--lm-exp-dir ${rnnlm_dir}/exp \
--lm-epoch 99 \
--lm-scale $lm_scale \
--lm-avg 1 \
--rnn-lm-embedding-dim 2048 \
--rnn-lm-hidden-dim 2048 \
--rnn-lm-num-layers 2 \
--lm-vocab-size 4336 \
--tokens-ngram 2 \
--backoff-id 4336 \
--ngram-lm-scale $LODR_scale
done
done
```
Pretrained models, training logs, decoding logs, and decoding results
are available at
<https://huggingface.co/csukuangfj/icefall-aishell-pruned-transducer-stateless3-2022-06-20>

View File

@ -0,0 +1 @@
../../../librispeech/ASR/local/compile_lg.py

View File

@ -33,6 +33,7 @@ and generates the following files in the directory `lang_dir`:
- tokens.txt
"""
import argparse
import re
from pathlib import Path
from typing import Dict, List
@ -189,8 +190,22 @@ def generate_tokens(text_file: str) -> Dict[str, int]:
return tokens
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--lang-dir",
type=str,
help="""Input and output directory.
It should contain the bpe.model and words.txt
""",
)
return parser.parse_args()
def main():
lang_dir = Path("data/lang_char")
args = get_args()
lang_dir = Path(args.lang_dir)
text_file = lang_dir / "text"
word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt")

View File

@ -0,0 +1,164 @@
#!/usr/bin/env python3
# Copyright (c) 2021 Xiaomi Corporation (authors: Daniel Povey
# Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script takes a `tokens.txt` and a text file such as
./download/lm/aishell-transcript.txt
and outputs the LM training data to a supplied directory such
as data/lm_training_char. The format is as follows:
It creates a PyTorch archive (.pt file), say data/lm_training.pt, which is a
representation of a dict with the same format as in the librispeech recipe.
"""
import argparse
import logging
from pathlib import Path
import k2
import torch
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--lang-char",
type=str,
help="""Lang dir of asr model, e.g. data/lang_char""",
)
parser.add_argument(
"--lm-data",
type=str,
help="""Input LM training data as text, e.g.
download/lm/aishell-train-word.txt""",
)
parser.add_argument(
"--lm-archive",
type=str,
help="""Path to output archive, e.g. data/lm_training_char/lm_data.pt;
look at the source of this script to see the format.""",
)
return parser.parse_args()
def main():
args = get_args()
if Path(args.lm_archive).exists():
logging.warning(f"{args.lm_archive} exists - skipping")
return
# make token_dict from tokens.txt in order to map characters to tokens.
token_dict = {}
token_file = args.lang_char + "/tokens.txt"
with open(token_file, "r") as f:
for line in f.readlines():
line_list = line.split()
token_dict[line_list[0]] = int(line_list[1])
# word2index is a dictionary from words to integer ids. No need to reserve
# space for epsilon, etc.; the words are just used as a convenient way to
# compress the sequences of tokens.
word2index = dict()
word2token = [] # Will be a list-of-list-of-int, representing tokens.
sentences = [] # Will be a list-of-list-of-int, representing word-ids.
if "aishell-lm" in args.lm_data:
num_lines_in_total = 120098.0
step = 50000
elif "valid" in args.lm_data:
num_lines_in_total = 14326.0
step = 3000
elif "test" in args.lm_data:
num_lines_in_total = 7176.0
step = 3000
else:
num_lines_in_total = None
step = None
processed = 0
with open(args.lm_data) as f:
while True:
line = f.readline()
if line == "":
break
if step and processed % step == 0:
logging.info(
f"Processed number of lines: {processed} "
f"({processed / num_lines_in_total * 100: .3f}%)"
)
processed += 1
line_words = line.split()
for w in line_words:
if w not in word2index:
w_token = []
for t in w:
if t in token_dict:
w_token.append(token_dict[t])
else:
w_token.append(token_dict["<unk>"])
word2index[w] = len(word2token)
word2token.append(w_token)
sentences.append([word2index[w] for w in line_words])
logging.info("Constructing ragged tensors")
words = k2.ragged.RaggedTensor(word2token)
sentences = k2.ragged.RaggedTensor(sentences)
output = dict(words=words, sentences=sentences)
num_sentences = sentences.dim0
logging.info(f"Computing sentence lengths, num_sentences: {num_sentences}")
sentence_lengths = [0] * num_sentences
for i in range(num_sentences):
if step and i % step == 0:
logging.info(
f"Processed number of lines: {i} ({i / num_sentences * 100: .3f}%)"
)
word_ids = sentences[i]
# NOTE: If word_ids is a tensor with only 1 entry,
# token_ids is a torch.Tensor
token_ids = words[word_ids]
if isinstance(token_ids, k2.RaggedTensor):
token_ids = token_ids.values
# token_ids is a 1-D tensor containing the BPE tokens
# of the current sentence
sentence_lengths[i] = token_ids.numel()
output["sentence_lengths"] = torch.tensor(sentence_lengths, dtype=torch.int32)
torch.save(output, args.lm_archive)
logging.info(f"Saved to {args.lm_archive}")
if __name__ == "__main__":
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO)
main()

View File

@ -0,0 +1,267 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang
# Wei Kang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script takes as input `lang_dir`, which should contain::
- lang_dir/bbpe.model,
- lang_dir/words.txt
and generates the following files in the directory `lang_dir`:
- lexicon.txt
- lexicon_disambig.txt
- L.pt
- L_disambig.pt
- tokens.txt
"""
import argparse
from pathlib import Path
from typing import Dict, List, Tuple
import k2
import sentencepiece as spm
import torch
from prepare_lang import (
Lexicon,
add_disambig_symbols,
add_self_loops,
write_lexicon,
write_mapping,
)
from icefall.byte_utils import byte_encode
from icefall.utils import str2bool, tokenize_by_CJK_char
def lexicon_to_fst_no_sil(
lexicon: Lexicon,
token2id: Dict[str, int],
word2id: Dict[str, int],
need_self_loops: bool = False,
) -> k2.Fsa:
"""Convert a lexicon to an FST (in k2 format).
Args:
lexicon:
The input lexicon. See also :func:`read_lexicon`
token2id:
A dict mapping tokens to IDs.
word2id:
A dict mapping words to IDs.
need_self_loops:
If True, add self-loop to states with non-epsilon output symbols
on at least one arc out of the state. The input label for this
self loop is `token2id["#0"]` and the output label is `word2id["#0"]`.
Returns:
Return an instance of `k2.Fsa` representing the given lexicon.
"""
loop_state = 0 # words enter and leave from here
next_state = 1 # the next un-allocated state, will be incremented as we go
arcs = []
# The blank symbol <blk> is defined in local/train_bpe_model.py
assert token2id["<blk>"] == 0
assert word2id["<eps>"] == 0
eps = 0
for word, pieces in lexicon:
assert len(pieces) > 0, f"{word} has no pronunciations"
cur_state = loop_state
word = word2id[word]
pieces = [token2id[i] for i in pieces]
for i in range(len(pieces) - 1):
w = word if i == 0 else eps
arcs.append([cur_state, next_state, pieces[i], w, 0])
cur_state = next_state
next_state += 1
# now for the last piece of this word
i = len(pieces) - 1
w = word if i == 0 else eps
arcs.append([cur_state, loop_state, pieces[i], w, 0])
if need_self_loops:
disambig_token = token2id["#0"]
disambig_word = word2id["#0"]
arcs = add_self_loops(
arcs,
disambig_token=disambig_token,
disambig_word=disambig_word,
)
final_state = next_state
arcs.append([loop_state, final_state, -1, -1, 0])
arcs.append([final_state])
arcs = sorted(arcs, key=lambda arc: arc[0])
arcs = [[str(i) for i in arc] for arc in arcs]
arcs = [" ".join(arc) for arc in arcs]
arcs = "\n".join(arcs)
fsa = k2.Fsa.from_str(arcs, acceptor=False)
return fsa
def generate_lexicon(
model_file: str, words: List[str], oov: str
) -> Tuple[Lexicon, Dict[str, int]]:
"""Generate a lexicon from a BPE model.
Args:
model_file:
Path to a sentencepiece model.
words:
A list of strings representing words.
oov:
The out of vocabulary word in lexicon.
Returns:
Return a tuple with two elements:
- A lexicon, i.e., a list of ``(word, pieces)`` pairs.
- A dict representing the token symbol table, mapping from tokens to IDs.
"""
sp = spm.SentencePieceProcessor()
sp.load(str(model_file))
# Convert word to word piece IDs instead of word piece strings
# to avoid OOV tokens.
encode_words = [byte_encode(tokenize_by_CJK_char(w)) for w in words]
words_pieces_ids: List[List[int]] = sp.encode(encode_words, out_type=int)
# Now convert word piece IDs back to word piece strings.
words_pieces: List[List[str]] = [sp.id_to_piece(ids) for ids in words_pieces_ids]
lexicon = []
for word, pieces in zip(words, words_pieces):
lexicon.append((word, pieces))
lexicon.append((oov, ["▁", sp.id_to_piece(sp.unk_id())]))
token2id: Dict[str, int] = {sp.id_to_piece(i): i for i in range(sp.vocab_size())}
return lexicon, token2id
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--lang-dir",
type=str,
help="""Input and output directory.
It should contain the bpe.model and words.txt
""",
)
parser.add_argument(
"--oov",
type=str,
default="<UNK>",
help="The out of vocabulary word in lexicon.",
)
parser.add_argument(
"--debug",
type=str2bool,
default=False,
help="""True for debugging, which will generate
a visualization of the lexicon FST.
Caution: If your lexicon contains hundreds of thousands
of lines, please set it to False!
See "test/test_bpe_lexicon.py" for usage.
""",
)
return parser.parse_args()
def main():
args = get_args()
lang_dir = Path(args.lang_dir)
model_file = lang_dir / "bbpe.model"
word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
words = word_sym_table.symbols
excluded = ["<eps>", "!SIL", "<SPOKEN_NOISE>", args.oov, "#0", "<s>", "</s>"]
for w in excluded:
if w in words:
words.remove(w)
lexicon, token_sym_table = generate_lexicon(model_file, words, args.oov)
lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)
next_token_id = max(token_sym_table.values()) + 1
for i in range(max_disambig + 1):
disambig = f"#{i}"
assert disambig not in token_sym_table
token_sym_table[disambig] = next_token_id
next_token_id += 1
word_sym_table.add("#0")
word_sym_table.add("<s>")
word_sym_table.add("</s>")
write_mapping(lang_dir / "tokens.txt", token_sym_table)
write_lexicon(lang_dir / "lexicon.txt", lexicon)
write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig)
L = lexicon_to_fst_no_sil(
lexicon,
token2id=token_sym_table,
word2id=word_sym_table,
)
L_disambig = lexicon_to_fst_no_sil(
lexicon_disambig,
token2id=token_sym_table,
word2id=word_sym_table,
need_self_loops=True,
)
torch.save(L.as_dict(), lang_dir / "L.pt")
torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt")
if args.debug:
labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt")
aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt")
L.labels_sym = labels_sym
L.aux_labels_sym = aux_labels_sym
L.draw(f"{lang_dir / 'L.svg'}", title="L.pt")
L_disambig.labels_sym = labels_sym
L_disambig.aux_labels_sym = aux_labels_sym
L_disambig.draw(f"{lang_dir / 'L_disambig.svg'}", title="L_disambig.pt")
if __name__ == "__main__":
main()

View File

@ -0,0 +1 @@
../../../librispeech/ASR/local/sort_lm_training_data.py

Some files were not shown because too many files have changed in this diff.