mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-12-11 06:55:27 +00:00
Merge branch 'master' of https://github.com/k2-fsa/icefall
This commit is contained in:
commit
f6e68378dc
@ -15,5 +15,5 @@ mkdir -p data
|
||||
cd data
|
||||
[ ! -e fbank ] && ln -s ~/tmp/fbank-libri fbank
|
||||
cd ..
|
||||
./local/compute_fbank_librispeech.py
|
||||
./local/compute_fbank_librispeech.py --dataset 'test-clean test-other'
|
||||
ls -lh data/fbank/
|
||||
|
||||
@ -25,7 +25,6 @@ repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
pushd $repo/exp
|
||||
|
||||
@ -18,7 +18,6 @@ repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
pushd $repo/exp
|
||||
|
||||
@ -1,79 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
set -e
|
||||
|
||||
log() {
|
||||
# This function is from espnet
|
||||
local fname=${BASH_SOURCE[1]##*/}
|
||||
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
|
||||
}
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
|
||||
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
|
||||
|
||||
log "Downloading pre-trained model from $repo_url"
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
pushd $repo
|
||||
git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt"
|
||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
||||
cd exp
|
||||
ln -s pretrained-epoch-30-avg-10-averaged.pt epoch-99.pt
|
||||
popd
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
log "Install ncnn and pnnx"
|
||||
|
||||
# We are using a modified ncnn here. Will try to merge it to the official repo
|
||||
# of ncnn
|
||||
git clone https://github.com/csukuangfj/ncnn
|
||||
pushd ncnn
|
||||
git submodule init
|
||||
git submodule update python/pybind11
|
||||
python3 setup.py bdist_wheel
|
||||
ls -lh dist/
|
||||
pip install dist/*.whl
|
||||
cd tools/pnnx
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -D Python3_EXECUTABLE=/opt/hostedtoolcache/Python/3.8.14/x64/bin/python3 ..
|
||||
make -j4 pnnx
|
||||
|
||||
./src/pnnx || echo "pass"
|
||||
|
||||
popd
|
||||
|
||||
log "Test exporting to pnnx format"
|
||||
|
||||
./conv_emformer_transducer_stateless2/export-for-ncnn.py \
|
||||
--exp-dir $repo/exp \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--use-averaged-model 0 \
|
||||
\
|
||||
--num-encoder-layers 12 \
|
||||
--chunk-length 32 \
|
||||
--cnn-module-kernel 31 \
|
||||
--left-context-length 32 \
|
||||
--right-context-length 8 \
|
||||
--memory-size 32
|
||||
|
||||
./ncnn/tools/pnnx/build/src/pnnx $repo/exp/encoder_jit_trace-pnnx.pt
|
||||
./ncnn/tools/pnnx/build/src/pnnx $repo/exp/decoder_jit_trace-pnnx.pt
|
||||
./ncnn/tools/pnnx/build/src/pnnx $repo/exp/joiner_jit_trace-pnnx.pt
|
||||
|
||||
./conv_emformer_transducer_stateless2/streaming-ncnn-decode.py \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
|
||||
--encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
|
||||
--decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
|
||||
--decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
|
||||
--joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
|
||||
--joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
|
||||
$repo/test_wavs/1089-134686-0001.wav
|
||||
@ -20,7 +20,6 @@ abs_repo=$(realpath $repo)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
pushd $repo/exp
|
||||
@ -28,63 +27,6 @@ ln -s pretrained-iter-468000-avg-16.pt pretrained.pt
|
||||
ln -s pretrained-iter-468000-avg-16.pt epoch-99.pt
|
||||
popd
|
||||
|
||||
log "Install ncnn and pnnx"
|
||||
|
||||
# We are using a modified ncnn here. Will try to merge it to the official repo
|
||||
# of ncnn
|
||||
git clone https://github.com/csukuangfj/ncnn
|
||||
pushd ncnn
|
||||
git submodule init
|
||||
git submodule update python/pybind11
|
||||
python3 setup.py bdist_wheel
|
||||
ls -lh dist/
|
||||
pip install dist/*.whl
|
||||
cd tools/pnnx
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
make -j4 pnnx
|
||||
|
||||
./src/pnnx || echo "pass"
|
||||
|
||||
popd
|
||||
|
||||
log "Test exporting to pnnx format"
|
||||
|
||||
./lstm_transducer_stateless2/export.py \
|
||||
--exp-dir $repo/exp \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--use-averaged-model 0 \
|
||||
--pnnx 1
|
||||
|
||||
./ncnn/tools/pnnx/build/src/pnnx $repo/exp/encoder_jit_trace-pnnx.pt
|
||||
./ncnn/tools/pnnx/build/src/pnnx $repo/exp/decoder_jit_trace-pnnx.pt
|
||||
./ncnn/tools/pnnx/build/src/pnnx $repo/exp/joiner_jit_trace-pnnx.pt
|
||||
|
||||
./lstm_transducer_stateless2/ncnn-decode.py \
|
||||
--bpe-model-filename $repo/data/lang_bpe_500/bpe.model \
|
||||
--encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
|
||||
--encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
|
||||
--decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
|
||||
--decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
|
||||
--joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
|
||||
--joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
|
||||
$repo/test_wavs/1089-134686-0001.wav
|
||||
|
||||
./lstm_transducer_stateless2/streaming-ncnn-decode.py \
|
||||
--bpe-model-filename $repo/data/lang_bpe_500/bpe.model \
|
||||
--encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
|
||||
--encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
|
||||
--decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
|
||||
--decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
|
||||
--joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
|
||||
--joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
|
||||
$repo/test_wavs/1089-134686-0001.wav
|
||||
|
||||
|
||||
|
||||
log "Test exporting with torch.jit.trace()"
|
||||
|
||||
./lstm_transducer_stateless2/export.py \
|
||||
@ -106,47 +48,6 @@ log "Decode with models exported by torch.jit.trace()"
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
||||
log "Test exporting to ONNX"
|
||||
|
||||
./lstm_transducer_stateless2/export.py \
|
||||
--exp-dir $repo/exp \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--use-averaged-model 0 \
|
||||
--onnx 1
|
||||
|
||||
log "Decode with ONNX models "
|
||||
|
||||
./lstm_transducer_stateless2/streaming-onnx-decode.py \
|
||||
--bpe-model-filename $repo/data/lang_bpe_500/bpe.model \
|
||||
--encoder-model-filename $repo//exp/encoder.onnx \
|
||||
--decoder-model-filename $repo/exp/decoder.onnx \
|
||||
--joiner-model-filename $repo/exp/joiner.onnx \
|
||||
--joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
|
||||
--joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
|
||||
$repo/test_wavs/1089-134686-0001.wav
|
||||
|
||||
./lstm_transducer_stateless2/streaming-onnx-decode.py \
|
||||
--bpe-model-filename $repo/data/lang_bpe_500/bpe.model \
|
||||
--encoder-model-filename $repo//exp/encoder.onnx \
|
||||
--decoder-model-filename $repo/exp/decoder.onnx \
|
||||
--joiner-model-filename $repo/exp/joiner.onnx \
|
||||
--joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
|
||||
--joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
|
||||
$repo/test_wavs/1221-135766-0001.wav
|
||||
|
||||
./lstm_transducer_stateless2/streaming-onnx-decode.py \
|
||||
--bpe-model-filename $repo/data/lang_bpe_500/bpe.model \
|
||||
--encoder-model-filename $repo//exp/encoder.onnx \
|
||||
--decoder-model-filename $repo/exp/decoder.onnx \
|
||||
--joiner-model-filename $repo/exp/joiner.onnx \
|
||||
--joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
|
||||
--joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
||||
|
||||
|
||||
for sym in 1 2 3; do
|
||||
log "Greedy search with --max-sym-per-frame $sym"
|
||||
|
||||
|
||||
@ -19,7 +19,6 @@ repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
for sym in 1 2 3; do
|
||||
|
||||
@ -23,7 +23,6 @@ popd
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
pushd $repo/exp
|
||||
|
||||
@ -22,7 +22,6 @@ popd
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
pushd $repo/exp
|
||||
|
||||
@ -19,7 +19,6 @@ repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
pushd $repo/exp
|
||||
@ -27,14 +26,6 @@ ln -s pretrained-iter-1224000-avg-14.pt pretrained.pt
|
||||
ln -s pretrained-iter-1224000-avg-14.pt epoch-99.pt
|
||||
popd
|
||||
|
||||
log "Test exporting to ONNX format"
|
||||
|
||||
./pruned_transducer_stateless3/export.py \
|
||||
--exp-dir $repo/exp \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--onnx 1
|
||||
|
||||
log "Export to torchscript model"
|
||||
./pruned_transducer_stateless3/export.py \
|
||||
@ -51,30 +42,8 @@ log "Export to torchscript model"
|
||||
--avg 1 \
|
||||
--jit-trace 1
|
||||
|
||||
ls -lh $repo/exp/*.onnx
|
||||
ls -lh $repo/exp/*.pt
|
||||
|
||||
log "Decode with ONNX models"
|
||||
|
||||
./pruned_transducer_stateless3/onnx_check.py \
|
||||
--jit-filename $repo/exp/cpu_jit.pt \
|
||||
--onnx-encoder-filename $repo/exp/encoder.onnx \
|
||||
--onnx-decoder-filename $repo/exp/decoder.onnx \
|
||||
--onnx-joiner-filename $repo/exp/joiner.onnx \
|
||||
--onnx-joiner-encoder-proj-filename $repo/exp/joiner_encoder_proj.onnx \
|
||||
--onnx-joiner-decoder-proj-filename $repo/exp/joiner_decoder_proj.onnx
|
||||
|
||||
./pruned_transducer_stateless3/onnx_pretrained.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--encoder-model-filename $repo/exp/encoder.onnx \
|
||||
--decoder-model-filename $repo/exp/decoder.onnx \
|
||||
--joiner-model-filename $repo/exp/joiner.onnx \
|
||||
--joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
|
||||
--joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
||||
log "Decode with models exported by torch.jit.trace()"
|
||||
|
||||
./pruned_transducer_stateless3/jit_pretrained.py \
|
||||
|
||||
@ -19,7 +19,6 @@ repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
pushd $repo/exp
|
||||
|
||||
@ -19,7 +19,6 @@ repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
pushd $repo/exp
|
||||
@ -30,15 +29,6 @@ ln -s pretrained.pt epoch-99.pt
|
||||
ls -lh *.pt
|
||||
popd
|
||||
|
||||
log "Test exporting to ONNX format"
|
||||
./pruned_transducer_stateless7/export.py \
|
||||
--exp-dir $repo/exp \
|
||||
--use-averaged-model false \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--onnx 1
|
||||
|
||||
log "Export to torchscript model"
|
||||
./pruned_transducer_stateless7/export.py \
|
||||
--exp-dir $repo/exp \
|
||||
@ -50,27 +40,6 @@ log "Export to torchscript model"
|
||||
|
||||
ls -lh $repo/exp/*.pt
|
||||
|
||||
log "Decode with ONNX models"
|
||||
|
||||
./pruned_transducer_stateless7/onnx_check.py \
|
||||
--jit-filename $repo/exp/cpu_jit.pt \
|
||||
--onnx-encoder-filename $repo/exp/encoder.onnx \
|
||||
--onnx-decoder-filename $repo/exp/decoder.onnx \
|
||||
--onnx-joiner-filename $repo/exp/joiner.onnx \
|
||||
--onnx-joiner-encoder-proj-filename $repo/exp/joiner_encoder_proj.onnx \
|
||||
--onnx-joiner-decoder-proj-filename $repo/exp/joiner_decoder_proj.onnx
|
||||
|
||||
./pruned_transducer_stateless7/onnx_pretrained.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--encoder-model-filename $repo/exp/encoder.onnx \
|
||||
--decoder-model-filename $repo/exp/decoder.onnx \
|
||||
--joiner-model-filename $repo/exp/joiner.onnx \
|
||||
--joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
|
||||
--joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
||||
log "Decode with models exported by torch.jit.script()"
|
||||
|
||||
./pruned_transducer_stateless7/jit_pretrained.py \
|
||||
|
||||
@ -18,7 +18,6 @@ repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
pushd $repo/exp
|
||||
@ -148,4 +147,4 @@ if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" ==
|
||||
done
|
||||
|
||||
rm pruned_transducer_stateless7_ctc/exp/*.pt
|
||||
fi
|
||||
fi
|
||||
|
||||
@ -10,7 +10,7 @@ log() {
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
|
||||
repo_url=https://huggingface.co/yfyeung/icefall-asr-librispeech-pruned_transducer_stateless7_ctc_bs-2022-12-14
|
||||
repo_url=https://huggingface.co/yfyeung/icefall-asr-librispeech-pruned_transducer_stateless7_ctc_bs-2023-01-29
|
||||
|
||||
log "Downloading pre-trained model from $repo_url"
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||
@ -18,7 +18,6 @@ repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
pushd $repo/exp
|
||||
|
||||
@ -19,16 +19,16 @@ repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
pushd $repo/exp
|
||||
pushd $repo
|
||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
||||
git lfs pull --include "exp/cpu_jit.pt"
|
||||
git lfs pull --include "exp/pretrained.pt"
|
||||
git lfs pull --include "exp/encoder_jit_trace.pt"
|
||||
git lfs pull --include "exp/decoder_jit_trace.pt"
|
||||
git lfs pull --include "exp/joiner_jit_trace.pt"
|
||||
cd exp
|
||||
ln -s pretrained.pt epoch-99.pt
|
||||
ls -lh *.pt
|
||||
popd
|
||||
|
||||
@ -19,7 +19,6 @@ repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
pushd $repo/exp
|
||||
|
||||
@ -19,7 +19,6 @@ repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
pushd $repo/exp
|
||||
|
||||
@ -19,7 +19,6 @@ repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
for sym in 1 2 3; do
|
||||
|
||||
@ -18,7 +18,6 @@ repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
pushd $repo/exp
|
||||
|
||||
@ -19,7 +19,6 @@ repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.flac
|
||||
ls -lh $repo/test_wavs/*.flac
|
||||
|
||||
log "CTC decoding"
|
||||
|
||||
@ -19,7 +19,6 @@ repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
for sym in 1 2 3; do
|
||||
|
||||
@ -19,7 +19,6 @@ repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
for sym in 1 2 3; do
|
||||
|
||||
@ -19,7 +19,6 @@ repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
for sym in 1 2 3; do
|
||||
|
||||
@ -19,7 +19,6 @@ repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
for sym in 1 2 3; do
|
||||
|
||||
@ -19,7 +19,6 @@ repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
for sym in 1 2 3; do
|
||||
|
||||
@ -19,7 +19,6 @@ repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
log "Beam search decoding"
|
||||
|
||||
@ -20,7 +20,6 @@ repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
soxi $repo/test_wavs/*.wav
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
pushd $repo/exp
|
||||
|
||||
234
.github/scripts/test-ncnn-export.sh
vendored
Executable file
234
.github/scripts/test-ncnn-export.sh
vendored
Executable file
@ -0,0 +1,234 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
log() {
|
||||
# This function is from espnet
|
||||
local fname=${BASH_SOURCE[1]##*/}
|
||||
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
|
||||
}
|
||||
|
||||
pushd egs/librispeech/ASR
|
||||
|
||||
log "Install ncnn and pnnx"
|
||||
|
||||
# We are using a modified ncnn here. Will try to merge it to the official repo
|
||||
# of ncnn
|
||||
git clone https://github.com/csukuangfj/ncnn
|
||||
pushd ncnn
|
||||
git submodule init
|
||||
git submodule update python/pybind11
|
||||
python3 setup.py bdist_wheel
|
||||
ls -lh dist/
|
||||
pip install dist/*.whl
|
||||
cd tools/pnnx
|
||||
mkdir build
|
||||
cd build
|
||||
|
||||
echo "which python3"
|
||||
|
||||
which python3
|
||||
#/opt/hostedtoolcache/Python/3.8.16/x64/bin/python3
|
||||
|
||||
cmake -D Python3_EXECUTABLE=$(which python3) ..
|
||||
make -j4 pnnx
|
||||
|
||||
./src/pnnx || echo "pass"
|
||||
|
||||
popd
|
||||
|
||||
export PATH=$PWD/ncnn/tools/pnnx/build/src:$PATH
|
||||
|
||||
log "=========================================================================="
|
||||
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
|
||||
pushd $repo
|
||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
||||
git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt"
|
||||
|
||||
cd exp
|
||||
ln -s pretrained-epoch-30-avg-10-averaged.pt epoch-99.pt
|
||||
popd
|
||||
|
||||
log "Export via torch.jit.trace()"
|
||||
|
||||
./conv_emformer_transducer_stateless2/export-for-ncnn.py \
|
||||
--exp-dir $repo/exp \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--use-averaged-model 0 \
|
||||
\
|
||||
--num-encoder-layers 12 \
|
||||
--chunk-length 32 \
|
||||
--cnn-module-kernel 31 \
|
||||
--left-context-length 32 \
|
||||
--right-context-length 8 \
|
||||
--memory-size 32
|
||||
|
||||
pnnx $repo/exp/encoder_jit_trace-pnnx.pt
|
||||
pnnx $repo/exp/decoder_jit_trace-pnnx.pt
|
||||
pnnx $repo/exp/joiner_jit_trace-pnnx.pt
|
||||
|
||||
python3 ./conv_emformer_transducer_stateless2/streaming-ncnn-decode.py \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
|
||||
--encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
|
||||
--decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
|
||||
--decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
|
||||
--joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
|
||||
--joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
|
||||
$repo/test_wavs/1089-134686-0001.wav
|
||||
|
||||
rm -rf $repo
|
||||
log "--------------------------------------------------------------------------"
|
||||
|
||||
log "=========================================================================="
|
||||
repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
|
||||
pushd $repo
|
||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
||||
git lfs pull --include "exp/pretrained-iter-468000-avg-16.pt"
|
||||
|
||||
cd exp
|
||||
ln -s pretrained-iter-468000-avg-16.pt epoch-99.pt
|
||||
popd
|
||||
|
||||
log "Export via torch.jit.trace()"
|
||||
|
||||
./lstm_transducer_stateless2/export-for-ncnn.py \
|
||||
--exp-dir $repo/exp \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--use-averaged-model 0
|
||||
|
||||
pnnx $repo/exp/encoder_jit_trace-pnnx.pt
|
||||
pnnx $repo/exp/decoder_jit_trace-pnnx.pt
|
||||
pnnx $repo/exp/joiner_jit_trace-pnnx.pt
|
||||
|
||||
python3 ./lstm_transducer_stateless2/streaming-ncnn-decode.py \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
|
||||
--encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
|
||||
--decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
|
||||
--decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
|
||||
--joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
|
||||
--joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
|
||||
$repo/test_wavs/1089-134686-0001.wav
|
||||
|
||||
python3 ./lstm_transducer_stateless2/ncnn-decode.py \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
|
||||
--encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
|
||||
--decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
|
||||
--decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
|
||||
--joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
|
||||
--joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
|
||||
$repo/test_wavs/1089-134686-0001.wav
|
||||
|
||||
rm -rf $repo
|
||||
log "--------------------------------------------------------------------------"
|
||||
|
||||
log "=========================================================================="
|
||||
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
|
||||
pushd $repo
|
||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
||||
git lfs pull --include "exp/pretrained.pt"
|
||||
|
||||
cd exp
|
||||
ln -s pretrained.pt epoch-99.pt
|
||||
popd
|
||||
|
||||
./pruned_transducer_stateless7_streaming/export-for-ncnn.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--exp-dir $repo/exp \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
\
|
||||
--decode-chunk-len 32 \
|
||||
--num-encoder-layers "2,4,3,2,4" \
|
||||
--feedforward-dims "1024,1024,2048,2048,1024" \
|
||||
--nhead "8,8,8,8,8" \
|
||||
--encoder-dims "384,384,384,384,384" \
|
||||
--attention-dims "192,192,192,192,192" \
|
||||
--encoder-unmasked-dims "256,256,256,256,256" \
|
||||
--zipformer-downsampling-factors "1,2,4,8,2" \
|
||||
--cnn-module-kernels "31,31,31,31,31" \
|
||||
--decoder-dim 512 \
|
||||
--joiner-dim 512
|
||||
|
||||
pnnx $repo/exp/encoder_jit_trace-pnnx.pt
|
||||
pnnx $repo/exp/decoder_jit_trace-pnnx.pt
|
||||
pnnx $repo/exp/joiner_jit_trace-pnnx.pt
|
||||
|
||||
python3 ./pruned_transducer_stateless7_streaming/streaming-ncnn-decode.py \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
|
||||
--encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
|
||||
--decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
|
||||
--decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
|
||||
--joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
|
||||
--joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
|
||||
$repo/test_wavs/1089-134686-0001.wav
|
||||
|
||||
rm -rf $repo
|
||||
log "--------------------------------------------------------------------------"
|
||||
|
||||
log "=========================================================================="
|
||||
repo_url=https://huggingface.co/pfluo/k2fsa-zipformer-chinese-english-mixed
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
|
||||
pushd $repo
|
||||
git lfs pull --include "data/lang_char_bpe/L.pt"
|
||||
git lfs pull --include "data/lang_char_bpe/L_disambig.pt"
|
||||
git lfs pull --include "data/lang_char_bpe/Linv.pt"
|
||||
git lfs pull --include "exp/pretrained.pt"
|
||||
|
||||
cd exp
|
||||
ln -s pretrained.pt epoch-99.pt
|
||||
popd
|
||||
|
||||
./pruned_transducer_stateless7_streaming/export-for-ncnn-zh.py \
|
||||
--lang-dir $repo/data/lang_char_bpe \
|
||||
--exp-dir $repo/exp \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--decode-chunk-len 32 \
|
||||
--num-encoder-layers "2,4,3,2,4" \
|
||||
--feedforward-dims "1024,1024,1536,1536,1024" \
|
||||
--nhead "8,8,8,8,8" \
|
||||
--encoder-dims "384,384,384,384,384" \
|
||||
--attention-dims "192,192,192,192,192" \
|
||||
--encoder-unmasked-dims "256,256,256,256,256" \
|
||||
--zipformer-downsampling-factors "1,2,4,8,2" \
|
||||
--cnn-module-kernels "31,31,31,31,31" \
|
||||
--decoder-dim 512 \
|
||||
--joiner-dim 512
|
||||
|
||||
pnnx $repo/exp/encoder_jit_trace-pnnx.pt
|
||||
pnnx $repo/exp/decoder_jit_trace-pnnx.pt
|
||||
pnnx $repo/exp/joiner_jit_trace-pnnx.pt
|
||||
|
||||
python3 ./pruned_transducer_stateless7_streaming/streaming-ncnn-decode.py \
|
||||
--tokens $repo/data/lang_char_bpe/tokens.txt \
|
||||
--encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
|
||||
--encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
|
||||
--decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
|
||||
--decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
|
||||
--joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
|
||||
--joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
|
||||
$repo/test_wavs/0.wav
|
||||
|
||||
rm -rf $repo
|
||||
log "--------------------------------------------------------------------------"
|
||||
351
.github/scripts/test-onnx-export.sh
vendored
Executable file
351
.github/scripts/test-onnx-export.sh
vendored
Executable file
@ -0,0 +1,351 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
log() {
|
||||
# This function is from espnet
|
||||
local fname=${BASH_SOURCE[1]##*/}
|
||||
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
|
||||
}
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
|
||||
|
||||
|
||||
log "=========================================================================="
|
||||
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
|
||||
log "Downloading pre-trained model from $repo_url"
|
||||
git lfs install
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
|
||||
pushd $repo
|
||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
||||
git lfs pull --include "exp/pretrained.pt"
|
||||
cd exp
|
||||
ln -s pretrained.pt epoch-99.pt
|
||||
popd
|
||||
|
||||
log "Export via torch.jit.trace()"
|
||||
|
||||
./pruned_transducer_stateless7_streaming/jit_trace_export.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--decode-chunk-len 32 \
|
||||
--exp-dir $repo/exp/
|
||||
|
||||
log "Test exporting to ONNX format"
|
||||
|
||||
./pruned_transducer_stateless7_streaming/export-onnx.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--decode-chunk-len 32 \
|
||||
--exp-dir $repo/exp/
|
||||
|
||||
ls -lh $repo/exp
|
||||
|
||||
log "Run onnx_check.py"
|
||||
|
||||
./pruned_transducer_stateless7_streaming/onnx_check.py \
|
||||
--jit-encoder-filename $repo/exp/encoder_jit_trace.pt \
|
||||
--jit-decoder-filename $repo/exp/decoder_jit_trace.pt \
|
||||
--jit-joiner-filename $repo/exp/joiner_jit_trace.pt \
|
||||
--onnx-encoder-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
|
||||
--onnx-decoder-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
|
||||
--onnx-joiner-filename $repo/exp/joiner-epoch-99-avg-1.onnx
|
||||
|
||||
log "Run onnx_pretrained.py"
|
||||
|
||||
./pruned_transducer_stateless7_streaming/onnx_pretrained.py \
|
||||
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
|
||||
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
|
||||
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav
|
||||
|
||||
rm -rf $repo
|
||||
log "--------------------------------------------------------------------------"
|
||||
|
||||
log "=========================================================================="
|
||||
repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13
|
||||
log "Downloading pre-trained model from $repo_url"
|
||||
git lfs install
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
|
||||
pushd $repo
|
||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
||||
git lfs pull --include "exp/pretrained-iter-1224000-avg-14.pt"
|
||||
|
||||
cd exp
|
||||
ln -s pretrained-iter-1224000-avg-14.pt epoch-9999.pt
|
||||
popd
|
||||
|
||||
log "Export via torch.jit.script()"
|
||||
|
||||
./pruned_transducer_stateless3/export.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--epoch 9999 \
|
||||
--avg 1 \
|
||||
--exp-dir $repo/exp/ \
|
||||
--jit 1
|
||||
|
||||
log "Test exporting to ONNX format"
|
||||
|
||||
./pruned_transducer_stateless3/export-onnx.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--epoch 9999 \
|
||||
--avg 1 \
|
||||
--exp-dir $repo/exp/
|
||||
|
||||
ls -lh $repo/exp
|
||||
|
||||
log "Run onnx_check.py"
|
||||
|
||||
./pruned_transducer_stateless3/onnx_check.py \
|
||||
--jit-filename $repo/exp/cpu_jit.pt \
|
||||
--onnx-encoder-filename $repo/exp/encoder-epoch-9999-avg-1.onnx \
|
||||
--onnx-decoder-filename $repo/exp/decoder-epoch-9999-avg-1.onnx \
|
||||
--onnx-joiner-filename $repo/exp/joiner-epoch-9999-avg-1.onnx
|
||||
|
||||
log "Run onnx_pretrained.py"
|
||||
|
||||
./pruned_transducer_stateless3/onnx_pretrained.py \
|
||||
--encoder-model-filename $repo/exp/encoder-epoch-9999-avg-1.onnx \
|
||||
--decoder-model-filename $repo/exp/decoder-epoch-9999-avg-1.onnx \
|
||||
--joiner-model-filename $repo/exp/joiner-epoch-9999-avg-1.onnx \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
||||
rm -rf $repo
|
||||
log "--------------------------------------------------------------------------"
|
||||
|
||||
|
||||
log "=========================================================================="
|
||||
repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless5-2022-05-13
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
|
||||
pushd $repo
|
||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
||||
git lfs pull --include "exp/pretrained-epoch-39-avg-7.pt"
|
||||
|
||||
cd exp
|
||||
ln -s pretrained-epoch-39-avg-7.pt epoch-99.pt
|
||||
popd
|
||||
|
||||
log "Export via torch.jit.script()"
|
||||
|
||||
./pruned_transducer_stateless5/export.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--use-averaged-model 0 \
|
||||
--exp-dir $repo/exp \
|
||||
--num-encoder-layers 18 \
|
||||
--dim-feedforward 2048 \
|
||||
--nhead 8 \
|
||||
--encoder-dim 512 \
|
||||
--decoder-dim 512 \
|
||||
--joiner-dim 512 \
|
||||
--jit 1
|
||||
|
||||
log "Test exporting to ONNX format"
|
||||
|
||||
./pruned_transducer_stateless5/export-onnx.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--use-averaged-model 0 \
|
||||
--exp-dir $repo/exp \
|
||||
--num-encoder-layers 18 \
|
||||
--dim-feedforward 2048 \
|
||||
--nhead 8 \
|
||||
--encoder-dim 512 \
|
||||
--decoder-dim 512 \
|
||||
--joiner-dim 512
|
||||
|
||||
ls -lh $repo/exp
|
||||
|
||||
log "Run onnx_check.py"
|
||||
|
||||
./pruned_transducer_stateless5/onnx_check.py \
|
||||
--jit-filename $repo/exp/cpu_jit.pt \
|
||||
--onnx-encoder-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
|
||||
--onnx-decoder-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
|
||||
--onnx-joiner-filename $repo/exp/joiner-epoch-99-avg-1.onnx
|
||||
|
||||
log "Run onnx_pretrained.py"
|
||||
|
||||
./pruned_transducer_stateless5/onnx_pretrained.py \
|
||||
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
|
||||
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
|
||||
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
||||
rm -rf $repo
|
||||
log "--------------------------------------------------------------------------"
|
||||
|
||||
log "=========================================================================="
|
||||
repo_url=
|
||||
|
||||
rm -rf $repo
|
||||
log "--------------------------------------------------------------------------"
|
||||
repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
|
||||
pushd $repo
|
||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
||||
git lfs pull --include "exp/pretrained.pt"
|
||||
|
||||
cd exp
|
||||
ln -s pretrained.pt epoch-99.pt
|
||||
popd
|
||||
|
||||
log "Export via torch.jit.script()"
|
||||
|
||||
./pruned_transducer_stateless7/export.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--exp-dir $repo/exp \
|
||||
--feedforward-dims "1024,1024,2048,2048,1024" \
|
||||
--jit 1
|
||||
|
||||
log "Test exporting to ONNX format"
|
||||
|
||||
./pruned_transducer_stateless7/export-onnx.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--exp-dir $repo/exp \
|
||||
--feedforward-dims "1024,1024,2048,2048,1024"
|
||||
|
||||
ls -lh $repo/exp
|
||||
|
||||
log "Run onnx_check.py"
|
||||
|
||||
./pruned_transducer_stateless7/onnx_check.py \
|
||||
--jit-filename $repo/exp/cpu_jit.pt \
|
||||
--onnx-encoder-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
|
||||
--onnx-decoder-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
|
||||
--onnx-joiner-filename $repo/exp/joiner-epoch-99-avg-1.onnx
|
||||
|
||||
log "Run onnx_pretrained.py"
|
||||
|
||||
./pruned_transducer_stateless7/onnx_pretrained.py \
|
||||
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
|
||||
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
|
||||
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
||||
log "=========================================================================="
|
||||
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
|
||||
pushd $repo
|
||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
||||
git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt"
|
||||
|
||||
cd exp
|
||||
ln -s pretrained-epoch-30-avg-10-averaged.pt epoch-99.pt
|
||||
popd
|
||||
|
||||
log "Test exporting to ONNX format"
|
||||
|
||||
./conv_emformer_transducer_stateless2/export-onnx.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--exp-dir $repo/exp \
|
||||
--num-encoder-layers 12 \
|
||||
--chunk-length 32 \
|
||||
--cnn-module-kernel 31 \
|
||||
--left-context-length 32 \
|
||||
--right-context-length 8 \
|
||||
--memory-size 32
|
||||
|
||||
log "Run onnx_pretrained.py"
|
||||
|
||||
./conv_emformer_transducer_stateless2/onnx_pretrained.py \
|
||||
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
|
||||
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
|
||||
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1221-135766-0001.wav
|
||||
|
||||
rm -rf $repo
|
||||
log "--------------------------------------------------------------------------"
|
||||
|
||||
log "=========================================================================="
|
||||
repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
|
||||
pushd $repo
|
||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
||||
git lfs pull --include "exp/pretrained-iter-468000-avg-16.pt"
|
||||
|
||||
cd exp
|
||||
ln -s pretrained-iter-468000-avg-16.pt epoch-99.pt
|
||||
popd
|
||||
|
||||
log "Export via torch.jit.trace()"
|
||||
|
||||
./lstm_transducer_stateless2/export.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--exp-dir $repo/exp/ \
|
||||
--jit-trace 1
|
||||
|
||||
log "Test exporting to ONNX format"
|
||||
|
||||
./lstm_transducer_stateless2/export-onnx.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--exp-dir $repo/exp
|
||||
|
||||
ls -lh $repo/exp
|
||||
|
||||
log "Run onnx_check.py"
|
||||
|
||||
./lstm_transducer_stateless2/onnx_check.py \
|
||||
--jit-encoder-filename $repo/exp/encoder_jit_trace.pt \
|
||||
--jit-decoder-filename $repo/exp/decoder_jit_trace.pt \
|
||||
--jit-joiner-filename $repo/exp/joiner_jit_trace.pt \
|
||||
--onnx-encoder-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
|
||||
--onnx-decoder-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
|
||||
--onnx-joiner-filename $repo/exp/joiner-epoch-99-avg-1.onnx
|
||||
|
||||
log "Run onnx_pretrained.py"
|
||||
|
||||
./lstm_transducer_stateless2/onnx_pretrained.py \
|
||||
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
|
||||
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
|
||||
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1221-135766-0001.wav
|
||||
|
||||
rm -rf $repo
|
||||
log "--------------------------------------------------------------------------"
|
||||
4
.github/workflows/run-aishell-2022-06-20.yml
vendored
4
.github/workflows/run-aishell-2022-06-20.yml
vendored
@ -65,7 +65,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -87,7 +87,7 @@ jobs:
|
||||
GITHUB_EVENT_NAME: ${{ github.event_name }}
|
||||
GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
|
||||
run: |
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
@ -64,7 +64,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
|
||||
@ -64,7 +64,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -123,7 +123,7 @@ jobs:
|
||||
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
|
||||
ls -lh egs/librispeech/ASR/data/*
|
||||
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
@ -64,7 +64,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -123,7 +123,7 @@ jobs:
|
||||
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
|
||||
ls -lh egs/librispeech/ASR/data/*
|
||||
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
@ -64,7 +64,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -123,7 +123,7 @@ jobs:
|
||||
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
|
||||
ls -lh egs/librispeech/ASR/data/*
|
||||
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
@ -39,7 +39,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
run_librispeech_2022_11_11_zipformer:
|
||||
if: github.event.label.name == 'onnx' || github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
|
||||
if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
@ -64,7 +64,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -123,7 +123,7 @@ jobs:
|
||||
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
|
||||
ls -lh egs/librispeech/ASR/data/*
|
||||
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
@ -64,7 +64,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -123,7 +123,7 @@ jobs:
|
||||
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
|
||||
ls -lh egs/librispeech/ASR/data/*
|
||||
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
@ -60,7 +60,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -119,7 +119,7 @@ jobs:
|
||||
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
|
||||
ls -lh egs/librispeech/ASR/data/*
|
||||
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
@ -64,7 +64,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -123,7 +123,7 @@ jobs:
|
||||
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
|
||||
ls -lh egs/librispeech/ASR/data/*
|
||||
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
@ -35,7 +35,7 @@ on:
|
||||
|
||||
jobs:
|
||||
run_librispeech_2022_12_15_zipformer_ctc_bs:
|
||||
if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event.label.name == 'blank-skip' || github.event_name == 'push' || github.event_name == 'schedule'
|
||||
if: github.event.label.name == 'run-decode' || github.event.label.name == 'blank-skip' || github.event_name == 'push' || github.event_name == 'schedule'
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
@ -60,7 +60,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -119,7 +119,7 @@ jobs:
|
||||
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
|
||||
ls -lh egs/librispeech/ASR/data/*
|
||||
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
@ -64,7 +64,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -123,7 +123,7 @@ jobs:
|
||||
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
|
||||
ls -lh egs/librispeech/ASR/data/*
|
||||
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
@ -64,7 +64,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -123,7 +123,7 @@ jobs:
|
||||
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
|
||||
ls -lh egs/librispeech/ASR/data/*
|
||||
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
@ -22,7 +22,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
run_librispeech_lstm_transducer_stateless2_2022_09_03:
|
||||
if: github.event.label.name == 'ready' || github.event.label.name == 'LODR' || github.event.label.name == 'shallow-fusion' || github.event.label.name == 'ncnn' || github.event.label.name == 'onnx' || github.event_name == 'push' || github.event_name == 'schedule'
|
||||
if: github.event.label.name == 'ready' || github.event.label.name == 'LODR' || github.event.label.name == 'shallow-fusion' || github.event_name == 'push' || github.event_name == 'schedule'
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
@ -47,7 +47,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -106,7 +106,7 @@ jobs:
|
||||
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
|
||||
ls -lh egs/librispeech/ASR/data/*
|
||||
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
@ -39,7 +39,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
run_librispeech_pruned_transducer_stateless3_2022_05_13:
|
||||
if: github.event.label.name == 'onnx' || github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
|
||||
if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
@ -64,7 +64,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -123,7 +123,7 @@ jobs:
|
||||
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
|
||||
ls -lh egs/librispeech/ASR/data/*
|
||||
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
@ -64,7 +64,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -123,7 +123,7 @@ jobs:
|
||||
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
|
||||
ls -lh egs/librispeech/ASR/data/*
|
||||
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
@ -64,7 +64,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -123,7 +123,7 @@ jobs:
|
||||
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
|
||||
ls -lh egs/librispeech/ASR/data/*
|
||||
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
@ -54,7 +54,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -73,7 +73,7 @@ jobs:
|
||||
- name: Inference with pre-trained model
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
@ -63,7 +63,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -122,7 +122,7 @@ jobs:
|
||||
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
|
||||
ls -lh egs/librispeech/ASR/data/*
|
||||
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
@ -63,7 +63,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -122,7 +122,7 @@ jobs:
|
||||
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
|
||||
ls -lh egs/librispeech/ASR/data/*
|
||||
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
@ -54,7 +54,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -73,7 +73,7 @@ jobs:
|
||||
- name: Inference with pre-trained model
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
@ -54,7 +54,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -73,7 +73,7 @@ jobs:
|
||||
- name: Inference with pre-trained model
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
@ -63,7 +63,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -122,7 +122,7 @@ jobs:
|
||||
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
|
||||
ls -lh egs/librispeech/ASR/data/*
|
||||
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
@ -54,7 +54,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -73,7 +73,7 @@ jobs:
|
||||
- name: Inference with pre-trained model
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
2
.github/workflows/run-ptb-rnn-lm.yml
vendored
2
.github/workflows/run-ptb-rnn-lm.yml
vendored
@ -47,7 +47,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | grep -v kaldifst | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Prepare data
|
||||
shell: bash
|
||||
|
||||
@ -54,7 +54,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -76,7 +76,7 @@ jobs:
|
||||
GITHUB_EVENT_NAME: ${{ github.event_name }}
|
||||
GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
|
||||
run: |
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
2
.github/workflows/run-yesno-recipe.yml
vendored
2
.github/workflows/run-yesno-recipe.yml
vendored
@ -67,7 +67,7 @@ jobs:
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | grep -v kaldifst | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Run yesno recipe
|
||||
shell: bash
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
name: run-librispeech-conv-emformer-transducer-stateless2-2022-12-05
|
||||
name: test-ncnn-export
|
||||
|
||||
on:
|
||||
push:
|
||||
@ -16,15 +16,18 @@ on:
|
||||
# nightly build at 15:50 UTC time every day
|
||||
- cron: "50 15 * * *"
|
||||
|
||||
concurrency:
|
||||
group: test_ncnn_export-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
run_librispeech_conv_emformer_transducer_stateless2_2022_12_05:
|
||||
test_ncnn_export:
|
||||
if: github.event.label.name == 'ready' || github.event.label.name == 'ncnn' || github.event_name == 'push' || github.event_name == 'schedule'
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
python-version: [3.8]
|
||||
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
@ -41,9 +44,9 @@ jobs:
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | grep -v kaldifst | xargs -n 1 -L 1 pip install
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -59,19 +62,14 @@ jobs:
|
||||
run: |
|
||||
.github/scripts/install-kaldifeat.sh
|
||||
|
||||
- name: Inference with pre-trained model
|
||||
- name: Test ncnn export
|
||||
shell: bash
|
||||
env:
|
||||
GITHUB_EVENT_NAME: ${{ github.event_name }}
|
||||
GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
|
||||
run: |
|
||||
mkdir -p egs/librispeech/ASR/data
|
||||
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
|
||||
ls -lh egs/librispeech/ASR/data/*
|
||||
|
||||
sudo apt-get -qq install git-lfs tree sox
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
.github/scripts/run-librispeech-conv-emformer-transducer-stateless2-2022-12-05.sh
|
||||
.github/scripts/test-ncnn-export.sh
|
||||
75
.github/workflows/test-onnx-export.yml
vendored
Normal file
75
.github/workflows/test-onnx-export.yml
vendored
Normal file
@ -0,0 +1,75 @@
|
||||
name: test-onnx-export
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
pull_request:
|
||||
types: [labeled]
|
||||
|
||||
schedule:
|
||||
# minute (0-59)
|
||||
# hour (0-23)
|
||||
# day of the month (1-31)
|
||||
# month (1-12)
|
||||
# day of the week (0-6)
|
||||
# nightly build at 15:50 UTC time every day
|
||||
- cron: "50 15 * * *"
|
||||
|
||||
concurrency:
|
||||
group: test_onnx_export-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
test_onnx_export:
|
||||
if: github.event.label.name == 'ready' || github.event.label.name == 'onnx' || github.event_name == 'push' || github.event_name == 'schedule'
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
python-version: [3.8]
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Setup Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
cache: 'pip'
|
||||
cache-dependency-path: '**/requirements-ci.txt'
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: |
|
||||
~/tmp/kaldifeat
|
||||
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
|
||||
|
||||
- name: Install kaldifeat
|
||||
if: steps.my-cache.outputs.cache-hit != 'true'
|
||||
shell: bash
|
||||
run: |
|
||||
.github/scripts/install-kaldifeat.sh
|
||||
|
||||
- name: Test ONNX export
|
||||
shell: bash
|
||||
env:
|
||||
GITHUB_EVENT_NAME: ${{ github.event_name }}
|
||||
GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
|
||||
run: |
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
.github/scripts/test-onnx-export.sh
|
||||
4
.github/workflows/test.yml
vendored
4
.github/workflows/test.yml
vendored
@ -56,7 +56,7 @@ jobs:
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -q -y libsndfile1-dev libsndfile1 ffmpeg
|
||||
sudo apt install -q -y --fix-missing sox libsox-dev libsox-fmt-all
|
||||
sudo apt install -q -y --fix-missing libsox-dev libsox-fmt-all
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
@ -70,7 +70,7 @@ jobs:
|
||||
pip install git+https://github.com/lhotse-speech/lhotse
|
||||
# icefall requirements
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
pip install kaldifst
|
||||
pip install onnxruntime
|
||||
|
||||
9
LICENSE
9
LICENSE
@ -1,13 +1,4 @@
|
||||
|
||||
Legal Notices
|
||||
|
||||
NOTE (this is not from the Apache License): The copyright model is that
|
||||
authors (or their employers, if noted in individual files) own their
|
||||
individual contributions. The authors' contributions can be discerned
|
||||
from the git history.
|
||||
|
||||
-------------------------------------------------------------------------
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
@ -81,9 +81,12 @@ todo_include_todos = True
|
||||
|
||||
rst_epilog = """
|
||||
.. _sherpa-ncnn: https://github.com/k2-fsa/sherpa-ncnn
|
||||
.. _sherpa-onnx: https://github.com/k2-fsa/sherpa-onnx
|
||||
.. _icefall: https://github.com/k2-fsa/icefall
|
||||
.. _git-lfs: https://git-lfs.com/
|
||||
.. _ncnn: https://github.com/tencent/ncnn
|
||||
.. _LibriSpeech: https://www.openslr.org/12
|
||||
.. _musan: http://www.openslr.org/17/
|
||||
.. _ONNX: https://github.com/onnx/onnx
|
||||
.. _onnxruntime: https://github.com/microsoft/onnxruntime
|
||||
"""
|
||||
|
||||
@ -0,0 +1,18 @@
|
||||
2023-02-17 11:22:42,862 INFO [export-for-ncnn.py:222] device: cpu
|
||||
2023-02-17 11:22:42,865 INFO [export-for-ncnn.py:231] {'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 50, 'reset_interval': 200, 'valid_interval': 3000, 'feature_dim': 80, 'subsampling_factor': 4, 'dim_feedforward': 2048, 'decoder_dim': 512, 'joiner_dim': 512, 'is_pnnx': False, 'model_warm_step': 3000, 'env_info': {'k2-version': '1.23.4', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '62e404dd3f3a811d73e424199b3408e309c06e1a', 'k2-git-date': 'Mon Jan 30 10:26:16 2023', 'lhotse-version': '1.12.0.dev+missing.version.file', 'torch-version': '1.10.0+cu102', 'torch-cuda-available': False, 'torch-cuda-version': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'master', 'icefall-git-sha1': '6d7a559-dirty', 'icefall-git-date': 'Thu Feb 16 19:47:54 2023', 'icefall-path': '/star-fj/fangjun/open-source/icefall-2', 'k2-path': '/star-fj/fangjun/open-source/k2/k2/python/k2/__init__.py', 'lhotse-path': '/star-fj/fangjun/open-source/lhotse/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-3-1220120619-7695ff496b-s9n4w', 'IP address': '10.177.6.147'}, 'epoch': 99, 'iter': 0, 'avg': 1, 'exp_dir': PosixPath('icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp'), 'bpe_model': './icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/data/lang_bpe_500/bpe.model', 'context_size': 2, 'use_averaged_model': False, 'num_encoder_layers': 12, 'encoder_dim': 512, 'rnn_hidden_size': 1024, 'aux_layer_period': 0, 'blank_id': 0, 'vocab_size': 500}
|
||||
2023-02-17 11:22:42,865 INFO [export-for-ncnn.py:235] About to create model
|
||||
2023-02-17 11:22:43,239 INFO [train.py:472] Disable giga
|
||||
2023-02-17 11:22:43,249 INFO [checkpoint.py:112] Loading checkpoint from icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/epoch-99.pt
|
||||
2023-02-17 11:22:44,595 INFO [export-for-ncnn.py:324] encoder parameters: 83137520
|
||||
2023-02-17 11:22:44,596 INFO [export-for-ncnn.py:325] decoder parameters: 257024
|
||||
2023-02-17 11:22:44,596 INFO [export-for-ncnn.py:326] joiner parameters: 781812
|
||||
2023-02-17 11:22:44,596 INFO [export-for-ncnn.py:327] total parameters: 84176356
|
||||
2023-02-17 11:22:44,596 INFO [export-for-ncnn.py:329] Using torch.jit.trace()
|
||||
2023-02-17 11:22:44,596 INFO [export-for-ncnn.py:331] Exporting encoder
|
||||
2023-02-17 11:22:48,182 INFO [export-for-ncnn.py:158] Saved to icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.pt
|
||||
2023-02-17 11:22:48,183 INFO [export-for-ncnn.py:335] Exporting decoder
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/lstm_transducer_stateless2/decoder.py:101: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
need_pad = bool(need_pad)
|
||||
2023-02-17 11:22:48,259 INFO [export-for-ncnn.py:180] Saved to icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.pt
|
||||
2023-02-17 11:22:48,259 INFO [export-for-ncnn.py:339] Exporting joiner
|
||||
2023-02-17 11:22:48,304 INFO [export-for-ncnn.py:207] Saved to icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.pt
|
||||
@ -0,0 +1,74 @@
|
||||
2023-02-27 20:23:07,473 INFO [export-for-ncnn.py:246] device: cpu
|
||||
2023-02-27 20:23:07,477 INFO [export-for-ncnn.py:255] {'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 50, 'reset_interval': 200, 'valid_interval': 3000, 'feature_dim': 80, 'subsampling_factor': 4, 'warm_step': 2000, 'env_info': {'k2-version': '1.23.4', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '62e404dd3f3a811d73e424199b3408e309c06e1a', 'k2-git-date': 'Mon Jan 30 10:26:16 2023', 'lhotse-version': '1.12.0.dev+missing.version.file', 'torch-version': '1.10.0+cu102', 'torch-cuda-available': True, 'torch-cuda-version': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'master', 'icefall-git-sha1': '6d7a559-clean', 'icefall-git-date': 'Thu Feb 16 19:47:54 2023', 'icefall-path': '/star-fj/fangjun/open-source/icefall-2', 'k2-path': '/star-fj/fangjun/open-source/k2/k2/python/k2/__init__.py', 'lhotse-path': '/star-fj/fangjun/open-source/lhotse/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-3-1220120619-7695ff496b-s9n4w', 'IP address': '10.177.6.147'}, 'epoch': 99, 'iter': 0, 'avg': 1, 'exp_dir': PosixPath('icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp'), 'bpe_model': './icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model', 'context_size': 2, 'use_averaged_model': False, 'num_encoder_layers': '2,4,3,2,4', 'feedforward_dims': '1024,1024,2048,2048,1024', 'nhead': '8,8,8,8,8', 'encoder_dims': '384,384,384,384,384', 'attention_dims': '192,192,192,192,192', 'encoder_unmasked_dims': '256,256,256,256,256', 'zipformer_downsampling_factors': '1,2,4,8,2', 'cnn_module_kernels': '31,31,31,31,31', 'decoder_dim': 512, 'joiner_dim': 512, 'short_chunk_size': 50, 'num_left_chunks': 4, 'decode_chunk_len': 32, 'blank_id': 0, 'vocab_size': 500}
|
||||
2023-02-27 20:23:07,477 INFO [export-for-ncnn.py:257] About to create model
|
||||
2023-02-27 20:23:08,023 INFO [zipformer2.py:419] At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
|
||||
2023-02-27 20:23:08,037 INFO [checkpoint.py:112] Loading checkpoint from icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/epoch-99.pt
|
||||
2023-02-27 20:23:08,655 INFO [export-for-ncnn.py:346] encoder parameters: 68944004
|
||||
2023-02-27 20:23:08,655 INFO [export-for-ncnn.py:347] decoder parameters: 260096
|
||||
2023-02-27 20:23:08,655 INFO [export-for-ncnn.py:348] joiner parameters: 716276
|
||||
2023-02-27 20:23:08,656 INFO [export-for-ncnn.py:349] total parameters: 69920376
|
||||
2023-02-27 20:23:08,656 INFO [export-for-ncnn.py:351] Using torch.jit.trace()
|
||||
2023-02-27 20:23:08,656 INFO [export-for-ncnn.py:353] Exporting encoder
|
||||
2023-02-27 20:23:08,656 INFO [export-for-ncnn.py:174] decode_chunk_len: 32
|
||||
2023-02-27 20:23:08,656 INFO [export-for-ncnn.py:175] T: 39
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1344: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert cached_len.size(0) == self.num_layers, (
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1348: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert cached_avg.size(0) == self.num_layers, (
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1352: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert cached_key.size(0) == self.num_layers, (
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1356: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert cached_val.size(0) == self.num_layers, (
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1360: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert cached_val2.size(0) == self.num_layers, (
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1364: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert cached_conv1.size(0) == self.num_layers, (
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1368: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert cached_conv2.size(0) == self.num_layers, (
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1373: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert self.left_context_len == cached_key.shape[1], (
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1884: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert self.x_size == x.size(0), (self.x_size, x.size(0))
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:2442: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert cached_key.shape[0] == self.left_context_len, (
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:2449: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert cached_key.shape[0] == cached_val.shape[0], (
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:2469: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert cached_key.shape[0] == left_context_len, (
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:2473: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert cached_val.shape[0] == left_context_len, (
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:2483: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert kv_len == k.shape[0], (kv_len, k.shape)
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:2570: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert list(attn_output.size()) == [bsz * num_heads, seq_len, head_dim // 2]
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:2926: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert cache.shape == (x.size(0), x.size(1), self.lorder), (
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:2652: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert x.shape[0] == self.x_size, (x.shape[0], self.x_size)
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:2653: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert x.shape[2] == self.embed_dim, (x.shape[2], self.embed_dim)
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:2666: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert cached_val.shape[0] == self.left_context_len, (
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1543: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert src.shape[0] == self.in_x_size, (src.shape[0], self.in_x_size)
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1637: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert src.shape[0] == self.in_x_size, (
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1643: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert src.shape[2] == self.in_channels, (src.shape[2], self.in_channels)
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1571: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
if src.shape[0] != self.in_x_size:
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1763: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert src1.shape[:-1] == src2.shape[:-1], (src1.shape, src2.shape)
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1779: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert src1.shape[-1] == self.dim1, (src1.shape[-1], self.dim1)
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer2.py:1780: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert src2.shape[-1] == self.dim2, (src2.shape[-1], self.dim2)
|
||||
/star-fj/fangjun/py38/lib/python3.8/site-packages/torch/jit/_trace.py:958: TracerWarning: Encountering a list at the output of the tracer might cause the trace to be incorrect, this is only valid if the container structure does not change based on the module's inputs. Consider using a constant container instead (e.g. for `list`, use a `tuple` instead. for `dict`, use a `NamedTuple` instead). If you absolutely need this and know the side effects, pass strict=False to trace() to allow this behavior.
|
||||
module._c._create_method_from_trace(
|
||||
2023-02-27 20:23:19,640 INFO [export-for-ncnn.py:182] Saved to icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/encoder_jit_trace-pnnx.pt
|
||||
2023-02-27 20:23:19,646 INFO [export-for-ncnn.py:357] Exporting decoder
|
||||
/star-fj/fangjun/open-source/icefall-2/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/decoder.py:102: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
|
||||
assert embedding_out.size(-1) == self.context_size
|
||||
2023-02-27 20:23:19,686 INFO [export-for-ncnn.py:204] Saved to icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/decoder_jit_trace-pnnx.pt
|
||||
2023-02-27 20:23:19,686 INFO [export-for-ncnn.py:361] Exporting joiner
|
||||
2023-02-27 20:23:19,735 INFO [export-for-ncnn.py:231] Saved to icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/joiner_jit_trace-pnnx.pt
|
||||
@ -0,0 +1,44 @@
|
||||
Don't Use GPU. has_gpu: 0, config.use_vulkan_compute: 1
|
||||
num encoder conv layers: 28
|
||||
num joiner conv layers: 3
|
||||
num files: 3
|
||||
Processing ../test_wavs/1089-134686-0001.wav
|
||||
Processing ../test_wavs/1221-135766-0001.wav
|
||||
Processing ../test_wavs/1221-135766-0002.wav
|
||||
Processing ../test_wavs/1089-134686-0001.wav
|
||||
Processing ../test_wavs/1221-135766-0001.wav
|
||||
Processing ../test_wavs/1221-135766-0002.wav
|
||||
----------encoder----------
|
||||
conv_15 : max = 15.942385 threshold = 15.930708 scale = 7.972025
|
||||
conv_16 : max = 44.978855 threshold = 17.031788 scale = 7.456645
|
||||
conv_17 : max = 17.868437 threshold = 7.830528 scale = 16.218575
|
||||
linear_18 : max = 3.107259 threshold = 1.194808 scale = 106.293236
|
||||
linear_19 : max = 6.193777 threshold = 4.634748 scale = 27.401705
|
||||
linear_20 : max = 9.259933 threshold = 2.606617 scale = 48.722160
|
||||
linear_21 : max = 5.186600 threshold = 4.790260 scale = 26.512129
|
||||
linear_22 : max = 9.759041 threshold = 2.265832 scale = 56.050053
|
||||
linear_23 : max = 3.931209 threshold = 3.099090 scale = 40.979767
|
||||
linear_24 : max = 10.324160 threshold = 2.215561 scale = 57.321835
|
||||
linear_25 : max = 3.800708 threshold = 3.599352 scale = 35.284134
|
||||
linear_26 : max = 10.492444 threshold = 3.153369 scale = 40.274391
|
||||
linear_27 : max = 3.660161 threshold = 2.720994 scale = 46.674126
|
||||
linear_28 : max = 9.415265 threshold = 3.174434 scale = 40.007133
|
||||
linear_29 : max = 4.038418 threshold = 3.118534 scale = 40.724262
|
||||
linear_30 : max = 10.072084 threshold = 3.936867 scale = 32.259155
|
||||
linear_31 : max = 4.342712 threshold = 3.599489 scale = 35.282787
|
||||
linear_32 : max = 11.340535 threshold = 3.120308 scale = 40.701103
|
||||
linear_33 : max = 3.846987 threshold = 3.630030 scale = 34.985939
|
||||
linear_34 : max = 10.686298 threshold = 2.204571 scale = 57.607586
|
||||
linear_35 : max = 4.904821 threshold = 4.575518 scale = 27.756420
|
||||
linear_36 : max = 11.806659 threshold = 2.585589 scale = 49.118401
|
||||
linear_37 : max = 6.402340 threshold = 5.047157 scale = 25.162680
|
||||
linear_38 : max = 11.174589 threshold = 1.923361 scale = 66.030258
|
||||
linear_39 : max = 16.178576 threshold = 7.556058 scale = 16.807705
|
||||
linear_40 : max = 12.901954 threshold = 5.301267 scale = 23.956539
|
||||
linear_41 : max = 14.839805 threshold = 7.597429 scale = 16.716181
|
||||
linear_42 : max = 10.178945 threshold = 2.651595 scale = 47.895699
|
||||
----------joiner----------
|
||||
linear_2 : max = 24.829245 threshold = 16.627592 scale = 7.637907
|
||||
linear_1 : max = 10.746186 threshold = 5.255032 scale = 24.167313
|
||||
linear_3 : max = 1.000000 threshold = 0.999756 scale = 127.031013
|
||||
ncnn int8 calibration table create success, best wish for your int8 inference has a low accuracy loss...\(^0^)/...233...
|
||||
@ -0,0 +1,6 @@
|
||||
2023-02-17 11:37:30,861 INFO [streaming-ncnn-decode.py:255] {'tokens': './icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/data/lang_bpe_500/tokens.txt', 'encoder_param_filename': './icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.ncnn.param', 'encoder_bin_filename': './icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.ncnn.bin', 'decoder_param_filename': './icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.ncnn.param', 'decoder_bin_filename': './icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.ncnn.bin', 'joiner_param_filename': './icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.ncnn.param', 'joiner_bin_filename': './icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.ncnn.bin', 'sound_filename': './icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/test_wavs/1089-134686-0001.wav'}
|
||||
2023-02-17 11:37:31,425 INFO [streaming-ncnn-decode.py:263] Constructing Fbank computer
|
||||
2023-02-17 11:37:31,427 INFO [streaming-ncnn-decode.py:266] Reading sound files: ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/test_wavs/1089-134686-0001.wav
|
||||
2023-02-17 11:37:31,431 INFO [streaming-ncnn-decode.py:271] torch.Size([106000])
|
||||
2023-02-17 11:37:34,115 INFO [streaming-ncnn-decode.py:342] ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/test_wavs/1089-134686-0001.wav
|
||||
2023-02-17 11:37:34,115 INFO [streaming-ncnn-decode.py:343] AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS
|
||||
@ -0,0 +1,7 @@
|
||||
2023-02-27 20:43:40,283 INFO [streaming-ncnn-decode.py:349] {'tokens': './icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/tokens.txt', 'encoder_param_filename': './icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/encoder_jit_trace-pnnx.ncnn.param', 'encoder_bin_filename': './icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/encoder_jit_trace-pnnx.ncnn.bin', 'decoder_param_filename': './icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/decoder_jit_trace-pnnx.ncnn.param', 'decoder_bin_filename': './icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/decoder_jit_trace-pnnx.ncnn.bin', 'joiner_param_filename': './icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/joiner_jit_trace-pnnx.ncnn.param', 'joiner_bin_filename': './icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/joiner_jit_trace-pnnx.ncnn.bin', 'sound_filename': './icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1089-134686-0001.wav'}
|
||||
2023-02-27 20:43:41,260 INFO [streaming-ncnn-decode.py:357] Constructing Fbank computer
|
||||
2023-02-27 20:43:41,264 INFO [streaming-ncnn-decode.py:360] Reading sound files: ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1089-134686-0001.wav
|
||||
2023-02-27 20:43:41,269 INFO [streaming-ncnn-decode.py:365] torch.Size([106000])
|
||||
2023-02-27 20:43:41,280 INFO [streaming-ncnn-decode.py:372] number of states: 35
|
||||
2023-02-27 20:43:45,026 INFO [streaming-ncnn-decode.py:410] ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1089-134686-0001.wav
|
||||
2023-02-27 20:43:45,026 INFO [streaming-ncnn-decode.py:411] AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS
|
||||
753
docs/source/model-export/export-ncnn-conv-emformer.rst
Normal file
753
docs/source/model-export/export-ncnn-conv-emformer.rst
Normal file
@ -0,0 +1,753 @@
|
||||
.. _export_conv_emformer_transducer_models_to_ncnn:
|
||||
|
||||
Export ConvEmformer transducer models to ncnn
|
||||
=============================================
|
||||
|
||||
We use the pre-trained model from the following repository as an example:
|
||||
|
||||
- `<https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>`_
|
||||
|
||||
We will show you step by step how to export it to `ncnn`_ and run it with `sherpa-ncnn`_.
|
||||
|
||||
.. hint::
|
||||
|
||||
We use ``Ubuntu 18.04``, ``torch 1.13``, and ``Python 3.8`` for testing.
|
||||
|
||||
.. caution::
|
||||
|
||||
Please use a more recent version of PyTorch. For instance, ``torch 1.8``
|
||||
may ``not`` work.
|
||||
|
||||
1. Download the pre-trained model
|
||||
---------------------------------
|
||||
|
||||
.. hint::
|
||||
|
||||
You can also refer to `<https://k2-fsa.github.io/sherpa/cpp/pretrained_models/online_transducer.html#icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>`_ to download the pre-trained model.
|
||||
|
||||
You have to install `git-lfs`_ before you continue.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
|
||||
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
|
||||
|
||||
git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt"
|
||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
||||
|
||||
cd ..
|
||||
|
||||
.. note::
|
||||
|
||||
We downloaded ``exp/pretrained-xxx.pt``, not ``exp/cpu-jit_xxx.pt``.
|
||||
|
||||
|
||||
In the above code, we downloaded the pre-trained model into the directory
|
||||
``egs/librispeech/ASR/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05``.
|
||||
|
||||
.. _export_for_ncnn_install_ncnn_and_pnnx:
|
||||
|
||||
2. Install ncnn and pnnx
|
||||
------------------------
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# We put ncnn into $HOME/open-source/ncnn
|
||||
# You can change it to anywhere you like
|
||||
|
||||
cd $HOME
|
||||
mkdir -p open-source
|
||||
cd open-source
|
||||
|
||||
git clone https://github.com/csukuangfj/ncnn
|
||||
cd ncnn
|
||||
git submodule update --recursive --init
|
||||
|
||||
# Note: We don't use "python setup.py install" or "pip install ." here
|
||||
|
||||
mkdir -p build-wheel
|
||||
cd build-wheel
|
||||
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DNCNN_PYTHON=ON \
|
||||
-DNCNN_BUILD_BENCHMARK=OFF \
|
||||
-DNCNN_BUILD_EXAMPLES=OFF \
|
||||
-DNCNN_BUILD_TOOLS=ON \
|
||||
..
|
||||
|
||||
make -j4
|
||||
|
||||
cd ..
|
||||
|
||||
# Note: $PWD here is $HOME/open-source/ncnn
|
||||
|
||||
export PYTHONPATH=$PWD/python:$PYTHONPATH
|
||||
export PATH=$PWD/tools/pnnx/build/src:$PATH
|
||||
export PATH=$PWD/build-wheel/tools/quantize:$PATH
|
||||
|
||||
# Now build pnnx
|
||||
cd tools/pnnx
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
make -j4
|
||||
|
||||
./src/pnnx
|
||||
|
||||
Congratulations! You have successfully installed the following components:
|
||||
|
||||
- ``pnnx``, which is an executable located in
|
||||
``$HOME/open-source/ncnn/tools/pnnx/build/src``. We will use
|
||||
it to convert models exported by ``torch.jit.trace()``.
|
||||
- ``ncnn2int8``, which is an executable located in
|
||||
``$HOME/open-source/ncnn/build-wheel/tools/quantize``. We will use
|
||||
it to quantize our models to ``int8``.
|
||||
- ``ncnn.cpython-38-x86_64-linux-gnu.so``, which is a Python module located
|
||||
in ``$HOME/open-source/ncnn/python/ncnn``.
|
||||
|
||||
.. note::
|
||||
|
||||
I am using ``Python 3.8``, so it
|
||||
is ``ncnn.cpython-38-x86_64-linux-gnu.so``. If you use a different
|
||||
version, say, ``Python 3.9``, the name would be
|
||||
``ncnn.cpython-39-x86_64-linux-gnu.so``.
|
||||
|
||||
Also, if you are not using Linux, the file name would also be different.
|
||||
But that does not matter. As long as you can compile it, it should work.
|
||||
|
||||
We have set up ``PYTHONPATH`` so that you can use ``import ncnn`` in your
|
||||
Python code. We have also set up ``PATH`` so that you can use
|
||||
``pnnx`` and ``ncnn2int8`` later in your terminal.
|
||||
|
||||
.. caution::
|
||||
|
||||
Please don't use `<https://github.com/tencent/ncnn>`_.
|
||||
We have made some modifications to the official `ncnn`_.
|
||||
|
||||
We will synchronize `<https://github.com/csukuangfj/ncnn>`_ periodically
|
||||
with the official one.
|
||||
|
||||
3. Export the model via torch.jit.trace()
|
||||
-----------------------------------------
|
||||
|
||||
First, let us rename our pre-trained model:
|
||||
|
||||
.. code-block::
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
|
||||
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp
|
||||
|
||||
ln -s pretrained-epoch-30-avg-10-averaged.pt epoch-30.pt
|
||||
|
||||
cd ../..
|
||||
|
||||
Next, we use the following code to export our model:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
dir=./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/
|
||||
|
||||
./conv_emformer_transducer_stateless2/export-for-ncnn.py \
|
||||
--exp-dir $dir/exp \
|
||||
--bpe-model $dir/data/lang_bpe_500/bpe.model \
|
||||
--epoch 30 \
|
||||
--avg 1 \
|
||||
--use-averaged-model 0 \
|
||||
\
|
||||
--num-encoder-layers 12 \
|
||||
--chunk-length 32 \
|
||||
--cnn-module-kernel 31 \
|
||||
--left-context-length 32 \
|
||||
--right-context-length 8 \
|
||||
--memory-size 32 \
|
||||
--encoder-dim 512
|
||||
|
||||
.. caution::
|
||||
|
||||
If your model has different configuration parameters, please change them accordingly.
|
||||
|
||||
.. hint::
|
||||
|
||||
We have renamed our model to ``epoch-30.pt`` so that we can use ``--epoch 30``.
|
||||
There is only one pre-trained model, so we use ``--avg 1 --use-averaged-model 0``.
|
||||
|
||||
If you have trained a model by yourself and if you have all checkpoints
|
||||
available, please first use ``decode.py`` to tune ``--epoch --avg``
|
||||
and select the best combination with ``--use-averaged-model 1``.
|
||||
|
||||
.. note::
|
||||
|
||||
You will see the following log output:
|
||||
|
||||
.. literalinclude:: ./code/export-conv-emformer-transducer-for-ncnn-output.txt
|
||||
|
||||
The log shows the model has ``75490012`` parameters, i.e., ``~75 M``.
|
||||
|
||||
.. code-block::
|
||||
|
||||
ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/pretrained-epoch-30-avg-10-averaged.pt
|
||||
|
||||
-rw-r--r-- 1 kuangfangjun root 289M Jan 11 12:05 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/pretrained-epoch-30-avg-10-averaged.pt
|
||||
|
||||
You can see that the file size of the pre-trained model is ``289 MB``, which
|
||||
is roughly equal to ``75490012*4/1024/1024 = 287.97 MB``.
|
||||
|
||||
After running ``conv_emformer_transducer_stateless2/export-for-ncnn.py``,
|
||||
we will get the following files:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*pnnx*
|
||||
|
||||
-rw-r--r-- 1 kuangfangjun root 1010K Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.pt
|
||||
-rw-r--r-- 1 kuangfangjun root 283M Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.pt
|
||||
-rw-r--r-- 1 kuangfangjun root 3.0M Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.pt
|
||||
|
||||
|
||||
.. _conv-emformer-step-4-export-torchscript-model-via-pnnx:
|
||||
|
||||
4. Export torchscript model via pnnx
|
||||
------------------------------------
|
||||
|
||||
.. hint::
|
||||
|
||||
Make sure you have set up the ``PATH`` environment variable. Otherwise,
|
||||
it will throw an error saying that ``pnnx`` could not be found.
|
||||
|
||||
Now, it's time to export our models to `ncnn`_ via ``pnnx``.
|
||||
|
||||
.. code-block::
|
||||
|
||||
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
|
||||
|
||||
pnnx ./encoder_jit_trace-pnnx.pt
|
||||
pnnx ./decoder_jit_trace-pnnx.pt
|
||||
pnnx ./joiner_jit_trace-pnnx.pt
|
||||
|
||||
It will generate the following files:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*ncnn*{bin,param}
|
||||
|
||||
-rw-r--r-- 1 kuangfangjun root 503K Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 437 Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param
|
||||
-rw-r--r-- 1 kuangfangjun root 142M Jan 11 12:36 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 79K Jan 11 12:36 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param
|
||||
-rw-r--r-- 1 kuangfangjun root 1.5M Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 488 Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param
|
||||
|
||||
There are two types of files:
|
||||
|
||||
- ``param``: It is a text file containing the model architectures. You can
|
||||
use a text editor to view its content.
|
||||
- ``bin``: It is a binary file containing the model parameters.
|
||||
|
||||
We compare the file sizes of the models below before and after converting via ``pnnx``:
|
||||
|
||||
.. see https://tableconvert.com/restructuredtext-generator
|
||||
|
||||
+----------------------------------+------------+
|
||||
| File name | File size |
|
||||
+==================================+============+
|
||||
| encoder_jit_trace-pnnx.pt | 283 MB |
|
||||
+----------------------------------+------------+
|
||||
| decoder_jit_trace-pnnx.pt | 1010 KB |
|
||||
+----------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.pt | 3.0 MB |
|
||||
+----------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.ncnn.bin | 142 MB |
|
||||
+----------------------------------+------------+
|
||||
| decoder_jit_trace-pnnx.ncnn.bin | 503 KB |
|
||||
+----------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.ncnn.bin | 1.5 MB |
|
||||
+----------------------------------+------------+
|
||||
|
||||
You can see that the file sizes of the models after conversion are about one half
|
||||
of the models before conversion:
|
||||
|
||||
- encoder: 283 MB vs 142 MB
|
||||
- decoder: 1010 KB vs 503 KB
|
||||
- joiner: 3.0 MB vs 1.5 MB
|
||||
|
||||
The reason is that by default ``pnnx`` converts ``float32`` parameters
|
||||
to ``float16``. A ``float32`` parameter occupies 4 bytes, while it is 2 bytes
|
||||
for ``float16``. Thus, the model is about half the size after conversion.
|
||||
|
||||
.. hint::
|
||||
|
||||
If you use ``pnnx ./encoder_jit_trace-pnnx.pt fp16=0``, then ``pnnx``
|
||||
won't convert ``float32`` to ``float16``.
|
||||
|
||||
5. Test the exported models in icefall
|
||||
--------------------------------------
|
||||
|
||||
.. note::
|
||||
|
||||
We assume you have set up the environment variable ``PYTHONPATH`` when
|
||||
building `ncnn`_.
|
||||
|
||||
Now we have successfully converted our pre-trained model to `ncnn`_ format.
|
||||
The generated 6 files are what we need. You can use the following code to
|
||||
test the converted models:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./conv_emformer_transducer_stateless2/streaming-ncnn-decode.py \
|
||||
--tokens ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/data/lang_bpe_500/tokens.txt \
|
||||
--encoder-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param \
|
||||
--encoder-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin \
|
||||
--decoder-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param \
|
||||
--decoder-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin \
|
||||
--joiner-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param \
|
||||
--joiner-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin \
|
||||
./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav
|
||||
|
||||
.. hint::
|
||||
|
||||
`ncnn`_ supports only ``batch size == 1``, so ``streaming-ncnn-decode.py`` accepts
|
||||
only 1 wave file as input.
|
||||
|
||||
The output is given below:
|
||||
|
||||
.. literalinclude:: ./code/test-streaming-ncnn-decode-conv-emformer-transducer-libri.txt
|
||||
|
||||
Congratulations! You have successfully exported a model from PyTorch to `ncnn`_!
|
||||
|
||||
|
||||
.. _conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn:
|
||||
|
||||
6. Modify the exported encoder for sherpa-ncnn
|
||||
----------------------------------------------
|
||||
|
||||
In order to use the exported models in `sherpa-ncnn`_, we have to modify
|
||||
``encoder_jit_trace-pnnx.ncnn.param``.
|
||||
|
||||
Let us have a look at the first few lines of ``encoder_jit_trace-pnnx.ncnn.param``:
|
||||
|
||||
.. code-block::
|
||||
|
||||
7767517
|
||||
1060 1342
|
||||
Input in0 0 1 in0
|
||||
|
||||
**Explanation** of the above three lines:
|
||||
|
||||
1. ``7767517``, it is a magic number and should not be changed.
|
||||
2. ``1060 1342``, the first number ``1060`` specifies the number of layers
|
||||
in this file, while ``1342`` specifies the number of intermediate outputs
|
||||
of this file.
|
||||
3. ``Input in0 0 1 in0``, ``Input`` is the layer type of this layer; ``in0``
|
||||
is the layer name of this layer; ``0`` means this layer has no input;
|
||||
``1`` means this layer has one output; ``in0`` is the output name of
|
||||
this layer.
|
||||
|
||||
We need to add 1 extra line and also increment the number of layers.
|
||||
The result looks like below:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
7767517
|
||||
1061 1342
|
||||
SherpaMetaData sherpa_meta_data1 0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512
|
||||
Input in0 0 1 in0
|
||||
|
||||
**Explanation**
|
||||
|
||||
1. ``7767517``, it is still the same
|
||||
2. ``1061 1342``, we have added an extra layer, so we need to update ``1060`` to ``1061``.
|
||||
We don't need to change ``1342`` since the newly added layer has no inputs or outputs.
|
||||
3. ``SherpaMetaData sherpa_meta_data1 0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512``
|
||||
This line is newly added. Its explanation is given below:
|
||||
|
||||
- ``SherpaMetaData`` is the type of this layer. Must be ``SherpaMetaData``.
|
||||
- ``sherpa_meta_data1`` is the name of this layer. Must be ``sherpa_meta_data1``.
|
||||
- ``0 0`` means this layer has no inputs or outputs. Must be ``0 0``.
|
||||
- ``0=1``, 0 is the key and 1 is the value. MUST be ``0=1``
|
||||
- ``1=12``, 1 is the key and 12 is the value of the
|
||||
parameter ``--num-encoder-layers`` that you provided when running
|
||||
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
|
||||
- ``2=32``, 2 is the key and 32 is the value of the
|
||||
parameter ``--memory-size`` that you provided when running
|
||||
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
|
||||
- ``3=31``, 3 is the key and 31 is the value of the
|
||||
parameter ``--cnn-module-kernel`` that you provided when running
|
||||
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
|
||||
- ``4=8``, 4 is the key and 8 is the value of the
|
||||
parameter ``--left-context-length`` that you provided when running
|
||||
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
|
||||
- ``5=32``, 5 is the key and 32 is the value of the
|
||||
parameter ``--chunk-length`` that you provided when running
|
||||
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
|
||||
- ``6=8``, 6 is the key and 8 is the value of the
|
||||
parameter ``--right-context-length`` that you provided when running
|
||||
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
|
||||
- ``7=512``, 7 is the key and 512 is the value of the
|
||||
parameter ``--encoder-dim`` that you provided when running
|
||||
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
|
||||
|
||||
For ease of reference, we list the key-value pairs that you need to add
|
||||
in the following table. If your model has a different setting, please
|
||||
change the values for ``SherpaMetaData`` accordingly. Otherwise, you
|
||||
will be ``SAD``.
|
||||
|
||||
+------+-----------------------------+
|
||||
| key | value |
|
||||
+======+=============================+
|
||||
| 0 | 1 (fixed) |
|
||||
+------+-----------------------------+
|
||||
| 1 | ``--num-encoder-layers`` |
|
||||
+------+-----------------------------+
|
||||
| 2 | ``--memory-size`` |
|
||||
+------+-----------------------------+
|
||||
| 3 | ``--cnn-module-kernel`` |
|
||||
+------+-----------------------------+
|
||||
| 4 | ``--left-context-length`` |
|
||||
+------+-----------------------------+
|
||||
| 5 | ``--chunk-length`` |
|
||||
+------+-----------------------------+
|
||||
| 6 | ``--right-context-length`` |
|
||||
+------+-----------------------------+
|
||||
| 7 | ``--encoder-dim`` |
|
||||
+------+-----------------------------+
|
||||
|
||||
4. ``Input in0 0 1 in0``. No need to change it.
|
||||
|
||||
.. caution::
|
||||
|
||||
When you add a new layer ``SherpaMetaData``, please remember to update the
|
||||
number of layers. In our case, update ``1060`` to ``1061``. Otherwise,
|
||||
you will be SAD later.
|
||||
|
||||
.. hint::
|
||||
|
||||
After adding the new layer ``SherpaMetaData``, you cannot use this model
|
||||
with ``streaming-ncnn-decode.py`` anymore since ``SherpaMetaData`` is
|
||||
supported only in `sherpa-ncnn`_.
|
||||
|
||||
.. hint::
|
||||
|
||||
`ncnn`_ is very flexible. You can add new layers to it just by text-editing
|
||||
the ``param`` file! You don't need to change the ``bin`` file.
|
||||
|
||||
Now you can use this model in `sherpa-ncnn`_.
|
||||
Please refer to the following documentation:
|
||||
|
||||
- Linux/macOS/Windows/arm/aarch64: `<https://k2-fsa.github.io/sherpa/ncnn/install/index.html>`_
|
||||
- ``Android``: `<https://k2-fsa.github.io/sherpa/ncnn/android/index.html>`_
|
||||
- ``iOS``: `<https://k2-fsa.github.io/sherpa/ncnn/ios/index.html>`_
|
||||
- Python: `<https://k2-fsa.github.io/sherpa/ncnn/python/index.html>`_
|
||||
|
||||
We have a list of pre-trained models that have been exported for `sherpa-ncnn`_:
|
||||
|
||||
- `<https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html>`_
|
||||
|
||||
You can find more usages there.
|
||||
|
||||
7. (Optional) int8 quantization with sherpa-ncnn
|
||||
------------------------------------------------
|
||||
|
||||
This step is optional.
|
||||
|
||||
In this step, we describe how to quantize our model with ``int8``.
|
||||
|
||||
Change :ref:`conv-emformer-step-4-export-torchscript-model-via-pnnx` to
|
||||
disable ``fp16`` when using ``pnnx``:
|
||||
|
||||
.. code-block::
|
||||
|
||||
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
|
||||
|
||||
pnnx ./encoder_jit_trace-pnnx.pt fp16=0
|
||||
pnnx ./decoder_jit_trace-pnnx.pt
|
||||
pnnx ./joiner_jit_trace-pnnx.pt fp16=0
|
||||
|
||||
.. note::
|
||||
|
||||
We add ``fp16=0`` when exporting the encoder and joiner. `ncnn`_ does not
|
||||
support quantizing the decoder model yet. We will update this documentation
|
||||
once `ncnn`_ supports it. (Maybe in this year, 2023).
|
||||
|
||||
It will generate the following files
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*_jit_trace-pnnx.ncnn.{param,bin}
|
||||
|
||||
-rw-r--r-- 1 kuangfangjun root 503K Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 437 Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param
|
||||
-rw-r--r-- 1 kuangfangjun root 283M Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 79K Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param
|
||||
-rw-r--r-- 1 kuangfangjun root 3.0M Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 488 Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param
|
||||
|
||||
Let us compare again the file sizes:
|
||||
|
||||
+----------------------------------------+------------+
|
||||
| File name | File size |
|
||||
+----------------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.pt | 283 MB |
|
||||
+----------------------------------------+------------+
|
||||
| decoder_jit_trace-pnnx.pt | 1010 KB |
|
||||
+----------------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.pt | 3.0 MB |
|
||||
+----------------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.ncnn.bin (fp16) | 142 MB |
|
||||
+----------------------------------------+------------+
|
||||
| decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB |
|
||||
+----------------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.ncnn.bin (fp16) | 1.5 MB |
|
||||
+----------------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.ncnn.bin (fp32) | 283 MB |
|
||||
+----------------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.ncnn.bin (fp32) | 3.0 MB |
|
||||
+----------------------------------------+------------+
|
||||
|
||||
You can see that the file sizes are doubled when we disable ``fp16``.
|
||||
|
||||
.. note::
|
||||
|
||||
You can again use ``streaming-ncnn-decode.py`` to test the exported models.
|
||||
|
||||
Next, follow :ref:`conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn`
|
||||
to modify ``encoder_jit_trace-pnnx.ncnn.param``.
|
||||
|
||||
Change
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
7767517
|
||||
1060 1342
|
||||
Input in0 0 1 in0
|
||||
|
||||
to
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
7767517
|
||||
1061 1342
|
||||
SherpaMetaData sherpa_meta_data1 0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512
|
||||
Input in0 0 1 in0
|
||||
|
||||
.. caution::
|
||||
|
||||
Please follow :ref:`conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn`
|
||||
to change the values for ``SherpaMetaData`` if your model uses a different setting.
|
||||
|
||||
|
||||
Next, let us compile `sherpa-ncnn`_ since we will quantize our models within
|
||||
`sherpa-ncnn`_.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# We will download sherpa-ncnn to $HOME/open-source/
|
||||
# You can change it to anywhere you like.
|
||||
cd $HOME
|
||||
mkdir -p open-source
|
||||
|
||||
cd open-source
|
||||
git clone https://github.com/k2-fsa/sherpa-ncnn
|
||||
cd sherpa-ncnn
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
make -j 4
|
||||
|
||||
./bin/generate-int8-scale-table
|
||||
|
||||
export PATH=$HOME/open-source/sherpa-ncnn/build/bin:$PATH
|
||||
|
||||
The output of the above commands are:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
(py38) kuangfangjun:build$ generate-int8-scale-table
|
||||
Please provide 10 arg. Currently given: 1
|
||||
Usage:
|
||||
generate-int8-scale-table encoder.param encoder.bin decoder.param decoder.bin joiner.param joiner.bin encoder-scale-table.txt joiner-scale-table.txt wave_filenames.txt
|
||||
|
||||
Each line in wave_filenames.txt is a path to some 16k Hz mono wave file.
|
||||
|
||||
We need to create a file ``wave_filenames.txt``, in which we need to put
|
||||
some calibration wave files. For testing purpose, we put the ``test_wavs``
|
||||
from the pre-trained model repository `<https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>`_
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
|
||||
|
||||
cat <<EOF > wave_filenames.txt
|
||||
../test_wavs/1089-134686-0001.wav
|
||||
../test_wavs/1221-135766-0001.wav
|
||||
../test_wavs/1221-135766-0002.wav
|
||||
EOF
|
||||
|
||||
Now we can calculate the scales needed for quantization with the calibration data:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
|
||||
|
||||
generate-int8-scale-table \
|
||||
./encoder_jit_trace-pnnx.ncnn.param \
|
||||
./encoder_jit_trace-pnnx.ncnn.bin \
|
||||
./decoder_jit_trace-pnnx.ncnn.param \
|
||||
./decoder_jit_trace-pnnx.ncnn.bin \
|
||||
./joiner_jit_trace-pnnx.ncnn.param \
|
||||
./joiner_jit_trace-pnnx.ncnn.bin \
|
||||
./encoder-scale-table.txt \
|
||||
./joiner-scale-table.txt \
|
||||
./wave_filenames.txt
|
||||
|
||||
The output logs are in the following:
|
||||
|
||||
.. literalinclude:: ./code/generate-int-8-scale-table-for-conv-emformer.txt
|
||||
|
||||
It generates the following two files:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ ls -lh encoder-scale-table.txt joiner-scale-table.txt
|
||||
-rw-r--r-- 1 kuangfangjun root 955K Jan 11 17:28 encoder-scale-table.txt
|
||||
-rw-r--r-- 1 kuangfangjun root 18K Jan 11 17:28 joiner-scale-table.txt
|
||||
|
||||
.. caution::
|
||||
|
||||
Definitely, you need more calibration data to compute the scale table.
|
||||
|
||||
Finally, let us use the scale table to quantize our models into ``int8``.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ncnn2int8
|
||||
|
||||
usage: ncnn2int8 [inparam] [inbin] [outparam] [outbin] [calibration table]
|
||||
|
||||
First, we quantize the encoder model:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
|
||||
|
||||
ncnn2int8 \
|
||||
./encoder_jit_trace-pnnx.ncnn.param \
|
||||
./encoder_jit_trace-pnnx.ncnn.bin \
|
||||
./encoder_jit_trace-pnnx.ncnn.int8.param \
|
||||
./encoder_jit_trace-pnnx.ncnn.int8.bin \
|
||||
./encoder-scale-table.txt
|
||||
|
||||
Next, we quantize the joiner model:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ncnn2int8 \
|
||||
./joiner_jit_trace-pnnx.ncnn.param \
|
||||
./joiner_jit_trace-pnnx.ncnn.bin \
|
||||
./joiner_jit_trace-pnnx.ncnn.int8.param \
|
||||
./joiner_jit_trace-pnnx.ncnn.int8.bin \
|
||||
./joiner-scale-table.txt
|
||||
|
||||
The above two commands generate the following 4 files:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
-rw-r--r-- 1 kuangfangjun root 99M Jan 11 17:34 encoder_jit_trace-pnnx.ncnn.int8.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 78K Jan 11 17:34 encoder_jit_trace-pnnx.ncnn.int8.param
|
||||
-rw-r--r-- 1 kuangfangjun root 774K Jan 11 17:35 joiner_jit_trace-pnnx.ncnn.int8.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 496 Jan 11 17:35 joiner_jit_trace-pnnx.ncnn.int8.param
|
||||
|
||||
Congratulations! You have successfully quantized your model from ``float32`` to ``int8``.
|
||||
|
||||
.. caution::
|
||||
|
||||
``ncnn.int8.param`` and ``ncnn.int8.bin`` must be used in pairs.
|
||||
|
||||
You can replace ``ncnn.param`` and ``ncnn.bin`` with ``ncnn.int8.param``
|
||||
and ``ncnn.int8.bin`` in `sherpa-ncnn`_ if you like.
|
||||
|
||||
For instance, to use only the ``int8`` encoder in ``sherpa-ncnn``, you can
|
||||
replace the following invocation:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
|
||||
|
||||
sherpa-ncnn \
|
||||
../data/lang_bpe_500/tokens.txt \
|
||||
./encoder_jit_trace-pnnx.ncnn.param \
|
||||
./encoder_jit_trace-pnnx.ncnn.bin \
|
||||
./decoder_jit_trace-pnnx.ncnn.param \
|
||||
./decoder_jit_trace-pnnx.ncnn.bin \
|
||||
./joiner_jit_trace-pnnx.ncnn.param \
|
||||
./joiner_jit_trace-pnnx.ncnn.bin \
|
||||
../test_wavs/1089-134686-0001.wav
|
||||
|
||||
with
|
||||
|
||||
.. code-block::
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
|
||||
|
||||
sherpa-ncnn \
|
||||
../data/lang_bpe_500/tokens.txt \
|
||||
./encoder_jit_trace-pnnx.ncnn.int8.param \
|
||||
./encoder_jit_trace-pnnx.ncnn.int8.bin \
|
||||
./decoder_jit_trace-pnnx.ncnn.param \
|
||||
./decoder_jit_trace-pnnx.ncnn.bin \
|
||||
./joiner_jit_trace-pnnx.ncnn.param \
|
||||
./joiner_jit_trace-pnnx.ncnn.bin \
|
||||
../test_wavs/1089-134686-0001.wav
|
||||
|
||||
|
||||
The following table compares again the file sizes:
|
||||
|
||||
|
||||
+----------------------------------------+------------+
|
||||
| File name | File size |
|
||||
+----------------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.pt | 283 MB |
|
||||
+----------------------------------------+------------+
|
||||
| decoder_jit_trace-pnnx.pt | 1010 KB |
|
||||
+----------------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.pt | 3.0 MB |
|
||||
+----------------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.ncnn.bin (fp16) | 142 MB |
|
||||
+----------------------------------------+------------+
|
||||
| decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB |
|
||||
+----------------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.ncnn.bin (fp16) | 1.5 MB |
|
||||
+----------------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.ncnn.bin (fp32) | 283 MB |
|
||||
+----------------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.ncnn.bin (fp32) | 3.0 MB |
|
||||
+----------------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.ncnn.int8.bin | 99 MB |
|
||||
+----------------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.ncnn.int8.bin | 774 KB |
|
||||
+----------------------------------------+------------+
|
||||
|
||||
You can see that the file sizes of the model after ``int8`` quantization
|
||||
are much smaller.
|
||||
|
||||
.. hint::
|
||||
|
||||
Currently, only linear layers and convolutional layers are quantized
|
||||
with ``int8``, so you don't see an exact ``4x`` reduction in file sizes.
|
||||
|
||||
.. note::
|
||||
|
||||
You need to test the recognition accuracy after ``int8`` quantization.
|
||||
|
||||
You can find the speed comparison at `<https://github.com/k2-fsa/sherpa-ncnn/issues/44>`_.
|
||||
|
||||
|
||||
That's it! Have fun with `sherpa-ncnn`_!
|
||||
644
docs/source/model-export/export-ncnn-lstm.rst
Normal file
644
docs/source/model-export/export-ncnn-lstm.rst
Normal file
@ -0,0 +1,644 @@
|
||||
.. _export_lstm_transducer_models_to_ncnn:
|
||||
|
||||
Export LSTM transducer models to ncnn
|
||||
-------------------------------------
|
||||
|
||||
We use the pre-trained model from the following repository as an example:
|
||||
|
||||
`<https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03>`_
|
||||
|
||||
We will show you step by step how to export it to `ncnn`_ and run it with `sherpa-ncnn`_.
|
||||
|
||||
.. hint::
|
||||
|
||||
We use ``Ubuntu 18.04``, ``torch 1.13``, and ``Python 3.8`` for testing.
|
||||
|
||||
.. caution::
|
||||
|
||||
Please use a more recent version of PyTorch. For instance, ``torch 1.8``
|
||||
may ``not`` work.
|
||||
|
||||
1. Download the pre-trained model
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. hint::
|
||||
|
||||
You have to install `git-lfs`_ before you continue.
|
||||
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03
|
||||
cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03
|
||||
|
||||
git lfs pull --include "exp/pretrained-iter-468000-avg-16.pt"
|
||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
||||
|
||||
cd ..
|
||||
|
||||
.. note::
|
||||
|
||||
We downloaded ``exp/pretrained-xxx.pt``, not ``exp/cpu-jit_xxx.pt``.
|
||||
|
||||
In the above code, we downloaded the pre-trained model into the directory
|
||||
``egs/librispeech/ASR/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03``.
|
||||
|
||||
2. Install ncnn and pnnx
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Please refer to :ref:`export_for_ncnn_install_ncnn_and_pnnx` .
|
||||
|
||||
|
||||
3. Export the model via torch.jit.trace()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
First, let us rename our pre-trained model:
|
||||
|
||||
.. code-block::
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
|
||||
cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp
|
||||
|
||||
ln -s pretrained-iter-468000-avg-16.pt epoch-99.pt
|
||||
|
||||
cd ../..
|
||||
|
||||
Next, we use the following code to export our model:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
dir=./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03
|
||||
|
||||
./lstm_transducer_stateless2/export-for-ncnn.py \
|
||||
--exp-dir $dir/exp \
|
||||
--bpe-model $dir/data/lang_bpe_500/bpe.model \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--use-averaged-model 0 \
|
||||
--num-encoder-layers 12 \
|
||||
--encoder-dim 512 \
|
||||
--rnn-hidden-size 1024
|
||||
|
||||
.. hint::
|
||||
|
||||
We have renamed our model to ``epoch-99.pt`` so that we can use ``--epoch 99``.
|
||||
There is only one pre-trained model, so we use ``--avg 1 --use-averaged-model 0``.
|
||||
|
||||
If you have trained a model by yourself and if you have all checkpoints
|
||||
available, please first use ``decode.py`` to tune ``--epoch --avg``
|
||||
and select the best combination with ``--use-averaged-model 1``.
|
||||
|
||||
.. note::
|
||||
|
||||
You will see the following log output:
|
||||
|
||||
.. literalinclude:: ./code/export-lstm-transducer-for-ncnn-output.txt
|
||||
|
||||
The log shows the model has ``84176356`` parameters, i.e., ``~84 M``.
|
||||
|
||||
.. code-block::
|
||||
|
||||
ls -lh icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/pretrained-iter-468000-avg-16.pt
|
||||
|
||||
-rw-r--r-- 1 kuangfangjun root 324M Feb 17 10:34 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/pretrained-iter-468000-avg-16.pt
|
||||
|
||||
You can see that the file size of the pre-trained model is ``324 MB``, which
|
||||
is roughly equal to ``84176356*4/1024/1024 = 321.107 MB``.
|
||||
|
||||
After running ``lstm_transducer_stateless2/export-for-ncnn.py``,
|
||||
we will get the following files:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ls -lh icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/*pnnx.pt
|
||||
|
||||
-rw-r--r-- 1 kuangfangjun root 1010K Feb 17 11:22 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.pt
|
||||
-rw-r--r-- 1 kuangfangjun root 318M Feb 17 11:22 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.pt
|
||||
-rw-r--r-- 1 kuangfangjun root 3.0M Feb 17 11:22 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.pt
|
||||
|
||||
|
||||
.. _lstm-transducer-step-4-export-torchscript-model-via-pnnx:
|
||||
|
||||
4. Export torchscript model via pnnx
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. hint::
|
||||
|
||||
Make sure you have set up the ``PATH`` environment variable
|
||||
in :ref:`export_for_ncnn_install_ncnn_and_pnnx`. Otherwise,
|
||||
it will throw an error saying that ``pnnx`` could not be found.
|
||||
|
||||
Now, it's time to export our models to `ncnn`_ via ``pnnx``.
|
||||
|
||||
.. code-block::
|
||||
|
||||
cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/
|
||||
|
||||
pnnx ./encoder_jit_trace-pnnx.pt
|
||||
pnnx ./decoder_jit_trace-pnnx.pt
|
||||
pnnx ./joiner_jit_trace-pnnx.pt
|
||||
|
||||
It will generate the following files:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ls -lh icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/*ncnn*{bin,param}
|
||||
|
||||
-rw-r--r-- 1 kuangfangjun root 503K Feb 17 11:32 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.ncnn.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 437 Feb 17 11:32 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.ncnn.param
|
||||
-rw-r--r-- 1 kuangfangjun root 159M Feb 17 11:32 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.ncnn.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 21K Feb 17 11:32 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.ncnn.param
|
||||
-rw-r--r-- 1 kuangfangjun root 1.5M Feb 17 11:33 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.ncnn.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 488 Feb 17 11:33 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.ncnn.param
|
||||
|
||||
|
||||
There are two types of files:
|
||||
|
||||
- ``param``: It is a text file containing the model architectures. You can
|
||||
use a text editor to view its content.
|
||||
- ``bin``: It is a binary file containing the model parameters.
|
||||
|
||||
We compare the file sizes of the models below before and after converting via ``pnnx``:
|
||||
|
||||
.. see https://tableconvert.com/restructuredtext-generator
|
||||
|
||||
+----------------------------------+------------+
|
||||
| File name | File size |
|
||||
+==================================+============+
|
||||
| encoder_jit_trace-pnnx.pt | 318 MB |
|
||||
+----------------------------------+------------+
|
||||
| decoder_jit_trace-pnnx.pt | 1010 KB |
|
||||
+----------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.pt | 3.0 MB |
|
||||
+----------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.ncnn.bin | 159 MB |
|
||||
+----------------------------------+------------+
|
||||
| decoder_jit_trace-pnnx.ncnn.bin | 503 KB |
|
||||
+----------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.ncnn.bin | 1.5 MB |
|
||||
+----------------------------------+------------+
|
||||
|
||||
You can see that the file sizes of the models after conversion are about one half
|
||||
of the models before conversion:
|
||||
|
||||
- encoder: 318 MB vs 159 MB
|
||||
- decoder: 1010 KB vs 503 KB
|
||||
- joiner: 3.0 MB vs 1.5 MB
|
||||
|
||||
The reason is that by default ``pnnx`` converts ``float32`` parameters
|
||||
to ``float16``. A ``float32`` parameter occupies 4 bytes, while it is 2 bytes
|
||||
for ``float16``. Thus, it is ``twice smaller`` after conversion.
|
||||
|
||||
.. hint::
|
||||
|
||||
If you use ``pnnx ./encoder_jit_trace-pnnx.pt fp16=0``, then ``pnnx``
|
||||
won't convert ``float32`` to ``float16``.
|
||||
|
||||
5. Test the exported models in icefall
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. note::
|
||||
|
||||
We assume you have set up the environment variable ``PYTHONPATH`` when
|
||||
building `ncnn`_.
|
||||
|
||||
Now we have successfully converted our pre-trained model to `ncnn`_ format.
|
||||
The generated 6 files are what we need. You can use the following code to
|
||||
test the converted models:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python3 ./lstm_transducer_stateless2/streaming-ncnn-decode.py \
|
||||
--tokens ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/data/lang_bpe_500/tokens.txt \
|
||||
--encoder-param-filename ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.ncnn.param \
|
||||
--encoder-bin-filename ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.ncnn.bin \
|
||||
--decoder-param-filename ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.ncnn.param \
|
||||
--decoder-bin-filename ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.ncnn.bin \
|
||||
--joiner-param-filename ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.ncnn.param \
|
||||
--joiner-bin-filename ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.ncnn.bin \
|
||||
./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/test_wavs/1089-134686-0001.wav
|
||||
|
||||
.. hint::
|
||||
|
||||
`ncnn`_ supports only ``batch size == 1``, so ``streaming-ncnn-decode.py`` accepts
|
||||
only 1 wave file as input.
|
||||
|
||||
The output is given below:
|
||||
|
||||
.. literalinclude:: ./code/test-streaming-ncnn-decode-lstm-transducer-libri.txt
|
||||
|
||||
Congratulations! You have successfully exported a model from PyTorch to `ncnn`_!
|
||||
|
||||
.. _lstm-modify-the-exported-encoder-for-sherpa-ncnn:
|
||||
|
||||
6. Modify the exported encoder for sherpa-ncnn
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
In order to use the exported models in `sherpa-ncnn`_, we have to modify
|
||||
``encoder_jit_trace-pnnx.ncnn.param``.
|
||||
|
||||
Let us have a look at the first few lines of ``encoder_jit_trace-pnnx.ncnn.param``:
|
||||
|
||||
.. code-block::
|
||||
|
||||
7767517
|
||||
267 379
|
||||
Input in0 0 1 in0
|
||||
|
||||
**Explanation** of the above three lines:
|
||||
|
||||
1. ``7767517``, it is a magic number and should not be changed.
|
||||
2. ``267 379``, the first number ``267`` specifies the number of layers
|
||||
in this file, while ``379`` specifies the number of intermediate outputs
|
||||
of this file.
|
||||
3. ``Input in0 0 1 in0``, ``Input`` is the layer type of this layer; ``in0``
|
||||
is the layer name of this layer; ``0`` means this layer has no input;
|
||||
``1`` means this layer has one output; ``in0`` is the output name of
|
||||
this layer.
|
||||
|
||||
We need to add 1 extra line and also increment the number of layers.
|
||||
The result looks like below:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
7767517
|
||||
268 379
|
||||
SherpaMetaData sherpa_meta_data1 0 0 0=3 1=12 2=512 3=1024
|
||||
Input in0 0 1 in0
|
||||
|
||||
**Explanation**
|
||||
|
||||
1. ``7767517``, it is still the same
|
||||
2. ``268 379``, we have added an extra layer, so we need to update ``267`` to ``268``.
|
||||
We don't need to change ``379`` since the newly added layer has no inputs or outputs.
|
||||
3. ``SherpaMetaData sherpa_meta_data1 0 0 0=3 1=12 2=512 3=1024``
|
||||
This line is newly added. Its explanation is given below:
|
||||
|
||||
- ``SherpaMetaData`` is the type of this layer. Must be ``SherpaMetaData``.
|
||||
- ``sherpa_meta_data1`` is the name of this layer. Must be ``sherpa_meta_data1``.
|
||||
- ``0 0`` means this layer has no inputs or outputs. Must be ``0 0``.
|
||||
- ``0=3``, 0 is the key and 3 is the value. MUST be ``0=3``
|
||||
- ``1=12``, 1 is the key and 12 is the value of the
|
||||
parameter ``--num-encoder-layers`` that you provided when running
|
||||
``./lstm_transducer_stateless2/export-for-ncnn.py``.
|
||||
- ``2=512``, 2 is the key and 512 is the value of the
|
||||
parameter ``--encoder-dim`` that you provided when running
|
||||
``./lstm_transducer_stateless2/export-for-ncnn.py``.
|
||||
- ``3=1024``, 3 is the key and 1024 is the value of the
|
||||
parameter ``--rnn-hidden-size`` that you provided when running
|
||||
``./lstm_transducer_stateless2/export-for-ncnn.py``.
|
||||
|
||||
For ease of reference, we list the key-value pairs that you need to add
|
||||
in the following table. If your model has a different setting, please
|
||||
change the values for ``SherpaMetaData`` accordingly. Otherwise, you
|
||||
will be ``SAD``.
|
||||
|
||||
+------+-----------------------------+
|
||||
| key | value |
|
||||
+======+=============================+
|
||||
| 0 | 3 (fixed) |
|
||||
+------+-----------------------------+
|
||||
| 1 | ``--num-encoder-layers`` |
|
||||
+------+-----------------------------+
|
||||
| 2 | ``--encoder-dim`` |
|
||||
+------+-----------------------------+
|
||||
| 3 | ``--rnn-hidden-size`` |
|
||||
+------+-----------------------------+
|
||||
|
||||
4. ``Input in0 0 1 in0``. No need to change it.
|
||||
|
||||
.. caution::
|
||||
|
||||
When you add a new layer ``SherpaMetaData``, please remember to update the
|
||||
number of layers. In our case, update ``267`` to ``268``. Otherwise,
|
||||
you will be SAD later.
|
||||
|
||||
.. hint::
|
||||
|
||||
After adding the new layer ``SherpaMetaData``, you cannot use this model
|
||||
with ``streaming-ncnn-decode.py`` anymore since ``SherpaMetaData`` is
|
||||
supported only in `sherpa-ncnn`_.
|
||||
|
||||
.. hint::
|
||||
|
||||
`ncnn`_ is very flexible. You can add new layers to it just by text-editing
|
||||
the ``param`` file! You don't need to change the ``bin`` file.
|
||||
|
||||
Now you can use this model in `sherpa-ncnn`_.
|
||||
Please refer to the following documentation:
|
||||
|
||||
- Linux/macOS/Windows/arm/aarch64: `<https://k2-fsa.github.io/sherpa/ncnn/install/index.html>`_
|
||||
- ``Android``: `<https://k2-fsa.github.io/sherpa/ncnn/android/index.html>`_
|
||||
- ``iOS``: `<https://k2-fsa.github.io/sherpa/ncnn/ios/index.html>`_
|
||||
- Python: `<https://k2-fsa.github.io/sherpa/ncnn/python/index.html>`_
|
||||
|
||||
We have a list of pre-trained models that have been exported for `sherpa-ncnn`_:
|
||||
|
||||
- `<https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html>`_
|
||||
|
||||
You can find more usages there.
|
||||
|
||||
7. (Optional) int8 quantization with sherpa-ncnn
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
This step is optional.
|
||||
|
||||
In this step, we describe how to quantize our model with ``int8``.
|
||||
|
||||
Change :ref:`lstm-transducer-step-4-export-torchscript-model-via-pnnx` to
|
||||
disable ``fp16`` when using ``pnnx``:
|
||||
|
||||
.. code-block::
|
||||
|
||||
cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/
|
||||
|
||||
pnnx ./encoder_jit_trace-pnnx.pt fp16=0
|
||||
pnnx ./decoder_jit_trace-pnnx.pt
|
||||
pnnx ./joiner_jit_trace-pnnx.pt fp16=0
|
||||
|
||||
.. note::
|
||||
|
||||
We add ``fp16=0`` when exporting the encoder and joiner. `ncnn`_ does not
|
||||
support quantizing the decoder model yet. We will update this documentation
|
||||
once `ncnn`_ supports it. (Maybe in this year, 2023).
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ls -lh icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/*_jit_trace-pnnx.ncnn.{param,bin}
|
||||
|
||||
-rw-r--r-- 1 kuangfangjun root 503K Feb 17 11:32 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.ncnn.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 437 Feb 17 11:32 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.ncnn.param
|
||||
-rw-r--r-- 1 kuangfangjun root 317M Feb 17 11:54 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.ncnn.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 21K Feb 17 11:54 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.ncnn.param
|
||||
-rw-r--r-- 1 kuangfangjun root 3.0M Feb 17 11:54 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.ncnn.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 488 Feb 17 11:54 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.ncnn.param
|
||||
|
||||
|
||||
Let us compare again the file sizes:
|
||||
|
||||
+----------------------------------------+------------+
|
||||
| File name | File size |
|
||||
+----------------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.pt | 318 MB |
|
||||
+----------------------------------------+------------+
|
||||
| decoder_jit_trace-pnnx.pt | 1010 KB |
|
||||
+----------------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.pt | 3.0 MB |
|
||||
+----------------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.ncnn.bin (fp16) | 159 MB |
|
||||
+----------------------------------------+------------+
|
||||
| decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB |
|
||||
+----------------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.ncnn.bin (fp16) | 1.5 MB |
|
||||
+----------------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.ncnn.bin (fp32) | 317 MB |
|
||||
+----------------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.ncnn.bin (fp32) | 3.0 MB |
|
||||
+----------------------------------------+------------+
|
||||
|
||||
You can see that the file sizes are doubled when we disable ``fp16``.
|
||||
|
||||
.. note::
|
||||
|
||||
You can again use ``streaming-ncnn-decode.py`` to test the exported models.
|
||||
|
||||
Next, follow :ref:`lstm-modify-the-exported-encoder-for-sherpa-ncnn`
|
||||
to modify ``encoder_jit_trace-pnnx.ncnn.param``.
|
||||
|
||||
Change
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
7767517
|
||||
267 379
|
||||
Input in0 0 1 in0
|
||||
|
||||
to
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
7767517
|
||||
268 379
|
||||
SherpaMetaData sherpa_meta_data1 0 0 0=3 1=12 2=512 3=1024
|
||||
Input in0 0 1 in0
|
||||
|
||||
.. caution::
|
||||
|
||||
Please follow :ref:`lstm-modify-the-exported-encoder-for-sherpa-ncnn`
|
||||
to change the values for ``SherpaMetaData`` if your model uses a different setting.
|
||||
|
||||
Next, let us compile `sherpa-ncnn`_ since we will quantize our models within
|
||||
`sherpa-ncnn`_.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# We will download sherpa-ncnn to $HOME/open-source/
|
||||
# You can change it to anywhere you like.
|
||||
cd $HOME
|
||||
mkdir -p open-source
|
||||
|
||||
cd open-source
|
||||
git clone https://github.com/k2-fsa/sherpa-ncnn
|
||||
cd sherpa-ncnn
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
make -j 4
|
||||
|
||||
./bin/generate-int8-scale-table
|
||||
|
||||
export PATH=$HOME/open-source/sherpa-ncnn/build/bin:$PATH
|
||||
|
||||
The output of the above commands are:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
(py38) kuangfangjun:build$ generate-int8-scale-table
|
||||
Please provide 10 arg. Currently given: 1
|
||||
Usage:
|
||||
generate-int8-scale-table encoder.param encoder.bin decoder.param decoder.bin joiner.param joiner.bin encoder-scale-table.txt joiner-scale-table.txt wave_filenames.txt
|
||||
|
||||
Each line in wave_filenames.txt is a path to some 16k Hz mono wave file.
|
||||
|
||||
We need to create a file ``wave_filenames.txt``, in which we need to put
|
||||
some calibration wave files. For testing purpose, we put the ``test_wavs``
|
||||
from the pre-trained model repository
|
||||
`<https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03>`_
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/
|
||||
|
||||
cat <<EOF > wave_filenames.txt
|
||||
../test_wavs/1089-134686-0001.wav
|
||||
../test_wavs/1221-135766-0001.wav
|
||||
../test_wavs/1221-135766-0002.wav
|
||||
EOF
|
||||
|
||||
Now we can calculate the scales needed for quantization with the calibration data:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/
|
||||
|
||||
generate-int8-scale-table \
|
||||
./encoder_jit_trace-pnnx.ncnn.param \
|
||||
./encoder_jit_trace-pnnx.ncnn.bin \
|
||||
./decoder_jit_trace-pnnx.ncnn.param \
|
||||
./decoder_jit_trace-pnnx.ncnn.bin \
|
||||
./joiner_jit_trace-pnnx.ncnn.param \
|
||||
./joiner_jit_trace-pnnx.ncnn.bin \
|
||||
./encoder-scale-table.txt \
|
||||
./joiner-scale-table.txt \
|
||||
./wave_filenames.txt
|
||||
|
||||
The output logs are in the following:
|
||||
|
||||
.. literalinclude:: ./code/generate-int-8-scale-table-for-lstm.txt
|
||||
|
||||
It generates the following two files:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ls -lh encoder-scale-table.txt joiner-scale-table.txt
|
||||
|
||||
-rw-r--r-- 1 kuangfangjun root 345K Feb 17 12:13 encoder-scale-table.txt
|
||||
-rw-r--r-- 1 kuangfangjun root 17K Feb 17 12:13 joiner-scale-table.txt
|
||||
|
||||
.. caution::
|
||||
|
||||
Definitely, you need more calibration data to compute the scale table.
|
||||
|
||||
Finally, let us use the scale table to quantize our models into ``int8``.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ncnn2int8
|
||||
|
||||
usage: ncnn2int8 [inparam] [inbin] [outparam] [outbin] [calibration table]
|
||||
|
||||
First, we quantize the encoder model:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/
|
||||
|
||||
ncnn2int8 \
|
||||
./encoder_jit_trace-pnnx.ncnn.param \
|
||||
./encoder_jit_trace-pnnx.ncnn.bin \
|
||||
./encoder_jit_trace-pnnx.ncnn.int8.param \
|
||||
./encoder_jit_trace-pnnx.ncnn.int8.bin \
|
||||
./encoder-scale-table.txt
|
||||
|
||||
Next, we quantize the joiner model:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ncnn2int8 \
|
||||
./joiner_jit_trace-pnnx.ncnn.param \
|
||||
./joiner_jit_trace-pnnx.ncnn.bin \
|
||||
./joiner_jit_trace-pnnx.ncnn.int8.param \
|
||||
./joiner_jit_trace-pnnx.ncnn.int8.bin \
|
||||
./joiner-scale-table.txt
|
||||
|
||||
The above two commands generate the following 4 files:
|
||||
|
||||
.. code-block::
|
||||
|
||||
-rw-r--r-- 1 kuangfangjun root 218M Feb 17 12:19 encoder_jit_trace-pnnx.ncnn.int8.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 21K Feb 17 12:19 encoder_jit_trace-pnnx.ncnn.int8.param
|
||||
-rw-r--r-- 1 kuangfangjun root 774K Feb 17 12:19 joiner_jit_trace-pnnx.ncnn.int8.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 496 Feb 17 12:19 joiner_jit_trace-pnnx.ncnn.int8.param
|
||||
|
||||
Congratulations! You have successfully quantized your model from ``float32`` to ``int8``.
|
||||
|
||||
.. caution::
|
||||
|
||||
``ncnn.int8.param`` and ``ncnn.int8.bin`` must be used in pairs.
|
||||
|
||||
You can replace ``ncnn.param`` and ``ncnn.bin`` with ``ncnn.int8.param``
|
||||
and ``ncnn.int8.bin`` in `sherpa-ncnn`_ if you like.
|
||||
|
||||
For instance, to use only the ``int8`` encoder in ``sherpa-ncnn``, you can
|
||||
replace the following invocation:
|
||||
|
||||
.. code-block::
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/
|
||||
|
||||
sherpa-ncnn \
|
||||
../data/lang_bpe_500/tokens.txt \
|
||||
./encoder_jit_trace-pnnx.ncnn.param \
|
||||
./encoder_jit_trace-pnnx.ncnn.bin \
|
||||
./decoder_jit_trace-pnnx.ncnn.param \
|
||||
./decoder_jit_trace-pnnx.ncnn.bin \
|
||||
./joiner_jit_trace-pnnx.ncnn.param \
|
||||
./joiner_jit_trace-pnnx.ncnn.bin \
|
||||
../test_wavs/1089-134686-0001.wav
|
||||
|
||||
with
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
|
||||
|
||||
sherpa-ncnn \
|
||||
../data/lang_bpe_500/tokens.txt \
|
||||
./encoder_jit_trace-pnnx.ncnn.int8.param \
|
||||
./encoder_jit_trace-pnnx.ncnn.int8.bin \
|
||||
./decoder_jit_trace-pnnx.ncnn.param \
|
||||
./decoder_jit_trace-pnnx.ncnn.bin \
|
||||
./joiner_jit_trace-pnnx.ncnn.param \
|
||||
./joiner_jit_trace-pnnx.ncnn.bin \
|
||||
../test_wavs/1089-134686-0001.wav
|
||||
|
||||
The following table compares again the file sizes:
|
||||
|
||||
+----------------------------------------+------------+
|
||||
| File name | File size |
|
||||
+----------------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.pt | 318 MB |
|
||||
+----------------------------------------+------------+
|
||||
| decoder_jit_trace-pnnx.pt | 1010 KB |
|
||||
+----------------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.pt | 3.0 MB |
|
||||
+----------------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.ncnn.bin (fp16) | 159 MB |
|
||||
+----------------------------------------+------------+
|
||||
| decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB |
|
||||
+----------------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.ncnn.bin (fp16) | 1.5 MB |
|
||||
+----------------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.ncnn.bin (fp32) | 317 MB |
|
||||
+----------------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.ncnn.bin (fp32) | 3.0 MB |
|
||||
+----------------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.ncnn.int8.bin | 218 MB |
|
||||
+----------------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.ncnn.int8.bin | 774 KB |
|
||||
+----------------------------------------+------------+
|
||||
|
||||
You can see that the file size of the joiner model after ``int8`` quantization
|
||||
is much smaller. However, the size of the encoder model is even larger than
|
||||
the ``fp16`` counterpart. The reason is that `ncnn`_ currently does not support
|
||||
quantizing ``LSTM`` layers into ``8-bit``. Please see
|
||||
`<https://github.com/Tencent/ncnn/issues/4532>`_
|
||||
|
||||
.. hint::
|
||||
|
||||
Currently, only linear layers and convolutional layers are quantized
|
||||
with ``int8``, so you don't see an exact ``4x`` reduction in file sizes.
|
||||
|
||||
.. note::
|
||||
|
||||
You need to test the recognition accuracy after ``int8`` quantization.
|
||||
|
||||
|
||||
That's it! Have fun with `sherpa-ncnn`_!
|
||||
383
docs/source/model-export/export-ncnn-zipformer.rst
Normal file
383
docs/source/model-export/export-ncnn-zipformer.rst
Normal file
@ -0,0 +1,383 @@
|
||||
.. _export_streaming_zipformer_transducer_models_to_ncnn:
|
||||
|
||||
Export streaming Zipformer transducer models to ncnn
|
||||
----------------------------------------------------
|
||||
|
||||
We use the pre-trained model from the following repository as an example:
|
||||
|
||||
`<https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29>`_
|
||||
|
||||
We will show you step by step how to export it to `ncnn`_ and run it with `sherpa-ncnn`_.
|
||||
|
||||
.. hint::
|
||||
|
||||
We use ``Ubuntu 18.04``, ``torch 1.13``, and ``Python 3.8`` for testing.
|
||||
|
||||
.. caution::
|
||||
|
||||
Please use a more recent version of PyTorch. For instance, ``torch 1.8``
|
||||
may ``not`` work.
|
||||
|
||||
1. Download the pre-trained model
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. hint::
|
||||
|
||||
You have to install `git-lfs`_ before you continue.
|
||||
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
|
||||
cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
|
||||
|
||||
git lfs pull --include "exp/pretrained.pt"
|
||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
||||
|
||||
cd ..
|
||||
|
||||
.. note::
|
||||
|
||||
We downloaded ``exp/pretrained-xxx.pt``, not ``exp/cpu-jit_xxx.pt``.
|
||||
|
||||
In the above code, we downloaded the pre-trained model into the directory
|
||||
``egs/librispeech/ASR/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29``.
|
||||
|
||||
2. Install ncnn and pnnx
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Please refer to :ref:`export_for_ncnn_install_ncnn_and_pnnx` .
|
||||
|
||||
|
||||
3. Export the model via torch.jit.trace()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
First, let us rename our pre-trained model:
|
||||
|
||||
.. code-block::
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
|
||||
cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
|
||||
|
||||
ln -s pretrained.pt epoch-99.pt
|
||||
|
||||
cd ../..
|
||||
|
||||
Next, we use the following code to export our model:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
|
||||
|
||||
./pruned_transducer_stateless7_streaming/export-for-ncnn.py \
|
||||
--bpe-model $dir/data/lang_bpe_500/bpe.model \
|
||||
--exp-dir $dir/exp \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
\
|
||||
--decode-chunk-len 32 \
|
||||
--num-left-chunks 4 \
|
||||
--num-encoder-layers "2,4,3,2,4" \
|
||||
--feedforward-dims "1024,1024,2048,2048,1024" \
|
||||
--nhead "8,8,8,8,8" \
|
||||
--encoder-dims "384,384,384,384,384" \
|
||||
--attention-dims "192,192,192,192,192" \
|
||||
--encoder-unmasked-dims "256,256,256,256,256" \
|
||||
--zipformer-downsampling-factors "1,2,4,8,2" \
|
||||
--cnn-module-kernels "31,31,31,31,31" \
|
||||
--decoder-dim 512 \
|
||||
--joiner-dim 512
|
||||
|
||||
.. caution::
|
||||
|
||||
If your model has different configuration parameters, please change them accordingly.
|
||||
|
||||
.. hint::
|
||||
|
||||
We have renamed our model to ``epoch-99.pt`` so that we can use ``--epoch 99``.
|
||||
There is only one pre-trained model, so we use ``--avg 1 --use-averaged-model 0``.
|
||||
|
||||
If you have trained a model by yourself and if you have all checkpoints
|
||||
available, please first use ``decode.py`` to tune ``--epoch --avg``
|
||||
and select the best combination with ``--use-averaged-model 1``.
|
||||
|
||||
.. note::
|
||||
|
||||
You will see the following log output:
|
||||
|
||||
.. literalinclude:: ./code/export-zipformer-transducer-for-ncnn-output.txt
|
||||
|
||||
The log shows the model has ``69920376`` parameters, i.e., ``~69.9 M``.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ls -lh icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/pretrained.pt
|
||||
-rw-r--r-- 1 kuangfangjun root 269M Jan 12 12:53 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/pretrained.pt
|
||||
|
||||
You can see that the file size of the pre-trained model is ``269 MB``, which
|
||||
is roughly equal to ``69920376*4/1024/1024 = 266.725 MB``.
|
||||
|
||||
After running ``pruned_transducer_stateless7_streaming/export-for-ncnn.py``,
|
||||
we will get the following files:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ls -lh icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/*pnnx.pt
|
||||
|
||||
-rw-r--r-- 1 kuangfangjun root 1022K Feb 27 20:23 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/decoder_jit_trace-pnnx.pt
|
||||
-rw-r--r-- 1 kuangfangjun root 266M Feb 27 20:23 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/encoder_jit_trace-pnnx.pt
|
||||
-rw-r--r-- 1 kuangfangjun root 2.8M Feb 27 20:23 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/joiner_jit_trace-pnnx.pt
|
||||
|
||||
.. _zipformer-transducer-step-4-export-torchscript-model-via-pnnx:
|
||||
|
||||
4. Export torchscript model via pnnx
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. hint::
|
||||
|
||||
Make sure you have set up the ``PATH`` environment variable
|
||||
in :ref:`export_for_ncnn_install_ncnn_and_pnnx`. Otherwise,
|
||||
it will throw an error saying that ``pnnx`` could not be found.
|
||||
|
||||
Now, it's time to export our models to `ncnn`_ via ``pnnx``.
|
||||
|
||||
.. code-block::
|
||||
|
||||
cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/
|
||||
|
||||
pnnx ./encoder_jit_trace-pnnx.pt
|
||||
pnnx ./decoder_jit_trace-pnnx.pt
|
||||
pnnx ./joiner_jit_trace-pnnx.pt
|
||||
|
||||
It will generate the following files:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ls -lh icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/*ncnn*{bin,param}
|
||||
|
||||
-rw-r--r-- 1 kuangfangjun root 509K Feb 27 20:31 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/decoder_jit_trace-pnnx.ncnn.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 437 Feb 27 20:31 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/decoder_jit_trace-pnnx.ncnn.param
|
||||
-rw-r--r-- 1 kuangfangjun root 133M Feb 27 20:30 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/encoder_jit_trace-pnnx.ncnn.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 152K Feb 27 20:30 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/encoder_jit_trace-pnnx.ncnn.param
|
||||
-rw-r--r-- 1 kuangfangjun root 1.4M Feb 27 20:31 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/joiner_jit_trace-pnnx.ncnn.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 488 Feb 27 20:31 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/joiner_jit_trace-pnnx.ncnn.param
|
||||
|
||||
There are two types of files:
|
||||
|
||||
- ``param``: It is a text file containing the model architectures. You can
|
||||
use a text editor to view its content.
|
||||
- ``bin``: It is a binary file containing the model parameters.
|
||||
|
||||
We compare the file sizes of the models below before and after converting via ``pnnx``:
|
||||
|
||||
.. see https://tableconvert.com/restructuredtext-generator
|
||||
|
||||
+----------------------------------+------------+
|
||||
| File name | File size |
|
||||
+==================================+============+
|
||||
| encoder_jit_trace-pnnx.pt | 266 MB |
|
||||
+----------------------------------+------------+
|
||||
| decoder_jit_trace-pnnx.pt | 1022 KB |
|
||||
+----------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.pt | 2.8 MB |
|
||||
+----------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.ncnn.bin | 133 MB |
|
||||
+----------------------------------+------------+
|
||||
| decoder_jit_trace-pnnx.ncnn.bin | 509 KB |
|
||||
+----------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.ncnn.bin | 1.4 MB |
|
||||
+----------------------------------+------------+
|
||||
|
||||
You can see that the file sizes of the models after conversion are about one half
|
||||
of the models before conversion:
|
||||
|
||||
- encoder: 266 MB vs 133 MB
|
||||
- decoder: 1022 KB vs 509 KB
|
||||
- joiner: 2.8 MB vs 1.4 MB
|
||||
|
||||
The reason is that by default ``pnnx`` converts ``float32`` parameters
|
||||
to ``float16``. A ``float32`` parameter occupies 4 bytes, while it is 2 bytes
|
||||
for ``float16``. Thus, the file size is roughly halved after conversion.
|
||||
|
||||
.. hint::
|
||||
|
||||
If you use ``pnnx ./encoder_jit_trace-pnnx.pt fp16=0``, then ``pnnx``
|
||||
won't convert ``float32`` to ``float16``.
|
||||
|
||||
5. Test the exported models in icefall
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. note::
|
||||
|
||||
We assume you have set up the environment variable ``PYTHONPATH`` when
|
||||
building `ncnn`_.
|
||||
|
||||
Now we have successfully converted our pre-trained model to `ncnn`_ format.
|
||||
The generated 6 files are what we need. You can use the following code to
|
||||
test the converted models:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python3 ./pruned_transducer_stateless7_streaming/streaming-ncnn-decode.py \
|
||||
--tokens ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/tokens.txt \
|
||||
--encoder-param-filename ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/encoder_jit_trace-pnnx.ncnn.param \
|
||||
--encoder-bin-filename ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/encoder_jit_trace-pnnx.ncnn.bin \
|
||||
--decoder-param-filename ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/decoder_jit_trace-pnnx.ncnn.param \
|
||||
--decoder-bin-filename ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/decoder_jit_trace-pnnx.ncnn.bin \
|
||||
--joiner-param-filename ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/joiner_jit_trace-pnnx.ncnn.param \
|
||||
--joiner-bin-filename ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/joiner_jit_trace-pnnx.ncnn.bin \
|
||||
./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1089-134686-0001.wav
|
||||
|
||||
.. hint::
|
||||
|
||||
`ncnn`_ supports only ``batch size == 1``, so ``streaming-ncnn-decode.py`` accepts
|
||||
only 1 wave file as input.
|
||||
|
||||
The output is given below:
|
||||
|
||||
.. literalinclude:: ./code/test-streaming-ncnn-decode-zipformer-transducer-libri.txt
|
||||
|
||||
Congratulations! You have successfully exported a model from PyTorch to `ncnn`_!
|
||||
|
||||
.. _zipformer-modify-the-exported-encoder-for-sherpa-ncnn:
|
||||
|
||||
6. Modify the exported encoder for sherpa-ncnn
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
In order to use the exported models in `sherpa-ncnn`_, we have to modify
|
||||
``encoder_jit_trace-pnnx.ncnn.param``.
|
||||
|
||||
Let us have a look at the first few lines of ``encoder_jit_trace-pnnx.ncnn.param``:
|
||||
|
||||
.. code-block::
|
||||
|
||||
7767517
|
||||
2028 2547
|
||||
Input in0 0 1 in0
|
||||
|
||||
**Explanation** of the above three lines:
|
||||
|
||||
1. ``7767517``, it is a magic number and should not be changed.
|
||||
2. ``2028 2547``, the first number ``2028`` specifies the number of layers
|
||||
in this file, while ``2547`` specifies the number of intermediate outputs
|
||||
of this file
|
||||
3. ``Input in0 0 1 in0``, ``Input`` is the layer type of this layer; ``in0``
|
||||
is the layer name of this layer; ``0`` means this layer has no input;
|
||||
``1`` means this layer has one output; ``in0`` is the output name of
|
||||
this layer.
|
||||
|
||||
We need to add 1 extra line and also increment the number of layers.
|
||||
The result looks like below:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
7767517
|
||||
2029 2547
|
||||
SherpaMetaData sherpa_meta_data1 0 0 0=2 1=32 2=4 3=7 -23316=5,2,4,3,2,4 -23317=5,384,384,384,384,384 -23318=5,192,192,192,192,192 -23319=5,1,2,4,8,2 -23320=5,31,31,31,31,31
|
||||
Input in0 0 1 in0
|
||||
|
||||
**Explanation**
|
||||
|
||||
1. ``7767517``, it is still the same
|
||||
2. ``2029 2547``, we have added an extra layer, so we need to update ``2028`` to ``2029``.
|
||||
We don't need to change ``2547`` since the newly added layer has no inputs or outputs.
|
||||
3. ``SherpaMetaData sherpa_meta_data1 0 0 0=2 1=32 2=4 3=7 -23316=5,2,4,3,2,4 -23317=5,384,384,384,384,384 -23318=5,192,192,192,192,192 -23319=5,1,2,4,8,2 -23320=5,31,31,31,31,31``
|
||||
This line is newly added. Its explanation is given below:
|
||||
|
||||
- ``SherpaMetaData`` is the type of this layer. Must be ``SherpaMetaData``.
|
||||
- ``sherpa_meta_data1`` is the name of this layer. Must be ``sherpa_meta_data1``.
|
||||
- ``0 0`` means this layer has no inputs or outputs. Must be ``0 0``
|
||||
- ``0=2``, 0 is the key and 2 is the value. MUST be ``0=2``
|
||||
- ``1=32``, 1 is the key and 32 is the value of the
|
||||
parameter ``--decode-chunk-len`` that you provided when running
|
||||
``./pruned_transducer_stateless7_streaming/export-for-ncnn.py``.
|
||||
- ``2=4``, 2 is the key and 4 is the value of the
|
||||
parameter ``--num-left-chunks`` that you provided when running
|
||||
``./pruned_transducer_stateless7_streaming/export-for-ncnn.py``.
|
||||
- ``3=7``, 3 is the key and 7 is the value of for the amount of padding
|
||||
used in the Conv2DSubsampling layer. It should be 7 for zipformer
|
||||
if you don't change zipformer.py.
|
||||
- ``-23316=5,2,4,3,2,4``, attribute 16, this is an array attribute.
|
||||
It is attribute 16 since -23300 - (-23316) = 16.
|
||||
The first element of the array is the length of the array, which is 5 in our case.
|
||||
``2,4,3,2,4`` is the value of ``--num-encoder-layers`` that you provided
|
||||
when running ``./pruned_transducer_stateless7_streaming/export-for-ncnn.py``.
|
||||
- ``-23317=5,384,384,384,384,384``, attribute 17.
|
||||
The first element of the array is the length of the array, which is 5 in our case.
|
||||
``384,384,384,384,384`` is the value of ``--encoder-dims`` that you provided
|
||||
when running ``./pruned_transducer_stateless7_streaming/export-for-ncnn.py``.
|
||||
- ``-23318=5,192,192,192,192,192``, attribute 18.
|
||||
The first element of the array is the length of the array, which is 5 in our case.
|
||||
``192,192,192,192,192`` is the value of ``--attention-dims`` that you provided
|
||||
when running ``./pruned_transducer_stateless7_streaming/export-for-ncnn.py``.
|
||||
- ``-23319=5,1,2,4,8,2``, attribute 19.
|
||||
The first element of the array is the length of the array, which is 5 in our case.
|
||||
``1,2,4,8,2`` is the value of ``--zipformer-downsampling-factors`` that you provided
|
||||
when running ``./pruned_transducer_stateless7_streaming/export-for-ncnn.py``.
|
||||
- ``-23320=5,31,31,31,31,31``, attribute 20.
|
||||
The first element of the array is the length of the array, which is 5 in our case.
|
||||
``31,31,31,31,31`` is the value of ``--cnn-module-kernels`` that you provided
|
||||
when running ``./pruned_transducer_stateless7_streaming/export-for-ncnn.py``.
|
||||
|
||||
For ease of reference, we list the key-value pairs that you need to add
|
||||
in the following table. If your model has a different setting, please
|
||||
change the values for ``SherpaMetaData`` accordingly. Otherwise, you
|
||||
will be ``SAD``.
|
||||
|
||||
+----------+--------------------------------------------+
|
||||
| key | value |
|
||||
+==========+============================================+
|
||||
| 0 | 2 (fixed) |
|
||||
+----------+--------------------------------------------+
|
||||
| 1        | ``--decode-chunk-len``                     |
|
||||
+----------+--------------------------------------------+
|
||||
| 2 | ``--num-left-chunks`` |
|
||||
+----------+--------------------------------------------+
|
||||
| 3 | 7 (if you don't change code) |
|
||||
+----------+--------------------------------------------+
|
||||
|-23316   | ``--num-encoder-layers``                   |
|
||||
+----------+--------------------------------------------+
|
||||
|-23317 | ``--encoder-dims`` |
|
||||
+----------+--------------------------------------------+
|
||||
|-23318 | ``--attention-dims`` |
|
||||
+----------+--------------------------------------------+
|
||||
|-23319 | ``--zipformer-downsampling-factors`` |
|
||||
+----------+--------------------------------------------+
|
||||
|-23320 | ``--cnn-module-kernels`` |
|
||||
+----------+--------------------------------------------+
|
||||
|
||||
4. ``Input in0 0 1 in0``. No need to change it.
|
||||
|
||||
.. caution::
|
||||
|
||||
When you add a new layer ``SherpaMetaData``, please remember to update the
|
||||
number of layers. In our case, update ``2028`` to ``2029``. Otherwise,
|
||||
you will be SAD later.
|
||||
|
||||
.. hint::
|
||||
|
||||
After adding the new layer ``SherpaMetaData``, you cannot use this model
|
||||
with ``streaming-ncnn-decode.py`` anymore since ``SherpaMetaData`` is
|
||||
supported only in `sherpa-ncnn`_.
|
||||
|
||||
.. hint::
|
||||
|
||||
`ncnn`_ is very flexible. You can add new layers to it just by text-editing
|
||||
the ``param`` file! You don't need to change the ``bin`` file.
|
||||
|
||||
Now you can use this model in `sherpa-ncnn`_.
|
||||
Please refer to the following documentation:
|
||||
|
||||
- Linux/macOS/Windows/arm/aarch64: `<https://k2-fsa.github.io/sherpa/ncnn/install/index.html>`_
|
||||
- ``Android``: `<https://k2-fsa.github.io/sherpa/ncnn/android/index.html>`_
|
||||
- ``iOS``: `<https://k2-fsa.github.io/sherpa/ncnn/ios/index.html>`_
|
||||
- Python: `<https://k2-fsa.github.io/sherpa/ncnn/python/index.html>`_
|
||||
|
||||
We have a list of pre-trained models that have been exported for `sherpa-ncnn`_:
|
||||
|
||||
- `<https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html>`_
|
||||
|
||||
You can find more usages there.
|
||||
@ -1,15 +1,27 @@
|
||||
Export to ncnn
|
||||
==============
|
||||
|
||||
We support exporting both
|
||||
`LSTM transducer models <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/lstm_transducer_stateless2>`_
|
||||
and
|
||||
`ConvEmformer transducer models <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/conv_emformer_transducer_stateless2>`_
|
||||
to `ncnn <https://github.com/tencent/ncnn>`_.
|
||||
We support exporting the following models
|
||||
to `ncnn <https://github.com/tencent/ncnn>`_:
|
||||
|
||||
We also provide `<https://github.com/k2-fsa/sherpa-ncnn>`_
|
||||
performing speech recognition using ``ncnn`` with exported models.
|
||||
It has been tested on Linux, macOS, Windows, ``Android``, and ``Raspberry Pi``.
|
||||
- `Zipformer transducer models <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming>`_
|
||||
|
||||
- `LSTM transducer models <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/lstm_transducer_stateless2>`_
|
||||
|
||||
- `ConvEmformer transducer models <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/conv_emformer_transducer_stateless2>`_
|
||||
|
||||
We also provide `sherpa-ncnn`_
|
||||
for performing speech recognition using `ncnn`_ with exported models.
|
||||
It has been tested on the following platforms:
|
||||
|
||||
- Linux
|
||||
- macOS
|
||||
- Windows
|
||||
- ``Android``
|
||||
- ``iOS``
|
||||
- ``Raspberry Pi``
|
||||
- `爱芯派 <https://wiki.sipeed.com/hardware/zh/>`_ (`MAIX-III AXera-Pi <https://wiki.sipeed.com/hardware/en/maixIII/ax-pi/axpi.html>`_).
|
||||
- `RV1126 <https://www.rock-chips.com/a/en/products/RV11_Series/2020/0427/1076.html>`_
|
||||
|
||||
`sherpa-ncnn`_ is self-contained and can be statically linked to produce
|
||||
a binary containing everything needed. Please refer
|
||||
@ -18,754 +30,8 @@ to its documentation for details:
|
||||
- `<https://k2-fsa.github.io/sherpa/ncnn/index.html>`_
|
||||
|
||||
|
||||
Export LSTM transducer models
|
||||
-----------------------------
|
||||
.. toctree::
|
||||
|
||||
Please refer to :ref:`export-lstm-transducer-model-for-ncnn` for details.
|
||||
|
||||
|
||||
|
||||
Export ConvEmformer transducer models
|
||||
-------------------------------------
|
||||
|
||||
We use the pre-trained model from the following repository as an example:
|
||||
|
||||
- `<https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>`_
|
||||
|
||||
We will show you step by step how to export it to `ncnn`_ and run it with `sherpa-ncnn`_.
|
||||
|
||||
.. hint::
|
||||
|
||||
We use ``Ubuntu 18.04``, ``torch 1.10``, and ``Python 3.8`` for testing.
|
||||
|
||||
.. caution::
|
||||
|
||||
Please use a more recent version of PyTorch. For instance, ``torch 1.8``
|
||||
may ``not`` work.
|
||||
|
||||
1. Download the pre-trained model
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. hint::
|
||||
|
||||
You can also refer to `<https://k2-fsa.github.io/sherpa/cpp/pretrained_models/online_transducer.html#icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>`_ to download the pre-trained model.
|
||||
|
||||
You have to install `git-lfs`_ before you continue.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
|
||||
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
|
||||
|
||||
git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt"
|
||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
||||
|
||||
cd ..
|
||||
|
||||
.. note::
|
||||
|
||||
We download ``exp/pretrained-xxx.pt``, not ``exp/cpu-jit_xxx.pt``.
|
||||
|
||||
|
||||
In the above code, we download the pre-trained model into the directory
|
||||
``egs/librispeech/ASR/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05``.
|
||||
|
||||
2. Install ncnn and pnnx
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# We put ncnn into $HOME/open-source/ncnn
|
||||
# You can change it to anywhere you like
|
||||
|
||||
cd $HOME
|
||||
mkdir -p open-source
|
||||
cd open-source
|
||||
|
||||
git clone https://github.com/csukuangfj/ncnn
|
||||
cd ncnn
|
||||
git submodule update --recursive --init
|
||||
|
||||
# Note: We don't use "python setup.py install" or "pip install ." here
|
||||
|
||||
mkdir -p build-wheel
|
||||
cd build-wheel
|
||||
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DNCNN_PYTHON=ON \
|
||||
-DNCNN_BUILD_BENCHMARK=OFF \
|
||||
-DNCNN_BUILD_EXAMPLES=OFF \
|
||||
-DNCNN_BUILD_TOOLS=ON \
|
||||
..
|
||||
|
||||
make -j4
|
||||
|
||||
cd ..
|
||||
|
||||
# Note: $PWD here is $HOME/open-source/ncnn
|
||||
|
||||
export PYTHONPATH=$PWD/python:$PYTHONPATH
|
||||
export PATH=$PWD/tools/pnnx/build/src:$PATH
|
||||
export PATH=$PWD/build-wheel/tools/quantize:$PATH
|
||||
|
||||
# Now build pnnx
|
||||
cd tools/pnnx
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
make -j4
|
||||
|
||||
./src/pnnx
|
||||
|
||||
Congratulations! You have successfully installed the following components:
|
||||
|
||||
- ``pnnx``, which is an executable located in
|
||||
``$HOME/open-source/ncnn/tools/pnnx/build/src``. We will use
|
||||
it to convert models exported by ``torch.jit.trace()``.
|
||||
- ``ncnn2int8``, which is an executable located in
|
||||
``$HOME/open-source/ncnn/build-wheel/tools/quantize``. We will use
|
||||
it to quantize our models to ``int8``.
|
||||
- ``ncnn.cpython-38-x86_64-linux-gnu.so``, which is a Python module located
|
||||
in ``$HOME/open-source/ncnn/python/ncnn``.
|
||||
|
||||
.. note::
|
||||
|
||||
I am using ``Python 3.8``, so it
|
||||
is ``ncnn.cpython-38-x86_64-linux-gnu.so``. If you use a different
|
||||
version, say, ``Python 3.9``, the name would be
|
||||
``ncnn.cpython-39-x86_64-linux-gnu.so``.
|
||||
|
||||
Also, if you are not using Linux, the file name would also be different.
|
||||
But that does not matter. As long as you can compile it, it should work.
|
||||
|
||||
We have set up ``PYTHONPATH`` so that you can use ``import ncnn`` in your
|
||||
Python code. We have also set up ``PATH`` so that you can use
|
||||
``pnnx`` and ``ncnn2int8`` later in your terminal.
|
||||
|
||||
.. caution::
|
||||
|
||||
Please don't use `<https://github.com/tencent/ncnn>`_.
|
||||
We have made some modifications to the official `ncnn`_.
|
||||
|
||||
We will synchronize `<https://github.com/csukuangfj/ncnn>`_ periodically
|
||||
with the official one.
|
||||
|
||||
3. Export the model via torch.jit.trace()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
First, let us rename our pre-trained model:
|
||||
|
||||
.. code-block::
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
|
||||
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp
|
||||
|
||||
ln -s pretrained-epoch-30-avg-10-averaged.pt epoch-30.pt
|
||||
|
||||
cd ../..
|
||||
|
||||
Next, we use the following code to export our model:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
dir=./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/
|
||||
|
||||
./conv_emformer_transducer_stateless2/export-for-ncnn.py \
|
||||
--exp-dir $dir/exp \
|
||||
--bpe-model $dir/data/lang_bpe_500/bpe.model \
|
||||
--epoch 30 \
|
||||
--avg 1 \
|
||||
--use-averaged-model 0 \
|
||||
\
|
||||
--num-encoder-layers 12 \
|
||||
--chunk-length 32 \
|
||||
--cnn-module-kernel 31 \
|
||||
--left-context-length 32 \
|
||||
--right-context-length 8 \
|
||||
--memory-size 32 \
|
||||
--encoder-dim 512
|
||||
|
||||
.. hint::
|
||||
|
||||
We have renamed our model to ``epoch-30.pt`` so that we can use ``--epoch 30``.
|
||||
There is only one pre-trained model, so we use ``--avg 1 --use-averaged-model 0``.
|
||||
|
||||
If you have trained a model by yourself and if you have all checkpoints
|
||||
available, please first use ``decode.py`` to tune ``--epoch --avg``
|
||||
and select the best combination with ``--use-averaged-model 1``.
|
||||
|
||||
.. note::
|
||||
|
||||
You will see the following log output:
|
||||
|
||||
.. literalinclude:: ./code/export-conv-emformer-transducer-for-ncnn-output.txt
|
||||
|
||||
The log shows the model has ``75490012`` parameters, i.e., ``~75 M``.
|
||||
|
||||
.. code-block::
|
||||
|
||||
ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/pretrained-epoch-30-avg-10-averaged.pt
|
||||
|
||||
-rw-r--r-- 1 kuangfangjun root 289M Jan 11 12:05 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/pretrained-epoch-30-avg-10-averaged.pt
|
||||
|
||||
You can see that the file size of the pre-trained model is ``289 MB``, which
|
||||
is roughly ``75490012*4/1024/1024 = 287.97 MB``.
|
||||
|
||||
After running ``conv_emformer_transducer_stateless2/export-for-ncnn.py``,
|
||||
we will get the following files:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*pnnx*
|
||||
|
||||
-rw-r--r-- 1 kuangfangjun root 1010K Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.pt
|
||||
-rw-r--r-- 1 kuangfangjun root 283M Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.pt
|
||||
-rw-r--r-- 1 kuangfangjun root 3.0M Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.pt
|
||||
|
||||
|
||||
.. _conv-emformer-step-3-export-torchscript-model-via-pnnx:
|
||||
|
||||
3. Export torchscript model via pnnx
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. hint::
|
||||
|
||||
Make sure you have set up the ``PATH`` environment variable. Otherwise,
|
||||
it will throw an error saying that ``pnnx`` could not be found.
|
||||
|
||||
Now, it's time to export our models to `ncnn`_ via ``pnnx``.
|
||||
|
||||
.. code-block::
|
||||
|
||||
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
|
||||
|
||||
pnnx ./encoder_jit_trace-pnnx.pt
|
||||
pnnx ./decoder_jit_trace-pnnx.pt
|
||||
pnnx ./joiner_jit_trace-pnnx.pt
|
||||
|
||||
It will generate the following files:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*ncnn*{bin,param}
|
||||
|
||||
-rw-r--r-- 1 kuangfangjun root 503K Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 437 Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param
|
||||
-rw-r--r-- 1 kuangfangjun root 142M Jan 11 12:36 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 79K Jan 11 12:36 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param
|
||||
-rw-r--r-- 1 kuangfangjun root 1.5M Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 488 Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param
|
||||
|
||||
There are two types of files:
|
||||
|
||||
- ``param``: It is a text file containing the model architectures. You can
|
||||
use a text editor to view its content.
|
||||
- ``bin``: It is a binary file containing the model parameters.
|
||||
|
||||
We compare the file sizes of the models below before and after converting via ``pnnx``:
|
||||
|
||||
.. see https://tableconvert.com/restructuredtext-generator
|
||||
|
||||
+----------------------------------+------------+
|
||||
| File name | File size |
|
||||
+==================================+============+
|
||||
| encoder_jit_trace-pnnx.pt | 283 MB |
|
||||
+----------------------------------+------------+
|
||||
| decoder_jit_trace-pnnx.pt | 1010 KB |
|
||||
+----------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.pt | 3.0 MB |
|
||||
+----------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.ncnn.bin | 142 MB |
|
||||
+----------------------------------+------------+
|
||||
| decoder_jit_trace-pnnx.ncnn.bin | 503 KB |
|
||||
+----------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.ncnn.bin | 1.5 MB |
|
||||
+----------------------------------+------------+
|
||||
|
||||
You can see that the file sizes of the models after conversion are about one half
|
||||
of the models before conversion:
|
||||
|
||||
- encoder: 283 MB vs 142 MB
|
||||
- decoder: 1010 KB vs 503 KB
|
||||
- joiner: 3.0 MB vs 1.5 MB
|
||||
|
||||
The reason is that by default ``pnnx`` converts ``float32`` parameters
|
||||
to ``float16``. A ``float32`` parameter occupies 4 bytes, while it is 2 bytes
|
||||
for ``float16``. Thus, the file size is roughly halved after conversion.
|
||||
|
||||
.. hint::
|
||||
|
||||
If you use ``pnnx ./encoder_jit_trace-pnnx.pt fp16=0``, then ``pnnx``
|
||||
won't convert ``float32`` to ``float16``.
|
||||
|
||||
4. Test the exported models in icefall
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. note::
|
||||
|
||||
We assume you have set up the environment variable ``PYTHONPATH`` when
|
||||
building `ncnn`_.
|
||||
|
||||
Now we have successfully converted our pre-trained model to `ncnn`_ format.
|
||||
The generated 6 files are what we need. You can use the following code to
|
||||
test the converted models:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./conv_emformer_transducer_stateless2/streaming-ncnn-decode.py \
|
||||
--tokens ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/data/lang_bpe_500/tokens.txt \
|
||||
--encoder-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param \
|
||||
--encoder-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin \
|
||||
--decoder-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param \
|
||||
--decoder-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin \
|
||||
--joiner-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param \
|
||||
--joiner-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin \
|
||||
./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav
|
||||
|
||||
.. hint::
|
||||
|
||||
`ncnn`_ supports only ``batch size == 1``, so ``streaming-ncnn-decode.py`` accepts
|
||||
only 1 wave file as input.
|
||||
|
||||
The output is given below:
|
||||
|
||||
.. literalinclude:: ./code/test-stremaing-ncnn-decode-conv-emformer-transducer-libri.txt
|
||||
|
||||
Congratulations! You have successfully exported a model from PyTorch to `ncnn`_!
|
||||
|
||||
|
||||
.. _conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn:
|
||||
|
||||
5. Modify the exported encoder for sherpa-ncnn
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
In order to use the exported models in `sherpa-ncnn`_, we have to modify
|
||||
``encoder_jit_trace-pnnx.ncnn.param``.
|
||||
|
||||
Let us have a look at the first few lines of ``encoder_jit_trace-pnnx.ncnn.param``:
|
||||
|
||||
.. code-block::
|
||||
|
||||
7767517
|
||||
1060 1342
|
||||
Input in0 0 1 in0
|
||||
|
||||
**Explanation** of the above three lines:
|
||||
|
||||
1. ``7767517``, it is a magic number and should not be changed.
|
||||
2. ``1060 1342``, the first number ``1060`` specifies the number of layers
|
||||
in this file, while ``1342`` specifies the number of intermediate outputs
|
||||
of this file
|
||||
3. ``Input in0 0 1 in0``, ``Input`` is the layer type of this layer; ``in0``
|
||||
is the layer name of this layer; ``0`` means this layer has no input;
|
||||
``1`` means this layer has one output; ``in0`` is the output name of
|
||||
this layer.
|
||||
|
||||
We need to add 1 extra line and also increment the number of layers.
|
||||
The result looks like below:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
7767517
|
||||
1061 1342
|
||||
SherpaMetaData sherpa_meta_data1 0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512
|
||||
Input in0 0 1 in0
|
||||
|
||||
**Explanation**
|
||||
|
||||
1. ``7767517``, it is still the same
|
||||
2. ``1061 1342``, we have added an extra layer, so we need to update ``1060`` to ``1061``.
|
||||
We don't need to change ``1342`` since the newly added layer has no inputs or outputs.
|
||||
3. ``SherpaMetaData sherpa_meta_data1 0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512``
|
||||
This line is newly added. Its explanation is given below:
|
||||
|
||||
- ``SherpaMetaData`` is the type of this layer. Must be ``SherpaMetaData``.
|
||||
- ``sherpa_meta_data1`` is the name of this layer. Must be ``sherpa_meta_data1``.
|
||||
- ``0 0`` means this layer has no inputs or outputs. Must be ``0 0``
|
||||
- ``0=1``, 0 is the key and 1 is the value. MUST be ``0=1``
|
||||
- ``1=12``, 1 is the key and 12 is the value of the
|
||||
parameter ``--num-encoder-layers`` that you provided when running
|
||||
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
|
||||
- ``2=32``, 2 is the key and 32 is the value of the
|
||||
parameter ``--memory-size`` that you provided when running
|
||||
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
|
||||
- ``3=31``, 3 is the key and 31 is the value of the
|
||||
parameter ``--cnn-module-kernel`` that you provided when running
|
||||
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
|
||||
- ``4=8``, 4 is the key and 8 is the value of the
|
||||
parameter ``--left-context-length`` that you provided when running
|
||||
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
|
||||
- ``5=32``, 5 is the key and 32 is the value of the
|
||||
parameter ``--chunk-length`` that you provided when running
|
||||
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
|
||||
- ``6=8``, 6 is the key and 8 is the value of the
|
||||
parameter ``--right-context-length`` that you provided when running
|
||||
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
|
||||
- ``7=512``, 7 is the key and 512 is the value of the
|
||||
parameter ``--encoder-dim`` that you provided when running
|
||||
``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
|
||||
|
||||
For ease of reference, we list the key-value pairs that you need to add
|
||||
in the following table. If your model has a different setting, please
|
||||
change the values for ``SherpaMetaData`` accordingly. Otherwise, you
|
||||
will be ``SAD``.
|
||||
|
||||
+------+-----------------------------+
|
||||
| key | value |
|
||||
+======+=============================+
|
||||
| 0 | 1 (fixed) |
|
||||
+------+-----------------------------+
|
||||
| 1 | ``--num-encoder-layers`` |
|
||||
+------+-----------------------------+
|
||||
| 2 | ``--memory-size`` |
|
||||
+------+-----------------------------+
|
||||
| 3 | ``--cnn-module-kernel`` |
|
||||
+------+-----------------------------+
|
||||
| 4 | ``--left-context-length`` |
|
||||
+------+-----------------------------+
|
||||
| 5 | ``--chunk-length`` |
|
||||
+------+-----------------------------+
|
||||
| 6 | ``--right-context-length`` |
|
||||
+------+-----------------------------+
|
||||
| 7 | ``--encoder-dim`` |
|
||||
+------+-----------------------------+
|
||||
|
||||
4. ``Input in0 0 1 in0``. No need to change it.
|
||||
|
||||
.. caution::
|
||||
|
||||
When you add a new layer ``SherpaMetaData``, please remember to update the
|
||||
number of layers. In our case, update ``1060`` to ``1061``. Otherwise,
|
||||
you will be SAD later.
|
||||
|
||||
.. hint::
|
||||
|
||||
After adding the new layer ``SherpaMetaData``, you cannot use this model
|
||||
with ``streaming-ncnn-decode.py`` anymore since ``SherpaMetaData`` is
|
||||
supported only in `sherpa-ncnn`_.
|
||||
|
||||
.. hint::
|
||||
|
||||
`ncnn`_ is very flexible. You can add new layers to it just by text-editing
|
||||
the ``param`` file! You don't need to change the ``bin`` file.
|
||||
|
||||
Now you can use this model in `sherpa-ncnn`_.
|
||||
Please refer to the following documentation:
|
||||
|
||||
- Linux/macOS/Windows/arm/aarch64: `<https://k2-fsa.github.io/sherpa/ncnn/install/index.html>`_
|
||||
- Android: `<https://k2-fsa.github.io/sherpa/ncnn/android/index.html>`_
|
||||
- Python: `<https://k2-fsa.github.io/sherpa/ncnn/python/index.html>`_
|
||||
|
||||
We have a list of pre-trained models that have been exported for `sherpa-ncnn`_:
|
||||
|
||||
- `<https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html>`_
|
||||
|
||||
You can find more usages there.
|
||||
|
||||
6. (Optional) int8 quantization with sherpa-ncnn
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
This step is optional.
|
||||
|
||||
In this step, we describe how to quantize our model with ``int8``.
|
||||
|
||||
Change :ref:`conv-emformer-step-3-export-torchscript-model-via-pnnx` to
|
||||
disable ``fp16`` when using ``pnnx``:
|
||||
|
||||
.. code-block::
|
||||
|
||||
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
|
||||
|
||||
pnnx ./encoder_jit_trace-pnnx.pt fp16=0
|
||||
pnnx ./decoder_jit_trace-pnnx.pt
|
||||
pnnx ./joiner_jit_trace-pnnx.pt fp16=0
|
||||
|
||||
.. note::
|
||||
|
||||
We add ``fp16=0`` when exporting the encoder and joiner. `ncnn`_ does not
|
||||
support quantizing the decoder model yet. We will update this documentation
|
||||
once `ncnn`_ supports it. (Maybe in this year, 2023).
|
||||
|
||||
It will generate the following files
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*_jit_trace-pnnx.ncnn.{param,bin}
|
||||
|
||||
-rw-r--r-- 1 kuangfangjun root 503K Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 437 Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param
|
||||
-rw-r--r-- 1 kuangfangjun root 283M Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 79K Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param
|
||||
-rw-r--r-- 1 kuangfangjun root 3.0M Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 488 Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param
|
||||
|
||||
Let us compare again the file sizes:
|
||||
|
||||
+----------------------------------------+------------+
|
||||
| File name | File size |
|
||||
+----------------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.pt | 283 MB |
|
||||
+----------------------------------------+------------+
|
||||
| decoder_jit_trace-pnnx.pt | 1010 KB |
|
||||
+----------------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.pt | 3.0 MB |
|
||||
+----------------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.ncnn.bin (fp16) | 142 MB |
|
||||
+----------------------------------------+------------+
|
||||
| decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB |
|
||||
+----------------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.ncnn.bin (fp16) | 1.5 MB |
|
||||
+----------------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.ncnn.bin (fp32) | 283 MB |
|
||||
+----------------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.ncnn.bin (fp32) | 3.0 MB |
|
||||
+----------------------------------------+------------+
|
||||
|
||||
You can see that the file sizes are doubled when we disable ``fp16``.
|
||||
|
||||
.. note::
|
||||
|
||||
You can again use ``streaming-ncnn-decode.py`` to test the exported models.
|
||||
|
||||
Next, follow :ref:`conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn`
|
||||
to modify ``encoder_jit_trace-pnnx.ncnn.param``.
|
||||
|
||||
Change
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
7767517
|
||||
1060 1342
|
||||
Input in0 0 1 in0
|
||||
|
||||
to
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
7767517
|
||||
1061 1342
|
||||
SherpaMetaData sherpa_meta_data1 0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512
|
||||
Input in0 0 1 in0
|
||||
|
||||
.. caution::
|
||||
|
||||
Please follow :ref:`conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn`
|
||||
to change the values for ``SherpaMetaData`` if your model uses a different setting.
|
||||
|
||||
|
||||
Next, let us compile `sherpa-ncnn`_ since we will quantize our models within
|
||||
`sherpa-ncnn`_.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# We will download sherpa-ncnn to $HOME/open-source/
|
||||
# You can change it to anywhere you like.
|
||||
cd $HOME
|
||||
mkdir -p open-source
|
||||
|
||||
cd open-source
|
||||
git clone https://github.com/k2-fsa/sherpa-ncnn
|
||||
cd sherpa-ncnn
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
make -j 4
|
||||
|
||||
./bin/generate-int8-scale-table
|
||||
|
||||
export PATH=$HOME/open-source/sherpa-ncnn/build/bin:$PATH
|
||||
|
||||
The output of the above commands is:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
(py38) kuangfangjun:build$ generate-int8-scale-table
|
||||
Please provide 10 arg. Currently given: 1
|
||||
Usage:
|
||||
generate-int8-scale-table encoder.param encoder.bin decoder.param decoder.bin joiner.param joiner.bin encoder-scale-table.txt joiner-scale-table.txt wave_filenames.txt
|
||||
|
||||
Each line in wave_filenames.txt is a path to some 16k Hz mono wave file.
|
||||
|
||||
We need to create a file ``wave_filenames.txt``, in which we need to put
|
||||
some calibration wave files. For testing purposes, we put the ``test_wavs``
|
||||
from the pre-trained model repository `<https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>`_
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
|
||||
|
||||
cat <<EOF > wave_filenames.txt
|
||||
../test_wavs/1089-134686-0001.wav
|
||||
../test_wavs/1221-135766-0001.wav
|
||||
../test_wavs/1221-135766-0002.wav
|
||||
EOF
|
||||
|
||||
Now we can calculate the scales needed for quantization with the calibration data:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
|
||||
|
||||
generate-int8-scale-table \
|
||||
./encoder_jit_trace-pnnx.ncnn.param \
|
||||
./encoder_jit_trace-pnnx.ncnn.bin \
|
||||
./decoder_jit_trace-pnnx.ncnn.param \
|
||||
./decoder_jit_trace-pnnx.ncnn.bin \
|
||||
./joiner_jit_trace-pnnx.ncnn.param \
|
||||
./joiner_jit_trace-pnnx.ncnn.bin \
|
||||
./encoder-scale-table.txt \
|
||||
./joiner-scale-table.txt \
|
||||
./wave_filenames.txt
|
||||
|
||||
The output logs are in the following:
|
||||
|
||||
.. literalinclude:: ./code/generate-int-8-scale-table-for-conv-emformer.txt
|
||||
|
||||
It generates the following two files:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ ls -lh encoder-scale-table.txt joiner-scale-table.txt
|
||||
-rw-r--r-- 1 kuangfangjun root 955K Jan 11 17:28 encoder-scale-table.txt
|
||||
-rw-r--r-- 1 kuangfangjun root 18K Jan 11 17:28 joiner-scale-table.txt
|
||||
|
||||
.. caution::
|
||||
|
||||
In a real deployment, you definitely need more calibration data to compute an accurate scale table.
|
||||
|
||||
Finally, let us use the scale table to quantize our models into ``int8``.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ncnn2int8
|
||||
|
||||
usage: ncnn2int8 [inparam] [inbin] [outparam] [outbin] [calibration table]
|
||||
|
||||
First, we quantize the encoder model:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
|
||||
|
||||
ncnn2int8 \
|
||||
./encoder_jit_trace-pnnx.ncnn.param \
|
||||
./encoder_jit_trace-pnnx.ncnn.bin \
|
||||
./encoder_jit_trace-pnnx.ncnn.int8.param \
|
||||
./encoder_jit_trace-pnnx.ncnn.int8.bin \
|
||||
./encoder-scale-table.txt
|
||||
|
||||
Next, we quantize the joiner model:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ncnn2int8 \
|
||||
./joiner_jit_trace-pnnx.ncnn.param \
|
||||
./joiner_jit_trace-pnnx.ncnn.bin \
|
||||
./joiner_jit_trace-pnnx.ncnn.int8.param \
|
||||
./joiner_jit_trace-pnnx.ncnn.int8.bin \
|
||||
./joiner-scale-table.txt
|
||||
|
||||
The above two commands generate the following 4 files:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
-rw-r--r-- 1 kuangfangjun root 99M Jan 11 17:34 encoder_jit_trace-pnnx.ncnn.int8.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 78K Jan 11 17:34 encoder_jit_trace-pnnx.ncnn.int8.param
|
||||
-rw-r--r-- 1 kuangfangjun root 774K Jan 11 17:35 joiner_jit_trace-pnnx.ncnn.int8.bin
|
||||
-rw-r--r-- 1 kuangfangjun root 496 Jan 11 17:35 joiner_jit_trace-pnnx.ncnn.int8.param
|
||||
|
||||
Congratulations! You have successfully quantized your model from ``float32`` to ``int8``.
|
||||
|
||||
.. caution::
|
||||
|
||||
``ncnn.int8.param`` and ``ncnn.int8.bin`` must be used in pairs.
|
||||
|
||||
You can replace ``ncnn.param`` and ``ncnn.bin`` with ``ncnn.int8.param``
|
||||
and ``ncnn.int8.bin`` in `sherpa-ncnn`_ if you like.
|
||||
|
||||
For instance, to use only the ``int8`` encoder in ``sherpa-ncnn``, you can
|
||||
replace the following invocation:
|
||||
|
||||
.. code-block::
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
|
||||
|
||||
sherpa-ncnn \
|
||||
../data/lang_bpe_500/tokens.txt \
|
||||
./encoder_jit_trace-pnnx.ncnn.param \
|
||||
./encoder_jit_trace-pnnx.ncnn.bin \
|
||||
./decoder_jit_trace-pnnx.ncnn.param \
|
||||
./decoder_jit_trace-pnnx.ncnn.bin \
|
||||
./joiner_jit_trace-pnnx.ncnn.param \
|
||||
./joiner_jit_trace-pnnx.ncnn.bin \
|
||||
../test_wavs/1089-134686-0001.wav
|
||||
|
||||
with
|
||||
|
||||
.. code-block::
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
|
||||
|
||||
sherpa-ncnn \
|
||||
../data/lang_bpe_500/tokens.txt \
|
||||
./encoder_jit_trace-pnnx.ncnn.int8.param \
|
||||
./encoder_jit_trace-pnnx.ncnn.int8.bin \
|
||||
./decoder_jit_trace-pnnx.ncnn.param \
|
||||
./decoder_jit_trace-pnnx.ncnn.bin \
|
||||
./joiner_jit_trace-pnnx.ncnn.param \
|
||||
./joiner_jit_trace-pnnx.ncnn.bin \
|
||||
../test_wavs/1089-134686-0001.wav
|
||||
|
||||
|
||||
The following table compares again the file sizes:
|
||||
|
||||
|
||||
+----------------------------------------+------------+
|
||||
| File name | File size |
|
||||
+----------------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.pt | 283 MB |
|
||||
+----------------------------------------+------------+
|
||||
| decoder_jit_trace-pnnx.pt | 1010 KB |
|
||||
+----------------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.pt | 3.0 MB |
|
||||
+----------------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.ncnn.bin (fp16) | 142 MB |
|
||||
+----------------------------------------+------------+
|
||||
| decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB |
|
||||
+----------------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.ncnn.bin (fp16) | 1.5 MB |
|
||||
+----------------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.ncnn.bin (fp32) | 283 MB |
|
||||
+----------------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.ncnn.bin (fp32) | 3.0 MB |
|
||||
+----------------------------------------+------------+
|
||||
| encoder_jit_trace-pnnx.ncnn.int8.bin | 99 MB |
|
||||
+----------------------------------------+------------+
|
||||
| joiner_jit_trace-pnnx.ncnn.int8.bin | 774 KB |
|
||||
+----------------------------------------+------------+
|
||||
|
||||
You can see that the file sizes of the model after ``int8`` quantization
|
||||
are much smaller.
|
||||
|
||||
.. hint::
|
||||
|
||||
Currently, only linear layers and convolutional layers are quantized
|
||||
with ``int8``, so you don't see an exact ``4x`` reduction in file sizes.
|
||||
|
||||
.. note::
|
||||
|
||||
You need to test the recognition accuracy after ``int8`` quantization.
|
||||
|
||||
You can find the speed comparison at `<https://github.com/k2-fsa/sherpa-ncnn/issues/44>`_.
|
||||
|
||||
|
||||
That's it! Have fun with `sherpa-ncnn`_!
|
||||
export-ncnn-zipformer
|
||||
export-ncnn-conv-emformer
|
||||
export-ncnn-lstm
|
||||
|
||||
@ -1,69 +1,95 @@
|
||||
Export to ONNX
|
||||
==============
|
||||
|
||||
In this section, we describe how to export models to ONNX.
|
||||
In this section, we describe how to export models to `ONNX`_.
|
||||
|
||||
In each recipe, there is a file called ``export-onnx.py``, which is used
|
||||
to export trained models to `ONNX`_.
|
||||
|
||||
There is also a file named ``onnx_pretrained.py``, which shows how to use
|
||||
the exported `ONNX`_ model in Python with `onnxruntime`_ to decode sound files.
|
||||
|
||||
sherpa-onnx
|
||||
-----------
|
||||
|
||||
We have a separate repository `sherpa-onnx`_ for deploying your exported models
|
||||
on various platforms such as:
|
||||
|
||||
- iOS
|
||||
- Android
|
||||
- Raspberry Pi
|
||||
- Linux/macOS/Windows
|
||||
|
||||
|
||||
Please see the documentation of `sherpa-onnx`_ for details:
|
||||
|
||||
`<https://k2-fsa.github.io/sherpa/onnx/index.html>`_
|
||||
|
||||
Example
|
||||
-------
|
||||
|
||||
In the following, we demonstrate how to export a streaming Zipformer pre-trained
|
||||
model from
|
||||
`<https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11>`_
|
||||
to `ONNX`_.
|
||||
|
||||
Download the pre-trained model
|
||||
------------------------------
|
||||
|
||||
.. hint::
|
||||
|
||||
Only non-streaming conformer transducer models are tested.
|
||||
|
||||
|
||||
When to use it
|
||||
--------------
|
||||
|
||||
It you want to use an inference framework that supports ONNX
|
||||
to run the pretrained model.
|
||||
|
||||
|
||||
How to export
|
||||
-------------
|
||||
|
||||
We use
|
||||
`<https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless3>`_
|
||||
as an example in the following.
|
||||
We assume you have installed `git-lfs`_.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
epoch=14
|
||||
avg=2
|
||||
|
||||
./pruned_transducer_stateless3/export.py \
|
||||
--exp-dir ./pruned_transducer_stateless3/exp \
|
||||
--bpe-model data/lang_bpe_500/bpe.model \
|
||||
--epoch $epoch \
|
||||
--avg $avg \
|
||||
--onnx 1
|
||||
cd egs/librispeech/ASR
|
||||
|
||||
It will generate the following files inside ``pruned_transducer_stateless3/exp``:
|
||||
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
|
||||
- ``encoder.onnx``
|
||||
- ``decoder.onnx``
|
||||
- ``joiner.onnx``
|
||||
- ``joiner_encoder_proj.onnx``
|
||||
- ``joiner_decoder_proj.onnx``
|
||||
pushd $repo
|
||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
||||
git lfs pull --include "exp/pretrained.pt"
|
||||
cd exp
|
||||
ln -s pretrained.pt epoch-99.pt
|
||||
popd
|
||||
|
||||
You can use ``./pruned_transducer_stateless3/exp/onnx_pretrained.py`` to decode
|
||||
waves with the generated files:
|
||||
Export the model to ONNX
|
||||
------------------------
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./pruned_transducer_stateless3/onnx_pretrained.py \
|
||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
||||
--encoder-model-filename ./pruned_transducer_stateless3/exp/encoder.onnx \
|
||||
--decoder-model-filename ./pruned_transducer_stateless3/exp/decoder.onnx \
|
||||
--joiner-model-filename ./pruned_transducer_stateless3/exp/joiner.onnx \
|
||||
--joiner-encoder-proj-model-filename ./pruned_transducer_stateless3/exp/joiner_encoder_proj.onnx \
|
||||
--joiner-decoder-proj-model-filename ./pruned_transducer_stateless3/exp/joiner_decoder_proj.onnx \
|
||||
/path/to/foo.wav \
|
||||
/path/to/bar.wav \
|
||||
/path/to/baz.wav
|
||||
./pruned_transducer_stateless7_streaming/export-onnx.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--decode-chunk-len 32 \
|
||||
--exp-dir $repo/exp/
|
||||
|
||||
.. warning::
|
||||
|
||||
How to use the exported model
|
||||
-----------------------------
|
||||
``export-onnx.py`` from different recipes has different options.
|
||||
|
||||
We also provide `<https://github.com/k2-fsa/sherpa-onnx>`_
|
||||
performing speech recognition using `onnxruntime <https://github.com/microsoft/onnxruntime>`_
|
||||
with exported models.
|
||||
It has been tested on Linux, macOS, and Windows.
|
||||
In the above example, ``--decode-chunk-len`` is specific for the
|
||||
streaming Zipformer. Other models won't have such an option.
|
||||
|
||||
It will generate the following 3 files in ``$repo/exp``
|
||||
|
||||
- ``encoder-epoch-99-avg-1.onnx``
|
||||
- ``decoder-epoch-99-avg-1.onnx``
|
||||
- ``joiner-epoch-99-avg-1.onnx``
|
||||
|
||||
Decode sound files with exported ONNX models
|
||||
--------------------------------------------
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./pruned_transducer_stateless7_streaming/onnx_pretrained.py \
|
||||
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
|
||||
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
|
||||
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav
|
||||
|
||||
@ -299,11 +299,11 @@ to run the training part first.
|
||||
|
||||
- (1) ``epoch-1.pt``, ``epoch-2.pt``, ..., which are saved at the end
|
||||
of each epoch. You can pass ``--epoch`` to
|
||||
``pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py`` to use them.
|
||||
``pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py`` to use them.
|
||||
|
||||
- (2) ``checkpoints-436000.pt``, ``epoch-438000.pt``, ..., which are saved
|
||||
every ``--save-every-n`` batches. You can pass ``--iter`` to
|
||||
``pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py`` to use them.
|
||||
``pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py`` to use them.
|
||||
|
||||
We suggest that you try both types of checkpoints and choose the one
|
||||
that produces the lowest WERs.
|
||||
@ -311,7 +311,7 @@ to run the training part first.
|
||||
.. code-block:: bash
|
||||
|
||||
$ cd egs/librispeech/ASR
|
||||
$ ./pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py --help
|
||||
$ ./pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py --help
|
||||
|
||||
shows the options for decoding.
|
||||
|
||||
@ -320,7 +320,7 @@ The following shows the example using ``epoch-*.pt``:
|
||||
.. code-block:: bash
|
||||
|
||||
for m in greedy_search fast_beam_search modified_beam_search; do
|
||||
./pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py \
|
||||
./pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py \
|
||||
--epoch 30 \
|
||||
--avg 13 \
|
||||
--exp-dir pruned_transducer_stateless7_ctc_bs/exp \
|
||||
@ -333,7 +333,7 @@ To test CTC branch, you can use the following command:
|
||||
.. code-block:: bash
|
||||
|
||||
for m in ctc-decoding 1best; do
|
||||
./pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py \
|
||||
./pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py \
|
||||
--epoch 30 \
|
||||
--avg 13 \
|
||||
--exp-dir pruned_transducer_stateless7_ctc_bs/exp \
|
||||
@ -367,7 +367,7 @@ It will generate a file ``./pruned_transducer_stateless7_ctc_bs/exp/pretrained.p
|
||||
|
||||
.. hint::
|
||||
|
||||
To use the generated ``pretrained.pt`` for ``pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py``,
|
||||
To use the generated ``pretrained.pt`` for ``pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py``,
|
||||
you can run:
|
||||
|
||||
.. code-block:: bash
|
||||
@ -376,7 +376,7 @@ It will generate a file ``./pruned_transducer_stateless7_ctc_bs/exp/pretrained.p
|
||||
ln -s pretrained epoch-9999.pt
|
||||
|
||||
And then pass ``--epoch 9999 --avg 1 --use-averaged-model 0`` to
|
||||
``./pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py``.
|
||||
``./pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py``.
|
||||
|
||||
To use the exported model with ``./pruned_transducer_stateless7_ctc_bs/pretrained.py``, you
|
||||
can run:
|
||||
@ -447,7 +447,8 @@ Download pretrained models
|
||||
If you don't want to train from scratch, you can download the pretrained models
|
||||
by visiting the following links:
|
||||
|
||||
- `<https://huggingface.co/yfyeung/icefall-asr-librispeech-pruned_transducer_stateless7_ctc_bs-2022-12-14>`_
|
||||
- trained on LibriSpeech 100h: `<https://huggingface.co/yfyeung/icefall-asr-librispeech-pruned_transducer_stateless7_ctc_bs-2022-12-14>`_
|
||||
- trained on LibriSpeech 960h: `<https://huggingface.co/yfyeung/icefall-asr-librispeech-pruned_transducer_stateless7_ctc_bs-2023-01-29>`_
|
||||
|
||||
See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
|
||||
for the details of the above pretrained models.
|
||||
|
||||
@ -30,8 +30,9 @@ In icefall, we implement the streaming conformer the way just like what `WeNet <
|
||||
See :doc:`Pruned transducer statelessX <librispeech/pruned_transducer_stateless>` for more details.
|
||||
|
||||
.. HINT::
|
||||
If you want to adapt a non-streaming conformer model to be streaming, please refer
|
||||
to `this pull request <https://github.com/k2-fsa/icefall/pull/454>`_.
|
||||
If you want to modify a non-streaming conformer recipe to support both streaming and non-streaming, please refer
|
||||
to `this pull request <https://github.com/k2-fsa/icefall/pull/454>`_. After adding the code needed by streaming training,
|
||||
you have to re-train it with the extra arguments metioned in the docs above to get a streaming model.
|
||||
|
||||
|
||||
Streaming Emformer
|
||||
|
||||
@ -515,133 +515,6 @@ To use the generated files with ``./lstm_transducer_stateless2/jit_pretrained``:
|
||||
Please see `<https://k2-fsa.github.io/sherpa/python/streaming_asr/lstm/english/server.html>`_
|
||||
for how to use the exported models in ``sherpa``.
|
||||
|
||||
.. _export-lstm-transducer-model-for-ncnn:
|
||||
|
||||
Export LSTM transducer models for ncnn
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
We support exporting pretrained LSTM transducer models to
|
||||
`ncnn <https://github.com/tencent/ncnn>`_ using
|
||||
`pnnx <https://github.com/Tencent/ncnn/tree/master/tools/pnnx>`_.
|
||||
|
||||
First, let us install a modified version of ``ncnn``:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
git clone https://github.com/csukuangfj/ncnn
|
||||
cd ncnn
|
||||
git submodule update --recursive --init
|
||||
|
||||
# Note: We don't use "python setup.py install" or "pip install ." here
|
||||
|
||||
mkdir -p build-wheel
|
||||
cd build-wheel
|
||||
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DNCNN_PYTHON=ON \
|
||||
-DNCNN_BUILD_BENCHMARK=OFF \
|
||||
-DNCNN_BUILD_EXAMPLES=OFF \
|
||||
-DNCNN_BUILD_TOOLS=ON \
|
||||
..
|
||||
|
||||
make -j4
|
||||
|
||||
cd ..
|
||||
|
||||
# Note: $PWD here is /path/to/ncnn
|
||||
|
||||
export PYTHONPATH=$PWD/python:$PYTHONPATH
|
||||
export PATH=$PWD/tools/pnnx/build/src:$PATH
|
||||
export PATH=$PWD/build-wheel/tools/quantize:$PATH
|
||||
|
||||
# now build pnnx
|
||||
cd tools/pnnx
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
make -j4
|
||||
|
||||
./src/pnnx
|
||||
|
||||
.. note::
|
||||
|
||||
We assume that you have added the path to the binary ``pnnx`` to the
|
||||
environment variable ``PATH``.
|
||||
|
||||
We also assume that you have added ``build/tools/quantize`` to the environment
|
||||
variable ``PATH`` so that you are able to use ``ncnn2int8`` later.
|
||||
|
||||
Second, let us export the model using ``torch.jit.trace()`` that is suitable
|
||||
for ``pnnx``:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
iter=468000
|
||||
avg=16
|
||||
|
||||
./lstm_transducer_stateless2/export.py \
|
||||
--exp-dir ./lstm_transducer_stateless2/exp \
|
||||
--bpe-model data/lang_bpe_500/bpe.model \
|
||||
--iter $iter \
|
||||
--avg $avg \
|
||||
--pnnx 1
|
||||
|
||||
It will generate 3 files:
|
||||
|
||||
- ``./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.pt``
|
||||
- ``./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.pt``
|
||||
- ``./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.pt``
|
||||
|
||||
Third, convert torchscript model to ``ncnn`` format:
|
||||
|
||||
.. code-block::
|
||||
|
||||
pnnx ./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.pt
|
||||
pnnx ./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.pt
|
||||
pnnx ./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.pt
|
||||
|
||||
It will generate the following files:
|
||||
|
||||
- ``./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.param``
|
||||
- ``./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.bin``
|
||||
- ``./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.param``
|
||||
- ``./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.bin``
|
||||
- ``./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.param``
|
||||
- ``./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.bin``
|
||||
|
||||
To use the above generated files, run:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./lstm_transducer_stateless2/ncnn-decode.py \
|
||||
--bpe-model-filename ./data/lang_bpe_500/bpe.model \
|
||||
--encoder-param-filename ./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.param \
|
||||
--encoder-bin-filename ./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.bin \
|
||||
--decoder-param-filename ./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.param \
|
||||
--decoder-bin-filename ./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.bin \
|
||||
--joiner-param-filename ./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.param \
|
||||
--joiner-bin-filename ./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.bin \
|
||||
/path/to/foo.wav
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./lstm_transducer_stateless2/streaming-ncnn-decode.py \
|
||||
--bpe-model-filename ./data/lang_bpe_500/bpe.model \
|
||||
--encoder-param-filename ./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.param \
|
||||
--encoder-bin-filename ./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.bin \
|
||||
--decoder-param-filename ./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.param \
|
||||
--decoder-bin-filename ./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.bin \
|
||||
--joiner-param-filename ./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.param \
|
||||
--joiner-bin-filename ./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.bin \
|
||||
/path/to/foo.wav
|
||||
|
||||
To use the above generated files in C++, please see
|
||||
`<https://github.com/k2-fsa/sherpa-ncnn>`_
|
||||
|
||||
It is able to generate a static linked executable that can be run on Linux, Windows,
|
||||
macOS, Raspberry Pi, etc, without external dependencies.
|
||||
|
||||
Download pretrained models
|
||||
--------------------------
|
||||
|
||||
@ -657,6 +530,3 @@ by visiting the following links:
|
||||
|
||||
You can find more usages of the pretrained models in
|
||||
`<https://k2-fsa.github.io/sherpa/python/streaming_asr/lstm/index.html>`_
|
||||
|
||||
Export ConvEmformer transducer models for ncnn
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
@ -391,18 +391,14 @@ def save_results(
|
||||
):
|
||||
test_set_wers = dict()
|
||||
for key, results in results_dict.items():
|
||||
recog_path = (
|
||||
params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
|
||||
results = sorted(results)
|
||||
store_transcripts(filename=recog_path, texts=results)
|
||||
logging.info(f"The transcripts are stored in {recog_path}")
|
||||
|
||||
# The following prints out WERs, per-word error statistics and aligned
|
||||
# ref/hyp pairs.
|
||||
errs_filename = (
|
||||
params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
errs_filename = params.res_dir / f"errs-{test_set_name}-{params.suffix}.txt"
|
||||
with open(errs_filename, "w") as f:
|
||||
wer = write_error_stats(
|
||||
f, f"{test_set_name}-{key}", results, enable_log=True
|
||||
@ -412,9 +408,7 @@ def save_results(
|
||||
logging.info("Wrote detailed error stats to {}".format(errs_filename))
|
||||
|
||||
test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
|
||||
errs_info = (
|
||||
params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
errs_info = params.res_dir / f"wer-summary-{test_set_name}-{params.suffix}.txt"
|
||||
with open(errs_info, "w") as f:
|
||||
print("settings\tWER", file=f)
|
||||
for key, val in test_set_wers:
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
|
||||
# Introduction
|
||||
|
||||
Please refer to <https://icefall.readthedocs.io/en/latest/recipes/aishell/index.html>
|
||||
Please refer to <https://icefall.readthedocs.io/en/latest/recipes/Non-streaming-ASR/aishell/index.html>
|
||||
for how to run models in this recipe.
|
||||
|
||||
|
||||
|
||||
@ -388,18 +388,14 @@ def save_results(
|
||||
):
|
||||
test_set_wers = dict()
|
||||
for key, results in results_dict.items():
|
||||
recog_path = (
|
||||
params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
|
||||
results = sorted(results)
|
||||
store_transcripts(filename=recog_path, texts=results)
|
||||
logging.info(f"The transcripts are stored in {recog_path}")
|
||||
|
||||
# The following prints out WERs, per-word error statistics and aligned
|
||||
# ref/hyp pairs.
|
||||
errs_filename = (
|
||||
params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
errs_filename = params.res_dir / f"errs-{test_set_name}-{params.suffix}.txt"
|
||||
# we compute CER for aishell dataset.
|
||||
results_char = []
|
||||
for res in results:
|
||||
@ -413,9 +409,7 @@ def save_results(
|
||||
logging.info("Wrote detailed error stats to {}".format(errs_filename))
|
||||
|
||||
test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
|
||||
errs_info = (
|
||||
params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
errs_info = params.res_dir / f"wer-summary-{test_set_name}-{params.suffix}.txt"
|
||||
with open(errs_info, "w") as f:
|
||||
print("settings\tWER", file=f)
|
||||
for key, val in test_set_wers:
|
||||
|
||||
@ -406,18 +406,14 @@ def save_results(
|
||||
):
|
||||
test_set_wers = dict()
|
||||
for key, results in results_dict.items():
|
||||
recog_path = (
|
||||
params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
|
||||
results = sorted(results)
|
||||
store_transcripts(filename=recog_path, texts=results)
|
||||
logging.info(f"The transcripts are stored in {recog_path}")
|
||||
|
||||
# The following prints out WERs, per-word error statistics and aligned
|
||||
# ref/hyp pairs.
|
||||
errs_filename = (
|
||||
params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
errs_filename = params.res_dir / f"errs-{test_set_name}-{params.suffix}.txt"
|
||||
# we compute CER for aishell dataset.
|
||||
results_char = []
|
||||
for res in results:
|
||||
@ -431,9 +427,7 @@ def save_results(
|
||||
logging.info("Wrote detailed error stats to {}".format(errs_filename))
|
||||
|
||||
test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
|
||||
errs_info = (
|
||||
params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
errs_info = params.res_dir / f"wer-summary-{test_set_name}-{params.suffix}.txt"
|
||||
with open(errs_info, "w") as f:
|
||||
print("settings\tCER", file=f)
|
||||
for key, val in test_set_wers:
|
||||
|
||||
@ -325,17 +325,13 @@ def save_results(
|
||||
):
|
||||
test_set_wers = dict()
|
||||
for key, results in results_dict.items():
|
||||
recog_path = (
|
||||
params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
|
||||
results = sorted(results)
|
||||
store_transcripts(filename=recog_path, texts=results)
|
||||
|
||||
# The following prints out WERs, per-word error statistics and aligned
|
||||
# ref/hyp pairs.
|
||||
errs_filename = (
|
||||
params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
errs_filename = params.res_dir / f"errs-{test_set_name}-{params.suffix}.txt"
|
||||
# we compute CER for aishell dataset.
|
||||
results_char = []
|
||||
for res in results:
|
||||
@ -349,9 +345,7 @@ def save_results(
|
||||
logging.info("Wrote detailed error stats to {}".format(errs_filename))
|
||||
|
||||
test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
|
||||
errs_info = (
|
||||
params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
errs_info = params.res_dir / f"wer-summary-{test_set_name}-{params.suffix}.txt"
|
||||
with open(errs_info, "w") as f:
|
||||
print("settings\tCER", file=f)
|
||||
for key, val in test_set_wers:
|
||||
|
||||
@ -370,18 +370,14 @@ def save_results(
|
||||
):
|
||||
test_set_wers = dict()
|
||||
for key, results in results_dict.items():
|
||||
recog_path = (
|
||||
params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
|
||||
results = sorted(results)
|
||||
store_transcripts(filename=recog_path, texts=results)
|
||||
logging.info(f"The transcripts are stored in {recog_path}")
|
||||
|
||||
# The following prints out WERs, per-word error statistics and aligned
|
||||
# ref/hyp pairs.
|
||||
errs_filename = (
|
||||
params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
errs_filename = params.res_dir / f"errs-{test_set_name}-{params.suffix}.txt"
|
||||
# we compute CER for aishell dataset.
|
||||
results_char = []
|
||||
for res in results:
|
||||
@ -395,9 +391,7 @@ def save_results(
|
||||
logging.info("Wrote detailed error stats to {}".format(errs_filename))
|
||||
|
||||
test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
|
||||
errs_info = (
|
||||
params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
errs_info = params.res_dir / f"wer-summary-{test_set_name}-{params.suffix}.txt"
|
||||
with open(errs_info, "w") as f:
|
||||
print("settings\tCER", file=f)
|
||||
for key, val in test_set_wers:
|
||||
|
||||
@ -374,18 +374,14 @@ def save_results(
|
||||
):
|
||||
test_set_wers = dict()
|
||||
for key, results in results_dict.items():
|
||||
recog_path = (
|
||||
params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
|
||||
results = sorted(results)
|
||||
store_transcripts(filename=recog_path, texts=results)
|
||||
logging.info(f"The transcripts are stored in {recog_path}")
|
||||
|
||||
# The following prints out WERs, per-word error statistics and aligned
|
||||
# ref/hyp pairs.
|
||||
errs_filename = (
|
||||
params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
errs_filename = params.res_dir / f"errs-{test_set_name}-{params.suffix}.txt"
|
||||
# we compute CER for aishell dataset.
|
||||
results_char = []
|
||||
for res in results:
|
||||
@ -399,9 +395,7 @@ def save_results(
|
||||
logging.info("Wrote detailed error stats to {}".format(errs_filename))
|
||||
|
||||
test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
|
||||
errs_info = (
|
||||
params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
errs_info = params.res_dir / f"wer-summary-{test_set_name}-{params.suffix}.txt"
|
||||
with open(errs_info, "w") as f:
|
||||
print("settings\tCER", file=f)
|
||||
for key, val in test_set_wers:
|
||||
|
||||
@ -543,18 +543,14 @@ def save_results(
|
||||
):
|
||||
test_set_wers = dict()
|
||||
for key, results in results_dict.items():
|
||||
recog_path = (
|
||||
params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
|
||||
results = sorted(results)
|
||||
store_transcripts(filename=recog_path, texts=results)
|
||||
logging.info(f"The transcripts are stored in {recog_path}")
|
||||
|
||||
# The following prints out WERs, per-word error statistics and aligned
|
||||
# ref/hyp pairs.
|
||||
errs_filename = (
|
||||
params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
errs_filename = params.res_dir / f"errs-{test_set_name}-{params.suffix}.txt"
|
||||
with open(errs_filename, "w") as f:
|
||||
wer = write_error_stats(
|
||||
f, f"{test_set_name}-{key}", results, enable_log=True
|
||||
@ -564,9 +560,7 @@ def save_results(
|
||||
logging.info("Wrote detailed error stats to {}".format(errs_filename))
|
||||
|
||||
test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
|
||||
errs_info = (
|
||||
params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
errs_info = params.res_dir / f"wer-summary-{test_set_name}-{params.suffix}.txt"
|
||||
with open(errs_info, "w") as f:
|
||||
print("settings\tWER", file=f)
|
||||
for key, val in test_set_wers:
|
||||
|
||||
@ -406,18 +406,14 @@ def save_results(
|
||||
):
|
||||
test_set_wers = dict()
|
||||
for key, results in results_dict.items():
|
||||
recog_path = (
|
||||
params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
|
||||
results = sorted(results)
|
||||
store_transcripts(filename=recog_path, texts=results)
|
||||
logging.info(f"The transcripts are stored in {recog_path}")
|
||||
|
||||
# The following prints out WERs, per-word error statistics and aligned
|
||||
# ref/hyp pairs.
|
||||
errs_filename = (
|
||||
params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
errs_filename = params.res_dir / f"errs-{test_set_name}-{params.suffix}.txt"
|
||||
with open(errs_filename, "w") as f:
|
||||
wer = write_error_stats(
|
||||
f, f"{test_set_name}-{key}", results, enable_log=True
|
||||
@ -427,9 +423,7 @@ def save_results(
|
||||
logging.info("Wrote detailed error stats to {}".format(errs_filename))
|
||||
|
||||
test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
|
||||
errs_info = (
|
||||
params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
errs_info = params.res_dir / f"wer-summary-{test_set_name}-{params.suffix}.txt"
|
||||
with open(errs_info, "w") as f:
|
||||
print("settings\tWER", file=f)
|
||||
for key, val in test_set_wers:
|
||||
|
||||
@ -391,18 +391,14 @@ def save_results(
|
||||
):
|
||||
test_set_wers = dict()
|
||||
for key, results in results_dict.items():
|
||||
recog_path = (
|
||||
params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
|
||||
results = sorted(results)
|
||||
store_transcripts(filename=recog_path, texts=results)
|
||||
logging.info(f"The transcripts are stored in {recog_path}")
|
||||
|
||||
# The following prints out WERs, per-word error statistics and aligned
|
||||
# ref/hyp pairs.
|
||||
errs_filename = (
|
||||
params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
errs_filename = params.res_dir / f"errs-{test_set_name}-{params.suffix}.txt"
|
||||
with open(errs_filename, "w") as f:
|
||||
wer = write_error_stats(
|
||||
f, f"{test_set_name}-{key}", results, enable_log=True
|
||||
@ -412,9 +408,7 @@ def save_results(
|
||||
logging.info("Wrote detailed error stats to {}".format(errs_filename))
|
||||
|
||||
test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
|
||||
errs_info = (
|
||||
params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
errs_info = params.res_dir / f"wer-summary-{test_set_name}-{params.suffix}.txt"
|
||||
with open(errs_info, "w") as f:
|
||||
print("settings\tWER", file=f)
|
||||
for key, val in test_set_wers:
|
||||
|
||||
@ -462,18 +462,14 @@ def save_results(
|
||||
):
|
||||
test_set_wers = dict()
|
||||
for key, results in results_dict.items():
|
||||
recog_path = (
|
||||
params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
|
||||
results = sorted(results)
|
||||
store_transcripts(filename=recog_path, texts=results)
|
||||
logging.info(f"The transcripts are stored in {recog_path}")
|
||||
|
||||
# The following prints out WERs, per-word error statistics and aligned
|
||||
# ref/hyp pairs.
|
||||
errs_filename = (
|
||||
params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
errs_filename = params.res_dir / f"errs-{test_set_name}-{params.suffix}.txt"
|
||||
with open(errs_filename, "w") as f:
|
||||
wer = write_error_stats(
|
||||
f, f"{test_set_name}-{key}", results, enable_log=True
|
||||
@ -483,9 +479,7 @@ def save_results(
|
||||
logging.info("Wrote detailed error stats to {}".format(errs_filename))
|
||||
|
||||
test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
|
||||
errs_info = (
|
||||
params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
errs_info = params.res_dir / f"wer-summary-{test_set_name}-{params.suffix}.txt"
|
||||
with open(errs_info, "w") as f:
|
||||
print("settings\tWER", file=f)
|
||||
for key, val in test_set_wers:
|
||||
|
||||
@ -478,17 +478,13 @@ def save_results(
|
||||
test_set_wers = dict()
|
||||
test_set_cers = dict()
|
||||
for key, results in results_dict.items():
|
||||
recog_path = (
|
||||
params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
|
||||
store_transcripts(filename=recog_path, texts=results)
|
||||
logging.info(f"The transcripts are stored in {recog_path}")
|
||||
|
||||
# The following prints out WERs, per-word error statistics and aligned
|
||||
# ref/hyp pairs.
|
||||
wers_filename = (
|
||||
params.res_dir / f"wers-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
wers_filename = params.res_dir / f"wers-{test_set_name}-{params.suffix}.txt"
|
||||
with open(wers_filename, "w") as f:
|
||||
wer = write_error_stats(
|
||||
f, f"{test_set_name}-{key}", results, enable_log=True
|
||||
@ -499,9 +495,7 @@ def save_results(
|
||||
results_char = []
|
||||
for res in results:
|
||||
results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
|
||||
cers_filename = (
|
||||
params.res_dir / f"cers-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
cers_filename = params.res_dir / f"cers-{test_set_name}-{params.suffix}.txt"
|
||||
with open(cers_filename, "w") as f:
|
||||
cer = write_error_stats(
|
||||
f, f"{test_set_name}-{key}", results_char, enable_log=True
|
||||
@ -512,9 +506,7 @@ def save_results(
|
||||
|
||||
test_set_wers = {k: v for k, v in sorted(test_set_wers.items(), key=lambda x: x[1])}
|
||||
test_set_cers = {k: v for k, v in sorted(test_set_cers.items(), key=lambda x: x[1])}
|
||||
errs_info = (
|
||||
params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
|
||||
)
|
||||
errs_info = params.res_dir / f"wer-summary-{test_set_name}-{params.suffix}.txt"
|
||||
with open(errs_info, "w") as f:
|
||||
print("settings\tWER\tCER", file=f)
|
||||
for key in test_set_wers:
|
||||
|
||||
11
egs/csj/ASR/README.md
Normal file
11
egs/csj/ASR/README.md
Normal file
@ -0,0 +1,11 @@
|
||||
# Introduction
|
||||
|
||||
[./RESULTS.md](./RESULTS.md) contains the latest results.
|
||||
|
||||
# Transducers
|
||||
|
||||
These are the types of architectures currently available.
|
||||
|
||||
| | Encoder | Decoder | Comment |
|
||||
|---------------------------------------|---------------------|--------------------|---------------------------------------------------|
|
||||
| `pruned_transducer_stateless7_streaming` | Streaming Zipformer | Embedding + Conv1d | Adapted from librispeech pruned_transducer_stateless7_streaming |
|
||||
200
egs/csj/ASR/RESULTS.md
Normal file
200
egs/csj/ASR/RESULTS.md
Normal file
@ -0,0 +1,200 @@
|
||||
# Results
|
||||
|
||||
## Streaming Zipformer-Transducer (Pruned Stateless Transducer + Streaming Zipformer)
|
||||
|
||||
### [pruned_transducer_stateless7_streaming](./pruned_transducer_stateless7_streaming)
|
||||
|
||||
See <https://github.com/k2-fsa/icefall/pull/892> for more details.
|
||||
|
||||
You can find a pretrained model, training logs, decoding logs, and decoding results at:
|
||||
<https://huggingface.co/TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208>
|
||||
|
||||
Number of model parameters: 75688409, i.e. 75.7M.
|
||||
|
||||
#### training on disfluent transcript
|
||||
|
||||
The CERs are:
|
||||
|
||||
| decoding method | chunk size | eval1 | eval2 | eval3 | excluded | valid | average | decoding mode |
|
||||
| --------------- | ---------- | ----- | ----- | ----- | -------- | ----- | ------- | ------------- |
|
||||
| fast beam search | 320ms | 5.39 | 4.08 | 4.16 | 5.4 | 5.02 | --epoch 30 --avg 17 | simulated streaming |
|
||||
| fast beam search | 320ms | 5.34 | 4.1 | 4.26 | 5.61 | 4.91 | --epoch 30 --avg 17 | chunk-wise |
|
||||
| greedy search | 320ms | 5.43 | 4.14 | 4.31 | 5.48 | 4.88 | --epoch 30 --avg 17 | simulated streaming |
|
||||
| greedy search | 320ms | 5.44 | 4.14 | 4.39 | 5.7 | 4.98 | --epoch 30 --avg 17 | chunk-wise |
|
||||
| modified beam search | 320ms | 5.2 | 3.95 | 4.09 | 5.12 | 4.75 | --epoch 30 --avg 17 | simulated streaming |
|
||||
| modified beam search | 320ms | 5.18 | 4.07 | 4.12 | 5.36 | 4.77 | --epoch 30 --avg 17 | chunk-wise |
|
||||
| fast beam search | 640ms | 5.01 | 3.78 | 3.96 | 4.85 | 4.6 | --epoch 30 --avg 17 | simulated streaming |
|
||||
| fast beam search | 640ms | 4.97 | 3.88 | 3.96 | 4.91 | 4.61 | --epoch 30 --avg 17 | chunk-wise |
|
||||
| greedy search | 640ms | 5.02 | 3.84 | 4.14 | 5.02 | 4.59 | --epoch 30 --avg 17 | simulated streaming |
|
||||
| greedy search | 640ms | 5.32 | 4.22 | 4.33 | 5.39 | 4.99 | --epoch 30 --avg 17 | chunk-wise |
|
||||
| modified beam search | 640ms | 4.78 | 3.66 | 3.85 | 4.72 | 4.42 | --epoch 30 --avg 17 | simulated streaming |
|
||||
| modified beam search | 640ms | 5.77 | 4.72 | 4.73 | 5.85 | 5.36 | --epoch 30 --avg 17 | chunk-wise |
|
||||
|
||||
Note: `simulated streaming` indicates feeding full utterance during decoding using `decode.py`,
|
||||
while `chunk-size` indicates feeding certain number of frames at each time using `streaming_decode.py`.
|
||||
|
||||
The training command was:
|
||||
```bash
|
||||
./pruned_transducer_stateless7_streaming/train.py \
|
||||
--feedforward-dims "1024,1024,2048,2048,1024" \
|
||||
--world-size 8 \
|
||||
--num-epochs 30 \
|
||||
--start-epoch 1 \
|
||||
--use-fp16 1 \
|
||||
--exp-dir pruned_transducer_stateless7_streaming/exp_disfluent_2_pad30 \
|
||||
--max-duration 375 \
|
||||
--transcript-mode disfluent \
|
||||
--lang data/lang_char \
|
||||
--manifest-dir /mnt/host/corpus/csj/fbank \
|
||||
--pad-feature 30 \
|
||||
--musan-dir /mnt/host/corpus/musan/musan/fbank
|
||||
```
|
||||
|
||||
The simulated streaming decoding command was:
|
||||
```bash
|
||||
for chunk in 64 32; do
|
||||
for m in greedy_search fast_beam_search modified_beam_search; do
|
||||
python pruned_transducer_stateless7_streaming/decode.py \
|
||||
--feedforward-dims "1024,1024,2048,2048,1024" \
|
||||
--exp-dir pruned_transducer_stateless7_streaming/exp_disfluent_2_pad30 \
|
||||
--epoch 30 \
|
||||
--avg 17 \
|
||||
--max-duration 350 \
|
||||
--decoding-method $m \
|
||||
--manifest-dir /mnt/host/corpus/csj/fbank \
|
||||
--lang data/lang_char \
|
||||
--transcript-mode disfluent \
|
||||
--res-dir pruned_transducer_stateless7_streaming/exp_disfluent_2_pad30/github/sim_"$chunk"_"$m" \
|
||||
--decode-chunk-len $chunk \
|
||||
--pad-feature 30 \
|
||||
--gpu 0
|
||||
done
|
||||
done
|
||||
```
|
||||
|
||||
The streaming chunk-wise decoding command was:
|
||||
```bash
|
||||
for chunk in 64 32; do
|
||||
for m in greedy_search fast_beam_search modified_beam_search; do
|
||||
python pruned_transducer_stateless7_streaming/streaming_decode.py \
|
||||
--feedforward-dims "1024,1024,2048,2048,1024" \
|
||||
--exp-dir pruned_transducer_stateless7_streaming/exp_disfluent_2_pad30 \
|
||||
--epoch 30 \
|
||||
--avg 17 \
|
||||
--max-duration 350 \
|
||||
--decoding-method $m \
|
||||
--manifest-dir /mnt/host/corpus/csj/fbank \
|
||||
--lang data/lang_char \
|
||||
--transcript-mode disfluent \
|
||||
--res-dir pruned_transducer_stateless7_streaming/exp_disfluent_2_pad30/github/stream_"$chunk"_"$m" \
|
||||
--decode-chunk-len $chunk \
|
||||
--gpu 2 \
|
||||
--num-decode-streams 40
|
||||
done
|
||||
done
|
||||
```
|
||||
|
||||
#### training on fluent transcript
|
||||
|
||||
The CERs are:
|
||||
|
||||
| decoding method | chunk size | eval1 | eval2 | eval3 | excluded | valid | average | decoding mode |
|
||||
| --------------- | ---------- | ----- | ----- | ----- | -------- | ----- | ------- | ------------- |
|
||||
| fast beam search | 320ms | 4.19 | 3.63 | 3.77 | 4.43 | 4.09 | --epoch 30 --avg 12 | simulated streaming |
|
||||
| fast beam search | 320ms | 4.06 | 3.55 | 3.66 | 4.70 | 4.04 | --epoch 30 --avg 12 | chunk-wise |
|
||||
| greedy search | 320ms | 4.22 | 3.62 | 3.82 | 4.45 | 3.98 | --epoch 30 --avg 12 | simulated streaming |
|
||||
| greedy search | 320ms | 4.13 | 3.61 | 3.85 | 4.67 | 4.05 | --epoch 30 --avg 12 | chunk-wise |
|
||||
| modified beam search | 320ms | 4.02 | 3.43 | 3.62 | 4.43 | 3.81 | --epoch 30 --avg 12 | simulated streaming |
|
||||
| modified beam search | 320ms | 3.97 | 3.43 | 3.59 | 4.99 | 3.88 | --epoch 30 --avg 12 | chunk-wise |
|
||||
| fast beam search | 640ms | 3.80 | 3.31 | 3.55 | 4.16 | 3.90 | --epoch 30 --avg 12 | simulated streaming |
|
||||
| fast beam search | 640ms | 3.81 | 3.34 | 3.46 | 4.58 | 3.85 | --epoch 30 --avg 12 | chunk-wise |
|
||||
| greedy search | 640ms | 3.92 | 3.38 | 3.65 | 4.31 | 3.88 | --epoch 30 --avg 12 | simulated streaming |
|
||||
| greedy search | 640ms | 3.98 | 3.38 | 3.64 | 4.54 | 4.01 | --epoch 30 --avg 12 | chunk-wise |
|
||||
| modified beam search | 640ms | 3.72 | 3.26 | 3.39 | 4.10 | 3.65 | --epoch 30 --avg 12 | simulated streaming |
|
||||
| modified beam search | 640ms | 3.78 | 3.32 | 3.45 | 4.81 | 3.81 | --epoch 30 --avg 12 | chunk-wise |
|
||||
|
||||
Note: `simulated streaming` indicates feeding full utterance during decoding using `decode.py`,
|
||||
while `chunk-size` indicates feeding certain number of frames at each time using `streaming_decode.py`.
|
||||
|
||||
The training command was:
|
||||
```bash
|
||||
./pruned_transducer_stateless7_streaming/train.py \
|
||||
--feedforward-dims "1024,1024,2048,2048,1024" \
|
||||
--world-size 8 \
|
||||
--num-epochs 30 \
|
||||
--start-epoch 1 \
|
||||
--use-fp16 1 \
|
||||
--exp-dir pruned_transducer_stateless7_streaming/exp_fluent_2_pad30 \
|
||||
--max-duration 375 \
|
||||
--transcript-mode fluent \
|
||||
--lang data/lang_char \
|
||||
--manifest-dir /mnt/host/corpus/csj/fbank \
|
||||
--pad-feature 30 \
|
||||
--musan-dir /mnt/host/corpus/musan/musan/fbank
|
||||
```
|
||||
|
||||
The simulated streaming decoding command was:
|
||||
```bash
|
||||
for chunk in 64 32; do
|
||||
for m in greedy_search fast_beam_search modified_beam_search; do
|
||||
python pruned_transducer_stateless7_streaming/decode.py \
|
||||
--feedforward-dims "1024,1024,2048,2048,1024" \
|
||||
--exp-dir pruned_transducer_stateless7_streaming/exp_fluent_2_pad30 \
|
||||
--epoch 30 \
|
||||
--avg 12 \
|
||||
--max-duration 350 \
|
||||
--decoding-method $m \
|
||||
--manifest-dir /mnt/host/corpus/csj/fbank \
|
||||
--lang data/lang_char \
|
||||
--transcript-mode fluent \
|
||||
--res-dir pruned_transducer_stateless7_streaming/exp_fluent_2_pad30/github/sim_"$chunk"_"$m" \
|
||||
--decode-chunk-len $chunk \
|
||||
--pad-feature 30 \
|
||||
--gpu 1
|
||||
done
|
||||
done
|
||||
```
|
||||
|
||||
The streaming chunk-wise decoding command was:
|
||||
```bash
|
||||
for chunk in 64 32; do
|
||||
for m in greedy_search fast_beam_search modified_beam_search; do
|
||||
python pruned_transducer_stateless7_streaming/streaming_decode.py \
|
||||
--feedforward-dims "1024,1024,2048,2048,1024" \
|
||||
--exp-dir pruned_transducer_stateless7_streaming/exp_fluent_2_pad30 \
|
||||
--epoch 30 \
|
||||
--avg 12 \
|
||||
--max-duration 350 \
|
||||
--decoding-method $m \
|
||||
--manifest-dir /mnt/host/corpus/csj/fbank \
|
||||
--lang data/lang_char \
|
||||
--transcript-mode fluent \
|
||||
--res-dir pruned_transducer_stateless7_streaming/exp_fluent_2_pad30/github/stream_"$chunk"_"$m" \
|
||||
--decode-chunk-len $chunk \
|
||||
--gpu 3 \
|
||||
--num-decode-streams 40
|
||||
done
|
||||
done
|
||||
```
|
||||
|
||||
#### Comparing disfluent to fluent
|
||||
|
||||
$$ \texttt{CER}^{f}_d = \frac{\texttt{sub}_f + \texttt{ins} + \texttt{del}_f}{N_f} $$
|
||||
|
||||
This comparison evaluates the disfluent model on the fluent transcript (calculated by `disfluent_recogs_to_fluent.py`), forgiving the disfluent model's mistakes on fillers and partial words. It is meant as an illustrative metric only, so that the disfluent and fluent models can be compared.
|
||||
|
||||
| decoding method | chunk size | eval1 (d vs f) | eval2 (d vs f) | eval3 (d vs f) | excluded (d vs f) | valid (d vs f) | decoding mode |
|
||||
| --------------- | ---------- | -------------- | --------------- | -------------- | -------------------- | --------------- | ----------- |
|
||||
| fast beam search | 320ms | 4.54 vs 4.19 | 3.44 vs 3.63 | 3.56 vs 3.77 | 4.22 vs 4.43 | 4.22 vs 4.09 | simulated streaming |
|
||||
| fast beam search | 320ms | 4.48 vs 4.06 | 3.41 vs 3.55 | 3.65 vs 3.66 | 4.26 vs 4.7 | 4.08 vs 4.04 | chunk-wise |
|
||||
| greedy search | 320ms | 4.53 vs 4.22 | 3.48 vs 3.62 | 3.69 vs 3.82 | 4.38 vs 4.45 | 4.05 vs 3.98 | simulated streaming |
|
||||
| greedy search | 320ms | 4.53 vs 4.13 | 3.46 vs 3.61 | 3.71 vs 3.85 | 4.48 vs 4.67 | 4.12 vs 4.05 | chunk-wise |
|
||||
| modified beam search | 320ms | 4.45 vs 4.02 | 3.38 vs 3.43 | 3.57 vs 3.62 | 4.19 vs 4.43 | 4.04 vs 3.81 | simulated streaming |
|
||||
| modified beam search | 320ms | 4.44 vs 3.97 | 3.47 vs 3.43 | 3.56 vs 3.59 | 4.28 vs 4.99 | 4.04 vs 3.88 | chunk-wise |
|
||||
| fast beam search | 640ms | 4.14 vs 3.8 | 3.12 vs 3.31 | 3.38 vs 3.55 | 3.72 vs 4.16 | 3.81 vs 3.9 | simulated streaming |
|
||||
| fast beam search | 640ms | 4.05 vs 3.81 | 3.23 vs 3.34 | 3.36 vs 3.46 | 3.65 vs 4.58 | 3.78 vs 3.85 | chunk-wise |
|
||||
| greedy search | 640ms | 4.1 vs 3.92 | 3.17 vs 3.38 | 3.5 vs 3.65 | 3.87 vs 4.31 | 3.77 vs 3.88 | simulated streaming |
|
||||
| greedy search | 640ms | 4.41 vs 3.98 | 3.56 vs 3.38 | 3.69 vs 3.64 | 4.26 vs 4.54 | 4.16 vs 4.01 | chunk-wise |
|
||||
| modified beam search | 640ms | 4 vs 3.72 | 3.08 vs 3.26 | 3.33 vs 3.39 | 3.75 vs 4.1 | 3.71 vs 3.65 | simulated streaming |
|
||||
| modified beam search | 640ms | 5.05 vs 3.78 | 4.22 vs 3.32 | 4.26 vs 3.45 | 5.02 vs 4.81 | 4.73 vs 3.81 | chunk-wise |
|
||||
| average (d - f) | | 0.43 | -0.02 | -0.02 | -0.34 | 0.13 | |
|
||||
94
egs/csj/ASR/local/add_transcript_mode.py
Normal file
94
egs/csj/ASR/local/add_transcript_mode.py
Normal file
@ -0,0 +1,94 @@
|
||||
import argparse
|
||||
import logging
|
||||
from configparser import ConfigParser
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from lhotse import CutSet, SupervisionSet
|
||||
from lhotse.recipes.csj import CSJSDBParser
|
||||
|
||||
ARGPARSE_DESCRIPTION = """
|
||||
This script adds transcript modes to an existing CutSet or SupervisionSet.
|
||||
"""
|
||||
|
||||
|
||||
def get_args():
|
||||
parser = argparse.ArgumentParser(
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
||||
description=ARGPARSE_DESCRIPTION,
|
||||
)
|
||||
parser.add_argument(
|
||||
"-f",
|
||||
"--fbank-dir",
|
||||
type=Path,
|
||||
help="Path to directory where manifests are stored.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-c",
|
||||
"--config",
|
||||
type=Path,
|
||||
nargs="+",
|
||||
help="Path to config file for transcript parsing.",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def get_CSJParsers(config_files: List[Path]) -> List[CSJSDBParser]:
|
||||
parsers = []
|
||||
for config_file in config_files:
|
||||
config = ConfigParser()
|
||||
config.optionxform = str
|
||||
assert config.read(config_file), f"{config_file} could not be found."
|
||||
decisions = {}
|
||||
for k, v in config["DECISIONS"].items():
|
||||
try:
|
||||
decisions[k] = int(v)
|
||||
except ValueError:
|
||||
decisions[k] = v
|
||||
parsers.append(
|
||||
(config["CONSTANTS"].get("MODE"), CSJSDBParser(decisions=decisions))
|
||||
)
|
||||
return parsers
|
||||
|
||||
|
||||
def main():
|
||||
args = get_args()
|
||||
logging.basicConfig(
|
||||
format=("%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"),
|
||||
level=logging.INFO,
|
||||
)
|
||||
parsers = get_CSJParsers(args.config)
|
||||
config = ConfigParser()
|
||||
config.optionxform = str
|
||||
assert config.read(args.config), args.config
|
||||
decisions = {}
|
||||
for k, v in config["DECISIONS"].items():
|
||||
try:
|
||||
decisions[k] = int(v)
|
||||
except ValueError:
|
||||
decisions[k] = v
|
||||
|
||||
logging.info(f"Adding {', '.join(x[0] for x in parsers)} transcript mode.")
|
||||
|
||||
manifests = args.fbank_dir.glob("csj_cuts_*.jsonl.gz")
|
||||
assert manifests, f"No cuts to be found in {args.fbank_dir}"
|
||||
|
||||
for manifest in manifests:
|
||||
results = []
|
||||
logging.info(f"Adding transcript modes to {manifest.name} now.")
|
||||
cutset = CutSet.from_file(manifest)
|
||||
for cut in cutset:
|
||||
for name, parser in parsers:
|
||||
cut.supervisions[0].custom[name] = parser.parse(
|
||||
cut.supervisions[0].custom["raw"]
|
||||
)
|
||||
cut.supervisions[0].text = ""
|
||||
results.append(cut)
|
||||
results = CutSet.from_items(results)
|
||||
res_file = manifest.as_posix()
|
||||
manifest.replace(manifest.parent / ("bak." + manifest.name))
|
||||
results.to_file(res_file)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright 2022 The University of Electro-Communications (Author: Teo Wen Shen) # noqa
|
||||
# Copyright 2023 The University of Electro-Communications (Author: Teo Wen Shen) # noqa
|
||||
#
|
||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||
#
|
||||
@ -19,9 +19,7 @@
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
from itertools import islice
|
||||
from pathlib import Path
|
||||
from random import Random
|
||||
from typing import List, Tuple
|
||||
|
||||
import torch
|
||||
@ -35,20 +33,10 @@ from lhotse import ( # See the following for why LilcomChunkyWriter is preferre
|
||||
RecordingSet,
|
||||
SupervisionSet,
|
||||
)
|
||||
from lhotse.recipes.csj import concat_csj_supervisions
|
||||
|
||||
# fmt: on
|
||||
|
||||
ARGPARSE_DESCRIPTION = """
|
||||
This script follows the espnet method of splitting the remaining core+noncore
|
||||
utterances into valid and train cutsets at an index which is by default 4000.
|
||||
|
||||
In other words, the core+noncore utterances are shuffled, where 4000 utterances
|
||||
of the shuffled set go to the `valid` cutset and are not subject to speed
|
||||
perturbation. The remaining utterances become the `train` cutset and are speed-
|
||||
perturbed (0.9x, 1.0x, 1.1x).
|
||||
|
||||
"""
|
||||
|
||||
# Torch's multithreaded behavior needs to be disabled or
|
||||
# it wastes a lot of CPU and slow things down.
|
||||
# Do this outside of main() in case it needs to take effect
|
||||
@ -57,66 +45,101 @@ torch.set_num_threads(1)
|
||||
torch.set_num_interop_threads(1)
|
||||
|
||||
RNG_SEED = 42
|
||||
# concat_params_train = [
|
||||
# {"gap": 1.0, "maxlen": 10.0},
|
||||
# {"gap": 1.5, "maxlen": 8.0},
|
||||
# {"gap": 1.0, "maxlen": 18.0},
|
||||
# ]
|
||||
|
||||
concat_params = {"gap": 1.0, "maxlen": 10.0}
|
||||
|
||||
|
||||
def make_cutset_blueprints(
|
||||
manifest_dir: Path,
|
||||
split: int,
|
||||
) -> List[Tuple[str, CutSet]]:
|
||||
|
||||
cut_sets = []
|
||||
logging.info("Creating non-train cuts.")
|
||||
|
||||
# Create eval datasets
|
||||
logging.info("Creating eval cuts.")
|
||||
for i in range(1, 4):
|
||||
sps = sorted(
|
||||
SupervisionSet.from_file(
|
||||
manifest_dir / f"csj_supervisions_eval{i}.jsonl.gz"
|
||||
),
|
||||
key=lambda x: x.id,
|
||||
)
|
||||
|
||||
cut_set = CutSet.from_manifests(
|
||||
recordings=RecordingSet.from_file(
|
||||
manifest_dir / f"csj_recordings_eval{i}.jsonl.gz"
|
||||
),
|
||||
supervisions=SupervisionSet.from_file(
|
||||
manifest_dir / f"csj_supervisions_eval{i}.jsonl.gz"
|
||||
),
|
||||
supervisions=concat_csj_supervisions(sps, **concat_params),
|
||||
)
|
||||
cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
|
||||
cut_sets.append((f"eval{i}", cut_set))
|
||||
|
||||
# Create train and valid cuts
|
||||
logging.info("Loading, trimming, and shuffling the remaining core+noncore cuts.")
|
||||
recording_set = RecordingSet.from_file(
|
||||
manifest_dir / "csj_recordings_core.jsonl.gz"
|
||||
) + RecordingSet.from_file(manifest_dir / "csj_recordings_noncore.jsonl.gz")
|
||||
supervision_set = SupervisionSet.from_file(
|
||||
manifest_dir / "csj_supervisions_core.jsonl.gz"
|
||||
) + SupervisionSet.from_file(manifest_dir / "csj_supervisions_noncore.jsonl.gz")
|
||||
|
||||
# Create excluded dataset
|
||||
sps = sorted(
|
||||
SupervisionSet.from_file(manifest_dir / "csj_supervisions_excluded.jsonl.gz"),
|
||||
key=lambda x: x.id,
|
||||
)
|
||||
cut_set = CutSet.from_manifests(
|
||||
recordings=recording_set,
|
||||
supervisions=supervision_set,
|
||||
recordings=RecordingSet.from_file(
|
||||
manifest_dir / "csj_recordings_excluded.jsonl.gz"
|
||||
),
|
||||
supervisions=concat_csj_supervisions(sps, **concat_params),
|
||||
)
|
||||
cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
|
||||
cut_set = cut_set.shuffle(Random(RNG_SEED))
|
||||
cut_sets.append(("excluded", cut_set))
|
||||
|
||||
logging.info(
|
||||
"Creating valid and train cuts from core and noncore, split at {split}."
|
||||
# Create valid dataset
|
||||
sps = sorted(
|
||||
SupervisionSet.from_file(manifest_dir / "csj_supervisions_valid.jsonl.gz"),
|
||||
key=lambda x: x.id,
|
||||
)
|
||||
valid_set = CutSet.from_cuts(islice(cut_set, 0, split))
|
||||
cut_set = CutSet.from_manifests(
|
||||
recordings=RecordingSet.from_file(
|
||||
manifest_dir / "csj_recordings_valid.jsonl.gz"
|
||||
),
|
||||
supervisions=concat_csj_supervisions(sps, **concat_params),
|
||||
)
|
||||
cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
|
||||
cut_sets.append(("valid", cut_set))
|
||||
|
||||
train_set = CutSet.from_cuts(islice(cut_set, split, None))
|
||||
logging.info("Creating train cuts.")
|
||||
|
||||
# Create train dataset
|
||||
sps = sorted(
|
||||
SupervisionSet.from_file(manifest_dir / "csj_supervisions_core.jsonl.gz")
|
||||
+ SupervisionSet.from_file(manifest_dir / "csj_supervisions_noncore.jsonl.gz"),
|
||||
key=lambda x: x.id,
|
||||
)
|
||||
|
||||
recording = RecordingSet.from_file(
|
||||
manifest_dir / "csj_recordings_core.jsonl.gz"
|
||||
) + RecordingSet.from_file(manifest_dir / "csj_recordings_noncore.jsonl.gz")
|
||||
|
||||
train_set = CutSet.from_manifests(
|
||||
recordings=recording, supervisions=concat_csj_supervisions(sps, **concat_params)
|
||||
).trim_to_supervisions(keep_overlapping=False)
|
||||
train_set = train_set + train_set.perturb_speed(0.9) + train_set.perturb_speed(1.1)
|
||||
|
||||
cut_sets.extend([("valid", valid_set), ("train", train_set)])
|
||||
cut_sets.append(("train", train_set))
|
||||
|
||||
return cut_sets
|
||||
|
||||
|
||||
def get_args():
|
||||
parser = argparse.ArgumentParser(
|
||||
description=ARGPARSE_DESCRIPTION,
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
||||
)
|
||||
|
||||
parser.add_argument("--manifest-dir", type=Path, help="Path to save manifests")
|
||||
parser.add_argument("--fbank-dir", type=Path, help="Path to save fbank features")
|
||||
parser.add_argument("--split", type=int, default=4000, help="Split at this index")
|
||||
parser.add_argument(
|
||||
"-m", "--manifest-dir", type=Path, help="Path to save manifests"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-f", "--fbank-dir", type=Path, help="Path to save fbank features"
|
||||
)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
@ -138,7 +161,7 @@ def main():
|
||||
)
|
||||
return
|
||||
else:
|
||||
cut_sets = make_cutset_blueprints(args.manifest_dir, args.split)
|
||||
cut_sets = make_cutset_blueprints(args.manifest_dir)
|
||||
for part, cut_set in cut_sets:
|
||||
logging.info(f"Processing {part}")
|
||||
cut_set = cut_set.compute_and_store_features(
|
||||
@ -147,7 +170,7 @@ def main():
|
||||
storage_path=(args.fbank_dir / f"feats_{part}").as_posix(),
|
||||
storage_type=LilcomChunkyWriter,
|
||||
)
|
||||
cut_set.to_file(args.manifest_dir / f"csj_cuts_{part}.jsonl.gz")
|
||||
cut_set.to_file(args.fbank_dir / f"csj_cuts_{part}.jsonl.gz")
|
||||
|
||||
logging.info("All fbank computed for CSJ.")
|
||||
(args.fbank_dir / ".done").touch()
|
||||
|
||||
@ -28,9 +28,7 @@ from icefall.utils import get_executor
|
||||
|
||||
ARGPARSE_DESCRIPTION = """
|
||||
This file computes fbank features of the musan dataset.
|
||||
It looks for manifests in the directory data/manifests.
|
||||
|
||||
The generated fbank features are saved in data/fbank.
|
||||
"""
|
||||
|
||||
# Torch's multithreaded behavior needs to be disabled or
|
||||
@ -42,8 +40,6 @@ torch.set_num_interop_threads(1)
|
||||
|
||||
|
||||
def compute_fbank_musan(manifest_dir: Path, fbank_dir: Path):
|
||||
# src_dir = Path("data/manifests")
|
||||
# output_dir = Path("data/fbank")
|
||||
num_jobs = min(15, os.cpu_count())
|
||||
num_mel_bins = 80
|
||||
|
||||
@ -104,8 +100,12 @@ def get_args():
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
||||
)
|
||||
|
||||
parser.add_argument("--manifest-dir", type=Path, help="Path to save manifests")
|
||||
parser.add_argument("--fbank-dir", type=Path, help="Path to save fbank features")
|
||||
parser.add_argument(
|
||||
"-m", "--manifest-dir", type=Path, help="Path to save manifests"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-f", "--fbank-dir", type=Path, help="Path to save fbank features"
|
||||
)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
@ -1,320 +1,79 @@
|
||||
; # This section is ignored if this file is not supplied as the first config file to
|
||||
; # lhotse prepare csj
|
||||
[SEGMENTS]
|
||||
; # Allowed period of nonverbal noise. If exceeded, a new segment is created.
|
||||
gap = 0.5
|
||||
; # Maximum length of segment (s).
|
||||
maxlen = 10
|
||||
; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently.
|
||||
minlen = 0.02
|
||||
; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`.
|
||||
; # Pass an empty string to avoid adding any symbol. It was "<sp>" in kaldi.
|
||||
; # If you intend to use a multicharacter string for gap_sym, remember to register the
|
||||
; # multicharacter string as part of userdef-string in prepare_lang_char.py.
|
||||
gap_sym =
|
||||
|
||||
[CONSTANTS]
|
||||
; # Name of this mode
|
||||
MODE = disfluent
|
||||
; # Suffixes to use after the word surface (no longer used)
|
||||
MORPH = pos1 cForm cType2 pos2
|
||||
; # Used to differentiate between A tag and A_num tag
|
||||
JPN_NUM = ゼロ 0 零 一 二 三 四 五 六 七 八 九 十 百 千 .
|
||||
; # Dummy character to delineate multiline words
|
||||
PLUS = +
|
||||
|
||||
[DECISIONS]
|
||||
; # TAG+'^'とは、タグが一つの転記単位に独立していない場合
|
||||
; # The PLUS (fullwidth) sign '+' marks line boundaries for multiline entries
|
||||
|
||||
; # フィラー、感情表出系感動詞
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(F ぎょっ)'
|
||||
F = 0
|
||||
; # Example: '(L (F ン))', '比べ(F えー)る'
|
||||
F^ = 0
|
||||
; # 言い直し、いいよどみなどによる語断片
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(D だ)(D だいが) 大学の学部の会議'
|
||||
D = 0
|
||||
; # Example: '(L (D ドゥ)+(D ヒ))'
|
||||
D^ = 0
|
||||
; # 助詞、助動詞、接辞の言い直し
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか'
|
||||
D2 = 0
|
||||
; # Example: '(X (D2 ノ))'
|
||||
D2^ = 0
|
||||
; # 聞き取りや語彙の判断に自信がない場合
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: (? 字数) の
|
||||
; # If no option: empty string is returned regardless of output
|
||||
; # Example: '(?) で'
|
||||
? = 0
|
||||
; # Example: '(D (? すー))+そう+です+よ+ね'
|
||||
?^ = 0
|
||||
; # タグ?で、値は複数の候補が想定される場合
|
||||
; # 0 for main guess with matching morph info, 1 for second guess
|
||||
; # Example: '(? 次数, 実数)', '(? これ,ここで)+(? 説明+し+た+方+が+いい+か+な)'
|
||||
?, = 0
|
||||
; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ+タ,マス))'
|
||||
?,^ = 0
|
||||
; # 音や言葉に関するメタ的な引用
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)'
|
||||
M = 0
|
||||
; # Example: '(L (M ヒ)+(M ヒ))', '(L (M (? ヒ+ヒ)))'
|
||||
M^ = 0
|
||||
; # 外国語や古語、方言など
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(O ザッツファイン)'
|
||||
O = 0
|
||||
; # Example: '(笑 (O エクスキューズ+ミー))', '(笑 メダッ+テ+(O ナンボ))'
|
||||
O^ = 0
|
||||
; # 講演者の名前、差別語、誹謗中傷など
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '国語研の (R ××) です'
|
||||
R = 0
|
||||
R^ = 0
|
||||
; # 非朗読対象発話(朗読における言い間違い等)
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(X 実際は) 実際には'
|
||||
X = 0
|
||||
; # Example: '(L (X (D2 ニ)))'
|
||||
X^ = 0
|
||||
; # アルファベットや算用数字、記号の表記
|
||||
; # 0 to use Japanese form, 1 to use alphabet form
|
||||
; # Example: '(A シーディーアール;CD-R)'
|
||||
A = 1
|
||||
; # Example: 'スモール(A エヌ;N)', 'ラージ(A キュー;Q)', '(A ティーエフ;TF)+(A アイディーエフ;IDF)' (Strung together by pron: '(W (? ティーワイド);ティーエフ+アイディーエフ)')
|
||||
A^ = 1
|
||||
; # タグAで、単語は算用数字の場合
|
||||
; # 0 to use Japanese form, 1 to use Arabic numerals
|
||||
; # Example: (A 二千;2000)
|
||||
A_num = eval:self.notag
|
||||
A_num^ = eval:self.notag
|
||||
A_num = 0
|
||||
; # 何らかの原因で漢字表記できなくなった場合
|
||||
; # 0 to use broken form, 1 to use orthodox form
|
||||
; # Example: '(K たち (F えー) ばな;橘)'
|
||||
K = 1
|
||||
; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)'
|
||||
K^ = 1
|
||||
; # 転訛、発音の怠けなど、一時的な発音エラー
|
||||
; # 0 to use wrong form, 1 to use orthodox form
|
||||
; # Example: '(W ギーツ;ギジュツ)'
|
||||
W = 1
|
||||
; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)'
|
||||
W^ = 1
|
||||
; # 語の読みに関する知識レベルのいい間違い
|
||||
; # 0 to use wrong form, 1 to use orthodox form
|
||||
; # Example: '(B シブタイ;ジュータイ)'
|
||||
B = 0
|
||||
; # Example: 'データー(B カズ;スー)'
|
||||
B^ = 0
|
||||
; # 笑いながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(笑 ナニガ)', '(笑 (F エー)+ソー+イッ+タ+ヨー+ナ)'
|
||||
笑 = 0
|
||||
; # Example: 'コク(笑 サイ+(D オン))',
|
||||
笑^ = 0
|
||||
; # 泣きながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(泣 ドンナニ)'
|
||||
泣 = 0
|
||||
泣^ = 0
|
||||
; # 咳をしながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: 'シャ(咳 リン) ノ'
|
||||
咳 = 0
|
||||
; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)'
|
||||
咳^ = 0
|
||||
; # ささやき声や独り言などの小さな声
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))+(? セツメー+シ+タ+ホー+ガ+イー+カ+ナ))'
|
||||
L = 0
|
||||
; # Example: 'デ(L ス)', 'ッ(L テ+コ)ト'
|
||||
L^ = 0
|
||||
|
||||
[REPLACEMENTS]
|
||||
; # ボーカルフライなどで母音が同定できない場合
|
||||
<FV> =
|
||||
; # 「うん/うーん/ふーん」の音の特定が困難な場合
|
||||
<VN> =
|
||||
; # 非語彙的な母音の引き延ばし
|
||||
<H> =
|
||||
; # 非語彙的な子音の引き延ばし
|
||||
<Q> =
|
||||
; # 言語音と独立に講演者の笑いが生じている場合
|
||||
<笑> =
|
||||
; # 言語音と独立に講演者の咳が生じている場合
|
||||
<咳> =
|
||||
; # 言語音と独立に講演者の息が生じている場合
|
||||
<息> =
|
||||
; # 講演者の泣き声
|
||||
<泣> =
|
||||
; # 聴衆(司会者なども含む)の発話
|
||||
<フロア発話> =
|
||||
; # 聴衆の笑い
|
||||
<フロア笑> =
|
||||
; # 聴衆の拍手
|
||||
<拍手> =
|
||||
; # 講演者が発表中に用いたデモンストレーションの音声
|
||||
<デモ> =
|
||||
; # 学会講演に発表時間を知らせるためにならすベルの音
|
||||
<ベル> =
|
||||
; # 転記単位全体が再度読み直された場合
|
||||
<朗読間違い> =
|
||||
; # 上記以外の音で特に目立った音
|
||||
<雑音> =
|
||||
; # 0.2秒以上のポーズ
|
||||
<P> =
|
||||
; # Redacted information, for R
|
||||
; # It is \x00D7 multiplication sign, not your normal 'x'
|
||||
× = ×
|
||||
|
||||
[FIELDS]
|
||||
; # Time information for segment
|
||||
time = 3
|
||||
; # Word surface
|
||||
surface = 5
|
||||
; # Word surface root form without CSJ tags
|
||||
notag = 9
|
||||
; # Part Of Speech
|
||||
pos1 = 11
|
||||
; # Conjugated Form
|
||||
cForm = 12
|
||||
; # Conjugation Type
|
||||
cType1 = 13
|
||||
; # Subcategory of POS
|
||||
pos2 = 14
|
||||
; # Euphonic Change / Subcategory of Conjugation Type
|
||||
cType2 = 15
|
||||
; # Other information
|
||||
other = 16
|
||||
; # Pronunciation for lexicon
|
||||
pron = 10
|
||||
; # Speaker ID
|
||||
spk_id = 2
|
||||
|
||||
[KATAKANA2ROMAJI]
|
||||
ア = 'a
|
||||
イ = 'i
|
||||
ウ = 'u
|
||||
エ = 'e
|
||||
オ = 'o
|
||||
カ = ka
|
||||
キ = ki
|
||||
ク = ku
|
||||
ケ = ke
|
||||
コ = ko
|
||||
ガ = ga
|
||||
ギ = gi
|
||||
グ = gu
|
||||
ゲ = ge
|
||||
ゴ = go
|
||||
サ = sa
|
||||
シ = si
|
||||
ス = su
|
||||
セ = se
|
||||
ソ = so
|
||||
ザ = za
|
||||
ジ = zi
|
||||
ズ = zu
|
||||
ゼ = ze
|
||||
ゾ = zo
|
||||
タ = ta
|
||||
チ = ti
|
||||
ツ = tu
|
||||
テ = te
|
||||
ト = to
|
||||
ダ = da
|
||||
ヂ = di
|
||||
ヅ = du
|
||||
デ = de
|
||||
ド = do
|
||||
ナ = na
|
||||
ニ = ni
|
||||
ヌ = nu
|
||||
ネ = ne
|
||||
ノ = no
|
||||
ハ = ha
|
||||
ヒ = hi
|
||||
フ = hu
|
||||
ヘ = he
|
||||
ホ = ho
|
||||
バ = ba
|
||||
ビ = bi
|
||||
ブ = bu
|
||||
ベ = be
|
||||
ボ = bo
|
||||
パ = pa
|
||||
ピ = pi
|
||||
プ = pu
|
||||
ペ = pe
|
||||
ポ = po
|
||||
マ = ma
|
||||
ミ = mi
|
||||
ム = mu
|
||||
メ = me
|
||||
モ = mo
|
||||
ヤ = ya
|
||||
ユ = yu
|
||||
ヨ = yo
|
||||
ラ = ra
|
||||
リ = ri
|
||||
ル = ru
|
||||
レ = re
|
||||
ロ = ro
|
||||
ワ = wa
|
||||
ヰ = we
|
||||
ヱ = wi
|
||||
ヲ = wo
|
||||
ン = ŋ
|
||||
ッ = q
|
||||
ー = -
|
||||
キャ = kǐa
|
||||
キュ = kǐu
|
||||
キョ = kǐo
|
||||
ギャ = gǐa
|
||||
ギュ = gǐu
|
||||
ギョ = gǐo
|
||||
シャ = sǐa
|
||||
シュ = sǐu
|
||||
ショ = sǐo
|
||||
ジャ = zǐa
|
||||
ジュ = zǐu
|
||||
ジョ = zǐo
|
||||
チャ = tǐa
|
||||
チュ = tǐu
|
||||
チョ = tǐo
|
||||
ヂャ = dǐa
|
||||
ヂュ = dǐu
|
||||
ヂョ = dǐo
|
||||
ニャ = nǐa
|
||||
ニュ = nǐu
|
||||
ニョ = nǐo
|
||||
ヒャ = hǐa
|
||||
ヒュ = hǐu
|
||||
ヒョ = hǐo
|
||||
ビャ = bǐa
|
||||
ビュ = bǐu
|
||||
ビョ = bǐo
|
||||
ピャ = pǐa
|
||||
ピュ = pǐu
|
||||
ピョ = pǐo
|
||||
ミャ = mǐa
|
||||
ミュ = mǐu
|
||||
ミョ = mǐo
|
||||
リャ = rǐa
|
||||
リュ = rǐu
|
||||
リョ = rǐo
|
||||
ァ = a
|
||||
ィ = i
|
||||
ゥ = u
|
||||
ェ = e
|
||||
ォ = o
|
||||
ヮ = ʍ
|
||||
ヴ = vu
|
||||
ャ = ǐa
|
||||
ュ = ǐu
|
||||
ョ = ǐo
|
||||
|
||||
@ -1,320 +1,79 @@
|
||||
; # This section is ignored if this file is not supplied as the first config file to
|
||||
; # lhotse prepare csj
|
||||
[SEGMENTS]
|
||||
; # Allowed period of nonverbal noise. If exceeded, a new segment is created.
|
||||
gap = 0.5
|
||||
; # Maximum length of segment (s).
|
||||
maxlen = 10
|
||||
; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently.
|
||||
minlen = 0.02
|
||||
; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`.
|
||||
; # Pass an empty string to avoid adding any symbol. It was "<sp>" in kaldi.
|
||||
; # If you intend to use a multicharacter string for gap_sym, remember to register the
|
||||
; # multicharacter string as part of userdef-string in prepare_lang_char.py.
|
||||
gap_sym =
|
||||
|
||||
[CONSTANTS]
|
||||
; # Name of this mode
|
||||
MODE = fluent
|
||||
; # Suffixes to use after the word surface (no longer used)
|
||||
MORPH = pos1 cForm cType2 pos2
|
||||
; # Used to differentiate between A tag and A_num tag
|
||||
JPN_NUM = ゼロ 0 零 一 二 三 四 五 六 七 八 九 十 百 千 .
|
||||
; # Dummy character to delineate multiline words
|
||||
PLUS = +
|
||||
|
||||
[DECISIONS]
|
||||
; # TAG+'^'とは、タグが一つの転記単位に独立していない場合
|
||||
; # The PLUS (fullwidth) sign '+' marks line boundaries for multiline entries
|
||||
|
||||
; # フィラー、感情表出系感動詞
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(F ぎょっ)'
|
||||
F = 1
|
||||
; # Example: '(L (F ン))', '比べ(F えー)る'
|
||||
F^ = 1
|
||||
; # 言い直し、いいよどみなどによる語断片
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(D だ)(D だいが) 大学の学部の会議'
|
||||
D = 1
|
||||
; # Example: '(L (D ドゥ)+(D ヒ))'
|
||||
D^ = 1
|
||||
; # 助詞、助動詞、接辞の言い直し
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか'
|
||||
D2 = 1
|
||||
; # Example: '(X (D2 ノ))'
|
||||
D2^ = 1
|
||||
; # 聞き取りや語彙の判断に自信がない場合
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: (? 字数) の
|
||||
; # If no option: empty string is returned regardless of output
|
||||
; # Example: '(?) で'
|
||||
? = 0
|
||||
; # Example: '(D (? すー))+そう+です+よ+ね'
|
||||
?^ = 0
|
||||
; # タグ?で、値は複数の候補が想定される場合
|
||||
; # 0 for main guess with matching morph info, 1 for second guess
|
||||
; # Example: '(? 次数, 実数)', '(? これ,ここで)+(? 説明+し+た+方+が+いい+か+な)'
|
||||
?, = 0
|
||||
; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ+タ,マス))'
|
||||
?,^ = 0
|
||||
; # 音や言葉に関するメタ的な引用
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)'
|
||||
M = 0
|
||||
; # Example: '(L (M ヒ)+(M ヒ))', '(L (M (? ヒ+ヒ)))'
|
||||
M^ = 0
|
||||
; # 外国語や古語、方言など
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(O ザッツファイン)'
|
||||
O = 0
|
||||
; # Example: '(笑 (O エクスキューズ+ミー))', '(笑 メダッ+テ+(O ナンボ))'
|
||||
O^ = 0
|
||||
; # 講演者の名前、差別語、誹謗中傷など
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '国語研の (R ××) です'
|
||||
R = 0
|
||||
R^ = 0
|
||||
; # 非朗読対象発話(朗読における言い間違い等)
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(X 実際は) 実際には'
|
||||
X = 0
|
||||
; # Example: '(L (X (D2 ニ)))'
|
||||
X^ = 0
|
||||
; # アルファベットや算用数字、記号の表記
|
||||
; # 0 to use Japanese form, 1 to use alphabet form
|
||||
; # Example: '(A シーディーアール;CD-R)'
|
||||
A = 1
|
||||
; # Example: 'スモール(A エヌ;N)', 'ラージ(A キュー;Q)', '(A ティーエフ;TF)+(A アイディーエフ;IDF)' (Strung together by pron: '(W (? ティーワイド);ティーエフ+アイディーエフ)')
|
||||
A^ = 1
|
||||
; # タグAで、単語は算用数字の場合
|
||||
; # 0 to use Japanese form, 1 to use Arabic numerals
|
||||
; # Example: (A 二千;2000)
|
||||
A_num = eval:self.notag
|
||||
A_num^ = eval:self.notag
|
||||
A_num = 0
|
||||
; # 何らかの原因で漢字表記できなくなった場合
|
||||
; # 0 to use broken form, 1 to use orthodox form
|
||||
; # Example: '(K たち (F えー) ばな;橘)'
|
||||
K = 1
|
||||
; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)'
|
||||
K^ = 1
|
||||
; # 転訛、発音の怠けなど、一時的な発音エラー
|
||||
; # 0 to use wrong form, 1 to use orthodox form
|
||||
; # Example: '(W ギーツ;ギジュツ)'
|
||||
W = 1
|
||||
; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)'
|
||||
W^ = 1
|
||||
; # 語の読みに関する知識レベルのいい間違い
|
||||
; # 0 to use wrong form, 1 to use orthodox form
|
||||
; # Example: '(B シブタイ;ジュータイ)'
|
||||
B = 0
|
||||
; # Example: 'データー(B カズ;スー)'
|
||||
B^ = 0
|
||||
; # 笑いながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(笑 ナニガ)', '(笑 (F エー)+ソー+イッ+タ+ヨー+ナ)'
|
||||
笑 = 0
|
||||
; # Example: 'コク(笑 サイ+(D オン))',
|
||||
笑^ = 0
|
||||
; # 泣きながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(泣 ドンナニ)'
|
||||
泣 = 0
|
||||
泣^ = 0
|
||||
; # 咳をしながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: 'シャ(咳 リン) ノ'
|
||||
咳 = 0
|
||||
; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)'
|
||||
咳^ = 0
|
||||
; # ささやき声や独り言などの小さな声
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))+(? セツメー+シ+タ+ホー+ガ+イー+カ+ナ))'
|
||||
L = 0
|
||||
; # Example: 'デ(L ス)', 'ッ(L テ+コ)ト'
|
||||
L^ = 0
|
||||
|
||||
[REPLACEMENTS]
|
||||
; # ボーカルフライなどで母音が同定できない場合
|
||||
<FV> =
|
||||
; # 「うん/うーん/ふーん」の音の特定が困難な場合
|
||||
<VN> =
|
||||
; # 非語彙的な母音の引き延ばし
|
||||
<H> =
|
||||
; # 非語彙的な子音の引き延ばし
|
||||
<Q> =
|
||||
; # 言語音と独立に講演者の笑いが生じている場合
|
||||
<笑> =
|
||||
; # 言語音と独立に講演者の咳が生じている場合
|
||||
<咳> =
|
||||
; # 言語音と独立に講演者の息が生じている場合
|
||||
<息> =
|
||||
; # 講演者の泣き声
|
||||
<泣> =
|
||||
; # 聴衆(司会者なども含む)の発話
|
||||
<フロア発話> =
|
||||
; # 聴衆の笑い
|
||||
<フロア笑> =
|
||||
; # 聴衆の拍手
|
||||
<拍手> =
|
||||
; # 講演者が発表中に用いたデモンストレーションの音声
|
||||
<デモ> =
|
||||
; # 学会講演に発表時間を知らせるためにならすベルの音
|
||||
<ベル> =
|
||||
; # 転記単位全体が再度読み直された場合
|
||||
<朗読間違い> =
|
||||
; # 上記以外の音で特に目立った音
|
||||
<雑音> =
|
||||
; # 0.2秒以上のポーズ
|
||||
<P> =
|
||||
; # Redacted information, for R
|
||||
; # It is \x00D7 multiplication sign, not your normal 'x'
|
||||
× = ×
|
||||
|
||||
[FIELDS]
|
||||
; # Time information for segment
|
||||
time = 3
|
||||
; # Word surface
|
||||
surface = 5
|
||||
; # Word surface root form without CSJ tags
|
||||
notag = 9
|
||||
; # Part Of Speech
|
||||
pos1 = 11
|
||||
; # Conjugated Form
|
||||
cForm = 12
|
||||
; # Conjugation Type
|
||||
cType1 = 13
|
||||
; # Subcategory of POS
|
||||
pos2 = 14
|
||||
; # Euphonic Change / Subcategory of Conjugation Type
|
||||
cType2 = 15
|
||||
; # Other information
|
||||
other = 16
|
||||
; # Pronunciation for lexicon
|
||||
pron = 10
|
||||
; # Speaker ID
|
||||
spk_id = 2
|
||||
|
||||
[KATAKANA2ROMAJI]
|
||||
ア = 'a
|
||||
イ = 'i
|
||||
ウ = 'u
|
||||
エ = 'e
|
||||
オ = 'o
|
||||
カ = ka
|
||||
キ = ki
|
||||
ク = ku
|
||||
ケ = ke
|
||||
コ = ko
|
||||
ガ = ga
|
||||
ギ = gi
|
||||
グ = gu
|
||||
ゲ = ge
|
||||
ゴ = go
|
||||
サ = sa
|
||||
シ = si
|
||||
ス = su
|
||||
セ = se
|
||||
ソ = so
|
||||
ザ = za
|
||||
ジ = zi
|
||||
ズ = zu
|
||||
ゼ = ze
|
||||
ゾ = zo
|
||||
タ = ta
|
||||
チ = ti
|
||||
ツ = tu
|
||||
テ = te
|
||||
ト = to
|
||||
ダ = da
|
||||
ヂ = di
|
||||
ヅ = du
|
||||
デ = de
|
||||
ド = do
|
||||
ナ = na
|
||||
ニ = ni
|
||||
ヌ = nu
|
||||
ネ = ne
|
||||
ノ = no
|
||||
ハ = ha
|
||||
ヒ = hi
|
||||
フ = hu
|
||||
ヘ = he
|
||||
ホ = ho
|
||||
バ = ba
|
||||
ビ = bi
|
||||
ブ = bu
|
||||
ベ = be
|
||||
ボ = bo
|
||||
パ = pa
|
||||
ピ = pi
|
||||
プ = pu
|
||||
ペ = pe
|
||||
ポ = po
|
||||
マ = ma
|
||||
ミ = mi
|
||||
ム = mu
|
||||
メ = me
|
||||
モ = mo
|
||||
ヤ = ya
|
||||
ユ = yu
|
||||
ヨ = yo
|
||||
ラ = ra
|
||||
リ = ri
|
||||
ル = ru
|
||||
レ = re
|
||||
ロ = ro
|
||||
ワ = wa
|
||||
ヰ = we
|
||||
ヱ = wi
|
||||
ヲ = wo
|
||||
ン = ŋ
|
||||
ッ = q
|
||||
ー = -
|
||||
キャ = kǐa
|
||||
キュ = kǐu
|
||||
キョ = kǐo
|
||||
ギャ = gǐa
|
||||
ギュ = gǐu
|
||||
ギョ = gǐo
|
||||
シャ = sǐa
|
||||
シュ = sǐu
|
||||
ショ = sǐo
|
||||
ジャ = zǐa
|
||||
ジュ = zǐu
|
||||
ジョ = zǐo
|
||||
チャ = tǐa
|
||||
チュ = tǐu
|
||||
チョ = tǐo
|
||||
ヂャ = dǐa
|
||||
ヂュ = dǐu
|
||||
ヂョ = dǐo
|
||||
ニャ = nǐa
|
||||
ニュ = nǐu
|
||||
ニョ = nǐo
|
||||
ヒャ = hǐa
|
||||
ヒュ = hǐu
|
||||
ヒョ = hǐo
|
||||
ビャ = bǐa
|
||||
ビュ = bǐu
|
||||
ビョ = bǐo
|
||||
ピャ = pǐa
|
||||
ピュ = pǐu
|
||||
ピョ = pǐo
|
||||
ミャ = mǐa
|
||||
ミュ = mǐu
|
||||
ミョ = mǐo
|
||||
リャ = rǐa
|
||||
リュ = rǐu
|
||||
リョ = rǐo
|
||||
ァ = a
|
||||
ィ = i
|
||||
ゥ = u
|
||||
ェ = e
|
||||
ォ = o
|
||||
ヮ = ʍ
|
||||
ヴ = vu
|
||||
ャ = ǐa
|
||||
ュ = ǐu
|
||||
ョ = ǐo
|
||||
|
||||
@ -1,320 +1,79 @@
|
||||
; # This section is ignored if this file is not supplied as the first config file to
|
||||
; # lhotse prepare csj
|
||||
[SEGMENTS]
|
||||
; # Allowed period of nonverbal noise. If exceeded, a new segment is created.
|
||||
gap = 0.5
|
||||
; # Maximum length of segment (s).
|
||||
maxlen = 10
|
||||
; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently.
|
||||
minlen = 0.02
|
||||
; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`.
|
||||
; # Pass an empty string to avoid adding any symbol. It was "<sp>" in kaldi.
|
||||
; # If you intend to use a multicharacter string for gap_sym, remember to register the
|
||||
; # multicharacter string as part of userdef-string in prepare_lang_char.py.
|
||||
gap_sym =
|
||||
|
||||
[CONSTANTS]
|
||||
; # Name of this mode
|
||||
MODE = number
|
||||
; # Suffixes to use after the word surface (no longer used)
|
||||
MORPH = pos1 cForm cType2 pos2
|
||||
; # Used to differentiate between A tag and A_num tag
|
||||
JPN_NUM = ゼロ 0 零 一 二 三 四 五 六 七 八 九 十 百 千 .
|
||||
; # Dummy character to delineate multiline words
|
||||
PLUS = +
|
||||
|
||||
[DECISIONS]
|
||||
; # TAG+'^'とは、タグが一つの転記単位に独立していない場合
|
||||
; # The PLUS (fullwidth) sign '+' marks line boundaries for multiline entries
|
||||
|
||||
; # フィラー、感情表出系感動詞
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(F ぎょっ)'
|
||||
F = 1
|
||||
; # Example: '(L (F ン))', '比べ(F えー)る'
|
||||
F^ = 1
|
||||
; # 言い直し、いいよどみなどによる語断片
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(D だ)(D だいが) 大学の学部の会議'
|
||||
D = 1
|
||||
; # Example: '(L (D ドゥ)+(D ヒ))'
|
||||
D^ = 1
|
||||
; # 助詞、助動詞、接辞の言い直し
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか'
|
||||
D2 = 1
|
||||
; # Example: '(X (D2 ノ))'
|
||||
D2^ = 1
|
||||
; # 聞き取りや語彙の判断に自信がない場合
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: (? 字数) の
|
||||
; # If no option: empty string is returned regardless of output
|
||||
; # Example: '(?) で'
|
||||
? = 0
|
||||
; # Example: '(D (? すー))+そう+です+よ+ね'
|
||||
?^ = 0
|
||||
; # タグ?で、値は複数の候補が想定される場合
|
||||
; # 0 for main guess with matching morph info, 1 for second guess
|
||||
; # Example: '(? 次数, 実数)', '(? これ,ここで)+(? 説明+し+た+方+が+いい+か+な)'
|
||||
?, = 0
|
||||
; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ+タ,マス))'
|
||||
?,^ = 0
|
||||
; # 音や言葉に関するメタ的な引用
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)'
|
||||
M = 0
|
||||
; # Example: '(L (M ヒ)+(M ヒ))', '(L (M (? ヒ+ヒ)))'
|
||||
M^ = 0
|
||||
; # 外国語や古語、方言など
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(O ザッツファイン)'
|
||||
O = 0
|
||||
; # Example: '(笑 (O エクスキューズ+ミー))', '(笑 メダッ+テ+(O ナンボ))'
|
||||
O^ = 0
|
||||
; # 講演者の名前、差別語、誹謗中傷など
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '国語研の (R ××) です'
|
||||
R = 0
|
||||
R^ = 0
|
||||
; # 非朗読対象発話(朗読における言い間違い等)
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(X 実際は) 実際には'
|
||||
X = 0
|
||||
; # Example: '(L (X (D2 ニ)))'
|
||||
X^ = 0
|
||||
; # アルファベットや算用数字、記号の表記
|
||||
; # 0 to use Japanese form, 1 to use alphabet form
|
||||
; # Example: '(A シーディーアール;CD-R)'
|
||||
A = 1
|
||||
; # Example: 'スモール(A エヌ;N)', 'ラージ(A キュー;Q)', '(A ティーエフ;TF)+(A アイディーエフ;IDF)' (Strung together by pron: '(W (? ティーワイド);ティーエフ+アイディーエフ)')
|
||||
A^ = 1
|
||||
; # タグAで、単語は算用数字の場合
|
||||
; # 0 to use Japanese form, 1 to use Arabic numerals
|
||||
; # Example: (A 二千;2000)
|
||||
A_num = 1
|
||||
A_num^ = 1
|
||||
; # 何らかの原因で漢字表記できなくなった場合
|
||||
; # 0 to use broken form, 1 to use orthodox form
|
||||
; # Example: '(K たち (F えー) ばな;橘)'
|
||||
K = 1
|
||||
; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)'
|
||||
K^ = 1
|
||||
; # 転訛、発音の怠けなど、一時的な発音エラー
|
||||
; # 0 to use wrong form, 1 to use orthodox form
|
||||
; # Example: '(W ギーツ;ギジュツ)'
|
||||
W = 1
|
||||
; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)'
|
||||
W^ = 1
|
||||
; # 語の読みに関する知識レベルのいい間違い
|
||||
; # 0 to use wrong form, 1 to use orthodox form
|
||||
; # Example: '(B シブタイ;ジュータイ)'
|
||||
B = 0
|
||||
; # Example: 'データー(B カズ;スー)'
|
||||
B^ = 0
|
||||
; # 笑いながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(笑 ナニガ)', '(笑 (F エー)+ソー+イッ+タ+ヨー+ナ)'
|
||||
笑 = 0
|
||||
; # Example: 'コク(笑 サイ+(D オン))',
|
||||
笑^ = 0
|
||||
; # 泣きながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(泣 ドンナニ)'
|
||||
泣 = 0
|
||||
泣^ = 0
|
||||
; # 咳をしながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: 'シャ(咳 リン) ノ'
|
||||
咳 = 0
|
||||
; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)'
|
||||
咳^ = 0
|
||||
; # ささやき声や独り言などの小さな声
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))+(? セツメー+シ+タ+ホー+ガ+イー+カ+ナ))'
|
||||
L = 0
|
||||
; # Example: 'デ(L ス)', 'ッ(L テ+コ)ト'
|
||||
L^ = 0
|
||||
|
||||
[REPLACEMENTS]
|
||||
; # ボーカルフライなどで母音が同定できない場合
|
||||
<FV> =
|
||||
; # 「うん/うーん/ふーん」の音の特定が困難な場合
|
||||
<VN> =
|
||||
; # 非語彙的な母音の引き延ばし
|
||||
<H> =
|
||||
; # 非語彙的な子音の引き延ばし
|
||||
<Q> =
|
||||
; # 言語音と独立に講演者の笑いが生じている場合
|
||||
<笑> =
|
||||
; # 言語音と独立に講演者の咳が生じている場合
|
||||
<咳> =
|
||||
; # 言語音と独立に講演者の息が生じている場合
|
||||
<息> =
|
||||
; # 講演者の泣き声
|
||||
<泣> =
|
||||
; # 聴衆(司会者なども含む)の発話
|
||||
<フロア発話> =
|
||||
; # 聴衆の笑い
|
||||
<フロア笑> =
|
||||
; # 聴衆の拍手
|
||||
<拍手> =
|
||||
; # 講演者が発表中に用いたデモンストレーションの音声
|
||||
<デモ> =
|
||||
; # 学会講演に発表時間を知らせるためにならすベルの音
|
||||
<ベル> =
|
||||
; # 転記単位全体が再度読み直された場合
|
||||
<朗読間違い> =
|
||||
; # 上記以外の音で特に目立った音
|
||||
<雑音> =
|
||||
; # 0.2秒以上のポーズ
|
||||
<P> =
|
||||
; # Redacted information, for R
|
||||
; # It is \x00D7 multiplication sign, not your normal 'x'
|
||||
× = ×
|
||||
|
||||
[FIELDS]
|
||||
; # Time information for segment
|
||||
time = 3
|
||||
; # Word surface
|
||||
surface = 5
|
||||
; # Word surface root form without CSJ tags
|
||||
notag = 9
|
||||
; # Part Of Speech
|
||||
pos1 = 11
|
||||
; # Conjugated Form
|
||||
cForm = 12
|
||||
; # Conjugation Type
|
||||
cType1 = 13
|
||||
; # Subcategory of POS
|
||||
pos2 = 14
|
||||
; # Euphonic Change / Subcategory of Conjugation Type
|
||||
cType2 = 15
|
||||
; # Other information
|
||||
other = 16
|
||||
; # Pronunciation for lexicon
|
||||
pron = 10
|
||||
; # Speaker ID
|
||||
spk_id = 2
|
||||
|
||||
[KATAKANA2ROMAJI]
|
||||
ア = 'a
|
||||
イ = 'i
|
||||
ウ = 'u
|
||||
エ = 'e
|
||||
オ = 'o
|
||||
カ = ka
|
||||
キ = ki
|
||||
ク = ku
|
||||
ケ = ke
|
||||
コ = ko
|
||||
ガ = ga
|
||||
ギ = gi
|
||||
グ = gu
|
||||
ゲ = ge
|
||||
ゴ = go
|
||||
サ = sa
|
||||
シ = si
|
||||
ス = su
|
||||
セ = se
|
||||
ソ = so
|
||||
ザ = za
|
||||
ジ = zi
|
||||
ズ = zu
|
||||
ゼ = ze
|
||||
ゾ = zo
|
||||
タ = ta
|
||||
チ = ti
|
||||
ツ = tu
|
||||
テ = te
|
||||
ト = to
|
||||
ダ = da
|
||||
ヂ = di
|
||||
ヅ = du
|
||||
デ = de
|
||||
ド = do
|
||||
ナ = na
|
||||
ニ = ni
|
||||
ヌ = nu
|
||||
ネ = ne
|
||||
ノ = no
|
||||
ハ = ha
|
||||
ヒ = hi
|
||||
フ = hu
|
||||
ヘ = he
|
||||
ホ = ho
|
||||
バ = ba
|
||||
ビ = bi
|
||||
ブ = bu
|
||||
ベ = be
|
||||
ボ = bo
|
||||
パ = pa
|
||||
ピ = pi
|
||||
プ = pu
|
||||
ペ = pe
|
||||
ポ = po
|
||||
マ = ma
|
||||
ミ = mi
|
||||
ム = mu
|
||||
メ = me
|
||||
モ = mo
|
||||
ヤ = ya
|
||||
ユ = yu
|
||||
ヨ = yo
|
||||
ラ = ra
|
||||
リ = ri
|
||||
ル = ru
|
||||
レ = re
|
||||
ロ = ro
|
||||
ワ = wa
|
||||
ヰ = we
|
||||
ヱ = wi
|
||||
ヲ = wo
|
||||
ン = ŋ
|
||||
ッ = q
|
||||
ー = -
|
||||
キャ = kǐa
|
||||
キュ = kǐu
|
||||
キョ = kǐo
|
||||
ギャ = gǐa
|
||||
ギュ = gǐu
|
||||
ギョ = gǐo
|
||||
シャ = sǐa
|
||||
シュ = sǐu
|
||||
ショ = sǐo
|
||||
ジャ = zǐa
|
||||
ジュ = zǐu
|
||||
ジョ = zǐo
|
||||
チャ = tǐa
|
||||
チュ = tǐu
|
||||
チョ = tǐo
|
||||
ヂャ = dǐa
|
||||
ヂュ = dǐu
|
||||
ヂョ = dǐo
|
||||
ニャ = nǐa
|
||||
ニュ = nǐu
|
||||
ニョ = nǐo
|
||||
ヒャ = hǐa
|
||||
ヒュ = hǐu
|
||||
ヒョ = hǐo
|
||||
ビャ = bǐa
|
||||
ビュ = bǐu
|
||||
ビョ = bǐo
|
||||
ピャ = pǐa
|
||||
ピュ = pǐu
|
||||
ピョ = pǐo
|
||||
ミャ = mǐa
|
||||
ミュ = mǐu
|
||||
ミョ = mǐo
|
||||
リャ = rǐa
|
||||
リュ = rǐu
|
||||
リョ = rǐo
|
||||
ァ = a
|
||||
ィ = i
|
||||
ゥ = u
|
||||
ェ = e
|
||||
ォ = o
|
||||
ヮ = ʍ
|
||||
ヴ = vu
|
||||
ャ = ǐa
|
||||
ュ = ǐu
|
||||
ョ = ǐo
|
||||
|
||||
@ -1,321 +1,80 @@
|
||||
; # This section is ignored if this file is not supplied as the first config file to
|
||||
; # lhotse prepare csj
|
||||
[SEGMENTS]
|
||||
; # Allowed period of nonverbal noise. If exceeded, a new segment is created.
|
||||
gap = 0.5
|
||||
; # Maximum length of segment (s).
|
||||
maxlen = 10
|
||||
; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently.
|
||||
minlen = 0.02
|
||||
; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`.
|
||||
; # Pass an empty string to avoid adding any symbol. It was "<sp>" in kaldi.
|
||||
; # If you intend to use a multicharacter string for gap_sym, remember to register the
|
||||
; # multicharacter string as part of userdef-string in prepare_lang_char.py.
|
||||
gap_sym =
|
||||
|
||||
[CONSTANTS]
|
||||
; # Name of this mode
|
||||
; # See https://www.isca-speech.org/archive/pdfs/interspeech_2022/horii22_interspeech.pdf
|
||||
; # From https://www.isca-speech.org/archive/pdfs/interspeech_2022/horii22_interspeech.pdf
|
||||
MODE = symbol
|
||||
; # Suffixes to use after the word surface (no longer used)
|
||||
MORPH = pos1 cForm cType2 pos2
|
||||
; # Used to differentiate between A tag and A_num tag
|
||||
JPN_NUM = ゼロ 0 零 一 二 三 四 五 六 七 八 九 十 百 千 .
|
||||
; # Dummy character to delineate multiline words
|
||||
PLUS = +
|
||||
|
||||
[DECISIONS]
|
||||
; # TAG+'^'とは、タグが一つの転記単位に独立していない場合
|
||||
; # The PLUS (fullwidth) sign '+' marks line boundaries for multiline entries
|
||||
|
||||
; # フィラー、感情表出系感動詞
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(F ぎょっ)'
|
||||
F = #
|
||||
; # Example: '(L (F ン))', '比べ(F えー)る'
|
||||
F^ = #
|
||||
F = "#", ["F"]
|
||||
; # 言い直し、いいよどみなどによる語断片
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(D だ)(D だいが) 大学の学部の会議'
|
||||
D = @
|
||||
; # Example: '(L (D ドゥ)+(D ヒ))'
|
||||
D^ = @
|
||||
D = "@", ["D"]
|
||||
; # 助詞、助動詞、接辞の言い直し
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか'
|
||||
D2 = @
|
||||
; # Example: '(X (D2 ノ))'
|
||||
D2^ = @
|
||||
D2 = "@", ["D2"]
|
||||
; # 聞き取りや語彙の判断に自信がない場合
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: (? 字数) の
|
||||
; # If no option: empty string is returned regardless of output
|
||||
; # Example: '(?) で'
|
||||
? = 0
|
||||
; # Example: '(D (? すー))+そう+です+よ+ね'
|
||||
?^ = 0
|
||||
; # タグ?で、値は複数の候補が想定される場合
|
||||
; # 0 for main guess with matching morph info, 1 for second guess
|
||||
; # Example: '(? 次数, 実数)', '(? これ,ここで)+(? 説明+し+た+方+が+いい+か+な)'
|
||||
?, = 0
|
||||
; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ+タ,マス))'
|
||||
?,^ = 0
|
||||
; # 音や言葉に関するメタ的な引用
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)'
|
||||
M = 0
|
||||
; # Example: '(L (M ヒ)+(M ヒ))', '(L (M (? ヒ+ヒ)))'
|
||||
M^ = 0
|
||||
; # 外国語や古語、方言など
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(O ザッツファイン)'
|
||||
O = 0
|
||||
; # Example: '(笑 (O エクスキューズ+ミー))', '(笑 メダッ+テ+(O ナンボ))'
|
||||
O^ = 0
|
||||
; # 講演者の名前、差別語、誹謗中傷など
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '国語研の (R ××) です'
|
||||
R = 0
|
||||
R^ = 0
|
||||
; # 非朗読対象発話(朗読における言い間違い等)
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(X 実際は) 実際には'
|
||||
X = 0
|
||||
; # Example: '(L (X (D2 ニ)))'
|
||||
X^ = 0
|
||||
; # アルファベットや算用数字、記号の表記
|
||||
; # 0 to use Japanese form, 1 to use alphabet form
|
||||
; # Example: '(A シーディーアール;CD-R)'
|
||||
A = 1
|
||||
; # Example: 'スモール(A エヌ;N)', 'ラージ(A キュー;Q)', '(A ティーエフ;TF)+(A アイディーエフ;IDF)' (Strung together by pron: '(W (? ティーワイド);ティーエフ+アイディーエフ)')
|
||||
A^ = 1
|
||||
; # タグAで、単語は算用数字の場合
|
||||
; # 0 to use Japanese form, 1 to use Arabic numerals
|
||||
; # Example: (A 二千;2000)
|
||||
A_num = eval:self.notag
|
||||
A_num^ = eval:self.notag
|
||||
A_num = 1
|
||||
; # 何らかの原因で漢字表記できなくなった場合
|
||||
; # 0 to use broken form, 1 to use orthodox form
|
||||
; # Example: '(K たち (F えー) ばな;橘)'
|
||||
K = 1
|
||||
; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)'
|
||||
K^ = 1
|
||||
; # 転訛、発音の怠けなど、一時的な発音エラー
|
||||
; # 0 to use wrong form, 1 to use orthodox form
|
||||
; # Example: '(W ギーツ;ギジュツ)'
|
||||
W = 1
|
||||
; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)'
|
||||
W^ = 1
|
||||
; # 語の読みに関する知識レベルのいい間違い
|
||||
; # 0 to use wrong form, 1 to use orthodox form
|
||||
; # Example: '(B シブタイ;ジュータイ)'
|
||||
B = 0
|
||||
; # Example: 'データー(B カズ;スー)'
|
||||
B^ = 0
|
||||
; # 笑いながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(笑 ナニガ)', '(笑 (F エー)+ソー+イッ+タ+ヨー+ナ)'
|
||||
笑 = 0
|
||||
; # Example: 'コク(笑 サイ+(D オン))',
|
||||
笑^ = 0
|
||||
; # 泣きながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(泣 ドンナニ)'
|
||||
泣 = 0
|
||||
泣^ = 0
|
||||
; # 咳をしながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: 'シャ(咳 リン) ノ'
|
||||
咳 = 0
|
||||
; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)'
|
||||
咳^ = 0
|
||||
; # ささやき声や独り言などの小さな声
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))+(? セツメー+シ+タ+ホー+ガ+イー+カ+ナ))'
|
||||
L = 0
|
||||
; # Example: 'デ(L ス)', 'ッ(L テ+コ)ト'
|
||||
L^ = 0
|
||||
|
||||
[REPLACEMENTS]
|
||||
; # ボーカルフライなどで母音が同定できない場合
|
||||
<FV> =
|
||||
; # 「うん/うーん/ふーん」の音の特定が困難な場合
|
||||
<VN> =
|
||||
; # 非語彙的な母音の引き延ばし
|
||||
<H> =
|
||||
; # 非語彙的な子音の引き延ばし
|
||||
<Q> =
|
||||
; # 言語音と独立に講演者の笑いが生じている場合
|
||||
<笑> =
|
||||
; # 言語音と独立に講演者の咳が生じている場合
|
||||
<咳> =
|
||||
; # 言語音と独立に講演者の息が生じている場合
|
||||
<息> =
|
||||
; # 講演者の泣き声
|
||||
<泣> =
|
||||
; # 聴衆(司会者なども含む)の発話
|
||||
<フロア発話> =
|
||||
; # 聴衆の笑い
|
||||
<フロア笑> =
|
||||
; # 聴衆の拍手
|
||||
<拍手> =
|
||||
; # 講演者が発表中に用いたデモンストレーションの音声
|
||||
<デモ> =
|
||||
; # 学会講演に発表時間を知らせるためにならすベルの音
|
||||
<ベル> =
|
||||
; # 転記単位全体が再度読み直された場合
|
||||
<朗読間違い> =
|
||||
; # 上記以外の音で特に目立った音
|
||||
<雑音> =
|
||||
; # 0.2秒以上のポーズ
|
||||
<P> =
|
||||
; # Redacted information, for R
|
||||
; # It is \x00D7 multiplication sign, not your normal 'x'
|
||||
× = ×
|
||||
|
||||
[FIELDS]
|
||||
; # Time information for segment
|
||||
time = 3
|
||||
; # Word surface
|
||||
surface = 5
|
||||
; # Word surface root form without CSJ tags
|
||||
notag = 9
|
||||
; # Part Of Speech
|
||||
pos1 = 11
|
||||
; # Conjugated Form
|
||||
cForm = 12
|
||||
; # Conjugation Type
|
||||
cType1 = 13
|
||||
; # Subcategory of POS
|
||||
pos2 = 14
|
||||
; # Euphonic Change / Subcategory of Conjugation Type
|
||||
cType2 = 15
|
||||
; # Other information
|
||||
other = 16
|
||||
; # Pronunciation for lexicon
|
||||
pron = 10
|
||||
; # Speaker ID
|
||||
spk_id = 2
|
||||
|
||||
[KATAKANA2ROMAJI]
|
||||
ア = 'a
|
||||
イ = 'i
|
||||
ウ = 'u
|
||||
エ = 'e
|
||||
オ = 'o
|
||||
カ = ka
|
||||
キ = ki
|
||||
ク = ku
|
||||
ケ = ke
|
||||
コ = ko
|
||||
ガ = ga
|
||||
ギ = gi
|
||||
グ = gu
|
||||
ゲ = ge
|
||||
ゴ = go
|
||||
サ = sa
|
||||
シ = si
|
||||
ス = su
|
||||
セ = se
|
||||
ソ = so
|
||||
ザ = za
|
||||
ジ = zi
|
||||
ズ = zu
|
||||
ゼ = ze
|
||||
ゾ = zo
|
||||
タ = ta
|
||||
チ = ti
|
||||
ツ = tu
|
||||
テ = te
|
||||
ト = to
|
||||
ダ = da
|
||||
ヂ = di
|
||||
ヅ = du
|
||||
デ = de
|
||||
ド = do
|
||||
ナ = na
|
||||
ニ = ni
|
||||
ヌ = nu
|
||||
ネ = ne
|
||||
ノ = no
|
||||
ハ = ha
|
||||
ヒ = hi
|
||||
フ = hu
|
||||
ヘ = he
|
||||
ホ = ho
|
||||
バ = ba
|
||||
ビ = bi
|
||||
ブ = bu
|
||||
ベ = be
|
||||
ボ = bo
|
||||
パ = pa
|
||||
ピ = pi
|
||||
プ = pu
|
||||
ペ = pe
|
||||
ポ = po
|
||||
マ = ma
|
||||
ミ = mi
|
||||
ム = mu
|
||||
メ = me
|
||||
モ = mo
|
||||
ヤ = ya
|
||||
ユ = yu
|
||||
ヨ = yo
|
||||
ラ = ra
|
||||
リ = ri
|
||||
ル = ru
|
||||
レ = re
|
||||
ロ = ro
|
||||
ワ = wa
|
||||
ヰ = we
|
||||
ヱ = wi
|
||||
ヲ = wo
|
||||
ン = ŋ
|
||||
ッ = q
|
||||
ー = -
|
||||
キャ = kǐa
|
||||
キュ = kǐu
|
||||
キョ = kǐo
|
||||
ギャ = gǐa
|
||||
ギュ = gǐu
|
||||
ギョ = gǐo
|
||||
シャ = sǐa
|
||||
シュ = sǐu
|
||||
ショ = sǐo
|
||||
ジャ = zǐa
|
||||
ジュ = zǐu
|
||||
ジョ = zǐo
|
||||
チャ = tǐa
|
||||
チュ = tǐu
|
||||
チョ = tǐo
|
||||
ヂャ = dǐa
|
||||
ヂュ = dǐu
|
||||
ヂョ = dǐo
|
||||
ニャ = nǐa
|
||||
ニュ = nǐu
|
||||
ニョ = nǐo
|
||||
ヒャ = hǐa
|
||||
ヒュ = hǐu
|
||||
ヒョ = hǐo
|
||||
ビャ = bǐa
|
||||
ビュ = bǐu
|
||||
ビョ = bǐo
|
||||
ピャ = pǐa
|
||||
ピュ = pǐu
|
||||
ピョ = pǐo
|
||||
ミャ = mǐa
|
||||
ミュ = mǐu
|
||||
ミョ = mǐo
|
||||
リャ = rǐa
|
||||
リュ = rǐu
|
||||
リョ = rǐo
|
||||
ァ = a
|
||||
ィ = i
|
||||
ゥ = u
|
||||
ェ = e
|
||||
ォ = o
|
||||
ヮ = ʍ
|
||||
ヴ = vu
|
||||
ャ = ǐa
|
||||
ュ = ǐu
|
||||
ョ = ǐo
|
||||
|
||||
202
egs/csj/ASR/local/disfluent_recogs_to_fluent.py
Normal file
202
egs/csj/ASR/local/disfluent_recogs_to_fluent.py
Normal file
@ -0,0 +1,202 @@
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import kaldialign
|
||||
from lhotse import CutSet
|
||||
|
||||
ARGPARSE_DESCRIPTION = """
|
||||
This helper code takes in a disfluent recogs file generated from icefall.utils.store_transcript,
|
||||
compares it against a fluent transcript, and saves the results in a separate directory.
|
||||
This is useful to compare disfluent models with fluent models on the same metric.
|
||||
|
||||
"""
|
||||
|
||||
|
||||
def get_args():
|
||||
parser = argparse.ArgumentParser(
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
||||
description=ARGPARSE_DESCRIPTION,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--recogs",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="Path to the recogs-XXX file generated by icefall.utils.store_transcript.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cut",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="Path to the cut manifest to be compared to. Assumes that disfluent_tag exists in the custom dict.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--res-dir", type=Path, required=True, help="Path to save results"
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def d2f(stats):
|
||||
"""
|
||||
Compare the outputs of a disfluent model against a fluent reference.
|
||||
Indicates a disfluent model's performance only on the content words
|
||||
|
||||
CER^d_f = (sub_f + ins + del_f) / Nf
|
||||
|
||||
"""
|
||||
return stats["base"] / stats["Nf"]
|
||||
|
||||
|
||||
def calc_cer(refs, hyps):
|
||||
subs = {
|
||||
"F": 0,
|
||||
"D": 0,
|
||||
}
|
||||
ins = 0
|
||||
dels = {
|
||||
"F": 0,
|
||||
"D": 0,
|
||||
}
|
||||
cors = {
|
||||
"F": 0,
|
||||
"D": 0,
|
||||
}
|
||||
dis_ref_len = 0
|
||||
flu_ref_len = 0
|
||||
|
||||
for ref, hyp in zip(refs, hyps):
|
||||
assert (
|
||||
ref[0] == hyp[0]
|
||||
), f"Expected ref cut id {ref[0]} to be the same as hyp cut id {hyp[0]}."
|
||||
tag = ref[2].copy()
|
||||
ref = ref[1]
|
||||
dis_ref_len += len(ref)
|
||||
# Remember that the 'D' and 'F' tags here refer to CSJ tags, not disfluent and fluent respectively.
|
||||
flu_ref_len += len([t for t in tag if ("D" not in t and "F" not in t)])
|
||||
hyp = hyp[1]
|
||||
ali = kaldialign.align(ref, hyp, "*")
|
||||
tags = ["*" if r[0] == "*" else tag.pop(0) for r in ali]
|
||||
for tag, (ref_word, hyp_word) in zip(tags, ali):
|
||||
if "D" in tag or "F" in tag:
|
||||
tag = "D"
|
||||
else:
|
||||
tag = "F"
|
||||
|
||||
if ref_word == "*":
|
||||
ins += 1
|
||||
elif hyp_word == "*":
|
||||
dels[tag] += 1
|
||||
elif ref_word != hyp_word:
|
||||
subs[tag] += 1
|
||||
else:
|
||||
cors[tag] += 1
|
||||
|
||||
return {
|
||||
"subs": subs,
|
||||
"ins": ins,
|
||||
"dels": dels,
|
||||
"cors": cors,
|
||||
"dis_ref_len": dis_ref_len,
|
||||
"flu_ref_len": flu_ref_len,
|
||||
}
|
||||
|
||||
|
||||
def for_each_recogs(recogs_file: Path, refs, out_dir):
|
||||
hyps = []
|
||||
with recogs_file.open() as fin:
|
||||
for line in fin:
|
||||
if "ref" in line:
|
||||
continue
|
||||
cutid, hyp = line.split(":\thyp=")
|
||||
hyps.append((cutid, eval(hyp)))
|
||||
|
||||
assert len(refs) == len(
|
||||
hyps
|
||||
), f"Expected refs len {len(refs)} and hyps len {len(hyps)} to be equal."
|
||||
stats = calc_cer(refs, hyps)
|
||||
stat_table = ["tag,yes,no"]
|
||||
|
||||
for cer_type in ["subs", "dels", "cors", "ins"]:
|
||||
ret = f"{cer_type}"
|
||||
for df in ["D", "F"]:
|
||||
try:
|
||||
ret += f",{stats[cer_type][df]}"
|
||||
except TypeError:
|
||||
# insertions do not belong to F or D, and is not subscriptable.
|
||||
ret += f",{stats[cer_type]},"
|
||||
break
|
||||
stat_table.append(ret)
|
||||
stat_table = "\n".join(stat_table)
|
||||
|
||||
stats = {
|
||||
"subd": stats["subs"]["D"],
|
||||
"deld": stats["dels"]["D"],
|
||||
"cord": stats["cors"]["D"],
|
||||
"Nf": stats["flu_ref_len"],
|
||||
"base": stats["subs"]["F"] + stats["ins"] + stats["dels"]["F"],
|
||||
}
|
||||
|
||||
cer = d2f(stats)
|
||||
results = [
|
||||
f"{cer:.2%}",
|
||||
f"Nf,{stats['Nf']}",
|
||||
]
|
||||
results = "\n".join(results)
|
||||
|
||||
with (out_dir / (recogs_file.stem + ".dfcer")).open("w") as fout:
|
||||
fout.write(results)
|
||||
fout.write("\n\n")
|
||||
fout.write(stat_table)
|
||||
|
||||
|
||||
def main():
|
||||
args = get_args()
|
||||
recogs_file: Path = args.recogs
|
||||
assert (
|
||||
recogs_file.is_file() or recogs_file.is_dir()
|
||||
), f"recogs_file cannot be found at {recogs_file}."
|
||||
|
||||
args.res_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if recogs_file.is_file() and recogs_file.stem.startswith("recogs-"):
|
||||
assert (
|
||||
"csj_cuts" in args.cut.name
|
||||
), f"Expected {args.cut} to be a cuts manifest."
|
||||
|
||||
refs: CutSet = CutSet.from_file(args.cut)
|
||||
refs = sorted(
|
||||
[
|
||||
(
|
||||
e.id,
|
||||
list(e.supervisions[0].custom["disfluent"]),
|
||||
e.supervisions[0].custom["disfluent_tag"].split(","),
|
||||
)
|
||||
for e in refs
|
||||
],
|
||||
key=lambda x: x[0],
|
||||
)
|
||||
for_each_recogs(recogs_file, refs, args.res_dir)
|
||||
|
||||
elif recogs_file.is_dir():
|
||||
recogs_file_path = recogs_file
|
||||
for partname in ["eval1", "eval2", "eval3", "excluded", "valid"]:
|
||||
refs: CutSet = CutSet.from_file(args.cut / f"csj_cuts_{partname}.jsonl.gz")
|
||||
refs = sorted(
|
||||
[
|
||||
(
|
||||
r.id,
|
||||
list(r.supervisions[0].custom["disfluent"]),
|
||||
r.supervisions[0].custom["disfluent_tag"].split(","),
|
||||
)
|
||||
for r in refs
|
||||
],
|
||||
key=lambda x: x[0],
|
||||
)
|
||||
for recogs_file in recogs_file_path.glob(f"recogs-{partname}-*.txt"):
|
||||
for_each_recogs(recogs_file, refs, args.res_dir)
|
||||
|
||||
else:
|
||||
raise TypeError(f"Unrecognised recogs file provided: {recogs_file}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -45,8 +45,8 @@ def get_parser():
|
||||
def main():
|
||||
args = get_parser()
|
||||
|
||||
for path in args.manifest_dir.glob("csj_cuts_*.jsonl.gz"):
|
||||
|
||||
for part in ["eval1", "eval2", "eval3", "valid", "excluded", "train"]:
|
||||
path = args.manifest_dir / f"csj_cuts_{part}.jsonl.gz"
|
||||
cuts: CutSet = load_manifest(path)
|
||||
|
||||
print("\n---------------------------------\n")
|
||||
@ -58,123 +58,271 @@ if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
"""
|
||||
## eval1
|
||||
Cuts count: 1272
|
||||
Total duration (hh:mm:ss): 01:50:07
|
||||
Speech duration (hh:mm:ss): 01:50:07 (100.0%)
|
||||
Duration statistics (seconds):
|
||||
mean 5.2
|
||||
std 3.9
|
||||
min 0.2
|
||||
25% 1.9
|
||||
50% 4.0
|
||||
75% 8.1
|
||||
99% 14.3
|
||||
99.5% 14.7
|
||||
99.9% 16.0
|
||||
max 16.9
|
||||
Recordings available: 1272
|
||||
Features available: 1272
|
||||
Supervisions available: 1272
|
||||
csj_cuts_eval1.jsonl.gz:
|
||||
Cut statistics:
|
||||
╒═══════════════════════════╤══════════╕
|
||||
│ Cuts count: │ 1023 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Total duration (hh:mm:ss) │ 01:55:40 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ mean │ 6.8 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ std │ 2.7 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ min │ 0.2 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 25% │ 4.9 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 50% │ 7.7 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 75% │ 9.0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 99% │ 10.0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 99.5% │ 10.0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 99.9% │ 10.0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ max │ 10.0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Recordings available: │ 1023 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Features available: │ 0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Supervisions available: │ 1023 │
|
||||
╘═══════════════════════════╧══════════╛
|
||||
SUPERVISION custom fields:
|
||||
- fluent (in 1272 cuts)
|
||||
- disfluent (in 1272 cuts)
|
||||
- number (in 1272 cuts)
|
||||
- symbol (in 1272 cuts)
|
||||
Speech duration statistics:
|
||||
╒══════════════════════════════╤══════════╤══════════════════════╕
|
||||
│ Total speech duration │ 01:55:40 │ 100.00% of recording │
|
||||
├──────────────────────────────┼──────────┼──────────────────────┤
|
||||
│ Total speaking time duration │ 01:55:40 │ 100.00% of recording │
|
||||
├──────────────────────────────┼──────────┼──────────────────────┤
|
||||
│ Total silence duration │ 00:00:00 │ 0.00% of recording │
|
||||
╘══════════════════════════════╧══════════╧══════════════════════╛
|
||||
|
||||
## eval2
|
||||
Cuts count: 1292
|
||||
Total duration (hh:mm:ss): 01:56:50
|
||||
Speech duration (hh:mm:ss): 01:56:50 (100.0%)
|
||||
Duration statistics (seconds):
|
||||
mean 5.4
|
||||
std 3.9
|
||||
min 0.1
|
||||
25% 2.1
|
||||
50% 4.6
|
||||
75% 8.6
|
||||
99% 14.1
|
||||
99.5% 15.2
|
||||
99.9% 16.1
|
||||
max 16.9
|
||||
Recordings available: 1292
|
||||
Features available: 1292
|
||||
Supervisions available: 1292
|
||||
SUPERVISION custom fields:
|
||||
- fluent (in 1292 cuts)
|
||||
- number (in 1292 cuts)
|
||||
- symbol (in 1292 cuts)
|
||||
- disfluent (in 1292 cuts)
|
||||
---------------------------------
|
||||
|
||||
## eval3
|
||||
Cuts count: 1385
|
||||
Total duration (hh:mm:ss): 01:19:21
|
||||
Speech duration (hh:mm:ss): 01:19:21 (100.0%)
|
||||
Duration statistics (seconds):
|
||||
mean 3.4
|
||||
std 3.0
|
||||
min 0.2
|
||||
25% 1.2
|
||||
50% 2.5
|
||||
75% 4.6
|
||||
99% 12.7
|
||||
99.5% 13.7
|
||||
99.9% 15.0
|
||||
max 15.9
|
||||
Recordings available: 1385
|
||||
Features available: 1385
|
||||
Supervisions available: 1385
|
||||
csj_cuts_eval2.jsonl.gz:
|
||||
Cut statistics:
|
||||
╒═══════════════════════════╤══════════╕
|
||||
│ Cuts count: │ 1025 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Total duration (hh:mm:ss) │ 02:02:07 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ mean │ 7.1 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ std │ 2.5 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ min │ 0.1 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 25% │ 5.9 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 50% │ 7.9 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 75% │ 9.1 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 99% │ 10.0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 99.5% │ 10.0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 99.9% │ 10.0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ max │ 10.0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Recordings available: │ 1025 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Features available: │ 0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Supervisions available: │ 1025 │
|
||||
╘═══════════════════════════╧══════════╛
|
||||
SUPERVISION custom fields:
|
||||
- number (in 1385 cuts)
|
||||
- symbol (in 1385 cuts)
|
||||
- fluent (in 1385 cuts)
|
||||
- disfluent (in 1385 cuts)
|
||||
Speech duration statistics:
|
||||
╒══════════════════════════════╤══════════╤══════════════════════╕
|
||||
│ Total speech duration │ 02:02:07 │ 100.00% of recording │
|
||||
├──────────────────────────────┼──────────┼──────────────────────┤
|
||||
│ Total speaking time duration │ 02:02:07 │ 100.00% of recording │
|
||||
├──────────────────────────────┼──────────┼──────────────────────┤
|
||||
│ Total silence duration │ 00:00:00 │ 0.00% of recording │
|
||||
╘══════════════════════════════╧══════════╧══════════════════════╛
|
||||
|
||||
## valid
|
||||
Cuts count: 4000
|
||||
Total duration (hh:mm:ss): 05:08:09
|
||||
Speech duration (hh:mm:ss): 05:08:09 (100.0%)
|
||||
Duration statistics (seconds):
|
||||
mean 4.6
|
||||
std 3.8
|
||||
min 0.1
|
||||
25% 1.5
|
||||
50% 3.4
|
||||
75% 7.0
|
||||
99% 13.8
|
||||
99.5% 14.8
|
||||
99.9% 16.0
|
||||
max 17.3
|
||||
Recordings available: 4000
|
||||
Features available: 4000
|
||||
Supervisions available: 4000
|
||||
SUPERVISION custom fields:
|
||||
- fluent (in 4000 cuts)
|
||||
- symbol (in 4000 cuts)
|
||||
- disfluent (in 4000 cuts)
|
||||
- number (in 4000 cuts)
|
||||
---------------------------------
|
||||
|
||||
## train
|
||||
Cuts count: 1291134
|
||||
Total duration (hh:mm:ss): 1596:37:27
|
||||
Speech duration (hh:mm:ss): 1596:37:27 (100.0%)
|
||||
Duration statistics (seconds):
|
||||
mean 4.5
|
||||
std 3.6
|
||||
min 0.0
|
||||
25% 1.6
|
||||
50% 3.3
|
||||
75% 6.4
|
||||
99% 14.0
|
||||
99.5% 14.8
|
||||
99.9% 16.6
|
||||
max 27.8
|
||||
Recordings available: 1291134
|
||||
Features available: 1291134
|
||||
Supervisions available: 1291134
|
||||
csj_cuts_eval3.jsonl.gz:
|
||||
Cut statistics:
|
||||
╒═══════════════════════════╤══════════╕
|
||||
│ Cuts count: │ 865 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Total duration (hh:mm:ss) │ 01:26:44 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ mean │ 6.0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ std │ 3.0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ min │ 0.3 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 25% │ 3.3 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 50% │ 6.8 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 75% │ 8.7 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 99% │ 10.0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 99.5% │ 10.0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 99.9% │ 10.0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ max │ 10.0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Recordings available: │ 865 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Features available: │ 0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Supervisions available: │ 865 │
|
||||
╘═══════════════════════════╧══════════╛
|
||||
SUPERVISION custom fields:
|
||||
- disfluent (in 1291134 cuts)
|
||||
- fluent (in 1291134 cuts)
|
||||
- symbol (in 1291134 cuts)
|
||||
- number (in 1291134 cuts)
|
||||
Speech duration statistics:
|
||||
╒══════════════════════════════╤══════════╤══════════════════════╕
|
||||
│ Total speech duration │ 01:26:44 │ 100.00% of recording │
|
||||
├──────────────────────────────┼──────────┼──────────────────────┤
|
||||
│ Total speaking time duration │ 01:26:44 │ 100.00% of recording │
|
||||
├──────────────────────────────┼──────────┼──────────────────────┤
|
||||
│ Total silence duration │ 00:00:00 │ 0.00% of recording │
|
||||
╘══════════════════════════════╧══════════╧══════════════════════╛
|
||||
|
||||
---------------------------------
|
||||
|
||||
csj_cuts_valid.jsonl.gz:
|
||||
Cut statistics:
|
||||
╒═══════════════════════════╤══════════╕
|
||||
│ Cuts count: │ 3743 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Total duration (hh:mm:ss) │ 06:40:15 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ mean │ 6.4 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ std │ 3.0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ min │ 0.1 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 25% │ 3.9 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 50% │ 7.4 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 75% │ 9.0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 99% │ 10.0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 99.5% │ 10.0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 99.9% │ 10.1 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ max │ 11.8 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Recordings available: │ 3743 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Features available: │ 0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Supervisions available: │ 3743 │
|
||||
╘═══════════════════════════╧══════════╛
|
||||
SUPERVISION custom fields:
|
||||
Speech duration statistics:
|
||||
╒══════════════════════════════╤══════════╤══════════════════════╕
|
||||
│ Total speech duration │ 06:40:15 │ 100.00% of recording │
|
||||
├──────────────────────────────┼──────────┼──────────────────────┤
|
||||
│ Total speaking time duration │ 06:40:15 │ 100.00% of recording │
|
||||
├──────────────────────────────┼──────────┼──────────────────────┤
|
||||
│ Total silence duration │ 00:00:00 │ 0.00% of recording │
|
||||
╘══════════════════════════════╧══════════╧══════════════════════╛
|
||||
|
||||
---------------------------------
|
||||
|
||||
csj_cuts_excluded.jsonl.gz:
|
||||
Cut statistics:
|
||||
╒═══════════════════════════╤══════════╕
|
||||
│ Cuts count: │ 980 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Total duration (hh:mm:ss) │ 00:56:06 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ mean │ 3.4 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ std │ 3.1 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ min │ 0.1 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 25% │ 0.8 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 50% │ 2.2 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 75% │ 5.8 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 99% │ 9.9 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 99.5% │ 9.9 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ 99.9% │ 10.0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ max │ 10.0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Recordings available: │ 980 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Features available: │ 0 │
|
||||
├───────────────────────────┼──────────┤
|
||||
│ Supervisions available: │ 980 │
|
||||
╘═══════════════════════════╧══════════╛
|
||||
SUPERVISION custom fields:
|
||||
Speech duration statistics:
|
||||
╒══════════════════════════════╤══════════╤══════════════════════╕
|
||||
│ Total speech duration │ 00:56:06 │ 100.00% of recording │
|
||||
├──────────────────────────────┼──────────┼──────────────────────┤
|
||||
│ Total speaking time duration │ 00:56:06 │ 100.00% of recording │
|
||||
├──────────────────────────────┼──────────┼──────────────────────┤
|
||||
│ Total silence duration │ 00:00:00 │ 0.00% of recording │
|
||||
╘══════════════════════════════╧══════════╧══════════════════════╛
|
||||
|
||||
---------------------------------
|
||||
|
||||
csj_cuts_train.jsonl.gz:
|
||||
Cut statistics:
|
||||
╒═══════════════════════════╤════════════╕
|
||||
│ Cuts count: │ 914151 │
|
||||
├───────────────────────────┼────────────┤
|
||||
│ Total duration (hh:mm:ss) │ 1695:29:43 │
|
||||
├───────────────────────────┼────────────┤
|
||||
│ mean │ 6.7 │
|
||||
├───────────────────────────┼────────────┤
|
||||
│ std │ 2.9 │
|
||||
├───────────────────────────┼────────────┤
|
||||
│ min │ 0.1 │
|
||||
├───────────────────────────┼────────────┤
|
||||
│ 25% │ 4.6 │
|
||||
├───────────────────────────┼────────────┤
|
||||
│ 50% │ 7.5 │
|
||||
├───────────────────────────┼────────────┤
|
||||
│ 75% │ 8.9 │
|
||||
├───────────────────────────┼────────────┤
|
||||
│ 99% │ 11.0 │
|
||||
├───────────────────────────┼────────────┤
|
||||
│ 99.5% │ 11.0 │
|
||||
├───────────────────────────┼────────────┤
|
||||
│ 99.9% │ 11.1 │
|
||||
├───────────────────────────┼────────────┤
|
||||
│ max │ 18.0 │
|
||||
├───────────────────────────┼────────────┤
|
||||
│ Recordings available: │ 914151 │
|
||||
├───────────────────────────┼────────────┤
|
||||
│ Features available: │ 0 │
|
||||
├───────────────────────────┼────────────┤
|
||||
│ Supervisions available: │ 914151 │
|
||||
╘═══════════════════════════╧════════════╛
|
||||
SUPERVISION custom fields:
|
||||
Speech duration statistics:
|
||||
╒══════════════════════════════╤════════════╤══════════════════════╕
|
||||
│ Total speech duration │ 1695:29:43 │ 100.00% of recording │
|
||||
├──────────────────────────────┼────────────┼──────────────────────┤
|
||||
│ Total speaking time duration │ 1695:29:43 │ 100.00% of recording │
|
||||
├──────────────────────────────┼────────────┼──────────────────────┤
|
||||
│ Total silence duration │ 00:00:00 │ 0.00% of recording │
|
||||
╘══════════════════════════════╧════════════╧══════════════════════╛
|
||||
"""
|
||||
|
||||
@ -21,24 +21,14 @@ import logging
|
||||
from pathlib import Path
|
||||
|
||||
from lhotse import CutSet
|
||||
from lhotse.recipes.csj import CSJSDBParser
|
||||
|
||||
ARGPARSE_DESCRIPTION = """
|
||||
This script gathers all training transcripts of the specified {trans_mode} type
|
||||
and produces a token_list that would be output set of the ASR system.
|
||||
This script gathers all training transcripts, parses them in disfluent mode, and produces a token list that would be the output set of the ASR system.
|
||||
|
||||
It splits transcripts by whitespace into lists, then, for each word in the
|
||||
list, if the word does not appear in the list of user-defined multicharacter
|
||||
strings, it further splits that word into individual characters to be counted
|
||||
into the output token set.
|
||||
|
||||
It outputs 4 files into the lang directory:
|
||||
- trans_mode: the name of transcript mode. If trans_mode was not specified,
|
||||
this will be an empty file.
|
||||
- userdef_string: a list of user defined strings that should not be split
|
||||
further into individual characters. By default, it contains "<unk>", "<blk>",
|
||||
"<sos/eos>"
|
||||
- words_len: the total number of tokens in the output set.
|
||||
- words.txt: a list of tokens in the output set. The length matches words_len.
|
||||
It outputs 3 files into the lang directory:
|
||||
- tokens.txt: a list of tokens in the output set.
|
||||
- lang_type: a file that contains the string "char"
|
||||
|
||||
"""
|
||||
|
||||
@ -50,98 +40,52 @@ def get_args():
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--train-cut", type=Path, required=True, help="Path to the train cut"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--trans-mode",
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"Name of the transcript mode to use. "
|
||||
"If lang-dir is not set, this will also name the lang-dir"
|
||||
),
|
||||
"train_cut", metavar="train-cut", type=Path, help="Path to the train cut"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--lang-dir",
|
||||
type=Path,
|
||||
default=None,
|
||||
default=Path("data/lang_char"),
|
||||
help=(
|
||||
"Name of lang dir. "
|
||||
"If not set, this will default to lang_char_{trans-mode}"
|
||||
),
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--userdef-string",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="Multicharacter strings that do not need to be split",
|
||||
)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main():
|
||||
args = get_args()
|
||||
|
||||
logging.basicConfig(
|
||||
format=("%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"),
|
||||
level=logging.INFO,
|
||||
)
|
||||
|
||||
if not args.lang_dir:
|
||||
p = "lang_char"
|
||||
if args.trans_mode:
|
||||
p += f"_{args.trans_mode}"
|
||||
args.lang_dir = Path(p)
|
||||
sysdef_string = set(["<blk>", "<unk>", "<sos/eos>"])
|
||||
|
||||
if args.userdef_string:
|
||||
args.userdef_string = set(args.userdef_string.read_text().split())
|
||||
else:
|
||||
args.userdef_string = set()
|
||||
# Using disfluent parsing as fluent is a subset of disfluent
|
||||
parser = CSJSDBParser()
|
||||
|
||||
sysdef_string = ["<blk>", "<unk>", "<sos/eos>"]
|
||||
args.userdef_string.update(sysdef_string)
|
||||
token_set = set()
|
||||
logging.info(f"Creating vocabulary from {args.train_cut}.")
|
||||
train_cut: CutSet = CutSet.from_file(args.train_cut)
|
||||
for cut in train_cut:
|
||||
if "_sp" in cut.id:
|
||||
continue
|
||||
|
||||
train_set: CutSet = CutSet.from_file(args.train_cut)
|
||||
|
||||
words = set()
|
||||
logging.info(
|
||||
f"Creating vocabulary from {args.train_cut.name} at {args.trans_mode} mode."
|
||||
)
|
||||
for cut in train_set:
|
||||
try:
|
||||
text: str = (
|
||||
cut.supervisions[0].custom[args.trans_mode]
|
||||
if args.trans_mode
|
||||
else cut.supervisions[0].text
|
||||
)
|
||||
except KeyError:
|
||||
raise KeyError(
|
||||
f"Could not find {args.trans_mode} in {cut.supervisions[0].custom}"
|
||||
)
|
||||
for t in text.split():
|
||||
if t in args.userdef_string:
|
||||
words.add(t)
|
||||
else:
|
||||
words.update(c for c in list(t))
|
||||
|
||||
words -= set(sysdef_string)
|
||||
words = sorted(words)
|
||||
words = ["<blk>"] + words + ["<unk>", "<sos/eos>"]
|
||||
text: str = cut.supervisions[0].custom["raw"]
|
||||
for w in parser.parse(text, sep=" ").split(" "):
|
||||
token_set.update(w)
|
||||
|
||||
token_set = ["<blk>"] + sorted(token_set - sysdef_string) + ["<unk>", "<sos/eos>"]
|
||||
args.lang_dir.mkdir(parents=True, exist_ok=True)
|
||||
(args.lang_dir / "words.txt").write_text(
|
||||
"\n".join(f"{word}\t{i}" for i, word in enumerate(words))
|
||||
(args.lang_dir / "tokens.txt").write_text(
|
||||
"\n".join(f"{t}\t{i}" for i, t in enumerate(token_set))
|
||||
)
|
||||
|
||||
(args.lang_dir / "words_len").write_text(f"{len(words)}")
|
||||
|
||||
(args.lang_dir / "userdef_string").write_text("\n".join(args.userdef_string))
|
||||
|
||||
(args.lang_dir / "trans_mode").write_text(args.trans_mode)
|
||||
(args.lang_dir / "lang_type").write_text("char")
|
||||
logging.info("Done.")
|
||||
|
||||
|
||||
|
||||
462
egs/csj/ASR/local/utils/asr_datamodule.py
Normal file
462
egs/csj/ASR/local/utils/asr_datamodule.py
Normal file
@ -0,0 +1,462 @@
|
||||
# Copyright 2021 Piotr Żelasko
|
||||
# Copyright 2022 Xiaomi Corporation (Author: Mingshuang Luo)
|
||||
#
|
||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import argparse
|
||||
import inspect
|
||||
import logging
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
import torch
|
||||
from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy
|
||||
from lhotse.dataset import ( # noqa F401 for PrecomputedFeatures
|
||||
CutConcatenate,
|
||||
CutMix,
|
||||
DynamicBucketingSampler,
|
||||
K2SpeechRecognitionDataset,
|
||||
PrecomputedFeatures,
|
||||
SingleCutSampler,
|
||||
SpecAugment,
|
||||
)
|
||||
from lhotse.dataset.input_strategies import ( # noqa F401 For AudioSamples
|
||||
AudioSamples,
|
||||
OnTheFlyFeatures,
|
||||
)
|
||||
from lhotse.utils import fix_random_seed
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from icefall.utils import str2bool
|
||||
|
||||
|
||||
class _SeedWorkers:
|
||||
def __init__(self, seed: int):
|
||||
self.seed = seed
|
||||
|
||||
def __call__(self, worker_id: int):
|
||||
fix_random_seed(self.seed + worker_id)
|
||||
|
||||
|
||||
class AsrVariableTranscriptDataset(K2SpeechRecognitionDataset):
|
||||
def __init__(
|
||||
self,
|
||||
*args,
|
||||
transcript_mode: str = "",
|
||||
return_cuts: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.transcript_mode = transcript_mode
|
||||
self.return_cuts = True
|
||||
self._return_cuts = return_cuts
|
||||
|
||||
def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List[str]]]:
|
||||
batch = super().__getitem__(cuts)
|
||||
|
||||
if self.transcript_mode:
|
||||
batch["supervisions"]["text"] = [
|
||||
supervision.custom[self.transcript_mode]
|
||||
for cut in batch["supervisions"]["cut"]
|
||||
for supervision in cut.supervisions
|
||||
]
|
||||
|
||||
if not self._return_cuts:
|
||||
del batch["supervisions"]["cut"]
|
||||
|
||||
return batch
|
||||
|
||||
|
||||
class CSJAsrDataModule:
|
||||
"""
|
||||
DataModule for k2 ASR experiments.
|
||||
It assumes there is always one train and valid dataloader,
|
||||
but there can be multiple test dataloaders (e.g. LibriSpeech test-clean
|
||||
and test-other).
|
||||
It contains all the common data pipeline modules used in ASR
|
||||
experiments, e.g.:
|
||||
- dynamic batch size,
|
||||
- bucketing samplers,
|
||||
- cut concatenation,
|
||||
- augmentation,
|
||||
- on-the-fly feature extraction
|
||||
This class should be derived for specific corpora used in ASR tasks.
|
||||
"""
|
||||
|
||||
def __init__(self, args: argparse.Namespace):
|
||||
self.args = args
|
||||
|
||||
@classmethod
|
||||
def add_arguments(cls, parser: argparse.ArgumentParser):
|
||||
group = parser.add_argument_group(
|
||||
title="ASR data related options",
|
||||
description="These options are used for the preparation of "
|
||||
"PyTorch DataLoaders from Lhotse CutSet's -- they control the "
|
||||
"effective batch sizes, sampling strategies, applied data "
|
||||
"augmentations, etc.",
|
||||
)
|
||||
|
||||
group.add_argument(
|
||||
"--transcript-mode",
|
||||
type=str,
|
||||
default="",
|
||||
help="Mode of transcript in supervision to use.",
|
||||
)
|
||||
group.add_argument(
|
||||
"--manifest-dir",
|
||||
type=Path,
|
||||
default=Path("data/manifests"),
|
||||
help="Path to directory with train/valid/test cuts.",
|
||||
)
|
||||
group.add_argument(
|
||||
"--musan-dir", type=Path, help="Path to directory with musan cuts. "
|
||||
)
|
||||
group.add_argument(
|
||||
"--max-duration",
|
||||
type=int,
|
||||
default=200.0,
|
||||
help="Maximum pooled recordings duration (seconds) in a "
|
||||
"single batch. You can reduce it if it causes CUDA OOM.",
|
||||
)
|
||||
group.add_argument(
|
||||
"--bucketing-sampler",
|
||||
type=str2bool,
|
||||
default=True,
|
||||
help="When enabled, the batches will come from buckets of "
|
||||
"similar duration (saves padding frames).",
|
||||
)
|
||||
group.add_argument(
|
||||
"--num-buckets",
|
||||
type=int,
|
||||
default=30,
|
||||
help="The number of buckets for the DynamicBucketingSampler"
|
||||
"(you might want to increase it for larger datasets).",
|
||||
)
|
||||
group.add_argument(
|
||||
"--concatenate-cuts",
|
||||
type=str2bool,
|
||||
default=False,
|
||||
help="When enabled, utterances (cuts) will be concatenated "
|
||||
"to minimize the amount of padding.",
|
||||
)
|
||||
group.add_argument(
|
||||
"--duration-factor",
|
||||
type=float,
|
||||
default=1.0,
|
||||
help="Determines the maximum duration of a concatenated cut "
|
||||
"relative to the duration of the longest cut in a batch.",
|
||||
)
|
||||
group.add_argument(
|
||||
"--gap",
|
||||
type=float,
|
||||
default=1.0,
|
||||
help="The amount of padding (in seconds) inserted between "
|
||||
"concatenated cuts. This padding is filled with noise when "
|
||||
"noise augmentation is used.",
|
||||
)
|
||||
group.add_argument(
|
||||
"--on-the-fly-feats",
|
||||
type=str2bool,
|
||||
default=False,
|
||||
help="When enabled, use on-the-fly cut mixing and feature "
|
||||
"extraction. Will drop existing precomputed feature manifests "
|
||||
"if available.",
|
||||
)
|
||||
group.add_argument(
|
||||
"--shuffle",
|
||||
type=str2bool,
|
||||
default=True,
|
||||
help="When enabled (=default), the examples will be "
|
||||
"shuffled for each epoch.",
|
||||
)
|
||||
group.add_argument(
|
||||
"--drop-last",
|
||||
type=str2bool,
|
||||
default=True,
|
||||
help="Whether to drop last batch. Used by sampler.",
|
||||
)
|
||||
group.add_argument(
|
||||
"--return-cuts",
|
||||
type=str2bool,
|
||||
default=False,
|
||||
help="When enabled, each batch will have the "
|
||||
"field: batch['supervisions']['cut'] with the cuts that "
|
||||
"were used to construct it.",
|
||||
)
|
||||
|
||||
group.add_argument(
|
||||
"--num-workers",
|
||||
type=int,
|
||||
default=2,
|
||||
help="The number of training dataloader workers that "
|
||||
"collect the batches.",
|
||||
)
|
||||
|
||||
group.add_argument(
|
||||
"--enable-spec-aug",
|
||||
type=str2bool,
|
||||
default=True,
|
||||
help="When enabled, use SpecAugment for training dataset.",
|
||||
)
|
||||
|
||||
group.add_argument(
|
||||
"--spec-aug-time-warp-factor",
|
||||
type=int,
|
||||
default=80,
|
||||
help="Used only when --enable-spec-aug is True. "
|
||||
"It specifies the factor for time warping in SpecAugment. "
|
||||
"Larger values mean more warping. "
|
||||
"A value less than 1 means to disable time warp.",
|
||||
)
|
||||
|
||||
group.add_argument(
|
||||
"--enable-musan",
|
||||
type=str2bool,
|
||||
default=True,
|
||||
help="When enabled, select noise from MUSAN and mix it"
|
||||
"with training dataset. ",
|
||||
)
|
||||
|
||||
group.add_argument(
|
||||
"--input-strategy",
|
||||
type=str,
|
||||
default="PrecomputedFeatures",
|
||||
help="AudioSamples or PrecomputedFeatures",
|
||||
)
|
||||
|
||||
def train_dataloaders(
|
||||
self,
|
||||
cuts_train: CutSet,
|
||||
sampler_state_dict: Optional[Dict[str, Any]] = None,
|
||||
) -> DataLoader:
|
||||
"""
|
||||
Args:
|
||||
cuts_train:
|
||||
CutSet for training.
|
||||
sampler_state_dict:
|
||||
The state dict for the training sampler.
|
||||
"""
|
||||
transforms = []
|
||||
if self.args.enable_musan:
|
||||
logging.info("Enable MUSAN")
|
||||
logging.info("About to get Musan cuts")
|
||||
cuts_musan = load_manifest(self.args.musan_dir / "musan_cuts.jsonl.gz")
|
||||
transforms.append(
|
||||
CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)
|
||||
)
|
||||
else:
|
||||
logging.info("Disable MUSAN")
|
||||
|
||||
if self.args.concatenate_cuts:
|
||||
logging.info(
|
||||
f"Using cut concatenation with duration factor "
|
||||
f"{self.args.duration_factor} and gap {self.args.gap}."
|
||||
)
|
||||
# Cut concatenation should be the first transform in the list,
|
||||
# so that if we e.g. mix noise in, it will fill the gaps between
|
||||
# different utterances.
|
||||
transforms = [
|
||||
CutConcatenate(
|
||||
duration_factor=self.args.duration_factor, gap=self.args.gap
|
||||
)
|
||||
] + transforms
|
||||
|
||||
input_transforms = []
|
||||
if self.args.enable_spec_aug:
|
||||
logging.info("Enable SpecAugment")
|
||||
logging.info(f"Time warp factor: {self.args.spec_aug_time_warp_factor}")
|
||||
# Set the value of num_frame_masks according to Lhotse's version.
|
||||
# In different Lhotse's versions, the default of num_frame_masks is
|
||||
# different.
|
||||
num_frame_masks = 10
|
||||
num_frame_masks_parameter = inspect.signature(
|
||||
SpecAugment.__init__
|
||||
).parameters["num_frame_masks"]
|
||||
if num_frame_masks_parameter.default == 1:
|
||||
num_frame_masks = 2
|
||||
logging.info(f"Num frame mask: {num_frame_masks}")
|
||||
input_transforms.append(
|
||||
SpecAugment(
|
||||
time_warp_factor=self.args.spec_aug_time_warp_factor,
|
||||
num_frame_masks=num_frame_masks,
|
||||
features_mask_size=27,
|
||||
num_feature_masks=2,
|
||||
frames_mask_size=100,
|
||||
)
|
||||
)
|
||||
else:
|
||||
logging.info("Disable SpecAugment")
|
||||
|
||||
logging.info("About to create train dataset")
|
||||
train = AsrVariableTranscriptDataset(
|
||||
input_strategy=eval(self.args.input_strategy)(),
|
||||
cut_transforms=transforms,
|
||||
input_transforms=input_transforms,
|
||||
return_cuts=self.args.return_cuts,
|
||||
transcript_mode=self.args.transcript_mode,
|
||||
)
|
||||
|
||||
if self.args.on_the_fly_feats:
|
||||
# NOTE: the PerturbSpeed transform should be added only if we
|
||||
# remove it from data prep stage.
|
||||
# Add on-the-fly speed perturbation; since originally it would
|
||||
# have increased epoch size by 3, we will apply prob 2/3 and use
|
||||
# 3x more epochs.
|
||||
# Speed perturbation probably should come first before
|
||||
# concatenation, but in principle the transforms order doesn't have
|
||||
# to be strict (e.g. could be randomized)
|
||||
# transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms # noqa
|
||||
# Drop feats to be on the safe side.
|
||||
train = AsrVariableTranscriptDataset(
|
||||
cut_transforms=transforms,
|
||||
input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
|
||||
input_transforms=input_transforms,
|
||||
return_cuts=self.args.return_cuts,
|
||||
transcript_mode=self.args.transcript_mode,
|
||||
)
|
||||
|
||||
if self.args.bucketing_sampler:
|
||||
logging.info("Using DynamicBucketingSampler.")
|
||||
train_sampler = DynamicBucketingSampler(
|
||||
cuts_train,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=self.args.shuffle,
|
||||
num_buckets=self.args.num_buckets,
|
||||
drop_last=self.args.drop_last,
|
||||
)
|
||||
else:
|
||||
logging.info("Using SingleCutSampler.")
|
||||
train_sampler = SingleCutSampler(
|
||||
cuts_train,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=self.args.shuffle,
|
||||
)
|
||||
logging.info("About to create train dataloader")
|
||||
|
||||
if sampler_state_dict is not None:
|
||||
logging.info("Loading sampler state dict")
|
||||
train_sampler.load_state_dict(sampler_state_dict)
|
||||
|
||||
# 'seed' is derived from the current random state, which will have
|
||||
# previously been set in the main process.
|
||||
seed = torch.randint(0, 100000, ()).item()
|
||||
worker_init_fn = _SeedWorkers(seed)
|
||||
|
||||
train_dl = DataLoader(
|
||||
train,
|
||||
sampler=train_sampler,
|
||||
batch_size=None,
|
||||
num_workers=self.args.num_workers,
|
||||
persistent_workers=False,
|
||||
worker_init_fn=worker_init_fn,
|
||||
)
|
||||
|
||||
return train_dl
|
||||
|
||||
def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
|
||||
transforms = []
|
||||
if self.args.concatenate_cuts:
|
||||
transforms = [
|
||||
CutConcatenate(
|
||||
duration_factor=self.args.duration_factor, gap=self.args.gap
|
||||
)
|
||||
] + transforms
|
||||
|
||||
logging.info("About to create dev dataset")
|
||||
if self.args.on_the_fly_feats:
|
||||
validate = AsrVariableTranscriptDataset(
|
||||
cut_transforms=transforms,
|
||||
input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
|
||||
return_cuts=self.args.return_cuts,
|
||||
transcript_mode=self.args.transcript_mode,
|
||||
)
|
||||
else:
|
||||
validate = AsrVariableTranscriptDataset(
|
||||
cut_transforms=transforms,
|
||||
return_cuts=self.args.return_cuts,
|
||||
transcript_mode=self.args.transcript_mode,
|
||||
)
|
||||
valid_sampler = DynamicBucketingSampler(
|
||||
cuts_valid,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=False,
|
||||
)
|
||||
logging.info("About to create dev dataloader")
|
||||
valid_dl = DataLoader(
|
||||
validate,
|
||||
sampler=valid_sampler,
|
||||
batch_size=None,
|
||||
num_workers=2,
|
||||
persistent_workers=False,
|
||||
)
|
||||
|
||||
return valid_dl
|
||||
|
||||
def test_dataloaders(self, cuts: CutSet) -> DataLoader:
|
||||
logging.debug("About to create test dataset")
|
||||
|
||||
test = AsrVariableTranscriptDataset(
|
||||
input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
|
||||
if self.args.on_the_fly_feats
|
||||
else eval(self.args.input_strategy)(),
|
||||
return_cuts=self.args.return_cuts,
|
||||
transcript_mode=self.args.transcript_mode,
|
||||
)
|
||||
sampler = DynamicBucketingSampler(
|
||||
cuts,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=False,
|
||||
)
|
||||
|
||||
logging.debug("About to create test dataloader")
|
||||
test_dl = DataLoader(
|
||||
test,
|
||||
batch_size=None,
|
||||
sampler=sampler,
|
||||
num_workers=self.args.num_workers,
|
||||
)
|
||||
return test_dl
|
||||
|
||||
@lru_cache()
|
||||
def train_cuts(self) -> CutSet:
|
||||
logging.info("About to get train cuts")
|
||||
return load_manifest_lazy(self.args.manifest_dir / "csj_cuts_train.jsonl.gz")
|
||||
|
||||
@lru_cache()
|
||||
def valid_cuts(self) -> CutSet:
|
||||
logging.info("About to get valid cuts")
|
||||
return load_manifest_lazy(self.args.manifest_dir / "csj_cuts_valid.jsonl.gz")
|
||||
|
||||
@lru_cache()
|
||||
def excluded_cuts(self) -> CutSet:
|
||||
logging.info("About to get excluded cuts")
|
||||
return load_manifest_lazy(self.args.manifest_dir / "csj_cuts_excluded.jsonl.gz")
|
||||
|
||||
@lru_cache()
|
||||
def eval1_cuts(self) -> CutSet:
|
||||
logging.info("About to get eval1 cuts")
|
||||
return load_manifest_lazy(self.args.manifest_dir / "csj_cuts_eval1.jsonl.gz")
|
||||
|
||||
@lru_cache()
|
||||
def eval2_cuts(self) -> CutSet:
|
||||
logging.info("About to get eval2 cuts")
|
||||
return load_manifest_lazy(self.args.manifest_dir / "csj_cuts_eval2.jsonl.gz")
|
||||
|
||||
@lru_cache()
|
||||
def eval3_cuts(self) -> CutSet:
|
||||
logging.info("About to get eval3 cuts")
|
||||
return load_manifest_lazy(self.args.manifest_dir / "csj_cuts_eval3.jsonl.gz")
|
||||
253
egs/csj/ASR/local/utils/tokenizer.py
Normal file
253
egs/csj/ASR/local/utils/tokenizer.py
Normal file
@ -0,0 +1,253 @@
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import Callable, List, Union
|
||||
|
||||
import sentencepiece as spm
|
||||
from k2 import SymbolTable
|
||||
|
||||
|
||||
class Tokenizer:
|
||||
text2word: Callable[[str], List[str]]
|
||||
|
||||
@staticmethod
|
||||
def add_arguments(parser: argparse.ArgumentParser):
|
||||
group = parser.add_argument_group(title="Lang related options")
|
||||
|
||||
group.add_argument("--lang", type=Path, help="Path to lang directory.")
|
||||
|
||||
group.add_argument(
|
||||
"--lang-type",
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"Either 'bpe' or 'char'. If not provided, it expects lang_dir/lang_type to exists. "
|
||||
"Note: 'bpe' directly loads sentencepiece.SentencePieceProcessor"
|
||||
),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def Load(lang_dir: Path, lang_type="", oov="<unk>"):
|
||||
|
||||
if not lang_type:
|
||||
assert (lang_dir / "lang_type").exists(), "lang_type not specified."
|
||||
lang_type = (lang_dir / "lang_type").read_text().strip()
|
||||
|
||||
tokenizer = None
|
||||
|
||||
if lang_type == "bpe":
|
||||
assert (
|
||||
lang_dir / "bpe.model"
|
||||
).exists(), f"No BPE .model could be found in {lang_dir}."
|
||||
tokenizer = spm.SentencePieceProcessor()
|
||||
tokenizer.Load(str(lang_dir / "bpe.model"))
|
||||
elif lang_type == "char":
|
||||
tokenizer = CharTokenizer(lang_dir, oov=oov)
|
||||
else:
|
||||
raise NotImplementedError(f"{lang_type} not supported at the moment.")
|
||||
|
||||
return tokenizer
|
||||
|
||||
load = Load
|
||||
|
||||
def PieceToId(self, piece: str) -> int:
|
||||
raise NotImplementedError(
|
||||
"You need to implement this function in the child class."
|
||||
)
|
||||
|
||||
piece_to_id = PieceToId
|
||||
|
||||
def IdToPiece(self, id: int) -> str:
|
||||
raise NotImplementedError(
|
||||
"You need to implement this function in the child class."
|
||||
)
|
||||
|
||||
id_to_piece = IdToPiece
|
||||
|
||||
def GetPieceSize(self) -> int:
|
||||
raise NotImplementedError(
|
||||
"You need to implement this function in the child class."
|
||||
)
|
||||
|
||||
get_piece_size = GetPieceSize
|
||||
|
||||
def __len__(self) -> int:
|
||||
return self.get_piece_size()
|
||||
|
||||
def EncodeAsIdsBatch(self, input: List[str]) -> List[List[int]]:
|
||||
raise NotImplementedError(
|
||||
"You need to implement this function in the child class."
|
||||
)
|
||||
|
||||
def EncodeAsPiecesBatch(self, input: List[str]) -> List[List[str]]:
|
||||
raise NotImplementedError(
|
||||
"You need to implement this function in the child class."
|
||||
)
|
||||
|
||||
def EncodeAsIds(self, input: str) -> List[int]:
|
||||
return self.EncodeAsIdsBatch([input])[0]
|
||||
|
||||
def EncodeAsPieces(self, input: str) -> List[str]:
|
||||
return self.EncodeAsPiecesBatch([input])[0]
|
||||
|
||||
def Encode(
|
||||
self, input: Union[str, List[str]], out_type=int
|
||||
) -> Union[List, List[List]]:
|
||||
if not input:
|
||||
return []
|
||||
|
||||
if isinstance(input, list):
|
||||
if out_type is int:
|
||||
return self.EncodeAsIdsBatch(input)
|
||||
if out_type is str:
|
||||
return self.EncodeAsPiecesBatch(input)
|
||||
|
||||
if out_type is int:
|
||||
return self.EncodeAsIds(input)
|
||||
if out_type is str:
|
||||
return self.EncodeAsPieces(input)
|
||||
|
||||
encode = Encode
|
||||
|
||||
def DecodeIdsBatch(self, input: List[List[int]]) -> List[str]:
|
||||
raise NotImplementedError(
|
||||
"You need to implement this function in the child class."
|
||||
)
|
||||
|
||||
def DecodePiecesBatch(self, input: List[List[str]]) -> List[str]:
|
||||
raise NotImplementedError(
|
||||
"You need to implement this function in the child class."
|
||||
)
|
||||
|
||||
def DecodeIds(self, input: List[int]) -> str:
|
||||
return self.DecodeIdsBatch([input])[0]
|
||||
|
||||
def DecodePieces(self, input: List[str]) -> str:
|
||||
return self.DecodePiecesBatch([input])[0]
|
||||
|
||||
def Decode(
|
||||
self,
|
||||
input: Union[int, List[int], List[str], List[List[int]], List[List[str]]],
|
||||
) -> Union[List[str], str]:
|
||||
|
||||
if not input:
|
||||
return ""
|
||||
|
||||
if isinstance(input, int):
|
||||
return self.id_to_piece(input)
|
||||
elif isinstance(input, str):
|
||||
raise TypeError(
|
||||
"Unlike spm.SentencePieceProcessor, cannot decode from type str."
|
||||
)
|
||||
|
||||
if isinstance(input[0], list):
|
||||
if not input[0] or isinstance(input[0][0], int):
|
||||
return self.DecodeIdsBatch(input)
|
||||
|
||||
if isinstance(input[0][0], str):
|
||||
return self.DecodePiecesBatch(input)
|
||||
|
||||
if isinstance(input[0], int):
|
||||
return self.DecodeIds(input)
|
||||
if isinstance(input[0], str):
|
||||
return self.DecodePieces(input)
|
||||
|
||||
raise RuntimeError("Unknown input type")
|
||||
|
||||
decode = Decode
|
||||
|
||||
def SplitBatch(self, input: List[str]) -> List[List[str]]:
|
||||
raise NotImplementedError(
|
||||
"You need to implement this function in the child class."
|
||||
)
|
||||
|
||||
def Split(self, input: Union[List[str], str]) -> Union[List[List[str]], List[str]]:
|
||||
if isinstance(input, list):
|
||||
return self.SplitBatch(input)
|
||||
elif isinstance(input, str):
|
||||
return self.SplitBatch([input])[0]
|
||||
raise RuntimeError("Unknown input type")
|
||||
|
||||
split = Split
|
||||
|
||||
|
||||
class CharTokenizer(Tokenizer):
|
||||
def __init__(self, lang_dir: Path, oov="<unk>", sep=""):
|
||||
assert (
|
||||
lang_dir / "tokens.txt"
|
||||
).exists(), f"tokens.txt could not be found in {lang_dir}."
|
||||
token_table = SymbolTable.from_file(lang_dir / "tokens.txt")
|
||||
assert (
|
||||
"#0" not in token_table
|
||||
), "This tokenizer does not support disambig symbols."
|
||||
self._id2sym = token_table._id2sym
|
||||
self._sym2id = token_table._sym2id
|
||||
self.oov = oov
|
||||
self.oov_id = self._sym2id[oov]
|
||||
self.sep = sep
|
||||
if self.sep:
|
||||
self.text2word = lambda x: x.split(self.sep)
|
||||
else:
|
||||
self.text2word = lambda x: list(x.replace(" ", ""))
|
||||
|
||||
def piece_to_id(self, piece: str) -> int:
|
||||
try:
|
||||
return self._sym2id[piece]
|
||||
except KeyError:
|
||||
return self.oov_id
|
||||
|
||||
def id_to_piece(self, id: int) -> str:
|
||||
return self._id2sym[id]
|
||||
|
||||
def get_piece_size(self) -> int:
|
||||
return len(self._sym2id)
|
||||
|
||||
def EncodeAsIdsBatch(self, input: List[str]) -> List[List[int]]:
|
||||
return [[self.piece_to_id(i) for i in self.text2word(text)] for text in input]
|
||||
|
||||
def EncodeAsPiecesBatch(self, input: List[str]) -> List[List[str]]:
|
||||
return [
|
||||
[i if i in self._sym2id else self.oov for i in self.text2word(text)]
|
||||
for text in input
|
||||
]
|
||||
|
||||
def DecodeIdsBatch(self, input: List[List[int]]) -> List[str]:
|
||||
return [self.sep.join(self.id_to_piece(i) for i in text) for text in input]
|
||||
|
||||
def DecodePiecesBatch(self, input: List[List[str]]) -> List[str]:
|
||||
return [self.sep.join(text) for text in input]
|
||||
|
||||
def SplitBatch(self, input: List[str]) -> List[List[str]]:
|
||||
return [self.text2word(text) for text in input]
|
||||
|
||||
|
||||
def test_CharTokenizer():
|
||||
test_single_string = "こんにちは"
|
||||
test_multiple_string = [
|
||||
"今日はいい天気ですよね",
|
||||
"諏訪湖は綺麗でしょう",
|
||||
"这在词表外",
|
||||
"分かち 書き に し た 文章 です",
|
||||
"",
|
||||
]
|
||||
test_empty_string = ""
|
||||
sp = Tokenizer.load(Path("lang_char"), "char", oov="<unk>")
|
||||
splitter = sp.split
|
||||
print(sp.encode(test_single_string, out_type=str))
|
||||
print(sp.encode(test_single_string, out_type=int))
|
||||
print(sp.encode(test_multiple_string, out_type=str))
|
||||
print(sp.encode(test_multiple_string, out_type=int))
|
||||
print(sp.encode(test_empty_string, out_type=str))
|
||||
print(sp.encode(test_empty_string, out_type=int))
|
||||
print(sp.decode(sp.encode(test_single_string, out_type=str)))
|
||||
print(sp.decode(sp.encode(test_single_string, out_type=int)))
|
||||
print(sp.decode(sp.encode(test_multiple_string, out_type=str)))
|
||||
print(sp.decode(sp.encode(test_multiple_string, out_type=int)))
|
||||
print(sp.decode(sp.encode(test_empty_string, out_type=str)))
|
||||
print(sp.decode(sp.encode(test_empty_string, out_type=int)))
|
||||
print(splitter(test_single_string))
|
||||
print(splitter(test_multiple_string))
|
||||
print(splitter(test_empty_string))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_CharTokenizer()
|
||||
@ -32,7 +32,7 @@
|
||||
# - speech
|
||||
#
|
||||
# By default, this script produces the original transcript like kaldi and espnet. Optionally, you
|
||||
# can generate other transcript formats by supplying your own config files. A few examples of these
|
||||
# can add other transcript formats by supplying your own config files. A few examples of these
|
||||
# config files can be found in local/conf.
|
||||
|
||||
# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
|
||||
@ -44,10 +44,10 @@ nj=8
|
||||
stage=-1
|
||||
stop_stage=100
|
||||
|
||||
csj_dir=/mnt/minami_data_server/t2131178/corpus/CSJ
|
||||
musan_dir=/mnt/minami_data_server/t2131178/corpus/musan/musan
|
||||
trans_dir=$csj_dir/retranscript
|
||||
csj_fbank_dir=/mnt/host/csj_data/fbank
|
||||
csj_dir=/mnt/host/corpus/csj
|
||||
musan_dir=/mnt/host/corpus/musan/musan
|
||||
trans_dir=$csj_dir/transcript
|
||||
csj_fbank_dir=/mnt/host/corpus/csj/fbank
|
||||
musan_fbank_dir=$musan_dir/fbank
|
||||
csj_manifest_dir=data/manifests
|
||||
musan_manifest_dir=$musan_dir/manifests
|
||||
@ -63,12 +63,8 @@ log() {
|
||||
|
||||
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
||||
log "Stage 1: Prepare CSJ manifest"
|
||||
# If you want to generate more transcript modes, append the path to those config files at c.
|
||||
# Example: lhotse prepare csj $csj_dir $trans_dir $csj_manifest_dir -c local/conf/disfluent.ini
|
||||
# NOTE: In case multiple config files are supplied, the second config file and onwards will inherit
|
||||
# the segment boundaries of the first config file.
|
||||
if [ ! -e $csj_manifest_dir/.csj.done ]; then
|
||||
lhotse prepare csj $csj_dir $trans_dir $csj_manifest_dir -j 4
|
||||
lhotse prepare csj $csj_dir $csj_manifest_dir -t $trans_dir -j 16
|
||||
touch $csj_manifest_dir/.csj.done
|
||||
fi
|
||||
fi
|
||||
@ -88,32 +84,24 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
||||
python local/compute_fbank_csj.py --manifest-dir $csj_manifest_dir \
|
||||
--fbank-dir $csj_fbank_dir
|
||||
parts=(
|
||||
train
|
||||
valid
|
||||
eval1
|
||||
eval2
|
||||
eval3
|
||||
valid
|
||||
excluded
|
||||
train
|
||||
)
|
||||
for part in ${parts[@]}; do
|
||||
python local/validate_manifest.py --manifest $csj_manifest_dir/csj_cuts_$part.jsonl.gz
|
||||
python local/validate_manifest.py --manifest $csj_fbank_dir/csj_cuts_$part.jsonl.gz
|
||||
done
|
||||
touch $csj_fbank_dir/.csj-validated.done
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
|
||||
log "Stage 4: Prepare CSJ lang"
|
||||
modes=disfluent
|
||||
|
||||
# If you want prepare the lang directory for other transcript modes, just append
|
||||
# the names of those modes behind. An example is shown as below:-
|
||||
# modes="$modes fluent symbol number"
|
||||
|
||||
for mode in ${modes[@]}; do
|
||||
python local/prepare_lang_char.py --trans-mode $mode \
|
||||
--train-cut $csj_manifest_dir/csj_cuts_train.jsonl.gz \
|
||||
--lang-dir lang_char_$mode
|
||||
done
|
||||
log "Stage 4: Prepare CSJ lang_char"
|
||||
python local/prepare_lang_char.py $csj_fbank_dir/csj_cuts_train.jsonl.gz
|
||||
python local/add_transcript_mode.py -f $csj_fbank_dir -c local/conf/fluent.ini local/conf/number.ini
|
||||
fi
|
||||
|
||||
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
||||
@ -128,6 +116,6 @@ fi
|
||||
|
||||
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
|
||||
log "Stage 6: Show manifest statistics"
|
||||
python local/display_manifest_statistics.py --manifest-dir $csj_manifest_dir > $csj_manifest_dir/manifest_statistics.txt
|
||||
cat $csj_manifest_dir/manifest_statistics.txt
|
||||
python local/display_manifest_statistics.py --manifest-dir $csj_fbank_dir > $csj_fbank_dir/manifest_statistics.txt
|
||||
cat $csj_fbank_dir/manifest_statistics.txt
|
||||
fi
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user