mirror of https://github.com/k2-fsa/icefall.git
synced 2025-12-11 06:55:27 +00:00

Merging upstream/master

This commit is contained in: commit be5c687fbd
14  .flake8

@@ -4,11 +4,15 @@ statistics=true
 max-line-length = 80
 per-file-ignores =
     # line too long
-    icefall/diagnostics.py: E501
+    icefall/diagnostics.py: E501,
     egs/*/ASR/*/conformer.py: E501,
     egs/*/ASR/pruned_transducer_stateless*/*.py: E501,
     egs/*/ASR/*/optim.py: E501,
     egs/*/ASR/*/scaling.py: E501,
+    egs/librispeech/ASR/lstm_transducer_stateless*/*.py: E501, E203
+    egs/librispeech/ASR/conv_emformer_transducer_stateless*/*.py: E501, E203
+    egs/librispeech/ASR/conformer_ctc2/*py: E501,
+    egs/librispeech/ASR/RESULTS.md: E999,

     # invalid escape sequence (caused by TeX formulas), W605
     icefall/utils.py: E501, W605
@@ -18,3 +22,11 @@ exclude =
     **/data/**,
     icefall/shared/make_kn_lm.py,
     icefall/__init__.py
+
+ignore =
+    # E203 white space before ":"
+    E203,
+    # W503 line break before binary operator
+    W503,
+    # E226 missing whitespace around arithmetic operator
+    E226,
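Note: a quick local check of the new lint configuration (a sketch; flake8 reads the per-file-ignores and ignore sections from .flake8 automatically, and the second file below is picked only because it appears in the hunk above):

    # Run from the repo root. E203/W503/E226 are now ignored globally,
    # and E501 is suppressed for the files matched by per-file-ignores.
    pip install flake8
    flake8 --statistics icefall/diagnostics.py
    flake8 --statistics icefall/utils.py   # E501 and W605 both suppressed here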
@@ -4,6 +4,8 @@
 # The computed features are saved to ~/tmp/fbank-libri and are
 # cached for later runs

+set -e
+
 export PYTHONPATH=$PWD:$PYTHONPATH
 echo $PYTHONPATH

@@ -6,6 +6,8 @@
 # You will find directories `~/tmp/giga-dev-dataset-fbank` after running
 # this script.

+set -e
+
 mkdir -p ~/tmp
 cd ~/tmp

@@ -7,6 +7,8 @@
 # You will find directories ~/tmp/download/LibriSpeech after running
 # this script.

+set -e
+
 mkdir ~/tmp/download
 cd egs/librispeech/ASR
 ln -s ~/tmp/download .
2  .github/scripts/install-kaldifeat.sh (vendored)

@@ -3,6 +3,8 @@
 # This script installs kaldifeat into the directory ~/tmp/kaldifeat
 # which is cached by GitHub actions for later runs.

+set -e
+
 mkdir -p ~/tmp
 cd ~/tmp
 git clone https://github.com/csukuangfj/kaldifeat
@@ -4,6 +4,8 @@
 # to egs/librispeech/ASR/download/LibriSpeech and generates manifest
 # files in egs/librispeech/ASR/data/manifests

+set -e
+
 cd egs/librispeech/ASR
 [ ! -e download ] && ln -s ~/tmp/download .
 mkdir -p data/manifests
88  .github/scripts/run-aishell-pruned-transducer-stateless3-2022-06-20.sh (vendored, executable file)

@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/aishell/ASR
+
+git lfs install
+
+fbank_url=https://huggingface.co/csukuangfj/aishell-test-dev-manifests
+log "Downloading pre-computed fbank from $fbank_url"
+
+git clone https://huggingface.co/csukuangfj/aishell-test-dev-manifests
+ln -s $PWD/aishell-test-dev-manifests/data .
+
+repo_url=https://huggingface.co/csukuangfj/icefall-aishell-pruned-transducer-stateless3-2022-06-20
+log "Downloading pre-trained model from $repo_url"
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+soxi $repo/test_wavs/*.wav
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+ln -s pretrained-epoch-29-avg-5-torch-1.10.0.pt pretrained.pt
+popd
+
+for sym in 1 2 3; do
+  log "Greedy search with --max-sym-per-frame $sym"
+
+  ./pruned_transducer_stateless3/pretrained.py \
+    --method greedy_search \
+    --max-sym-per-frame $sym \
+    --checkpoint $repo/exp/pretrained.pt \
+    --lang-dir $repo/data/lang_char \
+    $repo/test_wavs/BAC009S0764W0121.wav \
+    $repo/test_wavs/BAC009S0764W0122.wav \
+    $repo/test_wavs/BAC009S0764W0123.wav
+done
+
+for method in modified_beam_search beam_search fast_beam_search; do
+  log "$method"
+
+  ./pruned_transducer_stateless3/pretrained.py \
+    --method $method \
+    --beam-size 4 \
+    --checkpoint $repo/exp/pretrained.pt \
+    --lang-dir $repo/data/lang_char \
+    $repo/test_wavs/BAC009S0764W0121.wav \
+    $repo/test_wavs/BAC009S0764W0122.wav \
+    $repo/test_wavs/BAC009S0764W0123.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+  mkdir -p pruned_transducer_stateless3/exp
+  ln -s $PWD/$repo/exp/pretrained.pt pruned_transducer_stateless3/exp/epoch-999.pt
+  ln -s $PWD/$repo/data/lang_char data/
+
+  ls -lh data
+  ls -lh pruned_transducer_stateless3/exp
+
+  log "Decoding test and dev"
+
+  # use a small value for decoding with CPU
+  max_duration=100
+
+  for method in greedy_search fast_beam_search modified_beam_search; do
+    log "Decoding with $method"
+
+    ./pruned_transducer_stateless3/decode.py \
+      --decoding-method $method \
+      --epoch 999 \
+      --avg 1 \
+      --max-duration $max_duration \
+      --exp-dir pruned_transducer_stateless3/exp
+  done
+
+  rm pruned_transducer_stateless3/exp/*.pt
+fi
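Note: the x-prefixed comparison in the script above is a defensive shell idiom, not icefall-specific logic:

    # With the legacy [ test, an empty operand could make the expression
    # malformed, so both sides were prefixed with a literal x. Inside bash's
    # [[ ]] the prefix is redundant but harmless; these behave identically:
    GITHUB_EVENT_NAME=schedule
    [[ x"${GITHUB_EVENT_NAME}" == x"schedule" ]] && echo matched
    [[ "${GITHUB_EVENT_NAME}" == "schedule" ]] && echo matched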
@@ -1,5 +1,7 @@
 #!/usr/bin/env bash

+set -e
+
 log() {
   # This function is from espnet
   local fname=${BASH_SOURCE[1]##*/}
203  .github/scripts/run-librispeech-lstm-transducer-stateless2-2022-09-03.yml (vendored, executable file)

@@ -0,0 +1,203 @@
+#!/usr/bin/env bash
+#
+set -e
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+soxi $repo/test_wavs/*.wav
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+ln -s pretrained-iter-468000-avg-16.pt pretrained.pt
+ln -s pretrained-iter-468000-avg-16.pt epoch-99.pt
+popd
+
+log "Install ncnn and pnnx"
+
+# We are using a modified ncnn here. Will try to merge it to the official repo
+# of ncnn
+git clone https://github.com/csukuangfj/ncnn
+pushd ncnn
+git submodule init
+git submodule update python/pybind11
+python3 setup.py bdist_wheel
+ls -lh dist/
+pip install dist/*.whl
+cd tools/pnnx
+mkdir build
+cd build
+cmake ..
+make -j4 pnnx
+
+./src/pnnx || echo "pass"
+
+popd
+
+log "Test exporting to pnnx format"
+
+./lstm_transducer_stateless2/export.py \
+  --exp-dir $repo/exp \
+  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --epoch 99 \
+  --avg 1 \
+  --use-averaged-model 0 \
+  --pnnx 1
+
+./ncnn/tools/pnnx/build/src/pnnx $repo/exp/encoder_jit_trace-pnnx.pt
+./ncnn/tools/pnnx/build/src/pnnx $repo/exp/decoder_jit_trace-pnnx.pt
+./ncnn/tools/pnnx/build/src/pnnx $repo/exp/joiner_jit_trace-pnnx.pt
+
+./lstm_transducer_stateless2/ncnn-decode.py \
+  --bpe-model-filename $repo/data/lang_bpe_500/bpe.model \
+  --encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
+  --encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
+  --decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
+  --decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
+  --joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
+  --joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
+  $repo/test_wavs/1089-134686-0001.wav
+
+./lstm_transducer_stateless2/streaming-ncnn-decode.py \
+  --bpe-model-filename $repo/data/lang_bpe_500/bpe.model \
+  --encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
+  --encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
+  --decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
+  --decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
+  --joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
+  --joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
+  $repo/test_wavs/1089-134686-0001.wav
+
+
+log "Test exporting with torch.jit.trace()"
+
+./lstm_transducer_stateless2/export.py \
+  --exp-dir $repo/exp \
+  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --epoch 99 \
+  --avg 1 \
+  --use-averaged-model 0 \
+  --jit-trace 1
+
+log "Decode with models exported by torch.jit.trace()"
+
+./lstm_transducer_stateless2/jit_pretrained.py \
+  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --encoder-model-filename $repo/exp/encoder_jit_trace.pt \
+  --decoder-model-filename $repo/exp/decoder_jit_trace.pt \
+  --joiner-model-filename $repo/exp/joiner_jit_trace.pt \
+  $repo/test_wavs/1089-134686-0001.wav \
+  $repo/test_wavs/1221-135766-0001.wav \
+  $repo/test_wavs/1221-135766-0002.wav
+
+log "Test exporting to ONNX"
+
+./lstm_transducer_stateless2/export.py \
+  --exp-dir $repo/exp \
+  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --epoch 99 \
+  --avg 1 \
+  --use-averaged-model 0 \
+  --onnx 1
+
+log "Decode with ONNX models"
+
+./lstm_transducer_stateless2/streaming-onnx-decode.py \
+  --bpe-model-filename $repo/data/lang_bpe_500/bpe.model \
+  --encoder-model-filename $repo/exp/encoder.onnx \
+  --decoder-model-filename $repo/exp/decoder.onnx \
+  --joiner-model-filename $repo/exp/joiner.onnx \
+  --joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
+  --joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
+  $repo/test_wavs/1089-134686-0001.wav
+
+./lstm_transducer_stateless2/streaming-onnx-decode.py \
+  --bpe-model-filename $repo/data/lang_bpe_500/bpe.model \
+  --encoder-model-filename $repo/exp/encoder.onnx \
+  --decoder-model-filename $repo/exp/decoder.onnx \
+  --joiner-model-filename $repo/exp/joiner.onnx \
+  --joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
+  --joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
+  $repo/test_wavs/1221-135766-0001.wav
+
+./lstm_transducer_stateless2/streaming-onnx-decode.py \
+  --bpe-model-filename $repo/data/lang_bpe_500/bpe.model \
+  --encoder-model-filename $repo/exp/encoder.onnx \
+  --decoder-model-filename $repo/exp/decoder.onnx \
+  --joiner-model-filename $repo/exp/joiner.onnx \
+  --joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
+  --joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
+  $repo/test_wavs/1221-135766-0002.wav
+
+
+for sym in 1 2 3; do
+  log "Greedy search with --max-sym-per-frame $sym"
+
+  ./lstm_transducer_stateless2/pretrained.py \
+    --method greedy_search \
+    --max-sym-per-frame $sym \
+    --checkpoint $repo/exp/pretrained.pt \
+    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    $repo/test_wavs/1089-134686-0001.wav \
+    $repo/test_wavs/1221-135766-0001.wav \
+    $repo/test_wavs/1221-135766-0002.wav
+done
+
+for method in modified_beam_search beam_search fast_beam_search; do
+  log "$method"
+
+  ./lstm_transducer_stateless2/pretrained.py \
+    --method $method \
+    --beam-size 4 \
+    --checkpoint $repo/exp/pretrained.pt \
+    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    $repo/test_wavs/1089-134686-0001.wav \
+    $repo/test_wavs/1221-135766-0001.wav \
+    $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" ]]; then
+  mkdir -p lstm_transducer_stateless2/exp
+  ln -s $PWD/$repo/exp/pretrained.pt lstm_transducer_stateless2/exp/epoch-999.pt
+  ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+  ls -lh data
+  ls -lh lstm_transducer_stateless2/exp
+
+  log "Decoding test-clean and test-other"
+
+  # use a small value for decoding with CPU
+  max_duration=100
+
+  for method in greedy_search fast_beam_search modified_beam_search; do
+    log "Decoding with $method"
+
+    ./lstm_transducer_stateless2/decode.py \
+      --decoding-method $method \
+      --epoch 999 \
+      --avg 1 \
+      --use-averaged-model 0 \
+      --max-duration $max_duration \
+      --exp-dir lstm_transducer_stateless2/exp
+  done
+
+  rm lstm_transducer_stateless2/exp/*.pt
+fi
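Note: the script above runs under set -e, so the `./src/pnnx || echo "pass"` line is what keeps the deliberately failing smoke test (pnnx with no arguments prints usage and exits non-zero) from aborting the whole run. A minimal sketch of the same pattern:

    set -e
    false || echo "pass"   # the || branch makes the compound command succeed
    echo "still running"   # reached; set -e only aborts on unhandled failures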
@@ -1,5 +1,7 @@
 #!/usr/bin/env bash

+set -e
+
 log() {
   # This function is from espnet
   local fname=${BASH_SOURCE[1]##*/}
@@ -1,5 +1,7 @@
 #!/usr/bin/env bash

+set -e
+
 log() {
   # This function is from espnet
   local fname=${BASH_SOURCE[1]##*/}
@@ -11,10 +13,14 @@ cd egs/librispeech/ASR
 repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless2-2022-04-29

 log "Downloading pre-trained model from $repo_url"
-git lfs install
-git clone $repo_url
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
 repo=$(basename $repo_url)

+pushd $repo
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "exp/pretrained-epoch-38-avg-10.pt"
+popd
+
 log "Display test files"
 tree $repo/
 soxi $repo/test_wavs/*.wav
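Note: the hunk above swaps a full git-lfs clone for a pointer-only clone plus selective pulls, which avoids downloading every large checkpoint in the model repo. The general pattern (paths illustrative, taken from this hunk):

    # Clone only LFS pointer files, then fetch just the blobs the test needs.
    GIT_LFS_SKIP_SMUDGE=1 git clone "$repo_url"
    cd "$(basename "$repo_url")"
    git lfs pull --include "data/lang_bpe_500/bpe.model"
    git lfs pull --include "exp/pretrained-epoch-38-avg-10.pt"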
@@ -1,5 +1,7 @@
 #!/usr/bin/env bash

+set -e
+
 log() {
   # This function is from espnet
   local fname=${BASH_SOURCE[1]##*/}
@@ -11,9 +13,12 @@ cd egs/librispeech/ASR
 repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-04-29

 log "Downloading pre-trained model from $repo_url"
-git lfs install
-git clone $repo_url
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
 repo=$(basename $repo_url)
+pushd $repo
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "exp/pretrained-epoch-25-avg-6.pt"
+popd

 log "Display test files"
 tree $repo/
@@ -1,5 +1,7 @@
 #!/usr/bin/env bash

+set -e
+
 log() {
   # This function is from espnet
   local fname=${BASH_SOURCE[1]##*/}
@@ -22,8 +24,80 @@ ls -lh $repo/test_wavs/*.wav

 pushd $repo/exp
 ln -s pretrained-iter-1224000-avg-14.pt pretrained.pt
+ln -s pretrained-iter-1224000-avg-14.pt epoch-99.pt
 popd

+log "Test exporting to ONNX format"
+
+./pruned_transducer_stateless3/export.py \
+  --exp-dir $repo/exp \
+  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --epoch 99 \
+  --avg 1 \
+  --onnx 1
+
+log "Export to torchscript model"
+./pruned_transducer_stateless3/export.py \
+  --exp-dir $repo/exp \
+  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --epoch 99 \
+  --avg 1 \
+  --jit 1
+
+./pruned_transducer_stateless3/export.py \
+  --exp-dir $repo/exp \
+  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --epoch 99 \
+  --avg 1 \
+  --jit-trace 1
+
+ls -lh $repo/exp/*.onnx
+ls -lh $repo/exp/*.pt
+
+log "Decode with ONNX models"
+
+./pruned_transducer_stateless3/onnx_check.py \
+  --jit-filename $repo/exp/cpu_jit.pt \
+  --onnx-encoder-filename $repo/exp/encoder.onnx \
+  --onnx-decoder-filename $repo/exp/decoder.onnx \
+  --onnx-joiner-filename $repo/exp/joiner.onnx \
+  --onnx-joiner-encoder-proj-filename $repo/exp/joiner_encoder_proj.onnx \
+  --onnx-joiner-decoder-proj-filename $repo/exp/joiner_decoder_proj.onnx
+
+./pruned_transducer_stateless3/onnx_pretrained.py \
+  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --encoder-model-filename $repo/exp/encoder.onnx \
+  --decoder-model-filename $repo/exp/decoder.onnx \
+  --joiner-model-filename $repo/exp/joiner.onnx \
+  --joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
+  --joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
+  $repo/test_wavs/1089-134686-0001.wav \
+  $repo/test_wavs/1221-135766-0001.wav \
+  $repo/test_wavs/1221-135766-0002.wav
+
+log "Decode with models exported by torch.jit.trace()"
+
+./pruned_transducer_stateless3/jit_pretrained.py \
+  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --encoder-model-filename $repo/exp/encoder_jit_trace.pt \
+  --decoder-model-filename $repo/exp/decoder_jit_trace.pt \
+  --joiner-model-filename $repo/exp/joiner_jit_trace.pt \
+  $repo/test_wavs/1089-134686-0001.wav \
+  $repo/test_wavs/1221-135766-0001.wav \
+  $repo/test_wavs/1221-135766-0002.wav
+
+log "Decode with models exported by torch.jit.script()"
+
+./pruned_transducer_stateless3/jit_pretrained.py \
+  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --encoder-model-filename $repo/exp/encoder_jit_script.pt \
+  --decoder-model-filename $repo/exp/decoder_jit_script.pt \
+  --joiner-model-filename $repo/exp/joiner_jit_script.pt \
+  $repo/test_wavs/1089-134686-0001.wav \
+  $repo/test_wavs/1221-135766-0001.wav \
+  $repo/test_wavs/1221-135766-0002.wav
+
+
 for sym in 1 2 3; do
   log "Greedy search with --max-sym-per-frame $sym"

@@ -1,5 +1,7 @@
 #!/usr/bin/env bash

+set -e
+
 log() {
   # This function is from espnet
   local fname=${BASH_SOURCE[1]##*/}
@@ -32,6 +34,12 @@ for sym in 1 2 3; do
     --max-sym-per-frame $sym \
     --checkpoint $repo/exp/pretrained.pt \
     --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --num-encoder-layers 18 \
+    --dim-feedforward 2048 \
+    --nhead 8 \
+    --encoder-dim 512 \
+    --decoder-dim 512 \
+    --joiner-dim 512 \
     $repo/test_wavs/1089-134686-0001.wav \
     $repo/test_wavs/1221-135766-0001.wav \
     $repo/test_wavs/1221-135766-0002.wav
@@ -76,6 +84,7 @@ if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" ==

 ./pruned_transducer_stateless5/decode.py \
   --decoding-method $method \
+  --use-averaged-model 0 \
   --epoch 999 \
   --avg 1 \
   --max-duration $max_duration \
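Note: a sketch of the convention around this hunk, drawn from the scripts elsewhere in this commit (the reason for --use-averaged-model 0 is an assumption; the diff gives no rationale). The downloaded checkpoint is symlinked as epoch-999.pt, so --epoch 999 --avg 1 loads exactly that file, and --use-averaged-model 0 presumably skips the averaged-model code path, which would need additional training checkpoints that the CI run does not have:

    # Pattern used by the run-*.sh scripts in this commit (paths illustrative):
    mkdir -p pruned_transducer_stateless5/exp
    ln -s $PWD/$repo/exp/pretrained.pt pruned_transducer_stateless5/exp/epoch-999.pt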
102  .github/scripts/run-librispeech-streaming-pruned-transducer-stateless2-2022-06-26.sh (vendored, executable file)

@@ -0,0 +1,102 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/pkufool/icefall_librispeech_streaming_pruned_transducer_stateless2_20220625
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+soxi $repo/test_wavs/*.wav
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+ln -s pretrained-epoch-24-avg-10.pt pretrained.pt
+popd
+
+for sym in 1 2 3; do
+  log "Greedy search with --max-sym-per-frame $sym"
+
+  ./pruned_transducer_stateless2/pretrained.py \
+    --method greedy_search \
+    --max-sym-per-frame $sym \
+    --checkpoint $repo/exp/pretrained.pt \
+    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --simulate-streaming 1 \
+    --causal-convolution 1 \
+    $repo/test_wavs/1089-134686-0001.wav \
+    $repo/test_wavs/1221-135766-0001.wav \
+    $repo/test_wavs/1221-135766-0002.wav
+done
+
+for method in modified_beam_search beam_search fast_beam_search; do
+  log "$method"
+
+  ./pruned_transducer_stateless2/pretrained.py \
+    --method $method \
+    --beam-size 4 \
+    --checkpoint $repo/exp/pretrained.pt \
+    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --simulate-streaming 1 \
+    --causal-convolution 1 \
+    $repo/test_wavs/1089-134686-0001.wav \
+    $repo/test_wavs/1221-135766-0001.wav \
+    $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+  mkdir -p pruned_transducer_stateless2/exp
+  ln -s $PWD/$repo/exp/pretrained-epoch-24-avg-10.pt pruned_transducer_stateless2/exp/epoch-999.pt
+  ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+  ls -lh data
+  ls -lh pruned_transducer_stateless2/exp
+
+  log "Decoding test-clean and test-other"
+
+  # use a small value for decoding with CPU
+  max_duration=100
+
+  for method in greedy_search fast_beam_search modified_beam_search; do
+    log "Simulate streaming decoding with $method"
+
+    ./pruned_transducer_stateless2/decode.py \
+      --decoding-method $method \
+      --epoch 999 \
+      --avg 1 \
+      --max-duration $max_duration \
+      --exp-dir pruned_transducer_stateless2/exp \
+      --simulate-streaming 1 \
+      --causal-convolution 1
+  done
+
+  for method in greedy_search fast_beam_search modified_beam_search; do
+    log "Real streaming decoding with $method"
+
+    ./pruned_transducer_stateless2/streaming_decode.py \
+      --decoding-method $method \
+      --epoch 999 \
+      --avg 1 \
+      --num-decode-streams 100 \
+      --exp-dir pruned_transducer_stateless2/exp \
+      --left-context 32 \
+      --decode-chunk-size 8 \
+      --right-context 0
+  done
+
+  rm pruned_transducer_stateless2/exp/*.pt
+fi
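Note: a rough reading of the real-streaming knobs above, under the assumption (not stated in this commit) of 10 ms input frames and 4x encoder subsampling, as in the LibriSpeech conformer recipes:

    # --decode-chunk-size is counted in post-subsampling frames, so each chunk
    # spans about decode_chunk_size * 4 * 10 ms of audio; --left-context 32
    # adds roughly 1.28 s of cached history per chunk.
    decode_chunk_size=8; subsampling=4; frame_ms=10
    echo "chunk span ~ $((decode_chunk_size * subsampling * frame_ms)) ms"   # 320 ms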
@@ -1,5 +1,7 @@
 #!/usr/bin/env bash

+set -e
+
 log() {
   # This function is from espnet
   local fname=${BASH_SOURCE[1]##*/}
@@ -1,5 +1,7 @@
 #!/usr/bin/env bash

+set -e
+
 log() {
   # This function is from espnet
   local fname=${BASH_SOURCE[1]##*/}
@@ -10,7 +12,6 @@ cd egs/librispeech/ASR

 repo_url=https://github.com/csukuangfj/icefall-asr-conformer-ctc-bpe-500
 git lfs install
-git clone $repo

 log "Downloading pre-trained model from $repo_url"
 git clone $repo_url
@@ -1,5 +1,7 @@
 #!/usr/bin/env bash

+set -e
+
 log() {
   # This function is from espnet
   local fname=${BASH_SOURCE[1]##*/}

@@ -1,5 +1,7 @@
 #!/usr/bin/env bash

+set -e
+
 log() {
   # This function is from espnet
   local fname=${BASH_SOURCE[1]##*/}

@@ -1,5 +1,7 @@
 #!/usr/bin/env bash

+set -e
+
 log() {
   # This function is from espnet
   local fname=${BASH_SOURCE[1]##*/}

@@ -1,5 +1,7 @@
 #!/usr/bin/env bash

+set -e
+
 log() {
   # This function is from espnet
   local fname=${BASH_SOURCE[1]##*/}

@@ -1,5 +1,7 @@
 #!/usr/bin/env bash

+set -e
+
 log() {
   # This function is from espnet
   local fname=${BASH_SOURCE[1]##*/}

@@ -1,5 +1,7 @@
 #!/usr/bin/env bash

+set -e
+
 log() {
   # This function is from espnet
   local fname=${BASH_SOURCE[1]##*/}
124  .github/scripts/run-wenetspeech-pruned-transducer-stateless2.sh (vendored, executable file)

@@ -0,0 +1,124 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/wenetspeech/ASR
+
+repo_url=https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+soxi $repo/test_wavs/*.wav
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+ln -s pretrained_epoch_10_avg_2.pt pretrained.pt
+ln -s pretrained_epoch_10_avg_2.pt epoch-99.pt
+popd
+
+log "Test exporting to ONNX format"
+
+./pruned_transducer_stateless2/export.py \
+  --exp-dir $repo/exp \
+  --lang-dir $repo/data/lang_char \
+  --epoch 99 \
+  --avg 1 \
+  --onnx 1
+
+log "Export to torchscript model"
+
+./pruned_transducer_stateless2/export.py \
+  --exp-dir $repo/exp \
+  --lang-dir $repo/data/lang_char \
+  --epoch 99 \
+  --avg 1 \
+  --jit 1
+
+./pruned_transducer_stateless2/export.py \
+  --exp-dir $repo/exp \
+  --lang-dir $repo/data/lang_char \
+  --epoch 99 \
+  --avg 1 \
+  --jit-trace 1
+
+ls -lh $repo/exp/*.onnx
+ls -lh $repo/exp/*.pt
+
+log "Decode with ONNX models"
+
+./pruned_transducer_stateless2/onnx_check.py \
+  --jit-filename $repo/exp/cpu_jit.pt \
+  --onnx-encoder-filename $repo/exp/encoder.onnx \
+  --onnx-decoder-filename $repo/exp/decoder.onnx \
+  --onnx-joiner-filename $repo/exp/joiner.onnx \
+  --onnx-joiner-encoder-proj-filename $repo/exp/joiner_encoder_proj.onnx \
+  --onnx-joiner-decoder-proj-filename $repo/exp/joiner_decoder_proj.onnx
+
+./pruned_transducer_stateless2/onnx_pretrained.py \
+  --tokens $repo/data/lang_char/tokens.txt \
+  --encoder-model-filename $repo/exp/encoder.onnx \
+  --decoder-model-filename $repo/exp/decoder.onnx \
+  --joiner-model-filename $repo/exp/joiner.onnx \
+  --joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
+  --joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
+  $repo/test_wavs/DEV_T0000000000.wav \
+  $repo/test_wavs/DEV_T0000000001.wav \
+  $repo/test_wavs/DEV_T0000000002.wav
+
+log "Decode with models exported by torch.jit.trace()"
+
+./pruned_transducer_stateless2/jit_pretrained.py \
+  --tokens $repo/data/lang_char/tokens.txt \
+  --encoder-model-filename $repo/exp/encoder_jit_trace.pt \
+  --decoder-model-filename $repo/exp/decoder_jit_trace.pt \
+  --joiner-model-filename $repo/exp/joiner_jit_trace.pt \
+  $repo/test_wavs/DEV_T0000000000.wav \
+  $repo/test_wavs/DEV_T0000000001.wav \
+  $repo/test_wavs/DEV_T0000000002.wav
+
+./pruned_transducer_stateless2/jit_pretrained.py \
+  --tokens $repo/data/lang_char/tokens.txt \
+  --encoder-model-filename $repo/exp/encoder_jit_script.pt \
+  --decoder-model-filename $repo/exp/decoder_jit_script.pt \
+  --joiner-model-filename $repo/exp/joiner_jit_script.pt \
+  $repo/test_wavs/DEV_T0000000000.wav \
+  $repo/test_wavs/DEV_T0000000001.wav \
+  $repo/test_wavs/DEV_T0000000002.wav
+
+for sym in 1 2 3; do
+  log "Greedy search with --max-sym-per-frame $sym"
+
+  ./pruned_transducer_stateless2/pretrained.py \
+    --checkpoint $repo/exp/epoch-99.pt \
+    --lang-dir $repo/data/lang_char \
+    --decoding-method greedy_search \
+    --max-sym-per-frame $sym \
+    $repo/test_wavs/DEV_T0000000000.wav \
+    $repo/test_wavs/DEV_T0000000001.wav \
+    $repo/test_wavs/DEV_T0000000002.wav
+done
+
+for method in modified_beam_search beam_search fast_beam_search; do
+  log "$method"
+
+  ./pruned_transducer_stateless2/pretrained.py \
+    --decoding-method $method \
+    --beam-size 4 \
+    --checkpoint $repo/exp/epoch-99.pt \
+    --lang-dir $repo/data/lang_char \
+    $repo/test_wavs/DEV_T0000000000.wav \
+    $repo/test_wavs/DEV_T0000000001.wav \
+    $repo/test_wavs/DEV_T0000000002.wav
+done
65  .github/workflows/build-doc.yml (vendored, normal file)

@@ -0,0 +1,65 @@
+# Copyright 2022 Xiaomi Corp. (author: Fangjun Kuang)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# refer to https://github.com/actions/starter-workflows/pull/47/files
+
+# You can access it at https://k2-fsa.github.io/icefall/
+name: Generate doc
+on:
+  push:
+    branches:
+      - master
+      - doc
+  pull_request:
+    types: [labeled]
+
+jobs:
+  build-doc:
+    if: github.event.label.name == 'doc' || github.event_name == 'push'
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ["3.8"]
+    steps:
+      # refer to https://github.com/actions/checkout
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Display Python version
+        run: python -c "import sys; print(sys.version)"
+
+      - name: Build doc
+        shell: bash
+        run: |
+          cd docs
+          python3 -m pip install -r ./requirements.txt
+          make html
+          touch build/html/.nojekyll
+
+      - name: Deploy
+        uses: peaceiris/actions-gh-pages@v3
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          publish_dir: ./docs/build/html
+          publish_branch: gh-pages
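Note: the Build doc step can be reproduced locally with the same commands the workflow runs; the only addition below is serving the result, which is optional:

    cd docs
    python3 -m pip install -r ./requirements.txt
    make html                       # Sphinx output lands in build/html
    touch build/html/.nojekyll      # tells GitHub Pages not to run Jekyll
    python3 -m http.server -d build/html 8000   # optional local preview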
119  .github/workflows/run-aishell-2022-06-20.yml (vendored, normal file)

@@ -0,0 +1,119 @@
+# Copyright 2022 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-aishell-2022-06-20
+# pruned RNN-T + reworked model with random combiner
+# https://huggingface.co/csukuangfj/icefall-aishell-pruned-transducer-stateless3-2022-06-20
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    types: [labeled]
+
+  schedule:
+    # minute (0-59)
+    # hour (0-23)
+    # day of the month (1-31)
+    # month (1-12)
+    # day of the week (0-6)
+    # nightly build at 15:50 UTC time every day
+    - cron: "50 15 * * *"
+
+jobs:
+  run_aishell_2022_06_20:
+    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-18.04]
+        python-version: [3.7, 3.8, 3.9]
+
+      fail-fast: false
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: 'pip'
+          cache-dependency-path: '**/requirements-ci.txt'
+
+      - name: Install Python dependencies
+        run: |
+          grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf
+
+      - name: Cache kaldifeat
+        id: my-cache
+        uses: actions/cache@v2
+        with:
+          path: |
+            ~/tmp/kaldifeat
+          key: cache-tmp-${{ matrix.python-version }}-2022-09-25
+
+      - name: Install kaldifeat
+        if: steps.my-cache.outputs.cache-hit != 'true'
+        shell: bash
+        run: |
+          .github/scripts/install-kaldifeat.sh
+
+      - name: Inference with pre-trained model
+        shell: bash
+        env:
+          GITHUB_EVENT_NAME: ${{ github.event_name }}
+          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+        run: |
+          sudo apt-get -qq install git-lfs tree sox
+          export PYTHONPATH=$PWD:$PYTHONPATH
+          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+          .github/scripts/run-aishell-pruned-transducer-stateless3-2022-06-20.sh
+
+      - name: Display decoding results for aishell pruned_transducer_stateless3
+        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+        shell: bash
+        run: |
+          cd egs/aishell/ASR/
+          tree ./pruned_transducer_stateless3/exp
+
+          cd pruned_transducer_stateless3
+          echo "results for pruned_transducer_stateless3"
+          echo "===greedy search==="
+          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test" {} + | sort -n -k2
+          find exp/greedy_search -name "log-*" -exec grep -n --color "best for dev" {} + | sort -n -k2
+
+          echo "===fast_beam_search==="
+          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test" {} + | sort -n -k2
+          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for dev" {} + | sort -n -k2
+
+          echo "===modified beam search==="
+          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test" {} + | sort -n -k2
+          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for dev" {} + | sort -n -k2
+
+      - name: Upload decoding results for aishell pruned_transducer_stateless3
+        uses: actions/upload-artifact@v2
+        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+        with:
+          name: aishell-torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless3-2022-06-20
+          path: egs/aishell/ASR/pruned_transducer_stateless3/exp/
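Note: the "Display decoding results" step relies on decode.py writing a summary line per run (the "best for test" / "best for dev" patterns; the exact log format is assumed from the grep patterns themselves). The pipeline gathers those lines from every log file and orders the combined output:

    # Run inside egs/aishell/ASR/pruned_transducer_stateless3 after decoding.
    find exp/greedy_search -name "log-*" \
      -exec grep -n --color "best for test" {} + | sort -n -k2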
@@ -59,6 +59,8 @@ jobs:
       - name: Install Python dependencies
         run: |
           grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf

       - name: Cache kaldifeat
         id: my-cache
@@ -66,7 +68,7 @@ jobs:
         with:
           path: |
             ~/tmp/kaldifeat
-          key: cache-tmp-${{ matrix.python-version }}
+          key: cache-tmp-${{ matrix.python-version }}-2022-09-25

       - name: Install kaldifeat
         if: steps.my-cache.outputs.cache-hit != 'true'
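Note: the protobuf reinstall added above forces a from-source build of the Python package; the likely intent (an assumption, the diff gives no rationale) is to avoid mismatches between the prebuilt wheel and other compiled dependencies in CI:

    pip uninstall -y protobuf
    pip install --no-binary protobuf protobuf   # build from sdist, not a wheel
    python -c "import google.protobuf as pb; print(pb.__version__)"  # sanity check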
@@ -59,6 +59,8 @@ jobs:
       - name: Install Python dependencies
         run: |
           grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf

       - name: Cache kaldifeat
         id: my-cache
@@ -66,7 +68,7 @@ jobs:
         with:
           path: |
             ~/tmp/kaldifeat
-          key: cache-tmp-${{ matrix.python-version }}
+          key: cache-tmp-${{ matrix.python-version }}-2022-09-25

       - name: Install kaldifeat
         if: steps.my-cache.outputs.cache-hit != 'true'
@@ -99,7 +101,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2

       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
@@ -59,6 +59,8 @@ jobs:
       - name: Install Python dependencies
         run: |
           grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf

       - name: Cache kaldifeat
         id: my-cache
@@ -66,7 +68,7 @@ jobs:
         with:
           path: |
             ~/tmp/kaldifeat
-          key: cache-tmp-${{ matrix.python-version }}
+          key: cache-tmp-${{ matrix.python-version }}-2022-09-25

       - name: Install kaldifeat
         if: steps.my-cache.outputs.cache-hit != 'true'
@@ -99,7 +101,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2

       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
@@ -59,6 +59,8 @@ jobs:
       - name: Install Python dependencies
         run: |
           grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf

       - name: Cache kaldifeat
         id: my-cache
@@ -66,7 +68,7 @@ jobs:
         with:
           path: |
             ~/tmp/kaldifeat
-          key: cache-tmp-${{ matrix.python-version }}
+          key: cache-tmp-${{ matrix.python-version }}-2022-09-25

       - name: Install kaldifeat
         if: steps.my-cache.outputs.cache-hit != 'true'
@@ -99,7 +101,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2

       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
136  .github/workflows/run-librispeech-lstm-transducer-stateless2-2022-09-03.yml (vendored, normal file)

@@ -0,0 +1,136 @@
+name: run-librispeech-lstm-transducer2-2022-09-03
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    types: [labeled]
+
+  schedule:
+    # minute (0-59)
+    # hour (0-23)
+    # day of the month (1-31)
+    # month (1-12)
+    # day of the week (0-6)
+    # nightly build at 15:50 UTC time every day
+    - cron: "50 15 * * *"
+
+jobs:
+  run_librispeech_lstm_transducer_stateless2_2022_09_03:
+    if: github.event.label.name == 'ready' || github.event.label.name == 'ncnn' || github.event.label.name == 'onnx' || github.event_name == 'push' || github.event_name == 'schedule'
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-18.04]
+        python-version: [3.8]
+
+      fail-fast: false
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: 'pip'
+          cache-dependency-path: '**/requirements-ci.txt'
+
+      - name: Install Python dependencies
+        run: |
+          grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf
+
+      - name: Cache kaldifeat
+        id: my-cache
+        uses: actions/cache@v2
+        with:
+          path: |
+            ~/tmp/kaldifeat
+          key: cache-tmp-${{ matrix.python-version }}-2022-09-25
+
+      - name: Install kaldifeat
+        if: steps.my-cache.outputs.cache-hit != 'true'
+        shell: bash
+        run: |
+          .github/scripts/install-kaldifeat.sh
+
+      - name: Cache LibriSpeech test-clean and test-other datasets
+        id: libri-test-clean-and-test-other-data
+        uses: actions/cache@v2
+        with:
+          path: |
+            ~/tmp/download
+          key: cache-libri-test-clean-and-test-other
+
+      - name: Download LibriSpeech test-clean and test-other
+        if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+        shell: bash
+        run: |
+          .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+      - name: Prepare manifests for LibriSpeech test-clean and test-other
+        shell: bash
+        run: |
+          .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+      - name: Cache LibriSpeech test-clean and test-other fbank features
+        id: libri-test-clean-and-test-other-fbank
+        uses: actions/cache@v2
+        with:
+          path: |
+            ~/tmp/fbank-libri
+          key: cache-libri-fbank-test-clean-and-test-other-v2
+
+      - name: Compute fbank for LibriSpeech test-clean and test-other
+        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+        shell: bash
+        run: |
+          .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
+      - name: Inference with pre-trained model
+        shell: bash
+        env:
+          GITHUB_EVENT_NAME: ${{ github.event_name }}
+          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+        run: |
+          mkdir -p egs/librispeech/ASR/data
+          ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+          ls -lh egs/librispeech/ASR/data/*
+
+          sudo apt-get -qq install git-lfs tree sox
+          export PYTHONPATH=$PWD:$PYTHONPATH
+          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+          .github/scripts/run-librispeech-lstm-transducer-stateless2-2022-09-03.yml
+
+      - name: Display decoding results for lstm_transducer_stateless2
+        if: github.event_name == 'schedule'
+        shell: bash
+        run: |
+          cd egs/librispeech/ASR
+          tree lstm_transducer_stateless2/exp
+          cd lstm_transducer_stateless2/exp
+          echo "===greedy search==="
+          find greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+          find greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+          echo "===fast_beam_search==="
+          find fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+          find fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+          echo "===modified beam search==="
+          find modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+          find modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+      - name: Upload decoding results for lstm_transducer_stateless2
+        uses: actions/upload-artifact@v2
+        if: github.event_name == 'schedule'
+        with:
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-lstm_transducer_stateless2-2022-09-03
+          path: egs/librispeech/ASR/lstm_transducer_stateless2/exp/
@@ -35,7 +35,7 @@ on:

 jobs:
   run_librispeech_pruned_transducer_stateless3_2022_05_13:
-    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+    if: github.event.label.name == 'onnx' || github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
@@ -59,6 +59,8 @@ jobs:
       - name: Install Python dependencies
         run: |
           grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf

       - name: Cache kaldifeat
         id: my-cache
@@ -66,7 +68,7 @@ jobs:
         with:
           path: |
             ~/tmp/kaldifeat
-          key: cache-tmp-${{ matrix.python-version }}
+          key: cache-tmp-${{ matrix.python-version }}-2022-09-25

       - name: Install kaldifeat
         if: steps.my-cache.outputs.cache-hit != 'true'
@@ -99,7 +101,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2

       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
155  .github/workflows/run-librispeech-streaming-transducer-stateless2-2022-06-26.yml  (vendored, new file)
@@ -0,0 +1,155 @@
# Copyright      2021  Fangjun Kuang (csukuangfj@gmail.com)

# See ../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: run-librispeech-streaming-2022-06-26
# streaming conformer stateless transducer2

on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]

  schedule:
    # minute (0-59)
    # hour (0-23)
    # day of the month (1-31)
    # month (1-12)
    # day of the week (0-6)
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"

jobs:
  run_librispeech_streaming_2022_06_26:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]

      fail-fast: false

    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0

      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'

      - name: Install Python dependencies
        run: |
          grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf

      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}-2022-09-25

      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh

      - name: Cache LibriSpeech test-clean and test-other datasets
        id: libri-test-clean-and-test-other-data
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/download
          key: cache-libri-test-clean-and-test-other

      - name: Download LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh

      - name: Prepare manifests for LibriSpeech test-clean and test-other
        shell: bash
        run: |
          .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh

      - name: Cache LibriSpeech test-clean and test-other fbank features
        id: libri-test-clean-and-test-other-fbank
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/fbank-libri
          key: cache-libri-fbank-test-clean-and-test-other-v2

      - name: Compute fbank for LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh

      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          mkdir -p egs/librispeech/ASR/data
          ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
          ls -lh egs/librispeech/ASR/data/*

          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

          .github/scripts/run-librispeech-streaming-pruned-transducer-stateless2-2022-06-26.sh

      - name: Display decoding results
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          cd egs/librispeech/ASR/
          tree ./pruned_transducer_stateless2/exp

          cd pruned_transducer_stateless2
          echo "results for pruned_transducer_stateless2"
          echo "===greedy search==="
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2

          echo "===fast_beam_search==="
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2

          echo "===modified_beam_search==="
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2

      - name: Upload decoding results for pruned_transducer_stateless2
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless2-2022-06-26
          path: egs/librispeech/ASR/pruned_transducer_stateless2/exp/
@@ -59,6 +59,8 @@ jobs:
       - name: Install Python dependencies
         run: |
           grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf

       - name: Cache kaldifeat
         id: my-cache
@@ -66,7 +68,7 @@ jobs:
         with:
           path: |
             ~/tmp/kaldifeat
-          key: cache-tmp-${{ matrix.python-version }}
+          key: cache-tmp-${{ matrix.python-version }}-2022-09-25

       - name: Install kaldifeat
         if: steps.my-cache.outputs.cache-hit != 'true'
@@ -99,7 +101,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2

       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

@@ -58,7 +58,7 @@ jobs:
         with:
           path: |
             ~/tmp/kaldifeat
-          key: cache-tmp-${{ matrix.python-version }}
+          key: cache-tmp-${{ matrix.python-version }}-2022-09-25

       - name: Install kaldifeat
         if: steps.my-cache.outputs.cache-hit != 'true'

@@ -58,6 +58,8 @@ jobs:
       - name: Install Python dependencies
         run: |
           grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf

       - name: Cache kaldifeat
         id: my-cache
@@ -65,7 +67,7 @@ jobs:
         with:
           path: |
             ~/tmp/kaldifeat
-          key: cache-tmp-${{ matrix.python-version }}
+          key: cache-tmp-${{ matrix.python-version }}-2022-09-25

       - name: Install kaldifeat
         if: steps.my-cache.outputs.cache-hit != 'true'
@@ -98,7 +100,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2

       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

@@ -58,6 +58,8 @@ jobs:
       - name: Install Python dependencies
         run: |
           grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf

       - name: Cache kaldifeat
         id: my-cache
@@ -65,7 +67,7 @@ jobs:
         with:
           path: |
             ~/tmp/kaldifeat
-          key: cache-tmp-${{ matrix.python-version }}
+          key: cache-tmp-${{ matrix.python-version }}-2022-09-25

       - name: Install kaldifeat
         if: steps.my-cache.outputs.cache-hit != 'true'
@@ -98,7 +100,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2

       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

@@ -58,7 +58,7 @@ jobs:
         with:
           path: |
             ~/tmp/kaldifeat
-          key: cache-tmp-${{ matrix.python-version }}
+          key: cache-tmp-${{ matrix.python-version }}-2022-09-25

       - name: Install kaldifeat
         if: steps.my-cache.outputs.cache-hit != 'true'

@@ -58,7 +58,7 @@ jobs:
         with:
           path: |
             ~/tmp/kaldifeat
-          key: cache-tmp-${{ matrix.python-version }}
+          key: cache-tmp-${{ matrix.python-version }}-2022-09-25

       - name: Install kaldifeat
         if: steps.my-cache.outputs.cache-hit != 'true'

@@ -58,6 +58,8 @@ jobs:
       - name: Install Python dependencies
         run: |
           grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf

       - name: Cache kaldifeat
         id: my-cache
@@ -65,7 +67,7 @@ jobs:
         with:
           path: |
             ~/tmp/kaldifeat
-          key: cache-tmp-${{ matrix.python-version }}
+          key: cache-tmp-${{ matrix.python-version }}-2022-09-25

       - name: Install kaldifeat
         if: steps.my-cache.outputs.cache-hit != 'true'
@@ -98,7 +100,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2

       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

@@ -58,7 +58,7 @@ jobs:
         with:
           path: |
             ~/tmp/kaldifeat
-          key: cache-tmp-${{ matrix.python-version }}
+          key: cache-tmp-${{ matrix.python-version }}-2022-09-25

       - name: Install kaldifeat
         if: steps.my-cache.outputs.cache-hit != 'true'
80  .github/workflows/run-wenetspeech-pruned-transducer-stateless2.yml  (vendored, new file)
@@ -0,0 +1,80 @@
# Copyright      2021  Fangjun Kuang (csukuangfj@gmail.com)

# See ../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: run-wenetspeech-pruned-transducer-stateless2

on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]

jobs:
  run_librispeech_pruned_transducer_stateless3_2022_05_13:
    if: github.event.label.name == 'onnx' || github.event.label.name == 'ready' || github.event_name == 'push' || github.event.label.name == 'wenetspeech'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.8]

      fail-fast: false

    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0

      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'

      - name: Install Python dependencies
        run: |
          grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf

      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}-2022-09-25

      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh

      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

          .github/scripts/run-wenetspeech-pruned-transducer-stateless2.sh
4  .github/workflows/style_check.yml  (vendored)
@@ -29,8 +29,8 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-18.04, macos-10.15]
+        os: [ubuntu-latest]
-        python-version: [3.7, 3.9]
+        python-version: [3.8]
       fail-fast: false

     steps:
12  .github/workflows/test.yml  (vendored)
@@ -33,13 +33,13 @@ jobs:
         # disable macOS test for now.
         os: [ubuntu-18.04]
         python-version: [3.7, 3.8]
-        torch: ["1.8.0", "1.10.0"]
+        torch: ["1.8.0", "1.11.0"]
-        torchaudio: ["0.8.0", "0.10.0"]
+        torchaudio: ["0.8.0", "0.11.0"]
-        k2-version: ["1.9.dev20211101"]
+        k2-version: ["1.15.1.dev20220427"]
         exclude:
           - torch: "1.8.0"
-            torchaudio: "0.10.0"
+            torchaudio: "0.11.0"
-          - torch: "1.10.0"
+          - torch: "1.11.0"
             torchaudio: "0.8.0"

       fail-fast: false
@@ -67,7 +67,7 @@ jobs:
         # numpy 1.20.x does not support python 3.6
         pip install numpy==1.19
         pip install torch==${{ matrix.torch }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
-        if [[ ${{ matrix.torchaudio }} == "0.10.0" ]]; then
+        if [[ ${{ matrix.torchaudio }} == "0.11.0" ]]; then
          pip install torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
        else
          pip install torchaudio==${{ matrix.torchaudio }}
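For one concrete matrix combination (torch 1.11.0 with torchaudio 0.11.0), the install sequence above resolves to the following; this is a local-reproduction sketch with the matrix variables substituted by hand, not part of the workflow file:

```bash
# Sketch: the CI install sequence for the matrix entry
# torch=1.11.0 / torchaudio=0.11.0, with variables substituted.
pip install numpy==1.19
pip install torch==1.11.0+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
# torchaudio 0.11.0 has a +cpu wheel, so the first branch of the conditional applies:
pip install torchaudio==0.11.0+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
```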
2  .gitignore  (vendored)
@@ -11,3 +11,5 @@ log
 *.bak
 *-bak
 *bak.py
+*.param
+*.bin
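To confirm the new patterns behave as intended, `git check-ignore` can be used from inside the repository; a quick sketch with hypothetical file names:

```bash
# Sketch: verify that *.param and *.bin are now ignored.
# The file names below are hypothetical examples.
touch model.param model.bin
git check-ignore -v model.param model.bin   # prints the matching .gitignore rules
rm model.param model.bin
```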
66  README.md
@@ -2,6 +2,18 @@
 <img src="https://raw.githubusercontent.com/k2-fsa/icefall/master/docs/source/_static/logo.png" width=168>
 </div>

+## Introduction
+
+icefall contains ASR recipes for various datasets
+using <https://github.com/k2-fsa/k2>.
+
+You can use <https://github.com/k2-fsa/sherpa> to deploy models
+trained with icefall.
+
+You can try pre-trained models from within your browser without the need
+to download or install anything by visiting <https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition>.
+See <https://k2-fsa.github.io/icefall/huggingface/spaces.html> for more details.
+
 ## Installation

 Please refer to <https://icefall.readthedocs.io/en/latest/installation/index.html>
@@ -23,6 +35,8 @@ We provide the following recipes:
 - [Aidatatang_200zh][aidatatang_200zh]
 - [WenetSpeech][wenetspeech]
 - [Alimeeting][alimeeting]
+- [Aishell4][aishell4]
+- [TAL_CSASR][tal_csasr]

 ### yesno

@@ -236,17 +250,25 @@ We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless mod
 ### WenetSpeech

-We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][WenetSpeech_pruned_transducer_stateless2].
+We provide some models for this recipe: [Pruned stateless RNN-T_2: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][WenetSpeech_pruned_transducer_stateless2] and [Pruned stateless RNN-T_5: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][WenetSpeech_pruned_transducer_stateless5].

-#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss (trained with L subset)
+#### Pruned stateless RNN-T_2: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss (trained with L subset, offline ASR)

 |                      | Dev   | Test-Net | Test-Meeting |
 |----------------------|-------|----------|--------------|
 | greedy search        | 7.80  | 8.75     | 13.49        |
+| modified beam search | 7.76  | 8.71     | 13.41        |
 | fast beam search     | 7.94  | 8.74     | 13.80        |
-| modified beam search | 7.76  | 8.71     | 13.41        |

-We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [](https://colab.research.google.com/drive/1EV4e1CHa1GZgEF-bZgizqI9RyFFehIiN?usp=sharing)
+#### Pruned stateless RNN-T_5: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss (trained with L subset)
+
+**Streaming**:
+
+|                      | Dev   | Test-Net | Test-Meeting |
+|----------------------|-------|----------|--------------|
+| greedy_search        | 8.78  | 10.12    | 16.16        |
+| modified_beam_search | 8.53  | 9.95     | 15.81        |
+| fast_beam_search     | 9.01  | 10.47    | 16.28        |
+
+We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless2 model: [](https://colab.research.google.com/drive/1EV4e1CHa1GZgEF-bZgizqI9RyFFehIiN?usp=sharing)

 ### Alimeeting

@@ -262,6 +284,36 @@ We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder
 We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [](https://colab.research.google.com/drive/1tKr3f0mL17uO_ljdHGKtR7HOmthYHwJG?usp=sharing)

+### Aishell4
+
+We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][Aishell4_pruned_transducer_stateless5].
+
+#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss (trained with all subsets)
+
+The best CER(%) results:
+
+|                      | test  |
+|----------------------|-------|
+| greedy search        | 29.89 |
+| fast beam search     | 28.91 |
+| modified beam search | 29.08 |
+
+We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [](https://colab.research.google.com/drive/1z3lkURVv9M7uTiIgf3Np9IntMHEknaks?usp=sharing)
+
+### TAL_CSASR
+
+We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][TAL_CSASR_pruned_transducer_stateless5].
+
+#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss
+
+The best results for Chinese CER(%) and English WER(%) respectively (zh: Chinese, en: English):
+
+| decoding-method      | dev  | dev_zh | dev_en | test | test_zh | test_en |
+|----------------------|------|--------|--------|------|---------|---------|
+| greedy_search        | 7.30 | 6.48   | 19.19  | 7.39 | 6.66    | 19.13   |
+| modified_beam_search | 7.15 | 6.35   | 18.95  | 7.22 | 6.50    | 18.70   |
+| fast_beam_search     | 7.18 | 6.39   | 18.90  | 7.27 | 6.55    | 18.77   |
+
+We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [](https://colab.research.google.com/drive/1DmIx-NloI1CMU5GdZrlse7TRu4y3Dpf8?usp=sharing)
+
 ## Deployment with C++

 Once you have trained a model in icefall, you may want to deploy it with C++,
@@ -289,7 +341,10 @@ Please see: [
108  docker/README.md
@@ -1,24 +1,114 @@
 # icefall dockerfile

-We provide a dockerfile for some users, the configuration of dockerfile is: Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8-python3.8. You can use the dockerfile by following the steps:
+Two sets of configuration are provided: (a) Ubuntu18.04-pytorch1.12.1-cuda11.3-cudnn8, and (b) Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8.
+
+If your NVIDIA driver supports CUDA version 11.3, please go for case (a) Ubuntu18.04-pytorch1.12.1-cuda11.3-cudnn8.
+
+Otherwise, since the older PyTorch images are not updated with the [apt-key rotation by NVIDIA](https://developer.nvidia.com/blog/updating-the-cuda-linux-gpg-repository-key), you have to go for case (b) Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8. Ensure that your NVIDIA driver supports at least CUDA 11.0.
+
+You can check the highest CUDA version your NVIDIA driver supports with the `nvidia-smi` command below. In this example, the highest CUDA version is 11.0, i.e. case (b) Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8.

-## Building images locally
 ```bash
-cd docker/Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8
-docker build -t icefall/pytorch1.7.1:latest -f ./Dockerfile ./
+$ nvidia-smi
+Tue Sep 20 00:26:13 2022
++-----------------------------------------------------------------------------+
+| NVIDIA-SMI 450.119.03   Driver Version: 450.119.03   CUDA Version: 11.0     |
+|-------------------------------+----------------------+----------------------+
+| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
+|                               |                      |               MIG M. |
+|===============================+======================+======================|
+|   0  TITAN RTX           On   | 00000000:03:00.0 Off |                  N/A |
+| 41%   31C    P8     4W / 280W |     16MiB / 24219MiB |      0%      Default |
+|                               |                      |                  N/A |
++-------------------------------+----------------------+----------------------+
+|   1  TITAN RTX           On   | 00000000:04:00.0 Off |                  N/A |
+| 41%   30C    P8    11W / 280W |      6MiB / 24220MiB |      0%      Default |
+|                               |                      |                  N/A |
++-------------------------------+----------------------+----------------------+
+
++-----------------------------------------------------------------------------+
+| Processes:                                                                  |
+|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
+|        ID   ID                                                   Usage      |
+|=============================================================================|
+|    0   N/A  N/A      2085      G   /usr/lib/xorg/Xorg                  9MiB |
+|    0   N/A  N/A      2240      G   /usr/bin/gnome-shell                4MiB |
+|    1   N/A  N/A      2085      G   /usr/lib/xorg/Xorg                  4MiB |
++-----------------------------------------------------------------------------+
 ```

-## Using built images
+## Building images locally

-Sample usage of the GPU based images:
+If your environment requires a proxy to access the Internet, remember to add that information to the Dockerfile directly.
+For most cases, you can uncomment these lines in the Dockerfile and add in your proxy details.
+
+```dockerfile
+ENV http_proxy=http://aaa.bb.cc.net:8080 \
+    https_proxy=http://aaa.bb.cc.net:8080
+```
+
+Then, proceed with these commands.
+
+### If you are case (a), i.e. your NVIDIA driver supports CUDA version >= 11.3:
+
+```bash
+cd docker/Ubuntu18.04-pytorch1.12.1-cuda11.3-cudnn8
+docker build -t icefall/pytorch1.12.1 .
+```
+
+### If you are case (b), i.e. your NVIDIA driver can only support CUDA versions 11.0 <= x < 11.3:
+
+```bash
+cd docker/Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8
+docker build -t icefall/pytorch1.7.1 .
+```
+
+## Running your built local image
+
+Sample usage of the GPU based images. These commands are written with case (a) in mind, so please make the necessary changes to your image name if you are case (b).

 Note: use [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) to run the GPU images.

 ```bash
-docker run -it --runtime=nvidia --name=icefall_username --gpus all icefall/pytorch1.7.1:latest
+docker run -it --runtime=nvidia --shm-size=2gb --name=icefall --gpus all icefall/pytorch1.12.1
 ```

-Sample usage of the CPU based images:
+### Tips:
+
+1. Since your data and models most probably won't be inside the docker container, you must use the -v flag to access the host machine. Do this by specifying `-v {/path/in/host/machine}:{/path/in/docker}`.
+
+2. Also, if your environment requires a proxy, this would be a good time to add it in too: `-e http_proxy=http://aaa.bb.cc.net:8080 -e https_proxy=http://aaa.bb.cc.net:8080`.
+
+Overall, your docker run command should look like this.

 ```bash
-docker run -it icefall/pytorch1.7.1:latest /bin/bash
+docker run -it --runtime=nvidia --shm-size=2gb --name=icefall --gpus all -v {/path/in/host/machine}:{/path/in/docker} -e http_proxy=http://aaa.bb.cc.net:8080 -e https_proxy=http://aaa.bb.cc.net:8080 icefall/pytorch1.12.1
+```
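After starting the container as above, it is worth confirming that the GPUs are actually visible inside it. A minimal sketch, assuming the container is named `icefall` as in the command above and that `nvidia-smi` is available in the image:

```bash
# Sanity check (sketch): the GPUs listed on the host should also appear here.
docker exec -it icefall nvidia-smi
```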
+You can explore more docker run options [here](https://docs.docker.com/engine/reference/commandline/run/) to suit your environment.
+
+### Linking to icefall in your host machine
+
+If you already have icefall downloaded onto your host machine, you can use that repository instead so that changes in your code are visible inside and outside of the container.
+
+Note: Remember to set the -v flag above during the first run of the container, as that is the only way for your container to access your host machine.
+
+Warning: Check that the icefall in your host machine is visible from within your container before proceeding to the commands below.
+
+Use these commands once you are inside the container.
+
+```bash
+rm -r /workspace/icefall
+ln -s {/path/in/docker/to/icefall} /workspace/icefall
+```
+
+## Starting another session in the same running container.
+
+```bash
+docker exec -it icefall /bin/bash
+```
+
+## Restarting a killed container that has been run before.
+
+```bash
+docker start -ai icefall
+```
+
+## Sample usage of the CPU based images:
+
+```bash
+docker run -it icefall /bin/bash
 ```
72  docker/Ubuntu18.04-pytorch1.12.1-cuda11.3-cudnn8/Dockerfile  (new file)
@@ -0,0 +1,72 @@
FROM pytorch/pytorch:1.12.1-cuda11.3-cudnn8-devel

# ENV http_proxy=http://aaa.bbb.cc.net:8080 \
#     https_proxy=http://aaa.bbb.cc.net:8080

# install normal source
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        g++ \
        make \
        automake \
        autoconf \
        bzip2 \
        unzip \
        wget \
        sox \
        libtool \
        git \
        subversion \
        zlib1g-dev \
        gfortran \
        ca-certificates \
        patch \
        ffmpeg \
        valgrind \
        libssl-dev \
        vim \
        curl

# cmake
RUN wget -P /opt https://cmake.org/files/v3.18/cmake-3.18.0.tar.gz && \
    cd /opt && \
    tar -zxvf cmake-3.18.0.tar.gz && \
    cd cmake-3.18.0 && \
    ./bootstrap && \
    make && \
    make install && \
    rm -rf cmake-3.18.0.tar.gz && \
    find /opt/cmake-3.18.0 -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \
    cd -

# flac
RUN wget -P /opt https://downloads.xiph.org/releases/flac/flac-1.3.2.tar.xz && \
    cd /opt && \
    xz -d flac-1.3.2.tar.xz && \
    tar -xvf flac-1.3.2.tar && \
    cd flac-1.3.2 && \
    ./configure && \
    make && make install && \
    rm -rf flac-1.3.2.tar && \
    find /opt/flac-1.3.2 -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \
    cd -

RUN pip install kaldiio graphviz && \
    conda install -y -c pytorch torchaudio

# install k2 from source
RUN git clone https://github.com/k2-fsa/k2.git /opt/k2 && \
    cd /opt/k2 && \
    python3 setup.py install && \
    cd -

# install lhotse
RUN pip install git+https://github.com/lhotse-speech/lhotse

RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
    cd /workspace/icefall && \
    pip install -r requirements.txt

ENV PYTHONPATH /workspace/icefall:$PYTHONPATH

WORKDIR /workspace/icefall
@@ -1,7 +1,13 @@
 FROM pytorch/pytorch:1.7.1-cuda11.0-cudnn8-devel

-# install normal source
+# ENV http_proxy=http://aaa.bbb.cc.net:8080 \
+#     https_proxy=http://aaa.bbb.cc.net:8080
+
+RUN rm /etc/apt/sources.list.d/cuda.list && \
+    rm /etc/apt/sources.list.d/nvidia-ml.list && \
+    apt-key del 7fa2af80
+
+# install normal source
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
         g++ \
@@ -21,20 +27,25 @@ RUN apt-get update && \
         patch \
         ffmpeg \
         valgrind \
         libssl-dev \
-        vim && \
-    rm -rf /var/lib/apt/lists/*
+        vim \
+        curl

+# Add new keys and reupdate
-RUN mv /opt/conda/lib/libcufft.so.10 /opt/libcufft.so.10.bak && \
+RUN curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub | apt-key add - && \
+    curl -fsSL https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub | apt-key add - && \
+    echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \
+    echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \
+    rm -rf /var/lib/apt/lists/* && \
+    mv /opt/conda/lib/libcufft.so.10 /opt/libcufft.so.10.bak && \
     mv /opt/conda/lib/libcurand.so.10 /opt/libcurand.so.10.bak && \
     mv /opt/conda/lib/libcublas.so.11 /opt/libcublas.so.11.bak && \
     mv /opt/conda/lib/libnvrtc.so.11.0 /opt/libnvrtc.so.11.1.bak && \
-    mv /opt/conda/lib/libnvToolsExt.so.1 /opt/libnvToolsExt.so.1.bak && \
+    # mv /opt/conda/lib/libnvToolsExt.so.1 /opt/libnvToolsExt.so.1.bak && \
-    mv /opt/conda/lib/libcudart.so.11.0 /opt/libcudart.so.11.0.bak
+    mv /opt/conda/lib/libcudart.so.11.0 /opt/libcudart.so.11.0.bak && \
+    apt-get update && apt-get -y upgrade

 # cmake
 RUN wget -P /opt https://cmake.org/files/v3.18/cmake-3.18.0.tar.gz && \
     cd /opt && \
     tar -zxvf cmake-3.18.0.tar.gz && \
@@ -46,10 +57,6 @@ RUN wget -P /opt https://cmake.org/files/v3.18/cmake-3.18.0.tar.gz && \
     find /opt/cmake-3.18.0 -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \
     cd -

-#kaldiio
-RUN pip install kaldiio
-
 # flac
 RUN wget -P /opt https://downloads.xiph.org/releases/flac/flac-1.3.2.tar.xz && \
     cd /opt && \
@@ -62,15 +69,8 @@ RUN wget -P /opt https://downloads.xiph.org/releases/flac/flac-1.3.2.tar.xz && \
     find /opt/flac-1.3.2 -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \
     cd -

-# graphviz
-RUN pip install graphviz
+RUN pip install kaldiio graphviz && \
+    conda install -y -c pytorch torchaudio=0.7.1

-# kaldifeat
-RUN git clone https://github.com/csukuangfj/kaldifeat.git /opt/kaldifeat && \
-    cd /opt/kaldifeat && \
-    python setup.py install && \
-    cd -
-
 # install k2 from source
 RUN git clone https://github.com/k2-fsa/k2.git /opt/k2 && \
@@ -80,12 +80,12 @@ RUN git clone https://github.com/k2-fsa/k2.git /opt/k2 && \

 # install lhotse
 RUN pip install git+https://github.com/lhotse-speech/lhotse
-#RUN pip install lhotse

-# install icefall
-RUN git clone https://github.com/k2-fsa/icefall && \
-    cd icefall && \
-    pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+    cd /workspace/icefall && \
+    pip install -r requirements.txt

 ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
@@ -1,2 +1,3 @@
 sphinx_rtd_theme
 sphinx
+sphinxcontrib-youtube==1.1.0
@@ -32,8 +32,9 @@ release = "0.1"
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
-    "sphinx_rtd_theme",
     "sphinx.ext.todo",
+    "sphinx_rtd_theme",
+    "sphinxcontrib.youtube",
 ]

 # Add any paths that contain templates here, relative to this directory.
@@ -73,7 +74,7 @@ html_context = {
     "github_user": "k2-fsa",
     "github_repo": "icefall",
     "github_version": "master",
-    "conf_py_path": "/icefall/docs/source/",
+    "conf_py_path": "/docs/source/",
 }

 todo_include_todos = True
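With `sphinxcontrib-youtube` added to both the requirements and the extension list, the documentation can be rebuilt locally to check that the embedded videos render. A minimal sketch, assuming the standard Sphinx `make html` target exists under `docs/`:

```bash
# Sketch: rebuild the docs locally with the new extension enabled.
pip install -r docs/requirements.txt
cd docs
make html   # the output directory depends on the Sphinx Makefile configuration
```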
13  docs/source/huggingface/index.rst  (new file)
@@ -0,0 +1,13 @@
Huggingface
===========

This section describes how to find pre-trained models.
It also demonstrates how to try them from within your browser
without installing anything by using
`Huggingface spaces <https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition>`_.

.. toctree::
   :maxdepth: 2

   pretrained-models
   spaces
BIN  docs/source/huggingface/pic/hugging-face-sherpa-2.png  (new file, 455 KiB; binary file not shown)
BIN  docs/source/huggingface/pic/hugging-face-sherpa-3.png  (new file, 392 KiB; binary file not shown)
BIN  docs/source/huggingface/pic/hugging-face-sherpa.png  (new file, 426 KiB; binary file not shown)
17  docs/source/huggingface/pretrained-models.rst  (new file)
@@ -0,0 +1,17 @@
Pre-trained models
==================

We have uploaded pre-trained models for all recipes in ``icefall``
to `<https://huggingface.co/>`_.

You can find them by visiting the following link:

`<https://huggingface.co/models?search=icefall>`_.

You can also find links of pre-trained models for a specific recipe
by looking at the corresponding ``RESULTS.md``. For instance:

- `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
- `<https://github.com/k2-fsa/icefall/blob/master/egs/aishell/ASR/RESULTS.md>`_
- `<https://github.com/k2-fsa/icefall/blob/master/egs/gigaspeech/ASR/RESULTS.md>`_
- `<https://github.com/k2-fsa/icefall/blob/master/egs/wenetspeech/ASR/RESULTS.md>`_
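As a concrete example, one of the models referenced by the LibriSpeech ``RESULTS.md`` can be fetched with ``git lfs``; a minimal sketch, assuming ``git-lfs`` is installed:

.. code-block:: bash

   # Sketch: download one pre-trained model found via the search link above.
   git lfs install
   git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13
   ls icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp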
65  docs/source/huggingface/spaces.rst  (new file)
@@ -0,0 +1,65 @@
Huggingface spaces
==================

We have integrated the server framework
`sherpa <http://github.com/k2-fsa/sherpa>`_
with `Huggingface spaces <https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition>`_
so that you can try pre-trained models from within your browser
without the need to download or install anything.

All you need is a browser, which works on Windows, macOS, Linux, and even on
your iPad and your phone.

Start your browser and visit the following address:

`<https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition>`_

and you will see a page like the following screenshot:

.. image:: ./pic/hugging-face-sherpa.png
   :alt: screenshot of `<https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition>`_
   :target: https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition

You can:

1. Select a language for recognition. Currently, we provide pre-trained models
   from ``icefall`` for the following languages: ``Chinese``, ``English``, and
   ``Chinese+English``.
2. After selecting the target language, you can select a pre-trained model
   corresponding to the language.
3. Select the decoding method. Currently, it provides ``greedy search``
   and ``modified_beam_search``.
4. If you selected ``modified_beam_search``, you can choose the number of
   active paths during the search.
5. Either upload a file or record your speech for recognition.
6. Click the button ``Submit for recognition``.
7. Wait for a moment and you will get the recognition results.

The following screenshot shows an example when selecting ``Chinese+English``:

.. image:: ./pic/hugging-face-sherpa-3.png
   :alt: screenshot of `<https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition>`_
   :target: https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition


In the bottom part of the page, you can find a table of examples. You can click
one of them and then click ``Submit for recognition``.

.. image:: ./pic/hugging-face-sherpa-2.png
   :alt: screenshot of `<https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition>`_
   :target: https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition

YouTube Video
-------------

We provide the following YouTube video demonstrating how to use
`<https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition>`_.

.. note::

   To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
   the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:

   `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_

.. youtube:: ElN3r9dkKE4
@@ -21,5 +21,7 @@ speech recognition recipes using `k2 <https://github.com/k2-fsa/k2>`_.
    :caption: Contents:

    installation/index
+   model-export/index
    recipes/index
    contributing/index
+   huggingface/index
@@ -474,3 +474,19 @@ The decoding log is:
 **Congratulations!** You have successfully setup the environment and have run the first recipe in ``icefall``.

 Have fun with ``icefall``!
+
+YouTube Video
+-------------
+
+We provide the following YouTube video showing how to install ``icefall``.
+It also shows how to debug various problems that you may encounter while
+using ``icefall``.
+
+.. note::
+
+   To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
+   the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
+
+   `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
+
+.. youtube:: LVmrBD0tLfE
@@ -0,0 +1,21 @@
2022-10-13 19:09:02,233 INFO [pretrained.py:265] {'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 50, 'reset_interval': 200, 'valid_interval': 3000, 'feature_dim': 80, 'subsampling_factor': 4, 'encoder_dim': 512, 'nhead': 8, 'dim_feedforward': 2048, 'num_encoder_layers': 12, 'decoder_dim': 512, 'joiner_dim': 512, 'model_warm_step': 3000, 'env_info': {'k2-version': '1.21', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '4810e00d8738f1a21278b0156a42ff396a2d40ac', 'k2-git-date': 'Fri Oct 7 19:35:03 2022', 'lhotse-version': '1.3.0.dev+missing.version.file', 'torch-version': '1.10.0+cu102', 'torch-cuda-available': False, 'torch-cuda-version': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'onnx-doc-1013', 'icefall-git-sha1': 'c39cba5-dirty', 'icefall-git-date': 'Thu Oct 13 15:17:20 2022', 'icefall-path': '/k2-dev/fangjun/open-source/icefall-master', 'k2-path': '/k2-dev/fangjun/open-source/k2-master/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-fj/fangjun/open-source-2/lhotse-jsonl/lhotse/__init__.py', 'hostname': 'de-74279-k2-test-4-0324160024-65bfd8b584-jjlbn', 'IP address': '10.177.74.203'}, 'checkpoint': './icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/pretrained-iter-1224000-avg-14.pt', 'bpe_model': './icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/bpe.model', 'method': 'greedy_search', 'sound_files': ['./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav'], 'sample_rate': 16000, 'beam_size': 4, 'beam': 4, 'max_contexts': 4, 'max_states': 8, 'context_size': 2, 'max_sym_per_frame': 1, 'simulate_streaming': False, 'decode_chunk_size': 16, 'left_context': 64, 'dynamic_chunk_training': False, 'causal_convolution': False, 'short_chunk_size': 25, 'num_left_chunks': 4, 'blank_id': 0, 'unk_id': 2, 'vocab_size': 500}
2022-10-13 19:09:02,233 INFO [pretrained.py:271] device: cpu
2022-10-13 19:09:02,233 INFO [pretrained.py:273] Creating model
2022-10-13 19:09:02,612 INFO [train.py:458] Disable giga
2022-10-13 19:09:02,623 INFO [pretrained.py:277] Number of model parameters: 78648040
2022-10-13 19:09:02,951 INFO [pretrained.py:285] Constructing Fbank computer
2022-10-13 19:09:02,952 INFO [pretrained.py:295] Reading sound files: ['./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav']
2022-10-13 19:09:02,957 INFO [pretrained.py:301] Decoding started
2022-10-13 19:09:06,700 INFO [pretrained.py:329] Using greedy_search
2022-10-13 19:09:06,912 INFO [pretrained.py:388]
./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav:
AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS

./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav:
GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN

./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav:
YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION

2022-10-13 19:09:06,912 INFO [pretrained.py:390] Decoding Done
135  docs/source/model-export/export-model-state-dict.rst  (new file)
@@ -0,0 +1,135 @@
Export model.state_dict()
=========================

When to use it
--------------

During model training, we save checkpoints periodically to disk.

A checkpoint contains the following information:

- ``model.state_dict()``
- ``optimizer.state_dict()``
- and some other information related to training

When we need to resume the training process from some point, we need a checkpoint.
However, if we want to publish the model for inference, then only
``model.state_dict()`` is needed. In this case, we need to strip all other information
except ``model.state_dict()`` to reduce the file size of the published model.
How to export
-------------

Every recipe contains a file ``export.py`` that you can use to
export ``model.state_dict()`` by taking some checkpoints as inputs.

.. hint::

   Each ``export.py`` contains well-documented usage information.

In the following, we use
`<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless3/export.py>`_
as an example.

.. note::

   The steps for other recipes are almost the same.

.. code-block:: bash

   cd egs/librispeech/ASR

   ./pruned_transducer_stateless3/export.py \
     --exp-dir ./pruned_transducer_stateless3/exp \
     --bpe-model data/lang_bpe_500/bpe.model \
     --epoch 20 \
     --avg 10

The above command will generate a file ``pruned_transducer_stateless3/exp/pretrained.pt``, which
is a dict containing ``{"model": model.state_dict()}`` saved by ``torch.save()``.
|
|
||||||
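
To consume such a file in your own code, load it and copy the weights into an
instance of the model. A minimal sketch (``build_model()`` is a hypothetical
helper standing in for the recipe's model constructor):

.. code-block:: python

   import torch

   model = build_model()  # hypothetical; must return the same architecture

   checkpoint = torch.load("pretrained.pt", map_location="cpu")
   model.load_state_dict(checkpoint["model"])
   model.eval()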

How to use the exported model
-----------------------------

For each recipe, we provide pretrained models hosted on Hugging Face.
You can find links to pretrained models in ``RESULTS.md`` of each dataset.

In the following, we demonstrate how to use the pretrained model from
`<https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13>`_.

.. code-block:: bash

   cd egs/librispeech/ASR

   git lfs install
   git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13

After cloning the repo with ``git lfs``, you will find several files in the folder
``icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp``
that have a prefix ``pretrained-``. Those files contain ``model.state_dict()``
exported by the above ``export.py``.

In each recipe, there is also a file ``pretrained.py``, which can use
``pretrained-xxx.pt`` to decode sound files. The following is an example:

.. code-block:: bash

   cd egs/librispeech/ASR

   ./pruned_transducer_stateless3/pretrained.py \
     --checkpoint ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/pretrained-iter-1224000-avg-14.pt \
     --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/bpe.model \
     --method greedy_search \
     ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav \
     ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav \
     ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav

The above commands show how to use the exported model with ``pretrained.py`` to
decode multiple sound files. Its output is given as follows for reference:

.. literalinclude:: ./code/export-model-state-dict-pretrained-out.txt

Use the exported model to run decode.py
---------------------------------------

When we publish the model, we always note down its WERs on some test
dataset in ``RESULTS.md``. This section describes how to use the
pretrained model to reproduce the WER.

.. code-block:: bash

   cd egs/librispeech/ASR
   git lfs install
   git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13

   cd icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp
   ln -s pretrained-iter-1224000-avg-14.pt epoch-9999.pt
   cd ../..

We create a symlink with name ``epoch-9999.pt`` to ``pretrained-iter-1224000-avg-14.pt``,
so that we can pass ``--epoch 9999 --avg 1`` to ``decode.py`` in the following
command:

.. code-block:: bash

   ./pruned_transducer_stateless3/decode.py \
     --epoch 9999 \
     --avg 1 \
     --exp-dir ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp \
     --lang-dir ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500 \
     --max-duration 600 \
     --decoding-method greedy_search

You will find the decoding results in
``./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/greedy_search``.

.. caution::

   For some recipes, you also need to pass ``--use-averaged-model False``
   to ``decode.py``. The reason is that the exported pretrained model is already
   the averaged one.

.. hint::

   Before running ``decode.py``, we assume that you have already run
   ``prepare.sh`` to prepare the test dataset.
12
docs/source/model-export/export-ncnn.rst
Normal file
@ -0,0 +1,12 @@
Export to ncnn
==============

We support exporting LSTM transducer models to `ncnn <https://github.com/tencent/ncnn>`_.

Please refer to :ref:`export-model-for-ncnn` for details.

We also provide `<https://github.com/k2-fsa/sherpa-ncnn>`_,
which performs speech recognition using ``ncnn`` with exported models.
It has been tested on Linux, macOS, Windows, and Raspberry Pi. The project is
self-contained and can be statically linked to produce a binary containing
everything needed.
69
docs/source/model-export/export-onnx.rst
Normal file
@ -0,0 +1,69 @@
Export to ONNX
==============

In this section, we describe how to export models to ONNX.

.. hint::

   Only non-streaming conformer transducer models are tested.

When to use it
--------------

If you want to use an inference framework that supports ONNX
to run the pretrained model.

How to export
-------------

We use
`<https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless3>`_
as an example in the following.

.. code-block:: bash

   cd egs/librispeech/ASR
   epoch=14
   avg=2

   ./pruned_transducer_stateless3/export.py \
     --exp-dir ./pruned_transducer_stateless3/exp \
     --bpe-model data/lang_bpe_500/bpe.model \
     --epoch $epoch \
     --avg $avg \
     --onnx 1

It will generate the following files inside ``pruned_transducer_stateless3/exp``:

- ``encoder.onnx``
- ``decoder.onnx``
- ``joiner.onnx``
- ``joiner_encoder_proj.onnx``
- ``joiner_decoder_proj.onnx``
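
Each of these files is a standalone ONNX graph, so you can sanity-check it
with `onnxruntime <https://github.com/microsoft/onnxruntime>`_ before wiring
up full decoding. A minimal sketch that loads the exported encoder and prints
its input/output signature:

.. code-block:: python

   import onnxruntime as ort

   session = ort.InferenceSession("pruned_transducer_stateless3/exp/encoder.onnx")

   # Inspect the actual input/output names and shapes of the graph.
   for node in session.get_inputs():
       print(node.name, node.shape)
   for node in session.get_outputs():
       print(node.name, node.shape)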

You can use ``./pruned_transducer_stateless3/onnx_pretrained.py`` to decode
sound files with the generated files:

.. code-block:: bash

   ./pruned_transducer_stateless3/onnx_pretrained.py \
     --bpe-model ./data/lang_bpe_500/bpe.model \
     --encoder-model-filename ./pruned_transducer_stateless3/exp/encoder.onnx \
     --decoder-model-filename ./pruned_transducer_stateless3/exp/decoder.onnx \
     --joiner-model-filename ./pruned_transducer_stateless3/exp/joiner.onnx \
     --joiner-encoder-proj-model-filename ./pruned_transducer_stateless3/exp/joiner_encoder_proj.onnx \
     --joiner-decoder-proj-model-filename ./pruned_transducer_stateless3/exp/joiner_decoder_proj.onnx \
     /path/to/foo.wav \
     /path/to/bar.wav \
     /path/to/baz.wav

How to use the exported model
-----------------------------

We also provide `<https://github.com/k2-fsa/sherpa-onnx>`_,
which performs speech recognition using `onnxruntime <https://github.com/microsoft/onnxruntime>`_
with exported models.
It has been tested on Linux, macOS, and Windows.
58
docs/source/model-export/export-with-torch-jit-script.rst
Normal file
@ -0,0 +1,58 @@
.. _export-model-with-torch-jit-script:

Export model with torch.jit.script()
====================================

In this section, we describe how to export a model via
``torch.jit.script()``.

When to use it
--------------

If we want to use our trained model with torchscript,
we can use ``torch.jit.script()``.

.. hint::

   See :ref:`export-model-with-torch-jit-trace`
   if you want to use ``torch.jit.trace()``.

How to export
-------------

We use
`<https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless3>`_
as an example in the following.

.. code-block:: bash

   cd egs/librispeech/ASR
   epoch=14
   avg=1

   ./pruned_transducer_stateless3/export.py \
     --exp-dir ./pruned_transducer_stateless3/exp \
     --bpe-model data/lang_bpe_500/bpe.model \
     --epoch $epoch \
     --avg $avg \
     --jit 1

It will generate a file ``cpu_jit.pt`` in ``pruned_transducer_stateless3/exp``.

.. caution::

   Don't be confused by ``cpu`` in ``cpu_jit.pt``. We move all parameters
   to CPU before saving it into a ``pt`` file; that's why we use ``cpu``
   in the filename.
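
A scripted model is self-contained: it can be loaded without the Python class
definitions used during training. A minimal sketch of loading the file
generated above (moving it to a GPU is optional):

.. code-block:: python

   import torch

   model = torch.jit.load("pruned_transducer_stateless3/exp/cpu_jit.pt")
   model.eval()

   # Optionally move the model to a GPU for inference.
   if torch.cuda.is_available():
       model = model.to("cuda")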

How to use the exported model
-----------------------------

Please refer to the following pages for usage:

- `<https://k2-fsa.github.io/sherpa/python/streaming_asr/emformer/index.html>`_
- `<https://k2-fsa.github.io/sherpa/python/streaming_asr/conv_emformer/index.html>`_
- `<https://k2-fsa.github.io/sherpa/python/streaming_asr/conformer/index.html>`_
- `<https://k2-fsa.github.io/sherpa/python/offline_asr/conformer/index.html>`_
- `<https://k2-fsa.github.io/sherpa/cpp/offline_asr/gigaspeech.html>`_
- `<https://k2-fsa.github.io/sherpa/cpp/offline_asr/wenetspeech.html>`_
69
docs/source/model-export/export-with-torch-jit-trace.rst
Normal file
@ -0,0 +1,69 @@
.. _export-model-with-torch-jit-trace:

Export model with torch.jit.trace()
===================================

In this section, we describe how to export a model via
``torch.jit.trace()``.

When to use it
--------------

If we want to use our trained model with torchscript,
we can use ``torch.jit.trace()``.
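
Unlike scripting, tracing records the operations executed for one example
input and replays them later, so data-dependent control flow is not captured.
A minimal self-contained sketch of the mechanism on a toy module:

.. code-block:: python

   import torch

   class Toy(torch.nn.Module):
       def forward(self, x: torch.Tensor) -> torch.Tensor:
           return x * 2 + 1

   # Tracing runs the module once and records the executed ops.
   traced = torch.jit.trace(Toy(), torch.rand(2, 3))
   traced.save("toy_jit_trace.pt")

   loaded = torch.jit.load("toy_jit_trace.pt")
   print(loaded(torch.rand(2, 3)))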

.. hint::

   See :ref:`export-model-with-torch-jit-script`
   if you want to use ``torch.jit.script()``.

How to export
-------------

We use
`<https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/lstm_transducer_stateless2>`_
as an example in the following.

.. code-block:: bash

   iter=468000
   avg=16

   cd egs/librispeech/ASR

   ./lstm_transducer_stateless2/export.py \
     --exp-dir ./lstm_transducer_stateless2/exp \
     --bpe-model data/lang_bpe_500/bpe.model \
     --iter $iter \
     --avg $avg \
     --jit-trace 1

It will generate three files inside ``lstm_transducer_stateless2/exp``:

- ``encoder_jit_trace.pt``
- ``decoder_jit_trace.pt``
- ``joiner_jit_trace.pt``

You can use
`<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/lstm_transducer_stateless2/jit_pretrained.py>`_
to decode sound files with the following commands:

.. code-block:: bash

   cd egs/librispeech/ASR
   ./lstm_transducer_stateless2/jit_pretrained.py \
     --bpe-model ./data/lang_bpe_500/bpe.model \
     --encoder-model-filename ./lstm_transducer_stateless2/exp/encoder_jit_trace.pt \
     --decoder-model-filename ./lstm_transducer_stateless2/exp/decoder_jit_trace.pt \
     --joiner-model-filename ./lstm_transducer_stateless2/exp/joiner_jit_trace.pt \
     /path/to/foo.wav \
     /path/to/bar.wav \
     /path/to/baz.wav

How to use the exported models
------------------------------

Please refer to
`<https://k2-fsa.github.io/sherpa/python/streaming_asr/lstm/index.html>`_
for its usage in `sherpa <https://k2-fsa.github.io/sherpa/python/streaming_asr/lstm/index.html>`_.
You can also find pretrained models there.
14
docs/source/model-export/index.rst
Normal file
@ -0,0 +1,14 @@
Model export
============

In this section, we describe various ways to export models.


.. toctree::

   export-model-state-dict
   export-with-torch-jit-trace
   export-with-torch-jit-script
   export-onnx
   export-ncnn
@ -422,7 +422,7 @@ The information of the test sound files is listed below:

.. code-block:: bash

-$ soxi tmp/icefall_asr_aishell_conformer_ctc/test_wavs/*.wav
+$ soxi tmp/icefall_asr_aishell_conformer_ctc/test_waves/*.wav

Input File : 'tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav'
Channels : 1

@ -485,9 +485,9 @@ The command to run CTC decoding is:

--checkpoint ./tmp/icefall_asr_aishell_conformer_ctc/exp/pretrained.pt \
--tokens-file ./tmp/icefall_asr_aishell_conformer_ctc/data/lang_char/tokens.txt \
--method ctc-decoding \
-./tmp/icefall_asr_aishell_conformer_ctc/test_wavs/BAC009S0764W0121.wav \
-./tmp/icefall_asr_aishell_conformer_ctc/test_wavs/BAC009S0764W0122.wav \
-./tmp/icefall_asr_aishell_conformer_ctc/test_wavs/BAC009S0764W0123.wav
+./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav \
+./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0122.wav \
+./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0123.wav

The output is given below:

@ -529,9 +529,9 @@ The command to run HLG decoding is:

--words-file ./tmp/icefall_asr_aishell_conformer_ctc/data/lang_char/words.txt \
--HLG ./tmp/icefall_asr_aishell_conformer_ctc/data/lang_char/HLG.pt \
--method 1best \
-./tmp/icefall_asr_aishell_conformer_ctc/test_wavs/BAC009S0764W0121.wav \
-./tmp/icefall_asr_aishell_conformer_ctc/test_wavs/BAC009S0764W0122.wav \
-./tmp/icefall_asr_aishell_conformer_ctc/test_wavs/BAC009S0764W0123.wav
+./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav \
+./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0122.wav \
+./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0123.wav

The output is given below:

@ -575,9 +575,9 @@ The command to run HLG decoding + attention decoder rescoring is:

--words-file ./tmp/icefall_asr_aishell_conformer_ctc/data/lang_char/words.txt \
--HLG ./tmp/icefall_asr_aishell_conformer_ctc/data/lang_char/HLG.pt \
--method attention-decoder \
-./tmp/icefall_asr_aishell_conformer_ctc/test_wavs/BAC009S0764W0121.wav \
-./tmp/icefall_asr_aishell_conformer_ctc/test_wavs/BAC009S0764W0122.wav \
-./tmp/icefall_asr_aishell_conformer_ctc/test_wavs/BAC009S0764W0123.wav
+./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav \
+./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0122.wav \
+./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0123.wav

The output is below:
@ -402,7 +402,7 @@ The information of the test sound files is listed below:

.. code-block:: bash

-$ soxi tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_wavs/*.wav
+$ soxi tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/*.wav

Input File : 'tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/BAC009S0764W0121.wav'
Channels : 1

@ -461,9 +461,9 @@ The command to run HLG decoding is:

--words-file ./tmp/icefall_asr_aishell_tdnn_lstm_ctc/data/lang_phone/words.txt \
--HLG ./tmp/icefall_asr_aishell_tdnn_lstm_ctc/data/lang_phone/HLG.pt \
--method 1best \
-./tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_wavs/BAC009S0764W0121.wav \
-./tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_wavs/BAC009S0764W0122.wav \
-./tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_wavs/BAC009S0764W0123.wav
+./tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/BAC009S0764W0121.wav \
+./tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/BAC009S0764W0122.wav \
+./tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/BAC009S0764W0123.wav

The output is given below:
@ -70,6 +70,17 @@ To run stage 2 to stage 5, use:

All generated files by ``./prepare.sh``, e.g., features, lexicon, etc.,
are saved in the ``./data`` directory.

+We provide the following YouTube video showing how to run ``./prepare.sh``.
+
+.. note::
+
+   To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
+   the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
+
+   `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
+
+.. youtube:: ofEIoJL-mGM
+
Training
--------

Binary file not shown.
After Width: | Height: | Size: 413 KiB
@ -6,3 +6,4 @@ LibriSpeech

tdnn_lstm_ctc
conformer_ctc
+lstm_pruned_stateless_transducer
@ -0,0 +1,636 @@

LSTM Transducer
===============

.. hint::

   Please scroll down to the bottom of this page to find download links
   for pretrained models if you don't want to train a model from scratch.

This tutorial shows you how to train an LSTM transducer model
with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.

We use pruned RNN-T to compute the loss.

.. note::

   You can find the paper about pruned RNN-T at the following address:

   `<https://arxiv.org/abs/2206.13236>`_

The transducer model consists of 3 parts:

- Encoder, a.k.a., the transcription network. We use an LSTM model.
- Decoder, a.k.a., the prediction network. We use a stateless model consisting of
  ``nn.Embedding`` and ``nn.Conv1d``.
- Joiner, a.k.a., the joint network.

.. caution::

   Contrary to the conventional RNN-T models, we use a stateless decoder.
   That is, it has no recurrent connections.
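
To make this concrete, the following is a minimal sketch of such a stateless
decoder: an embedding layer followed by a 1-D convolution over the last few
predicted tokens. The sizes below are illustrative, not the recipe's actual
configuration:

.. code-block:: python

   import torch

   class StatelessDecoder(torch.nn.Module):
       def __init__(self, vocab_size: int = 500, embed_dim: int = 512,
                    context_size: int = 2):
           super().__init__()
           self.embedding = torch.nn.Embedding(vocab_size, embed_dim)
           # Mixes the embeddings of the last `context_size` tokens;
           # there is no recurrent state to carry around.
           self.conv = torch.nn.Conv1d(embed_dim, embed_dim,
                                       kernel_size=context_size)

       def forward(self, y: torch.Tensor) -> torch.Tensor:
           # y: (N, context_size) token IDs
           emb = self.embedding(y).permute(0, 2, 1)  # (N, embed_dim, context_size)
           return self.conv(emb).squeeze(-1)  # (N, embed_dim)

   decoder = StatelessDecoder()
   out = decoder(torch.tensor([[3, 7]]))  # the last two predicted tokens
   print(out.shape)  # torch.Size([1, 512])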

.. hint::

   Since the encoder model is an LSTM, not Transformer/Conformer, the
   resulting model is suitable for streaming/online ASR.

Which model to use
------------------

Currently, there are two folders about LSTM stateless transducer training:

- ``(1)`` `<https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/lstm_transducer_stateless>`_

  This recipe uses only LibriSpeech during training.

- ``(2)`` `<https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/lstm_transducer_stateless2>`_

  This recipe uses GigaSpeech + LibriSpeech during training.

``(1)`` and ``(2)`` use the same model architecture. The only difference is that ``(2)`` supports
multi-dataset training. Since ``(2)`` uses more data, it has a lower WER than ``(1)``, but it needs
more training time.

We use ``lstm_transducer_stateless2`` as an example below.

.. note::

   You need to download the `GigaSpeech <https://github.com/SpeechColab/GigaSpeech>`_ dataset
   to run ``(2)``. If you have only the ``LibriSpeech`` dataset available, feel free to use ``(1)``.

Data preparation
----------------

.. code-block:: bash

   $ cd egs/librispeech/ASR
   $ ./prepare.sh

   # If you use (1), you can **skip** the following command
   $ ./prepare_giga_speech.sh

The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
All you need to do is to run it.

.. note::

   We encourage you to read ``./prepare.sh``.

The data preparation contains several stages. You can use the following two
options:

- ``--stage``
- ``--stop-stage``

to control which stage(s) should be run. By default, all stages are executed.

For example,

.. code-block:: bash

   $ cd egs/librispeech/ASR
   $ ./prepare.sh --stage 0 --stop-stage 0

means to run only stage 0.

To run stage 2 to stage 5, use:

.. code-block:: bash

   $ ./prepare.sh --stage 2 --stop-stage 5

.. hint::

   If you have pre-downloaded the `LibriSpeech <https://www.openslr.org/12>`_
   dataset and the `musan <http://www.openslr.org/17/>`_ dataset, say,
   they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
   the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
   ``./prepare.sh`` won't re-download them.

.. note::

   All generated files by ``./prepare.sh``, e.g., features, lexicon, etc.,
   are saved in the ``./data`` directory.

We provide the following YouTube video showing how to run ``./prepare.sh``.

.. note::

   To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
   the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:

   `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_

.. youtube:: ofEIoJL-mGM

Training
--------

Configurable options
~~~~~~~~~~~~~~~~~~~~

.. code-block:: bash

   $ cd egs/librispeech/ASR
   $ ./lstm_transducer_stateless2/train.py --help

shows you the training options that can be passed from the commandline.
The following options are used quite often:

- ``--full-libri``

  If it's True, the training part uses all the training data, i.e.,
  960 hours. Otherwise, the training part uses only the subset
  ``train-clean-100``, which has 100 hours of training data.

  .. CAUTION::

     The training set is perturbed by speed with two factors: 0.9 and 1.1.
     If ``--full-libri`` is True, each epoch actually processes
     ``3x960 == 2880`` hours of data.

- ``--num-epochs``

  It is the number of epochs to train. For instance,
  ``./lstm_transducer_stateless2/train.py --num-epochs 30`` trains for 30 epochs
  and generates ``epoch-1.pt``, ``epoch-2.pt``, ..., ``epoch-30.pt``
  in the folder ``./lstm_transducer_stateless2/exp``.

- ``--start-epoch``

  It's used to resume training.
  ``./lstm_transducer_stateless2/train.py --start-epoch 10`` loads the
  checkpoint ``./lstm_transducer_stateless2/exp/epoch-9.pt`` and starts
  training from epoch 10, based on the state from epoch 9.

- ``--world-size``

  It is used for multi-GPU single-machine DDP training.

  - (a) If it is 1, then no DDP training is used.

  - (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.

  The following shows some use cases with it.

  **Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
  GPU 2 for training. You can do the following:

  .. code-block:: bash

     $ cd egs/librispeech/ASR
     $ export CUDA_VISIBLE_DEVICES="0,2"
     $ ./lstm_transducer_stateless2/train.py --world-size 2

  **Use case 2**: You have 4 GPUs and you want to use all of them
  for training. You can do the following:

  .. code-block:: bash

     $ cd egs/librispeech/ASR
     $ ./lstm_transducer_stateless2/train.py --world-size 4

  **Use case 3**: You have 4 GPUs but you only want to use GPU 3
  for training. You can do the following:

  .. code-block:: bash

     $ cd egs/librispeech/ASR
     $ export CUDA_VISIBLE_DEVICES="3"
     $ ./lstm_transducer_stateless2/train.py --world-size 1

  .. caution::

     Only multi-GPU single-machine DDP training is implemented at present.
     Multi-GPU multi-machine DDP training will be added later.

- ``--max-duration``

  It specifies the number of seconds over all utterances in a
  batch, before **padding**.
  If you encounter CUDA OOM, please reduce it.

  .. HINT::

     Due to padding, the number of seconds of all utterances in a
     batch will usually be larger than ``--max-duration``.

     A larger value for ``--max-duration`` may cause OOM during training,
     while a smaller value may increase the training time. You have to
     tune it.

- ``--giga-prob``

  The probability to select a batch from the ``GigaSpeech`` dataset.
  Note: It is available only for ``(2)``. A sketch of the idea is shown
  right after this list.
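
Conceptually, multi-dataset training with ``--giga-prob`` boils down to a
biased coin flip for every batch. A minimal sketch of this mixing strategy
(the iterators here are hypothetical stand-ins for the two dataloaders):

.. code-block:: python

   import random

   def mixed_batches(libri_iter, giga_iter, giga_prob: float = 0.9):
       # Yield a batch from GigaSpeech with probability `giga_prob`,
       # otherwise yield one from LibriSpeech.
       while True:
           if random.random() < giga_prob:
               yield next(giga_iter)
           else:
               yield next(libri_iter)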

Pre-configured options
~~~~~~~~~~~~~~~~~~~~~~

There are some training options, e.g., weight decay,
number of warmup steps, results dir, etc.,
that are not passed from the commandline.
They are pre-configured by the function ``get_params()`` in
`lstm_transducer_stateless2/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/lstm_transducer_stateless2/train.py>`_

You don't need to change these pre-configured parameters. If you really need to change
them, please modify ``./lstm_transducer_stateless2/train.py`` directly.

Training logs
~~~~~~~~~~~~~

Training logs and checkpoints are saved in ``lstm_transducer_stateless2/exp``.
You will find the following files in that directory:

- ``epoch-1.pt``, ``epoch-2.pt``, ...

  These are checkpoint files saved at the end of each epoch, containing model
  ``state_dict`` and optimizer ``state_dict``.
  To resume training from some checkpoint, say ``epoch-10.pt``, you can use:

  .. code-block:: bash

     $ ./lstm_transducer_stateless2/train.py --start-epoch 11

- ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ...

  These are checkpoint files saved every ``--save-every-n`` batches,
  containing model ``state_dict`` and optimizer ``state_dict``.
  To resume training from some checkpoint, say ``checkpoint-436000``, you can use:

  .. code-block:: bash

     $ ./lstm_transducer_stateless2/train.py --start-batch 436000

- ``tensorboard/``

  This folder contains TensorBoard logs. Training loss, validation loss, learning
  rate, etc., are recorded in these logs. You can visualize them by:

  .. code-block:: bash

     $ cd lstm_transducer_stateless2/exp/tensorboard
     $ tensorboard dev upload --logdir . --description "LSTM transducer training for LibriSpeech with icefall"

  It will print something like below:

  .. code-block::

     TensorFlow installation not found - running with reduced feature set.
     Upload started and will continue reading any new data as it's added to the logdir.

     To stop uploading, press Ctrl-C.

     New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/cj2vtPiwQHKN9Q1tx6PTpg/

     [2022-09-20T15:50:50] Started scanning logdir.
     Uploading 4468 scalars...
     [2022-09-20T15:53:02] Total uploaded: 210171 scalars, 0 tensors, 0 binary objects
     Listening for new data in logdir...

  Note there is a URL in the above output. Click it and you will see
  the following screenshot:

  .. figure:: images/librispeech-lstm-transducer-tensorboard-log.png
     :width: 600
     :alt: TensorBoard screenshot
     :align: center
     :target: https://tensorboard.dev/experiment/lzGnETjwRxC3yghNMd4kPw/

     TensorBoard screenshot.

  .. hint::

     If you don't have access to Google, you can use the following command
     to view the tensorboard log locally:

     .. code-block:: bash

        cd lstm_transducer_stateless2/exp/tensorboard
        tensorboard --logdir . --port 6008

     It will print the following message:

     .. code-block::

        Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
        TensorBoard 2.8.0 at http://localhost:6008/ (Press CTRL+C to quit)

     Now start your browser and go to `<http://localhost:6008>`_ to view the tensorboard
     logs.

- ``log/log-train-xxxx``

  It is the detailed training log in text format, same as the one
  you saw printed to the console during training.

Usage example
~~~~~~~~~~~~~

You can use the following command to start the training using 8 GPUs:

.. code-block:: bash

   export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
   ./lstm_transducer_stateless2/train.py \
     --world-size 8 \
     --num-epochs 35 \
     --start-epoch 1 \
     --full-libri 1 \
     --exp-dir lstm_transducer_stateless2/exp \
     --max-duration 500 \
     --use-fp16 0 \
     --lr-epochs 10 \
     --num-workers 2 \
     --giga-prob 0.9

Decoding
--------

The decoding part uses checkpoints saved by the training part, so you have
to run the training part first.

.. hint::

   There are two kinds of checkpoints:

   - (1) ``epoch-1.pt``, ``epoch-2.pt``, ..., which are saved at the end
     of each epoch. You can pass ``--epoch`` to
     ``lstm_transducer_stateless2/decode.py`` to use them.

   - (2) ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ..., which are saved
     every ``--save-every-n`` batches. You can pass ``--iter`` to
     ``lstm_transducer_stateless2/decode.py`` to use them.

   We suggest that you try both types of checkpoints and choose the one
   that produces the lowest WERs.
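
The ``--avg`` option used below averages the parameters of several adjacent
checkpoints before decoding, which usually lowers the WER. A minimal sketch of
plain state-dict averaging (the recipe's actual averaging code lives in
``icefall`` and differs in details; this version assumes floating-point
tensors only):

.. code-block:: python

   import torch

   def average_checkpoints(filenames):
       # Load the first checkpoint and accumulate the rest into it.
       avg = torch.load(filenames[0], map_location="cpu")["model"]
       for f in filenames[1:]:
           state = torch.load(f, map_location="cpu")["model"]
           for k in avg:
               avg[k] = avg[k] + state[k]
       for k in avg:
           avg[k] = avg[k] / len(filenames)
       return avg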

.. code-block:: bash

   $ cd egs/librispeech/ASR
   $ ./lstm_transducer_stateless2/decode.py --help

shows the options for decoding.

The following shows two examples:

.. code-block:: bash

   for m in greedy_search fast_beam_search modified_beam_search; do
     for epoch in 17; do
       for avg in 1 2; do
         ./lstm_transducer_stateless2/decode.py \
           --epoch $epoch \
           --avg $avg \
           --exp-dir lstm_transducer_stateless2/exp \
           --max-duration 600 \
           --num-encoder-layers 12 \
           --rnn-hidden-size 1024 \
           --decoding-method $m \
           --use-averaged-model True \
           --beam 4 \
           --max-contexts 4 \
           --max-states 8 \
           --beam-size 4
       done
     done
   done

.. code-block:: bash

   for m in greedy_search fast_beam_search modified_beam_search; do
     for iter in 474000; do
       for avg in 8 10 12 14 16 18; do
         ./lstm_transducer_stateless2/decode.py \
           --iter $iter \
           --avg $avg \
           --exp-dir lstm_transducer_stateless2/exp \
           --max-duration 600 \
           --num-encoder-layers 12 \
           --rnn-hidden-size 1024 \
           --decoding-method $m \
           --use-averaged-model True \
           --beam 4 \
           --max-contexts 4 \
           --max-states 8 \
           --beam-size 4
       done
     done
   done

Export models
-------------

`lstm_transducer_stateless2/export.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/lstm_transducer_stateless2/export.py>`_ supports exporting checkpoints from ``lstm_transducer_stateless2/exp`` in the following ways.

Export ``model.state_dict()``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Checkpoints saved by ``lstm_transducer_stateless2/train.py`` also include
``optimizer.state_dict()``. It is useful for resuming training. But after training,
we are interested only in ``model.state_dict()``. You can use the following
command to extract ``model.state_dict()``.

.. code-block:: bash

   # Assume that --iter 468000 --avg 16 produces the smallest WER
   # (You can get such information after running ./lstm_transducer_stateless2/decode.py)

   iter=468000
   avg=16

   ./lstm_transducer_stateless2/export.py \
     --exp-dir ./lstm_transducer_stateless2/exp \
     --bpe-model data/lang_bpe_500/bpe.model \
     --iter $iter \
     --avg $avg

It will generate a file ``./lstm_transducer_stateless2/exp/pretrained.pt``.

.. hint::

   To use the generated ``pretrained.pt`` for ``lstm_transducer_stateless2/decode.py``,
   you can run:

   .. code-block:: bash

      cd lstm_transducer_stateless2/exp
      ln -s pretrained.pt epoch-9999.pt

   And then pass ``--epoch 9999 --avg 1 --use-averaged-model 0`` to
   ``./lstm_transducer_stateless2/decode.py``.

To use the exported model with ``./lstm_transducer_stateless2/pretrained.py``, you
can run:

.. code-block:: bash

   ./lstm_transducer_stateless2/pretrained.py \
     --checkpoint ./lstm_transducer_stateless2/exp/pretrained.pt \
     --bpe-model ./data/lang_bpe_500/bpe.model \
     --method greedy_search \
     /path/to/foo.wav \
     /path/to/bar.wav

Export model using ``torch.jit.trace()``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: bash

   iter=468000
   avg=16

   ./lstm_transducer_stateless2/export.py \
     --exp-dir ./lstm_transducer_stateless2/exp \
     --bpe-model data/lang_bpe_500/bpe.model \
     --iter $iter \
     --avg $avg \
     --jit-trace 1

It will generate 3 files:

- ``./lstm_transducer_stateless2/exp/encoder_jit_trace.pt``
- ``./lstm_transducer_stateless2/exp/decoder_jit_trace.pt``
- ``./lstm_transducer_stateless2/exp/joiner_jit_trace.pt``

To use the generated files with ``./lstm_transducer_stateless2/jit_pretrained.py``:

.. code-block:: bash

   ./lstm_transducer_stateless2/jit_pretrained.py \
     --bpe-model ./data/lang_bpe_500/bpe.model \
     --encoder-model-filename ./lstm_transducer_stateless2/exp/encoder_jit_trace.pt \
     --decoder-model-filename ./lstm_transducer_stateless2/exp/decoder_jit_trace.pt \
     --joiner-model-filename ./lstm_transducer_stateless2/exp/joiner_jit_trace.pt \
     /path/to/foo.wav \
     /path/to/bar.wav

.. hint::

   Please see `<https://k2-fsa.github.io/sherpa/python/streaming_asr/lstm/english/server.html>`_
   for how to use the exported models in ``sherpa``.

.. _export-model-for-ncnn:

Export model for ncnn
~~~~~~~~~~~~~~~~~~~~~

We support exporting pretrained LSTM transducer models to
`ncnn <https://github.com/tencent/ncnn>`_ using
`pnnx <https://github.com/Tencent/ncnn/tree/master/tools/pnnx>`_.

First, let us install a modified version of ``ncnn``:

.. code-block:: bash

   git clone https://github.com/csukuangfj/ncnn
   cd ncnn
   git submodule update --recursive --init
   python3 setup.py bdist_wheel
   ls -lh dist/
   pip install ./dist/*.whl

   # now build pnnx
   cd tools/pnnx
   mkdir build
   cd build
   cmake ..
   make -j4
   export PATH=$PWD/src:$PATH

   ./src/pnnx

.. note::

   We assume that you have added the path to the binary ``pnnx`` to the
   environment variable ``PATH``.

Second, let us export the model using ``torch.jit.trace()`` that is suitable
for ``pnnx``:

.. code-block:: bash

   iter=468000
   avg=16

   ./lstm_transducer_stateless2/export.py \
     --exp-dir ./lstm_transducer_stateless2/exp \
     --bpe-model data/lang_bpe_500/bpe.model \
     --iter $iter \
     --avg $avg \
     --pnnx 1

It will generate 3 files:

- ``./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.pt``
- ``./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.pt``
- ``./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.pt``

Third, convert the torchscript models to ``ncnn`` format:

.. code-block::

   pnnx ./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.pt
   pnnx ./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.pt
   pnnx ./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.pt

It will generate the following files:

- ``./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.param``
- ``./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.bin``
- ``./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.param``
- ``./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.bin``
- ``./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.param``
- ``./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.bin``

To use the above generated files, run:

.. code-block:: bash

   ./lstm_transducer_stateless2/ncnn-decode.py \
     --bpe-model-filename ./data/lang_bpe_500/bpe.model \
     --encoder-param-filename ./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.param \
     --encoder-bin-filename ./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.bin \
     --decoder-param-filename ./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.param \
     --decoder-bin-filename ./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.bin \
     --joiner-param-filename ./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.param \
     --joiner-bin-filename ./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.bin \
     /path/to/foo.wav

.. code-block:: bash

   ./lstm_transducer_stateless2/streaming-ncnn-decode.py \
     --bpe-model-filename ./data/lang_bpe_500/bpe.model \
     --encoder-param-filename ./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.param \
     --encoder-bin-filename ./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.bin \
     --decoder-param-filename ./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.param \
     --decoder-bin-filename ./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.bin \
     --joiner-param-filename ./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.param \
     --joiner-bin-filename ./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.bin \
     /path/to/foo.wav

To use the above generated files in C++, please see
`<https://github.com/k2-fsa/sherpa-ncnn>`_

It is able to generate a statically linked executable that can be run on Linux, Windows,
macOS, Raspberry Pi, etc., without external dependencies.

Download pretrained models
--------------------------

If you don't want to train from scratch, you can download the pretrained models
by visiting the following links:

- `<https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03>`_

- `<https://huggingface.co/Zengwei/icefall-asr-librispeech-lstm-transducer-stateless-2022-08-18>`_

See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
for the details of the above pretrained models.

You can find more usages of the pretrained models in
`<https://k2-fsa.github.io/sherpa/python/streaming_asr/lstm/index.html>`_
@ -45,6 +45,16 @@ To run stage 2 to stage 5, use:

$ ./prepare.sh --stage 2 --stop-stage 5

+We provide the following YouTube video showing how to run ``./prepare.sh``.
+
+.. note::
+
+   To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
+   the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
+
+   `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
+
+.. youtube:: ofEIoJL-mGM
+
Training
--------
@ -29,7 +29,7 @@ import os
from pathlib import Path

import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached

from icefall.utils import get_executor

@ -52,19 +52,35 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
        "dev",
        "test",
    )
+    prefix = "aidatatang"
+    suffix = "jsonl.gz"
    manifests = read_manifests_if_cached(
-        dataset_parts=dataset_parts, output_dir=src_dir
+        dataset_parts=dataset_parts,
+        output_dir=src_dir,
+        prefix=prefix,
+        suffix=suffix,
    )
    assert manifests is not None

+    assert len(manifests) == len(dataset_parts), (
+        len(manifests),
+        len(dataset_parts),
+        list(manifests.keys()),
+        dataset_parts,
+    )
+
    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
-            if (output_dir / f"cuts_{partition}.json.gz").is_file():
+            if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
                logging.info(f"{partition} already exists - skipping.")
                continue
            logging.info(f"Processing {partition}")
+
+            for sup in m["supervisions"]:
+                sup.custom = {"origin": "aidatatang_200zh"}
+
            cut_set = CutSet.from_manifests(
                recordings=m["recordings"],
                supervisions=m["supervisions"],

@ -77,13 +93,14 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
            )
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
-                storage_path=f"{output_dir}/feats_{partition}",
+                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
            )
-            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
+
+            cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")


def get_args():
@ -25,19 +25,19 @@ for usage.
"""


-from lhotse import load_manifest
+from lhotse import load_manifest_lazy


def main():
    paths = [
-        "./data/fbank/cuts_train.json.gz",
-        "./data/fbank/cuts_dev.json.gz",
-        "./data/fbank/cuts_test.json.gz",
+        "./data/fbank/aidatatang_cuts_train.jsonl.gz",
+        "./data/fbank/aidatatang_cuts_dev.jsonl.gz",
+        "./data/fbank/aidatatang_cuts_test.jsonl.gz",
    ]

    for path in paths:
        print(f"Starting display the statistics for {path}")
-        cuts = load_manifest(path)
+        cuts = load_manifest_lazy(path)
        cuts.describe()

@ -45,7 +45,7 @@ if __name__ == "__main__":
    main()

"""
-Starting display the statistics for ./data/fbank/cuts_train.json.gz
+Starting display the statistics for ./data/fbank/aidatatang_cuts_train.jsonl.gz
Cuts count: 494715
Total duration (hours): 422.6
Speech duration (hours): 422.6 (100.0%)

@ -61,7 +61,7 @@ min 1.0
99.5% 8.0
99.9% 9.5
max 18.1
-Starting display the statistics for ./data/fbank/cuts_dev.json.gz
+Starting display the statistics for ./data/fbank/aidatatang_cuts_dev.jsonl.gz
Cuts count: 24216
Total duration (hours): 20.2
Speech duration (hours): 20.2 (100.0%)

@ -77,7 +77,7 @@ min 1.2
99.5% 7.3
99.9% 8.8
max 11.3
-Starting display the statistics for ./data/fbank/cuts_test.json.gz
+Starting display the statistics for ./data/fbank/aidatatang_cuts_test.jsonl.gz
Cuts count: 48144
Total duration (hours): 40.2
Speech duration (hours): 40.2 (100.0%)
@ -50,28 +50,19 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Process aidatatang_200zh"
-  if [ ! -f data/fbank/aidatatang_200zh/.fbank.done ]; then
-    mkdir -p data/fbank/aidatatang_200zh
-    lhotse prepare aidatatang-200zh $dl_dir data/manifests/aidatatang_200zh
-    touch data/fbank/aidatatang_200zh/.fbank.done
+  log "Stage 2: Prepare musan manifest"
+  # We assume that you have downloaded the musan corpus
+  # to data/musan
+  if [ ! -f data/manifests/.manifests.done ]; then
+    log "It may take 6 minutes"
+    mkdir -p data/manifests/
+    lhotse prepare musan $dl_dir/musan data/manifests/
+    touch data/manifests/.manifests.done
  fi
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-  log "Stage 3: Prepare musan manifest"
-  # We assume that you have downloaded the musan corpus
-  # to data/musan
-  if [ ! -f data/manifests/.musan_manifests.done ]; then
-    log "It may take 6 minutes"
-    mkdir -p data/manifests
-    lhotse prepare musan $dl_dir/musan data/manifests
-    touch data/manifests/.musan_manifests.done
-  fi
-fi
-
-if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-  log "Stage 4: Compute fbank for musan"
+  log "Stage 3: Compute fbank for musan"
  if [ ! -f data/fbank/.msuan.done ]; then
    mkdir -p data/fbank
    ./local/compute_fbank_musan.py

@ -79,8 +70,8 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  fi
fi

-if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 5: Compute fbank for aidatatang_200zh"
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+  log "Stage 4: Compute fbank for aidatatang_200zh"
  if [ ! -f data/fbank/.aidatatang_200zh.done ]; then
    mkdir -p data/fbank
    ./local/compute_fbank_aidatatang_200zh.py

@ -88,27 +79,33 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  fi
fi

-if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "Stage 6: Prepare char based lang"
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+  log "Stage 5: Prepare char based lang"
  lang_char_dir=data/lang_char
  mkdir -p $lang_char_dir

  # Prepare text.
-  grep "\"text\":" data/manifests/aidatatang_200zh/supervisions_train.json \
-    | sed -e 's/["text:\t ]*//g' | sed 's/,//g' \
-    | ./local/text2token.py -t "char" > $lang_char_dir/text
+  # Note: in Linux, you can install jq with the following command:
+  # 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
+  # 2. chmod +x ./jq
+  # 3. cp jq /usr/bin
+  if [ ! -f $lang_char_dir/text ]; then
+    gunzip -c data/manifests/aidatatang_200zh/aidatatang_supervisions_train.jsonl.gz \
+      | jq '.text' | sed -e 's/["text:\t ]*//g' | sed 's/"//g' \
+      | ./local/text2token.py -t "char" > $lang_char_dir/text
+  fi
  # Prepare words.txt
-  grep "\"text\":" data/manifests/aidatatang_200zh/supervisions_train.json \
-    | sed -e 's/["text:\t]*//g' | sed 's/,//g' \
-    | ./local/text2token.py -t "char" > $lang_char_dir/text_words
+  if [ ! -f $lang_char_dir/text_words ]; then
+    gunzip -c data/manifests/aidatatang_200zh/aidatatang_supervisions_train.jsonl.gz \
+      | jq '.text' | sed -e 's/["text:\t]*//g' | sed 's/"//g' \
+      | ./local/text2token.py -t "char" > $lang_char_dir/text_words
+  fi

  cat $lang_char_dir/text_words | sed 's/ /\n/g' | sort -u | sed '/^$/d' \
    | uniq > $lang_char_dir/words_no_ids.txt

  if [ ! -f $lang_char_dir/words.txt ]; then
    ./local/prepare_words.py \
-      --input-file $lang_char_dir/words_no_ids.txt
+      --input-file $lang_char_dir/words_no_ids.txt \
      --output-file $lang_char_dir/words.txt
  fi

@ -116,3 +113,4 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  ./local/prepare_char.py
  fi
fi
+
|||||||
@ -28,10 +28,10 @@ from lhotse import (
|
|||||||
Fbank,
|
Fbank,
|
||||||
FbankConfig,
|
FbankConfig,
|
||||||
load_manifest,
|
load_manifest,
|
||||||
|
load_manifest_lazy,
|
||||||
set_caching_enabled,
|
set_caching_enabled,
|
||||||
)
|
)
|
||||||
from lhotse.dataset import (
|
from lhotse.dataset import (
|
||||||
BucketingSampler,
|
|
||||||
CutConcatenate,
|
CutConcatenate,
|
||||||
CutMix,
|
CutMix,
|
||||||
DynamicBucketingSampler,
|
DynamicBucketingSampler,
|
||||||
@ -206,7 +206,7 @@ class Aidatatang_200zhAsrDataModule:
|
|||||||
"""
|
"""
|
||||||
logging.info("About to get Musan cuts")
|
logging.info("About to get Musan cuts")
|
||||||
cuts_musan = load_manifest(
|
cuts_musan = load_manifest(
|
||||||
self.args.manifest_dir / "cuts_musan.json.gz"
|
self.args.manifest_dir / "musan_cuts.jsonl.gz"
|
||||||
)
|
)
|
||||||
|
|
||||||
transforms = []
|
transforms = []
|
||||||
@ -290,13 +290,12 @@ class Aidatatang_200zhAsrDataModule:
|
|||||||
)
|
)
|
||||||
|
|
||||||
if self.args.bucketing_sampler:
|
if self.args.bucketing_sampler:
|
||||||
logging.info("Using BucketingSampler.")
|
logging.info("Using DynamicBucketingSampler.")
|
||||||
train_sampler = BucketingSampler(
|
train_sampler = DynamicBucketingSampler(
|
||||||
cuts_train,
|
cuts_train,
|
||||||
max_duration=self.args.max_duration,
|
max_duration=self.args.max_duration,
|
||||||
shuffle=self.args.shuffle,
|
shuffle=self.args.shuffle,
|
||||||
num_buckets=self.args.num_buckets,
|
num_buckets=self.args.num_buckets,
|
||||||
bucket_method="equal_duration",
|
|
||||||
drop_last=True,
|
drop_last=True,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
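The hunk above swaps the eager `BucketingSampler` for `DynamicBucketingSampler`, which estimates bucket boundaries on the fly and therefore also works with lazily opened manifests. A minimal sketch of the new sampler setup, assuming the lhotse API as of this change; the manifest path and numeric values are illustrative:

```python
from lhotse import load_manifest_lazy
from lhotse.dataset import DynamicBucketingSampler

# Illustrative values; the recipe reads these from command-line arguments.
cuts_train = load_manifest_lazy("data/fbank/aidatatang_cuts_train.jsonl.gz")
train_sampler = DynamicBucketingSampler(
    cuts_train,
    max_duration=200.0,  # total seconds of audio per batch
    shuffle=True,
    num_buckets=30,
    drop_last=True,      # keep batch sizes uniform across DDP workers
)
```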
@ -402,14 +401,20 @@ class Aidatatang_200zhAsrDataModule:
|
|||||||
@lru_cache()
|
@lru_cache()
|
||||||
def train_cuts(self) -> CutSet:
|
def train_cuts(self) -> CutSet:
|
||||||
logging.info("About to get train cuts")
|
logging.info("About to get train cuts")
|
||||||
return load_manifest(self.args.manifest_dir / "cuts_train.json.gz")
|
return load_manifest_lazy(
|
||||||
|
self.args.manifest_dir / "aidatatang_cuts_train.jsonl.gz"
|
||||||
|
)
|
||||||
|
|
||||||
@lru_cache()
|
@lru_cache()
|
||||||
def valid_cuts(self) -> CutSet:
|
def valid_cuts(self) -> CutSet:
|
||||||
logging.info("About to get dev cuts")
|
logging.info("About to get dev cuts")
|
||||||
return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz")
|
return load_manifest_lazy(
|
||||||
|
self.args.manifest_dir / "aidatatang_cuts_dev.jsonl.gz"
|
||||||
|
)
|
||||||
|
|
||||||
@lru_cache()
|
@lru_cache()
|
||||||
def test_cuts(self) -> List[CutSet]:
|
def test_cuts(self) -> List[CutSet]:
|
||||||
logging.info("About to get test cuts")
|
logging.info("About to get test cuts")
|
||||||
return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
|
return load_manifest_lazy(
|
||||||
|
self.args.manifest_dir / "aidatatang_cuts_test.jsonl.gz"
|
||||||
|
)
|
||||||
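For context, `load_manifest_lazy` opens the `.jsonl.gz` manifest as a stream instead of materializing every cut in memory. A hedged illustration; the path is one of the manifests produced by this recipe:

```python
from lhotse import load_manifest_lazy

# Each cut is parsed from the gzipped JSONL stream on demand, so even the
# speed-perturbed (3x) training manifest stays cheap to open.
cuts = load_manifest_lazy("data/fbank/aidatatang_cuts_dev.jsonl.gz")
first = next(iter(cuts))
print(first.id, first.duration)
```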
|
|||||||
@ -367,6 +367,7 @@ def decode_dataset(
|
|||||||
for batch_idx, batch in enumerate(dl):
|
for batch_idx, batch in enumerate(dl):
|
||||||
texts = batch["supervisions"]["text"]
|
texts = batch["supervisions"]["text"]
|
||||||
texts = [list(str(text).replace(" ", "")) for text in texts]
|
texts = [list(str(text).replace(" ", "")) for text in texts]
|
||||||
|
cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
|
||||||
|
|
||||||
hyps_dict = decode_one_batch(
|
hyps_dict = decode_one_batch(
|
||||||
params=params,
|
params=params,
|
||||||
@ -379,8 +380,8 @@ def decode_dataset(
|
|||||||
for name, hyps in hyps_dict.items():
|
for name, hyps in hyps_dict.items():
|
||||||
this_batch = []
|
this_batch = []
|
||||||
assert len(hyps) == len(texts)
|
assert len(hyps) == len(texts)
|
||||||
for hyp_words, ref_text in zip(hyps, texts):
|
for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
|
||||||
this_batch.append((ref_text, hyp_words))
|
this_batch.append((cut_id, ref_text, hyp_words))
|
||||||
|
|
||||||
results[name].extend(this_batch)
|
results[name].extend(this_batch)
|
||||||
|
|
||||||
@ -405,6 +406,7 @@ def save_results(
|
|||||||
recog_path = (
|
recog_path = (
|
||||||
params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
|
params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
|
||||||
)
|
)
|
||||||
|
results = sorted(results)
|
||||||
store_transcripts(filename=recog_path, texts=results)
|
store_transcripts(filename=recog_path, texts=results)
|
||||||
logging.info(f"The transcripts are stored in {recog_path}")
|
logging.info(f"The transcripts are stored in {recog_path}")
|
||||||
|
|
||||||
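Taken together, these hunks change each result entry from a `(ref, hyp)` pair to a `(cut_id, ref, hyp)` triple and sort by cut ID before writing, so the `recogs-*.txt` files are deterministic and traceable back to individual utterances. A small sketch with made-up IDs and texts, assuming the post-change `store_transcripts` accepts such triples:

```python
from icefall.utils import store_transcripts

# Made-up cut IDs and transcripts, purely for illustration.
results = [
    ("cut-0002", ["hello", "world"], ["hello", "word"]),
    ("cut-0001", ["good", "morning"], ["good", "morning"]),
]
results = sorted(results)  # order by cut ID for reproducible output files
store_transcripts(filename="recogs-test-greedy_search.txt", texts=results)
```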
@ -520,61 +522,14 @@ def main():
|
|||||||
num_param = sum([p.numel() for p in model.parameters()])
|
num_param = sum([p.numel() for p in model.parameters()])
|
||||||
logging.info(f"Number of model parameters: {num_param}")
|
logging.info(f"Number of model parameters: {num_param}")
|
||||||
|
|
||||||
# Note: Please use "pip install webdataset==0.1.103"
|
# we need cut ids to display recognition results.
|
||||||
# for installing the webdataset.
|
args.return_cuts = True
|
||||||
import glob
|
|
||||||
import os
|
|
||||||
|
|
||||||
from lhotse import CutSet
|
|
||||||
from lhotse.dataset.webdataset import export_to_webdataset
|
|
||||||
|
|
||||||
aidatatang_200zh = Aidatatang_200zhAsrDataModule(args)
|
aidatatang_200zh = Aidatatang_200zhAsrDataModule(args)
|
||||||
|
|
||||||
dev = "dev"
|
dev_cuts = aidatatang_200zh.valid_cuts()
|
||||||
test = "test"
|
test_cuts = aidatatang_200zh.test_cuts()
|
||||||
|
dev_dl = aidatatang_200zh.valid_dataloaders(dev_cuts)
|
||||||
if not os.path.exists(f"{dev}/shared-0.tar"):
|
test_dl = aidatatang_200zh.test_dataloaders(test_cuts)
|
||||||
os.makedirs(dev)
|
|
||||||
dev_cuts = aidatatang_200zh.valid_cuts()
|
|
||||||
export_to_webdataset(
|
|
||||||
dev_cuts,
|
|
||||||
output_path=f"{dev}/shared-%d.tar",
|
|
||||||
shard_size=300,
|
|
||||||
)
|
|
||||||
|
|
||||||
if not os.path.exists(f"{test}/shared-0.tar"):
|
|
||||||
os.makedirs(test)
|
|
||||||
test_cuts = aidatatang_200zh.test_cuts()
|
|
||||||
export_to_webdataset(
|
|
||||||
test_cuts,
|
|
||||||
output_path=f"{test}/shared-%d.tar",
|
|
||||||
shard_size=300,
|
|
||||||
)
|
|
||||||
|
|
||||||
dev_shards = [
|
|
||||||
str(path)
|
|
||||||
for path in sorted(glob.glob(os.path.join(dev, "shared-*.tar")))
|
|
||||||
]
|
|
||||||
cuts_dev_webdataset = CutSet.from_webdataset(
|
|
||||||
dev_shards,
|
|
||||||
split_by_worker=True,
|
|
||||||
split_by_node=True,
|
|
||||||
shuffle_shards=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
test_shards = [
|
|
||||||
str(path)
|
|
||||||
for path in sorted(glob.glob(os.path.join(test, "shared-*.tar")))
|
|
||||||
]
|
|
||||||
cuts_test_webdataset = CutSet.from_webdataset(
|
|
||||||
test_shards,
|
|
||||||
split_by_worker=True,
|
|
||||||
split_by_node=True,
|
|
||||||
shuffle_shards=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
dev_dl = aidatatang_200zh.valid_dataloaders(cuts_dev_webdataset)
|
|
||||||
test_dl = aidatatang_200zh.test_dataloaders(cuts_test_webdataset)
|
|
||||||
|
|
||||||
test_sets = ["dev", "test"]
|
test_sets = ["dev", "test"]
|
||||||
test_dl = [dev_dl, test_dl]
|
test_dl = [dev_dl, test_dl]
|
||||||
|
|||||||
@ -114,8 +114,6 @@ def main():
|
|||||||
args = get_parser().parse_args()
|
args = get_parser().parse_args()
|
||||||
args.exp_dir = Path(args.exp_dir)
|
args.exp_dir = Path(args.exp_dir)
|
||||||
|
|
||||||
assert args.jit is False, "Support torchscript will be added later"
|
|
||||||
|
|
||||||
params = get_params()
|
params = get_params()
|
||||||
params.update(vars(args))
|
params.update(vars(args))
|
||||||
|
|
||||||
@ -155,6 +153,11 @@ def main():
|
|||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
if params.jit:
|
if params.jit:
|
||||||
|
# We won't use the forward() method of the model in C++, so just ignore
|
||||||
|
# it here.
|
||||||
|
# Otherwise, one of its arguments is a ragged tensor and is not
|
||||||
|
# torch scriptable.
|
||||||
|
model.__class__.forward = torch.jit.ignore(model.__class__.forward)
|
||||||
logging.info("Using torch.jit.script")
|
logging.info("Using torch.jit.script")
|
||||||
model = torch.jit.script(model)
|
model = torch.jit.script(model)
|
||||||
filename = params.exp_dir / "cpu_jit.pt"
|
filename = params.exp_dir / "cpu_jit.pt"
|
||||||
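The `torch.jit.ignore` trick above is general: it tells TorchScript to leave a method as a plain Python call instead of compiling it, which helps when one method (here `forward()`, whose argument is a ragged tensor) is not scriptable but the rest of the model is. A minimal self-contained sketch of the same pattern; `Toy` and the file name are made up:

```python
import torch


class Toy(torch.nn.Module):
    def forward(self, x):
        # Imagine an argument here that TorchScript cannot compile.
        return x


# Mark forward() as ignored so torch.jit.script() skips compiling it,
# mirroring model.__class__.forward = torch.jit.ignore(...) above.
Toy.forward = torch.jit.ignore(Toy.forward)
scripted = torch.jit.script(Toy())
scripted.save("toy_cpu_jit.pt")
```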
|
|||||||
@ -4,6 +4,8 @@
|
|||||||
Please refer to <https://icefall.readthedocs.io/en/latest/recipes/aishell/index.html>
|
Please refer to <https://icefall.readthedocs.io/en/latest/recipes/aishell/index.html>
|
||||||
for how to run models in this recipe.
|
for how to run models in this recipe.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Transducers
|
# Transducers
|
||||||
|
|
||||||
There are various folders containing the name `transducer` in this folder.
|
There are various folders containing the name `transducer` in this folder.
|
||||||
@ -14,6 +16,7 @@ The following table lists the differences among them.
|
|||||||
| `transducer_stateless` | Conformer | Embedding + Conv1d | with `k2.rnnt_loss` |
|
| `transducer_stateless` | Conformer | Embedding + Conv1d | with `k2.rnnt_loss` |
|
||||||
| `transducer_stateless_modified` | Conformer | Embedding + Conv1d | with modified transducer from `optimized_transducer` |
|
| `transducer_stateless_modified` | Conformer | Embedding + Conv1d | with modified transducer from `optimized_transducer` |
|
||||||
| `transducer_stateless_modified-2` | Conformer | Embedding + Conv1d | with modified transducer from `optimized_transducer` + extra data |
|
| `transducer_stateless_modified-2` | Conformer | Embedding + Conv1d | with modified transducer from `optimized_transducer` + extra data |
|
||||||
|
| `pruned_transducer_stateless3` | Conformer (reworked) | Embedding + Conv1d | pruned RNN-T + reworked model with random combiner + using aidatatang_200zh as extra data |
|
||||||
|
|
||||||
The decoder in `transducer_stateless` is modified from the paper
|
The decoder in `transducer_stateless` is modified from the paper
|
||||||
[Rnn-Transducer with Stateless Prediction Network](https://ieeexplore.ieee.org/document/9054419/).
|
[Rnn-Transducer with Stateless Prediction Network](https://ieeexplore.ieee.org/document/9054419/).
|
||||||
|
|||||||
@ -1,10 +1,145 @@
|
|||||||
## Results
|
## Results
|
||||||
### Aishell training result(Transducer-stateless)
|
|
||||||
|
### Aishell training result (Stateless Transducer)
|
||||||
|
|
||||||
|
#### Pruned transducer stateless 3
|
||||||
|
|
||||||
|
See <https://github.com/k2-fsa/icefall/pull/436>
|
||||||
|
|
||||||
|
|
||||||
|
[./pruned_transducer_stateless3](./pruned_transducer_stateless3)
|
||||||
|
|
||||||
|
It uses pruned RNN-T.
|
||||||
|
|
||||||
|
| | test | dev | comment |
|
||||||
|
|------------------------|------|------|---------------------------------------|
|
||||||
|
| greedy search | 5.39 | 5.09 | --epoch 29 --avg 5 --max-duration 600 |
|
||||||
|
| modified beam search | 5.05 | 4.79 | --epoch 29 --avg 5 --max-duration 600 |
|
||||||
|
| fast beam search | 5.13 | 4.91 | --epoch 29 --avg 5 --max-duration 600 |
|
||||||
|
|
||||||
|
Training command is:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./prepare.sh
|
||||||
|
./prepare_aidatatang_200zh.sh
|
||||||
|
|
||||||
|
export CUDA_VISIBLE_DEVICES="4,5,6,7"
|
||||||
|
|
||||||
|
./pruned_transducer_stateless3/train.py \
|
||||||
|
--exp-dir ./pruned_transducer_stateless3/exp-context-size-1 \
|
||||||
|
--world-size 4 \
|
||||||
|
--max-duration 200 \
|
||||||
|
--datatang-prob 0.5 \
|
||||||
|
--start-epoch 1 \
|
||||||
|
--num-epochs 30 \
|
||||||
|
--use-fp16 1 \
|
||||||
|
--num-encoder-layers 12 \
|
||||||
|
--dim-feedforward 2048 \
|
||||||
|
--nhead 8 \
|
||||||
|
--encoder-dim 512 \
|
||||||
|
--context-size 1 \
|
||||||
|
--decoder-dim 512 \
|
||||||
|
--joiner-dim 512 \
|
||||||
|
--master-port 12356
|
||||||
|
```
|
||||||
|
|
||||||
|
**Caution**: It uses `--context-size=1`.
|
||||||
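For readers unfamiliar with the flag: `--context-size` sets how many previous symbols the stateless prediction network conditions on, so 1 behaves like a bigram decoder. A loose sketch of the idea; the dimensions are made up, and the real icefall decoder only applies the grouped Conv1d when the context size is larger than 1:

```python
import torch

vocab_size, decoder_dim, context_size = 500, 512, 2
embedding = torch.nn.Embedding(vocab_size, decoder_dim)
# A grouped Conv1d mixes the embeddings of the last `context_size` symbols.
conv = torch.nn.Conv1d(
    decoder_dim, decoder_dim, kernel_size=context_size, groups=decoder_dim
)
y = torch.randint(0, vocab_size, (4, context_size))  # (batch, context)
decoder_out = conv(embedding(y).permute(0, 2, 1))    # (batch, decoder_dim, 1)
```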
|
|
||||||
|
The tensorboard log is available at
|
||||||
|
<https://tensorboard.dev/experiment/OKKacljwR6ik7rbDr5gMqQ>
|
||||||
|
|
||||||
|
The decoding command is:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
for epoch in 29; do
|
||||||
|
for avg in 5; do
|
||||||
|
for m in greedy_search modified_beam_search fast_beam_search; do
|
||||||
|
./pruned_transducer_stateless3/decode.py \
|
||||||
|
--exp-dir ./pruned_transducer_stateless3/exp-context-size-1 \
|
||||||
|
--epoch $epoch \
|
||||||
|
--avg $avg \
|
||||||
|
--use-averaged-model 1 \
|
||||||
|
--max-duration 600 \
|
||||||
|
--decoding-method $m \
|
||||||
|
--num-encoder-layers 12 \
|
||||||
|
--dim-feedforward 2048 \
|
||||||
|
--nhead 8 \
|
||||||
|
--context-size 1 \
|
||||||
|
--encoder-dim 512 \
|
||||||
|
--decoder-dim 512 \
|
||||||
|
--joiner-dim 512
|
||||||
|
done
|
||||||
|
done
|
||||||
|
done
|
||||||
|
```
|
||||||
|
|
||||||
|
Pretrained models, training logs, decoding logs, and decoding results
|
||||||
|
are available at
|
||||||
|
<https://huggingface.co/csukuangfj/icefall-aishell-pruned-transducer-stateless3-2022-06-20>
|
||||||
|
|
||||||
|
We have a tutorial in [sherpa](https://github.com/k2-fsa/sherpa) about how
|
||||||
|
to use the pre-trained model for non-streaming ASR. See
|
||||||
|
<https://k2-fsa.github.io/sherpa/offline_asr/conformer/aishell.html>
|
||||||
|
|
||||||
|
|
||||||
|
#### Pruned transducer stateless 2
|
||||||
|
|
||||||
|
See <https://github.com/k2-fsa/icefall/pull/536>
|
||||||
|
|
||||||
|
[./pruned_transducer_stateless2](./pruned_transducer_stateless2)
|
||||||
|
|
||||||
|
It uses pruned RNN-T.
|
||||||
|
|
||||||
|
| | test | dev | comment |
|
||||||
|
| -------------------- | ---- | ---- | -------------------------------------- |
|
||||||
|
| greedy search | 5.20 | 4.78 | --epoch 72 --avg 14 --max-duration 200 |
|
||||||
|
| modified beam search | 5.07 | 4.63 | --epoch 72 --avg 14 --max-duration 200 |
|
||||||
|
| fast beam search | 5.13 | 4.70 | --epoch 72 --avg 14 --max-duration 200 |
|
||||||
|
|
||||||
|
Training command is:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./prepare.sh
|
||||||
|
|
||||||
|
export CUDA_VISIBLE_DEVICES="0,1"
|
||||||
|
|
||||||
|
./pruned_transducer_stateless2/train.py \
|
||||||
|
--world-size 2 \
|
||||||
|
--num-epochs 90 \
|
||||||
|
--start-epoch 0 \
|
||||||
|
--exp-dir pruned_transducer_stateless2/exp \
|
||||||
|
--max-duration 200
|
||||||
|
```
|
||||||
|
|
||||||
|
The tensorboard log is available at
|
||||||
|
<https://tensorboard.dev/experiment/QI3PVzrGRrebxpbWUPwmkA/>
|
||||||
|
|
||||||
|
The decoding command is:
|
||||||
|
```bash
|
||||||
|
for m in greedy_search modified_beam_search fast_beam_search ; do
|
||||||
|
./pruned_transducer_stateless2/decode.py \
|
||||||
|
--epoch 72 \
|
||||||
|
--avg 14 \
|
||||||
|
--exp-dir ./pruned_transducer_stateless2/exp \
|
||||||
|
--lang-dir data/lang_char \
|
||||||
|
--max-duration 200 \
|
||||||
|
--decoding-method $m
|
||||||
|
|
||||||
|
done
|
||||||
|
```
|
||||||
|
|
||||||
|
Pretrained models, training logs, decoding logs, and decoding results
|
||||||
|
are available at
|
||||||
|
<https://huggingface.co/teapoly/icefall-aishell-pruned-transducer-stateless2-2022-08-18>
|
||||||
|
|
||||||
|
|
||||||
#### 2022-03-01
|
#### 2022-03-01
|
||||||
|
|
||||||
[./transducer_stateless_modified-2](./transducer_stateless_modified-2)
|
[./transducer_stateless_modified-2](./transducer_stateless_modified-2)
|
||||||
|
|
||||||
|
It uses [optimized_transducer](https://github.com/csukuangfj/optimized_transducer)
|
||||||
|
for computing RNN-T loss.
|
||||||
|
|
||||||
Stateless transducer + modified transducer + using [aidatatang_200zh](http://www.openslr.org/62/) as extra training data.
|
Stateless transducer + modified transducer + using [aidatatang_200zh](http://www.openslr.org/62/) as extra training data.
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -248,7 +248,9 @@ class ConformerEncoderLayer(nn.Module):
|
|||||||
residual = src
|
residual = src
|
||||||
if self.normalize_before:
|
if self.normalize_before:
|
||||||
src = self.norm_conv(src)
|
src = self.norm_conv(src)
|
||||||
src = residual + self.dropout(self.conv_module(src))
|
src = residual + self.dropout(
|
||||||
|
self.conv_module(src, src_key_padding_mask=src_key_padding_mask)
|
||||||
|
)
|
||||||
if not self.normalize_before:
|
if not self.normalize_before:
|
||||||
src = self.norm_conv(src)
|
src = self.norm_conv(src)
|
||||||
|
|
||||||
@ -364,7 +366,7 @@ class RelPositionalEncoding(torch.nn.Module):
|
|||||||
):
|
):
|
||||||
self.pe = self.pe.to(dtype=x.dtype, device=x.device)
|
self.pe = self.pe.to(dtype=x.dtype, device=x.device)
|
||||||
return
|
return
|
||||||
# Suppose `i` means to the position of query vecotr and `j` means the
|
# Suppose `i` means to the position of query vector and `j` means the
|
||||||
# position of key vector. We use positive relative positions when keys
|
# position of key vector. We use positive relative positions when keys
|
||||||
# are to the left (i>j) and negative relative positions otherwise (i<j).
|
# are to the left (i>j) and negative relative positions otherwise (i<j).
|
||||||
pe_positive = torch.zeros(x.size(1), self.d_model)
|
pe_positive = torch.zeros(x.size(1), self.d_model)
|
||||||
@ -879,11 +881,16 @@ class ConvolutionModule(nn.Module):
|
|||||||
)
|
)
|
||||||
self.activation = Swish()
|
self.activation = Swish()
|
||||||
|
|
||||||
def forward(self, x: Tensor) -> Tensor:
|
def forward(
|
||||||
|
self,
|
||||||
|
x: Tensor,
|
||||||
|
src_key_padding_mask: Optional[Tensor] = None,
|
||||||
|
) -> Tensor:
|
||||||
"""Compute convolution module.
|
"""Compute convolution module.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
x: Input tensor (#time, batch, channels).
|
x: Input tensor (#time, batch, channels).
|
||||||
|
src_key_padding_mask: the mask for the src keys per batch (optional).
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tensor: Output tensor (#time, batch, channels).
|
Tensor: Output tensor (#time, batch, channels).
|
||||||
@ -897,6 +904,8 @@ class ConvolutionModule(nn.Module):
|
|||||||
x = nn.functional.glu(x, dim=1) # (batch, channels, time)
|
x = nn.functional.glu(x, dim=1) # (batch, channels, time)
|
||||||
|
|
||||||
# 1D Depthwise Conv
|
# 1D Depthwise Conv
|
||||||
|
if src_key_padding_mask is not None:
|
||||||
|
x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
|
||||||
x = self.depthwise_conv(x)
|
x = self.depthwise_conv(x)
|
||||||
x = self.activation(self.norm(x))
|
x = self.activation(self.norm(x))
|
||||||
|
|
||||||
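The hunk above threads `src_key_padding_mask` into the convolution module so that padded frames are zeroed before the depthwise convolution; without this, padding leaks into neighboring real frames through the kernel's receptive field. A standalone sketch with illustrative shapes:

```python
import torch

batch, channels, time = 2, 8, 10
x = torch.randn(batch, channels, time)
src_key_padding_mask = torch.zeros(batch, time, dtype=torch.bool)
src_key_padding_mask[0, 7:] = True  # last 3 frames of the first cut are padding

# Zero out padded positions so the depthwise conv cannot mix them in.
x = x.masked_fill(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
depthwise = torch.nn.Conv1d(
    channels, channels, kernel_size=3, padding=1, groups=channels
)
y = depthwise(x)
```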
|
|||||||
@ -335,7 +335,7 @@ def decode_dataset(
|
|||||||
lexicon: Lexicon,
|
lexicon: Lexicon,
|
||||||
sos_id: int,
|
sos_id: int,
|
||||||
eos_id: int,
|
eos_id: int,
|
||||||
) -> Dict[str, List[Tuple[List[int], List[int]]]]:
|
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
|
||||||
"""Decode dataset.
|
"""Decode dataset.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -374,6 +374,7 @@ def decode_dataset(
|
|||||||
results = defaultdict(list)
|
results = defaultdict(list)
|
||||||
for batch_idx, batch in enumerate(dl):
|
for batch_idx, batch in enumerate(dl):
|
||||||
texts = batch["supervisions"]["text"]
|
texts = batch["supervisions"]["text"]
|
||||||
|
cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
|
||||||
|
|
||||||
hyps_dict = decode_one_batch(
|
hyps_dict = decode_one_batch(
|
||||||
params=params,
|
params=params,
|
||||||
@ -389,9 +390,9 @@ def decode_dataset(
|
|||||||
for lm_scale, hyps in hyps_dict.items():
|
for lm_scale, hyps in hyps_dict.items():
|
||||||
this_batch = []
|
this_batch = []
|
||||||
assert len(hyps) == len(texts)
|
assert len(hyps) == len(texts)
|
||||||
for hyp_words, ref_text in zip(hyps, texts):
|
for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
|
||||||
ref_words = ref_text.split()
|
ref_words = ref_text.split()
|
||||||
this_batch.append((ref_words, hyp_words))
|
this_batch.append((cut_id, ref_words, hyp_words))
|
||||||
|
|
||||||
results[lm_scale].extend(this_batch)
|
results[lm_scale].extend(this_batch)
|
||||||
|
|
||||||
@ -409,7 +410,7 @@ def decode_dataset(
|
|||||||
def save_results(
|
def save_results(
|
||||||
params: AttributeDict,
|
params: AttributeDict,
|
||||||
test_set_name: str,
|
test_set_name: str,
|
||||||
results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
|
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
|
||||||
):
|
):
|
||||||
if params.method == "attention-decoder":
|
if params.method == "attention-decoder":
|
||||||
# Set it to False since there are too many logs.
|
# Set it to False since there are too many logs.
|
||||||
@ -419,6 +420,7 @@ def save_results(
|
|||||||
test_set_wers = dict()
|
test_set_wers = dict()
|
||||||
for key, results in results_dict.items():
|
for key, results in results_dict.items():
|
||||||
recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}.txt"
|
recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}.txt"
|
||||||
|
results = sorted(results)
|
||||||
store_transcripts(filename=recog_path, texts=results)
|
store_transcripts(filename=recog_path, texts=results)
|
||||||
if enable_log:
|
if enable_log:
|
||||||
logging.info(f"The transcripts are stored in {recog_path}")
|
logging.info(f"The transcripts are stored in {recog_path}")
|
||||||
@ -429,7 +431,9 @@ def save_results(
|
|||||||
# we compute CER for aishell dataset.
|
# we compute CER for aishell dataset.
|
||||||
results_char = []
|
results_char = []
|
||||||
for res in results:
|
for res in results:
|
||||||
results_char.append((list("".join(res[0])), list("".join(res[1]))))
|
results_char.append(
|
||||||
|
(res[0], list("".join(res[1])), list("".join(res[2])))
|
||||||
|
)
|
||||||
with open(errs_filename, "w") as f:
|
with open(errs_filename, "w") as f:
|
||||||
wer = write_error_stats(
|
wer = write_error_stats(
|
||||||
f, f"{test_set_name}-{key}", results_char, enable_log=enable_log
|
f, f"{test_set_name}-{key}", results_char, enable_log=enable_log
|
||||||
@ -537,6 +541,8 @@ def main():
|
|||||||
num_param = sum([p.numel() for p in model.parameters()])
|
num_param = sum([p.numel() for p in model.parameters()])
|
||||||
logging.info(f"Number of model parameters: {num_param}")
|
logging.info(f"Number of model parameters: {num_param}")
|
||||||
|
|
||||||
|
# we need cut ids to display recognition results.
|
||||||
|
args.return_cuts = True
|
||||||
aishell = AishellAsrDataModule(args)
|
aishell = AishellAsrDataModule(args)
|
||||||
test_cuts = aishell.test_cuts()
|
test_cuts = aishell.test_cuts()
|
||||||
test_dl = aishell.test_dataloaders(test_cuts)
|
test_dl = aishell.test_dataloaders(test_cuts)
|
||||||
|
|||||||
@ -195,9 +195,9 @@ def get_params() -> AttributeDict:
|
|||||||
"best_train_epoch": -1,
|
"best_train_epoch": -1,
|
||||||
"best_valid_epoch": -1,
|
"best_valid_epoch": -1,
|
||||||
"batch_idx_train": 0,
|
"batch_idx_train": 0,
|
||||||
"log_interval": 10,
|
"log_interval": 50,
|
||||||
"reset_interval": 200,
|
"reset_interval": 200,
|
||||||
"valid_interval": 3000,
|
"valid_interval": 2000,
|
||||||
# parameters for k2.ctc_loss
|
# parameters for k2.ctc_loss
|
||||||
"beam_size": 10,
|
"beam_size": 10,
|
||||||
"reduction": "sum",
|
"reduction": "sum",
|
||||||
|
|||||||
@ -248,7 +248,9 @@ class ConformerEncoderLayer(nn.Module):
|
|||||||
residual = src
|
residual = src
|
||||||
if self.normalize_before:
|
if self.normalize_before:
|
||||||
src = self.norm_conv(src)
|
src = self.norm_conv(src)
|
||||||
src = residual + self.dropout(self.conv_module(src))
|
src = residual + self.dropout(
|
||||||
|
self.conv_module(src, src_key_padding_mask=src_key_padding_mask)
|
||||||
|
)
|
||||||
if not self.normalize_before:
|
if not self.normalize_before:
|
||||||
src = self.norm_conv(src)
|
src = self.norm_conv(src)
|
||||||
|
|
||||||
@ -364,7 +366,7 @@ class RelPositionalEncoding(torch.nn.Module):
|
|||||||
):
|
):
|
||||||
self.pe = self.pe.to(dtype=x.dtype, device=x.device)
|
self.pe = self.pe.to(dtype=x.dtype, device=x.device)
|
||||||
return
|
return
|
||||||
# Suppose `i` means to the position of query vecotr and `j` means the
|
# Suppose `i` means to the position of query vector and `j` means the
|
||||||
# position of key vector. We use positive relative positions when keys
|
# position of key vector. We use positive relative positions when keys
|
||||||
# are to the left (i>j) and negative relative positions otherwise (i<j).
|
# are to the left (i>j) and negative relative positions otherwise (i<j).
|
||||||
pe_positive = torch.zeros(x.size(1), self.d_model)
|
pe_positive = torch.zeros(x.size(1), self.d_model)
|
||||||
@ -879,11 +881,16 @@ class ConvolutionModule(nn.Module):
|
|||||||
)
|
)
|
||||||
self.activation = Swish()
|
self.activation = Swish()
|
||||||
|
|
||||||
def forward(self, x: Tensor) -> Tensor:
|
def forward(
|
||||||
|
self,
|
||||||
|
x: Tensor,
|
||||||
|
src_key_padding_mask: Optional[Tensor] = None,
|
||||||
|
) -> Tensor:
|
||||||
"""Compute convolution module.
|
"""Compute convolution module.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
x: Input tensor (#time, batch, channels).
|
x: Input tensor (#time, batch, channels).
|
||||||
|
src_key_padding_mask: the mask for the src keys per batch (optional).
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tensor: Output tensor (#time, batch, channels).
|
Tensor: Output tensor (#time, batch, channels).
|
||||||
@ -897,6 +904,8 @@ class ConvolutionModule(nn.Module):
|
|||||||
x = nn.functional.glu(x, dim=1) # (batch, channels, time)
|
x = nn.functional.glu(x, dim=1) # (batch, channels, time)
|
||||||
|
|
||||||
# 1D Depthwise Conv
|
# 1D Depthwise Conv
|
||||||
|
if src_key_padding_mask is not None:
|
||||||
|
x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
|
||||||
x = self.depthwise_conv(x)
|
x = self.depthwise_conv(x)
|
||||||
x = self.activation(self.norm(x))
|
x = self.activation(self.norm(x))
|
||||||
|
|
||||||
|
|||||||
@ -347,7 +347,7 @@ def decode_dataset(
|
|||||||
lexicon: Lexicon,
|
lexicon: Lexicon,
|
||||||
sos_id: int,
|
sos_id: int,
|
||||||
eos_id: int,
|
eos_id: int,
|
||||||
) -> Dict[str, List[Tuple[List[int], List[int]]]]:
|
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
|
||||||
"""Decode dataset.
|
"""Decode dataset.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -386,6 +386,7 @@ def decode_dataset(
|
|||||||
results = defaultdict(list)
|
results = defaultdict(list)
|
||||||
for batch_idx, batch in enumerate(dl):
|
for batch_idx, batch in enumerate(dl):
|
||||||
texts = batch["supervisions"]["text"]
|
texts = batch["supervisions"]["text"]
|
||||||
|
cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
|
||||||
|
|
||||||
hyps_dict = decode_one_batch(
|
hyps_dict = decode_one_batch(
|
||||||
params=params,
|
params=params,
|
||||||
@ -401,9 +402,9 @@ def decode_dataset(
|
|||||||
for lm_scale, hyps in hyps_dict.items():
|
for lm_scale, hyps in hyps_dict.items():
|
||||||
this_batch = []
|
this_batch = []
|
||||||
assert len(hyps) == len(texts)
|
assert len(hyps) == len(texts)
|
||||||
for hyp_words, ref_text in zip(hyps, texts):
|
for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
|
||||||
ref_words = ref_text.split()
|
ref_words = ref_text.split()
|
||||||
this_batch.append((ref_words, hyp_words))
|
this_batch.append((cut_id, ref_words, hyp_words))
|
||||||
|
|
||||||
results[lm_scale].extend(this_batch)
|
results[lm_scale].extend(this_batch)
|
||||||
|
|
||||||
@ -421,7 +422,7 @@ def decode_dataset(
|
|||||||
def save_results(
|
def save_results(
|
||||||
params: AttributeDict,
|
params: AttributeDict,
|
||||||
test_set_name: str,
|
test_set_name: str,
|
||||||
results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
|
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
|
||||||
):
|
):
|
||||||
if params.method == "attention-decoder":
|
if params.method == "attention-decoder":
|
||||||
# Set it to False since there are too many logs.
|
# Set it to False since there are too many logs.
|
||||||
@ -431,6 +432,7 @@ def save_results(
|
|||||||
test_set_wers = dict()
|
test_set_wers = dict()
|
||||||
for key, results in results_dict.items():
|
for key, results in results_dict.items():
|
||||||
recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}.txt"
|
recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}.txt"
|
||||||
|
results = sorted(results)
|
||||||
store_transcripts(filename=recog_path, texts=results)
|
store_transcripts(filename=recog_path, texts=results)
|
||||||
if enable_log:
|
if enable_log:
|
||||||
logging.info(f"The transcripts are stored in {recog_path}")
|
logging.info(f"The transcripts are stored in {recog_path}")
|
||||||
@ -441,7 +443,9 @@ def save_results(
|
|||||||
# we compute CER for aishell dataset.
|
# we compute CER for aishell dataset.
|
||||||
results_char = []
|
results_char = []
|
||||||
for res in results:
|
for res in results:
|
||||||
results_char.append((list("".join(res[0])), list("".join(res[1]))))
|
results_char.append(
|
||||||
|
(res[0], list("".join(res[1])), list("".join(res[2])))
|
||||||
|
)
|
||||||
with open(errs_filename, "w") as f:
|
with open(errs_filename, "w") as f:
|
||||||
wer = write_error_stats(
|
wer = write_error_stats(
|
||||||
f, f"{test_set_name}-{key}", results_char, enable_log=enable_log
|
f, f"{test_set_name}-{key}", results_char, enable_log=enable_log
|
||||||
@ -556,6 +560,8 @@ def main():
|
|||||||
num_param = sum([p.numel() for p in model.parameters()])
|
num_param = sum([p.numel() for p in model.parameters()])
|
||||||
logging.info(f"Number of model parameters: {num_param}")
|
logging.info(f"Number of model parameters: {num_param}")
|
||||||
|
|
||||||
|
# we need cut ids to display recognition results.
|
||||||
|
args.return_cuts = True
|
||||||
aishell = AishellAsrDataModule(args)
|
aishell = AishellAsrDataModule(args)
|
||||||
test_cuts = aishell.test_cuts()
|
test_cuts = aishell.test_cuts()
|
||||||
test_dl = aishell.test_dataloaders(test_cuts)
|
test_dl = aishell.test_dataloaders(test_cuts)
|
||||||
|
|||||||
126
egs/aishell/ASR/local/compute_fbank_aidatatang_200zh.py
Executable file
@ -0,0 +1,126 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||||
|
#
|
||||||
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
This file computes fbank features of the aidatatang_200zh dataset.
|
||||||
|
It looks for manifests in the directory data/manifests.
|
||||||
|
|
||||||
|
The generated fbank features are saved in data/fbank.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
|
||||||
|
from lhotse.recipes.utils import read_manifests_if_cached
|
||||||
|
|
||||||
|
from icefall.utils import get_executor
|
||||||
|
|
||||||
|
# Torch's multithreaded behavior needs to be disabled or
|
||||||
|
# it wastes a lot of CPU and slows things down.
|
||||||
|
# Do this outside of main() in case it needs to take effect
|
||||||
|
# even when we are not invoking the main (e.g. when spawning subprocesses).
|
||||||
|
torch.set_num_threads(1)
|
||||||
|
torch.set_num_interop_threads(1)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
|
||||||
|
src_dir = Path("data/manifests")
|
||||||
|
output_dir = Path("data/fbank")
|
||||||
|
num_jobs = min(15, os.cpu_count())
|
||||||
|
|
||||||
|
dataset_parts = (
|
||||||
|
"train",
|
||||||
|
"test",
|
||||||
|
"dev",
|
||||||
|
)
|
||||||
|
prefix = "aidatatang"
|
||||||
|
suffix = "jsonl.gz"
|
||||||
|
manifests = read_manifests_if_cached(
|
||||||
|
dataset_parts=dataset_parts,
|
||||||
|
output_dir=src_dir,
|
||||||
|
prefix=prefix,
|
||||||
|
suffix=suffix,
|
||||||
|
)
|
||||||
|
assert manifests is not None
|
||||||
|
|
||||||
|
assert len(manifests) == len(dataset_parts), (
|
||||||
|
len(manifests),
|
||||||
|
len(dataset_parts),
|
||||||
|
list(manifests.keys()),
|
||||||
|
dataset_parts,
|
||||||
|
)
|
||||||
|
|
||||||
|
extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
|
||||||
|
|
||||||
|
with get_executor() as ex: # Initialize the executor only once.
|
||||||
|
for partition, m in manifests.items():
|
||||||
|
if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
|
||||||
|
logging.info(f"{partition} already exists - skipping.")
|
||||||
|
continue
|
||||||
|
logging.info(f"Processing {partition}")
|
||||||
|
|
||||||
|
for sup in m["supervisions"]:
|
||||||
|
sup.custom = {"origin": "aidatatang_200zh"}
|
||||||
|
|
||||||
|
cut_set = CutSet.from_manifests(
|
||||||
|
recordings=m["recordings"],
|
||||||
|
supervisions=m["supervisions"],
|
||||||
|
)
|
||||||
|
if "train" in partition:
|
||||||
|
cut_set = (
|
||||||
|
cut_set
|
||||||
|
+ cut_set.perturb_speed(0.9)
|
||||||
|
+ cut_set.perturb_speed(1.1)
|
||||||
|
)
|
||||||
|
cut_set = cut_set.compute_and_store_features(
|
||||||
|
extractor=extractor,
|
||||||
|
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
|
||||||
|
# when an executor is specified, make more partitions
|
||||||
|
num_jobs=num_jobs if ex is None else 80,
|
||||||
|
executor=ex,
|
||||||
|
storage_type=LilcomChunkyWriter,
|
||||||
|
)
|
||||||
|
|
||||||
|
cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
|
||||||
|
|
||||||
|
|
||||||
|
def get_args():
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument(
|
||||||
|
"--num-mel-bins",
|
||||||
|
type=int,
|
||||||
|
default=80,
|
||||||
|
help="""The number of mel bins for Fbank""",
|
||||||
|
)
|
||||||
|
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
formatter = (
|
||||||
|
"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
||||||
|
)
|
||||||
|
|
||||||
|
logging.basicConfig(format=formatter, level=logging.INFO)
|
||||||
|
|
||||||
|
args = get_args()
|
||||||
|
compute_fbank_aidatatang_200zh(num_mel_bins=args.num_mel_bins)
|
||||||
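A condensed sketch of what the new script boils down to, with illustrative paths; the real script reads cached recording/supervision manifests via `read_manifests_if_cached` and parallelizes feature extraction with an executor:

```python
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter

# Triple the training data via speed perturbation, then extract 80-dim
# fbank features and store them with the chunky lilcom writer.
cuts = CutSet.from_file("data/manifests/aidatatang_cuts_train_raw.jsonl.gz")
cuts = cuts + cuts.perturb_speed(0.9) + cuts.perturb_speed(1.1)
cuts = cuts.compute_and_store_features(
    extractor=Fbank(FbankConfig(num_mel_bins=80)),
    storage_path="data/fbank/aidatatang_feats_train",
    num_jobs=15,
    storage_type=LilcomChunkyWriter,
)
cuts.to_file("data/fbank/aidatatang_cuts_train.jsonl.gz")
```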
@ -29,7 +29,7 @@ import os
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
|
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
|
||||||
from lhotse.recipes.utils import read_manifests_if_cached
|
from lhotse.recipes.utils import read_manifests_if_cached
|
||||||
|
|
||||||
from icefall.utils import get_executor
|
from icefall.utils import get_executor
|
||||||
@ -52,16 +52,28 @@ def compute_fbank_aishell(num_mel_bins: int = 80):
|
|||||||
"dev",
|
"dev",
|
||||||
"test",
|
"test",
|
||||||
)
|
)
|
||||||
|
prefix = "aishell"
|
||||||
|
suffix = "jsonl.gz"
|
||||||
manifests = read_manifests_if_cached(
|
manifests = read_manifests_if_cached(
|
||||||
prefix="aishell", dataset_parts=dataset_parts, output_dir=src_dir
|
dataset_parts=dataset_parts,
|
||||||
|
output_dir=src_dir,
|
||||||
|
prefix=prefix,
|
||||||
|
suffix=suffix,
|
||||||
)
|
)
|
||||||
assert manifests is not None
|
assert manifests is not None
|
||||||
|
|
||||||
|
assert len(manifests) == len(dataset_parts), (
|
||||||
|
len(manifests),
|
||||||
|
len(dataset_parts),
|
||||||
|
list(manifests.keys()),
|
||||||
|
dataset_parts,
|
||||||
|
)
|
||||||
|
|
||||||
extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
|
extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
|
||||||
|
|
||||||
with get_executor() as ex: # Initialize the executor only once.
|
with get_executor() as ex: # Initialize the executor only once.
|
||||||
for partition, m in manifests.items():
|
for partition, m in manifests.items():
|
||||||
if (output_dir / f"cuts_{partition}.json.gz").is_file():
|
if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
|
||||||
logging.info(f"{partition} already exists - skipping.")
|
logging.info(f"{partition} already exists - skipping.")
|
||||||
continue
|
continue
|
||||||
logging.info(f"Processing {partition}")
|
logging.info(f"Processing {partition}")
|
||||||
@ -77,13 +89,13 @@ def compute_fbank_aishell(num_mel_bins: int = 80):
|
|||||||
)
|
)
|
||||||
cut_set = cut_set.compute_and_store_features(
|
cut_set = cut_set.compute_and_store_features(
|
||||||
extractor=extractor,
|
extractor=extractor,
|
||||||
storage_path=f"{output_dir}/feats_{partition}",
|
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
|
||||||
# when an executor is specified, make more partitions
|
# when an executor is specified, make more partitions
|
||||||
num_jobs=num_jobs if ex is None else 80,
|
num_jobs=num_jobs if ex is None else 80,
|
||||||
executor=ex,
|
executor=ex,
|
||||||
storage_type=LilcomHdf5Writer,
|
storage_type=LilcomChunkyWriter,
|
||||||
)
|
)
|
||||||
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
|
cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
|
||||||
|
|
||||||
|
|
||||||
def get_args():
|
def get_args():
|
||||||
|
|||||||
@ -25,18 +25,18 @@ for usage.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
from lhotse import load_manifest
|
from lhotse import load_manifest_lazy
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# path = "./data/fbank/cuts_train.json.gz"
|
# path = "./data/fbank/aishell_cuts_train.jsonl.gz"
|
||||||
# path = "./data/fbank/cuts_test.json.gz"
|
# path = "./data/fbank/aishell_cuts_test.jsonl.gz"
|
||||||
# path = "./data/fbank/cuts_dev.json.gz"
|
path = "./data/fbank/aishell_cuts_dev.jsonl.gz"
|
||||||
# path = "./data/fbank/aidatatang_200zh/cuts_train_raw.jsonl.gz"
|
# path = "./data/fbank/aidatatang_cuts_train.jsonl.gz"
|
||||||
# path = "./data/fbank/aidatatang_200zh/cuts_test_raw.jsonl.gz"
|
# path = "./data/fbank/aidatatang_cuts_test.jsonl.gz"
|
||||||
path = "./data/fbank/aidatatang_200zh/cuts_dev_raw.jsonl.gz"
|
# path = "./data/fbank/aidatatang_cuts_dev.jsonl.gz"
|
||||||
|
|
||||||
cuts = load_manifest(path)
|
cuts = load_manifest_lazy(path)
|
||||||
cuts.describe()
|
cuts.describe()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,71 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# Copyright 2022 Xiaomi Corp. (Fangjun Kuang)
|
|
||||||
#
|
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
import logging
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from lhotse import CutSet
|
|
||||||
from lhotse.recipes.utils import read_manifests_if_cached
|
|
||||||
|
|
||||||
|
|
||||||
def preprocess_aidatatang_200zh():
|
|
||||||
src_dir = Path("data/manifests/aidatatang_200zh")
|
|
||||||
output_dir = Path("data/fbank/aidatatang_200zh")
|
|
||||||
output_dir.mkdir(exist_ok=True, parents=True)
|
|
||||||
|
|
||||||
dataset_parts = (
|
|
||||||
"train",
|
|
||||||
"test",
|
|
||||||
"dev",
|
|
||||||
)
|
|
||||||
|
|
||||||
logging.info("Loading manifest")
|
|
||||||
manifests = read_manifests_if_cached(
|
|
||||||
dataset_parts=dataset_parts, output_dir=src_dir, prefix="aidatatang"
|
|
||||||
)
|
|
||||||
assert len(manifests) > 0
|
|
||||||
|
|
||||||
for partition, m in manifests.items():
|
|
||||||
logging.info(f"Processing {partition}")
|
|
||||||
raw_cuts_path = output_dir / f"cuts_{partition}_raw.jsonl.gz"
|
|
||||||
if raw_cuts_path.is_file():
|
|
||||||
logging.info(f"{partition} already exists - skipping")
|
|
||||||
continue
|
|
||||||
|
|
||||||
for sup in m["supervisions"]:
|
|
||||||
sup.custom = {"origin": "aidatatang_200zh"}
|
|
||||||
|
|
||||||
cut_set = CutSet.from_manifests(
|
|
||||||
recordings=m["recordings"],
|
|
||||||
supervisions=m["supervisions"],
|
|
||||||
)
|
|
||||||
|
|
||||||
logging.info(f"Saving to {raw_cuts_path}")
|
|
||||||
cut_set.to_file(raw_cuts_path)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
formatter = (
|
|
||||||
"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
|
||||||
)
|
|
||||||
logging.basicConfig(format=formatter, level=logging.INFO)
|
|
||||||
|
|
||||||
preprocess_aidatatang_200zh()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
@ -18,7 +18,7 @@ stop_stage=10
|
|||||||
# This directory contains the language model downloaded from
|
# This directory contains the language model downloaded from
|
||||||
# https://huggingface.co/pkufool/aishell_lm
|
# https://huggingface.co/pkufool/aishell_lm
|
||||||
#
|
#
|
||||||
# - 3-gram.unpruned.apra
|
# - 3-gram.unpruned.arpa
|
||||||
#
|
#
|
||||||
# - $dl_dir/musan
|
# - $dl_dir/musan
|
||||||
# This directory contains the following directories downloaded from
|
# This directory contains the following directories downloaded from
|
||||||
@ -48,6 +48,8 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
|
|||||||
log "stage -1: Download LM"
|
log "stage -1: Download LM"
|
||||||
# We assume that you have installed the git-lfs, if not, you could install it
|
# We assume that you have installed the git-lfs, if not, you could install it
|
||||||
# using: `sudo apt-get install git-lfs && git-lfs install`
|
# using: `sudo apt-get install git-lfs && git-lfs install`
|
||||||
|
git lfs 1>/dev/null 2>&1 || (echo "please install git-lfs, consider using: sudo apt-get install git-lfs && git-lfs install" && exit 1)
|
||||||
|
|
||||||
if [ ! -f $dl_dir/lm/3-gram.unpruned.arpa ]; then
|
if [ ! -f $dl_dir/lm/3-gram.unpruned.arpa ]; then
|
||||||
git clone https://huggingface.co/pkufool/aishell_lm $dl_dir/lm
|
git clone https://huggingface.co/pkufool/aishell_lm $dl_dir/lm
|
||||||
fi
|
fi
|
||||||
|
|||||||
@ -42,18 +42,18 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
|||||||
log "Stage 1: Prepare manifest"
|
log "Stage 1: Prepare manifest"
|
||||||
# We assume that you have downloaded the aidatatang_200zh corpus
|
# We assume that you have downloaded the aidatatang_200zh corpus
|
||||||
# to $dl_dir/aidatatang_200zh
|
# to $dl_dir/aidatatang_200zh
|
||||||
if [ ! -f data/manifests/aidatatang_200zh/.manifests.done ]; then
|
if [ ! -f data/manifests/.aidatatang_200zh_manifests.done ]; then
|
||||||
mkdir -p data/manifests/aidatatang_200zh
|
mkdir -p data/manifests
|
||||||
lhotse prepare aidatatang-200zh $dl_dir data/manifests/aidatatang_200zh
|
lhotse prepare aidatatang-200zh $dl_dir data/manifests
|
||||||
touch data/manifests/aidatatang_200zh/.manifests.done
|
touch data/manifests/.aidatatang_200zh_manifests.done
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
||||||
log "Stage 2: Process aidatatang_200zh"
|
log "Stage 2: Process aidatatang_200zh"
|
||||||
if [ ! -f data/fbank/aidatatang_200zh/.fbank.done ]; then
|
if [ ! -f data/fbank/.aidatatang_200zh_fbank.done ]; then
|
||||||
mkdir -p data/fbank/aidatatang_200zh
|
mkdir -p data/fbank
|
||||||
lhotse prepare aidatatang-200zh $dl_dir data/manifests/aidatatang_200zh
|
./local/compute_fbank_aidatatang_200zh.py
|
||||||
touch data/fbank/aidatatang_200zh/.fbank.done
|
touch data/fbank/.aidatatang_200zh_fbank.done
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|||||||
1
egs/aishell/ASR/pruned_transducer_stateless2/asr_datamodule.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../tdnn_lstm_ctc/asr_datamodule.py
|
||||||
1
egs/aishell/ASR/pruned_transducer_stateless2/beam_search.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/pruned_transducer_stateless2/beam_search.py
|
||||||
1
egs/aishell/ASR/pruned_transducer_stateless2/conformer.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/pruned_transducer_stateless2/conformer.py
|
||||||
573
egs/aishell/ASR/pruned_transducer_stateless2/decode.py
Executable file
@ -0,0 +1,573 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# Copyright 2021-2022 Xiaomi Corporation (Author: Fangjun Kuang,
|
||||||
|
# Zengwei Yao)
|
||||||
|
#
|
||||||
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
Usage:
|
||||||
|
(1) greedy search
|
||||||
|
./pruned_transducer_stateless2/decode.py \
|
||||||
|
--epoch 84 \
|
||||||
|
--avg 25 \
|
||||||
|
--exp-dir ./pruned_transducer_stateless2/exp \
|
||||||
|
--max-duration 600 \
|
||||||
|
--decoding-method greedy_search
|
||||||
|
|
||||||
|
(2) beam search (not recommended)
|
||||||
|
./pruned_transducer_stateless2/decode.py \
|
||||||
|
--epoch 84 \
|
||||||
|
--avg 25 \
|
||||||
|
--exp-dir ./pruned_transducer_stateless2/exp \
|
||||||
|
--max-duration 600 \
|
||||||
|
--decoding-method beam_search \
|
||||||
|
--beam-size 4
|
||||||
|
|
||||||
|
(3) modified beam search
|
||||||
|
./pruned_transducer_stateless2/decode.py \
|
||||||
|
--epoch 84 \
|
||||||
|
--avg 25 \
|
||||||
|
--exp-dir ./pruned_transducer_stateless2/exp \
|
||||||
|
--max-duration 600 \
|
||||||
|
--decoding-method modified_beam_search \
|
||||||
|
--beam-size 4
|
||||||
|
|
||||||
|
(4) fast beam search
|
||||||
|
./pruned_transducer_stateless2/decode.py \
|
||||||
|
--epoch 84 \
|
||||||
|
--avg 25 \
|
||||||
|
--exp-dir ./pruned_transducer_stateless2/exp \
|
||||||
|
--max-duration 600 \
|
||||||
|
--decoding-method fast_beam_search \
|
||||||
|
--beam 4 \
|
||||||
|
--max-contexts 4 \
|
||||||
|
--max-states 8
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
from collections import defaultdict
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
import k2
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from asr_datamodule import AishellAsrDataModule
|
||||||
|
from beam_search import (
|
||||||
|
beam_search,
|
||||||
|
fast_beam_search_one_best,
|
||||||
|
greedy_search,
|
||||||
|
greedy_search_batch,
|
||||||
|
modified_beam_search,
|
||||||
|
)
|
||||||
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
|
|
||||||
|
from icefall.checkpoint import (
|
||||||
|
average_checkpoints,
|
||||||
|
find_checkpoints,
|
||||||
|
load_checkpoint,
|
||||||
|
)
|
||||||
|
from icefall.lexicon import Lexicon
|
||||||
|
from icefall.utils import (
|
||||||
|
AttributeDict,
|
||||||
|
setup_logger,
|
||||||
|
store_transcripts,
|
||||||
|
write_error_stats,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_parser():
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--epoch",
|
||||||
|
type=int,
|
||||||
|
default=30,
|
||||||
|
help="""It specifies the checkpoint to use for decoding.
|
||||||
|
Note: Epoch counts from 1.
|
||||||
|
You can specify --avg to use more checkpoints for model averaging.""",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--iter",
|
||||||
|
type=int,
|
||||||
|
default=0,
|
||||||
|
help="""If positive, --epoch is ignored and it
|
||||||
|
will use the checkpoint exp_dir/checkpoint-iter.pt.
|
||||||
|
You can specify --avg to use more checkpoints for model averaging.
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--avg",
|
||||||
|
type=int,
|
||||||
|
default=15,
|
||||||
|
help="Number of checkpoints to average. Automatically select "
|
||||||
|
"consecutive checkpoints before the checkpoint specified by "
|
||||||
|
"'--epoch' and '--iter'",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--exp-dir",
|
||||||
|
type=str,
|
||||||
|
default="pruned_transducer_stateless2/exp",
|
||||||
|
help="The experiment dir",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--lang-dir",
|
||||||
|
type=str,
|
||||||
|
default="data/lang_char",
|
||||||
|
help="The lang dir",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--decoding-method",
|
||||||
|
type=str,
|
||||||
|
default="greedy_search",
|
||||||
|
help="""Possible values are:
|
||||||
|
- greedy_search
|
||||||
|
- beam_search
|
||||||
|
- modified_beam_search
|
||||||
|
- fast_beam_search
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--beam-size",
|
||||||
|
type=int,
|
||||||
|
default=4,
|
||||||
|
help="""An integer indicating how many candidates we will keep for each
|
||||||
|
frame. Used only when --decoding-method is beam_search or
|
||||||
|
modified_beam_search.""",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--beam",
|
||||||
|
type=float,
|
||||||
|
default=4,
|
||||||
|
help="""A floating point value to calculate the cutoff score during beam
|
||||||
|
search (i.e., `cutoff = max-score - beam`), which is the same as the
|
||||||
|
`beam` in Kaldi.
|
||||||
|
Used only when --decoding-method is fast_beam_search""",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--max-contexts",
|
||||||
|
type=int,
|
||||||
|
default=4,
|
||||||
|
help="""Used only when --decoding-method is
|
||||||
|
fast_beam_search""",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--max-states",
|
||||||
|
type=int,
|
||||||
|
default=8,
|
||||||
|
help="""Used only when --decoding-method is
|
||||||
|
fast_beam_search""",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--context-size",
|
||||||
|
type=int,
|
||||||
|
default=1,
|
||||||
|
help="The context size in the decoder. 1 means bigram; "
|
||||||
|
"2 means tri-gram",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--max-sym-per-frame",
|
||||||
|
type=int,
|
||||||
|
default=1,
|
||||||
|
help="""Maximum number of symbols per frame.
|
||||||
|
Used only when --decoding_method is greedy_search""",
|
||||||
|
)
|
||||||
|
|
||||||
|
add_model_arguments(parser)
|
||||||
|
|
||||||
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
def decode_one_batch(
|
||||||
|
params: AttributeDict,
|
||||||
|
model: nn.Module,
|
||||||
|
token_table: k2.SymbolTable,
|
||||||
|
batch: dict,
|
||||||
|
decoding_graph: Optional[k2.Fsa] = None,
|
||||||
|
) -> Dict[str, List[List[str]]]:
|
||||||
|
"""Decode one batch and return the result in a dict. The dict has the
|
||||||
|
following format:
|
||||||
|
|
||||||
|
- key: It indicates the setting used for decoding. For example,
|
||||||
|
if greedy_search is used, it would be "greedy_search"
|
||||||
|
If beam search with a beam size of 7 is used, it would be
|
||||||
|
"beam_7"
|
||||||
|
- value: It contains the decoding result. `len(value)` equals to
|
||||||
|
batch size. `value[i]` is the decoding result for the i-th
|
||||||
|
utterance in the given batch.
|
||||||
|
Args:
|
||||||
|
params:
|
||||||
|
It's the return value of :func:`get_params`.
|
||||||
|
model:
|
||||||
|
The neural model.
|
||||||
|
token_table:
|
||||||
|
It maps token ID to a string.
|
||||||
|
batch:
|
||||||
|
It is the return value from iterating
|
||||||
|
`lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
|
||||||
|
for the format of the `batch`.
|
||||||
|
decoding_graph:
|
||||||
|
The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
|
||||||
|
only when --decoding_method is fast_beam_search.
|
||||||
|
Returns:
|
||||||
|
Return the decoding result. See above description for the format of
|
||||||
|
the returned dict.
|
||||||
|
"""
|
||||||
|
device = next(model.parameters()).device
|
||||||
|
feature = batch["inputs"]
|
||||||
|
assert feature.ndim == 3
|
||||||
|
|
||||||
|
feature = feature.to(device)
|
||||||
|
# at entry, feature is (N, T, C)
|
||||||
|
|
||||||
|
supervisions = batch["supervisions"]
|
||||||
|
feature_lens = supervisions["num_frames"].to(device)
|
||||||
|
|
||||||
|
encoder_out, encoder_out_lens = model.encoder(
|
||||||
|
x=feature, x_lens=feature_lens
|
||||||
|
)
|
||||||
|
|
||||||
|
if params.decoding_method == "fast_beam_search":
|
||||||
|
hyp_tokens = fast_beam_search_one_best(
|
||||||
|
model=model,
|
||||||
|
decoding_graph=decoding_graph,
|
||||||
|
encoder_out=encoder_out,
|
||||||
|
encoder_out_lens=encoder_out_lens,
|
||||||
|
beam=params.beam,
|
||||||
|
max_contexts=params.max_contexts,
|
||||||
|
max_states=params.max_states,
|
||||||
|
)
|
||||||
|
elif (
|
||||||
|
params.decoding_method == "greedy_search"
|
||||||
|
and params.max_sym_per_frame == 1
|
||||||
|
):
|
||||||
|
hyp_tokens = greedy_search_batch(
|
||||||
|
model=model,
|
||||||
|
encoder_out=encoder_out,
|
||||||
|
encoder_out_lens=encoder_out_lens,
|
||||||
|
)
|
||||||
|
elif params.decoding_method == "modified_beam_search":
|
||||||
|
hyp_tokens = modified_beam_search(
|
||||||
|
model=model,
|
||||||
|
encoder_out=encoder_out,
|
||||||
|
encoder_out_lens=encoder_out_lens,
|
||||||
|
beam=params.beam_size,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
hyp_tokens = []
|
||||||
|
batch_size = encoder_out.size(0)
|
||||||
|
for i in range(batch_size):
|
||||||
|
# fmt: off
|
||||||
|
encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
|
||||||
|
# fmt: on
|
||||||
|
if params.decoding_method == "greedy_search":
|
||||||
|
hyp = greedy_search(
|
||||||
|
model=model,
|
||||||
|
encoder_out=encoder_out_i,
|
||||||
|
max_sym_per_frame=params.max_sym_per_frame,
|
||||||
|
)
|
||||||
|
elif params.decoding_method == "beam_search":
|
||||||
|
hyp = beam_search(
|
||||||
|
model=model,
|
||||||
|
encoder_out=encoder_out_i,
|
||||||
|
beam=params.beam_size,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
f"Unsupported decoding method: {params.decoding_method}"
|
||||||
|
)
|
||||||
|
hyp_tokens.append(hyp)
|
||||||
|
|
||||||
|
hyps = [[token_table[t] for t in tokens] for tokens in hyp_tokens]
|
||||||
|
|
||||||
|
if params.decoding_method == "greedy_search":
|
||||||
|
return {"greedy_search": hyps}
|
||||||
|
elif params.decoding_method == "fast_beam_search":
|
||||||
|
return {
|
||||||
|
(
|
||||||
|
f"beam_{params.beam}_"
|
||||||
|
f"max_contexts_{params.max_contexts}_"
|
||||||
|
f"max_states_{params.max_states}"
|
||||||
|
): hyps
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
return {f"beam_size_{params.beam_size}": hyps}
|
||||||
|
|
||||||
|
|
||||||
|
def decode_dataset(
    dl: torch.utils.data.DataLoader,
    params: AttributeDict,
    model: nn.Module,
    token_table: k2.SymbolTable,
    decoding_graph: Optional[k2.Fsa] = None,
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
    """Decode dataset.

    Args:
      dl:
        PyTorch's dataloader containing the dataset to decode.
      params:
        It is returned by :func:`get_params`.
      model:
        The neural model.
      token_table:
        It maps a token ID to a string.
      decoding_graph:
        The decoding graph. Can be either a `k2.trivial_graph` or an HLG.
        Used only when --decoding_method is fast_beam_search.
    Returns:
      Return a dict, whose key may be "greedy_search" if greedy search
      is used, or it may be "beam_7" if a beam size of 7 is used.
      Its value is a list of tuples. Each tuple contains three elements:
      the cut ID, the reference transcript, and the predicted result.
    """
    num_cuts = 0

    try:
        num_batches = len(dl)
    except TypeError:
        num_batches = "?"

    if params.decoding_method == "greedy_search":
        log_interval = 50
    else:
        log_interval = 20

    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        texts = batch["supervisions"]["text"]
        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
            model=model,
            token_table=token_table,
            decoding_graph=decoding_graph,
            batch=batch,
        )

        for name, hyps in hyps_dict.items():
            this_batch = []
            assert len(hyps) == len(texts)
            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
                ref_words = ref_text.split()
                this_batch.append((cut_id, ref_words, hyp_words))

            results[name].extend(this_batch)

        num_cuts += len(texts)

        if batch_idx % log_interval == 0:
            batch_str = f"{batch_idx}/{num_batches}"

            logging.info(
                f"batch {batch_str}, cuts processed until now is {num_cuts}"
            )
    return results

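# save_results() below writes the raw transcripts (recogs-*.txt) and detailed
# error statistics with aligned ref/hyp pairs (errs-*.txt) per
# decoding-settings key, plus one summary of all settings (wer-summary-*.txt).
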
def save_results(
    params: AttributeDict,
    test_set_name: str,
    results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
    test_set_wers = dict()
    for key, results in results_dict.items():
        recog_path = (
            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
        )
        results = sorted(results)
        store_transcripts(filename=recog_path, texts=results)
        logging.info(f"The transcripts are stored in {recog_path}")

        # The following prints out CERs, per-character error statistics and
        # aligned ref/hyp pairs.
        errs_filename = (
            params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
        )
        # We compute CER for the aishell dataset, so both the reference and
        # the hypothesis are split into individual characters first.
        results_char = []
        for res in results:
            results_char.append(
                (res[0], list("".join(res[1])), list("".join(res[2])))
            )
        with open(errs_filename, "w") as f:
            wer = write_error_stats(
                f, f"{test_set_name}-{key}", results_char, enable_log=True
            )
            test_set_wers[key] = wer

        logging.info("Wrote detailed error stats to {}".format(errs_filename))

    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
    errs_info = (
        params.res_dir
        / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
    )
    with open(errs_info, "w") as f:
        print("settings\tWER", file=f)
        for key, val in test_set_wers:
            print("{}\t{}".format(key, val), file=f)

    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
    note = "\tbest for {}".format(test_set_name)
    for key, val in test_set_wers:
        s += "{}\t{}{}\n".format(key, val, note)
        note = ""
    logging.info(s)


@torch.no_grad()
def main():
    parser = get_parser()
    AishellAsrDataModule.add_arguments(parser)
    args = parser.parse_args()
    args.exp_dir = Path(args.exp_dir)
    args.lang_dir = Path(args.lang_dir)

    params = get_params()
    params.update(vars(args))

    assert params.decoding_method in (
        "greedy_search",
        "beam_search",
        "fast_beam_search",
        "modified_beam_search",
    )
    params.res_dir = params.exp_dir / params.decoding_method

    if params.iter > 0:
        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
    else:
        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"

    if "fast_beam_search" in params.decoding_method:
        params.suffix += f"-beam-{params.beam}"
        params.suffix += f"-max-contexts-{params.max_contexts}"
        params.suffix += f"-max-states-{params.max_states}"
    elif "beam_search" in params.decoding_method:
        params.suffix += (
            f"-{params.decoding_method}-beam-size-{params.beam_size}"
        )
    else:
        params.suffix += f"-context-{params.context_size}"
        params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"

    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
    logging.info("Decoding started")

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)

    logging.info(f"Device: {device}")

    lexicon = Lexicon(params.lang_dir)
    params.blank_id = 0
    params.vocab_size = max(lexicon.tokens) + 1
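    # Note: token IDs are 0-based and ID 0 is the blank, so the vocabulary
    # size equals the largest token ID plus one.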

    logging.info(params)

    logging.info("About to create model")
    model = get_transducer_model(params)

    if params.iter > 0:
        filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
            : params.avg
        ]
        if len(filenames) == 0:
            raise ValueError(
                f"No checkpoints found for"
                f" --iter {params.iter}, --avg {params.avg}"
            )
        elif len(filenames) < params.avg:
            raise ValueError(
                f"Not enough checkpoints ({len(filenames)}) found for"
                f" --iter {params.iter}, --avg {params.avg}"
            )
        logging.info(f"averaging {filenames}")
        model.to(device)
        model.load_state_dict(
            average_checkpoints(filenames, device=device), strict=False
        )
    elif params.avg == 1:
        load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
    else:
        start = params.epoch - params.avg + 1
        filenames = []
        for i in range(start, params.epoch + 1):
            if i >= 1:
                filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
        logging.info(f"averaging {filenames}")
        model.to(device)
        model.load_state_dict(
            average_checkpoints(filenames, device=device), strict=False
        )

    model.to(device)
    model.eval()

    if params.decoding_method == "fast_beam_search":
        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
    else:
        decoding_graph = None

    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

    aishell = AishellAsrDataModule(args)
    test_cuts = aishell.test_cuts()
    dev_cuts = aishell.valid_cuts()
    test_dl = aishell.test_dataloaders(test_cuts)
    dev_dl = aishell.test_dataloaders(dev_cuts)

    test_sets = ["test", "dev"]
    test_dls = [test_dl, dev_dl]

    for test_set, test_dl in zip(test_sets, test_dls):
        results_dict = decode_dataset(
            dl=test_dl,
            params=params,
            model=model,
            token_table=lexicon.token_table,
            decoding_graph=decoding_graph,
        )

        save_results(
            params=params,
            test_set_name=test_set,
            results_dict=results_dict,
        )

    logging.info("Done!")


if __name__ == "__main__":
    main()
1
egs/aishell/ASR/pruned_transducer_stateless2/decoder.py
Symbolic link
@ -0,0 +1 @@
../../../librispeech/ASR/pruned_transducer_stateless2/decoder.py

1
egs/aishell/ASR/pruned_transducer_stateless2/encoder_interface.py
Symbolic link
@ -0,0 +1 @@
../../../librispeech/ASR/pruned_transducer_stateless2/encoder_interface.py

217
egs/aishell/ASR/pruned_transducer_stateless2/export.py
Executable file
@ -0,0 +1,217 @@
#!/usr/bin/env python3
#
# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script converts several saved checkpoints
# to a single one using model averaging.
"""
Usage:
./pruned_transducer_stateless2/export.py \
  --exp-dir ./pruned_transducer_stateless2/exp \
  --jit 0 \
  --epoch 29 \
  --avg 5

It will generate a file exp_dir/pretrained-epoch-29-avg-5.pt

To use the generated file with `pruned_transducer_stateless2/decode.py`,
you can do::

    cd /path/to/exp_dir
    ln -s pretrained-epoch-29-avg-5.pt epoch-9999.pt

    cd /path/to/egs/aishell/ASR
    ./pruned_transducer_stateless2/decode.py \
        --exp-dir ./pruned_transducer_stateless2/exp \
        --epoch 9999 \
        --avg 1 \
        --max-duration 100 \
        --lang-dir data/lang_char
"""

import argparse
import logging
from pathlib import Path

import torch
from train import add_model_arguments, get_params, get_transducer_model

from icefall.checkpoint import (
    average_checkpoints,
    find_checkpoints,
    load_checkpoint,
)
from icefall.lexicon import Lexicon
from icefall.utils import str2bool


def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--epoch",
        type=int,
        default=29,
        help="""It specifies the checkpoint to use for averaging.
        Note: Epoch counts from 1.
        You can specify --avg to use more checkpoints for model averaging.""",
    )

    parser.add_argument(
        "--iter",
        type=int,
        default=0,
        help="""If positive, --epoch is ignored and it
        will use the checkpoint exp_dir/checkpoint-iter.pt.
        You can specify --avg to use more checkpoints for model averaging.
        """,
    )

    parser.add_argument(
        "--avg",
        type=int,
        default=15,
        help="Number of checkpoints to average. Automatically select "
        "consecutive checkpoints before the checkpoint specified by "
        "'--epoch' and '--iter'",
    )

    parser.add_argument(
        "--exp-dir",
        type=Path,
        default=Path("pruned_transducer_stateless2/exp"),
        help="""It specifies the directory where all training related
        files, e.g., checkpoints, log, etc, are saved
        """,
    )

    parser.add_argument(
        "--jit",
        type=str2bool,
        default=False,
        help="""True to save a model after applying torch.jit.script.
        """,
    )

    parser.add_argument(
        "--lang-dir",
        type=Path,
        default=Path("data/lang_char"),
        help="The lang dir",
    )

    parser.add_argument(
        "--context-size",
        type=int,
        default=1,
        help="The context size in the decoder. 1 means bigram; "
        "2 means tri-gram",
    )

    add_model_arguments(parser)

    return parser

def main():
    args = get_parser().parse_args()

    params = get_params()
    params.update(vars(args))

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)

    logging.info(f"device: {device}")

    lexicon = Lexicon(params.lang_dir)

    params.blank_id = 0
    params.vocab_size = max(lexicon.tokens) + 1

    logging.info(params)

    logging.info("About to create model")
    model = get_transducer_model(params)

    if params.iter > 0:
        filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
            : params.avg
        ]
        if len(filenames) == 0:
            raise ValueError(
                f"No checkpoints found for"
                f" --iter {params.iter}, --avg {params.avg}"
            )
        elif len(filenames) < params.avg:
            raise ValueError(
                f"Not enough checkpoints ({len(filenames)}) found for"
                f" --iter {params.iter}, --avg {params.avg}"
            )
        logging.info(f"averaging {filenames}")
        model.to(device)
        model.load_state_dict(average_checkpoints(filenames, device=device))
    elif params.avg == 1:
        load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
    else:
        start = params.epoch - params.avg + 1
        filenames = []
        for i in range(start, params.epoch + 1):
            if i >= 1:
                filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
        logging.info(f"averaging {filenames}")
        model.to(device)
        model.load_state_dict(average_checkpoints(filenames, device=device))
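    # average_checkpoints() produces a single state_dict whose parameters are
    # the element-wise average of the parameters in `filenames`.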

    model.to("cpu")
    model.eval()

    if params.jit:
        # We won't use the forward() method of the model in C++, so just
        # ignore it here.
        # Otherwise, one of its arguments is a ragged tensor and is not
        # torch scriptable.
        model.__class__.forward = torch.jit.ignore(model.__class__.forward)
        logging.info("Using torch.jit.script")
        model = torch.jit.script(model)
        filename = (
            params.exp_dir / f"cpu_jit-epoch-{params.epoch}-avg-{params.avg}.pt"
        )
        model.save(str(filename))
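        # The scripted model can be reloaded without the Python class
        # definition, e.g. via torch.jit.load(str(filename)) in Python or
        # torch::jit::load() in C++.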
logging.info(f"Saved to {filename}")
|
||||||
|
else:
|
||||||
|
logging.info("Not using torch.jit.script")
|
||||||
|
# Save it using a format so that it can be loaded
|
||||||
|
# by :func:`load_checkpoint`
|
||||||
|
filename = (
|
||||||
|
params.exp_dir
|
||||||
|
/ f"pretrained-epoch-{params.epoch}-avg-{params.avg}.pt"
|
||||||
|
)
|
||||||
|
torch.save({"model": model.state_dict()}, str(filename))
|
||||||
|
logging.info(f"Saved to {filename}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
formatter = (
|
||||||
|
"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
||||||
|
)
|
||||||
|
|
||||||
|
logging.basicConfig(format=formatter, level=logging.INFO)
|
||||||
|
main()
|
||||||
1
egs/aishell/ASR/pruned_transducer_stateless2/joiner.py
Symbolic link
@ -0,0 +1 @@
../../../librispeech/ASR/pruned_transducer_stateless2/joiner.py
Some files were not shown because too many files have changed in this diff.