mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-05 07:04:18 +00:00
Merge branch 'master' into diagnostics_fix
This commit is contained in:
commit
d36efe14c7
1
.flake8
1
.flake8
@ -24,6 +24,7 @@ exclude =
|
||||
**/data/**,
|
||||
icefall/shared/make_kn_lm.py,
|
||||
icefall/__init__.py
|
||||
icefall/ctc/__init__.py
|
||||
|
||||
ignore =
|
||||
# E203 white space before ":"
|
||||
|
@ -29,6 +29,9 @@ if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" ==
|
||||
ls -lh data/fbank
|
||||
ls -lh pruned_transducer_stateless2/exp
|
||||
|
||||
ln -s data/fbank/cuts_DEV.jsonl.gz data/fbank/gigaspeech_cuts_DEV.jsonl.gz
|
||||
ln -s data/fbank/cuts_TEST.jsonl.gz data/fbank/gigaspeech_cuts_TEST.jsonl.gz
|
||||
|
||||
log "Decoding dev and test"
|
||||
|
||||
# use a small value for decoding with CPU
|
||||
|
@ -53,7 +53,7 @@ log "Export to torchscript model"
|
||||
|
||||
./conformer_ctc3/export.py \
|
||||
--exp-dir $repo/exp \
|
||||
--lang-dir $repo/data/lang_bpe_500 \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--jit-trace 1 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
@ -82,7 +82,7 @@ for m in ctc-decoding 1best; do
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--words-file $repo/data/lang_bpe_500/words.txt \
|
||||
--HLG $repo/data/lang_bpe_500/HLG.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--G $repo/data/lm/G_4_gram.pt \
|
||||
--method $m \
|
||||
--sample-rate 16000 \
|
||||
|
@ -31,7 +31,7 @@ log "Test exporting with torch.jit.trace()"
|
||||
|
||||
./lstm_transducer_stateless2/export.py \
|
||||
--exp-dir $repo/exp \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--use-averaged-model 0 \
|
||||
@ -55,7 +55,7 @@ for sym in 1 2 3; do
|
||||
--method greedy_search \
|
||||
--max-sym-per-frame $sym \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
@ -68,7 +68,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
||||
--method $method \
|
||||
--beam-size 4 \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
@ -28,7 +28,7 @@ for sym in 1 2 3; do
|
||||
--method greedy_search \
|
||||
--max-sym-per-frame $sym \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
@ -41,7 +41,7 @@ for method in fast_beam_search modified_beam_search beam_search; do
|
||||
--method $method \
|
||||
--beam-size 4 \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
@ -36,7 +36,7 @@ for sym in 1 2 3; do
|
||||
--method greedy_search \
|
||||
--max-sym-per-frame $sym \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
@ -49,7 +49,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
||||
--method $method \
|
||||
--beam-size 4 \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
@ -35,7 +35,7 @@ for sym in 1 2 3; do
|
||||
--method greedy_search \
|
||||
--max-sym-per-frame $sym \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
@ -48,7 +48,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
||||
--method $method \
|
||||
--beam-size 4 \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
@ -30,14 +30,14 @@ popd
|
||||
log "Export to torchscript model"
|
||||
./pruned_transducer_stateless3/export.py \
|
||||
--exp-dir $repo/exp \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--jit 1
|
||||
|
||||
./pruned_transducer_stateless3/export.py \
|
||||
--exp-dir $repo/exp \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--jit-trace 1
|
||||
@ -74,7 +74,7 @@ for sym in 1 2 3; do
|
||||
--method greedy_search \
|
||||
--max-sym-per-frame $sym \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
@ -87,7 +87,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
||||
--method $method \
|
||||
--beam-size 4 \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
@ -32,7 +32,7 @@ for sym in 1 2 3; do
|
||||
--method greedy_search \
|
||||
--max-sym-per-frame $sym \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--num-encoder-layers 18 \
|
||||
--dim-feedforward 2048 \
|
||||
--nhead 8 \
|
||||
@ -51,7 +51,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
||||
--method $method \
|
||||
--beam-size 4 \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav \
|
||||
|
@ -33,7 +33,7 @@ log "Export to torchscript model"
|
||||
./pruned_transducer_stateless7/export.py \
|
||||
--exp-dir $repo/exp \
|
||||
--use-averaged-model false \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--jit 1
|
||||
@ -56,7 +56,7 @@ for sym in 1 2 3; do
|
||||
--method greedy_search \
|
||||
--max-sym-per-frame $sym \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
@ -69,7 +69,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
||||
--method $method \
|
||||
--beam-size 4 \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
@ -37,7 +37,7 @@ log "Export to torchscript model"
|
||||
./pruned_transducer_stateless7_ctc/export.py \
|
||||
--exp-dir $repo/exp \
|
||||
--use-averaged-model false \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--jit 1
|
||||
@ -74,7 +74,7 @@ for sym in 1 2 3; do
|
||||
--method greedy_search \
|
||||
--max-sym-per-frame $sym \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
@ -87,7 +87,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
||||
--method $method \
|
||||
--beam-size 4 \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
@ -36,7 +36,7 @@ log "Export to torchscript model"
|
||||
./pruned_transducer_stateless7_ctc_bs/export.py \
|
||||
--exp-dir $repo/exp \
|
||||
--use-averaged-model false \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--jit 1
|
||||
@ -72,7 +72,7 @@ for sym in 1 2 3; do
|
||||
--method greedy_search \
|
||||
--max-sym-per-frame $sym \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
@ -85,7 +85,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
||||
--method $method \
|
||||
--beam-size 4 \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
@ -37,7 +37,7 @@ log "Export to torchscript model"
|
||||
./pruned_transducer_stateless7_streaming/export.py \
|
||||
--exp-dir $repo/exp \
|
||||
--use-averaged-model false \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--decode-chunk-len 32 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
@ -81,7 +81,7 @@ for sym in 1 2 3; do
|
||||
--method greedy_search \
|
||||
--max-sym-per-frame $sym \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--decode-chunk-len 32 \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
@ -95,7 +95,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
||||
--method $method \
|
||||
--beam-size 4 \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--decode-chunk-len 32 \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
|
@ -41,7 +41,7 @@ log "Decode with models exported by torch.jit.script()"
|
||||
log "Export to torchscript model"
|
||||
./pruned_transducer_stateless8/export.py \
|
||||
--exp-dir $repo/exp \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--use-averaged-model false \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
@ -65,7 +65,7 @@ for sym in 1 2 3; do
|
||||
--method greedy_search \
|
||||
--max-sym-per-frame $sym \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
@ -78,7 +78,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
||||
--method $method \
|
||||
--beam-size 4 \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
@ -32,7 +32,7 @@ for sym in 1 2 3; do
|
||||
--method greedy_search \
|
||||
--max-sym-per-frame $sym \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--simulate-streaming 1 \
|
||||
--causal-convolution 1 \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
@ -47,7 +47,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
||||
--method $method \
|
||||
--beam-size 4 \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--simulate-streaming 1 \
|
||||
--causal-convolution 1 \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
|
@ -28,7 +28,7 @@ for sym in 1 2 3; do
|
||||
--method greedy_search \
|
||||
--max-sym-per-frame $sym \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
@ -41,7 +41,7 @@ for method in fast_beam_search modified_beam_search beam_search; do
|
||||
--method $method \
|
||||
--beam-size 4 \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
@ -37,7 +37,7 @@ log "Export to torchscript model"
|
||||
./zipformer_mmi/export.py \
|
||||
--exp-dir $repo/exp \
|
||||
--use-averaged-model false \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--jit 1
|
||||
@ -61,7 +61,7 @@ for method in 1best nbest nbest-rescoring-LG nbest-rescoring-3-gram nbest-rescor
|
||||
--method $method \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--lang-dir $repo/data/lang_bpe_500 \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
51
.github/scripts/run-multi-zh_hans-zipformer.sh
vendored
Executable file
51
.github/scripts/run-multi-zh_hans-zipformer.sh
vendored
Executable file
@ -0,0 +1,51 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
log() {
|
||||
# This function is from espnet
|
||||
local fname=${BASH_SOURCE[1]##*/}
|
||||
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
|
||||
}
|
||||
|
||||
cd egs/multi_zh-hans/ASR
|
||||
|
||||
repo_url=https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-2023-9-2/
|
||||
|
||||
log "Downloading pre-trained model from $repo_url"
|
||||
git lfs install
|
||||
git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
pushd $repo/exp
|
||||
ln -s epoch-20.pt epoch-99.pt
|
||||
popd
|
||||
|
||||
ls -lh $repo/exp/*.pt
|
||||
|
||||
|
||||
./zipformer/pretrained.py \
|
||||
--checkpoint $repo/exp/epoch-99.pt \
|
||||
--tokens $repo/data/lang_bpe_2000/tokens.txt \
|
||||
--method greedy_search \
|
||||
$repo/test_wavs/DEV_T0000000000.wav \
|
||||
$repo/test_wavs/DEV_T0000000001.wav \
|
||||
$repo/test_wavs/DEV_T0000000002.wav
|
||||
|
||||
for method in modified_beam_search fast_beam_search; do
|
||||
log "$method"
|
||||
|
||||
./zipformer/pretrained.py \
|
||||
--method $method \
|
||||
--beam-size 4 \
|
||||
--checkpoint $repo/exp/epoch-99.pt \
|
||||
--tokens $repo/data/lang_bpe_2000/tokens.txt \
|
||||
$repo/test_wavs/DEV_T0000000000.wav \
|
||||
$repo/test_wavs/DEV_T0000000001.wav \
|
||||
$repo/test_wavs/DEV_T0000000002.wav
|
||||
done
|
46
.github/scripts/run-pre-trained-conformer-ctc.sh
vendored
46
.github/scripts/run-pre-trained-conformer-ctc.sh
vendored
@ -1,46 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
log() {
|
||||
# This function is from espnet
|
||||
local fname=${BASH_SOURCE[1]##*/}
|
||||
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
|
||||
}
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
|
||||
repo_url=https://github.com/csukuangfj/icefall-asr-conformer-ctc-bpe-500
|
||||
git lfs install
|
||||
|
||||
log "Downloading pre-trained model from $repo_url"
|
||||
git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
ls -lh $repo/test_wavs/*.flac
|
||||
|
||||
log "CTC decoding"
|
||||
|
||||
./conformer_ctc/pretrained.py \
|
||||
--method ctc-decoding \
|
||||
--num-classes 500 \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
$repo/test_wavs/1089-134686-0001.flac \
|
||||
$repo/test_wavs/1221-135766-0001.flac \
|
||||
$repo/test_wavs/1221-135766-0002.flac
|
||||
|
||||
log "HLG decoding"
|
||||
|
||||
./conformer_ctc/pretrained.py \
|
||||
--method 1best \
|
||||
--num-classes 500 \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--words-file $repo/data/lang_bpe_500/words.txt \
|
||||
--HLG $repo/data/lang_bpe_500/HLG.pt \
|
||||
$repo/test_wavs/1089-134686-0001.flac \
|
||||
$repo/test_wavs/1221-135766-0001.flac \
|
||||
$repo/test_wavs/1221-135766-0002.flac
|
240
.github/scripts/run-pre-trained-ctc.sh
vendored
Executable file
240
.github/scripts/run-pre-trained-ctc.sh
vendored
Executable file
@ -0,0 +1,240 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
log() {
|
||||
# This function is from espnet
|
||||
local fname=${BASH_SOURCE[1]##*/}
|
||||
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
|
||||
}
|
||||
|
||||
pushd egs/librispeech/ASR
|
||||
|
||||
repo_url=https://huggingface.co/csukuangfj/sherpa-onnx-zipformer-ctc-en-2023-10-02
|
||||
log "Downloading pre-trained model from $repo_url"
|
||||
git lfs install
|
||||
git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
log "CTC greedy search"
|
||||
|
||||
./zipformer/onnx_pretrained_ctc.py \
|
||||
--nn-model $repo/model.onnx \
|
||||
--tokens $repo/tokens.txt \
|
||||
$repo/test_wavs/0.wav \
|
||||
$repo/test_wavs/1.wav \
|
||||
$repo/test_wavs/2.wav
|
||||
|
||||
log "CTC H decoding"
|
||||
|
||||
./zipformer/onnx_pretrained_ctc_H.py \
|
||||
--nn-model $repo/model.onnx \
|
||||
--tokens $repo/tokens.txt \
|
||||
--H $repo/H.fst \
|
||||
$repo/test_wavs/0.wav \
|
||||
$repo/test_wavs/1.wav \
|
||||
$repo/test_wavs/2.wav
|
||||
|
||||
log "CTC HL decoding"
|
||||
|
||||
./zipformer/onnx_pretrained_ctc_HL.py \
|
||||
--nn-model $repo/model.onnx \
|
||||
--words $repo/words.txt \
|
||||
--HL $repo/HL.fst \
|
||||
$repo/test_wavs/0.wav \
|
||||
$repo/test_wavs/1.wav \
|
||||
$repo/test_wavs/2.wav
|
||||
|
||||
log "CTC HLG decoding"
|
||||
|
||||
./zipformer/onnx_pretrained_ctc_HLG.py \
|
||||
--nn-model $repo/model.onnx \
|
||||
--words $repo/words.txt \
|
||||
--HLG $repo/HLG.fst \
|
||||
$repo/test_wavs/0.wav \
|
||||
$repo/test_wavs/1.wav \
|
||||
$repo/test_wavs/2.wav
|
||||
|
||||
rm -rf $repo
|
||||
|
||||
repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09
|
||||
log "Downloading pre-trained model from $repo_url"
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
pushd $repo
|
||||
|
||||
git lfs pull --include "exp/pretrained.pt"
|
||||
git lfs pull --include "data/lang_bpe_500/HLG.pt"
|
||||
git lfs pull --include "data/lang_bpe_500/L.pt"
|
||||
git lfs pull --include "data/lang_bpe_500/L_disambig.pt"
|
||||
git lfs pull --include "data/lang_bpe_500/Linv.pt"
|
||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
||||
git lfs pull --include "data/lang_bpe_500/lexicon.txt"
|
||||
git lfs pull --include "data/lang_bpe_500/lexicon_disambig.txt"
|
||||
git lfs pull --include "data/lang_bpe_500/tokens.txt"
|
||||
git lfs pull --include "data/lang_bpe_500/words.txt"
|
||||
git lfs pull --include "data/lm/G_3_gram.fst.txt"
|
||||
|
||||
popd
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
log "CTC decoding"
|
||||
|
||||
./conformer_ctc/pretrained.py \
|
||||
--method ctc-decoding \
|
||||
--num-classes 500 \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
||||
log "HLG decoding"
|
||||
|
||||
./conformer_ctc/pretrained.py \
|
||||
--method 1best \
|
||||
--num-classes 500 \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--words-file $repo/data/lang_bpe_500/words.txt \
|
||||
--HLG $repo/data/lang_bpe_500/HLG.pt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
||||
log "CTC decoding on CPU with kaldi decoders using OpenFst"
|
||||
|
||||
log "Exporting model with torchscript"
|
||||
|
||||
pushd $repo/exp
|
||||
ln -s pretrained.pt epoch-99.pt
|
||||
popd
|
||||
|
||||
./conformer_ctc/export.py \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--exp-dir $repo/exp \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--jit 1
|
||||
|
||||
ls -lh $repo/exp
|
||||
|
||||
|
||||
log "Generating H.fst, HL.fst"
|
||||
|
||||
./local/prepare_lang_fst.py --lang-dir $repo/data/lang_bpe_500 --ngram-G $repo/data/lm/G_3_gram.fst.txt
|
||||
|
||||
ls -lh $repo/data/lang_bpe_500
|
||||
|
||||
log "Decoding with H on CPU with OpenFst"
|
||||
|
||||
./conformer_ctc/jit_pretrained_decode_with_H.py \
|
||||
--nn-model $repo/exp/cpu_jit.pt \
|
||||
--H $repo/data/lang_bpe_500/H.fst \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
||||
log "Decoding with HL on CPU with OpenFst"
|
||||
|
||||
./conformer_ctc/jit_pretrained_decode_with_HL.py \
|
||||
--nn-model $repo/exp/cpu_jit.pt \
|
||||
--HL $repo/data/lang_bpe_500/HL.fst \
|
||||
--words $repo/data/lang_bpe_500/words.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
||||
log "Decoding with HLG on CPU with OpenFst"
|
||||
|
||||
./conformer_ctc/jit_pretrained_decode_with_HLG.py \
|
||||
--nn-model $repo/exp/cpu_jit.pt \
|
||||
--HLG $repo/data/lang_bpe_500/HLG.fst \
|
||||
--words $repo/data/lang_bpe_500/words.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
||||
rm -rf $repo
|
||||
|
||||
popd
|
||||
|
||||
log "Test aishell"
|
||||
|
||||
pushd egs/aishell/ASR
|
||||
|
||||
repo_url=https://huggingface.co/csukuangfj/icefall_asr_aishell_conformer_ctc
|
||||
log "Downloading pre-trained model from $repo_url"
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
pushd $repo
|
||||
|
||||
git lfs pull --include "exp/pretrained.pt"
|
||||
git lfs pull --include "data/lang_char/H.fst"
|
||||
git lfs pull --include "data/lang_char/HL.fst"
|
||||
git lfs pull --include "data/lang_char/HLG.fst"
|
||||
|
||||
popd
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
log "CTC decoding"
|
||||
|
||||
log "Exporting model with torchscript"
|
||||
|
||||
pushd $repo/exp
|
||||
ln -s pretrained.pt epoch-99.pt
|
||||
popd
|
||||
|
||||
./conformer_ctc/export.py \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--exp-dir $repo/exp \
|
||||
--tokens $repo/data/lang_char/tokens.txt \
|
||||
--jit 1
|
||||
|
||||
ls -lh $repo/exp
|
||||
|
||||
ls -lh $repo/data/lang_char
|
||||
|
||||
log "Decoding with H on CPU with OpenFst"
|
||||
|
||||
./conformer_ctc/jit_pretrained_decode_with_H.py \
|
||||
--nn-model $repo/exp/cpu_jit.pt \
|
||||
--H $repo/data/lang_char/H.fst \
|
||||
--tokens $repo/data/lang_char/tokens.txt \
|
||||
$repo/test_wavs/0.wav \
|
||||
$repo/test_wavs/1.wav \
|
||||
$repo/test_wavs/2.wav
|
||||
|
||||
log "Decoding with HL on CPU with OpenFst"
|
||||
|
||||
./conformer_ctc/jit_pretrained_decode_with_HL.py \
|
||||
--nn-model $repo/exp/cpu_jit.pt \
|
||||
--HL $repo/data/lang_char/HL.fst \
|
||||
--words $repo/data/lang_char/words.txt \
|
||||
$repo/test_wavs/0.wav \
|
||||
$repo/test_wavs/1.wav \
|
||||
$repo/test_wavs/2.wav
|
||||
|
||||
log "Decoding with HLG on CPU with OpenFst"
|
||||
|
||||
./conformer_ctc/jit_pretrained_decode_with_HLG.py \
|
||||
--nn-model $repo/exp/cpu_jit.pt \
|
||||
--HLG $repo/data/lang_char/HLG.fst \
|
||||
--words $repo/data/lang_char/words.txt \
|
||||
$repo/test_wavs/0.wav \
|
||||
$repo/test_wavs/1.wav \
|
||||
$repo/test_wavs/2.wav
|
||||
|
||||
rm -rf $repo
|
@ -28,7 +28,7 @@ for sym in 1 2 3; do
|
||||
--method greedy_search \
|
||||
--max-sym-per-frame $sym \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
@ -41,7 +41,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
||||
--method $method \
|
||||
--beam-size 4 \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
@ -28,7 +28,7 @@ for sym in 1 2 3; do
|
||||
--method greedy_search \
|
||||
--max-sym-per-frame $sym \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
@ -41,7 +41,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
||||
--method $method \
|
||||
--beam-size 4 \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
@ -28,7 +28,7 @@ for sym in 1 2 3; do
|
||||
--method greedy_search \
|
||||
--max-sym-per-frame $sym \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
@ -41,7 +41,7 @@ for method in fast_beam_search modified_beam_search beam_search; do
|
||||
--method $method \
|
||||
--beam-size 4 \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
@ -27,7 +27,7 @@ log "Beam search decoding"
|
||||
--method beam_search \
|
||||
--beam-size 4 \
|
||||
--checkpoint $repo/exp/pretrained.pt \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
|
44
.github/scripts/run-swbd-conformer-ctc-2023-08-26.sh
vendored
Executable file
44
.github/scripts/run-swbd-conformer-ctc-2023-08-26.sh
vendored
Executable file
@ -0,0 +1,44 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
log() {
|
||||
# This function is from espnet
|
||||
local fname=${BASH_SOURCE[1]##*/}
|
||||
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
|
||||
}
|
||||
|
||||
cd egs/swbd/ASR
|
||||
|
||||
repo_url=https://huggingface.co/zrjin/icefall-asr-swbd-conformer-ctc-2023-8-26
|
||||
|
||||
log "Downloading pre-trained model from $repo_url"
|
||||
git lfs install
|
||||
git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
|
||||
pushd $repo/exp
|
||||
ln -s epoch-98.pt epoch-99.pt
|
||||
popd
|
||||
|
||||
ls -lh $repo/exp/*.pt
|
||||
|
||||
for method in ctc-decoding 1best; do
|
||||
log "$method"
|
||||
|
||||
./conformer_ctc/pretrained.py \
|
||||
--method $method \
|
||||
--checkpoint $repo/exp/epoch-99.pt \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--words-file $repo/data/lang_bpe_500/words.txt \
|
||||
--HLG $repo/data/lang_bpe_500/HLG.pt \
|
||||
--G $repo/data/lm/G_4_gram.pt \
|
||||
$repo/test_wavs/1089-134686-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0001.wav \
|
||||
$repo/test_wavs/1221-135766-0002.wav
|
||||
done
|
@ -17,7 +17,6 @@ git lfs install
|
||||
git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
|
||||
|
||||
log "Display test files"
|
||||
tree $repo/
|
||||
ls -lh $repo/test_wavs/*.wav
|
||||
@ -29,12 +28,11 @@ popd
|
||||
|
||||
log "Test exporting to ONNX format"
|
||||
|
||||
./pruned_transducer_stateless2/export.py \
|
||||
./pruned_transducer_stateless2/export-onnx.py \
|
||||
--exp-dir $repo/exp \
|
||||
--lang-dir $repo/data/lang_char \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--onnx 1
|
||||
--avg 1
|
||||
|
||||
log "Export to torchscript model"
|
||||
|
||||
@ -59,19 +57,17 @@ log "Decode with ONNX models"
|
||||
|
||||
./pruned_transducer_stateless2/onnx_check.py \
|
||||
--jit-filename $repo/exp/cpu_jit.pt \
|
||||
--onnx-encoder-filename $repo/exp/encoder.onnx \
|
||||
--onnx-decoder-filename $repo/exp/decoder.onnx \
|
||||
--onnx-joiner-filename $repo/exp/joiner.onnx \
|
||||
--onnx-joiner-encoder-proj-filename $repo/exp/joiner_encoder_proj.onnx \
|
||||
--onnx-joiner-decoder-proj-filename $repo/exp/joiner_decoder_proj.onnx
|
||||
--onnx-encoder-filename $repo/exp/encoder-epoch-10-avg-2.onnx \
|
||||
--onnx-decoder-filename $repo/exp/decoder-epoch-10-avg-2.onnx \
|
||||
--onnx-joiner-filename $repo/exp/joiner-epoch-10-avg-2.onnx \
|
||||
--onnx-joiner-encoder-proj-filename $repo/exp/joiner_encoder_proj-epoch-10-avg-2.onnx \
|
||||
--onnx-joiner-decoder-proj-filename $repo/exp/joiner_decoder_proj-epoch-10-avg-2.onnx
|
||||
|
||||
./pruned_transducer_stateless2/onnx_pretrained.py \
|
||||
--tokens $repo/data/lang_char/tokens.txt \
|
||||
--encoder-model-filename $repo/exp/encoder.onnx \
|
||||
--decoder-model-filename $repo/exp/decoder.onnx \
|
||||
--joiner-model-filename $repo/exp/joiner.onnx \
|
||||
--joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
|
||||
--joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
|
||||
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
|
||||
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
|
||||
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
|
||||
$repo/test_wavs/DEV_T0000000000.wav \
|
||||
$repo/test_wavs/DEV_T0000000001.wav \
|
||||
$repo/test_wavs/DEV_T0000000002.wav
|
||||
|
12
.github/scripts/test-ncnn-export.sh
vendored
12
.github/scripts/test-ncnn-export.sh
vendored
@ -45,7 +45,6 @@ GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
|
||||
pushd $repo
|
||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
||||
git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt"
|
||||
|
||||
cd exp
|
||||
@ -56,11 +55,10 @@ log "Export via torch.jit.trace()"
|
||||
|
||||
./conv_emformer_transducer_stateless2/export-for-ncnn.py \
|
||||
--exp-dir $repo/exp \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--use-averaged-model 0 \
|
||||
\
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--num-encoder-layers 12 \
|
||||
--chunk-length 32 \
|
||||
--cnn-module-kernel 31 \
|
||||
@ -91,7 +89,6 @@ GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
|
||||
pushd $repo
|
||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
||||
git lfs pull --include "exp/pretrained-iter-468000-avg-16.pt"
|
||||
|
||||
cd exp
|
||||
@ -102,7 +99,7 @@ log "Export via torch.jit.trace()"
|
||||
|
||||
./lstm_transducer_stateless2/export-for-ncnn.py \
|
||||
--exp-dir $repo/exp \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--use-averaged-model 0
|
||||
@ -140,7 +137,6 @@ GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
|
||||
pushd $repo
|
||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
||||
git lfs pull --include "exp/pretrained.pt"
|
||||
|
||||
cd exp
|
||||
@ -148,7 +144,7 @@ ln -s pretrained.pt epoch-99.pt
|
||||
popd
|
||||
|
||||
./pruned_transducer_stateless7_streaming/export-for-ncnn.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--exp-dir $repo/exp \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
@ -199,7 +195,7 @@ ln -s pretrained.pt epoch-9999.pt
|
||||
popd
|
||||
|
||||
./pruned_transducer_stateless7_streaming/export-for-ncnn-zh.py \
|
||||
--lang-dir $repo/data/lang_char_bpe \
|
||||
--tokens $repo/data/lang_char_bpe/tokens.txt \
|
||||
--exp-dir $repo/exp \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 9999 \
|
||||
|
138
.github/scripts/test-onnx-export.sh
vendored
138
.github/scripts/test-onnx-export.sh
vendored
@ -10,7 +10,123 @@ log() {
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
|
||||
log "=========================================================================="
|
||||
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
|
||||
log "Downloading pre-trained model from $repo_url"
|
||||
git lfs install
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
|
||||
pushd $repo
|
||||
git lfs pull --include "exp/pretrained.pt"
|
||||
cd exp
|
||||
ln -s pretrained.pt epoch-99.pt
|
||||
popd
|
||||
|
||||
log "Export via torch.jit.script()"
|
||||
./zipformer/export.py \
|
||||
--exp-dir $repo/exp \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--jit 1
|
||||
|
||||
log "Test export to ONNX format"
|
||||
./zipformer/export-onnx.py \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--exp-dir $repo/exp \
|
||||
--num-encoder-layers "2,2,3,4,3,2" \
|
||||
--downsampling-factor "1,2,4,8,4,2" \
|
||||
--feedforward-dim "512,768,1024,1536,1024,768" \
|
||||
--num-heads "4,4,4,8,4,4" \
|
||||
--encoder-dim "192,256,384,512,384,256" \
|
||||
--query-head-dim 32 \
|
||||
--value-head-dim 12 \
|
||||
--pos-head-dim 4 \
|
||||
--pos-dim 48 \
|
||||
--encoder-unmasked-dim "192,192,256,256,256,192" \
|
||||
--cnn-module-kernel "31,31,15,15,15,31" \
|
||||
--decoder-dim 512 \
|
||||
--joiner-dim 512 \
|
||||
--causal False \
|
||||
--chunk-size "16,32,64,-1" \
|
||||
--left-context-frames "64,128,256,-1"
|
||||
|
||||
ls -lh $repo/exp
|
||||
|
||||
log "Run onnx_check.py"
|
||||
|
||||
./zipformer/onnx_check.py \
|
||||
--jit-filename $repo/exp/jit_script.pt \
|
||||
--onnx-encoder-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
|
||||
--onnx-decoder-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
|
||||
--onnx-joiner-filename $repo/exp/joiner-epoch-99-avg-1.onnx
|
||||
|
||||
log "Run onnx_pretrained.py"
|
||||
|
||||
./zipformer/onnx_pretrained.py \
|
||||
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
|
||||
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
|
||||
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav
|
||||
|
||||
rm -rf $repo
|
||||
|
||||
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17
|
||||
log "Downloading pre-trained model from $repo_url"
|
||||
git lfs install
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
|
||||
pushd $repo
|
||||
git lfs pull --include "exp/pretrained.pt"
|
||||
|
||||
cd exp
|
||||
ln -s pretrained.pt epoch-99.pt
|
||||
popd
|
||||
|
||||
log "Test export streaming model to ONNX format"
|
||||
./zipformer/export-onnx-streaming.py \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--exp-dir $repo/exp \
|
||||
--num-encoder-layers "2,2,3,4,3,2" \
|
||||
--downsampling-factor "1,2,4,8,4,2" \
|
||||
--feedforward-dim "512,768,1024,1536,1024,768" \
|
||||
--num-heads "4,4,4,8,4,4" \
|
||||
--encoder-dim "192,256,384,512,384,256" \
|
||||
--query-head-dim 32 \
|
||||
--value-head-dim 12 \
|
||||
--pos-head-dim 4 \
|
||||
--pos-dim 48 \
|
||||
--encoder-unmasked-dim "192,192,256,256,256,192" \
|
||||
--cnn-module-kernel "31,31,15,15,15,31" \
|
||||
--decoder-dim 512 \
|
||||
--joiner-dim 512 \
|
||||
--causal True \
|
||||
--chunk-size 16 \
|
||||
--left-context-frames 64
|
||||
|
||||
ls -lh $repo/exp
|
||||
|
||||
log "Run onnx_pretrained-streaming.py"
|
||||
|
||||
./zipformer/onnx_pretrained-streaming.py \
|
||||
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1-chunk-16-left-64.onnx \
|
||||
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1-chunk-16-left-64.onnx \
|
||||
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1-chunk-16-left-64.onnx \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
$repo/test_wavs/1089-134686-0001.wav
|
||||
|
||||
rm -rf $repo
|
||||
|
||||
log "--------------------------------------------------------------------------"
|
||||
|
||||
log "=========================================================================="
|
||||
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
|
||||
@ -39,7 +155,7 @@ log "Export via torch.jit.trace()"
|
||||
log "Test exporting to ONNX format"
|
||||
|
||||
./pruned_transducer_stateless7_streaming/export-onnx.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
@ -88,7 +204,7 @@ popd
|
||||
log "Export via torch.jit.script()"
|
||||
|
||||
./pruned_transducer_stateless3/export.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--epoch 9999 \
|
||||
--avg 1 \
|
||||
--exp-dir $repo/exp/ \
|
||||
@ -97,7 +213,7 @@ log "Export via torch.jit.script()"
|
||||
log "Test exporting to ONNX format"
|
||||
|
||||
./pruned_transducer_stateless3/export-onnx.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--epoch 9999 \
|
||||
--avg 1 \
|
||||
--exp-dir $repo/exp/
|
||||
@ -126,7 +242,6 @@ log "Run onnx_pretrained.py"
|
||||
rm -rf $repo
|
||||
log "--------------------------------------------------------------------------"
|
||||
|
||||
|
||||
log "=========================================================================="
|
||||
repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless5-2022-05-13
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||
@ -143,7 +258,7 @@ popd
|
||||
log "Export via torch.jit.script()"
|
||||
|
||||
./pruned_transducer_stateless5/export.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--use-averaged-model 0 \
|
||||
@ -159,7 +274,7 @@ log "Export via torch.jit.script()"
|
||||
log "Test exporting to ONNX format"
|
||||
|
||||
./pruned_transducer_stateless5/export-onnx.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--use-averaged-model 0 \
|
||||
@ -205,7 +320,6 @@ GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||
repo=$(basename $repo_url)
|
||||
|
||||
pushd $repo
|
||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
||||
git lfs pull --include "exp/pretrained.pt"
|
||||
|
||||
cd exp
|
||||
@ -215,7 +329,7 @@ popd
|
||||
log "Export via torch.jit.script()"
|
||||
|
||||
./pruned_transducer_stateless7/export.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
@ -226,7 +340,7 @@ log "Export via torch.jit.script()"
|
||||
log "Test exporting to ONNX format"
|
||||
|
||||
./pruned_transducer_stateless7/export-onnx.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
@ -270,7 +384,7 @@ popd
|
||||
log "Test exporting to ONNX format"
|
||||
|
||||
./conv_emformer_transducer_stateless2/export-onnx.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
@ -310,7 +424,7 @@ popd
|
||||
log "Export via torch.jit.trace()"
|
||||
|
||||
./lstm_transducer_stateless2/export.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
@ -320,7 +434,7 @@ log "Export via torch.jit.trace()"
|
||||
log "Test exporting to ONNX format"
|
||||
|
||||
./lstm_transducer_stateless2/export-onnx.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
|
45
.github/workflows/build-docker-image.yml
vendored
Normal file
45
.github/workflows/build-docker-image.yml
vendored
Normal file
@ -0,0 +1,45 @@
|
||||
# see also
|
||||
# https://docs.github.com/en/actions/publishing-packages/publishing-docker-images#publishing-images-to-github-packages
|
||||
name: Build docker image
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: build_docker-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
build-docker-image:
|
||||
name: ${{ matrix.image }}
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
image: ["torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
|
||||
|
||||
steps:
|
||||
# refer to https://github.com/actions/checkout
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Rename
|
||||
shell: bash
|
||||
run: |
|
||||
image=${{ matrix.image }}
|
||||
mv -v ./docker/$image.dockerfile ./Dockerfile
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@v2
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_PASSWORD }}
|
||||
|
||||
- name: Build and push
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: .
|
||||
file: ./Dockerfile
|
||||
push: true
|
||||
tags: k2fsa/icefall:${{ matrix.image }}
|
2
.github/workflows/run-aishell-2022-06-20.yml
vendored
2
.github/workflows/run-aishell-2022-06-20.yml
vendored
@ -45,7 +45,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
python-version: [3.7, 3.8, 3.9]
|
||||
python-version: [3.8]
|
||||
|
||||
fail-fast: false
|
||||
|
||||
|
92
.github/workflows/run-docker-image.yml
vendored
Normal file
92
.github/workflows/run-docker-image.yml
vendored
Normal file
@ -0,0 +1,92 @@
|
||||
name: Run docker image
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: run_docker_image-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
run-docker-image:
|
||||
name: ${{ matrix.image }}
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
image: ["torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
|
||||
steps:
|
||||
# refer to https://github.com/actions/checkout
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Run the build process with Docker
|
||||
uses: addnab/docker-run-action@v3
|
||||
with:
|
||||
image: k2fsa/icefall:${{ matrix.image }}
|
||||
shell: bash
|
||||
run: |
|
||||
uname -a
|
||||
cat /etc/*release
|
||||
|
||||
nvcc --version
|
||||
|
||||
# For torch1.9.0-cuda10.2
|
||||
export LD_LIBRARY_PATH=/usr/local/cuda-10.2/compat:$LD_LIBRARY_PATH
|
||||
|
||||
# For torch1.12.1-cuda11.3
|
||||
export LD_LIBRARY_PATH=/usr/local/cuda-11.3/compat:$LD_LIBRARY_PATH
|
||||
|
||||
# For torch2.0.0-cuda11.7
|
||||
export LD_LIBRARY_PATH=/usr/local/cuda-11.7/compat:$LD_LIBRARY_PATH
|
||||
|
||||
|
||||
which nvcc
|
||||
cuda_dir=$(dirname $(which nvcc))
|
||||
echo "cuda_dir: $cuda_dir"
|
||||
|
||||
find $cuda_dir -name libcuda.so*
|
||||
echo "--------------------"
|
||||
|
||||
find / -name libcuda.so* 2>/dev/null
|
||||
|
||||
# for torch1.13.0-cuda11.6
|
||||
if [ -e /opt/conda/lib/stubs/libcuda.so ]; then
|
||||
cd /opt/conda/lib/stubs && ln -s libcuda.so libcuda.so.1 && cd -
|
||||
export LD_LIBRARY_PATH=/opt/conda/lib/stubs:$LD_LIBRARY_PATH
|
||||
fi
|
||||
|
||||
find / -name libcuda.so* 2>/dev/null
|
||||
echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
|
||||
|
||||
python3 --version
|
||||
which python3
|
||||
|
||||
python3 -m pip list
|
||||
|
||||
echo "----------torch----------"
|
||||
python3 -m torch.utils.collect_env
|
||||
|
||||
echo "----------k2----------"
|
||||
python3 -c "import k2; print(k2.__file__)"
|
||||
python3 -c "import k2; print(k2.__dev_version__)"
|
||||
python3 -m k2.version
|
||||
|
||||
echo "----------lhotse----------"
|
||||
python3 -c "import lhotse; print(lhotse.__file__)"
|
||||
python3 -c "import lhotse; print(lhotse.__version__)"
|
||||
|
||||
echo "----------kaldifeat----------"
|
||||
python3 -c "import kaldifeat; print(kaldifeat.__file__)"
|
||||
python3 -c "import kaldifeat; print(kaldifeat.__version__)"
|
||||
|
||||
echo "Test yesno recipe"
|
||||
|
||||
cd egs/yesno/ASR
|
||||
|
||||
./prepare.sh
|
||||
|
||||
./tdnn/train.py
|
||||
|
||||
./tdnn/decode.py
|
@ -44,7 +44,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
python-version: [3.7, 3.8, 3.9]
|
||||
python-version: [3.8]
|
||||
|
||||
fail-fast: false
|
||||
|
||||
|
@ -44,7 +44,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
python-version: [3.7, 3.8, 3.9]
|
||||
python-version: [3.8]
|
||||
|
||||
fail-fast: false
|
||||
|
||||
|
@ -44,7 +44,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
python-version: [3.7, 3.8, 3.9]
|
||||
python-version: [3.8]
|
||||
|
||||
fail-fast: false
|
||||
|
||||
|
@ -44,7 +44,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
python-version: [3.7, 3.8, 3.9]
|
||||
python-version: [3.8]
|
||||
|
||||
fail-fast: false
|
||||
|
||||
|
@ -44,7 +44,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
python-version: [3.7, 3.8, 3.9]
|
||||
python-version: [3.8]
|
||||
|
||||
fail-fast: false
|
||||
|
||||
|
@ -44,7 +44,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
python-version: [3.7, 3.8, 3.9]
|
||||
python-version: [3.8]
|
||||
|
||||
fail-fast: false
|
||||
|
||||
|
@ -44,7 +44,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
python-version: [3.7, 3.8, 3.9]
|
||||
python-version: [3.8]
|
||||
|
||||
fail-fast: false
|
||||
|
||||
|
84
.github/workflows/run-multi-zh_hans-zipformer.yml
vendored
Normal file
84
.github/workflows/run-multi-zh_hans-zipformer.yml
vendored
Normal file
@ -0,0 +1,84 @@
|
||||
# Copyright 2023 Xiaomi Corp. (author: Zengrui Jin)
|
||||
|
||||
# See ../../LICENSE for clarification regarding multiple authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: run-multi-zh_hans-zipformer
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
pull_request:
|
||||
types: [labeled]
|
||||
|
||||
concurrency:
|
||||
group: run_multi-zh_hans_zipformer-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
run_multi-zh_hans_zipformer:
|
||||
if: github.event.label.name == 'onnx' || github.event.label.name == 'ready' || github.event_name == 'push' || github.event.label.name == 'multi-zh_hans'
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
python-version: [3.8]
|
||||
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Setup Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
cache: 'pip'
|
||||
cache-dependency-path: '**/requirements-ci.txt'
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: |
|
||||
~/tmp/kaldifeat
|
||||
key: cache-tmp-${{ matrix.python-version }}-2023-05-22
|
||||
|
||||
- name: Install kaldifeat
|
||||
if: steps.my-cache.outputs.cache-hit != 'true'
|
||||
shell: bash
|
||||
run: |
|
||||
.github/scripts/install-kaldifeat.sh
|
||||
|
||||
- name: Inference with pre-trained model
|
||||
shell: bash
|
||||
env:
|
||||
GITHUB_EVENT_NAME: ${{ github.event_name }}
|
||||
GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
|
||||
run: |
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
.github/scripts/run-multi-zh_hans-zipformer.sh
|
@ -14,7 +14,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: run-pre-trained-conformer-ctc
|
||||
name: run-pre-trained-ctc
|
||||
|
||||
on:
|
||||
push:
|
||||
@ -23,18 +23,25 @@ on:
|
||||
pull_request:
|
||||
types: [labeled]
|
||||
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
test-run:
|
||||
description: 'Test (y/n)?'
|
||||
required: true
|
||||
default: 'y'
|
||||
|
||||
concurrency:
|
||||
group: run_pre_trained_conformer_ctc-${{ github.ref }}
|
||||
group: run_pre_trained_ctc-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
run_pre_trained_conformer_ctc:
|
||||
if: github.event.label.name == 'ready' || github.event_name == 'push'
|
||||
run_pre_trained_ctc:
|
||||
if: github.event.label.name == 'ready' || github.event_name == 'push' || github.event.inputs.test-run == 'y' || github.event.label.name == 'ctc'
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
python-version: [3.7, 3.8, 3.9]
|
||||
python-version: [3.8]
|
||||
|
||||
fail-fast: false
|
||||
|
||||
@ -77,4 +84,4 @@ jobs:
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
.github/scripts/run-pre-trained-conformer-ctc.sh
|
||||
.github/scripts/run-pre-trained-ctc.sh
|
@ -43,7 +43,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
python-version: [3.7, 3.8, 3.9]
|
||||
python-version: [3.8]
|
||||
|
||||
fail-fast: false
|
||||
|
||||
|
@ -43,7 +43,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
python-version: [3.7, 3.8, 3.9]
|
||||
python-version: [3.8]
|
||||
|
||||
fail-fast: false
|
||||
|
||||
|
@ -34,7 +34,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
python-version: [3.7, 3.8, 3.9]
|
||||
python-version: [3.8]
|
||||
|
||||
fail-fast: false
|
||||
|
||||
|
@ -34,7 +34,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
python-version: [3.7, 3.8, 3.9]
|
||||
python-version: [3.8]
|
||||
|
||||
fail-fast: false
|
||||
|
||||
|
@ -43,7 +43,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
python-version: [3.7, 3.8, 3.9]
|
||||
python-version: [3.8]
|
||||
|
||||
fail-fast: false
|
||||
|
||||
|
@ -34,7 +34,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
python-version: [3.7, 3.8, 3.9]
|
||||
python-version: [3.8]
|
||||
|
||||
fail-fast: false
|
||||
|
||||
|
84
.github/workflows/run-swbd-conformer-ctc.yml
vendored
Normal file
84
.github/workflows/run-swbd-conformer-ctc.yml
vendored
Normal file
@ -0,0 +1,84 @@
|
||||
# Copyright 2023 Xiaomi Corp. (author: Zengrui Jin)
|
||||
|
||||
# See ../../LICENSE for clarification regarding multiple authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: run-swbd-conformer_ctc
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
pull_request:
|
||||
types: [labeled]
|
||||
|
||||
concurrency:
|
||||
group: run-swbd-conformer_ctc-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
run-swbd-conformer_ctc:
|
||||
if: github.event.label.name == 'onnx' || github.event.label.name == 'ready' || github.event_name == 'push' || github.event.label.name == 'swbd'
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
python-version: [3.8]
|
||||
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Setup Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
cache: 'pip'
|
||||
cache-dependency-path: '**/requirements-ci.txt'
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: |
|
||||
~/tmp/kaldifeat
|
||||
key: cache-tmp-${{ matrix.python-version }}-2023-05-22
|
||||
|
||||
- name: Install kaldifeat
|
||||
if: steps.my-cache.outputs.cache-hit != 'true'
|
||||
shell: bash
|
||||
run: |
|
||||
.github/scripts/install-kaldifeat.sh
|
||||
|
||||
- name: Inference with pre-trained model
|
||||
shell: bash
|
||||
env:
|
||||
GITHUB_EVENT_NAME: ${{ github.event_name }}
|
||||
GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
|
||||
run: |
|
||||
sudo apt-get -qq install git-lfs tree
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
|
||||
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
|
||||
|
||||
.github/scripts/run-swbd-conformer-ctc-2023-08-26.sh
|
115
.github/workflows/run-yesno-recipe.yml
vendored
115
.github/workflows/run-yesno-recipe.yml
vendored
@ -44,11 +44,6 @@ jobs:
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install graphviz
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt-get -qq install graphviz
|
||||
|
||||
- name: Setup Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
@ -65,11 +60,12 @@ jobs:
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | grep -v kaldifst | xargs -n 1 -L 1 pip install
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
pip install --no-deps --force-reinstall https://huggingface.co/csukuangfj/k2/resolve/main/cpu/k2-1.24.3.dev20230508+cpu.torch1.13.1-cp38-cp38-linux_x86_64.whl
|
||||
pip install kaldifeat==1.25.0.dev20230726+cpu.torch1.13.1 -f https://csukuangfj.github.io/kaldifeat/cpu.html
|
||||
|
||||
- name: Run yesno recipe
|
||||
shell: bash
|
||||
@ -78,9 +74,112 @@ jobs:
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
echo $PYTHONPATH
|
||||
|
||||
|
||||
cd egs/yesno/ASR
|
||||
./prepare.sh
|
||||
python3 ./tdnn/train.py
|
||||
python3 ./tdnn/decode.py
|
||||
# TODO: Check that the WER is less than some value
|
||||
|
||||
- name: Test exporting to pretrained.pt
|
||||
shell: bash
|
||||
working-directory: ${{github.workspace}}
|
||||
run: |
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
echo $PYTHONPATH
|
||||
|
||||
cd egs/yesno/ASR
|
||||
python3 ./tdnn/export.py --epoch 14 --avg 2
|
||||
|
||||
python3 ./tdnn/pretrained.py \
|
||||
--checkpoint ./tdnn/exp/pretrained.pt \
|
||||
--HLG ./data/lang_phone/HLG.pt \
|
||||
--words-file ./data/lang_phone/words.txt \
|
||||
download/waves_yesno/0_0_0_1_0_0_0_1.wav \
|
||||
download/waves_yesno/0_0_1_0_0_0_1_0.wav
|
||||
|
||||
- name: Test exporting to torchscript
|
||||
shell: bash
|
||||
working-directory: ${{github.workspace}}
|
||||
run: |
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
echo $PYTHONPATH
|
||||
|
||||
cd egs/yesno/ASR
|
||||
python3 ./tdnn/export.py --epoch 14 --avg 2 --jit 1
|
||||
|
||||
python3 ./tdnn/jit_pretrained.py \
|
||||
--nn-model ./tdnn/exp/cpu_jit.pt \
|
||||
--HLG ./data/lang_phone/HLG.pt \
|
||||
--words-file ./data/lang_phone/words.txt \
|
||||
download/waves_yesno/0_0_0_1_0_0_0_1.wav \
|
||||
download/waves_yesno/0_0_1_0_0_0_1_0.wav
|
||||
|
||||
- name: Test exporting to onnx
|
||||
shell: bash
|
||||
working-directory: ${{github.workspace}}
|
||||
run: |
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
echo $PYTHONPATH
|
||||
|
||||
cd egs/yesno/ASR
|
||||
python3 ./tdnn/export_onnx.py --epoch 14 --avg 2
|
||||
|
||||
echo "Test float32 model"
|
||||
python3 ./tdnn/onnx_pretrained.py \
|
||||
--nn-model ./tdnn/exp/model-epoch-14-avg-2.onnx \
|
||||
--HLG ./data/lang_phone/HLG.pt \
|
||||
--words-file ./data/lang_phone/words.txt \
|
||||
download/waves_yesno/0_0_0_1_0_0_0_1.wav \
|
||||
download/waves_yesno/0_0_1_0_0_0_1_0.wav
|
||||
|
||||
|
||||
echo "Test int8 model"
|
||||
python3 ./tdnn/onnx_pretrained.py \
|
||||
--nn-model ./tdnn/exp/model-epoch-14-avg-2.int8.onnx \
|
||||
--HLG ./data/lang_phone/HLG.pt \
|
||||
--words-file ./data/lang_phone/words.txt \
|
||||
download/waves_yesno/0_0_0_1_0_0_0_1.wav \
|
||||
download/waves_yesno/0_0_1_0_0_0_1_0.wav
|
||||
|
||||
- name: Test decoding with H
|
||||
shell: bash
|
||||
working-directory: ${{github.workspace}}
|
||||
run: |
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
echo $PYTHONPATH
|
||||
|
||||
cd egs/yesno/ASR
|
||||
python3 ./tdnn/export.py --epoch 14 --avg 2 --jit 1
|
||||
|
||||
python3 ./tdnn/jit_pretrained_decode_with_H.py \
|
||||
--nn-model ./tdnn/exp/cpu_jit.pt \
|
||||
--H ./data/lang_phone/H.fst \
|
||||
--tokens ./data/lang_phone/tokens.txt \
|
||||
./download/waves_yesno/0_0_0_1_0_0_0_1.wav \
|
||||
./download/waves_yesno/0_0_1_0_0_0_1_0.wav \
|
||||
./download/waves_yesno/0_0_1_0_0_1_1_1.wav
|
||||
|
||||
- name: Test decoding with HL
|
||||
shell: bash
|
||||
working-directory: ${{github.workspace}}
|
||||
run: |
|
||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
echo $PYTHONPATH
|
||||
|
||||
cd egs/yesno/ASR
|
||||
python3 ./tdnn/export.py --epoch 14 --avg 2 --jit 1
|
||||
|
||||
python3 ./tdnn/jit_pretrained_decode_with_HL.py \
|
||||
--nn-model ./tdnn/exp/cpu_jit.pt \
|
||||
--HL ./data/lang_phone/HL.fst \
|
||||
--words ./data/lang_phone/words.txt \
|
||||
./download/waves_yesno/0_0_0_1_0_0_0_1.wav \
|
||||
./download/waves_yesno/0_0_1_0_0_0_1_0.wav \
|
||||
./download/waves_yesno/0_0_1_0_0_1_1_1.wav
|
||||
|
||||
- name: Show generated files
|
||||
shell: bash
|
||||
working-directory: ${{github.workspace}}
|
||||
run: |
|
||||
cd egs/yesno/ASR
|
||||
ls -lh tdnn/exp
|
||||
ls -lh data/lang_phone
|
||||
|
57
.github/workflows/test.yml
vendored
57
.github/workflows/test.yml
vendored
@ -35,9 +35,9 @@ jobs:
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
python-version: ["3.8"]
|
||||
torch: ["1.10.0"]
|
||||
torchaudio: ["0.10.0"]
|
||||
k2-version: ["1.23.2.dev20221201"]
|
||||
torch: ["1.13.0"]
|
||||
torchaudio: ["0.13.0"]
|
||||
k2-version: ["1.24.3.dev20230719"]
|
||||
|
||||
fail-fast: false
|
||||
|
||||
@ -66,14 +66,14 @@ jobs:
|
||||
pip install torch==${{ matrix.torch }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
|
||||
pip install torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
|
||||
|
||||
pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/
|
||||
pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.github.io/k2/cpu.html
|
||||
pip install git+https://github.com/lhotse-speech/lhotse
|
||||
# icefall requirements
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf==3.20.*
|
||||
|
||||
pip install kaldifst
|
||||
pip install onnxruntime
|
||||
pip install onnxruntime matplotlib
|
||||
pip install -r requirements.txt
|
||||
|
||||
- name: Install graphviz
|
||||
@ -83,13 +83,6 @@ jobs:
|
||||
python3 -m pip install -qq graphviz
|
||||
sudo apt-get -qq install graphviz
|
||||
|
||||
- name: Install graphviz
|
||||
if: startsWith(matrix.os, 'macos')
|
||||
shell: bash
|
||||
run: |
|
||||
python3 -m pip install -qq graphviz
|
||||
brew install -q graphviz
|
||||
|
||||
- name: Run tests
|
||||
if: startsWith(matrix.os, 'ubuntu')
|
||||
run: |
|
||||
@ -129,40 +122,10 @@ jobs:
|
||||
cd ../transducer_lstm
|
||||
pytest -v -s
|
||||
|
||||
- name: Run tests
|
||||
if: startsWith(matrix.os, 'macos')
|
||||
run: |
|
||||
ls -lh
|
||||
export PYTHONPATH=$PWD:$PWD/lhotse:$PYTHONPATH
|
||||
lib_path=$(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")
|
||||
echo "lib_path: $lib_path"
|
||||
export DYLD_LIBRARY_PATH=$lib_path:$DYLD_LIBRARY_PATH
|
||||
pytest -v -s ./test
|
||||
|
||||
# run tests for conformer ctc
|
||||
cd egs/librispeech/ASR/conformer_ctc
|
||||
cd ../zipformer
|
||||
pytest -v -s
|
||||
|
||||
cd ../pruned_transducer_stateless
|
||||
pytest -v -s
|
||||
|
||||
cd ../pruned_transducer_stateless2
|
||||
pytest -v -s
|
||||
|
||||
cd ../pruned_transducer_stateless3
|
||||
pytest -v -s
|
||||
|
||||
cd ../pruned_transducer_stateless4
|
||||
pytest -v -s
|
||||
|
||||
cd ../transducer_stateless
|
||||
pytest -v -s
|
||||
|
||||
# cd ../transducer
|
||||
# pytest -v -s
|
||||
|
||||
cd ../transducer_stateless2
|
||||
pytest -v -s
|
||||
|
||||
cd ../transducer_lstm
|
||||
pytest -v -s
|
||||
- uses: actions/upload-artifact@v2
|
||||
with:
|
||||
path: egs/librispeech/ASR/zipformer/swoosh.pdf
|
||||
name: swoosh.pdf
|
||||
|
2
.gitignore
vendored
2
.gitignore
vendored
@ -34,3 +34,5 @@ node_modules
|
||||
*.param
|
||||
*.bin
|
||||
.DS_Store
|
||||
*.fst
|
||||
*.arpa
|
||||
|
12
README.md
12
README.md
@ -29,6 +29,7 @@ We provide the following recipes:
|
||||
- [yesno][yesno]
|
||||
- [LibriSpeech][librispeech]
|
||||
- [GigaSpeech][gigaspeech]
|
||||
- [AMI][ami]
|
||||
- [Aishell][aishell]
|
||||
- [Aishell2][aishell2]
|
||||
- [Aishell4][aishell4]
|
||||
@ -37,6 +38,7 @@ We provide the following recipes:
|
||||
- [Aidatatang_200zh][aidatatang_200zh]
|
||||
- [WenetSpeech][wenetspeech]
|
||||
- [Alimeeting][alimeeting]
|
||||
- [Switchboard][swbd]
|
||||
- [TAL_CSASR][tal_csasr]
|
||||
|
||||
### yesno
|
||||
@ -118,9 +120,9 @@ We provide a Colab notebook to run a pre-trained transducer conformer + stateles
|
||||
|
||||
| Encoder | Params | test-clean | test-other |
|
||||
|-----------------|--------|------------|------------|
|
||||
| zipformer | 65.5M | 2.21 | 4.91 |
|
||||
| zipformer-small | 23.2M | 2.46 | 5.83 |
|
||||
| zipformer-large | 148.4M | 2.11 | 4.77 |
|
||||
| zipformer | 65.5M | 2.21 | 4.79 |
|
||||
| zipformer-small | 23.2M | 2.42 | 5.73 |
|
||||
| zipformer-large | 148.4M | 2.06 | 4.63 |
|
||||
|
||||
Note: No auxiliary losses are used in the training and no LMs are used
|
||||
in the decoding.
|
||||
@ -338,7 +340,7 @@ We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder
|
||||
|
||||
#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss
|
||||
|
||||
The best results for Chinese CER(%) and English WER(%) respectivly (zh: Chinese, en: English):
|
||||
The best results for Chinese CER(%) and English WER(%) respectively (zh: Chinese, en: English):
|
||||
|decoding-method | dev | dev_zh | dev_en | test | test_zh | test_en |
|
||||
|--|--|--|--|--|--|--|
|
||||
|greedy_search| 7.30 | 6.48 | 19.19 |7.39| 6.66 | 19.13|
|
||||
@ -393,4 +395,6 @@ Please see: [ Ubuntu18.04-pytorch1.12.1-cuda11.3-cudnn8, and (b) Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8.
|
||||
|
||||
If your NVIDIA driver supports CUDA Version: 11.3, please go for case (a) Ubuntu18.04-pytorch1.12.1-cuda11.3-cudnn8.
|
||||
|
70
docker/torch1.12.1-cuda11.3.dockerfile
Normal file
70
docker/torch1.12.1-cuda11.3.dockerfile
Normal file
@ -0,0 +1,70 @@
|
||||
FROM pytorch/pytorch:1.12.1-cuda11.3-cudnn8-devel
|
||||
|
||||
ENV LC_ALL C.UTF-8
|
||||
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
ARG K2_VERSION="1.24.3.dev20230725+cuda11.3.torch1.12.1"
|
||||
ARG KALDIFEAT_VERSION="1.25.0.dev20230726+cuda11.3.torch1.12.1"
|
||||
ARG TORCHAUDIO_VERSION="0.12.1+cu113"
|
||||
|
||||
LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
|
||||
LABEL k2_version=${K2_VERSION}
|
||||
LABEL kaldifeat_version=${KALDIFEAT_VERSION}
|
||||
LABEL github_repo="https://github.com/k2-fsa/icefall"
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
curl \
|
||||
vim \
|
||||
libssl-dev \
|
||||
autoconf \
|
||||
automake \
|
||||
bzip2 \
|
||||
ca-certificates \
|
||||
ffmpeg \
|
||||
g++ \
|
||||
gfortran \
|
||||
git \
|
||||
libtool \
|
||||
make \
|
||||
patch \
|
||||
sox \
|
||||
subversion \
|
||||
unzip \
|
||||
valgrind \
|
||||
wget \
|
||||
zlib1g-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install dependencies
|
||||
RUN pip install --no-cache-dir \
|
||||
torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
|
||||
k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
|
||||
git+https://github.com/lhotse-speech/lhotse \
|
||||
kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
|
||||
\
|
||||
kaldi_native_io \
|
||||
kaldialign \
|
||||
kaldifst \
|
||||
kaldilm \
|
||||
sentencepiece>=0.1.96 \
|
||||
tensorboard \
|
||||
typeguard \
|
||||
dill \
|
||||
onnx \
|
||||
onnxruntime \
|
||||
onnxmltools \
|
||||
multi_quantization \
|
||||
typeguard \
|
||||
numpy \
|
||||
pytest \
|
||||
graphviz
|
||||
|
||||
RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
|
||||
cd /workspace/icefall && \
|
||||
pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
|
||||
|
||||
WORKDIR /workspace/icefall
|
72
docker/torch1.13.0-cuda11.6.dockerfile
Normal file
72
docker/torch1.13.0-cuda11.6.dockerfile
Normal file
@ -0,0 +1,72 @@
|
||||
FROM pytorch/pytorch:1.13.0-cuda11.6-cudnn8-runtime
|
||||
|
||||
ENV LC_ALL C.UTF-8
|
||||
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
ARG K2_VERSION="1.24.3.dev20230725+cuda11.6.torch1.13.0"
|
||||
ARG KALDIFEAT_VERSION="1.25.0.dev20230726+cuda11.6.torch1.13.0"
|
||||
ARG TORCHAUDIO_VERSION="0.13.0+cu116"
|
||||
|
||||
LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
|
||||
LABEL k2_version=${K2_VERSION}
|
||||
LABEL kaldifeat_version=${KALDIFEAT_VERSION}
|
||||
LABEL github_repo="https://github.com/k2-fsa/icefall"
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
curl \
|
||||
vim \
|
||||
libssl-dev \
|
||||
autoconf \
|
||||
automake \
|
||||
bzip2 \
|
||||
ca-certificates \
|
||||
ffmpeg \
|
||||
g++ \
|
||||
gfortran \
|
||||
git \
|
||||
libtool \
|
||||
make \
|
||||
patch \
|
||||
sox \
|
||||
subversion \
|
||||
unzip \
|
||||
valgrind \
|
||||
wget \
|
||||
zlib1g-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install dependencies
|
||||
RUN pip install --no-cache-dir \
|
||||
torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
|
||||
k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
|
||||
git+https://github.com/lhotse-speech/lhotse \
|
||||
kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
|
||||
\
|
||||
kaldi_native_io \
|
||||
kaldialign \
|
||||
kaldifst \
|
||||
kaldilm \
|
||||
sentencepiece>=0.1.96 \
|
||||
tensorboard \
|
||||
typeguard \
|
||||
dill \
|
||||
onnx \
|
||||
onnxruntime \
|
||||
onnxmltools \
|
||||
multi_quantization \
|
||||
typeguard \
|
||||
numpy \
|
||||
pytest \
|
||||
graphviz
|
||||
|
||||
RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
|
||||
cd /workspace/icefall && \
|
||||
pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
|
||||
|
||||
ENV LD_LIBRARY_PATH /opt/conda/lib/stubs:$LD_LIBRARY_PATH
|
||||
|
||||
WORKDIR /workspace/icefall
|
86
docker/torch1.9.0-cuda10.2.dockerfile
Normal file
86
docker/torch1.9.0-cuda10.2.dockerfile
Normal file
@ -0,0 +1,86 @@
|
||||
FROM pytorch/pytorch:1.9.0-cuda10.2-cudnn7-devel
|
||||
|
||||
ENV LC_ALL C.UTF-8
|
||||
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
ARG K2_VERSION="1.24.3.dev20230726+cuda10.2.torch1.9.0"
|
||||
ARG KALDIFEAT_VERSION="1.25.0.dev20230726+cuda10.2.torch1.9.0"
|
||||
ARG TORCHAUDIO_VERSION="0.9.0"
|
||||
|
||||
LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
|
||||
LABEL k2_version=${K2_VERSION}
|
||||
LABEL kaldifeat_version=${KALDIFEAT_VERSION}
|
||||
LABEL github_repo="https://github.com/k2-fsa/icefall"
|
||||
|
||||
# see https://developer.nvidia.com/blog/updating-the-cuda-linux-gpg-repository-key/
|
||||
|
||||
RUN rm /etc/apt/sources.list.d/cuda.list && \
|
||||
rm /etc/apt/sources.list.d/nvidia-ml.list && \
|
||||
apt-key del 7fa2af80
|
||||
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
curl \
|
||||
vim \
|
||||
libssl-dev \
|
||||
autoconf \
|
||||
automake \
|
||||
bzip2 \
|
||||
ca-certificates \
|
||||
ffmpeg \
|
||||
g++ \
|
||||
gfortran \
|
||||
git \
|
||||
libtool \
|
||||
make \
|
||||
patch \
|
||||
sox \
|
||||
subversion \
|
||||
unzip \
|
||||
valgrind \
|
||||
wget \
|
||||
zlib1g-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \
|
||||
dpkg -i cuda-keyring_1.0-1_all.deb && \
|
||||
rm -v cuda-keyring_1.0-1_all.deb && \
|
||||
apt-get update && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install dependencies
|
||||
RUN pip uninstall -y tqdm && \
|
||||
pip install -U --no-cache-dir \
|
||||
torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
|
||||
k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
|
||||
kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
|
||||
git+https://github.com/lhotse-speech/lhotse \
|
||||
\
|
||||
kaldi_native_io \
|
||||
kaldialign \
|
||||
kaldifst \
|
||||
kaldilm \
|
||||
sentencepiece>=0.1.96 \
|
||||
tensorboard \
|
||||
typeguard \
|
||||
dill \
|
||||
onnx \
|
||||
onnxruntime \
|
||||
onnxmltools \
|
||||
multi_quantization \
|
||||
typeguard \
|
||||
numpy \
|
||||
pytest \
|
||||
graphviz \
|
||||
tqdm>=4.63.0
|
||||
|
||||
|
||||
RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
|
||||
cd /workspace/icefall && \
|
||||
pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
|
||||
|
||||
WORKDIR /workspace/icefall
|
70
docker/torch2.0.0-cuda11.7.dockerfile
Normal file
70
docker/torch2.0.0-cuda11.7.dockerfile
Normal file
@ -0,0 +1,70 @@
|
||||
FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-devel
|
||||
|
||||
ENV LC_ALL C.UTF-8
|
||||
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
ARG K2_VERSION="1.24.3.dev20230718+cuda11.7.torch2.0.0"
|
||||
ARG KALDIFEAT_VERSION="1.25.0.dev20230726+cuda11.7.torch2.0.0"
|
||||
ARG TORCHAUDIO_VERSION="2.0.0+cu117"
|
||||
|
||||
LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
|
||||
LABEL k2_version=${K2_VERSION}
|
||||
LABEL kaldifeat_version=${KALDIFEAT_VERSION}
|
||||
LABEL github_repo="https://github.com/k2-fsa/icefall"
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
curl \
|
||||
vim \
|
||||
libssl-dev \
|
||||
autoconf \
|
||||
automake \
|
||||
bzip2 \
|
||||
ca-certificates \
|
||||
ffmpeg \
|
||||
g++ \
|
||||
gfortran \
|
||||
git \
|
||||
libtool \
|
||||
make \
|
||||
patch \
|
||||
sox \
|
||||
subversion \
|
||||
unzip \
|
||||
valgrind \
|
||||
wget \
|
||||
zlib1g-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install dependencies
|
||||
RUN pip install --no-cache-dir \
|
||||
torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
|
||||
k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
|
||||
git+https://github.com/lhotse-speech/lhotse \
|
||||
kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
|
||||
\
|
||||
kaldi_native_io \
|
||||
kaldialign \
|
||||
kaldifst \
|
||||
kaldilm \
|
||||
sentencepiece>=0.1.96 \
|
||||
tensorboard \
|
||||
typeguard \
|
||||
dill \
|
||||
onnx \
|
||||
onnxruntime \
|
||||
onnxmltools \
|
||||
multi_quantization \
|
||||
typeguard \
|
||||
numpy \
|
||||
pytest \
|
||||
graphviz
|
||||
|
||||
RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
|
||||
cd /workspace/icefall && \
|
||||
pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
|
||||
|
||||
WORKDIR /workspace/icefall
|
@ -90,4 +90,12 @@ rst_epilog = """
|
||||
.. _musan: http://www.openslr.org/17/
|
||||
.. _ONNX: https://github.com/onnx/onnx
|
||||
.. _onnxruntime: https://github.com/microsoft/onnxruntime
|
||||
.. _torch: https://github.com/pytorch/pytorch
|
||||
.. _torchaudio: https://github.com/pytorch/audio
|
||||
.. _k2: https://github.com/k2-fsa/k2
|
||||
.. _lhotse: https://github.com/lhotse-speech/lhotse
|
||||
.. _yesno: https://www.openslr.org/1/
|
||||
.. _Next-gen Kaldi: https://github.com/k2-fsa
|
||||
.. _Kaldi: https://github.com/kaldi-asr/kaldi
|
||||
.. _lilcom: https://github.com/danpovey/lilcom
|
||||
"""
|
||||
|
@ -38,7 +38,7 @@ Please fix any issues reported by the check tools.
|
||||
.. HINT::
|
||||
|
||||
Some of the check tools, i.e., ``black`` and ``isort`` will modify
|
||||
the files to be commited **in-place**. So please run ``git status``
|
||||
the files to be committed **in-place**. So please run ``git status``
|
||||
after failure to see which file has been modified by the tools
|
||||
before you make any further changes.
|
||||
|
||||
|
@ -71,9 +71,12 @@ As the initial step, let's download the pre-trained model.
|
||||
.. code-block:: bash
|
||||
|
||||
$ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
|
||||
$ pushd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
|
||||
$ cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
|
||||
$ git lfs pull --include "pretrained.pt"
|
||||
$ ln -s pretrained.pt epoch-99.pt # create a symbolic link so that the checkpoint can be loaded
|
||||
$ cd ../data/lang_bpe_500
|
||||
$ git lfs pull --include bpe.model
|
||||
$ cd ../../..
|
||||
|
||||
To test the model, let's have a look at the decoding results **without** using LM. This can be done via the following command:
|
||||
|
||||
@ -85,7 +88,7 @@ To test the model, let's have a look at the decoding results **without** using L
|
||||
--avg 1 \
|
||||
--use-averaged-model False \
|
||||
--exp-dir $exp_dir \
|
||||
--bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model
|
||||
--bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
|
||||
--max-duration 600 \
|
||||
--decode-chunk-len 32 \
|
||||
--decoding-method modified_beam_search
|
||||
@ -135,8 +138,8 @@ Then, we perform LODR decoding by setting ``--decoding-method`` to ``modified_be
|
||||
--exp-dir $exp_dir \
|
||||
--max-duration 600 \
|
||||
--decode-chunk-len 32 \
|
||||
--decoding-method modified_beam_search_lm_LODR \
|
||||
--bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model
|
||||
--decoding-method modified_beam_search_LODR \
|
||||
--bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
|
||||
--use-shallow-fusion 1 \
|
||||
--lm-type rnn \
|
||||
--lm-exp-dir $lm_dir \
|
||||
|
@ -2,7 +2,29 @@ Decoding with language models
|
||||
=============================
|
||||
|
||||
This section describes how to use external langugage models
|
||||
during decoding to improve the WER of transducer models.
|
||||
during decoding to improve the WER of transducer models. To train an external language model,
|
||||
please refer to this tutorial: :ref:`train_nnlm`.
|
||||
|
||||
The following decoding methods with external langugage models are available:
|
||||
|
||||
|
||||
.. list-table::
|
||||
:widths: 25 50
|
||||
:header-rows: 1
|
||||
|
||||
* - Decoding method
|
||||
- beam=4
|
||||
* - ``modified_beam_search``
|
||||
- Beam search (i.e. really n-best decoding, the "beam" is the value of n), similar to the original RNN-T paper. Note, this method does not use language model.
|
||||
* - ``modified_beam_search_lm_shallow_fusion``
|
||||
- As ``modified_beam_search``, but interpolate RNN-T scores with language model scores, also known as shallow fusion
|
||||
* - ``modified_beam_search_LODR``
|
||||
- As ``modified_beam_search_lm_shallow_fusion``, but subtract score of a (BPE-symbol-level) bigram backoff language model used as an approximation to the internal language model of RNN-T.
|
||||
* - ``modified_beam_search_lm_rescore``
|
||||
- As ``modified_beam_search``, but rescore the n-best hypotheses with external language model (e.g. RNNLM) and re-rank them.
|
||||
* - ``modified_beam_search_lm_rescore_LODR``
|
||||
- As ``modified_beam_search_lm_rescore``, but also subtract the score of a (BPE-symbol-level) bigram backoff language model during re-ranking.
|
||||
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
@ -4,7 +4,7 @@ LM rescoring for Transducer
|
||||
=================================
|
||||
|
||||
LM rescoring is a commonly used approach to incorporate external LM information. Unlike shallow-fusion-based
|
||||
methods (see :ref:`shallow-fusion`, :ref:`LODR`), rescoring is usually performed to re-rank the n-best hypotheses after beam search.
|
||||
methods (see :ref:`shallow_fusion`, :ref:`LODR`), rescoring is usually performed to re-rank the n-best hypotheses after beam search.
|
||||
Rescoring is usually more efficient than shallow fusion since less computation is performed on the external LM.
|
||||
In this tutorial, we will show you how to use external LM to rescore the n-best hypotheses decoded from neural transducer models in
|
||||
`icefall <https://github.com/k2-fsa/icefall>`__.
|
||||
@ -34,9 +34,12 @@ As the initial step, let's download the pre-trained model.
|
||||
.. code-block:: bash
|
||||
|
||||
$ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
|
||||
$ pushd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
|
||||
$ cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
|
||||
$ git lfs pull --include "pretrained.pt"
|
||||
$ ln -s pretrained.pt epoch-99.pt # create a symbolic link so that the checkpoint can be loaded
|
||||
$ cd ../data/lang_bpe_500
|
||||
$ git lfs pull --include bpe.model
|
||||
$ cd ../../..
|
||||
|
||||
As usual, we first test the model's performance without external LM. This can be done via the following command:
|
||||
|
||||
@ -48,7 +51,7 @@ As usual, we first test the model's performance without external LM. This can be
|
||||
--avg 1 \
|
||||
--use-averaged-model False \
|
||||
--exp-dir $exp_dir \
|
||||
--bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model
|
||||
--bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
|
||||
--max-duration 600 \
|
||||
--decode-chunk-len 32 \
|
||||
--decoding-method modified_beam_search
|
||||
@ -101,7 +104,7 @@ is set to `False`.
|
||||
--max-duration 600 \
|
||||
--decode-chunk-len 32 \
|
||||
--decoding-method modified_beam_search_lm_rescore \
|
||||
--bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model
|
||||
--bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
|
||||
--use-shallow-fusion 0 \
|
||||
--lm-type rnn \
|
||||
--lm-exp-dir $lm_dir \
|
||||
@ -173,7 +176,7 @@ Then we can performn LM rescoring + LODR by changing the decoding method to `mod
|
||||
--max-duration 600 \
|
||||
--decode-chunk-len 32 \
|
||||
--decoding-method modified_beam_search_lm_rescore_LODR \
|
||||
--bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model
|
||||
--bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
|
||||
--use-shallow-fusion 0 \
|
||||
--lm-type rnn \
|
||||
--lm-exp-dir $lm_dir \
|
||||
@ -225,23 +228,23 @@ Here, we benchmark the WERs and decoding speed of them:
|
||||
- beam=4
|
||||
- beam=8
|
||||
- beam=12
|
||||
* - `modified_beam_search`
|
||||
* - ``modified_beam_search``
|
||||
- 3.11/7.93; 132s
|
||||
- 3.1/7.95; 177s
|
||||
- 3.1/7.96; 210s
|
||||
* - `modified_beam_search_lm_shallow_fusion`
|
||||
* - ``modified_beam_search_lm_shallow_fusion``
|
||||
- 2.77/7.08; 262s
|
||||
- 2.62/6.65; 352s
|
||||
- 2.58/6.65; 488s
|
||||
* - LODR
|
||||
* - ``modified_beam_search_LODR``
|
||||
- 2.61/6.74; 400s
|
||||
- 2.45/6.38; 610s
|
||||
- 2.4/6.23; 870s
|
||||
* - `modified_beam_search_lm_rescore`
|
||||
* - ``modified_beam_search_lm_rescore``
|
||||
- 2.93/7.6; 156s
|
||||
- 2.67/7.11; 203s
|
||||
- 2.59/6.86; 255s
|
||||
* - `modified_beam_search_lm_rescore_LODR`
|
||||
* - ``modified_beam_search_lm_rescore_LODR``
|
||||
- 2.9/7.57; 160s
|
||||
- 2.63/7.04; 203s
|
||||
- 2.52/6.73; 263s
|
||||
|
@ -32,9 +32,12 @@ As the initial step, let's download the pre-trained model.
|
||||
.. code-block:: bash
|
||||
|
||||
$ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
|
||||
$ pushd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
|
||||
$ cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
|
||||
$ git lfs pull --include "pretrained.pt"
|
||||
$ ln -s pretrained.pt epoch-99.pt # create a symbolic link so that the checkpoint can be loaded
|
||||
$ cd ../data/lang_bpe_500
|
||||
$ git lfs pull --include bpe.model
|
||||
$ cd ../../..
|
||||
|
||||
To test the model, let's have a look at the decoding results without using LM. This can be done via the following command:
|
||||
|
||||
@ -46,7 +49,7 @@ To test the model, let's have a look at the decoding results without using LM. T
|
||||
--avg 1 \
|
||||
--use-averaged-model False \
|
||||
--exp-dir $exp_dir \
|
||||
--bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model
|
||||
--bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
|
||||
--max-duration 600 \
|
||||
--decode-chunk-len 32 \
|
||||
--decoding-method modified_beam_search
|
||||
@ -95,7 +98,7 @@ To use shallow fusion for decoding, we can execute the following command:
|
||||
--max-duration 600 \
|
||||
--decode-chunk-len 32 \
|
||||
--decoding-method modified_beam_search_lm_shallow_fusion \
|
||||
--bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model
|
||||
--bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
|
||||
--use-shallow-fusion 1 \
|
||||
--lm-type rnn \
|
||||
--lm-exp-dir $lm_dir \
|
||||
|
BIN
docs/source/docker/img/docker-hub.png
Normal file
BIN
docs/source/docker/img/docker-hub.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 356 KiB |
17
docs/source/docker/index.rst
Normal file
17
docs/source/docker/index.rst
Normal file
@ -0,0 +1,17 @@
|
||||
.. _icefall_docker:
|
||||
|
||||
Docker
|
||||
======
|
||||
|
||||
This section describes how to use pre-built docker images to run `icefall`_.
|
||||
|
||||
.. hint::
|
||||
|
||||
If you only have CPUs available, you can still use the pre-built docker
|
||||
images.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
./intro.rst
|
||||
|
171
docs/source/docker/intro.rst
Normal file
171
docs/source/docker/intro.rst
Normal file
@ -0,0 +1,171 @@
|
||||
Introduction
|
||||
=============
|
||||
|
||||
We have pre-built docker images hosted at the following address:
|
||||
|
||||
`<https://hub.docker.com/repository/docker/k2fsa/icefall/general>`_
|
||||
|
||||
.. figure:: img/docker-hub.png
|
||||
:width: 600
|
||||
:align: center
|
||||
|
||||
You can find the ``Dockerfile`` at `<https://github.com/k2-fsa/icefall/tree/master/docker>`_.
|
||||
|
||||
We describe the following items in this section:
|
||||
|
||||
- How to view available tags
|
||||
- How to download pre-built docker images
|
||||
- How to run the `yesno`_ recipe within a docker container on ``CPU``
|
||||
|
||||
View available tags
|
||||
===================
|
||||
|
||||
You can use the following command to view available tags:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
curl -s 'https://registry.hub.docker.com/v2/repositories/k2fsa/icefall/tags/'|jq '."results"[]["name"]'
|
||||
|
||||
which will give you something like below:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
"torch2.0.0-cuda11.7"
|
||||
"torch1.12.1-cuda11.3"
|
||||
"torch1.9.0-cuda10.2"
|
||||
"torch1.13.0-cuda11.6"
|
||||
|
||||
.. hint::
|
||||
|
||||
Available tags will be updated when there are new releases of `torch`_.
|
||||
|
||||
Please select an appropriate combination of `torch`_ and CUDA.
|
||||
|
||||
Download a docker image
|
||||
=======================
|
||||
|
||||
Suppose that you select the tag ``torch1.13.0-cuda11.6``, you can use
|
||||
the following command to download it:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
sudo docker image pull k2fsa/icefall:torch1.13.0-cuda11.6
|
||||
|
||||
Run a docker image with GPU
|
||||
===========================
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
sudo docker run --gpus all --rm -it k2fsa/icefall:torch1.13.0-cuda11.6 /bin/bash
|
||||
|
||||
Run a docker image with CPU
|
||||
===========================
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
sudo docker run --rm -it k2fsa/icefall:torch1.13.0-cuda11.6 /bin/bash
|
||||
|
||||
Run yesno within a docker container
|
||||
===================================
|
||||
|
||||
After starting the container, the following interface is presented:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
root@60c947eac59c:/workspace/icefall#
|
||||
|
||||
It shows the current user is ``root`` and the current working directory
|
||||
is ``/workspace/icefall``.
|
||||
|
||||
Update the code
|
||||
---------------
|
||||
|
||||
Please first run:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
root@60c947eac59c:/workspace/icefall# git pull
|
||||
|
||||
so that your local copy contains the latest code.
|
||||
|
||||
Data preparation
|
||||
----------------
|
||||
|
||||
Now we can use
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
root@60c947eac59c:/workspace/icefall# cd egs/yesno/ASR/
|
||||
|
||||
to switch to the ``yesno`` recipe and run
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
root@60c947eac59c:/workspace/icefall/egs/yesno/ASR# ./prepare.sh
|
||||
|
||||
.. hint::
|
||||
|
||||
If you are running without GPU, it may report the following error:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
File "/opt/conda/lib/python3.9/site-packages/k2/__init__.py", line 23, in <module>
|
||||
from _k2 import DeterminizeWeightPushingType
|
||||
ImportError: libcuda.so.1: cannot open shared object file: No such file or directory
|
||||
|
||||
We can use the following command to fix it:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
root@60c947eac59c:/workspace/icefall/egs/yesno/ASR# ln -s /opt/conda/lib/stubs/libcuda.so /opt/conda/lib/stubs/libcuda.so.1
|
||||
|
||||
The logs of running ``./prepare.sh`` are listed below:
|
||||
|
||||
.. literalinclude:: ./log/log-preparation.txt
|
||||
|
||||
Training
|
||||
--------
|
||||
|
||||
After preparing the data, we can start training with the following command
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
root@60c947eac59c:/workspace/icefall/egs/yesno/ASR# ./tdnn/train.py
|
||||
|
||||
All of the training logs are given below:
|
||||
|
||||
.. hint::
|
||||
|
||||
It is running on CPU and it takes only 16 seconds for this run.
|
||||
|
||||
.. literalinclude:: ./log/log-train-2023-08-01-01-55-27
|
||||
|
||||
|
||||
Decoding
|
||||
--------
|
||||
|
||||
After training, we can decode the trained model with
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
root@60c947eac59c:/workspace/icefall/egs/yesno/ASR# ./tdnn/decode.py
|
||||
|
||||
The decoding logs are given below:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
2023-08-01 02:06:22,400 INFO [decode.py:263] Decoding started
|
||||
2023-08-01 02:06:22,400 INFO [decode.py:264] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lm_dir': PosixPath('data/lm'), 'feature_dim': 23, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'epoch': 14, 'avg': 2, 'export': False, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True, 'num_workers': 2, 'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '4c05309499a08454997adf500b56dcc629e35ae5', 'k2-git-date': 'Tue Jul 25 16:23:36 2023', 'lhotse-version': '1.16.0.dev+git.7640d663.clean', 'torch-version': '1.13.0', 'torch-cuda-available': False, 'torch-cuda-version': '11.6', 'python-version': '3.9', 'icefall-git-branch': 'master', 'icefall-git-sha1': '375520d-clean', 'icefall-git-date': 'Fri Jul 28 07:43:08 2023', 'icefall-path': '/workspace/icefall', 'k2-path': '/opt/conda/lib/python3.9/site-packages/k2/__init__.py', 'lhotse-path': '/opt/conda/lib/python3.9/site-packages/lhotse/__init__.py', 'hostname': '60c947eac59c', 'IP address': '172.17.0.2'}}
|
||||
2023-08-01 02:06:22,401 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
|
||||
2023-08-01 02:06:22,403 INFO [decode.py:273] device: cpu
|
||||
2023-08-01 02:06:22,406 INFO [decode.py:291] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
|
||||
2023-08-01 02:06:22,424 INFO [asr_datamodule.py:218] About to get test cuts
|
||||
2023-08-01 02:06:22,425 INFO [asr_datamodule.py:252] About to get test cuts
|
||||
2023-08-01 02:06:22,504 INFO [decode.py:204] batch 0/?, cuts processed until now is 4
|
||||
[W NNPACK.cpp:53] Could not initialize NNPACK! Reason: Unsupported hardware.
|
||||
2023-08-01 02:06:22,687 INFO [decode.py:241] The transcripts are stored in tdnn/exp/recogs-test_set.txt
|
||||
2023-08-01 02:06:22,688 INFO [utils.py:564] [test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
|
||||
2023-08-01 02:06:22,690 INFO [decode.py:249] Wrote detailed error stats to tdnn/exp/errs-test_set.txt
|
||||
2023-08-01 02:06:22,690 INFO [decode.py:316] Done!
|
||||
|
||||
Congratulations! You have finished successfully running `icefall`_ within a docker container.
|
180
docs/source/for-dummies/data-preparation.rst
Normal file
180
docs/source/for-dummies/data-preparation.rst
Normal file
@ -0,0 +1,180 @@
|
||||
.. _dummies_tutorial_data_preparation:
|
||||
|
||||
Data Preparation
|
||||
================
|
||||
|
||||
After :ref:`dummies_tutorial_environment_setup`, we can start preparing the
|
||||
data for training and decoding.
|
||||
|
||||
The first step is to prepare the data for training. We have already provided
|
||||
`prepare.sh <https://github.com/k2-fsa/icefall/blob/master/egs/yesno/ASR/prepare.sh>`_
|
||||
that would prepare everything required for training.
|
||||
|
||||
.. code-block::
|
||||
|
||||
cd /tmp/icefall
|
||||
export PYTHONPATH=/tmp/icefall:$PYTHONPATH
|
||||
cd egs/yesno/ASR
|
||||
|
||||
./prepare.sh
|
||||
|
||||
Note that in each recipe from `icefall`_, there exists a file ``prepare.sh``,
|
||||
which you should run before you run anything else.
|
||||
|
||||
That is all you need for data preparation.
|
||||
|
||||
For the more curious
|
||||
--------------------
|
||||
|
||||
If you are wondering how to prepare your own dataset, please refer to the following
|
||||
URLs for more details:
|
||||
|
||||
- `<https://github.com/lhotse-speech/lhotse/tree/master/lhotse/recipes>`_
|
||||
|
||||
It contains recipes for a variety of dataset. If you want to add your own
|
||||
dataset, please read recipes in this folder first.
|
||||
|
||||
- `<https://github.com/lhotse-speech/lhotse/blob/master/lhotse/recipes/yesno.py>`_
|
||||
|
||||
The `yesno`_ recipe in `lhotse`_.
|
||||
|
||||
If you already have a `Kaldi`_ dataset directory, which contains files like
|
||||
``wav.scp``, ``feats.scp``, then you can refer to `<https://lhotse.readthedocs.io/en/latest/kaldi.html#example>`_.
|
||||
|
||||
A quick look to the generated files
|
||||
-----------------------------------
|
||||
|
||||
``./prepare.sh`` puts generated files into two directories:
|
||||
|
||||
- ``download``
|
||||
- ``data``
|
||||
|
||||
download
|
||||
^^^^^^^^
|
||||
|
||||
The ``download`` directory contains downloaded dataset files:
|
||||
|
||||
.. code-block:: bas
|
||||
|
||||
tree -L 1 ./download/
|
||||
|
||||
./download/
|
||||
|-- waves_yesno
|
||||
`-- waves_yesno.tar.gz
|
||||
|
||||
.. hint::
|
||||
|
||||
Please refer to `<https://github.com/lhotse-speech/lhotse/blob/master/lhotse/recipes/yesno.py#L41>`_
|
||||
for how the data is downloaded and extracted.
|
||||
|
||||
data
|
||||
^^^^
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
tree ./data/
|
||||
|
||||
./data/
|
||||
|-- fbank
|
||||
| |-- yesno_cuts_test.jsonl.gz
|
||||
| |-- yesno_cuts_train.jsonl.gz
|
||||
| |-- yesno_feats_test.lca
|
||||
| `-- yesno_feats_train.lca
|
||||
|-- lang_phone
|
||||
| |-- HLG.pt
|
||||
| |-- L.pt
|
||||
| |-- L_disambig.pt
|
||||
| |-- Linv.pt
|
||||
| |-- lexicon.txt
|
||||
| |-- lexicon_disambig.txt
|
||||
| |-- tokens.txt
|
||||
| `-- words.txt
|
||||
|-- lm
|
||||
| |-- G.arpa
|
||||
| `-- G.fst.txt
|
||||
`-- manifests
|
||||
|-- yesno_recordings_test.jsonl.gz
|
||||
|-- yesno_recordings_train.jsonl.gz
|
||||
|-- yesno_supervisions_test.jsonl.gz
|
||||
`-- yesno_supervisions_train.jsonl.gz
|
||||
|
||||
4 directories, 18 files
|
||||
|
||||
**data/manifests**:
|
||||
|
||||
This directory contains manifests. They are used to generate files in
|
||||
``data/fbank``.
|
||||
|
||||
To give you an idea of what it contains, we examine the first few lines of
|
||||
the manifests related to the ``train`` dataset.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd data/manifests
|
||||
gunzip -c yesno_recordings_train.jsonl.gz | head -n 3
|
||||
|
||||
The output is given below:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
{"id": "0_0_0_0_1_1_1_1", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_0_0_1_1_1_1.wav"}], "sampling_rate": 8000, "num_samples": 50800, "duration": 6.35, "channel_ids": [0]}
|
||||
{"id": "0_0_0_1_0_1_1_0", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_0_1_0_1_1_0.wav"}], "sampling_rate": 8000, "num_samples": 48880, "duration": 6.11, "channel_ids": [0]}
|
||||
{"id": "0_0_1_0_0_1_1_0", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_1_0_0_1_1_0.wav"}], "sampling_rate": 8000, "num_samples": 48160, "duration": 6.02, "channel_ids": [0]}
|
||||
|
||||
Please refer to `<https://github.com/lhotse-speech/lhotse/blob/master/lhotse/audio.py#L300>`_
|
||||
for the meaning of each field per line.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
gunzip -c yesno_supervisions_train.jsonl.gz | head -n 3
|
||||
|
||||
The output is given below:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
{"id": "0_0_0_0_1_1_1_1", "recording_id": "0_0_0_0_1_1_1_1", "start": 0.0, "duration": 6.35, "channel": 0, "text": "NO NO NO NO YES YES YES YES", "language": "Hebrew"}
|
||||
{"id": "0_0_0_1_0_1_1_0", "recording_id": "0_0_0_1_0_1_1_0", "start": 0.0, "duration": 6.11, "channel": 0, "text": "NO NO NO YES NO YES YES NO", "language": "Hebrew"}
|
||||
{"id": "0_0_1_0_0_1_1_0", "recording_id": "0_0_1_0_0_1_1_0", "start": 0.0, "duration": 6.02, "channel": 0, "text": "NO NO YES NO NO YES YES NO", "language": "Hebrew"}
|
||||
|
||||
Please refer to `<https://github.com/lhotse-speech/lhotse/blob/master/lhotse/supervision.py#L510>`_
|
||||
for the meaning of each field per line.
|
||||
|
||||
**data/fbank**:
|
||||
|
||||
This directory contains everything from ``data/manifests``. Furthermore, it also contains features
|
||||
for training.
|
||||
|
||||
``data/fbank/yesno_feats_train.lca`` contains the features for the train dataset.
|
||||
Features are compressed using `lilcom`_.
|
||||
|
||||
``data/fbank/yesno_cuts_train.jsonl.gz`` stores the `CutSet <https://github.com/lhotse-speech/lhotse/blob/master/lhotse/cut/set.py#L72>`_,
|
||||
which stores `RecordingSet <https://github.com/lhotse-speech/lhotse/blob/master/lhotse/audio.py#L928>`_,
|
||||
`SupervisionSet <https://github.com/lhotse-speech/lhotse/blob/master/lhotse/supervision.py#L510>`_,
|
||||
and `FeatureSet <https://github.com/lhotse-speech/lhotse/blob/master/lhotse/features/base.py#L593>`_.
|
||||
|
||||
To give you an idea about what it looks like, we can run the following command:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd data/fbank
|
||||
|
||||
gunzip -c yesno_cuts_train.jsonl.gz | head -n 3
|
||||
|
||||
The output is given below:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
{"id": "0_0_0_0_1_1_1_1-0", "start": 0, "duration": 6.35, "channel": 0, "supervisions": [{"id": "0_0_0_0_1_1_1_1", "recording_id": "0_0_0_0_1_1_1_1", "start": 0.0, "duration": 6.35, "channel": 0, "text": "NO NO NO NO YES YES YES YES", "language": "Hebrew"}], "features": {"type": "kaldi-fbank", "num_frames": 635, "num_features": 23, "frame_shift": 0.01, "sampling_rate": 8000, "start": 0, "duration": 6.35, "storage_type": "lilcom_chunky", "storage_path": "data/fbank/yesno_feats_train.lca", "storage_key": "0,13000,3570", "channels": 0}, "recording": {"id": "0_0_0_0_1_1_1_1", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_0_0_1_1_1_1.wav"}], "sampling_rate": 8000, "num_samples": 50800, "duration": 6.35, "channel_ids": [0]}, "type": "MonoCut"}
|
||||
{"id": "0_0_0_1_0_1_1_0-1", "start": 0, "duration": 6.11, "channel": 0, "supervisions": [{"id": "0_0_0_1_0_1_1_0", "recording_id": "0_0_0_1_0_1_1_0", "start": 0.0, "duration": 6.11, "channel": 0, "text": "NO NO NO YES NO YES YES NO", "language": "Hebrew"}], "features": {"type": "kaldi-fbank", "num_frames": 611, "num_features": 23, "frame_shift": 0.01, "sampling_rate": 8000, "start": 0, "duration": 6.11, "storage_type": "lilcom_chunky", "storage_path": "data/fbank/yesno_feats_train.lca", "storage_key": "16570,12964,2929", "channels": 0}, "recording": {"id": "0_0_0_1_0_1_1_0", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_0_1_0_1_1_0.wav"}], "sampling_rate": 8000, "num_samples": 48880, "duration": 6.11, "channel_ids": [0]}, "type": "MonoCut"}
|
||||
{"id": "0_0_1_0_0_1_1_0-2", "start": 0, "duration": 6.02, "channel": 0, "supervisions": [{"id": "0_0_1_0_0_1_1_0", "recording_id": "0_0_1_0_0_1_1_0", "start": 0.0, "duration": 6.02, "channel": 0, "text": "NO NO YES NO NO YES YES NO", "language": "Hebrew"}], "features": {"type": "kaldi-fbank", "num_frames": 602, "num_features": 23, "frame_shift": 0.01, "sampling_rate": 8000, "start": 0, "duration": 6.02, "storage_type": "lilcom_chunky", "storage_path": "data/fbank/yesno_feats_train.lca", "storage_key": "32463,12936,2696", "channels": 0}, "recording": {"id": "0_0_1_0_0_1_1_0", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_1_0_0_1_1_0.wav"}], "sampling_rate": 8000, "num_samples": 48160, "duration": 6.02, "channel_ids": [0]}, "type": "MonoCut"}
|
||||
|
||||
Note that ``yesno_cuts_train.jsonl.gz`` only stores the information about how to read the features.
|
||||
The actual features are stored separately in ``data/fbank/yesno_feats_train.lca``.
|
||||
|
||||
**data/lang**:
|
||||
|
||||
This directory contains the lexicon.
|
||||
|
||||
**data/lm**:
|
||||
|
||||
This directory contains language models.
|
39
docs/source/for-dummies/decoding.rst
Normal file
39
docs/source/for-dummies/decoding.rst
Normal file
@ -0,0 +1,39 @@
|
||||
.. _dummies_tutorial_decoding:
|
||||
|
||||
Decoding
|
||||
========
|
||||
|
||||
After :ref:`dummies_tutorial_training`, we can start decoding.
|
||||
|
||||
The command to start the decoding is quite simple:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd /tmp/icefall
|
||||
export PYTHONPATH=/tmp/icefall:$PYTHONPATH
|
||||
cd egs/yesno/ASR
|
||||
|
||||
# We use CPU for decoding by setting the following environment variable
|
||||
export CUDA_VISIBLE_DEVICES=""
|
||||
|
||||
./tdnn/decode.py
|
||||
|
||||
The output logs are given below:
|
||||
|
||||
.. literalinclude:: ./code/decoding-yesno.txt
|
||||
|
||||
For the more curious
|
||||
--------------------
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./tdnn/decode.py --help
|
||||
|
||||
will print the usage information about ``./tdnn/decode.py``. For instance, you
|
||||
can specify:
|
||||
|
||||
- ``--epoch`` to use which checkpoint for decoding
|
||||
- ``--avg`` to select how many checkpoints to use for model averaging
|
||||
|
||||
You usually try different combinations of ``--epoch`` and ``--avg`` and select
|
||||
one that leads to the lowest WER (`Word Error Rate <https://en.wikipedia.org/wiki/Word_error_rate>`_).
|
121
docs/source/for-dummies/environment-setup.rst
Normal file
121
docs/source/for-dummies/environment-setup.rst
Normal file
@ -0,0 +1,121 @@
|
||||
.. _dummies_tutorial_environment_setup:
|
||||
|
||||
Environment setup
|
||||
=================
|
||||
|
||||
We will create an environment for `Next-gen Kaldi`_ that runs on ``CPU``
|
||||
in this tutorial.
|
||||
|
||||
.. note::
|
||||
|
||||
Since the `yesno`_ dataset used in this tutorial is very tiny, training on
|
||||
``CPU`` works very well for it.
|
||||
|
||||
If your dataset is very large, e.g., hundreds or thousands of hours of
|
||||
training data, please follow :ref:`install icefall` to install `icefall`_
|
||||
that works with ``GPU``.
|
||||
|
||||
|
||||
Create a virtual environment
|
||||
----------------------------
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
virtualenv -p python3 /tmp/icefall_env
|
||||
|
||||
The above command creates a virtual environment in the directory ``/tmp/icefall_env``.
|
||||
You can select any directory you want.
|
||||
|
||||
The output of the above command is given below:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
Already using interpreter /usr/bin/python3
|
||||
Using base prefix '/usr'
|
||||
New python executable in /tmp/icefall_env/bin/python3
|
||||
Also creating executable in /tmp/icefall_env/bin/python
|
||||
Installing setuptools, pkg_resources, pip, wheel...done.
|
||||
|
||||
Now we can activate the environment using:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
source /tmp/icefall_env/bin/activate
|
||||
|
||||
Install dependencies
|
||||
--------------------
|
||||
|
||||
.. warning::
|
||||
|
||||
Remeber to activate your virtual environment before you continue!
|
||||
|
||||
After activating the virtual environment, we can use the following command
|
||||
to install dependencies of `icefall`_:
|
||||
|
||||
.. hint::
|
||||
|
||||
Remeber that we will run this tutorial on ``CPU``, so we install
|
||||
dependencies required only by running on ``CPU``.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# Caution: Installation order matters!
|
||||
|
||||
# We use torch 2.0.0 and torchaduio 2.0.0 in this tutorial.
|
||||
# Other versions should also work.
|
||||
|
||||
pip install torch==2.0.0+cpu torchaudio==2.0.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
|
||||
|
||||
# If you are using macOS or Windows, please use the following command to install torch and torchaudio
|
||||
# pip install torch==2.0.0 torchaudio==2.0.0 -f https://download.pytorch.org/whl/torch_stable.html
|
||||
|
||||
# Now install k2
|
||||
# Please refer to https://k2-fsa.github.io/k2/installation/from_wheels.html#linux-cpu-example
|
||||
|
||||
pip install k2==1.24.3.dev20230726+cpu.torch2.0.0 -f https://k2-fsa.github.io/k2/cpu.html
|
||||
|
||||
# Install the latest version of lhotse
|
||||
|
||||
pip install git+https://github.com/lhotse-speech/lhotse
|
||||
|
||||
|
||||
Install icefall
|
||||
---------------
|
||||
|
||||
We will put the source code of `icefall`_ into the directory ``/tmp``
|
||||
You can select any directory you want.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd /tmp
|
||||
git clone https://github.com/k2-fsa/icefall
|
||||
cd icefall
|
||||
pip install -r ./requirements.txt
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# Anytime we want to use icefall, we have to set the following
|
||||
# environment variable
|
||||
|
||||
export PYTHONPATH=/tmp/icefall:$PYTHONPATH
|
||||
|
||||
.. hint::
|
||||
|
||||
If you get the following error during this tutorial:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ModuleNotFoundError: No module named 'icefall'
|
||||
|
||||
please set the above environment variable to fix it.
|
||||
|
||||
|
||||
Congratulations! You have installed `icefall`_ successfully.
|
||||
|
||||
For the more curious
|
||||
--------------------
|
||||
|
||||
`icefall`_ contains a collection of Python scripts and you don't need to
|
||||
use ``python3 setup.py install`` or ``pip install icefall`` to install it.
|
||||
All you need to do is to download the code and set the environment variable
|
||||
``PYTHONPATH``.
|
34
docs/source/for-dummies/index.rst
Normal file
34
docs/source/for-dummies/index.rst
Normal file
@ -0,0 +1,34 @@
|
||||
Icefall for dummies tutorial
|
||||
============================
|
||||
|
||||
This tutorial walks you step by step about how to create a simple
|
||||
ASR (`Automatic Speech Recognition <https://en.wikipedia.org/wiki/Speech_recognition>`_)
|
||||
system with `Next-gen Kaldi`_.
|
||||
|
||||
We use the `yesno`_ dataset for demonstration. We select it out of two reasons:
|
||||
|
||||
- It is quite tiny, containing only about 12 minutes of data
|
||||
- The training can be finished within 20 seconds on ``CPU``.
|
||||
|
||||
That also means you don't need a ``GPU`` to run this tutorial.
|
||||
|
||||
Let's get started!
|
||||
|
||||
Please follow items below **sequentially**.
|
||||
|
||||
.. note::
|
||||
|
||||
The :ref:`dummies_tutorial_data_preparation` runs only on Linux and on macOS.
|
||||
All other parts run on Linux, macOS, and Windows.
|
||||
|
||||
Help from the community is appreciated to port the :ref:`dummies_tutorial_data_preparation`
|
||||
to Windows.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
./environment-setup.rst
|
||||
./data-preparation.rst
|
||||
./training.rst
|
||||
./decoding.rst
|
||||
./model-export.rst
|
310
docs/source/for-dummies/model-export.rst
Normal file
310
docs/source/for-dummies/model-export.rst
Normal file
@ -0,0 +1,310 @@
|
||||
Model Export
|
||||
============
|
||||
|
||||
There are three ways to export a pre-trained model.
|
||||
|
||||
- Export the model parameters via `model.state_dict() <https://pytorch.org/docs/stable/generated/torch.nn.Module.html?highlight=load_state_dict#torch.nn.Module.state_dict>`_
|
||||
- Export via `torchscript <https://pytorch.org/docs/stable/jit.html>`_: either `torch.jit.script() <https://pytorch.org/docs/stable/generated/torch.jit.script.html#torch.jit.script>`_ or `torch.jit.trace() <https://pytorch.org/docs/stable/generated/torch.jit.trace.html>`_
|
||||
- Export to `ONNX`_ via `torch.onnx.export() <https://pytorch.org/docs/stable/onnx.html>`_
|
||||
|
||||
Each method is explained below in detail.
|
||||
|
||||
Export the model parameters via model.state_dict()
|
||||
---------------------------------------------------
|
||||
|
||||
The command for this kind of export is
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd /tmp/icefall
|
||||
export PYTHONPATH=/tmp/icefall:$PYTHONPATH
|
||||
cd egs/yesno/ASR
|
||||
|
||||
# assume that "--epoch 14 --avg 2" produces the lowest WER.
|
||||
|
||||
./tdnn/export.py --epoch 14 --avg 2
|
||||
|
||||
The output logs are given below:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
2023-08-16 20:42:03,912 INFO [export.py:76] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01, 'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0, 'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'reset_interval': 20, 'valid_interval': 10, 'beam_size': 10, 'reduction': 'sum', 'use_double_scores': True, 'epoch': 14, 'avg': 2, 'jit': False}
|
||||
2023-08-16 20:42:03,913 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
|
||||
2023-08-16 20:42:03,950 INFO [export.py:93] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
|
||||
2023-08-16 20:42:03,971 INFO [export.py:106] Not using torch.jit.script
|
||||
2023-08-16 20:42:03,974 INFO [export.py:111] Saved to tdnn/exp/pretrained.pt
|
||||
|
||||
We can see from the logs that the exported model is saved to the file ``tdnn/exp/pretrained.pt``.
|
||||
|
||||
To give you an idea of what ``tdnn/exp/pretrained.pt`` contains, we can use the following command:
|
||||
|
||||
.. code-block:: python3
|
||||
|
||||
>>> import torch
|
||||
>>> m = torch.load("tdnn/exp/pretrained.pt")
|
||||
>>> list(m.keys())
|
||||
['model']
|
||||
>>> list(m["model"].keys())
|
||||
['tdnn.0.weight', 'tdnn.0.bias', 'tdnn.2.running_mean', 'tdnn.2.running_var', 'tdnn.2.num_batches_tracked', 'tdnn.3.weight', 'tdnn.3.bias', 'tdnn.5.running_mean', 'tdnn.5.running_var', 'tdnn.5.num_batches_tracked', 'tdnn.6.weight', 'tdnn.6.bias', 'tdnn.8.running_mean', 'tdnn.8.running_var', 'tdnn.8.num_batches_tracked', 'output_linear.weight', 'output_linear.bias']
|
||||
|
||||
We can use ``tdnn/exp/pretrained.pt`` in the following way with ``./tdnn/decode.py``:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd tdnn/exp
|
||||
ln -s pretrained.pt epoch-99.pt
|
||||
cd ../..
|
||||
|
||||
./tdnn/decode.py --epoch 99 --avg 1
|
||||
|
||||
The output logs of the above command are given below:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
2023-08-16 20:45:48,089 INFO [decode.py:262] Decoding started
|
||||
2023-08-16 20:45:48,090 INFO [decode.py:263] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'feature_dim': 23, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'epoch': 99, 'avg': 1, 'export': False, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True, 'num_workers': 2, 'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': False, 'k2-git-sha1': 'ad79f1c699c684de9785ed6ca5edb805a41f78c3', 'k2-git-date': 'Wed Jul 26 09:30:42 2023', 'lhotse-version': '1.16.0.dev+git.aa073f6.clean', 'torch-version': '2.0.0', 'torch-cuda-available': False, 'torch-cuda-version': None, 'python-version': '3.1', 'icefall-git-branch': 'master', 'icefall-git-sha1': '9a47c08-clean', 'icefall-git-date': 'Mon Aug 14 22:10:50 2023', 'icefall-path': '/private/tmp/icefall', 'k2-path': '/private/tmp/icefall_env/lib/python3.11/site-packages/k2/__init__.py', 'lhotse-path': '/private/tmp/icefall_env/lib/python3.11/site-packages/lhotse/__init__.py', 'hostname': 'fangjuns-MacBook-Pro.local', 'IP address': '127.0.0.1'}}
|
||||
2023-08-16 20:45:48,092 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
|
||||
2023-08-16 20:45:48,103 INFO [decode.py:272] device: cpu
|
||||
2023-08-16 20:45:48,109 INFO [checkpoint.py:112] Loading checkpoint from tdnn/exp/epoch-99.pt
|
||||
2023-08-16 20:45:48,115 INFO [asr_datamodule.py:218] About to get test cuts
|
||||
2023-08-16 20:45:48,115 INFO [asr_datamodule.py:253] About to get test cuts
|
||||
2023-08-16 20:45:50,386 INFO [decode.py:203] batch 0/?, cuts processed until now is 4
|
||||
2023-08-16 20:45:50,556 INFO [decode.py:240] The transcripts are stored in tdnn/exp/recogs-test_set.txt
|
||||
2023-08-16 20:45:50,557 INFO [utils.py:564] [test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
|
||||
2023-08-16 20:45:50,558 INFO [decode.py:248] Wrote detailed error stats to tdnn/exp/errs-test_set.txt
|
||||
2023-08-16 20:45:50,559 INFO [decode.py:315] Done!
|
||||
|
||||
We can see that it produces an identical WER as before.
|
||||
|
||||
We can also use it to decode files with the following command:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# ./tdnn/pretrained.py requires kaldifeat
|
||||
#
|
||||
# Please refer to https://csukuangfj.github.io/kaldifeat/installation/from_wheels.html
|
||||
# for how to install kaldifeat
|
||||
|
||||
pip install kaldifeat==1.25.0.dev20230726+cpu.torch2.0.0 -f https://csukuangfj.github.io/kaldifeat/cpu.html
|
||||
|
||||
./tdnn/pretrained.py \
|
||||
--checkpoint ./tdnn/exp/pretrained.pt \
|
||||
--HLG ./data/lang_phone/HLG.pt \
|
||||
--words-file ./data/lang_phone/words.txt \
|
||||
download/waves_yesno/0_0_0_1_0_0_0_1.wav \
|
||||
download/waves_yesno/0_0_1_0_0_0_1_0.wav
|
||||
|
||||
The output is given below:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
2023-08-16 20:53:19,208 INFO [pretrained.py:136] {'feature_dim': 23, 'num_classes': 4, 'sample_rate': 8000, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'checkpoint': './tdnn/exp/pretrained.pt', 'words_file': './data/lang_phone/words.txt', 'HLG': './data/lang_phone/HLG.pt', 'sound_files': ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']}
|
||||
2023-08-16 20:53:19,208 INFO [pretrained.py:142] device: cpu
|
||||
2023-08-16 20:53:19,208 INFO [pretrained.py:144] Creating model
|
||||
2023-08-16 20:53:19,212 INFO [pretrained.py:156] Loading HLG from ./data/lang_phone/HLG.pt
|
||||
2023-08-16 20:53:19,213 INFO [pretrained.py:160] Constructing Fbank computer
|
||||
2023-08-16 20:53:19,213 INFO [pretrained.py:170] Reading sound files: ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']
|
||||
2023-08-16 20:53:19,224 INFO [pretrained.py:176] Decoding started
|
||||
2023-08-16 20:53:19,304 INFO [pretrained.py:212]
|
||||
download/waves_yesno/0_0_0_1_0_0_0_1.wav:
|
||||
NO NO NO YES NO NO NO YES
|
||||
|
||||
download/waves_yesno/0_0_1_0_0_0_1_0.wav:
|
||||
NO NO YES NO NO NO YES NO
|
||||
|
||||
|
||||
2023-08-16 20:53:19,304 INFO [pretrained.py:214] Decoding Done
|
||||
|
||||
|
||||
Export via torch.jit.script()
|
||||
-----------------------------
|
||||
|
||||
The command for this kind of export is
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd /tmp/icefall
|
||||
export PYTHONPATH=/tmp/icefall:$PYTHONPATH
|
||||
cd egs/yesno/ASR
|
||||
|
||||
# assume that "--epoch 14 --avg 2" produces the lowest WER.
|
||||
|
||||
./tdnn/export.py --epoch 14 --avg 2 --jit true
|
||||
|
||||
The output logs are given below:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
2023-08-16 20:47:44,666 INFO [export.py:76] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01, 'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0, 'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'reset_interval': 20, 'valid_interval': 10, 'beam_size': 10, 'reduction': 'sum', 'use_double_scores': True, 'epoch': 14, 'avg': 2, 'jit': True}
|
||||
2023-08-16 20:47:44,667 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
|
||||
2023-08-16 20:47:44,670 INFO [export.py:93] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
|
||||
2023-08-16 20:47:44,677 INFO [export.py:100] Using torch.jit.script
|
||||
2023-08-16 20:47:44,843 INFO [export.py:104] Saved to tdnn/exp/cpu_jit.pt
|
||||
|
||||
From the output logs we can see that the generated file is saved to ``tdnn/exp/cpu_jit.pt``.
|
||||
|
||||
Don't be confused by the name ``cpu_jit.pt``. The ``cpu`` part means the model is moved to
|
||||
CPU before exporting. That means, when you load it with:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
torch.jit.load()
|
||||
|
||||
you don't need to specify the argument `map_location <https://pytorch.org/docs/stable/generated/torch.jit.load.html#torch.jit.load>`_
|
||||
and it resides on CPU by default.
|
||||
|
||||
To use ``tdnn/exp/cpu_jit.pt`` with `icefall`_ to decode files, we can use:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# ./tdnn/jit_pretrained.py requires kaldifeat
|
||||
#
|
||||
# Please refer to https://csukuangfj.github.io/kaldifeat/installation/from_wheels.html
|
||||
# for how to install kaldifeat
|
||||
|
||||
pip install kaldifeat==1.25.0.dev20230726+cpu.torch2.0.0 -f https://csukuangfj.github.io/kaldifeat/cpu.html
|
||||
|
||||
|
||||
./tdnn/jit_pretrained.py \
|
||||
--nn-model ./tdnn/exp/cpu_jit.pt \
|
||||
--HLG ./data/lang_phone/HLG.pt \
|
||||
--words-file ./data/lang_phone/words.txt \
|
||||
download/waves_yesno/0_0_0_1_0_0_0_1.wav \
|
||||
download/waves_yesno/0_0_1_0_0_0_1_0.wav
|
||||
|
||||
The output is given below:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
2023-08-16 20:56:00,603 INFO [jit_pretrained.py:121] {'feature_dim': 23, 'num_classes': 4, 'sample_rate': 8000, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'nn_model': './tdnn/exp/cpu_jit.pt', 'words_file': './data/lang_phone/words.txt', 'HLG': './data/lang_phone/HLG.pt', 'sound_files': ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']}
|
||||
2023-08-16 20:56:00,603 INFO [jit_pretrained.py:127] device: cpu
|
||||
2023-08-16 20:56:00,603 INFO [jit_pretrained.py:129] Loading torchscript model
|
||||
2023-08-16 20:56:00,640 INFO [jit_pretrained.py:134] Loading HLG from ./data/lang_phone/HLG.pt
|
||||
2023-08-16 20:56:00,641 INFO [jit_pretrained.py:138] Constructing Fbank computer
|
||||
2023-08-16 20:56:00,641 INFO [jit_pretrained.py:148] Reading sound files: ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']
|
||||
2023-08-16 20:56:00,642 INFO [jit_pretrained.py:154] Decoding started
|
||||
2023-08-16 20:56:00,727 INFO [jit_pretrained.py:190]
|
||||
download/waves_yesno/0_0_0_1_0_0_0_1.wav:
|
||||
NO NO NO YES NO NO NO YES
|
||||
|
||||
download/waves_yesno/0_0_1_0_0_0_1_0.wav:
|
||||
NO NO YES NO NO NO YES NO
|
||||
|
||||
|
||||
2023-08-16 20:56:00,727 INFO [jit_pretrained.py:192] Decoding Done
|
||||
|
||||
.. hint::
|
||||
|
||||
We provide only code for ``torch.jit.script()``. You can try ``torch.jit.trace()``
|
||||
if you want.
|
||||
|
||||
Export via torch.onnx.export()
|
||||
------------------------------
|
||||
|
||||
The command for this kind of export is
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd /tmp/icefall
|
||||
export PYTHONPATH=/tmp/icefall:$PYTHONPATH
|
||||
cd egs/yesno/ASR
|
||||
|
||||
# tdnn/export_onnx.py requires onnx and onnxruntime
|
||||
pip install onnx onnxruntime
|
||||
|
||||
# assume that "--epoch 14 --avg 2" produces the lowest WER.
|
||||
|
||||
./tdnn/export_onnx.py \
|
||||
--epoch 14 \
|
||||
--avg 2
|
||||
|
||||
The output logs are given below:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
2023-08-16 20:59:20,888 INFO [export_onnx.py:83] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01, 'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0, 'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'reset_interval': 20, 'valid_interval': 10, 'beam_size': 10, 'reduction': 'sum', 'use_double_scores': True, 'epoch': 14, 'avg': 2}
|
||||
2023-08-16 20:59:20,888 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
|
||||
2023-08-16 20:59:20,892 INFO [export_onnx.py:100] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
|
||||
================ Diagnostic Run torch.onnx.export version 2.0.0 ================
|
||||
verbose: False, log level: Level.ERROR
|
||||
======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================
|
||||
|
||||
2023-08-16 20:59:21,047 INFO [export_onnx.py:127] Saved to tdnn/exp/model-epoch-14-avg-2.onnx
|
||||
2023-08-16 20:59:21,047 INFO [export_onnx.py:136] meta_data: {'model_type': 'tdnn', 'version': '1', 'model_author': 'k2-fsa', 'comment': 'non-streaming tdnn for the yesno recipe', 'vocab_size': 4}
|
||||
2023-08-16 20:59:21,049 INFO [export_onnx.py:140] Generate int8 quantization models
|
||||
2023-08-16 20:59:21,075 INFO [onnx_quantizer.py:538] Quantization parameters for tensor:"/Transpose_1_output_0" not specified
|
||||
2023-08-16 20:59:21,081 INFO [export_onnx.py:151] Saved to tdnn/exp/model-epoch-14-avg-2.int8.onnx
|
||||
|
||||
We can see from the logs that it generates two files:
|
||||
|
||||
- ``tdnn/exp/model-epoch-14-avg-2.onnx`` (ONNX model with ``float32`` weights)
|
||||
- ``tdnn/exp/model-epoch-14-avg-2.int8.onnx`` (ONNX model with ``int8`` weights)
|
||||
|
||||
To use the generated ONNX model files for decoding with `onnxruntime`_, we can use
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# ./tdnn/onnx_pretrained.py requires kaldifeat
|
||||
#
|
||||
# Please refer to https://csukuangfj.github.io/kaldifeat/installation/from_wheels.html
|
||||
# for how to install kaldifeat
|
||||
|
||||
pip install kaldifeat==1.25.0.dev20230726+cpu.torch2.0.0 -f https://csukuangfj.github.io/kaldifeat/cpu.html
|
||||
|
||||
./tdnn/onnx_pretrained.py \
|
||||
--nn-model ./tdnn/exp/model-epoch-14-avg-2.onnx \
|
||||
--HLG ./data/lang_phone/HLG.pt \
|
||||
--words-file ./data/lang_phone/words.txt \
|
||||
download/waves_yesno/0_0_0_1_0_0_0_1.wav \
|
||||
download/waves_yesno/0_0_1_0_0_0_1_0.wav
|
||||
|
||||
The output is given below:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
2023-08-16 21:03:24,260 INFO [onnx_pretrained.py:166] {'feature_dim': 23, 'sample_rate': 8000, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'nn_model': './tdnn/exp/model-epoch-14-avg-2.onnx', 'words_file': './data/lang_phone/words.txt', 'HLG': './data/lang_phone/HLG.pt', 'sound_files': ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']}
|
||||
2023-08-16 21:03:24,260 INFO [onnx_pretrained.py:171] device: cpu
|
||||
2023-08-16 21:03:24,260 INFO [onnx_pretrained.py:173] Loading onnx model ./tdnn/exp/model-epoch-14-avg-2.onnx
|
||||
2023-08-16 21:03:24,267 INFO [onnx_pretrained.py:176] Loading HLG from ./data/lang_phone/HLG.pt
|
||||
2023-08-16 21:03:24,270 INFO [onnx_pretrained.py:180] Constructing Fbank computer
|
||||
2023-08-16 21:03:24,273 INFO [onnx_pretrained.py:190] Reading sound files: ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']
|
||||
2023-08-16 21:03:24,279 INFO [onnx_pretrained.py:196] Decoding started
|
||||
2023-08-16 21:03:24,318 INFO [onnx_pretrained.py:232]
|
||||
download/waves_yesno/0_0_0_1_0_0_0_1.wav:
|
||||
NO NO NO YES NO NO NO YES
|
||||
|
||||
download/waves_yesno/0_0_1_0_0_0_1_0.wav:
|
||||
NO NO YES NO NO NO YES NO
|
||||
|
||||
|
||||
2023-08-16 21:03:24,318 INFO [onnx_pretrained.py:234] Decoding Done
|
||||
|
||||
.. note::
|
||||
|
||||
To use the ``int8`` ONNX model for decoding, please use:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./tdnn/onnx_pretrained.py \
|
||||
--nn-model ./tdnn/exp/model-epoch-14-avg-2.onnx \
|
||||
--HLG ./data/lang_phone/HLG.pt \
|
||||
--words-file ./data/lang_phone/words.txt \
|
||||
download/waves_yesno/0_0_0_1_0_0_0_1.wav \
|
||||
download/waves_yesno/0_0_1_0_0_0_1_0.wav
|
||||
|
||||
For the more curious
|
||||
--------------------
|
||||
|
||||
If you are wondering how to deploy the model without ``torch``, please
|
||||
continue reading. We will show how to use `sherpa-onnx`_ to run the
|
||||
exported ONNX models, which depends only on `onnxruntime`_ and does not
|
||||
depend on ``torch``.
|
||||
|
||||
In this tutorial, we will only demonstrate the usage of `sherpa-onnx`_ with the
|
||||
pre-trained model of the `yesno`_ recipe. There are also other two frameworks
|
||||
available:
|
||||
|
||||
- `sherpa`_. It works with torchscript models.
|
||||
- `sherpa-ncnn`_. It works with models exported using :ref:`icefall_export_to_ncnn` with `ncnn`_
|
||||
|
||||
Please see `<https://k2-fsa.github.io/sherpa/>`_ for further details.
|
39
docs/source/for-dummies/training.rst
Normal file
39
docs/source/for-dummies/training.rst
Normal file
@ -0,0 +1,39 @@
|
||||
.. _dummies_tutorial_training:
|
||||
|
||||
Training
|
||||
========
|
||||
|
||||
After :ref:`dummies_tutorial_data_preparation`, we can start training.
|
||||
|
||||
The command to start the training is quite simple:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd /tmp/icefall
|
||||
export PYTHONPATH=/tmp/icefall:$PYTHONPATH
|
||||
cd egs/yesno/ASR
|
||||
|
||||
# We use CPU for training by setting the following environment variable
|
||||
export CUDA_VISIBLE_DEVICES=""
|
||||
|
||||
./tdnn/train.py
|
||||
|
||||
That's it!
|
||||
|
||||
You can find the training logs below:
|
||||
|
||||
.. literalinclude:: ./code/train-yesno.txt
|
||||
|
||||
For the more curious
|
||||
--------------------
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./tdnn/train.py --help
|
||||
|
||||
will print the usage information about ``./tdnn/train.py``. For instance, you
|
||||
can specify the number of epochs to train and the location to save the training
|
||||
results.
|
||||
|
||||
The training text logs are saved in ``tdnn/exp/log`` while the tensorboard
|
||||
logs are in ``tdnn/exp/tensorboard``.
|
@ -20,10 +20,13 @@ speech recognition recipes using `k2 <https://github.com/k2-fsa/k2>`_.
|
||||
:maxdepth: 2
|
||||
:caption: Contents:
|
||||
|
||||
for-dummies/index.rst
|
||||
installation/index
|
||||
docker/index
|
||||
faqs
|
||||
model-export/index
|
||||
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 3
|
||||
|
||||
|
@ -3,40 +3,28 @@
|
||||
Installation
|
||||
============
|
||||
|
||||
.. hint::
|
||||
|
||||
We also provide :ref:`icefall_docker` support, which has already setup
|
||||
the environment for you.
|
||||
|
||||
``icefall`` depends on `k2 <https://github.com/k2-fsa/k2>`_ and
|
||||
`lhotse <https://github.com/lhotse-speech/lhotse>`_.
|
||||
.. hint::
|
||||
|
||||
We have a colab notebook guiding you step by step to setup the environment.
|
||||
|
||||
|yesno colab notebook|
|
||||
|
||||
.. |yesno colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg
|
||||
:target: https://colab.research.google.com/drive/1tIjjzaJc3IvGyKiMCDWO-TSnBgkcuN3B?usp=sharing
|
||||
|
||||
`icefall`_ depends on `k2`_ and `lhotse`_.
|
||||
|
||||
We recommend that you use the following steps to install the dependencies.
|
||||
|
||||
- (0) Install CUDA toolkit and cuDNN
|
||||
- (1) Install PyTorch and torchaudio
|
||||
- (2) Install k2
|
||||
- (3) Install lhotse
|
||||
|
||||
.. caution::
|
||||
|
||||
99% users who have issues about the installation are using conda.
|
||||
|
||||
.. caution::
|
||||
|
||||
99% users who have issues about the installation are using conda.
|
||||
|
||||
.. caution::
|
||||
|
||||
99% users who have issues about the installation are using conda.
|
||||
|
||||
.. hint::
|
||||
|
||||
We suggest that you use ``pip install`` to install PyTorch.
|
||||
|
||||
You can use the following command to create a virutal environment in Python:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python3 -m venv ./my_env
|
||||
source ./my_env/bin/activate
|
||||
- (1) Install `torch`_ and `torchaudio`_
|
||||
- (2) Install `k2`_
|
||||
- (3) Install `lhotse`_
|
||||
|
||||
.. caution::
|
||||
|
||||
@ -50,27 +38,20 @@ Please refer to
|
||||
to install CUDA and cuDNN.
|
||||
|
||||
|
||||
(1) Install PyTorch and torchaudio
|
||||
----------------------------------
|
||||
(1) Install torch and torchaudio
|
||||
--------------------------------
|
||||
|
||||
Please refer `<https://pytorch.org/>`_ to install PyTorch
|
||||
and torchaudio.
|
||||
|
||||
.. hint::
|
||||
|
||||
You can also go to `<https://download.pytorch.org/whl/torch_stable.html>`_
|
||||
to download pre-compiled wheels and install them.
|
||||
Please refer `<https://pytorch.org/>`_ to install `torch`_ and `torchaudio`_.
|
||||
|
||||
.. caution::
|
||||
|
||||
Please install torch and torchaudio at the same time.
|
||||
|
||||
|
||||
(2) Install k2
|
||||
--------------
|
||||
|
||||
Please refer to `<https://k2-fsa.github.io/k2/installation/index.html>`_
|
||||
to install ``k2``.
|
||||
to install `k2`_.
|
||||
|
||||
.. caution::
|
||||
|
||||
@ -78,21 +59,18 @@ to install ``k2``.
|
||||
|
||||
.. note::
|
||||
|
||||
We suggest that you install k2 from source by following
|
||||
`<https://k2-fsa.github.io/k2/installation/from_source.html>`_
|
||||
or
|
||||
`<https://k2-fsa.github.io/k2/installation/for_developers.html>`_.
|
||||
We suggest that you install k2 from pre-compiled wheels by following
|
||||
`<https://k2-fsa.github.io/k2/installation/from_wheels.html>`_
|
||||
|
||||
.. hint::
|
||||
|
||||
Please always install the latest version of k2.
|
||||
Please always install the latest version of `k2`_.
|
||||
|
||||
(3) Install lhotse
|
||||
------------------
|
||||
|
||||
Please refer to `<https://lhotse.readthedocs.io/en/latest/getting-started.html#installation>`_
|
||||
to install ``lhotse``.
|
||||
|
||||
to install `lhotse`_.
|
||||
|
||||
.. hint::
|
||||
|
||||
@ -100,17 +78,16 @@ to install ``lhotse``.
|
||||
|
||||
pip install git+https://github.com/lhotse-speech/lhotse
|
||||
|
||||
to install the latest version of lhotse.
|
||||
to install the latest version of `lhotse`_.
|
||||
|
||||
(4) Download icefall
|
||||
--------------------
|
||||
|
||||
``icefall`` is a collection of Python scripts; what you need is to download it
|
||||
`icefall`_ is a collection of Python scripts; what you need is to download it
|
||||
and set the environment variable ``PYTHONPATH`` to point to it.
|
||||
|
||||
Assume you want to place ``icefall`` in the folder ``/tmp``. The
|
||||
following commands show you how to setup ``icefall``:
|
||||
|
||||
Assume you want to place `icefall`_ in the folder ``/tmp``. The
|
||||
following commands show you how to setup `icefall`_:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
@ -122,285 +99,334 @@ following commands show you how to setup ``icefall``:
|
||||
|
||||
.. HINT::
|
||||
|
||||
You can put several versions of ``icefall`` in the same virtual environment.
|
||||
To switch among different versions of ``icefall``, just set ``PYTHONPATH``
|
||||
You can put several versions of `icefall`_ in the same virtual environment.
|
||||
To switch among different versions of `icefall`_, just set ``PYTHONPATH``
|
||||
to point to the version you want.
|
||||
|
||||
|
||||
Installation example
|
||||
--------------------
|
||||
|
||||
The following shows an example about setting up the environment.
|
||||
|
||||
|
||||
(1) Create a virtual environment
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ virtualenv -p python3.8 test-icefall
|
||||
kuangfangjun:~$ virtualenv -p python3.8 test-icefall
|
||||
created virtual environment CPython3.8.0.final.0-64 in 9422ms
|
||||
creator CPython3Posix(dest=/star-fj/fangjun/test-icefall, clear=False, no_vcs_ignore=False, global=False)
|
||||
seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=/star-fj/fangjun/.local/share/virtualenv)
|
||||
added seed packages: pip==22.3.1, setuptools==65.6.3, wheel==0.38.4
|
||||
activators BashActivator,CShellActivator,FishActivator,NushellActivator,PowerShellActivator,PythonActivator
|
||||
|
||||
created virtual environment CPython3.8.6.final.0-64 in 1540ms
|
||||
creator CPython3Posix(dest=/ceph-fj/fangjun/test-icefall, clear=False, no_vcs_ignore=False, global=False)
|
||||
seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=/root/fangjun/.local/share/v
|
||||
irtualenv)
|
||||
added seed packages: pip==21.1.3, setuptools==57.4.0, wheel==0.36.2
|
||||
activators BashActivator,CShellActivator,FishActivator,PowerShellActivator,PythonActivator,XonshActivator
|
||||
kuangfangjun:~$ source test-icefall/bin/activate
|
||||
|
||||
(test-icefall) kuangfangjun:~$
|
||||
|
||||
(2) Activate your virtual environment
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
(2) Install CUDA toolkit and cuDNN
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
You need to determine the version of CUDA toolkit to install.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ source test-icefall/bin/activate
|
||||
(test-icefall) kuangfangjun:~$ nvidia-smi | head -n 4
|
||||
|
||||
(3) Install k2
|
||||
Wed Jul 26 21:57:49 2023
|
||||
+-----------------------------------------------------------------------------+
|
||||
| NVIDIA-SMI 510.47.03 Driver Version: 510.47.03 CUDA Version: 11.6 |
|
||||
|-------------------------------+----------------------+----------------------+
|
||||
|
||||
You can choose any CUDA version that is ``not`` greater than the version printed by ``nvidia-smi``.
|
||||
In our case, we can choose any version ``<= 11.6``.
|
||||
|
||||
We will use ``CUDA 11.6`` in this example. Please follow
|
||||
`<https://k2-fsa.github.io/k2/installation/cuda-cudnn.html#cuda-11-6>`_
|
||||
to install CUDA toolkit and cuDNN if you have not done that before.
|
||||
|
||||
After installing CUDA toolkit, you can use the following command to verify it:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
(test-icefall) kuangfangjun:~$ nvcc --version
|
||||
|
||||
nvcc: NVIDIA (R) Cuda compiler driver
|
||||
Copyright (c) 2005-2019 NVIDIA Corporation
|
||||
Built on Wed_Oct_23_19:24:38_PDT_2019
|
||||
Cuda compilation tools, release 10.2, V10.2.89
|
||||
|
||||
(3) Install torch and torchaudio
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Since we have selected CUDA toolkit ``11.6``, we have to install a version of `torch`_
|
||||
that is compiled against CUDA ``11.6``. We select ``torch 1.13.0+cu116`` in this
|
||||
example.
|
||||
|
||||
After selecting the version of `torch`_ to install, we need to also install
|
||||
a compatible version of `torchaudio`_, which is ``0.13.0+cu116`` in our case.
|
||||
|
||||
Please refer to `<https://pytorch.org/audio/stable/installation.html#compatibility-matrix>`_
|
||||
to select an appropriate version of `torchaudio`_ to install if you use a different
|
||||
version of `torch`_.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
(test-icefall) kuangfangjun:~$ pip install torch==1.13.0+cu116 torchaudio==0.13.0+cu116 -f https://download.pytorch.org/whl/torch_stable.html
|
||||
|
||||
Looking in links: https://download.pytorch.org/whl/torch_stable.html
|
||||
Collecting torch==1.13.0+cu116
|
||||
Downloading https://download.pytorch.org/whl/cu116/torch-1.13.0%2Bcu116-cp38-cp38-linux_x86_64.whl (1983.0 MB)
|
||||
________________________________________ 2.0/2.0 GB 764.4 kB/s eta 0:00:00
|
||||
Collecting torchaudio==0.13.0+cu116
|
||||
Downloading https://download.pytorch.org/whl/cu116/torchaudio-0.13.0%2Bcu116-cp38-cp38-linux_x86_64.whl (4.2 MB)
|
||||
________________________________________ 4.2/4.2 MB 1.3 MB/s eta 0:00:00
|
||||
Requirement already satisfied: typing-extensions in /star-fj/fangjun/test-icefall/lib/python3.8/site-packages (from torch==1.13.0+cu116) (4.7.1)
|
||||
Installing collected packages: torch, torchaudio
|
||||
Successfully installed torch-1.13.0+cu116 torchaudio-0.13.0+cu116
|
||||
|
||||
Verify that `torch`_ and `torchaudio`_ are successfully installed:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
(test-icefall) kuangfangjun:~$ python3 -c "import torch; print(torch.__version__)"
|
||||
|
||||
1.13.0+cu116
|
||||
|
||||
(test-icefall) kuangfangjun:~$ python3 -c "import torchaudio; print(torchaudio.__version__)"
|
||||
|
||||
0.13.0+cu116
|
||||
|
||||
(4) Install k2
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
We will install `k2`_ from pre-compiled wheels by following
|
||||
`<https://k2-fsa.github.io/k2/installation/from_wheels.html>`_
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ pip install k2==1.4.dev20210822+cpu.torch1.9.0 -f https://k2-fsa.org/nightly/index.html
|
||||
(test-icefall) kuangfangjun:~$ pip install k2==1.24.3.dev20230725+cuda11.6.torch1.13.0 -f https://k2-fsa.github.io/k2/cuda.html
|
||||
|
||||
Looking in links: https://k2-fsa.org/nightly/index.html
|
||||
Collecting k2==1.4.dev20210822+cpu.torch1.9.0
|
||||
Downloading https://k2-fsa.org/nightly/whl/k2-1.4.dev20210822%2Bcpu.torch1.9.0-cp38-cp38-linux_x86_64.whl (1.6 MB)
|
||||
|________________________________| 1.6 MB 185 kB/s
|
||||
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
Looking in links: https://k2-fsa.github.io/k2/cuda.html
|
||||
Collecting k2==1.24.3.dev20230725+cuda11.6.torch1.13.0
|
||||
Downloading https://huggingface.co/csukuangfj/k2/resolve/main/ubuntu-cuda/k2-1.24.3.dev20230725%2Bcuda11.6.torch1.13.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (104.3 MB)
|
||||
________________________________________ 104.3/104.3 MB 5.1 MB/s eta 0:00:00
|
||||
Requirement already satisfied: torch==1.13.0 in /star-fj/fangjun/test-icefall/lib/python3.8/site-packages (from k2==1.24.3.dev20230725+cuda11.6.torch1.13.0) (1.13.0+cu116)
|
||||
Collecting graphviz
|
||||
Downloading graphviz-0.17-py3-none-any.whl (18 kB)
|
||||
Collecting torch==1.9.0
|
||||
Using cached torch-1.9.0-cp38-cp38-manylinux1_x86_64.whl (831.4 MB)
|
||||
Collecting typing-extensions
|
||||
Using cached typing_extensions-3.10.0.0-py3-none-any.whl (26 kB)
|
||||
Installing collected packages: typing-extensions, torch, graphviz, k2
|
||||
Successfully installed graphviz-0.17 k2-1.4.dev20210822+cpu.torch1.9.0 torch-1.9.0 typing-extensions-3.10.0.0
|
||||
Using cached https://pypi.tuna.tsinghua.edu.cn/packages/de/5e/fcbb22c68208d39edff467809d06c9d81d7d27426460ebc598e55130c1aa/graphviz-0.20.1-py3-none-any.whl (47 kB)
|
||||
Requirement already satisfied: typing-extensions in /star-fj/fangjun/test-icefall/lib/python3.8/site-packages (from torch==1.13.0->k2==1.24.3.dev20230725+cuda11.6.torch1.13.0) (4.7.1)
|
||||
Installing collected packages: graphviz, k2
|
||||
Successfully installed graphviz-0.20.1 k2-1.24.3.dev20230725+cuda11.6.torch1.13.0
|
||||
|
||||
.. WARNING::
|
||||
.. hint::
|
||||
|
||||
We choose to install a CPU version of k2 for testing. You would probably want to install
|
||||
a CUDA version of k2.
|
||||
Please refer to `<https://k2-fsa.github.io/k2/cuda.html>`_ for the available
|
||||
pre-compiled wheels about `k2`_.
|
||||
|
||||
Verify that `k2`_ has been installed successfully:
|
||||
|
||||
(4) Install lhotse
|
||||
.. code-block:: bash
|
||||
|
||||
(test-icefall) kuangfangjun:~$ python3 -m k2.version
|
||||
|
||||
Collecting environment information...
|
||||
|
||||
k2 version: 1.24.3
|
||||
Build type: Release
|
||||
Git SHA1: 4c05309499a08454997adf500b56dcc629e35ae5
|
||||
Git date: Tue Jul 25 16:23:36 2023
|
||||
Cuda used to build k2: 11.6
|
||||
cuDNN used to build k2: 8.3.2
|
||||
Python version used to build k2: 3.8
|
||||
OS used to build k2: CentOS Linux release 7.9.2009 (Core)
|
||||
CMake version: 3.27.0
|
||||
GCC version: 9.3.1
|
||||
CMAKE_CUDA_FLAGS: -Wno-deprecated-gpu-targets -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w --expt-extended-lambda -gencode arch=compute_35,code=sm_35 -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w --expt-extended-lambda -gencode arch=compute_50,code=sm_50 -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w --expt-extended-lambda -gencode arch=compute_60,code=sm_60 -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w --expt-extended-lambda -gencode arch=compute_61,code=sm_61 -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w --expt-extended-lambda -gencode arch=compute_70,code=sm_70 -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w --expt-extended-lambda -gencode arch=compute_75,code=sm_75 -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w --expt-extended-lambda -gencode arch=compute_80,code=sm_80 -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w --expt-extended-lambda -gencode arch=compute_86,code=sm_86 -DONNX_NAMESPACE=onnx_c2 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_86,code=compute_86 -Xcudafe --diag_suppress=cc_clobber_ignored,--diag_suppress=integer_sign_change,--diag_suppress=useless_using_declaration,--diag_suppress=set_but_not_used,--diag_suppress=field_without_dll_interface,--diag_suppress=base_class_has_different_dll_interface,--diag_suppress=dll_interface_conflict_none_assumed,--diag_suppress=dll_interface_conflict_dllexport_assumed,--diag_suppress=implicit_return_from_non_void_function,--diag_suppress=unsigned_compare_with_zero,--diag_suppress=declared_but_not_referenced,--diag_suppress=bad_friend_decl --expt-relaxed-constexpr --expt-extended-lambda -D_GLIBCXX_USE_CXX11_ABI=0 --compiler-options -Wall --compiler-options -Wno-strict-overflow --compiler-options -Wno-unknown-pragmas
|
||||
CMAKE_CXX_FLAGS: -D_GLIBCXX_USE_CXX11_ABI=0 -Wno-unused-variable -Wno-strict-overflow
|
||||
PyTorch version used to build k2: 1.13.0+cu116
|
||||
PyTorch is using Cuda: 11.6
|
||||
NVTX enabled: True
|
||||
With CUDA: True
|
||||
Disable debug: True
|
||||
Sync kernels : False
|
||||
Disable checks: False
|
||||
Max cpu memory allocate: 214748364800 bytes (or 200.0 GB)
|
||||
k2 abort: False
|
||||
__file__: /star-fj/fangjun/test-icefall/lib/python3.8/site-packages/k2/version/version.py
|
||||
_k2.__file__: /star-fj/fangjun/test-icefall/lib/python3.8/site-packages/_k2.cpython-38-x86_64-linux-gnu.so
|
||||
|
||||
(5) Install lhotse
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block::
|
||||
.. code-block:: bash
|
||||
|
||||
$ pip install git+https://github.com/lhotse-speech/lhotse
|
||||
(test-icefall) kuangfangjun:~$ pip install git+https://github.com/lhotse-speech/lhotse
|
||||
|
||||
Collecting git+https://github.com/lhotse-speech/lhotse
|
||||
Cloning https://github.com/lhotse-speech/lhotse to /tmp/pip-req-build-7b1b76ge
|
||||
Running command git clone -q https://github.com/lhotse-speech/lhotse /tmp/pip-req-build-7b1b76ge
|
||||
Collecting audioread>=2.1.9
|
||||
Using cached audioread-2.1.9-py3-none-any.whl
|
||||
Collecting SoundFile>=0.10
|
||||
Using cached SoundFile-0.10.3.post1-py2.py3-none-any.whl (21 kB)
|
||||
Collecting click>=7.1.1
|
||||
Using cached click-8.0.1-py3-none-any.whl (97 kB)
|
||||
Cloning https://github.com/lhotse-speech/lhotse to /tmp/pip-req-build-vq12fd5i
|
||||
Running command git clone --filter=blob:none --quiet https://github.com/lhotse-speech/lhotse /tmp/pip-req-build-vq12fd5i
|
||||
Resolved https://github.com/lhotse-speech/lhotse to commit 7640d663469b22cd0b36f3246ee9b849cd25e3b7
|
||||
Installing build dependencies ... done
|
||||
Getting requirements to build wheel ... done
|
||||
Preparing metadata (pyproject.toml) ... done
|
||||
Collecting cytoolz>=0.10.1
|
||||
Using cached cytoolz-0.11.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.9 MB)
|
||||
Collecting dataclasses
|
||||
Using cached dataclasses-0.6-py3-none-any.whl (14 kB)
|
||||
Collecting h5py>=2.10.0
|
||||
Downloading h5py-3.4.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (4.5 MB)
|
||||
|________________________________| 4.5 MB 684 kB/s
|
||||
Collecting intervaltree>=3.1.0
|
||||
Using cached intervaltree-3.1.0-py2.py3-none-any.whl
|
||||
Collecting lilcom>=1.1.0
|
||||
Using cached lilcom-1.1.1-cp38-cp38-linux_x86_64.whl
|
||||
Collecting numpy>=1.18.1
|
||||
Using cached numpy-1.21.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.8 MB)
|
||||
Collecting packaging
|
||||
Using cached packaging-21.0-py3-none-any.whl (40 kB)
|
||||
Downloading https://pypi.tuna.tsinghua.edu.cn/packages/1e/3b/a7828d575aa17fb7acaf1ced49a3655aa36dad7e16eb7e6a2e4df0dda76f/cytoolz-0.12.2-cp38-cp38-
|
||||
manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
|
||||
________________________________________ 2.0/2.0 MB 33.2 MB/s eta 0:00:00
|
||||
Collecting pyyaml>=5.3.1
|
||||
Using cached PyYAML-5.4.1-cp38-cp38-manylinux1_x86_64.whl (662 kB)
|
||||
Downloading https://pypi.tuna.tsinghua.edu.cn/packages/c8/6b/6600ac24725c7388255b2f5add93f91e58a5d7efaf4af244fdbcc11a541b/PyYAML-6.0.1-cp38-cp38-ma
|
||||
nylinux_2_17_x86_64.manylinux2014_x86_64.whl (736 kB)
|
||||
________________________________________ 736.6/736.6 kB 38.6 MB/s eta 0:00:00
|
||||
Collecting dataclasses
|
||||
Downloading https://pypi.tuna.tsinghua.edu.cn/packages/26/2f/1095cdc2868052dd1e64520f7c0d5c8c550ad297e944e641dbf1ffbb9a5d/dataclasses-0.6-py3-none-
|
||||
any.whl (14 kB)
|
||||
Requirement already satisfied: torchaudio in ./test-icefall/lib/python3.8/site-packages (from lhotse==1.16.0.dev0+git.7640d66.clean) (0.13.0+cu116)
|
||||
Collecting lilcom>=1.1.0
|
||||
Downloading https://pypi.tuna.tsinghua.edu.cn/packages/a8/65/df0a69c52bd085ca1ad4e5c4c1a5c680e25f9477d8e49316c4ff1e5084a4/lilcom-1.7-cp38-cp38-many
|
||||
linux_2_17_x86_64.manylinux2014_x86_64.whl (87 kB)
|
||||
________________________________________ 87.1/87.1 kB 8.7 MB/s eta 0:00:00
|
||||
Collecting tqdm
|
||||
Downloading tqdm-4.62.1-py2.py3-none-any.whl (76 kB)
|
||||
|________________________________| 76 kB 2.7 MB/s
|
||||
Collecting torchaudio==0.9.0
|
||||
Downloading torchaudio-0.9.0-cp38-cp38-manylinux1_x86_64.whl (1.9 MB)
|
||||
|________________________________| 1.9 MB 73.1 MB/s
|
||||
Requirement already satisfied: torch==1.9.0 in ./test-icefall/lib/python3.8/site-packages (from torchaudio==0.9.0->lhotse===0.8.0.dev
|
||||
-2a1410b-clean) (1.9.0)
|
||||
Requirement already satisfied: typing-extensions in ./test-icefall/lib/python3.8/site-packages (from torch==1.9.0->torchaudio==0.9.0-
|
||||
>lhotse===0.8.0.dev-2a1410b-clean) (3.10.0.0)
|
||||
Using cached https://pypi.tuna.tsinghua.edu.cn/packages/e6/02/a2cff6306177ae6bc73bc0665065de51dfb3b9db7373e122e2735faf0d97/tqdm-4.65.0-py3-none-any
|
||||
.whl (77 kB)
|
||||
Requirement already satisfied: numpy>=1.18.1 in ./test-icefall/lib/python3.8/site-packages (from lhotse==1.16.0.dev0+git.7640d66.clean) (1.24.4)
|
||||
Collecting audioread>=2.1.9
|
||||
Using cached https://pypi.tuna.tsinghua.edu.cn/packages/5d/cb/82a002441902dccbe427406785db07af10182245ee639ea9f4d92907c923/audioread-3.0.0.tar.gz (
|
||||
377 kB)
|
||||
Preparing metadata (setup.py) ... done
|
||||
Collecting tabulate>=0.8.1
|
||||
Using cached https://pypi.tuna.tsinghua.edu.cn/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-
|
||||
any.whl (35 kB)
|
||||
Collecting click>=7.1.1
|
||||
Downloading https://pypi.tuna.tsinghua.edu.cn/packages/1a/70/e63223f8116931d365993d4a6b7ef653a4d920b41d03de7c59499962821f/click-8.1.6-py3-none-any.
|
||||
whl (97 kB)
|
||||
________________________________________ 97.9/97.9 kB 8.4 MB/s eta 0:00:00
|
||||
Collecting packaging
|
||||
Using cached https://pypi.tuna.tsinghua.edu.cn/packages/ab/c3/57f0601a2d4fe15de7a553c00adbc901425661bf048f2a22dfc500caf121/packaging-23.1-py3-none-
|
||||
any.whl (48 kB)
|
||||
Collecting intervaltree>=3.1.0
|
||||
Downloading https://pypi.tuna.tsinghua.edu.cn/packages/50/fb/396d568039d21344639db96d940d40eb62befe704ef849b27949ded5c3bb/intervaltree-3.1.0.tar.gz
|
||||
(32 kB)
|
||||
Preparing metadata (setup.py) ... done
|
||||
Requirement already satisfied: torch in ./test-icefall/lib/python3.8/site-packages (from lhotse==1.16.0.dev0+git.7640d66.clean) (1.13.0+cu116)
|
||||
Collecting SoundFile>=0.10
|
||||
Downloading https://pypi.tuna.tsinghua.edu.cn/packages/ad/bd/0602167a213d9184fc688b1086dc6d374b7ae8c33eccf169f9b50ce6568c/soundfile-0.12.1-py2.py3-
|
||||
none-manylinux_2_17_x86_64.whl (1.3 MB)
|
||||
________________________________________ 1.3/1.3 MB 46.5 MB/s eta 0:00:00
|
||||
Collecting toolz>=0.8.0
|
||||
Using cached toolz-0.11.1-py3-none-any.whl (55 kB)
|
||||
Using cached https://pypi.tuna.tsinghua.edu.cn/packages/7f/5c/922a3508f5bda2892be3df86c74f9cf1e01217c2b1f8a0ac4841d903e3e9/toolz-0.12.0-py3-none-any.whl (55 kB)
|
||||
Collecting sortedcontainers<3.0,>=2.0
|
||||
Using cached sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)
|
||||
Using cached https://pypi.tuna.tsinghua.edu.cn/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)
|
||||
Collecting cffi>=1.0
|
||||
Using cached cffi-1.14.6-cp38-cp38-manylinux1_x86_64.whl (411 kB)
|
||||
Using cached https://pypi.tuna.tsinghua.edu.cn/packages/b7/8b/06f30caa03b5b3ac006de4f93478dbd0239e2a16566d81a106c322dc4f79/cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (442 kB)
|
||||
Requirement already satisfied: typing-extensions in ./test-icefall/lib/python3.8/site-packages (from torch->lhotse==1.16.0.dev0+git.7640d66.clean) (4.7.1)
|
||||
Collecting pycparser
|
||||
Using cached pycparser-2.20-py2.py3-none-any.whl (112 kB)
|
||||
Collecting pyparsing>=2.0.2
|
||||
Using cached pyparsing-2.4.7-py2.py3-none-any.whl (67 kB)
|
||||
Building wheels for collected packages: lhotse
|
||||
Building wheel for lhotse (setup.py) ... done
|
||||
Created wheel for lhotse: filename=lhotse-0.8.0.dev_2a1410b_clean-py3-none-any.whl size=342242 sha256=f683444afa4dc0881133206b4646a
|
||||
9d0f774224cc84000f55d0a67f6e4a37997
|
||||
Stored in directory: /tmp/pip-ephem-wheel-cache-ftu0qysz/wheels/7f/7a/8e/a0bf241336e2e3cb573e1e21e5600952d49f5162454f2e612f
|
||||
WARNING: Built wheel for lhotse is invalid: Metadata 1.2 mandates PEP 440 version, but '0.8.0.dev-2a1410b-clean' is not
|
||||
Failed to build lhotse
|
||||
Installing collected packages: pycparser, toolz, sortedcontainers, pyparsing, numpy, cffi, tqdm, torchaudio, SoundFile, pyyaml, packa
|
||||
ging, lilcom, intervaltree, h5py, dataclasses, cytoolz, click, audioread, lhotse
|
||||
Running setup.py install for lhotse ... done
|
||||
DEPRECATION: lhotse was installed using the legacy 'setup.py install' method, because a wheel could not be built for it. A possible
|
||||
replacement is to fix the wheel build issue reported above. You can find discussion regarding this at https://github.com/pypa/pip/is
|
||||
sues/8368.
|
||||
Successfully installed SoundFile-0.10.3.post1 audioread-2.1.9 cffi-1.14.6 click-8.0.1 cytoolz-0.11.0 dataclasses-0.6 h5py-3.4.0 inter
|
||||
valtree-3.1.0 lhotse-0.8.0.dev-2a1410b-clean lilcom-1.1.1 numpy-1.21.2 packaging-21.0 pycparser-2.20 pyparsing-2.4.7 pyyaml-5.4.1 sor
|
||||
tedcontainers-2.4.0 toolz-0.11.1 torchaudio-0.9.0 tqdm-4.62.1
|
||||
Using cached https://pypi.tuna.tsinghua.edu.cn/packages/62/d5/5f610ebe421e85889f2e55e33b7f9a6795bd982198517d912eb1c76e1a53/pycparser-2.21-py2.py3-none-any.whl (118 kB)
|
||||
Building wheels for collected packages: lhotse, audioread, intervaltree
|
||||
Building wheel for lhotse (pyproject.toml) ... done
|
||||
Created wheel for lhotse: filename=lhotse-1.16.0.dev0+git.7640d66.clean-py3-none-any.whl size=687627 sha256=cbf0a4d2d0b639b33b91637a4175bc251d6a021a069644ecb1a9f2b3a83d072a
|
||||
Stored in directory: /tmp/pip-ephem-wheel-cache-wwtk90_m/wheels/7f/7a/8e/a0bf241336e2e3cb573e1e21e5600952d49f5162454f2e612f
|
||||
Building wheel for audioread (setup.py) ... done
|
||||
Created wheel for audioread: filename=audioread-3.0.0-py3-none-any.whl size=23704 sha256=5e2d3537c96ce9cf0f645a654c671163707bf8cb8d9e358d0e2b0939a85ff4c2
|
||||
Stored in directory: /star-fj/fangjun/.cache/pip/wheels/e2/c3/9c/f19ae5a03f8862d9f0776b0c0570f1fdd60a119d90954e3f39
|
||||
Building wheel for intervaltree (setup.py) ... done
|
||||
Created wheel for intervaltree: filename=intervaltree-3.1.0-py2.py3-none-any.whl size=26098 sha256=2604170976cfffe0d2f678cb1a6e5b525f561cd50babe53d631a186734fec9f9
|
||||
Stored in directory: /star-fj/fangjun/.cache/pip/wheels/f3/ed/2b/c179ebfad4e15452d6baef59737f27beb9bfb442e0620f7271
|
||||
Successfully built lhotse audioread intervaltree
|
||||
Installing collected packages: sortedcontainers, dataclasses, tqdm, toolz, tabulate, pyyaml, pycparser, packaging, lilcom, intervaltree, click, audioread, cytoolz, cffi, SoundFile, lhotse
|
||||
Successfully installed SoundFile-0.12.1 audioread-3.0.0 cffi-1.15.1 click-8.1.6 cytoolz-0.12.2 dataclasses-0.6 intervaltree-3.1.0 lhotse-1.16.0.dev0+git.7640d66.clean lilcom-1.7 packaging-23.1 pycparser-2.21 pyyaml-6.0.1 sortedcontainers-2.4.0 tabulate-0.9.0 toolz-0.12.0 tqdm-4.65.0
|
||||
|
||||
(5) Download icefall
|
||||
|
||||
Verify that `lhotse`_ has been installed successfully:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
(test-icefall) kuangfangjun:~$ python3 -c "import lhotse; print(lhotse.__version__)"
|
||||
|
||||
1.16.0.dev+git.7640d66.clean
|
||||
|
||||
(6) Download icefall
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block::
|
||||
.. code-block:: bash
|
||||
|
||||
$ cd /tmp
|
||||
$ git clone https://github.com/k2-fsa/icefall
|
||||
(test-icefall) kuangfangjun:~$ cd /tmp/
|
||||
|
||||
(test-icefall) kuangfangjun:tmp$ git clone https://github.com/k2-fsa/icefall
|
||||
|
||||
Cloning into 'icefall'...
|
||||
remote: Enumerating objects: 500, done.
|
||||
remote: Counting objects: 100% (500/500), done.
|
||||
remote: Compressing objects: 100% (308/308), done.
|
||||
remote: Total 500 (delta 263), reused 307 (delta 102), pack-reused 0
|
||||
Receiving objects: 100% (500/500), 172.49 KiB | 385.00 KiB/s, done.
|
||||
Resolving deltas: 100% (263/263), done.
|
||||
remote: Enumerating objects: 12942, done.
|
||||
remote: Counting objects: 100% (67/67), done.
|
||||
remote: Compressing objects: 100% (56/56), done.
|
||||
remote: Total 12942 (delta 17), reused 35 (delta 6), pack-reused 12875
|
||||
Receiving objects: 100% (12942/12942), 14.77 MiB | 9.29 MiB/s, done.
|
||||
Resolving deltas: 100% (8835/8835), done.
|
||||
|
||||
$ cd icefall
|
||||
$ pip install -r requirements.txt
|
||||
|
||||
Collecting kaldilm
|
||||
Downloading kaldilm-1.8.tar.gz (48 kB)
|
||||
|________________________________| 48 kB 574 kB/s
|
||||
Collecting kaldialign
|
||||
Using cached kaldialign-0.2-cp38-cp38-linux_x86_64.whl
|
||||
Collecting sentencepiece>=0.1.96
|
||||
Using cached sentencepiece-0.1.96-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
|
||||
Collecting tensorboard
|
||||
Using cached tensorboard-2.6.0-py3-none-any.whl (5.6 MB)
|
||||
Requirement already satisfied: setuptools>=41.0.0 in /ceph-fj/fangjun/test-icefall/lib/python3.8/site-packages (from tensorboard->-r
|
||||
requirements.txt (line 4)) (57.4.0)
|
||||
Collecting absl-py>=0.4
|
||||
Using cached absl_py-0.13.0-py3-none-any.whl (132 kB)
|
||||
Collecting google-auth-oauthlib<0.5,>=0.4.1
|
||||
Using cached google_auth_oauthlib-0.4.5-py2.py3-none-any.whl (18 kB)
|
||||
Collecting grpcio>=1.24.3
|
||||
Using cached grpcio-1.39.0-cp38-cp38-manylinux2014_x86_64.whl (4.3 MB)
|
||||
Requirement already satisfied: wheel>=0.26 in /ceph-fj/fangjun/test-icefall/lib/python3.8/site-packages (from tensorboard->-r require
|
||||
ments.txt (line 4)) (0.36.2)
|
||||
Requirement already satisfied: numpy>=1.12.0 in /ceph-fj/fangjun/test-icefall/lib/python3.8/site-packages (from tensorboard->-r requi
|
||||
rements.txt (line 4)) (1.21.2)
|
||||
Collecting protobuf>=3.6.0
|
||||
Using cached protobuf-3.17.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)
|
||||
Collecting werkzeug>=0.11.15
|
||||
Using cached Werkzeug-2.0.1-py3-none-any.whl (288 kB)
|
||||
Collecting tensorboard-data-server<0.7.0,>=0.6.0
|
||||
Using cached tensorboard_data_server-0.6.1-py3-none-manylinux2010_x86_64.whl (4.9 MB)
|
||||
Collecting google-auth<2,>=1.6.3
|
||||
Downloading google_auth-1.35.0-py2.py3-none-any.whl (152 kB)
|
||||
|________________________________| 152 kB 1.4 MB/s
|
||||
Collecting requests<3,>=2.21.0
|
||||
Using cached requests-2.26.0-py2.py3-none-any.whl (62 kB)
|
||||
Collecting tensorboard-plugin-wit>=1.6.0
|
||||
Using cached tensorboard_plugin_wit-1.8.0-py3-none-any.whl (781 kB)
|
||||
Collecting markdown>=2.6.8
|
||||
Using cached Markdown-3.3.4-py3-none-any.whl (97 kB)
|
||||
Collecting six
|
||||
Using cached six-1.16.0-py2.py3-none-any.whl (11 kB)
|
||||
Collecting cachetools<5.0,>=2.0.0
|
||||
Using cached cachetools-4.2.2-py3-none-any.whl (11 kB)
|
||||
Collecting rsa<5,>=3.1.4
|
||||
Using cached rsa-4.7.2-py3-none-any.whl (34 kB)
|
||||
Collecting pyasn1-modules>=0.2.1
|
||||
Using cached pyasn1_modules-0.2.8-py2.py3-none-any.whl (155 kB)
|
||||
Collecting requests-oauthlib>=0.7.0
|
||||
Using cached requests_oauthlib-1.3.0-py2.py3-none-any.whl (23 kB)
|
||||
Collecting pyasn1<0.5.0,>=0.4.6
|
||||
Using cached pyasn1-0.4.8-py2.py3-none-any.whl (77 kB)
|
||||
Collecting urllib3<1.27,>=1.21.1
|
||||
Using cached urllib3-1.26.6-py2.py3-none-any.whl (138 kB)
|
||||
Collecting certifi>=2017.4.17
|
||||
Using cached certifi-2021.5.30-py2.py3-none-any.whl (145 kB)
|
||||
Collecting charset-normalizer~=2.0.0
|
||||
Using cached charset_normalizer-2.0.4-py3-none-any.whl (36 kB)
|
||||
Collecting idna<4,>=2.5
|
||||
Using cached idna-3.2-py3-none-any.whl (59 kB)
|
||||
Collecting oauthlib>=3.0.0
|
||||
Using cached oauthlib-3.1.1-py2.py3-none-any.whl (146 kB)
|
||||
Building wheels for collected packages: kaldilm
|
||||
Building wheel for kaldilm (setup.py) ... done
|
||||
Created wheel for kaldilm: filename=kaldilm-1.8-cp38-cp38-linux_x86_64.whl size=897233 sha256=eccb906cafcd45bf9a7e1a1718e4534254bfb
|
||||
f4c0d0cbc66eee6c88d68a63862
|
||||
Stored in directory: /root/fangjun/.cache/pip/wheels/85/7d/63/f2dd586369b8797cb36d213bf3a84a789eeb92db93d2e723c9
|
||||
Successfully built kaldilm
|
||||
Installing collected packages: urllib3, pyasn1, idna, charset-normalizer, certifi, six, rsa, requests, pyasn1-modules, oauthlib, cach
|
||||
etools, requests-oauthlib, google-auth, werkzeug, tensorboard-plugin-wit, tensorboard-data-server, protobuf, markdown, grpcio, google
|
||||
-auth-oauthlib, absl-py, tensorboard, sentencepiece, kaldilm, kaldialign
|
||||
Successfully installed absl-py-0.13.0 cachetools-4.2.2 certifi-2021.5.30 charset-normalizer-2.0.4 google-auth-1.35.0 google-auth-oaut
|
||||
hlib-0.4.5 grpcio-1.39.0 idna-3.2 kaldialign-0.2 kaldilm-1.8 markdown-3.3.4 oauthlib-3.1.1 protobuf-3.17.3 pyasn1-0.4.8 pyasn1-module
|
||||
s-0.2.8 requests-2.26.0 requests-oauthlib-1.3.0 rsa-4.7.2 sentencepiece-0.1.96 six-1.16.0 tensorboard-2.6.0 tensorboard-data-server-0
|
||||
.6.1 tensorboard-plugin-wit-1.8.0 urllib3-1.26.6 werkzeug-2.0.1
|
||||
(test-icefall) kuangfangjun:tmp$ cd icefall/
|
||||
|
||||
(test-icefall) kuangfangjun:icefall$ pip install -r ./requirements.txt
|
||||
|
||||
Test Your Installation
|
||||
----------------------
|
||||
|
||||
To test that your installation is successful, let us run
|
||||
the `yesno recipe <https://github.com/k2-fsa/icefall/tree/master/egs/yesno/ASR>`_
|
||||
on CPU.
|
||||
on ``CPU``.
|
||||
|
||||
Data preparation
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ export PYTHONPATH=/tmp/icefall:$PYTHONPATH
|
||||
$ cd /tmp/icefall
|
||||
$ cd egs/yesno/ASR
|
||||
$ ./prepare.sh
|
||||
(test-icefall) kuangfangjun:icefall$ export PYTHONPATH=/tmp/icefall:$PYTHONPATH
|
||||
|
||||
(test-icefall) kuangfangjun:icefall$ cd /tmp/icefall
|
||||
|
||||
(test-icefall) kuangfangjun:icefall$ cd egs/yesno/ASR
|
||||
|
||||
(test-icefall) kuangfangjun:ASR$ ./prepare.sh
|
||||
|
||||
|
||||
The log of running ``./prepare.sh`` is:
|
||||
|
||||
.. code-block::
|
||||
|
||||
2023-05-12 17:55:21 (prepare.sh:27:main) dl_dir: /tmp/icefall/egs/yesno/ASR/download
|
||||
2023-05-12 17:55:21 (prepare.sh:30:main) Stage 0: Download data
|
||||
/tmp/icefall/egs/yesno/ASR/download/waves_yesno.tar.gz: 100%|_______________________________________________________________| 4.70M/4.70M [06:54<00:00, 11.4kB/s]
|
||||
2023-05-12 18:02:19 (prepare.sh:39:main) Stage 1: Prepare yesno manifest
|
||||
2023-05-12 18:02:21 (prepare.sh:45:main) Stage 2: Compute fbank for yesno
|
||||
2023-05-12 18:02:23,199 INFO [compute_fbank_yesno.py:65] Processing train
|
||||
Extracting and storing features: 100%|_______________________________________________________________| 90/90 [00:00<00:00, 212.60it/s]
|
||||
2023-05-12 18:02:23,640 INFO [compute_fbank_yesno.py:65] Processing test
|
||||
Extracting and storing features: 100%|_______________________________________________________________| 30/30 [00:00<00:00, 304.53it/s]
|
||||
2023-05-12 18:02:24 (prepare.sh:51:main) Stage 3: Prepare lang
|
||||
2023-05-12 18:02:26 (prepare.sh:66:main) Stage 4: Prepare G
|
||||
2023-07-27 12:41:39 (prepare.sh:27:main) dl_dir: /tmp/icefall/egs/yesno/ASR/download
|
||||
2023-07-27 12:41:39 (prepare.sh:30:main) Stage 0: Download data
|
||||
/tmp/icefall/egs/yesno/ASR/download/waves_yesno.tar.gz: 100%|___________________________________________________| 4.70M/4.70M [00:00<00:00, 11.1MB/s]
|
||||
2023-07-27 12:41:46 (prepare.sh:39:main) Stage 1: Prepare yesno manifest
|
||||
2023-07-27 12:41:50 (prepare.sh:45:main) Stage 2: Compute fbank for yesno
|
||||
2023-07-27 12:41:55,718 INFO [compute_fbank_yesno.py:65] Processing train
|
||||
Extracting and storing features: 100%|_______________________________________________________________________________| 90/90 [00:01<00:00, 87.82it/s]
|
||||
2023-07-27 12:41:56,778 INFO [compute_fbank_yesno.py:65] Processing test
|
||||
Extracting and storing features: 100%|______________________________________________________________________________| 30/30 [00:00<00:00, 256.92it/s]
|
||||
2023-07-27 12:41:57 (prepare.sh:51:main) Stage 3: Prepare lang
|
||||
2023-07-27 12:42:02 (prepare.sh:66:main) Stage 4: Prepare G
|
||||
/project/kaldilm/csrc/arpa_file_parser.cc:void kaldilm::ArpaFileParser::Read(std::istream&):79
|
||||
[I] Reading \data\ section.
|
||||
/project/kaldilm/csrc/arpa_file_parser.cc:void kaldilm::ArpaFileParser::Read(std::istream&):140
|
||||
[I] Reading \1-grams: section.
|
||||
2023-05-12 18:02:26 (prepare.sh:92:main) Stage 5: Compile HLG
|
||||
2023-05-12 18:02:28,581 INFO [compile_hlg.py:124] Processing data/lang_phone
|
||||
2023-05-12 18:02:28,582 INFO [lexicon.py:171] Converting L.pt to Linv.pt
|
||||
2023-05-12 18:02:28,609 INFO [compile_hlg.py:48] Building ctc_topo. max_token_id: 3
|
||||
2023-05-12 18:02:28,610 INFO [compile_hlg.py:52] Loading G.fst.txt
|
||||
2023-05-12 18:02:28,611 INFO [compile_hlg.py:62] Intersecting L and G
|
||||
2023-05-12 18:02:28,613 INFO [compile_hlg.py:64] LG shape: (4, None)
|
||||
2023-05-12 18:02:28,613 INFO [compile_hlg.py:66] Connecting LG
|
||||
2023-05-12 18:02:28,614 INFO [compile_hlg.py:68] LG shape after k2.connect: (4, None)
|
||||
2023-05-12 18:02:28,614 INFO [compile_hlg.py:70] <class 'torch.Tensor'>
|
||||
2023-05-12 18:02:28,614 INFO [compile_hlg.py:71] Determinizing LG
|
||||
2023-05-12 18:02:28,615 INFO [compile_hlg.py:74] <class '_k2.ragged.RaggedTensor'>
|
||||
2023-05-12 18:02:28,615 INFO [compile_hlg.py:76] Connecting LG after k2.determinize
|
||||
2023-05-12 18:02:28,615 INFO [compile_hlg.py:79] Removing disambiguation symbols on LG
|
||||
2023-05-12 18:02:28,616 INFO [compile_hlg.py:91] LG shape after k2.remove_epsilon: (6, None)
|
||||
2023-05-12 18:02:28,617 INFO [compile_hlg.py:96] Arc sorting LG
|
||||
2023-05-12 18:02:28,617 INFO [compile_hlg.py:99] Composing H and LG
|
||||
2023-05-12 18:02:28,619 INFO [compile_hlg.py:106] Connecting LG
|
||||
2023-05-12 18:02:28,619 INFO [compile_hlg.py:109] Arc sorting LG
|
||||
2023-05-12 18:02:28,619 INFO [compile_hlg.py:111] HLG.shape: (8, None)
|
||||
2023-05-12 18:02:28,619 INFO [compile_hlg.py:127] Saving HLG.pt to data/lang_phone
|
||||
|
||||
2023-07-27 12:42:02 (prepare.sh:92:main) Stage 5: Compile HLG
|
||||
2023-07-27 12:42:07,275 INFO [compile_hlg.py:124] Processing data/lang_phone
|
||||
2023-07-27 12:42:07,276 INFO [lexicon.py:171] Converting L.pt to Linv.pt
|
||||
2023-07-27 12:42:07,309 INFO [compile_hlg.py:48] Building ctc_topo. max_token_id: 3
|
||||
2023-07-27 12:42:07,310 INFO [compile_hlg.py:52] Loading G.fst.txt
|
||||
2023-07-27 12:42:07,314 INFO [compile_hlg.py:62] Intersecting L and G
|
||||
2023-07-27 12:42:07,323 INFO [compile_hlg.py:64] LG shape: (4, None)
|
||||
2023-07-27 12:42:07,323 INFO [compile_hlg.py:66] Connecting LG
|
||||
2023-07-27 12:42:07,323 INFO [compile_hlg.py:68] LG shape after k2.connect: (4, None)
|
||||
2023-07-27 12:42:07,323 INFO [compile_hlg.py:70] <class 'torch.Tensor'>
|
||||
2023-07-27 12:42:07,323 INFO [compile_hlg.py:71] Determinizing LG
|
||||
2023-07-27 12:42:07,341 INFO [compile_hlg.py:74] <class '_k2.ragged.RaggedTensor'>
|
||||
2023-07-27 12:42:07,341 INFO [compile_hlg.py:76] Connecting LG after k2.determinize
|
||||
2023-07-27 12:42:07,341 INFO [compile_hlg.py:79] Removing disambiguation symbols on LG
|
||||
2023-07-27 12:42:07,354 INFO [compile_hlg.py:91] LG shape after k2.remove_epsilon: (6, None)
|
||||
2023-07-27 12:42:07,445 INFO [compile_hlg.py:96] Arc sorting LG
|
||||
2023-07-27 12:42:07,445 INFO [compile_hlg.py:99] Composing H and LG
|
||||
2023-07-27 12:42:07,446 INFO [compile_hlg.py:106] Connecting LG
|
||||
2023-07-27 12:42:07,446 INFO [compile_hlg.py:109] Arc sorting LG
|
||||
2023-07-27 12:42:07,447 INFO [compile_hlg.py:111] HLG.shape: (8, None)
|
||||
2023-07-27 12:42:07,447 INFO [compile_hlg.py:127] Saving HLG.pt to data/lang_phone
|
||||
|
||||
Training
|
||||
~~~~~~~~
|
||||
@ -409,12 +435,13 @@ Now let us run the training part:
|
||||
|
||||
.. code-block::
|
||||
|
||||
$ export CUDA_VISIBLE_DEVICES=""
|
||||
$ ./tdnn/train.py
|
||||
(test-icefall) kuangfangjun:ASR$ export CUDA_VISIBLE_DEVICES=""
|
||||
|
||||
(test-icefall) kuangfangjun:ASR$ ./tdnn/train.py
|
||||
|
||||
.. CAUTION::
|
||||
|
||||
We use ``export CUDA_VISIBLE_DEVICES=""`` so that ``icefall`` uses CPU
|
||||
We use ``export CUDA_VISIBLE_DEVICES=""`` so that `icefall`_ uses CPU
|
||||
even if there are GPUs available.
|
||||
|
||||
.. hint::
|
||||
@ -432,53 +459,52 @@ The training log is given below:
|
||||
|
||||
.. code-block::
|
||||
|
||||
2023-05-12 18:04:59,759 INFO [train.py:481] Training started
|
||||
2023-05-12 18:04:59,759 INFO [train.py:482] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01, 'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0,
|
||||
'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'reset_interval': 20, 'valid_interval': 10, 'beam_size': 10,
|
||||
'reduction': 'sum', 'use_double_scores': True, 'world_size': 1, 'master_port': 12354, 'tensorboard': True, 'num_epochs': 15, 'seed': 42, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0,
|
||||
'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True, 'num_workers': 2,
|
||||
'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '3b7f09fa35e72589914f67089c0da9f196a92ca4', 'k2-git-date': 'Mon May 8 22:58:45 2023',
|
||||
'lhotse-version': '1.15.0.dev+git.6fcfced.clean', 'torch-version': '2.0.0+cu118', 'torch-cuda-available': False, 'torch-cuda-version': '11.8', 'python-version': '3.1', 'icefall-git-branch': 'master',
|
||||
'icefall-git-sha1': '30bde4b-clean', 'icefall-git-date': 'Thu May 11 17:37:47 2023', 'icefall-path': '/tmp/icefall',
|
||||
'k2-path': 'tmp/lib/python3.10/site-packages/k2-1.24.3.dev20230512+cuda11.8.torch2.0.0-py3.10-linux-x86_64.egg/k2/__init__.py',
|
||||
'lhotse-path': 'tmp/lib/python3.10/site-packages/lhotse/__init__.py', 'hostname': 'host', 'IP address': '0.0.0.0'}}
|
||||
2023-05-12 18:04:59,761 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
|
||||
2023-05-12 18:04:59,764 INFO [train.py:495] device: cpu
|
||||
2023-05-12 18:04:59,791 INFO [asr_datamodule.py:146] About to get train cuts
|
||||
2023-05-12 18:04:59,791 INFO [asr_datamodule.py:244] About to get train cuts
|
||||
2023-05-12 18:04:59,852 INFO [asr_datamodule.py:149] About to create train dataset
|
||||
2023-05-12 18:04:59,852 INFO [asr_datamodule.py:199] Using SingleCutSampler.
|
||||
2023-05-12 18:04:59,852 INFO [asr_datamodule.py:205] About to create train dataloader
|
||||
2023-05-12 18:04:59,853 INFO [asr_datamodule.py:218] About to get test cuts
|
||||
2023-05-12 18:04:59,853 INFO [asr_datamodule.py:252] About to get test cuts
|
||||
2023-05-12 18:04:59,986 INFO [train.py:422] Epoch 0, batch 0, loss[loss=1.065, over 2436.00 frames. ], tot_loss[loss=1.065, over 2436.00 frames. ], batch size: 4
|
||||
2023-05-12 18:05:00,352 INFO [train.py:422] Epoch 0, batch 10, loss[loss=0.4561, over 2828.00 frames. ], tot_loss[loss=0.7076, over 22192.90 frames. ], batch size: 4
|
||||
2023-05-12 18:05:00,691 INFO [train.py:444] Epoch 0, validation loss=0.9002, over 18067.00 frames.
|
||||
2023-05-12 18:05:00,996 INFO [train.py:422] Epoch 0, batch 20, loss[loss=0.2555, over 2695.00 frames. ], tot_loss[loss=0.484, over 34971.47 frames. ], batch size: 5
|
||||
2023-05-12 18:05:01,217 INFO [train.py:444] Epoch 0, validation loss=0.4688, over 18067.00 frames.
|
||||
2023-05-12 18:05:01,251 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-0.pt
|
||||
2023-05-12 18:05:01,389 INFO [train.py:422] Epoch 1, batch 0, loss[loss=0.2532, over 2436.00 frames. ], tot_loss[loss=0.2532, over 2436.00 frames. ], batch size: 4
|
||||
2023-05-12 18:05:01,637 INFO [train.py:422] Epoch 1, batch 10, loss[loss=0.1139, over 2828.00 frames. ], tot_loss[loss=0.1592, over 22192.90 frames. ], batch size: 4
|
||||
2023-05-12 18:05:01,859 INFO [train.py:444] Epoch 1, validation loss=0.1629, over 18067.00 frames.
|
||||
2023-05-12 18:05:02,094 INFO [train.py:422] Epoch 1, batch 20, loss[loss=0.0767, over 2695.00 frames. ], tot_loss[loss=0.118, over 34971.47 frames. ], batch size: 5
|
||||
2023-05-12 18:05:02,350 INFO [train.py:444] Epoch 1, validation loss=0.06778, over 18067.00 frames.
|
||||
2023-05-12 18:05:02,395 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-1.pt
|
||||
2023-07-27 12:50:51,936 INFO [train.py:481] Training started
|
||||
2023-07-27 12:50:51,936 INFO [train.py:482] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01, 'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0, 'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'reset_interval': 20, 'valid_interval': 10, 'beam_size': 10, 'reduction': 'sum', 'use_double_scores': True, 'world_size': 1, 'master_port': 12354, 'tensorboard': True, 'num_epochs': 15, 'seed': 42, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True, 'num_workers': 2, 'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '4c05309499a08454997adf500b56dcc629e35ae5', 'k2-git-date': 'Tue Jul 25 16:23:36 2023', 'lhotse-version': '1.16.0.dev+git.7640d66.clean', 'torch-version': '1.13.0+cu116', 'torch-cuda-available': False, 'torch-cuda-version': '11.6', 'python-version': '3.8', 'icefall-git-branch': 'master', 'icefall-git-sha1': '3fb0a43-clean', 'icefall-git-date': 'Thu Jul 27 12:36:05 2023', 'icefall-path': '/tmp/icefall', 'k2-path': '/star-fj/fangjun/test-icefall/lib/python3.8/site-packages/k2/__init__.py', 'lhotse-path': '/star-fj/fangjun/test-icefall/lib/python3.8/site-packages/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-1-1220091118-57c4d55446-sph26', 'IP address': '10.177.77.20'}}
|
||||
2023-07-27 12:50:51,941 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
|
||||
2023-07-27 12:50:51,949 INFO [train.py:495] device: cpu
|
||||
2023-07-27 12:50:51,965 INFO [asr_datamodule.py:146] About to get train cuts
|
||||
2023-07-27 12:50:51,965 INFO [asr_datamodule.py:244] About to get train cuts
|
||||
2023-07-27 12:50:51,967 INFO [asr_datamodule.py:149] About to create train dataset
|
||||
2023-07-27 12:50:51,967 INFO [asr_datamodule.py:199] Using SingleCutSampler.
|
||||
2023-07-27 12:50:51,967 INFO [asr_datamodule.py:205] About to create train dataloader
|
||||
2023-07-27 12:50:51,968 INFO [asr_datamodule.py:218] About to get test cuts
|
||||
2023-07-27 12:50:51,968 INFO [asr_datamodule.py:252] About to get test cuts
|
||||
2023-07-27 12:50:52,565 INFO [train.py:422] Epoch 0, batch 0, loss[loss=1.065, over 2436.00 frames. ], tot_loss[loss=1.065, over 2436.00 frames. ], batch size: 4
|
||||
2023-07-27 12:50:53,681 INFO [train.py:422] Epoch 0, batch 10, loss[loss=0.4561, over 2828.00 frames. ], tot_loss[loss=0.7076, over 22192.90 frames.], batch size: 4
|
||||
2023-07-27 12:50:54,167 INFO [train.py:444] Epoch 0, validation loss=0.9002, over 18067.00 frames.
|
||||
2023-07-27 12:50:55,011 INFO [train.py:422] Epoch 0, batch 20, loss[loss=0.2555, over 2695.00 frames. ], tot_loss[loss=0.484, over 34971.47 frames. ], batch size: 5
|
||||
2023-07-27 12:50:55,331 INFO [train.py:444] Epoch 0, validation loss=0.4688, over 18067.00 frames.
|
||||
2023-07-27 12:50:55,368 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-0.pt
|
||||
2023-07-27 12:50:55,633 INFO [train.py:422] Epoch 1, batch 0, loss[loss=0.2532, over 2436.00 frames. ], tot_loss[loss=0.2532, over 2436.00 frames. ],
|
||||
batch size: 4
|
||||
2023-07-27 12:50:56,242 INFO [train.py:422] Epoch 1, batch 10, loss[loss=0.1139, over 2828.00 frames. ], tot_loss[loss=0.1592, over 22192.90 frames.], batch size: 4
|
||||
2023-07-27 12:50:56,522 INFO [train.py:444] Epoch 1, validation loss=0.1627, over 18067.00 frames.
|
||||
2023-07-27 12:50:57,209 INFO [train.py:422] Epoch 1, batch 20, loss[loss=0.07055, over 2695.00 frames. ], tot_loss[loss=0.1175, over 34971.47 frames.], batch size: 5
|
||||
2023-07-27 12:50:57,600 INFO [train.py:444] Epoch 1, validation loss=0.07091, over 18067.00 frames.
|
||||
2023-07-27 12:50:57,640 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-1.pt
|
||||
2023-07-27 12:50:57,847 INFO [train.py:422] Epoch 2, batch 0, loss[loss=0.07731, over 2436.00 frames. ], tot_loss[loss=0.07731, over 2436.00 frames.], batch size: 4
|
||||
2023-07-27 12:50:58,427 INFO [train.py:422] Epoch 2, batch 10, loss[loss=0.04391, over 2828.00 frames. ], tot_loss[loss=0.05341, over 22192.90 frames. ], batch size: 4
|
||||
2023-07-27 12:50:58,884 INFO [train.py:444] Epoch 2, validation loss=0.04384, over 18067.00 frames.
|
||||
2023-07-27 12:50:59,387 INFO [train.py:422] Epoch 2, batch 20, loss[loss=0.03458, over 2695.00 frames. ], tot_loss[loss=0.04616, over 34971.47 frames. ], batch size: 5
|
||||
2023-07-27 12:50:59,707 INFO [train.py:444] Epoch 2, validation loss=0.03379, over 18067.00 frames.
|
||||
2023-07-27 12:50:59,758 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-2.pt
|
||||
|
||||
... ...
|
||||
|
||||
2023-05-12 18:05:14,789 INFO [train.py:422] Epoch 13, batch 0, loss[loss=0.01056, over 2436.00 frames. ], tot_loss[loss=0.01056, over 2436.00 frames. ], batch size: 4
|
||||
2023-05-12 18:05:15,016 INFO [train.py:422] Epoch 13, batch 10, loss[loss=0.009022, over 2828.00 frames. ], tot_loss[loss=0.009985, over 22192.90 frames. ], batch size: 4
|
||||
2023-05-12 18:05:15,271 INFO [train.py:444] Epoch 13, validation loss=0.01088, over 18067.00 frames.
|
||||
2023-05-12 18:05:15,497 INFO [train.py:422] Epoch 13, batch 20, loss[loss=0.01174, over 2695.00 frames. ], tot_loss[loss=0.01077, over 34971.47 frames. ], batch size: 5
|
||||
2023-05-12 18:05:15,747 INFO [train.py:444] Epoch 13, validation loss=0.01087, over 18067.00 frames.
|
||||
2023-05-12 18:05:15,783 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-13.pt
|
||||
2023-05-12 18:05:15,921 INFO [train.py:422] Epoch 14, batch 0, loss[loss=0.01045, over 2436.00 frames. ], tot_loss[loss=0.01045, over 2436.00 frames. ], batch size: 4
|
||||
2023-05-12 18:05:16,146 INFO [train.py:422] Epoch 14, batch 10, loss[loss=0.008957, over 2828.00 frames. ], tot_loss[loss=0.009903, over 22192.90 frames. ], batch size: 4
|
||||
2023-05-12 18:05:16,374 INFO [train.py:444] Epoch 14, validation loss=0.01092, over 18067.00 frames.
|
||||
2023-05-12 18:05:16,598 INFO [train.py:422] Epoch 14, batch 20, loss[loss=0.01169, over 2695.00 frames. ], tot_loss[loss=0.01065, over 34971.47 frames. ], batch size: 5
|
||||
2023-05-12 18:05:16,824 INFO [train.py:444] Epoch 14, validation loss=0.01077, over 18067.00 frames.
|
||||
2023-05-12 18:05:16,862 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-14.pt
|
||||
2023-05-12 18:05:16,865 INFO [train.py:555] Done!
|
||||
2023-07-27 12:51:23,433 INFO [train.py:422] Epoch 13, batch 0, loss[loss=0.01054, over 2436.00 frames. ], tot_loss[loss=0.01054, over 2436.00 frames. ], batch size: 4
|
||||
2023-07-27 12:51:23,980 INFO [train.py:422] Epoch 13, batch 10, loss[loss=0.009014, over 2828.00 frames. ], tot_loss[loss=0.009974, over 22192.90 frames. ], batch size: 4
|
||||
2023-07-27 12:51:24,489 INFO [train.py:444] Epoch 13, validation loss=0.01085, over 18067.00 frames.
|
||||
2023-07-27 12:51:25,258 INFO [train.py:422] Epoch 13, batch 20, loss[loss=0.01172, over 2695.00 frames. ], tot_loss[loss=0.01055, over 34971.47 frames. ], batch size: 5
|
||||
2023-07-27 12:51:25,621 INFO [train.py:444] Epoch 13, validation loss=0.01074, over 18067.00 frames.
|
||||
2023-07-27 12:51:25,699 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-13.pt
|
||||
2023-07-27 12:51:25,866 INFO [train.py:422] Epoch 14, batch 0, loss[loss=0.01044, over 2436.00 frames. ], tot_loss[loss=0.01044, over 2436.00 frames. ], batch size: 4
|
||||
2023-07-27 12:51:26,844 INFO [train.py:422] Epoch 14, batch 10, loss[loss=0.008942, over 2828.00 frames. ], tot_loss[loss=0.01, over 22192.90 frames. ], batch size: 4
|
||||
2023-07-27 12:51:27,221 INFO [train.py:444] Epoch 14, validation loss=0.01082, over 18067.00 frames.
|
||||
2023-07-27 12:51:27,970 INFO [train.py:422] Epoch 14, batch 20, loss[loss=0.01169, over 2695.00 frames. ], tot_loss[loss=0.01054, over 34971.47 frames. ], batch size: 5
|
||||
2023-07-27 12:51:28,247 INFO [train.py:444] Epoch 14, validation loss=0.01073, over 18067.00 frames.
|
||||
2023-07-27 12:51:28,323 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-14.pt
|
||||
2023-07-27 12:51:28,326 INFO [train.py:555] Done!
|
||||
|
||||
Decoding
|
||||
~~~~~~~~
|
||||
@ -487,42 +513,32 @@ Let us use the trained model to decode the test set:
|
||||
|
||||
.. code-block::
|
||||
|
||||
$ ./tdnn/decode.py
|
||||
(test-icefall) kuangfangjun:ASR$ ./tdnn/decode.py
|
||||
|
||||
The decoding log is:
|
||||
2023-07-27 12:55:12,840 INFO [decode.py:263] Decoding started
|
||||
2023-07-27 12:55:12,840 INFO [decode.py:264] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lm_dir': PosixPath('data/lm'), 'feature_dim': 23, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'epoch': 14, 'avg': 2, 'export': False, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True, 'num_workers': 2, 'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '4c05309499a08454997adf500b56dcc629e35ae5', 'k2-git-date': 'Tue Jul 25 16:23:36 2023', 'lhotse-version': '1.16.0.dev+git.7640d66.clean', 'torch-version': '1.13.0+cu116', 'torch-cuda-available': False, 'torch-cuda-version': '11.6', 'python-version': '3.8', 'icefall-git-branch': 'master', 'icefall-git-sha1': '3fb0a43-clean', 'icefall-git-date': 'Thu Jul 27 12:36:05 2023', 'icefall-path': '/tmp/icefall', 'k2-path': '/star-fj/fangjun/test-icefall/lib/python3.8/site-packages/k2/__init__.py', 'lhotse-path': '/star-fj/fangjun/test-icefall/lib/python3.8/site-packages/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-1-1220091118-57c4d55446-sph26', 'IP address': '10.177.77.20'}}
|
||||
2023-07-27 12:55:12,841 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
|
||||
2023-07-27 12:55:12,855 INFO [decode.py:273] device: cpu
|
||||
2023-07-27 12:55:12,868 INFO [decode.py:291] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
|
||||
2023-07-27 12:55:12,882 INFO [asr_datamodule.py:218] About to get test cuts
|
||||
2023-07-27 12:55:12,883 INFO [asr_datamodule.py:252] About to get test cuts
|
||||
2023-07-27 12:55:13,157 INFO [decode.py:204] batch 0/?, cuts processed until now is 4
|
||||
2023-07-27 12:55:13,701 INFO [decode.py:241] The transcripts are stored in tdnn/exp/recogs-test_set.txt
|
||||
2023-07-27 12:55:13,702 INFO [utils.py:564] [test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
|
||||
2023-07-27 12:55:13,704 INFO [decode.py:249] Wrote detailed error stats to tdnn/exp/errs-test_set.txt
|
||||
2023-07-27 12:55:13,704 INFO [decode.py:316] Done!
|
||||
|
||||
.. code-block::
|
||||
|
||||
2023-05-12 18:08:30,482 INFO [decode.py:263] Decoding started
|
||||
2023-05-12 18:08:30,483 INFO [decode.py:264] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lm_dir': PosixPath('data/lm'), 'feature_dim': 23,
|
||||
'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'epoch': 14, 'avg': 2, 'export': False, 'feature_dir': PosixPath('data/fbank'),
|
||||
'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True,
|
||||
'num_workers': 2, 'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '3b7f09fa35e72589914f67089c0da9f196a92ca4', 'k2-git-date': 'Mon May 8 22:58:45 2023',
|
||||
'lhotse-version': '1.15.0.dev+git.6fcfced.clean', 'torch-version': '2.0.0+cu118', 'torch-cuda-available': False, 'torch-cuda-version': '11.8', 'python-version': '3.1', 'icefall-git-branch': 'master',
|
||||
'icefall-git-sha1': '30bde4b-clean', 'icefall-git-date': 'Thu May 11 17:37:47 2023', 'icefall-path': '/tmp/icefall',
|
||||
'k2-path': '/tmp/lib/python3.10/site-packages/k2-1.24.3.dev20230512+cuda11.8.torch2.0.0-py3.10-linux-x86_64.egg/k2/__init__.py',
|
||||
'lhotse-path': '/tmp/lib/python3.10/site-packages/lhotse/__init__.py', 'hostname': 'host', 'IP address': '0.0.0.0'}}
|
||||
2023-05-12 18:08:30,483 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
|
||||
2023-05-12 18:08:30,487 INFO [decode.py:273] device: cpu
|
||||
2023-05-12 18:08:30,513 INFO [decode.py:291] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
|
||||
2023-05-12 18:08:30,521 INFO [asr_datamodule.py:218] About to get test cuts
|
||||
2023-05-12 18:08:30,521 INFO [asr_datamodule.py:252] About to get test cuts
|
||||
2023-05-12 18:08:30,675 INFO [decode.py:204] batch 0/?, cuts processed until now is 4
|
||||
2023-05-12 18:08:30,923 INFO [decode.py:241] The transcripts are stored in tdnn/exp/recogs-test_set.txt
|
||||
2023-05-12 18:08:30,924 INFO [utils.py:558] [test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
|
||||
2023-05-12 18:08:30,925 INFO [decode.py:249] Wrote detailed error stats to tdnn/exp/errs-test_set.txt
|
||||
2023-05-12 18:08:30,925 INFO [decode.py:316] Done!
|
||||
|
||||
**Congratulations!** You have successfully setup the environment and have run the first recipe in ``icefall``.
|
||||
**Congratulations!** You have successfully setup the environment and have run the first recipe in `icefall`_.
|
||||
|
||||
Have fun with ``icefall``!
|
||||
|
||||
YouTube Video
|
||||
-------------
|
||||
|
||||
We provide the following YouTube video showing how to install ``icefall``.
|
||||
We provide the following YouTube video showing how to install `icefall`_.
|
||||
It also shows how to debug various problems that you may encounter while
|
||||
using ``icefall``.
|
||||
using `icefall`_.
|
||||
|
||||
.. note::
|
||||
|
||||
|
@ -41,7 +41,7 @@ as an example.
|
||||
|
||||
./pruned_transducer_stateless3/export.py \
|
||||
--exp-dir ./pruned_transducer_stateless3/exp \
|
||||
--bpe-model data/lang_bpe_500/bpe.model \
|
||||
--tokens data/lang_bpe_500/tokens.txt \
|
||||
--epoch 20 \
|
||||
--avg 10
|
||||
|
||||
@ -78,7 +78,7 @@ In each recipe, there is also a file ``pretrained.py``, which can use
|
||||
|
||||
./pruned_transducer_stateless3/pretrained.py \
|
||||
--checkpoint ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/pretrained-iter-1224000-avg-14.pt \
|
||||
--bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/bpe.model \
|
||||
--tokens ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \
|
||||
--method greedy_search \
|
||||
./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav \
|
||||
./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav \
|
||||
|
@ -153,11 +153,10 @@ Next, we use the following code to export our model:
|
||||
|
||||
./conv_emformer_transducer_stateless2/export-for-ncnn.py \
|
||||
--exp-dir $dir/exp \
|
||||
--bpe-model $dir/data/lang_bpe_500/bpe.model \
|
||||
--tokens $dir/data/lang_bpe_500/tokens.txt \
|
||||
--epoch 30 \
|
||||
--avg 1 \
|
||||
--use-averaged-model 0 \
|
||||
\
|
||||
--num-encoder-layers 12 \
|
||||
--chunk-length 32 \
|
||||
--cnn-module-kernel 31 \
|
||||
|
@ -73,7 +73,7 @@ Next, we use the following code to export our model:
|
||||
|
||||
./lstm_transducer_stateless2/export-for-ncnn.py \
|
||||
--exp-dir $dir/exp \
|
||||
--bpe-model $dir/data/lang_bpe_500/bpe.model \
|
||||
--tokens $dir/data/lang_bpe_500/tokens.txt \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--use-averaged-model 0 \
|
||||
|
@ -72,12 +72,11 @@ Next, we use the following code to export our model:
|
||||
dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
|
||||
|
||||
./pruned_transducer_stateless7_streaming/export-for-ncnn.py \
|
||||
--bpe-model $dir/data/lang_bpe_500/bpe.model \
|
||||
--tokens $dir/data/lang_bpe_500/tokens.txt \
|
||||
--exp-dir $dir/exp \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
\
|
||||
--decode-chunk-len 32 \
|
||||
--num-left-chunks 4 \
|
||||
--num-encoder-layers "2,4,3,2,4" \
|
||||
|
@ -1,3 +1,5 @@
|
||||
.. _icefall_export_to_ncnn:
|
||||
|
||||
Export to ncnn
|
||||
==============
|
||||
|
||||
|
@ -71,7 +71,7 @@ Export the model to ONNX
|
||||
.. code-block:: bash
|
||||
|
||||
./pruned_transducer_stateless7_streaming/export-onnx.py \
|
||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
||||
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
|
@ -32,7 +32,7 @@ as an example in the following.
|
||||
|
||||
./pruned_transducer_stateless3/export.py \
|
||||
--exp-dir ./pruned_transducer_stateless3/exp \
|
||||
--bpe-model data/lang_bpe_500/bpe.model \
|
||||
--tokens data/lang_bpe_500/tokens.txt \
|
||||
--epoch $epoch \
|
||||
--avg $avg \
|
||||
--jit 1
|
||||
|
@ -33,7 +33,7 @@ as an example in the following.
|
||||
|
||||
./lstm_transducer_stateless2/export.py \
|
||||
--exp-dir ./lstm_transducer_stateless2/exp \
|
||||
--bpe-model data/lang_bpe_500/bpe.model \
|
||||
--tokens data/lang_bpe_500/tokens.txt \
|
||||
--iter $iter \
|
||||
--avg $avg \
|
||||
--jit-trace 1
|
||||
|
@ -47,7 +47,7 @@ The data preparation contains several stages, you can use the following two
|
||||
options:
|
||||
|
||||
- ``--stage``
|
||||
- ``--stop-stage``
|
||||
- ``--stop_stage``
|
||||
|
||||
to control which stage(s) should be run. By default, all stages are executed.
|
||||
|
||||
@ -56,8 +56,8 @@ For example,
|
||||
.. code-block:: bash
|
||||
|
||||
$ cd egs/librispeech/ASR
|
||||
$ ./prepare.sh --stage 0 --stop-stage 0 # run only stage 0
|
||||
$ ./prepare.sh --stage 2 --stop-stage 5 # run from stage 2 to stage 5
|
||||
$ ./prepare.sh --stage 0 --stop_stage 0 # run only stage 0
|
||||
$ ./prepare.sh --stage 2 --stop_stage 5 # run from stage 2 to stage 5
|
||||
|
||||
.. HINT::
|
||||
|
||||
@ -108,15 +108,15 @@ As usual, you can control the stages you want to run by specifying the following
|
||||
two options:
|
||||
|
||||
- ``--stage``
|
||||
- ``--stop-stage``
|
||||
- ``--stop_stage``
|
||||
|
||||
For example,
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ cd egs/librispeech/ASR
|
||||
$ ./distillation_with_hubert.sh --stage 0 --stop-stage 0 # run only stage 0
|
||||
$ ./distillation_with_hubert.sh --stage 2 --stop-stage 4 # run from stage 2 to stage 5
|
||||
$ ./distillation_with_hubert.sh --stage 0 --stop_stage 0 # run only stage 0
|
||||
$ ./distillation_with_hubert.sh --stage 2 --stop_stage 4 # run from stage 2 to stage 5
|
||||
|
||||
Here are a few options in `./distillation_with_hubert.sh <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/distillation_with_hubert.sh>`_
|
||||
you need to know before you proceed.
|
||||
@ -134,7 +134,7 @@ and prepares MVQ-augmented training manifests.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ ./distillation_with_hubert.sh --stage 2 --stop-stage 2 # run only stage 2
|
||||
$ ./distillation_with_hubert.sh --stage 2 --stop_stage 2 # run only stage 2
|
||||
|
||||
Please see the
|
||||
following screenshot for the output of an example execution.
|
||||
@ -172,7 +172,7 @@ To perform training, please run stage 3 by executing the following command.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ ./prepare.sh --stage 3 --stop-stage 3 # run MVQ training
|
||||
$ ./prepare.sh --stage 3 --stop_stage 3 # run MVQ training
|
||||
|
||||
Here is the code snippet for training:
|
||||
|
||||
|
7
docs/source/recipes/RNN-LM/index.rst
Normal file
7
docs/source/recipes/RNN-LM/index.rst
Normal file
@ -0,0 +1,7 @@
|
||||
RNN-LM
|
||||
======
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
librispeech/lm-training
|
104
docs/source/recipes/RNN-LM/librispeech/lm-training.rst
Normal file
104
docs/source/recipes/RNN-LM/librispeech/lm-training.rst
Normal file
@ -0,0 +1,104 @@
|
||||
.. _train_nnlm:
|
||||
|
||||
Train an RNN langugage model
|
||||
======================================
|
||||
|
||||
If you have enough text data, you can train a neural network language model (NNLM) to improve
|
||||
the WER of your E2E ASR system. This tutorial shows you how to train an RNNLM from
|
||||
scratch.
|
||||
|
||||
.. HINT::
|
||||
|
||||
For how to use an NNLM during decoding, please refer to the following tutorials:
|
||||
:ref:`shallow_fusion`, :ref:`LODR`, :ref:`rescoring`
|
||||
|
||||
.. note::
|
||||
|
||||
This tutorial is based on the LibriSpeech recipe. Please check it out for the necessary
|
||||
python scripts for this tutorial. We use the LibriSpeech LM-corpus as the LM training set
|
||||
for illustration purpose. You can also collect your own data. The data format is quite simple:
|
||||
each line should contain a complete sentence, and words should be separated by space.
|
||||
|
||||
First, let's download the training data for the RNNLM. This can be done via the
|
||||
following command:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ wget https://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz
|
||||
$ gzip -d librispeech-lm-norm.txt.gz
|
||||
|
||||
As we are training a BPE-level RNNLM, we need to tokenize the training text, which requires a
|
||||
BPE tokenizer. This can be achieved by executing the following command:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ # if you don't have the BPE
|
||||
$ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
|
||||
$ cd icefall-asr-librispeech-zipformer-2023-05-15/data/lang_bpe_500
|
||||
$ git lfs pull --include bpe.model
|
||||
$ cd ../../..
|
||||
|
||||
$ ./local/prepare_lm_training_data.py \
|
||||
--bpe-model icefall-asr-librispeech-zipformer-2023-05-15/data/lang_bpe_500/bpe.model \
|
||||
--lm-data librispeech-lm-norm.txt \
|
||||
--lm-archive data/lang_bpe_500/lm_data.pt
|
||||
|
||||
Now, you should have a file name ``lm_data.pt`` file store under the directory ``data/lang_bpe_500``.
|
||||
This is the packed training data for the RNNLM. We then sort the training data according to its
|
||||
sentence length.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ # This could take a while (~ 20 minutes), feel free to grab a cup of coffee :)
|
||||
$ ./local/sort_lm_training_data.py \
|
||||
--in-lm-data data/lang_bpe_500/lm_data.pt \
|
||||
--out-lm-data data/lang_bpe_500/sorted_lm_data.pt \
|
||||
--out-statistics data/lang_bpe_500/lm_data_stats.txt
|
||||
|
||||
|
||||
The aforementioned steps can be repeated to create a a validation set for you RNNLM. Let's say
|
||||
you have a validation set in ``valid.txt``, you can just set ``--lm-data valid.txt``
|
||||
and ``--lm-archive data/lang_bpe_500/lm-data-valid.pt`` when calling ``./local/prepare_lm_training_data.py``.
|
||||
|
||||
After completing the previous steps, the training and testing sets for training RNNLM are ready.
|
||||
The next step is to train the RNNLM model. The training command is as follows:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ # assume you are in the icefall root directory
|
||||
$ cd rnn_lm
|
||||
$ ln -s ../../egs/librispeech/ASR/data .
|
||||
$ cd ..
|
||||
$ ./rnn_lm/train.py \
|
||||
--world-size 4 \
|
||||
--exp-dir ./rnn_lm/exp \
|
||||
--start-epoch 0 \
|
||||
--num-epochs 10 \
|
||||
--use-fp16 0 \
|
||||
--tie-weights 1 \
|
||||
--embedding-dim 2048 \
|
||||
--hidden_dim 2048 \
|
||||
--num-layers 3 \
|
||||
--batch-size 300 \
|
||||
--lm-data rnn_lm/data/lang_bpe_500/sorted_lm_data.pt \
|
||||
--lm-data-valid rnn_lm/data/lang_bpe_500/sorted_lm_data.pt
|
||||
|
||||
|
||||
.. note::
|
||||
|
||||
You can adjust the RNNLM hyper parameters to control the size of the RNNLM,
|
||||
such as embedding dimension and hidden state dimension. For more details, please
|
||||
run ``./rnn_lm/train.py --help``.
|
||||
|
||||
.. note::
|
||||
|
||||
The training of RNNLM can take a long time (usually a couple of days).
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -15,3 +15,4 @@ We may add recipes for other tasks as well in the future.
|
||||
|
||||
Non-streaming-ASR/index
|
||||
Streaming-ASR/index
|
||||
RNN-LM/index
|
||||
|
@ -32,7 +32,7 @@ import torch
|
||||
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
|
||||
from lhotse.recipes.utils import read_manifests_if_cached
|
||||
|
||||
from icefall.utils import get_executor
|
||||
from icefall.utils import get_executor, str2bool
|
||||
|
||||
# Torch's multithreaded behavior needs to be disabled or
|
||||
# it wastes a lot of CPU and slow things down.
|
||||
@ -42,7 +42,7 @@ torch.set_num_threads(1)
|
||||
torch.set_num_interop_threads(1)
|
||||
|
||||
|
||||
def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
|
||||
def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80, perturb_speed: bool = False):
|
||||
src_dir = Path("data/manifests/aidatatang_200zh")
|
||||
output_dir = Path("data/fbank")
|
||||
num_jobs = min(15, os.cpu_count())
|
||||
@ -85,7 +85,8 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
|
||||
recordings=m["recordings"],
|
||||
supervisions=m["supervisions"],
|
||||
)
|
||||
if "train" in partition:
|
||||
if "train" in partition and perturb_speed:
|
||||
logging.info(f"Doing speed perturb")
|
||||
cut_set = (
|
||||
cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
|
||||
)
|
||||
@ -109,7 +110,12 @@ def get_args():
|
||||
default=80,
|
||||
help="""The number of mel bins for Fbank""",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--perturb-speed",
|
||||
type=str2bool,
|
||||
default=False,
|
||||
help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
@ -119,4 +125,6 @@ if __name__ == "__main__":
|
||||
logging.basicConfig(format=formatter, level=logging.INFO)
|
||||
|
||||
args = get_args()
|
||||
compute_fbank_aidatatang_200zh(num_mel_bins=args.num_mel_bins)
|
||||
compute_fbank_aidatatang_200zh(
|
||||
num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed
|
||||
)
|
||||
|
@ -77,7 +77,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
|
||||
log "Stage 4: Compute fbank for aidatatang_200zh"
|
||||
if [ ! -f data/fbank/.aidatatang_200zh.done ]; then
|
||||
mkdir -p data/fbank
|
||||
./local/compute_fbank_aidatatang_200zh.py
|
||||
./local/compute_fbank_aidatatang_200zh.py --perturb-speed True
|
||||
touch data/fbank/.aidatatang_200zh.done
|
||||
fi
|
||||
fi
|
||||
|
@ -37,7 +37,7 @@ from lhotse.dataset import (
|
||||
DynamicBucketingSampler,
|
||||
K2SpeechRecognitionDataset,
|
||||
PrecomputedFeatures,
|
||||
SingleCutSampler,
|
||||
SimpleCutSampler,
|
||||
SpecAugment,
|
||||
)
|
||||
from lhotse.dataset.input_strategies import OnTheFlyFeatures
|
||||
@ -211,7 +211,7 @@ class Aidatatang_200zhAsrDataModule:
|
||||
if self.args.enable_musan:
|
||||
logging.info("Enable MUSAN")
|
||||
transforms.append(
|
||||
CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)
|
||||
CutMix(cuts=cuts_musan, p=0.5, snr=(10, 20), preserve_id=True)
|
||||
)
|
||||
else:
|
||||
logging.info("Disable MUSAN")
|
||||
@ -291,8 +291,8 @@ class Aidatatang_200zhAsrDataModule:
|
||||
drop_last=True,
|
||||
)
|
||||
else:
|
||||
logging.info("Using SingleCutSampler.")
|
||||
train_sampler = SingleCutSampler(
|
||||
logging.info("Using SimpleCutSampler.")
|
||||
train_sampler = SimpleCutSampler(
|
||||
cuts_train,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=self.args.shuffle,
|
||||
|
@ -635,7 +635,6 @@ def train_one_epoch(
|
||||
tot_loss = MetricsTracker()
|
||||
|
||||
for batch_idx, batch in enumerate(train_dl):
|
||||
|
||||
params.batch_idx_train += 1
|
||||
batch_size = len(batch["supervisions"]["text"])
|
||||
|
||||
@ -800,7 +799,7 @@ def run(rank, world_size, args):
|
||||
|
||||
if params.print_diagnostics:
|
||||
opts = diagnostics.TensorDiagnosticOptions(
|
||||
2**22
|
||||
512
|
||||
) # allow 4 megabytes per sub-module
|
||||
diagnostic = diagnostics.attach_diagnostics(model, opts)
|
||||
|
||||
|
@ -2,6 +2,56 @@
|
||||
|
||||
### Aishell training result(Stateless Transducer)
|
||||
|
||||
#### Pruned transducer stateless 7 streaming
|
||||
[./pruned_transducer_stateless7_streaming](./pruned_transducer_stateless7_streaming)
|
||||
|
||||
It's Streaming version of Zipformer1 with Pruned RNNT loss.
|
||||
|
||||
| | test | dev | comment |
|
||||
|------------------------|------|------|---------------------------------------|
|
||||
| greedy search | 6.95 | 6.29 | --epoch 44 --avg 15 --max-duration 600 |
|
||||
| modified beam search | 6.51 | 5.90 | --epoch 44 --avg 15 --max-duration 600 |
|
||||
| fast beam search | 6.73 | 6.09 | --epoch 44 --avg 15 --max-duration 600 |
|
||||
|
||||
Training command is:
|
||||
|
||||
```bash
|
||||
./prepare.sh
|
||||
|
||||
export CUDA_VISIBLE_DEVICES="0,1"
|
||||
|
||||
./pruned_transducer_stateless7_streaming/train.py \
|
||||
--world-size 2 \
|
||||
--num-epochs 50 \
|
||||
--use-fp16 1 \
|
||||
--context-size 1 \
|
||||
--max-duration 800 \
|
||||
--exp-dir ./pruned_transducer_stateless7_streaming/exp \
|
||||
--enable-musan 0 \
|
||||
--spec-aug-time-warp-factor 20
|
||||
```
|
||||
|
||||
**Caution**: It uses `--context-size=1`.
|
||||
|
||||
The decoding command is:
|
||||
```bash
|
||||
for m in greedy_search modified_beam_search fast_beam_search ; do
|
||||
./pruned_transducer_stateless7_streaming/decode.py \
|
||||
--epoch 44 \
|
||||
--avg 15 \
|
||||
--exp-dir ./pruned_transducer_stateless7_streaming/exp \
|
||||
--lang-dir data/lang_char \
|
||||
--context-size 1 \
|
||||
--decoding-method $m
|
||||
done
|
||||
```
|
||||
|
||||
Pretrained models, training logs, decoding logs, tensorboard and decoding results
|
||||
are available at
|
||||
<https://huggingface.co/zrjin/icefall-asr-aishell-zipformer-pruned-transducer-stateless7-streaming-2023-10-16/>
|
||||
|
||||
|
||||
|
||||
#### Pruned transducer stateless 7
|
||||
|
||||
[./pruned_transducer_stateless7](./pruned_transducer_stateless7)
|
||||
|
21
egs/aishell/ASR/conformer_ctc/export.py
Normal file → Executable file
21
egs/aishell/ASR/conformer_ctc/export.py
Normal file → Executable file
@ -23,12 +23,12 @@ import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import k2
|
||||
import torch
|
||||
from conformer import Conformer
|
||||
|
||||
from icefall.checkpoint import average_checkpoints, load_checkpoint
|
||||
from icefall.lexicon import Lexicon
|
||||
from icefall.utils import AttributeDict, str2bool
|
||||
from icefall.utils import AttributeDict, num_tokens, str2bool
|
||||
|
||||
|
||||
def get_parser():
|
||||
@ -63,11 +63,10 @@ def get_parser():
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--lang-dir",
|
||||
"--tokens",
|
||||
type=str,
|
||||
default="data/lang_char",
|
||||
help="""It contains language related input files such as "lexicon.txt"
|
||||
""",
|
||||
required=True,
|
||||
help="Path to the tokens.txt.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
@ -98,16 +97,16 @@ def get_params() -> AttributeDict:
|
||||
def main():
|
||||
args = get_parser().parse_args()
|
||||
args.exp_dir = Path(args.exp_dir)
|
||||
args.lang_dir = Path(args.lang_dir)
|
||||
|
||||
params = get_params()
|
||||
params.update(vars(args))
|
||||
|
||||
logging.info(params)
|
||||
# Load tokens.txt here
|
||||
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||
|
||||
lexicon = Lexicon(params.lang_dir)
|
||||
max_token_id = max(lexicon.tokens)
|
||||
num_classes = max_token_id + 1 # +1 for the blank
|
||||
num_classes = num_tokens(token_table) + 1 # +1 for the blank
|
||||
|
||||
logging.info(params)
|
||||
|
||||
device = torch.device("cpu")
|
||||
if torch.cuda.is_available():
|
||||
|
1
egs/aishell/ASR/conformer_ctc/jit_pretrained_decode_with_H.py
Symbolic link
1
egs/aishell/ASR/conformer_ctc/jit_pretrained_decode_with_H.py
Symbolic link
@ -0,0 +1 @@
|
||||
../../../librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_H.py
|
1
egs/aishell/ASR/conformer_ctc/jit_pretrained_decode_with_HL.py
Symbolic link
1
egs/aishell/ASR/conformer_ctc/jit_pretrained_decode_with_HL.py
Symbolic link
@ -0,0 +1 @@
|
||||
../../../librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_HL.py
|
1
egs/aishell/ASR/conformer_ctc/jit_pretrained_decode_with_HLG.py
Symbolic link
1
egs/aishell/ASR/conformer_ctc/jit_pretrained_decode_with_HLG.py
Symbolic link
@ -0,0 +1 @@
|
||||
../../../librispeech/ASR/conformer_ctc/jit_pretrained_decode_with_HLG.py
|
0
egs/aishell/ASR/conformer_ctc/test_transformer.py
Normal file → Executable file
0
egs/aishell/ASR/conformer_ctc/test_transformer.py
Normal file → Executable file
@ -32,7 +32,7 @@ import torch
|
||||
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
|
||||
from lhotse.recipes.utils import read_manifests_if_cached
|
||||
|
||||
from icefall.utils import get_executor
|
||||
from icefall.utils import get_executor, str2bool
|
||||
|
||||
# Torch's multithreaded behavior needs to be disabled or
|
||||
# it wastes a lot of CPU and slow things down.
|
||||
@ -42,7 +42,7 @@ torch.set_num_threads(1)
|
||||
torch.set_num_interop_threads(1)
|
||||
|
||||
|
||||
def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
|
||||
def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80, perturb_speed: bool = False):
|
||||
src_dir = Path("data/manifests")
|
||||
output_dir = Path("data/fbank")
|
||||
num_jobs = min(15, os.cpu_count())
|
||||
@ -85,7 +85,8 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
|
||||
recordings=m["recordings"],
|
||||
supervisions=m["supervisions"],
|
||||
)
|
||||
if "train" in partition:
|
||||
if "train" in partition and perturb_speed:
|
||||
logging.info(f"Doing speed perturb")
|
||||
cut_set = (
|
||||
cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
|
||||
)
|
||||
@ -109,7 +110,12 @@ def get_args():
|
||||
default=80,
|
||||
help="""The number of mel bins for Fbank""",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--perturb-speed",
|
||||
type=str2bool,
|
||||
default=False,
|
||||
help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
@ -119,4 +125,6 @@ if __name__ == "__main__":
|
||||
logging.basicConfig(format=formatter, level=logging.INFO)
|
||||
|
||||
args = get_args()
|
||||
compute_fbank_aidatatang_200zh(num_mel_bins=args.num_mel_bins)
|
||||
compute_fbank_aidatatang_200zh(
|
||||
num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed
|
||||
)
|
||||
|
@ -32,7 +32,7 @@ import torch
|
||||
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
|
||||
from lhotse.recipes.utils import read_manifests_if_cached
|
||||
|
||||
from icefall.utils import get_executor
|
||||
from icefall.utils import get_executor, str2bool
|
||||
|
||||
# Torch's multithreaded behavior needs to be disabled or
|
||||
# it wastes a lot of CPU and slow things down.
|
||||
@ -42,7 +42,7 @@ torch.set_num_threads(1)
|
||||
torch.set_num_interop_threads(1)
|
||||
|
||||
|
||||
def compute_fbank_aishell(num_mel_bins: int = 80):
|
||||
def compute_fbank_aishell(num_mel_bins: int = 80, perturb_speed: bool = False):
|
||||
src_dir = Path("data/manifests")
|
||||
output_dir = Path("data/fbank")
|
||||
num_jobs = min(15, os.cpu_count())
|
||||
@ -81,7 +81,8 @@ def compute_fbank_aishell(num_mel_bins: int = 80):
|
||||
recordings=m["recordings"],
|
||||
supervisions=m["supervisions"],
|
||||
)
|
||||
if "train" in partition:
|
||||
if "train" in partition and perturb_speed:
|
||||
logging.info(f"Doing speed perturb")
|
||||
cut_set = (
|
||||
cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
|
||||
)
|
||||
@ -104,7 +105,12 @@ def get_args():
|
||||
default=80,
|
||||
help="""The number of mel bins for Fbank""",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--perturb-speed",
|
||||
type=str2bool,
|
||||
default=False,
|
||||
help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
@ -114,4 +120,6 @@ if __name__ == "__main__":
|
||||
logging.basicConfig(format=formatter, level=logging.INFO)
|
||||
|
||||
args = get_args()
|
||||
compute_fbank_aishell(num_mel_bins=args.num_mel_bins)
|
||||
compute_fbank_aishell(
|
||||
num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed
|
||||
)
|
||||
|
1
egs/aishell/ASR/local/prepare_lang_fst.py
Symbolic link
1
egs/aishell/ASR/local/prepare_lang_fst.py
Symbolic link
@ -0,0 +1 @@
|
||||
../../../librispeech/ASR/local/prepare_lang_fst.py
|
@ -15,7 +15,6 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
# You can install sentencepiece via:
|
||||
#
|
||||
# pip install sentencepiece
|
||||
@ -26,12 +25,12 @@
|
||||
# Please install a version >=0.1.96
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import shutil
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import sentencepiece as spm
|
||||
|
||||
from icefall import byte_encode, tokenize_by_CJK_char
|
||||
|
||||
|
||||
@ -74,6 +73,11 @@ def main():
|
||||
model_type = "unigram"
|
||||
|
||||
model_prefix = f"{lang_dir}/{model_type}_{vocab_size}"
|
||||
model_file = Path(model_prefix + ".model")
|
||||
if model_file.is_file():
|
||||
print(f"{model_file} exists - skipping")
|
||||
return
|
||||
|
||||
character_coverage = 1.0
|
||||
input_sentence_size = 100000000
|
||||
|
||||
@ -88,8 +92,6 @@ def main():
|
||||
|
||||
_convert_to_bchar(args.transcript, train_text)
|
||||
|
||||
model_file = Path(model_prefix + ".model")
|
||||
if not model_file.is_file():
|
||||
spm.SentencePieceTrainer.train(
|
||||
input=train_text,
|
||||
vocab_size=vocab_size,
|
||||
@ -102,9 +104,6 @@ def main():
|
||||
bos_id=-1,
|
||||
eos_id=-1,
|
||||
)
|
||||
else:
|
||||
print(f"{model_file} exists - skipping")
|
||||
return
|
||||
|
||||
shutil.copyfile(model_file, f"{lang_dir}/bbpe.model")
|
||||
|
||||
|
@ -114,7 +114,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
||||
log "Stage 3: Compute fbank for aishell"
|
||||
if [ ! -f data/fbank/.aishell.done ]; then
|
||||
mkdir -p data/fbank
|
||||
./local/compute_fbank_aishell.py
|
||||
./local/compute_fbank_aishell.py --perturb-speed True
|
||||
touch data/fbank/.aishell.done
|
||||
fi
|
||||
fi
|
||||
@ -143,6 +143,7 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
||||
./local/prepare_lang.py --lang-dir $lang_phone_dir
|
||||
fi
|
||||
|
||||
|
||||
# Train a bigram P for MMI training
|
||||
if [ ! -f $lang_phone_dir/transcript_words.txt ]; then
|
||||
log "Generate data to train phone based bigram P"
|
||||
@ -257,6 +258,13 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
|
||||
--max-order=3 \
|
||||
data/lm/3-gram.unpruned.arpa > data/lm/G_3_gram_char.fst.txt
|
||||
fi
|
||||
|
||||
if [ ! -f $lang_char_dir/HLG.fst ]; then
|
||||
lang_phone_dir=data/lang_phone
|
||||
./local/prepare_lang_fst.py \
|
||||
--lang-dir $lang_phone_dir \
|
||||
--ngram-G ./data/lm/G_3_gram.fst.txt
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user