mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-12-11 06:55:27 +00:00
Merge branch 'k2-fsa:master' into dev_swbd
This commit is contained in:
commit
ba480b7b5f
@ -53,7 +53,7 @@ log "Export to torchscript model"
|
|||||||
|
|
||||||
./conformer_ctc3/export.py \
|
./conformer_ctc3/export.py \
|
||||||
--exp-dir $repo/exp \
|
--exp-dir $repo/exp \
|
||||||
--lang-dir $repo/data/lang_bpe_500 \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--jit-trace 1 \
|
--jit-trace 1 \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
@ -82,7 +82,7 @@ for m in ctc-decoding 1best; do
|
|||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--words-file $repo/data/lang_bpe_500/words.txt \
|
--words-file $repo/data/lang_bpe_500/words.txt \
|
||||||
--HLG $repo/data/lang_bpe_500/HLG.pt \
|
--HLG $repo/data/lang_bpe_500/HLG.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--G $repo/data/lm/G_4_gram.pt \
|
--G $repo/data/lm/G_4_gram.pt \
|
||||||
--method $m \
|
--method $m \
|
||||||
--sample-rate 16000 \
|
--sample-rate 16000 \
|
||||||
|
|||||||
@ -31,7 +31,7 @@ log "Test exporting with torch.jit.trace()"
|
|||||||
|
|
||||||
./lstm_transducer_stateless2/export.py \
|
./lstm_transducer_stateless2/export.py \
|
||||||
--exp-dir $repo/exp \
|
--exp-dir $repo/exp \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
@ -55,7 +55,7 @@ for sym in 1 2 3; do
|
|||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
--max-sym-per-frame $sym \
|
--max-sym-per-frame $sym \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
@ -68,7 +68,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
|||||||
--method $method \
|
--method $method \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
|
|||||||
@ -28,7 +28,7 @@ for sym in 1 2 3; do
|
|||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
--max-sym-per-frame $sym \
|
--max-sym-per-frame $sym \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
@ -41,7 +41,7 @@ for method in fast_beam_search modified_beam_search beam_search; do
|
|||||||
--method $method \
|
--method $method \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
|
|||||||
@ -36,7 +36,7 @@ for sym in 1 2 3; do
|
|||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
--max-sym-per-frame $sym \
|
--max-sym-per-frame $sym \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
@ -49,7 +49,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
|||||||
--method $method \
|
--method $method \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
|
|||||||
@ -35,7 +35,7 @@ for sym in 1 2 3; do
|
|||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
--max-sym-per-frame $sym \
|
--max-sym-per-frame $sym \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
@ -48,7 +48,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
|||||||
--method $method \
|
--method $method \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
|
|||||||
@ -30,14 +30,14 @@ popd
|
|||||||
log "Export to torchscript model"
|
log "Export to torchscript model"
|
||||||
./pruned_transducer_stateless3/export.py \
|
./pruned_transducer_stateless3/export.py \
|
||||||
--exp-dir $repo/exp \
|
--exp-dir $repo/exp \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
--jit 1
|
--jit 1
|
||||||
|
|
||||||
./pruned_transducer_stateless3/export.py \
|
./pruned_transducer_stateless3/export.py \
|
||||||
--exp-dir $repo/exp \
|
--exp-dir $repo/exp \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
--jit-trace 1
|
--jit-trace 1
|
||||||
@ -74,7 +74,7 @@ for sym in 1 2 3; do
|
|||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
--max-sym-per-frame $sym \
|
--max-sym-per-frame $sym \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
@ -87,7 +87,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
|||||||
--method $method \
|
--method $method \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
|
|||||||
@ -32,7 +32,7 @@ for sym in 1 2 3; do
|
|||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
--max-sym-per-frame $sym \
|
--max-sym-per-frame $sym \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--num-encoder-layers 18 \
|
--num-encoder-layers 18 \
|
||||||
--dim-feedforward 2048 \
|
--dim-feedforward 2048 \
|
||||||
--nhead 8 \
|
--nhead 8 \
|
||||||
@ -51,7 +51,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
|||||||
--method $method \
|
--method $method \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav \
|
$repo/test_wavs/1221-135766-0002.wav \
|
||||||
|
|||||||
@ -33,7 +33,7 @@ log "Export to torchscript model"
|
|||||||
./pruned_transducer_stateless7/export.py \
|
./pruned_transducer_stateless7/export.py \
|
||||||
--exp-dir $repo/exp \
|
--exp-dir $repo/exp \
|
||||||
--use-averaged-model false \
|
--use-averaged-model false \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
--jit 1
|
--jit 1
|
||||||
@ -56,7 +56,7 @@ for sym in 1 2 3; do
|
|||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
--max-sym-per-frame $sym \
|
--max-sym-per-frame $sym \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
@ -69,7 +69,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
|||||||
--method $method \
|
--method $method \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
|
|||||||
@ -37,7 +37,7 @@ log "Export to torchscript model"
|
|||||||
./pruned_transducer_stateless7_ctc/export.py \
|
./pruned_transducer_stateless7_ctc/export.py \
|
||||||
--exp-dir $repo/exp \
|
--exp-dir $repo/exp \
|
||||||
--use-averaged-model false \
|
--use-averaged-model false \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
--jit 1
|
--jit 1
|
||||||
@ -74,7 +74,7 @@ for sym in 1 2 3; do
|
|||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
--max-sym-per-frame $sym \
|
--max-sym-per-frame $sym \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
@ -87,7 +87,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
|||||||
--method $method \
|
--method $method \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
|
|||||||
@ -36,7 +36,7 @@ log "Export to torchscript model"
|
|||||||
./pruned_transducer_stateless7_ctc_bs/export.py \
|
./pruned_transducer_stateless7_ctc_bs/export.py \
|
||||||
--exp-dir $repo/exp \
|
--exp-dir $repo/exp \
|
||||||
--use-averaged-model false \
|
--use-averaged-model false \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
--jit 1
|
--jit 1
|
||||||
@ -72,7 +72,7 @@ for sym in 1 2 3; do
|
|||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
--max-sym-per-frame $sym \
|
--max-sym-per-frame $sym \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
@ -85,7 +85,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
|||||||
--method $method \
|
--method $method \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
|
|||||||
@ -37,7 +37,7 @@ log "Export to torchscript model"
|
|||||||
./pruned_transducer_stateless7_streaming/export.py \
|
./pruned_transducer_stateless7_streaming/export.py \
|
||||||
--exp-dir $repo/exp \
|
--exp-dir $repo/exp \
|
||||||
--use-averaged-model false \
|
--use-averaged-model false \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--decode-chunk-len 32 \
|
--decode-chunk-len 32 \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
@ -81,7 +81,7 @@ for sym in 1 2 3; do
|
|||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
--max-sym-per-frame $sym \
|
--max-sym-per-frame $sym \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--decode-chunk-len 32 \
|
--decode-chunk-len 32 \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
@ -95,7 +95,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
|||||||
--method $method \
|
--method $method \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--decode-chunk-len 32 \
|
--decode-chunk-len 32 \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
|
|||||||
@ -41,7 +41,7 @@ log "Decode with models exported by torch.jit.script()"
|
|||||||
log "Export to torchscript model"
|
log "Export to torchscript model"
|
||||||
./pruned_transducer_stateless8/export.py \
|
./pruned_transducer_stateless8/export.py \
|
||||||
--exp-dir $repo/exp \
|
--exp-dir $repo/exp \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--use-averaged-model false \
|
--use-averaged-model false \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
@ -65,7 +65,7 @@ for sym in 1 2 3; do
|
|||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
--max-sym-per-frame $sym \
|
--max-sym-per-frame $sym \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
@ -78,7 +78,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
|||||||
--method $method \
|
--method $method \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
|
|||||||
@ -32,7 +32,7 @@ for sym in 1 2 3; do
|
|||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
--max-sym-per-frame $sym \
|
--max-sym-per-frame $sym \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--simulate-streaming 1 \
|
--simulate-streaming 1 \
|
||||||
--causal-convolution 1 \
|
--causal-convolution 1 \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
@ -47,7 +47,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
|||||||
--method $method \
|
--method $method \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--simulate-streaming 1 \
|
--simulate-streaming 1 \
|
||||||
--causal-convolution 1 \
|
--causal-convolution 1 \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
|
|||||||
@ -28,7 +28,7 @@ for sym in 1 2 3; do
|
|||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
--max-sym-per-frame $sym \
|
--max-sym-per-frame $sym \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
@ -41,7 +41,7 @@ for method in fast_beam_search modified_beam_search beam_search; do
|
|||||||
--method $method \
|
--method $method \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
|
|||||||
@ -37,7 +37,7 @@ log "Export to torchscript model"
|
|||||||
./zipformer_mmi/export.py \
|
./zipformer_mmi/export.py \
|
||||||
--exp-dir $repo/exp \
|
--exp-dir $repo/exp \
|
||||||
--use-averaged-model false \
|
--use-averaged-model false \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
--jit 1
|
--jit 1
|
||||||
@ -61,7 +61,7 @@ for method in 1best nbest nbest-rescoring-LG nbest-rescoring-3-gram nbest-rescor
|
|||||||
--method $method \
|
--method $method \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--lang-dir $repo/data/lang_bpe_500 \
|
--lang-dir $repo/data/lang_bpe_500 \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
|
|||||||
@ -27,7 +27,7 @@ log "CTC decoding"
|
|||||||
--method ctc-decoding \
|
--method ctc-decoding \
|
||||||
--num-classes 500 \
|
--num-classes 500 \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.flac \
|
$repo/test_wavs/1089-134686-0001.flac \
|
||||||
$repo/test_wavs/1221-135766-0001.flac \
|
$repo/test_wavs/1221-135766-0001.flac \
|
||||||
$repo/test_wavs/1221-135766-0002.flac
|
$repo/test_wavs/1221-135766-0002.flac
|
||||||
@ -38,7 +38,7 @@ log "HLG decoding"
|
|||||||
--method 1best \
|
--method 1best \
|
||||||
--num-classes 500 \
|
--num-classes 500 \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--words-file $repo/data/lang_bpe_500/words.txt \
|
--words-file $repo/data/lang_bpe_500/words.txt \
|
||||||
--HLG $repo/data/lang_bpe_500/HLG.pt \
|
--HLG $repo/data/lang_bpe_500/HLG.pt \
|
||||||
$repo/test_wavs/1089-134686-0001.flac \
|
$repo/test_wavs/1089-134686-0001.flac \
|
||||||
|
|||||||
@ -28,7 +28,7 @@ for sym in 1 2 3; do
|
|||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
--max-sym-per-frame $sym \
|
--max-sym-per-frame $sym \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
@ -41,7 +41,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
|||||||
--method $method \
|
--method $method \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
|
|||||||
@ -28,7 +28,7 @@ for sym in 1 2 3; do
|
|||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
--max-sym-per-frame $sym \
|
--max-sym-per-frame $sym \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
@ -41,7 +41,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
|
|||||||
--method $method \
|
--method $method \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
|
|||||||
@ -28,7 +28,7 @@ for sym in 1 2 3; do
|
|||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
--max-sym-per-frame $sym \
|
--max-sym-per-frame $sym \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
@ -41,7 +41,7 @@ for method in fast_beam_search modified_beam_search beam_search; do
|
|||||||
--method $method \
|
--method $method \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
|
|||||||
@ -27,7 +27,7 @@ log "Beam search decoding"
|
|||||||
--method beam_search \
|
--method beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
--checkpoint $repo/exp/pretrained.pt \
|
--checkpoint $repo/exp/pretrained.pt \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
$repo/test_wavs/1089-134686-0001.wav \
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0001.wav \
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
$repo/test_wavs/1221-135766-0002.wav
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
|
|||||||
@ -17,7 +17,6 @@ git lfs install
|
|||||||
git clone $repo_url
|
git clone $repo_url
|
||||||
repo=$(basename $repo_url)
|
repo=$(basename $repo_url)
|
||||||
|
|
||||||
|
|
||||||
log "Display test files"
|
log "Display test files"
|
||||||
tree $repo/
|
tree $repo/
|
||||||
ls -lh $repo/test_wavs/*.wav
|
ls -lh $repo/test_wavs/*.wav
|
||||||
@ -29,12 +28,11 @@ popd
|
|||||||
|
|
||||||
log "Test exporting to ONNX format"
|
log "Test exporting to ONNX format"
|
||||||
|
|
||||||
./pruned_transducer_stateless2/export.py \
|
./pruned_transducer_stateless2/export-onnx.py \
|
||||||
--exp-dir $repo/exp \
|
--exp-dir $repo/exp \
|
||||||
--lang-dir $repo/data/lang_char \
|
--lang-dir $repo/data/lang_char \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1
|
||||||
--onnx 1
|
|
||||||
|
|
||||||
log "Export to torchscript model"
|
log "Export to torchscript model"
|
||||||
|
|
||||||
@ -59,19 +57,17 @@ log "Decode with ONNX models"
|
|||||||
|
|
||||||
./pruned_transducer_stateless2/onnx_check.py \
|
./pruned_transducer_stateless2/onnx_check.py \
|
||||||
--jit-filename $repo/exp/cpu_jit.pt \
|
--jit-filename $repo/exp/cpu_jit.pt \
|
||||||
--onnx-encoder-filename $repo/exp/encoder.onnx \
|
--onnx-encoder-filename $repo/exp/encoder-epoch-10-avg-2.onnx \
|
||||||
--onnx-decoder-filename $repo/exp/decoder.onnx \
|
--onnx-decoder-filename $repo/exp/decoder-epoch-10-avg-2.onnx \
|
||||||
--onnx-joiner-filename $repo/exp/joiner.onnx \
|
--onnx-joiner-filename $repo/exp/joiner-epoch-10-avg-2.onnx \
|
||||||
--onnx-joiner-encoder-proj-filename $repo/exp/joiner_encoder_proj.onnx \
|
--onnx-joiner-encoder-proj-filename $repo/exp/joiner_encoder_proj-epoch-10-avg-2.onnx \
|
||||||
--onnx-joiner-decoder-proj-filename $repo/exp/joiner_decoder_proj.onnx
|
--onnx-joiner-decoder-proj-filename $repo/exp/joiner_decoder_proj-epoch-10-avg-2.onnx
|
||||||
|
|
||||||
./pruned_transducer_stateless2/onnx_pretrained.py \
|
./pruned_transducer_stateless2/onnx_pretrained.py \
|
||||||
--tokens $repo/data/lang_char/tokens.txt \
|
--tokens $repo/data/lang_char/tokens.txt \
|
||||||
--encoder-model-filename $repo/exp/encoder.onnx \
|
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
|
||||||
--decoder-model-filename $repo/exp/decoder.onnx \
|
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
|
||||||
--joiner-model-filename $repo/exp/joiner.onnx \
|
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
|
||||||
--joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
|
|
||||||
--joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
|
|
||||||
$repo/test_wavs/DEV_T0000000000.wav \
|
$repo/test_wavs/DEV_T0000000000.wav \
|
||||||
$repo/test_wavs/DEV_T0000000001.wav \
|
$repo/test_wavs/DEV_T0000000001.wav \
|
||||||
$repo/test_wavs/DEV_T0000000002.wav
|
$repo/test_wavs/DEV_T0000000002.wav
|
||||||
|
|||||||
12
.github/scripts/test-ncnn-export.sh
vendored
12
.github/scripts/test-ncnn-export.sh
vendored
@ -45,7 +45,6 @@ GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
|||||||
repo=$(basename $repo_url)
|
repo=$(basename $repo_url)
|
||||||
|
|
||||||
pushd $repo
|
pushd $repo
|
||||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
|
||||||
git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt"
|
git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt"
|
||||||
|
|
||||||
cd exp
|
cd exp
|
||||||
@ -56,11 +55,10 @@ log "Export via torch.jit.trace()"
|
|||||||
|
|
||||||
./conv_emformer_transducer_stateless2/export-for-ncnn.py \
|
./conv_emformer_transducer_stateless2/export-for-ncnn.py \
|
||||||
--exp-dir $repo/exp \
|
--exp-dir $repo/exp \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
\
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--num-encoder-layers 12 \
|
--num-encoder-layers 12 \
|
||||||
--chunk-length 32 \
|
--chunk-length 32 \
|
||||||
--cnn-module-kernel 31 \
|
--cnn-module-kernel 31 \
|
||||||
@ -91,7 +89,6 @@ GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
|||||||
repo=$(basename $repo_url)
|
repo=$(basename $repo_url)
|
||||||
|
|
||||||
pushd $repo
|
pushd $repo
|
||||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
|
||||||
git lfs pull --include "exp/pretrained-iter-468000-avg-16.pt"
|
git lfs pull --include "exp/pretrained-iter-468000-avg-16.pt"
|
||||||
|
|
||||||
cd exp
|
cd exp
|
||||||
@ -102,7 +99,7 @@ log "Export via torch.jit.trace()"
|
|||||||
|
|
||||||
./lstm_transducer_stateless2/export-for-ncnn.py \
|
./lstm_transducer_stateless2/export-for-ncnn.py \
|
||||||
--exp-dir $repo/exp \
|
--exp-dir $repo/exp \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
--use-averaged-model 0
|
--use-averaged-model 0
|
||||||
@ -140,7 +137,6 @@ GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
|||||||
repo=$(basename $repo_url)
|
repo=$(basename $repo_url)
|
||||||
|
|
||||||
pushd $repo
|
pushd $repo
|
||||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
|
||||||
git lfs pull --include "exp/pretrained.pt"
|
git lfs pull --include "exp/pretrained.pt"
|
||||||
|
|
||||||
cd exp
|
cd exp
|
||||||
@ -148,7 +144,7 @@ ln -s pretrained.pt epoch-99.pt
|
|||||||
popd
|
popd
|
||||||
|
|
||||||
./pruned_transducer_stateless7_streaming/export-for-ncnn.py \
|
./pruned_transducer_stateless7_streaming/export-for-ncnn.py \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--exp-dir $repo/exp \
|
--exp-dir $repo/exp \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
@ -199,7 +195,7 @@ ln -s pretrained.pt epoch-9999.pt
|
|||||||
popd
|
popd
|
||||||
|
|
||||||
./pruned_transducer_stateless7_streaming/export-for-ncnn-zh.py \
|
./pruned_transducer_stateless7_streaming/export-for-ncnn-zh.py \
|
||||||
--lang-dir $repo/data/lang_char_bpe \
|
--tokens $repo/data/lang_char_bpe/tokens.txt \
|
||||||
--exp-dir $repo/exp \
|
--exp-dir $repo/exp \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
--epoch 9999 \
|
--epoch 9999 \
|
||||||
|
|||||||
138
.github/scripts/test-onnx-export.sh
vendored
138
.github/scripts/test-onnx-export.sh
vendored
@ -10,7 +10,123 @@ log() {
|
|||||||
|
|
||||||
cd egs/librispeech/ASR
|
cd egs/librispeech/ASR
|
||||||
|
|
||||||
|
log "=========================================================================="
|
||||||
|
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
|
||||||
|
log "Downloading pre-trained model from $repo_url"
|
||||||
|
git lfs install
|
||||||
|
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||||
|
repo=$(basename $repo_url)
|
||||||
|
|
||||||
|
pushd $repo
|
||||||
|
git lfs pull --include "exp/pretrained.pt"
|
||||||
|
cd exp
|
||||||
|
ln -s pretrained.pt epoch-99.pt
|
||||||
|
popd
|
||||||
|
|
||||||
|
log "Export via torch.jit.script()"
|
||||||
|
./zipformer/export.py \
|
||||||
|
--exp-dir $repo/exp \
|
||||||
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
|
--epoch 99 \
|
||||||
|
--avg 1 \
|
||||||
|
--jit 1
|
||||||
|
|
||||||
|
log "Test export to ONNX format"
|
||||||
|
./zipformer/export-onnx.py \
|
||||||
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
|
--use-averaged-model 0 \
|
||||||
|
--epoch 99 \
|
||||||
|
--avg 1 \
|
||||||
|
--exp-dir $repo/exp \
|
||||||
|
--num-encoder-layers "2,2,3,4,3,2" \
|
||||||
|
--downsampling-factor "1,2,4,8,4,2" \
|
||||||
|
--feedforward-dim "512,768,1024,1536,1024,768" \
|
||||||
|
--num-heads "4,4,4,8,4,4" \
|
||||||
|
--encoder-dim "192,256,384,512,384,256" \
|
||||||
|
--query-head-dim 32 \
|
||||||
|
--value-head-dim 12 \
|
||||||
|
--pos-head-dim 4 \
|
||||||
|
--pos-dim 48 \
|
||||||
|
--encoder-unmasked-dim "192,192,256,256,256,192" \
|
||||||
|
--cnn-module-kernel "31,31,15,15,15,31" \
|
||||||
|
--decoder-dim 512 \
|
||||||
|
--joiner-dim 512 \
|
||||||
|
--causal False \
|
||||||
|
--chunk-size "16,32,64,-1" \
|
||||||
|
--left-context-frames "64,128,256,-1"
|
||||||
|
|
||||||
|
ls -lh $repo/exp
|
||||||
|
|
||||||
|
log "Run onnx_check.py"
|
||||||
|
|
||||||
|
./zipformer/onnx_check.py \
|
||||||
|
--jit-filename $repo/exp/jit_script.pt \
|
||||||
|
--onnx-encoder-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
|
||||||
|
--onnx-decoder-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
|
||||||
|
--onnx-joiner-filename $repo/exp/joiner-epoch-99-avg-1.onnx
|
||||||
|
|
||||||
|
log "Run onnx_pretrained.py"
|
||||||
|
|
||||||
|
./zipformer/onnx_pretrained.py \
|
||||||
|
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
|
||||||
|
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
|
||||||
|
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
|
||||||
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
|
$repo/test_wavs/1089-134686-0001.wav
|
||||||
|
|
||||||
|
rm -rf $repo
|
||||||
|
|
||||||
|
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17
|
||||||
|
log "Downloading pre-trained model from $repo_url"
|
||||||
|
git lfs install
|
||||||
|
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||||
|
repo=$(basename $repo_url)
|
||||||
|
|
||||||
|
pushd $repo
|
||||||
|
git lfs pull --include "exp/pretrained.pt"
|
||||||
|
|
||||||
|
cd exp
|
||||||
|
ln -s pretrained.pt epoch-99.pt
|
||||||
|
popd
|
||||||
|
|
||||||
|
log "Test export streaming model to ONNX format"
|
||||||
|
./zipformer/export-onnx-streaming.py \
|
||||||
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
|
--use-averaged-model 0 \
|
||||||
|
--epoch 99 \
|
||||||
|
--avg 1 \
|
||||||
|
--exp-dir $repo/exp \
|
||||||
|
--num-encoder-layers "2,2,3,4,3,2" \
|
||||||
|
--downsampling-factor "1,2,4,8,4,2" \
|
||||||
|
--feedforward-dim "512,768,1024,1536,1024,768" \
|
||||||
|
--num-heads "4,4,4,8,4,4" \
|
||||||
|
--encoder-dim "192,256,384,512,384,256" \
|
||||||
|
--query-head-dim 32 \
|
||||||
|
--value-head-dim 12 \
|
||||||
|
--pos-head-dim 4 \
|
||||||
|
--pos-dim 48 \
|
||||||
|
--encoder-unmasked-dim "192,192,256,256,256,192" \
|
||||||
|
--cnn-module-kernel "31,31,15,15,15,31" \
|
||||||
|
--decoder-dim 512 \
|
||||||
|
--joiner-dim 512 \
|
||||||
|
--causal True \
|
||||||
|
--chunk-size 16 \
|
||||||
|
--left-context-frames 64
|
||||||
|
|
||||||
|
ls -lh $repo/exp
|
||||||
|
|
||||||
|
log "Run onnx_pretrained-streaming.py"
|
||||||
|
|
||||||
|
./zipformer/onnx_pretrained-streaming.py \
|
||||||
|
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1-chunk-16-left-64.onnx \
|
||||||
|
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1-chunk-16-left-64.onnx \
|
||||||
|
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1-chunk-16-left-64.onnx \
|
||||||
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
|
$repo/test_wavs/1089-134686-0001.wav
|
||||||
|
|
||||||
|
rm -rf $repo
|
||||||
|
|
||||||
|
log "--------------------------------------------------------------------------"
|
||||||
|
|
||||||
log "=========================================================================="
|
log "=========================================================================="
|
||||||
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
|
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
|
||||||
@ -39,7 +155,7 @@ log "Export via torch.jit.trace()"
|
|||||||
log "Test exporting to ONNX format"
|
log "Test exporting to ONNX format"
|
||||||
|
|
||||||
./pruned_transducer_stateless7_streaming/export-onnx.py \
|
./pruned_transducer_stateless7_streaming/export-onnx.py \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
@ -88,7 +204,7 @@ popd
|
|||||||
log "Export via torch.jit.script()"
|
log "Export via torch.jit.script()"
|
||||||
|
|
||||||
./pruned_transducer_stateless3/export.py \
|
./pruned_transducer_stateless3/export.py \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 9999 \
|
--epoch 9999 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
--exp-dir $repo/exp/ \
|
--exp-dir $repo/exp/ \
|
||||||
@ -97,7 +213,7 @@ log "Export via torch.jit.script()"
|
|||||||
log "Test exporting to ONNX format"
|
log "Test exporting to ONNX format"
|
||||||
|
|
||||||
./pruned_transducer_stateless3/export-onnx.py \
|
./pruned_transducer_stateless3/export-onnx.py \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 9999 \
|
--epoch 9999 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
--exp-dir $repo/exp/
|
--exp-dir $repo/exp/
|
||||||
@ -126,7 +242,6 @@ log "Run onnx_pretrained.py"
|
|||||||
rm -rf $repo
|
rm -rf $repo
|
||||||
log "--------------------------------------------------------------------------"
|
log "--------------------------------------------------------------------------"
|
||||||
|
|
||||||
|
|
||||||
log "=========================================================================="
|
log "=========================================================================="
|
||||||
repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless5-2022-05-13
|
repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless5-2022-05-13
|
||||||
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||||
@ -143,7 +258,7 @@ popd
|
|||||||
log "Export via torch.jit.script()"
|
log "Export via torch.jit.script()"
|
||||||
|
|
||||||
./pruned_transducer_stateless5/export.py \
|
./pruned_transducer_stateless5/export.py \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
@ -159,7 +274,7 @@ log "Export via torch.jit.script()"
|
|||||||
log "Test exporting to ONNX format"
|
log "Test exporting to ONNX format"
|
||||||
|
|
||||||
./pruned_transducer_stateless5/export-onnx.py \
|
./pruned_transducer_stateless5/export-onnx.py \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
@ -205,7 +320,6 @@ GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
|||||||
repo=$(basename $repo_url)
|
repo=$(basename $repo_url)
|
||||||
|
|
||||||
pushd $repo
|
pushd $repo
|
||||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
|
||||||
git lfs pull --include "exp/pretrained.pt"
|
git lfs pull --include "exp/pretrained.pt"
|
||||||
|
|
||||||
cd exp
|
cd exp
|
||||||
@ -215,7 +329,7 @@ popd
|
|||||||
log "Export via torch.jit.script()"
|
log "Export via torch.jit.script()"
|
||||||
|
|
||||||
./pruned_transducer_stateless7/export.py \
|
./pruned_transducer_stateless7/export.py \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
@ -226,7 +340,7 @@ log "Export via torch.jit.script()"
|
|||||||
log "Test exporting to ONNX format"
|
log "Test exporting to ONNX format"
|
||||||
|
|
||||||
./pruned_transducer_stateless7/export-onnx.py \
|
./pruned_transducer_stateless7/export-onnx.py \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
@ -270,7 +384,7 @@ popd
|
|||||||
log "Test exporting to ONNX format"
|
log "Test exporting to ONNX format"
|
||||||
|
|
||||||
./conv_emformer_transducer_stateless2/export-onnx.py \
|
./conv_emformer_transducer_stateless2/export-onnx.py \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
@ -310,7 +424,7 @@ popd
|
|||||||
log "Export via torch.jit.trace()"
|
log "Export via torch.jit.trace()"
|
||||||
|
|
||||||
./lstm_transducer_stateless2/export.py \
|
./lstm_transducer_stateless2/export.py \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
@ -320,7 +434,7 @@ log "Export via torch.jit.trace()"
|
|||||||
log "Test exporting to ONNX format"
|
log "Test exporting to ONNX format"
|
||||||
|
|
||||||
./lstm_transducer_stateless2/export-onnx.py \
|
./lstm_transducer_stateless2/export-onnx.py \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
|
|||||||
76
.github/workflows/run-yesno-recipe.yml
vendored
76
.github/workflows/run-yesno-recipe.yml
vendored
@ -44,11 +44,6 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
- name: Install graphviz
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
sudo apt-get -qq install graphviz
|
|
||||||
|
|
||||||
- name: Setup Python ${{ matrix.python-version }}
|
- name: Setup Python ${{ matrix.python-version }}
|
||||||
uses: actions/setup-python@v2
|
uses: actions/setup-python@v2
|
||||||
with:
|
with:
|
||||||
@ -70,6 +65,7 @@ jobs:
|
|||||||
pip install --no-binary protobuf protobuf==3.20.*
|
pip install --no-binary protobuf protobuf==3.20.*
|
||||||
|
|
||||||
pip install --no-deps --force-reinstall https://huggingface.co/csukuangfj/k2/resolve/main/cpu/k2-1.24.3.dev20230508+cpu.torch1.13.1-cp38-cp38-linux_x86_64.whl
|
pip install --no-deps --force-reinstall https://huggingface.co/csukuangfj/k2/resolve/main/cpu/k2-1.24.3.dev20230508+cpu.torch1.13.1-cp38-cp38-linux_x86_64.whl
|
||||||
|
pip install kaldifeat==1.25.0.dev20230726+cpu.torch1.13.1 -f https://csukuangfj.github.io/kaldifeat/cpu.html
|
||||||
|
|
||||||
- name: Run yesno recipe
|
- name: Run yesno recipe
|
||||||
shell: bash
|
shell: bash
|
||||||
@ -78,9 +74,75 @@ jobs:
|
|||||||
export PYTHONPATH=$PWD:$PYTHONPATH
|
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||||
echo $PYTHONPATH
|
echo $PYTHONPATH
|
||||||
|
|
||||||
|
|
||||||
cd egs/yesno/ASR
|
cd egs/yesno/ASR
|
||||||
./prepare.sh
|
./prepare.sh
|
||||||
python3 ./tdnn/train.py
|
python3 ./tdnn/train.py
|
||||||
python3 ./tdnn/decode.py
|
python3 ./tdnn/decode.py
|
||||||
# TODO: Check that the WER is less than some value
|
|
||||||
|
- name: Test exporting to pretrained.pt
|
||||||
|
shell: bash
|
||||||
|
working-directory: ${{github.workspace}}
|
||||||
|
run: |
|
||||||
|
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||||
|
echo $PYTHONPATH
|
||||||
|
|
||||||
|
cd egs/yesno/ASR
|
||||||
|
python3 ./tdnn/export.py --epoch 14 --avg 2
|
||||||
|
|
||||||
|
python3 ./tdnn/pretrained.py \
|
||||||
|
--checkpoint ./tdnn/exp/pretrained.pt \
|
||||||
|
--HLG ./data/lang_phone/HLG.pt \
|
||||||
|
--words-file ./data/lang_phone/words.txt \
|
||||||
|
download/waves_yesno/0_0_0_1_0_0_0_1.wav \
|
||||||
|
download/waves_yesno/0_0_1_0_0_0_1_0.wav
|
||||||
|
|
||||||
|
- name: Test exporting to torchscript
|
||||||
|
shell: bash
|
||||||
|
working-directory: ${{github.workspace}}
|
||||||
|
run: |
|
||||||
|
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||||
|
echo $PYTHONPATH
|
||||||
|
|
||||||
|
cd egs/yesno/ASR
|
||||||
|
python3 ./tdnn/export.py --epoch 14 --avg 2 --jit 1
|
||||||
|
|
||||||
|
python3 ./tdnn/jit_pretrained.py \
|
||||||
|
--nn-model ./tdnn/exp/cpu_jit.pt \
|
||||||
|
--HLG ./data/lang_phone/HLG.pt \
|
||||||
|
--words-file ./data/lang_phone/words.txt \
|
||||||
|
download/waves_yesno/0_0_0_1_0_0_0_1.wav \
|
||||||
|
download/waves_yesno/0_0_1_0_0_0_1_0.wav
|
||||||
|
|
||||||
|
- name: Test exporting to onnx
|
||||||
|
shell: bash
|
||||||
|
working-directory: ${{github.workspace}}
|
||||||
|
run: |
|
||||||
|
export PYTHONPATH=$PWD:$PYTHONPATH
|
||||||
|
echo $PYTHONPATH
|
||||||
|
|
||||||
|
cd egs/yesno/ASR
|
||||||
|
python3 ./tdnn/export_onnx.py --epoch 14 --avg 2
|
||||||
|
|
||||||
|
echo "Test float32 model"
|
||||||
|
python3 ./tdnn/onnx_pretrained.py \
|
||||||
|
--nn-model ./tdnn/exp/model-epoch-14-avg-2.onnx \
|
||||||
|
--HLG ./data/lang_phone/HLG.pt \
|
||||||
|
--words-file ./data/lang_phone/words.txt \
|
||||||
|
download/waves_yesno/0_0_0_1_0_0_0_1.wav \
|
||||||
|
download/waves_yesno/0_0_1_0_0_0_1_0.wav
|
||||||
|
|
||||||
|
|
||||||
|
echo "Test int8 model"
|
||||||
|
python3 ./tdnn/onnx_pretrained.py \
|
||||||
|
--nn-model ./tdnn/exp/model-epoch-14-avg-2.int8.onnx \
|
||||||
|
--HLG ./data/lang_phone/HLG.pt \
|
||||||
|
--words-file ./data/lang_phone/words.txt \
|
||||||
|
download/waves_yesno/0_0_0_1_0_0_0_1.wav \
|
||||||
|
download/waves_yesno/0_0_1_0_0_0_1_0.wav
|
||||||
|
|
||||||
|
- name: Show generated files
|
||||||
|
shell: bash
|
||||||
|
working-directory: ${{github.workspace}}
|
||||||
|
run: |
|
||||||
|
cd egs/yesno/ASR
|
||||||
|
ls -lh tdnn/exp
|
||||||
|
|||||||
@ -95,4 +95,7 @@ rst_epilog = """
|
|||||||
.. _k2: https://github.com/k2-fsa/k2
|
.. _k2: https://github.com/k2-fsa/k2
|
||||||
.. _lhotse: https://github.com/lhotse-speech/lhotse
|
.. _lhotse: https://github.com/lhotse-speech/lhotse
|
||||||
.. _yesno: https://www.openslr.org/1/
|
.. _yesno: https://www.openslr.org/1/
|
||||||
|
.. _Next-gen Kaldi: https://github.com/k2-fsa
|
||||||
|
.. _Kaldi: https://github.com/kaldi-asr/kaldi
|
||||||
|
.. _lilcom: https://github.com/danpovey/lilcom
|
||||||
"""
|
"""
|
||||||
|
|||||||
180
docs/source/for-dummies/data-preparation.rst
Normal file
180
docs/source/for-dummies/data-preparation.rst
Normal file
@ -0,0 +1,180 @@
|
|||||||
|
.. _dummies_tutorial_data_preparation:
|
||||||
|
|
||||||
|
Data Preparation
|
||||||
|
================
|
||||||
|
|
||||||
|
After :ref:`dummies_tutorial_environment_setup`, we can start preparing the
|
||||||
|
data for training and decoding.
|
||||||
|
|
||||||
|
The first step is to prepare the data for training. We have already provided
|
||||||
|
`prepare.sh <https://github.com/k2-fsa/icefall/blob/master/egs/yesno/ASR/prepare.sh>`_
|
||||||
|
that would prepare everything required for training.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
cd /tmp/icefall
|
||||||
|
export PYTHONPATH=/tmp/icefall:$PYTHONPATH
|
||||||
|
cd egs/yesno/ASR
|
||||||
|
|
||||||
|
./prepare.sh
|
||||||
|
|
||||||
|
Note that in each recipe from `icefall`_, there exists a file ``prepare.sh``,
|
||||||
|
which you should run before you run anything else.
|
||||||
|
|
||||||
|
That is all you need for data preparation.
|
||||||
|
|
||||||
|
For the more curious
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
If you are wondering how to prepare your own dataset, please refer to the following
|
||||||
|
URLs for more details:
|
||||||
|
|
||||||
|
- `<https://github.com/lhotse-speech/lhotse/tree/master/lhotse/recipes>`_
|
||||||
|
|
||||||
|
It contains recipes for a variety of datasets. If you want to add your own
|
||||||
|
dataset, please read recipes in this folder first.
|
||||||
|
|
||||||
|
- `<https://github.com/lhotse-speech/lhotse/blob/master/lhotse/recipes/yesno.py>`_
|
||||||
|
|
||||||
|
The `yesno`_ recipe in `lhotse`_.
|
||||||
|
|
||||||
|
If you already have a `Kaldi`_ dataset directory, which contains files like
|
||||||
|
``wav.scp``, ``feats.scp``, then you can refer to `<https://lhotse.readthedocs.io/en/latest/kaldi.html#example>`_.
|
||||||
|
|
||||||
|
A quick look at the generated files
|
||||||
|
-----------------------------------
|
||||||
|
|
||||||
|
``./prepare.sh`` puts generated files into two directories:
|
||||||
|
|
||||||
|
- ``download``
|
||||||
|
- ``data``
|
||||||
|
|
||||||
|
download
|
||||||
|
^^^^^^^^
|
||||||
|
|
||||||
|
The ``download`` directory contains downloaded dataset files:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
tree -L 1 ./download/
|
||||||
|
|
||||||
|
./download/
|
||||||
|
|-- waves_yesno
|
||||||
|
`-- waves_yesno.tar.gz
|
||||||
|
|
||||||
|
.. hint::
|
||||||
|
|
||||||
|
Please refer to `<https://github.com/lhotse-speech/lhotse/blob/master/lhotse/recipes/yesno.py#L41>`_
|
||||||
|
for how the data is downloaded and extracted.
|
||||||
|
|
||||||
|
data
|
||||||
|
^^^^
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
tree ./data/
|
||||||
|
|
||||||
|
./data/
|
||||||
|
|-- fbank
|
||||||
|
| |-- yesno_cuts_test.jsonl.gz
|
||||||
|
| |-- yesno_cuts_train.jsonl.gz
|
||||||
|
| |-- yesno_feats_test.lca
|
||||||
|
| `-- yesno_feats_train.lca
|
||||||
|
|-- lang_phone
|
||||||
|
| |-- HLG.pt
|
||||||
|
| |-- L.pt
|
||||||
|
| |-- L_disambig.pt
|
||||||
|
| |-- Linv.pt
|
||||||
|
| |-- lexicon.txt
|
||||||
|
| |-- lexicon_disambig.txt
|
||||||
|
| |-- tokens.txt
|
||||||
|
| `-- words.txt
|
||||||
|
|-- lm
|
||||||
|
| |-- G.arpa
|
||||||
|
| `-- G.fst.txt
|
||||||
|
`-- manifests
|
||||||
|
|-- yesno_recordings_test.jsonl.gz
|
||||||
|
|-- yesno_recordings_train.jsonl.gz
|
||||||
|
|-- yesno_supervisions_test.jsonl.gz
|
||||||
|
`-- yesno_supervisions_train.jsonl.gz
|
||||||
|
|
||||||
|
4 directories, 18 files
|
||||||
|
|
||||||
|
**data/manifests**:
|
||||||
|
|
||||||
|
This directory contains manifests. They are used to generate files in
|
||||||
|
``data/fbank``.
|
||||||
|
|
||||||
|
To give you an idea of what it contains, we examine the first few lines of
|
||||||
|
the manifests related to the ``train`` dataset.
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
cd data/manifests
|
||||||
|
gunzip -c yesno_recordings_train.jsonl.gz | head -n 3
|
||||||
|
|
||||||
|
The output is given below:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
{"id": "0_0_0_0_1_1_1_1", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_0_0_1_1_1_1.wav"}], "sampling_rate": 8000, "num_samples": 50800, "duration": 6.35, "channel_ids": [0]}
|
||||||
|
{"id": "0_0_0_1_0_1_1_0", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_0_1_0_1_1_0.wav"}], "sampling_rate": 8000, "num_samples": 48880, "duration": 6.11, "channel_ids": [0]}
|
||||||
|
{"id": "0_0_1_0_0_1_1_0", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_1_0_0_1_1_0.wav"}], "sampling_rate": 8000, "num_samples": 48160, "duration": 6.02, "channel_ids": [0]}
|
||||||
|
|
||||||
|
Please refer to `<https://github.com/lhotse-speech/lhotse/blob/master/lhotse/audio.py#L300>`_
|
||||||
|
for the meaning of each field per line.
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
gunzip -c yesno_supervisions_train.jsonl.gz | head -n 3
|
||||||
|
|
||||||
|
The output is given below:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
{"id": "0_0_0_0_1_1_1_1", "recording_id": "0_0_0_0_1_1_1_1", "start": 0.0, "duration": 6.35, "channel": 0, "text": "NO NO NO NO YES YES YES YES", "language": "Hebrew"}
|
||||||
|
{"id": "0_0_0_1_0_1_1_0", "recording_id": "0_0_0_1_0_1_1_0", "start": 0.0, "duration": 6.11, "channel": 0, "text": "NO NO NO YES NO YES YES NO", "language": "Hebrew"}
|
||||||
|
{"id": "0_0_1_0_0_1_1_0", "recording_id": "0_0_1_0_0_1_1_0", "start": 0.0, "duration": 6.02, "channel": 0, "text": "NO NO YES NO NO YES YES NO", "language": "Hebrew"}
|
||||||
|
|
||||||
|
Please refer to `<https://github.com/lhotse-speech/lhotse/blob/master/lhotse/supervision.py#L510>`_
|
||||||
|
for the meaning of each field per line.
|
||||||
|
|
||||||
|
**data/fbank**:
|
||||||
|
|
||||||
|
This directory contains everything from ``data/manifests``. Furthermore, it also contains features
|
||||||
|
for training.
|
||||||
|
|
||||||
|
``data/fbank/yesno_feats_train.lca`` contains the features for the train dataset.
|
||||||
|
Features are compressed using `lilcom`_.
|
||||||
|
|
||||||
|
``data/fbank/yesno_cuts_train.jsonl.gz`` stores the `CutSet <https://github.com/lhotse-speech/lhotse/blob/master/lhotse/cut/set.py#L72>`_,
|
||||||
|
which stores `RecordingSet <https://github.com/lhotse-speech/lhotse/blob/master/lhotse/audio.py#L928>`_,
|
||||||
|
`SupervisionSet <https://github.com/lhotse-speech/lhotse/blob/master/lhotse/supervision.py#L510>`_,
|
||||||
|
and `FeatureSet <https://github.com/lhotse-speech/lhotse/blob/master/lhotse/features/base.py#L593>`_.
|
||||||
|
|
||||||
|
To give you an idea about what it looks like, we can run the following command:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
cd data/fbank
|
||||||
|
|
||||||
|
gunzip -c yesno_cuts_train.jsonl.gz | head -n 3
|
||||||
|
|
||||||
|
The output is given below:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
{"id": "0_0_0_0_1_1_1_1-0", "start": 0, "duration": 6.35, "channel": 0, "supervisions": [{"id": "0_0_0_0_1_1_1_1", "recording_id": "0_0_0_0_1_1_1_1", "start": 0.0, "duration": 6.35, "channel": 0, "text": "NO NO NO NO YES YES YES YES", "language": "Hebrew"}], "features": {"type": "kaldi-fbank", "num_frames": 635, "num_features": 23, "frame_shift": 0.01, "sampling_rate": 8000, "start": 0, "duration": 6.35, "storage_type": "lilcom_chunky", "storage_path": "data/fbank/yesno_feats_train.lca", "storage_key": "0,13000,3570", "channels": 0}, "recording": {"id": "0_0_0_0_1_1_1_1", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_0_0_1_1_1_1.wav"}], "sampling_rate": 8000, "num_samples": 50800, "duration": 6.35, "channel_ids": [0]}, "type": "MonoCut"}
|
||||||
|
{"id": "0_0_0_1_0_1_1_0-1", "start": 0, "duration": 6.11, "channel": 0, "supervisions": [{"id": "0_0_0_1_0_1_1_0", "recording_id": "0_0_0_1_0_1_1_0", "start": 0.0, "duration": 6.11, "channel": 0, "text": "NO NO NO YES NO YES YES NO", "language": "Hebrew"}], "features": {"type": "kaldi-fbank", "num_frames": 611, "num_features": 23, "frame_shift": 0.01, "sampling_rate": 8000, "start": 0, "duration": 6.11, "storage_type": "lilcom_chunky", "storage_path": "data/fbank/yesno_feats_train.lca", "storage_key": "16570,12964,2929", "channels": 0}, "recording": {"id": "0_0_0_1_0_1_1_0", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_0_1_0_1_1_0.wav"}], "sampling_rate": 8000, "num_samples": 48880, "duration": 6.11, "channel_ids": [0]}, "type": "MonoCut"}
|
||||||
|
{"id": "0_0_1_0_0_1_1_0-2", "start": 0, "duration": 6.02, "channel": 0, "supervisions": [{"id": "0_0_1_0_0_1_1_0", "recording_id": "0_0_1_0_0_1_1_0", "start": 0.0, "duration": 6.02, "channel": 0, "text": "NO NO YES NO NO YES YES NO", "language": "Hebrew"}], "features": {"type": "kaldi-fbank", "num_frames": 602, "num_features": 23, "frame_shift": 0.01, "sampling_rate": 8000, "start": 0, "duration": 6.02, "storage_type": "lilcom_chunky", "storage_path": "data/fbank/yesno_feats_train.lca", "storage_key": "32463,12936,2696", "channels": 0}, "recording": {"id": "0_0_1_0_0_1_1_0", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_1_0_0_1_1_0.wav"}], "sampling_rate": 8000, "num_samples": 48160, "duration": 6.02, "channel_ids": [0]}, "type": "MonoCut"}
|
||||||
|
|
||||||
|
Note that ``yesno_cuts_train.jsonl.gz`` only stores the information about how to read the features.
|
||||||
|
The actual features are stored separately in ``data/fbank/yesno_feats_train.lca``.
|
||||||
|
|
||||||
|
**data/lang_phone**:
|
||||||
|
|
||||||
|
This directory contains the lexicon.
|
||||||
|
|
||||||
|
**data/lm**:
|
||||||
|
|
||||||
|
This directory contains language models.
|
||||||
39
docs/source/for-dummies/decoding.rst
Normal file
39
docs/source/for-dummies/decoding.rst
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
.. _dummies_tutorial_decoding:
|
||||||
|
|
||||||
|
Decoding
|
||||||
|
========
|
||||||
|
|
||||||
|
After :ref:`dummies_tutorial_training`, we can start decoding.
|
||||||
|
|
||||||
|
The command to start the decoding is quite simple:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
cd /tmp/icefall
|
||||||
|
export PYTHONPATH=/tmp/icefall:$PYTHONPATH
|
||||||
|
cd egs/yesno/ASR
|
||||||
|
|
||||||
|
# We use CPU for decoding by setting the following environment variable
|
||||||
|
export CUDA_VISIBLE_DEVICES=""
|
||||||
|
|
||||||
|
./tdnn/decode.py
|
||||||
|
|
||||||
|
The output logs are given below:
|
||||||
|
|
||||||
|
.. literalinclude:: ./code/decoding-yesno.txt
|
||||||
|
|
||||||
|
For the more curious
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
./tdnn/decode.py --help
|
||||||
|
|
||||||
|
will print the usage information about ``./tdnn/decode.py``. For instance, you
|
||||||
|
can specify:
|
||||||
|
|
||||||
|
- ``--epoch`` to select which checkpoint to use for decoding
|
||||||
|
- ``--avg`` to select how many checkpoints to use for model averaging
|
||||||
|
|
||||||
|
You usually try different combinations of ``--epoch`` and ``--avg`` and select
|
||||||
|
one that leads to the lowest WER (`Word Error Rate <https://en.wikipedia.org/wiki/Word_error_rate>`_).
|
||||||
121
docs/source/for-dummies/environment-setup.rst
Normal file
121
docs/source/for-dummies/environment-setup.rst
Normal file
@ -0,0 +1,121 @@
|
|||||||
|
.. _dummies_tutorial_environment_setup:
|
||||||
|
|
||||||
|
Environment setup
|
||||||
|
=================
|
||||||
|
|
||||||
|
We will create an environment for `Next-gen Kaldi`_ that runs on ``CPU``
|
||||||
|
in this tutorial.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Since the `yesno`_ dataset used in this tutorial is very tiny, training on
|
||||||
|
``CPU`` works very well for it.
|
||||||
|
|
||||||
|
If your dataset is very large, e.g., hundreds or thousands of hours of
|
||||||
|
training data, please follow :ref:`install icefall` to install `icefall`_
|
||||||
|
that works with ``GPU``.
|
||||||
|
|
||||||
|
|
||||||
|
Create a virtual environment
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
virtualenv -p python3 /tmp/icefall_env
|
||||||
|
|
||||||
|
The above command creates a virtual environment in the directory ``/tmp/icefall_env``.
|
||||||
|
You can select any directory you want.
|
||||||
|
|
||||||
|
The output of the above command is given below:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
Already using interpreter /usr/bin/python3
|
||||||
|
Using base prefix '/usr'
|
||||||
|
New python executable in /tmp/icefall_env/bin/python3
|
||||||
|
Also creating executable in /tmp/icefall_env/bin/python
|
||||||
|
Installing setuptools, pkg_resources, pip, wheel...done.
|
||||||
|
|
||||||
|
Now we can activate the environment using:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
source /tmp/icefall_env/bin/activate
|
||||||
|
|
||||||
|
Install dependencies
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
.. warning::
|
||||||
|
|
||||||
|
Remember to activate your virtual environment before you continue!
|
||||||
|
|
||||||
|
After activating the virtual environment, we can use the following command
|
||||||
|
to install dependencies of `icefall`_:
|
||||||
|
|
||||||
|
.. hint::
|
||||||
|
|
||||||
|
Remember that we will run this tutorial on ``CPU``, so we install
|
||||||
|
dependencies required only by running on ``CPU``.
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
# Caution: Installation order matters!
|
||||||
|
|
||||||
|
# We use torch 2.0.0 and torchaudio 2.0.0 in this tutorial.
|
||||||
|
# Other versions should also work.
|
||||||
|
|
||||||
|
pip install torch==2.0.0+cpu torchaudio==2.0.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
|
||||||
|
|
||||||
|
# If you are using macOS or Windows, please use the following command to install torch and torchaudio
|
||||||
|
# pip install torch==2.0.0 torchaudio==2.0.0 -f https://download.pytorch.org/whl/torch_stable.html
|
||||||
|
|
||||||
|
# Now install k2
|
||||||
|
# Please refer to https://k2-fsa.github.io/k2/installation/from_wheels.html#linux-cpu-example
|
||||||
|
|
||||||
|
pip install k2==1.24.3.dev20230726+cpu.torch2.0.0 -f https://k2-fsa.github.io/k2/cpu.html
|
||||||
|
|
||||||
|
# Install the latest version of lhotse
|
||||||
|
|
||||||
|
pip install git+https://github.com/lhotse-speech/lhotse
|
||||||
|
|
||||||
|
|
||||||
|
Install icefall
|
||||||
|
---------------
|
||||||
|
|
||||||
|
We will put the source code of `icefall`_ into the directory ``/tmp``.
|
||||||
|
You can select any directory you want.
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
cd /tmp
|
||||||
|
git clone https://github.com/k2-fsa/icefall
|
||||||
|
cd icefall
|
||||||
|
pip install -r ./requirements.txt
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
# Anytime we want to use icefall, we have to set the following
|
||||||
|
# environment variable
|
||||||
|
|
||||||
|
export PYTHONPATH=/tmp/icefall:$PYTHONPATH
|
||||||
|
|
||||||
|
.. hint::
|
||||||
|
|
||||||
|
If you get the following error during this tutorial:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
ModuleNotFoundError: No module named 'icefall'
|
||||||
|
|
||||||
|
please set the above environment variable to fix it.
|
||||||
|
|
||||||
|
|
||||||
|
Congratulations! You have installed `icefall`_ successfully.
|
||||||
|
|
||||||
|
For the more curious
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
`icefall`_ contains a collection of Python scripts and you don't need to
|
||||||
|
use ``python3 setup.py install`` or ``pip install icefall`` to install it.
|
||||||
|
All you need to do is to download the code and set the environment variable
|
||||||
|
``PYTHONPATH``.
|
||||||
34
docs/source/for-dummies/index.rst
Normal file
34
docs/source/for-dummies/index.rst
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
Icefall for dummies tutorial
|
||||||
|
============================
|
||||||
|
|
||||||
|
This tutorial walks you step by step through how to create a simple
|
||||||
|
ASR (`Automatic Speech Recognition <https://en.wikipedia.org/wiki/Speech_recognition>`_)
|
||||||
|
system with `Next-gen Kaldi`_.
|
||||||
|
|
||||||
|
We use the `yesno`_ dataset for demonstration. We select it for two reasons:
|
||||||
|
|
||||||
|
- It is quite tiny, containing only about 12 minutes of data
|
||||||
|
- The training can be finished within 20 seconds on ``CPU``.
|
||||||
|
|
||||||
|
That also means you don't need a ``GPU`` to run this tutorial.
|
||||||
|
|
||||||
|
Let's get started!
|
||||||
|
|
||||||
|
Please follow the items below **sequentially**.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
The :ref:`dummies_tutorial_data_preparation` runs only on Linux and on macOS.
|
||||||
|
All other parts run on Linux, macOS, and Windows.
|
||||||
|
|
||||||
|
Help from the community is appreciated to port the :ref:`dummies_tutorial_data_preparation`
|
||||||
|
to Windows.
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 2
|
||||||
|
|
||||||
|
./environment-setup.rst
|
||||||
|
./data-preparation.rst
|
||||||
|
./training.rst
|
||||||
|
./decoding.rst
|
||||||
|
./model-export.rst
|
||||||
310
docs/source/for-dummies/model-export.rst
Normal file
310
docs/source/for-dummies/model-export.rst
Normal file
@ -0,0 +1,310 @@
|
|||||||
|
Model Export
|
||||||
|
============
|
||||||
|
|
||||||
|
There are three ways to export a pre-trained model.
|
||||||
|
|
||||||
|
- Export the model parameters via `model.state_dict() <https://pytorch.org/docs/stable/generated/torch.nn.Module.html?highlight=load_state_dict#torch.nn.Module.state_dict>`_
|
||||||
|
- Export via `torchscript <https://pytorch.org/docs/stable/jit.html>`_: either `torch.jit.script() <https://pytorch.org/docs/stable/generated/torch.jit.script.html#torch.jit.script>`_ or `torch.jit.trace() <https://pytorch.org/docs/stable/generated/torch.jit.trace.html>`_
|
||||||
|
- Export to `ONNX`_ via `torch.onnx.export() <https://pytorch.org/docs/stable/onnx.html>`_
|
||||||
|
|
||||||
|
Each method is explained below in detail.
|
||||||
|
|
||||||
|
Export the model parameters via model.state_dict()
|
||||||
|
---------------------------------------------------
|
||||||
|
|
||||||
|
The command for this kind of export is
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
cd /tmp/icefall
|
||||||
|
export PYTHONPATH=/tmp/icefall:$PYTHONPATH
|
||||||
|
cd egs/yesno/ASR
|
||||||
|
|
||||||
|
# assume that "--epoch 14 --avg 2" produces the lowest WER.
|
||||||
|
|
||||||
|
./tdnn/export.py --epoch 14 --avg 2
|
||||||
|
|
||||||
|
The output logs are given below:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
2023-08-16 20:42:03,912 INFO [export.py:76] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01, 'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0, 'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'reset_interval': 20, 'valid_interval': 10, 'beam_size': 10, 'reduction': 'sum', 'use_double_scores': True, 'epoch': 14, 'avg': 2, 'jit': False}
|
||||||
|
2023-08-16 20:42:03,913 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
|
||||||
|
2023-08-16 20:42:03,950 INFO [export.py:93] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
|
||||||
|
2023-08-16 20:42:03,971 INFO [export.py:106] Not using torch.jit.script
|
||||||
|
2023-08-16 20:42:03,974 INFO [export.py:111] Saved to tdnn/exp/pretrained.pt
|
||||||
|
|
||||||
|
We can see from the logs that the exported model is saved to the file ``tdnn/exp/pretrained.pt``.
|
||||||
|
|
||||||
|
To give you an idea of what ``tdnn/exp/pretrained.pt`` contains, we can use the following command:
|
||||||
|
|
||||||
|
.. code-block:: python3
|
||||||
|
|
||||||
|
>>> import torch
|
||||||
|
>>> m = torch.load("tdnn/exp/pretrained.pt")
|
||||||
|
>>> list(m.keys())
|
||||||
|
['model']
|
||||||
|
>>> list(m["model"].keys())
|
||||||
|
['tdnn.0.weight', 'tdnn.0.bias', 'tdnn.2.running_mean', 'tdnn.2.running_var', 'tdnn.2.num_batches_tracked', 'tdnn.3.weight', 'tdnn.3.bias', 'tdnn.5.running_mean', 'tdnn.5.running_var', 'tdnn.5.num_batches_tracked', 'tdnn.6.weight', 'tdnn.6.bias', 'tdnn.8.running_mean', 'tdnn.8.running_var', 'tdnn.8.num_batches_tracked', 'output_linear.weight', 'output_linear.bias']
|
||||||
|
|
||||||
|
We can use ``tdnn/exp/pretrained.pt`` in the following way with ``./tdnn/decode.py``:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
cd tdnn/exp
|
||||||
|
ln -s pretrained.pt epoch-99.pt
|
||||||
|
cd ../..
|
||||||
|
|
||||||
|
./tdnn/decode.py --epoch 99 --avg 1
|
||||||
|
|
||||||
|
The output logs of the above command are given below:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
2023-08-16 20:45:48,089 INFO [decode.py:262] Decoding started
|
||||||
|
2023-08-16 20:45:48,090 INFO [decode.py:263] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'feature_dim': 23, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'epoch': 99, 'avg': 1, 'export': False, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True, 'num_workers': 2, 'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': False, 'k2-git-sha1': 'ad79f1c699c684de9785ed6ca5edb805a41f78c3', 'k2-git-date': 'Wed Jul 26 09:30:42 2023', 'lhotse-version': '1.16.0.dev+git.aa073f6.clean', 'torch-version': '2.0.0', 'torch-cuda-available': False, 'torch-cuda-version': None, 'python-version': '3.1', 'icefall-git-branch': 'master', 'icefall-git-sha1': '9a47c08-clean', 'icefall-git-date': 'Mon Aug 14 22:10:50 2023', 'icefall-path': '/private/tmp/icefall', 'k2-path': '/private/tmp/icefall_env/lib/python3.11/site-packages/k2/__init__.py', 'lhotse-path': '/private/tmp/icefall_env/lib/python3.11/site-packages/lhotse/__init__.py', 'hostname': 'fangjuns-MacBook-Pro.local', 'IP address': '127.0.0.1'}}
|
||||||
|
2023-08-16 20:45:48,092 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
|
||||||
|
2023-08-16 20:45:48,103 INFO [decode.py:272] device: cpu
|
||||||
|
2023-08-16 20:45:48,109 INFO [checkpoint.py:112] Loading checkpoint from tdnn/exp/epoch-99.pt
|
||||||
|
2023-08-16 20:45:48,115 INFO [asr_datamodule.py:218] About to get test cuts
|
||||||
|
2023-08-16 20:45:48,115 INFO [asr_datamodule.py:253] About to get test cuts
|
||||||
|
2023-08-16 20:45:50,386 INFO [decode.py:203] batch 0/?, cuts processed until now is 4
|
||||||
|
2023-08-16 20:45:50,556 INFO [decode.py:240] The transcripts are stored in tdnn/exp/recogs-test_set.txt
|
||||||
|
2023-08-16 20:45:50,557 INFO [utils.py:564] [test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
|
||||||
|
2023-08-16 20:45:50,558 INFO [decode.py:248] Wrote detailed error stats to tdnn/exp/errs-test_set.txt
|
||||||
|
2023-08-16 20:45:50,559 INFO [decode.py:315] Done!
|
||||||
|
|
||||||
|
We can see that it produces the same WER as before.
|
||||||
|
|
||||||
|
We can also use it to decode files with the following command:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
# ./tdnn/pretrained.py requires kaldifeat
|
||||||
|
#
|
||||||
|
# Please refer to https://csukuangfj.github.io/kaldifeat/installation/from_wheels.html
|
||||||
|
# for how to install kaldifeat
|
||||||
|
|
||||||
|
pip install kaldifeat==1.25.0.dev20230726+cpu.torch2.0.0 -f https://csukuangfj.github.io/kaldifeat/cpu.html
|
||||||
|
|
||||||
|
./tdnn/pretrained.py \
|
||||||
|
--checkpoint ./tdnn/exp/pretrained.pt \
|
||||||
|
--HLG ./data/lang_phone/HLG.pt \
|
||||||
|
--words-file ./data/lang_phone/words.txt \
|
||||||
|
download/waves_yesno/0_0_0_1_0_0_0_1.wav \
|
||||||
|
download/waves_yesno/0_0_1_0_0_0_1_0.wav
|
||||||
|
|
||||||
|
The output is given below:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
2023-08-16 20:53:19,208 INFO [pretrained.py:136] {'feature_dim': 23, 'num_classes': 4, 'sample_rate': 8000, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'checkpoint': './tdnn/exp/pretrained.pt', 'words_file': './data/lang_phone/words.txt', 'HLG': './data/lang_phone/HLG.pt', 'sound_files': ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']}
|
||||||
|
2023-08-16 20:53:19,208 INFO [pretrained.py:142] device: cpu
|
||||||
|
2023-08-16 20:53:19,208 INFO [pretrained.py:144] Creating model
|
||||||
|
2023-08-16 20:53:19,212 INFO [pretrained.py:156] Loading HLG from ./data/lang_phone/HLG.pt
|
||||||
|
2023-08-16 20:53:19,213 INFO [pretrained.py:160] Constructing Fbank computer
|
||||||
|
2023-08-16 20:53:19,213 INFO [pretrained.py:170] Reading sound files: ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']
|
||||||
|
2023-08-16 20:53:19,224 INFO [pretrained.py:176] Decoding started
|
||||||
|
2023-08-16 20:53:19,304 INFO [pretrained.py:212]
|
||||||
|
download/waves_yesno/0_0_0_1_0_0_0_1.wav:
|
||||||
|
NO NO NO YES NO NO NO YES
|
||||||
|
|
||||||
|
download/waves_yesno/0_0_1_0_0_0_1_0.wav:
|
||||||
|
NO NO YES NO NO NO YES NO
|
||||||
|
|
||||||
|
|
||||||
|
2023-08-16 20:53:19,304 INFO [pretrained.py:214] Decoding Done
|
||||||
|
|
||||||
|
|
||||||
|
Export via torch.jit.script()
|
||||||
|
-----------------------------
|
||||||
|
|
||||||
|
The command for this kind of export is
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
cd /tmp/icefall
|
||||||
|
export PYTHONPATH=/tmp/icefall:$PYTHONPATH
|
||||||
|
cd egs/yesno/ASR
|
||||||
|
|
||||||
|
# assume that "--epoch 14 --avg 2" produces the lowest WER.
|
||||||
|
|
||||||
|
./tdnn/export.py --epoch 14 --avg 2 --jit true
|
||||||
|
|
||||||
|
The output logs are given below:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
2023-08-16 20:47:44,666 INFO [export.py:76] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01, 'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0, 'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'reset_interval': 20, 'valid_interval': 10, 'beam_size': 10, 'reduction': 'sum', 'use_double_scores': True, 'epoch': 14, 'avg': 2, 'jit': True}
|
||||||
|
2023-08-16 20:47:44,667 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
|
||||||
|
2023-08-16 20:47:44,670 INFO [export.py:93] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
|
||||||
|
2023-08-16 20:47:44,677 INFO [export.py:100] Using torch.jit.script
|
||||||
|
2023-08-16 20:47:44,843 INFO [export.py:104] Saved to tdnn/exp/cpu_jit.pt
|
||||||
|
|
||||||
|
From the output logs we can see that the generated file is saved to ``tdnn/exp/cpu_jit.pt``.
|
||||||
|
|
||||||
|
Don't be confused by the name ``cpu_jit.pt``. The ``cpu`` part means the model is moved to
|
||||||
|
CPU before exporting. That means, when you load it with:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
torch.jit.load()
|
||||||
|
|
||||||
|
you don't need to specify the argument `map_location <https://pytorch.org/docs/stable/generated/torch.jit.load.html#torch.jit.load>`_
|
||||||
|
and it resides on CPU by default.
|
||||||
|
|
||||||
|
To use ``tdnn/exp/cpu_jit.pt`` with `icefall`_ to decode files, we can use:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
# ./tdnn/jit_pretrained.py requires kaldifeat
|
||||||
|
#
|
||||||
|
# Please refer to https://csukuangfj.github.io/kaldifeat/installation/from_wheels.html
|
||||||
|
# for how to install kaldifeat
|
||||||
|
|
||||||
|
pip install kaldifeat==1.25.0.dev20230726+cpu.torch2.0.0 -f https://csukuangfj.github.io/kaldifeat/cpu.html
|
||||||
|
|
||||||
|
|
||||||
|
./tdnn/jit_pretrained.py \
|
||||||
|
--nn-model ./tdnn/exp/cpu_jit.pt \
|
||||||
|
--HLG ./data/lang_phone/HLG.pt \
|
||||||
|
--words-file ./data/lang_phone/words.txt \
|
||||||
|
download/waves_yesno/0_0_0_1_0_0_0_1.wav \
|
||||||
|
download/waves_yesno/0_0_1_0_0_0_1_0.wav
|
||||||
|
|
||||||
|
The output is given below:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
2023-08-16 20:56:00,603 INFO [jit_pretrained.py:121] {'feature_dim': 23, 'num_classes': 4, 'sample_rate': 8000, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'nn_model': './tdnn/exp/cpu_jit.pt', 'words_file': './data/lang_phone/words.txt', 'HLG': './data/lang_phone/HLG.pt', 'sound_files': ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']}
|
||||||
|
2023-08-16 20:56:00,603 INFO [jit_pretrained.py:127] device: cpu
|
||||||
|
2023-08-16 20:56:00,603 INFO [jit_pretrained.py:129] Loading torchscript model
|
||||||
|
2023-08-16 20:56:00,640 INFO [jit_pretrained.py:134] Loading HLG from ./data/lang_phone/HLG.pt
|
||||||
|
2023-08-16 20:56:00,641 INFO [jit_pretrained.py:138] Constructing Fbank computer
|
||||||
|
2023-08-16 20:56:00,641 INFO [jit_pretrained.py:148] Reading sound files: ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']
|
||||||
|
2023-08-16 20:56:00,642 INFO [jit_pretrained.py:154] Decoding started
|
||||||
|
2023-08-16 20:56:00,727 INFO [jit_pretrained.py:190]
|
||||||
|
download/waves_yesno/0_0_0_1_0_0_0_1.wav:
|
||||||
|
NO NO NO YES NO NO NO YES
|
||||||
|
|
||||||
|
download/waves_yesno/0_0_1_0_0_0_1_0.wav:
|
||||||
|
NO NO YES NO NO NO YES NO
|
||||||
|
|
||||||
|
|
||||||
|
2023-08-16 20:56:00,727 INFO [jit_pretrained.py:192] Decoding Done
|
||||||
|
|
||||||
|
.. hint::
|
||||||
|
|
||||||
|
We provide only code for ``torch.jit.script()``. You can try ``torch.jit.trace()``
|
||||||
|
if you want.
|
||||||
|
|
||||||
|
Export via torch.onnx.export()
|
||||||
|
------------------------------
|
||||||
|
|
||||||
|
The command for this kind of export is
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
cd /tmp/icefall
|
||||||
|
export PYTHONPATH=/tmp/icefall:$PYTHONPATH
|
||||||
|
cd egs/yesno/ASR
|
||||||
|
|
||||||
|
# tdnn/export_onnx.py requires onnx and onnxruntime
|
||||||
|
pip install onnx onnxruntime
|
||||||
|
|
||||||
|
# assume that "--epoch 14 --avg 2" produces the lowest WER.
|
||||||
|
|
||||||
|
./tdnn/export_onnx.py \
|
||||||
|
--epoch 14 \
|
||||||
|
--avg 2
|
||||||
|
|
||||||
|
The output logs are given below:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
2023-08-16 20:59:20,888 INFO [export_onnx.py:83] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01, 'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0, 'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'reset_interval': 20, 'valid_interval': 10, 'beam_size': 10, 'reduction': 'sum', 'use_double_scores': True, 'epoch': 14, 'avg': 2}
|
||||||
|
2023-08-16 20:59:20,888 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
|
||||||
|
2023-08-16 20:59:20,892 INFO [export_onnx.py:100] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
|
||||||
|
================ Diagnostic Run torch.onnx.export version 2.0.0 ================
|
||||||
|
verbose: False, log level: Level.ERROR
|
||||||
|
======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================
|
||||||
|
|
||||||
|
2023-08-16 20:59:21,047 INFO [export_onnx.py:127] Saved to tdnn/exp/model-epoch-14-avg-2.onnx
|
||||||
|
2023-08-16 20:59:21,047 INFO [export_onnx.py:136] meta_data: {'model_type': 'tdnn', 'version': '1', 'model_author': 'k2-fsa', 'comment': 'non-streaming tdnn for the yesno recipe', 'vocab_size': 4}
|
||||||
|
2023-08-16 20:59:21,049 INFO [export_onnx.py:140] Generate int8 quantization models
|
||||||
|
2023-08-16 20:59:21,075 INFO [onnx_quantizer.py:538] Quantization parameters for tensor:"/Transpose_1_output_0" not specified
|
||||||
|
2023-08-16 20:59:21,081 INFO [export_onnx.py:151] Saved to tdnn/exp/model-epoch-14-avg-2.int8.onnx
|
||||||
|
|
||||||
|
We can see from the logs that it generates two files:
|
||||||
|
|
||||||
|
- ``tdnn/exp/model-epoch-14-avg-2.onnx`` (ONNX model with ``float32`` weights)
|
||||||
|
- ``tdnn/exp/model-epoch-14-avg-2.int8.onnx`` (ONNX model with ``int8`` weights)
|
||||||
|
|
||||||
|
To use the generated ONNX model files for decoding with `onnxruntime`_, we can use
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
# ./tdnn/onnx_pretrained.py requires kaldifeat
|
||||||
|
#
|
||||||
|
# Please refer to https://csukuangfj.github.io/kaldifeat/installation/from_wheels.html
|
||||||
|
# for how to install kaldifeat
|
||||||
|
|
||||||
|
pip install kaldifeat==1.25.0.dev20230726+cpu.torch2.0.0 -f https://csukuangfj.github.io/kaldifeat/cpu.html
|
||||||
|
|
||||||
|
./tdnn/onnx_pretrained.py \
|
||||||
|
--nn-model ./tdnn/exp/model-epoch-14-avg-2.onnx \
|
||||||
|
--HLG ./data/lang_phone/HLG.pt \
|
||||||
|
--words-file ./data/lang_phone/words.txt \
|
||||||
|
download/waves_yesno/0_0_0_1_0_0_0_1.wav \
|
||||||
|
download/waves_yesno/0_0_1_0_0_0_1_0.wav
|
||||||
|
|
||||||
|
The output is given below:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
2023-08-16 21:03:24,260 INFO [onnx_pretrained.py:166] {'feature_dim': 23, 'sample_rate': 8000, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'nn_model': './tdnn/exp/model-epoch-14-avg-2.onnx', 'words_file': './data/lang_phone/words.txt', 'HLG': './data/lang_phone/HLG.pt', 'sound_files': ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']}
|
||||||
|
2023-08-16 21:03:24,260 INFO [onnx_pretrained.py:171] device: cpu
|
||||||
|
2023-08-16 21:03:24,260 INFO [onnx_pretrained.py:173] Loading onnx model ./tdnn/exp/model-epoch-14-avg-2.onnx
|
||||||
|
2023-08-16 21:03:24,267 INFO [onnx_pretrained.py:176] Loading HLG from ./data/lang_phone/HLG.pt
|
||||||
|
2023-08-16 21:03:24,270 INFO [onnx_pretrained.py:180] Constructing Fbank computer
|
||||||
|
2023-08-16 21:03:24,273 INFO [onnx_pretrained.py:190] Reading sound files: ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']
|
||||||
|
2023-08-16 21:03:24,279 INFO [onnx_pretrained.py:196] Decoding started
|
||||||
|
2023-08-16 21:03:24,318 INFO [onnx_pretrained.py:232]
|
||||||
|
download/waves_yesno/0_0_0_1_0_0_0_1.wav:
|
||||||
|
NO NO NO YES NO NO NO YES
|
||||||
|
|
||||||
|
download/waves_yesno/0_0_1_0_0_0_1_0.wav:
|
||||||
|
NO NO YES NO NO NO YES NO
|
||||||
|
|
||||||
|
|
||||||
|
2023-08-16 21:03:24,318 INFO [onnx_pretrained.py:234] Decoding Done
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
To use the ``int8`` ONNX model for decoding, please use:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
./tdnn/onnx_pretrained.py \
|
||||||
|
--nn-model ./tdnn/exp/model-epoch-14-avg-2.int8.onnx \
|
||||||
|
--HLG ./data/lang_phone/HLG.pt \
|
||||||
|
--words-file ./data/lang_phone/words.txt \
|
||||||
|
download/waves_yesno/0_0_0_1_0_0_0_1.wav \
|
||||||
|
download/waves_yesno/0_0_1_0_0_0_1_0.wav
|
||||||
|
|
||||||
|
For the more curious
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
If you are wondering how to deploy the model without ``torch``, please
|
||||||
|
continue reading. We will show how to use `sherpa-onnx`_ to run the
|
||||||
|
exported ONNX models, which depends only on `onnxruntime`_ and does not
|
||||||
|
depend on ``torch``.
|
||||||
|
|
||||||
|
In this tutorial, we will only demonstrate the usage of `sherpa-onnx`_ with the
|
||||||
|
pre-trained model of the `yesno`_ recipe. There are also two other frameworks
|
||||||
|
available:
|
||||||
|
|
||||||
|
- `sherpa`_. It works with torchscript models.
|
||||||
|
- `sherpa-ncnn`_. It works with models exported using :ref:`icefall_export_to_ncnn` with `ncnn`_
|
||||||
|
|
||||||
|
Please see `<https://k2-fsa.github.io/sherpa/>`_ for further details.
|
||||||
39
docs/source/for-dummies/training.rst
Normal file
39
docs/source/for-dummies/training.rst
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
.. _dummies_tutorial_training:
|
||||||
|
|
||||||
|
Training
|
||||||
|
========
|
||||||
|
|
||||||
|
After :ref:`dummies_tutorial_data_preparation`, we can start training.
|
||||||
|
|
||||||
|
The command to start the training is quite simple:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
cd /tmp/icefall
|
||||||
|
export PYTHONPATH=/tmp/icefall:$PYTHONPATH
|
||||||
|
cd egs/yesno/ASR
|
||||||
|
|
||||||
|
# We use CPU for training by setting the following environment variable
|
||||||
|
export CUDA_VISIBLE_DEVICES=""
|
||||||
|
|
||||||
|
./tdnn/train.py
|
||||||
|
|
||||||
|
That's it!
|
||||||
|
|
||||||
|
You can find the training logs below:
|
||||||
|
|
||||||
|
.. literalinclude:: ./code/train-yesno.txt
|
||||||
|
|
||||||
|
For the more curious
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
./tdnn/train.py --help
|
||||||
|
|
||||||
|
will print the usage information about ``./tdnn/train.py``. For instance, you
|
||||||
|
can specify the number of epochs to train and the location to save the training
|
||||||
|
results.
|
||||||
|
|
||||||
|
The training text logs are saved in ``tdnn/exp/log`` while the tensorboard
|
||||||
|
logs are in ``tdnn/exp/tensorboard``.
|
||||||
@ -20,6 +20,7 @@ speech recognition recipes using `k2 <https://github.com/k2-fsa/k2>`_.
|
|||||||
:maxdepth: 2
|
:maxdepth: 2
|
||||||
:caption: Contents:
|
:caption: Contents:
|
||||||
|
|
||||||
|
for-dummies/index.rst
|
||||||
installation/index
|
installation/index
|
||||||
docker/index
|
docker/index
|
||||||
faqs
|
faqs
|
||||||
|
|||||||
@ -41,7 +41,7 @@ as an example.
|
|||||||
|
|
||||||
./pruned_transducer_stateless3/export.py \
|
./pruned_transducer_stateless3/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless3/exp \
|
--exp-dir ./pruned_transducer_stateless3/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
@ -78,7 +78,7 @@ In each recipe, there is also a file ``pretrained.py``, which can use
|
|||||||
|
|
||||||
./pruned_transducer_stateless3/pretrained.py \
|
./pruned_transducer_stateless3/pretrained.py \
|
||||||
--checkpoint ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/pretrained-iter-1224000-avg-14.pt \
|
--checkpoint ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/pretrained-iter-1224000-avg-14.pt \
|
||||||
--bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/bpe.model \
|
--tokens ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \
|
||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav \
|
./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav \
|
||||||
./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav \
|
./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav \
|
||||||
|
|||||||
@ -153,11 +153,10 @@ Next, we use the following code to export our model:
|
|||||||
|
|
||||||
./conv_emformer_transducer_stateless2/export-for-ncnn.py \
|
./conv_emformer_transducer_stateless2/export-for-ncnn.py \
|
||||||
--exp-dir $dir/exp \
|
--exp-dir $dir/exp \
|
||||||
--bpe-model $dir/data/lang_bpe_500/bpe.model \
|
--tokens $dir/data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 30 \
|
--epoch 30 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
\
|
|
||||||
--num-encoder-layers 12 \
|
--num-encoder-layers 12 \
|
||||||
--chunk-length 32 \
|
--chunk-length 32 \
|
||||||
--cnn-module-kernel 31 \
|
--cnn-module-kernel 31 \
|
||||||
|
|||||||
@ -73,7 +73,7 @@ Next, we use the following code to export our model:
|
|||||||
|
|
||||||
./lstm_transducer_stateless2/export-for-ncnn.py \
|
./lstm_transducer_stateless2/export-for-ncnn.py \
|
||||||
--exp-dir $dir/exp \
|
--exp-dir $dir/exp \
|
||||||
--bpe-model $dir/data/lang_bpe_500/bpe.model \
|
--tokens $dir/data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
|
|||||||
@ -72,12 +72,11 @@ Next, we use the following code to export our model:
|
|||||||
dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
|
dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
|
||||||
|
|
||||||
./pruned_transducer_stateless7_streaming/export-for-ncnn.py \
|
./pruned_transducer_stateless7_streaming/export-for-ncnn.py \
|
||||||
--bpe-model $dir/data/lang_bpe_500/bpe.model \
|
--tokens $dir/data/lang_bpe_500/tokens.txt \
|
||||||
--exp-dir $dir/exp \
|
--exp-dir $dir/exp \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
\
|
|
||||||
--decode-chunk-len 32 \
|
--decode-chunk-len 32 \
|
||||||
--num-left-chunks 4 \
|
--num-left-chunks 4 \
|
||||||
--num-encoder-layers "2,4,3,2,4" \
|
--num-encoder-layers "2,4,3,2,4" \
|
||||||
|
|||||||
@ -71,7 +71,7 @@ Export the model to ONNX
|
|||||||
.. code-block:: bash
|
.. code-block:: bash
|
||||||
|
|
||||||
./pruned_transducer_stateless7_streaming/export-onnx.py \
|
./pruned_transducer_stateless7_streaming/export-onnx.py \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
|
|||||||
@ -32,7 +32,7 @@ as an example in the following.
|
|||||||
|
|
||||||
./pruned_transducer_stateless3/export.py \
|
./pruned_transducer_stateless3/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless3/exp \
|
--exp-dir ./pruned_transducer_stateless3/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch $epoch \
|
--epoch $epoch \
|
||||||
--avg $avg \
|
--avg $avg \
|
||||||
--jit 1
|
--jit 1
|
||||||
|
|||||||
@ -33,7 +33,7 @@ as an example in the following.
|
|||||||
|
|
||||||
./lstm_transducer_stateless2/export.py \
|
./lstm_transducer_stateless2/export.py \
|
||||||
--exp-dir ./lstm_transducer_stateless2/exp \
|
--exp-dir ./lstm_transducer_stateless2/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--iter $iter \
|
--iter $iter \
|
||||||
--avg $avg \
|
--avg $avg \
|
||||||
--jit-trace 1
|
--jit-trace 1
|
||||||
|
|||||||
@ -1,321 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
#
|
|
||||||
# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
|
|
||||||
#
|
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
# This script converts several saved checkpoints
|
|
||||||
# to a single one using model averaging.
|
|
||||||
"""
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
|
|
||||||
(1) Export to torchscript model using torch.jit.script()
|
|
||||||
|
|
||||||
./pruned_transducer_stateless7/export.py \
|
|
||||||
--exp-dir ./pruned_transducer_stateless7/exp \
|
|
||||||
--lang-dir data/lang_char \
|
|
||||||
--epoch 30 \
|
|
||||||
--avg 9 \
|
|
||||||
--jit 1
|
|
||||||
|
|
||||||
It will generate a file `cpu_jit.pt` in the given `exp_dir`. You can later
|
|
||||||
load it by `torch.jit.load("cpu_jit.pt")`.
|
|
||||||
|
|
||||||
Note `cpu` in the name `cpu_jit.pt` means the parameters when loaded into Python
|
|
||||||
are on CPU. You can use `to("cuda")` to move them to a CUDA device.
|
|
||||||
|
|
||||||
Check
|
|
||||||
https://github.com/k2-fsa/sherpa
|
|
||||||
for how to use the exported models outside of icefall.
|
|
||||||
|
|
||||||
(2) Export `model.state_dict()`
|
|
||||||
|
|
||||||
./pruned_transducer_stateless7/export.py \
|
|
||||||
--exp-dir ./pruned_transducer_stateless7/exp \
|
|
||||||
--lang-dir data/lang_char \
|
|
||||||
--epoch 20 \
|
|
||||||
--avg 10
|
|
||||||
|
|
||||||
It will generate a file `pretrained.pt` in the given `exp_dir`. You can later
|
|
||||||
load it by `icefall.checkpoint.load_checkpoint()`.
|
|
||||||
|
|
||||||
To use the generated file with `pruned_transducer_stateless7/decode.py`,
|
|
||||||
you can do:
|
|
||||||
|
|
||||||
cd /path/to/exp_dir
|
|
||||||
ln -s pretrained.pt epoch-9999.pt
|
|
||||||
|
|
||||||
cd /path/to/egs/librispeech/ASR
|
|
||||||
./pruned_transducer_stateless7/decode.py \
|
|
||||||
--exp-dir ./pruned_transducer_stateless7/exp \
|
|
||||||
--epoch 9999 \
|
|
||||||
--avg 1 \
|
|
||||||
--max-duration 600 \
|
|
||||||
--decoding-method greedy_search \
|
|
||||||
--lang-dir data/lang_char
|
|
||||||
|
|
||||||
Check ./pretrained.py for its usage.
|
|
||||||
|
|
||||||
Note: If you don't want to train a model from scratch, we have
|
|
||||||
provided one for you. You can get it at
|
|
||||||
|
|
||||||
https://huggingface.co/marcoyang/icefall-asr-aishell-zipformer-pruned-transducer-stateless7-2023-03-21
|
|
||||||
|
|
||||||
with the following commands:
|
|
||||||
|
|
||||||
sudo apt-get install git-lfs
|
|
||||||
git lfs install
|
|
||||||
git clone https://huggingface.co/marcoyang/icefall-asr-aishell-zipformer-pruned-transducer-stateless7-2023-03-21
|
|
||||||
# You will find the pre-trained model in icefall-asr-aishell-zipformer-pruned-transducer-stateless7-2023-03-21exp
|
|
||||||
"""
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import logging
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
|
||||||
import torch.nn as nn
|
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
|
||||||
from train2 import add_model_arguments, get_params, get_transducer_model
|
|
||||||
|
|
||||||
from icefall.checkpoint import (
|
|
||||||
average_checkpoints,
|
|
||||||
average_checkpoints_with_averaged_model,
|
|
||||||
find_checkpoints,
|
|
||||||
load_checkpoint,
|
|
||||||
)
|
|
||||||
from icefall.lexicon import Lexicon
|
|
||||||
from icefall.utils import str2bool
|
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--epoch",
|
|
||||||
type=int,
|
|
||||||
default=30,
|
|
||||||
help="""It specifies the checkpoint to use for decoding.
|
|
||||||
Note: Epoch counts from 1.
|
|
||||||
You can specify --avg to use more checkpoints for model averaging.""",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--iter",
|
|
||||||
type=int,
|
|
||||||
default=0,
|
|
||||||
help="""If positive, --epoch is ignored and it
|
|
||||||
will use the checkpoint exp_dir/checkpoint-iter.pt.
|
|
||||||
You can specify --avg to use more checkpoints for model averaging.
|
|
||||||
""",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--avg",
|
|
||||||
type=int,
|
|
||||||
default=9,
|
|
||||||
help="Number of checkpoints to average. Automatically select "
|
|
||||||
"consecutive checkpoints before the checkpoint specified by "
|
|
||||||
"'--epoch' and '--iter'",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--use-averaged-model",
|
|
||||||
type=str2bool,
|
|
||||||
default=True,
|
|
||||||
help="Whether to load averaged model. Currently it only supports "
|
|
||||||
"using --epoch. If True, it would decode with the averaged model "
|
|
||||||
"over the epoch range from `epoch-avg` (excluded) to `epoch`."
|
|
||||||
"Actually only the models with epoch number of `epoch-avg` and "
|
|
||||||
"`epoch` are loaded for averaging. ",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--exp-dir",
|
|
||||||
type=str,
|
|
||||||
default="pruned_transducer_stateless7/exp",
|
|
||||||
help="""It specifies the directory where all training related
|
|
||||||
files, e.g., checkpoints, log, etc, are saved
|
|
||||||
""",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--lang-dir",
|
|
||||||
type=str,
|
|
||||||
default="data/lang_char",
|
|
||||||
help="""The lang dir
|
|
||||||
It contains language related input files such as
|
|
||||||
"lexicon.txt"
|
|
||||||
""",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--jit",
|
|
||||||
type=str2bool,
|
|
||||||
default=False,
|
|
||||||
help="""True to save a model after applying torch.jit.script.
|
|
||||||
It will generate a file named cpu_jit.pt
|
|
||||||
|
|
||||||
Check ./jit_pretrained.py for how to use it.
|
|
||||||
""",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--context-size",
|
|
||||||
type=int,
|
|
||||||
default=1,
|
|
||||||
help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
|
|
||||||
)
|
|
||||||
|
|
||||||
add_model_arguments(parser)
|
|
||||||
|
|
||||||
return parser
|
|
||||||
|
|
||||||
|
|
||||||
@torch.no_grad()
|
|
||||||
def main():
|
|
||||||
args = get_parser().parse_args()
|
|
||||||
args.exp_dir = Path(args.exp_dir)
|
|
||||||
|
|
||||||
params = get_params()
|
|
||||||
params.update(vars(args))
|
|
||||||
|
|
||||||
device = torch.device("cpu")
|
|
||||||
if torch.cuda.is_available():
|
|
||||||
device = torch.device("cuda", 0)
|
|
||||||
|
|
||||||
logging.info(f"device: {device}")
|
|
||||||
|
|
||||||
lexicon = Lexicon(params.lang_dir)
|
|
||||||
params.blank_id = 0
|
|
||||||
params.vocab_size = max(lexicon.tokens) + 1
|
|
||||||
|
|
||||||
logging.info(params)
|
|
||||||
|
|
||||||
logging.info("About to create model")
|
|
||||||
model = get_transducer_model(params)
|
|
||||||
|
|
||||||
model.to(device)
|
|
||||||
|
|
||||||
if not params.use_averaged_model:
|
|
||||||
if params.iter > 0:
|
|
||||||
filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
|
|
||||||
: params.avg
|
|
||||||
]
|
|
||||||
if len(filenames) == 0:
|
|
||||||
raise ValueError(
|
|
||||||
f"No checkpoints found for"
|
|
||||||
f" --iter {params.iter}, --avg {params.avg}"
|
|
||||||
)
|
|
||||||
elif len(filenames) < params.avg:
|
|
||||||
raise ValueError(
|
|
||||||
f"Not enough checkpoints ({len(filenames)}) found for"
|
|
||||||
f" --iter {params.iter}, --avg {params.avg}"
|
|
||||||
)
|
|
||||||
logging.info(f"averaging {filenames}")
|
|
||||||
model.to(device)
|
|
||||||
model.load_state_dict(average_checkpoints(filenames, device=device))
|
|
||||||
elif params.avg == 1:
|
|
||||||
load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
|
|
||||||
else:
|
|
||||||
start = params.epoch - params.avg + 1
|
|
||||||
filenames = []
|
|
||||||
for i in range(start, params.epoch + 1):
|
|
||||||
if i >= 1:
|
|
||||||
filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
|
|
||||||
logging.info(f"averaging {filenames}")
|
|
||||||
model.to(device)
|
|
||||||
model.load_state_dict(average_checkpoints(filenames, device=device))
|
|
||||||
else:
|
|
||||||
if params.iter > 0:
|
|
||||||
filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
|
|
||||||
: params.avg + 1
|
|
||||||
]
|
|
||||||
if len(filenames) == 0:
|
|
||||||
raise ValueError(
|
|
||||||
f"No checkpoints found for"
|
|
||||||
f" --iter {params.iter}, --avg {params.avg}"
|
|
||||||
)
|
|
||||||
elif len(filenames) < params.avg + 1:
|
|
||||||
raise ValueError(
|
|
||||||
f"Not enough checkpoints ({len(filenames)}) found for"
|
|
||||||
f" --iter {params.iter}, --avg {params.avg}"
|
|
||||||
)
|
|
||||||
filename_start = filenames[-1]
|
|
||||||
filename_end = filenames[0]
|
|
||||||
logging.info(
|
|
||||||
"Calculating the averaged model over iteration checkpoints"
|
|
||||||
f" from {filename_start} (excluded) to {filename_end}"
|
|
||||||
)
|
|
||||||
model.to(device)
|
|
||||||
model.load_state_dict(
|
|
||||||
average_checkpoints_with_averaged_model(
|
|
||||||
filename_start=filename_start,
|
|
||||||
filename_end=filename_end,
|
|
||||||
device=device,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
assert params.avg > 0, params.avg
|
|
||||||
start = params.epoch - params.avg
|
|
||||||
assert start >= 1, start
|
|
||||||
filename_start = f"{params.exp_dir}/epoch-{start}.pt"
|
|
||||||
filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
|
|
||||||
logging.info(
|
|
||||||
f"Calculating the averaged model over epoch range from "
|
|
||||||
f"{start} (excluded) to {params.epoch}"
|
|
||||||
)
|
|
||||||
model.to(device)
|
|
||||||
model.load_state_dict(
|
|
||||||
average_checkpoints_with_averaged_model(
|
|
||||||
filename_start=filename_start,
|
|
||||||
filename_end=filename_end,
|
|
||||||
device=device,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
model.to("cpu")
|
|
||||||
model.eval()
|
|
||||||
|
|
||||||
if params.jit is True:
|
|
||||||
convert_scaled_to_non_scaled(model, inplace=True)
|
|
||||||
# We won't use the forward() method of the model in C++, so just ignore
|
|
||||||
# it here.
|
|
||||||
# Otherwise, one of its arguments is a ragged tensor and is not
|
|
||||||
# torch scriptabe.
|
|
||||||
model.__class__.forward = torch.jit.ignore(model.__class__.forward)
|
|
||||||
logging.info("Using torch.jit.script")
|
|
||||||
model = torch.jit.script(model)
|
|
||||||
filename = params.exp_dir / "cpu_jit.pt"
|
|
||||||
model.save(str(filename))
|
|
||||||
logging.info(f"Saved to {filename}")
|
|
||||||
else:
|
|
||||||
logging.info("Not using torchscript. Export model.state_dict()")
|
|
||||||
# Save it using a format so that it can be loaded
|
|
||||||
# by :func:`load_checkpoint`
|
|
||||||
filename = params.exp_dir / "pretrained.pt"
|
|
||||||
torch.save({"model": model.state_dict()}, str(filename))
|
|
||||||
logging.info(f"Saved to {filename}")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
|
||||||
|
|
||||||
logging.basicConfig(format=formatter, level=logging.INFO)
|
|
||||||
main()
|
|
||||||
1
egs/aishell/ASR/pruned_transducer_stateless7/export.py
Symbolic link
1
egs/aishell/ASR/pruned_transducer_stateless7/export.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/pruned_transducer_stateless7/export.py
|
||||||
@ -1,348 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
|
|
||||||
#
|
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
"""
|
|
||||||
This script loads a checkpoint and uses it to decode waves.
|
|
||||||
You can generate the checkpoint with the following command:
|
|
||||||
|
|
||||||
./pruned_transducer_stateless7/export.py \
|
|
||||||
--exp-dir ./pruned_transducer_stateless7/exp \
|
|
||||||
--lang-dir data/lang_char \
|
|
||||||
--epoch 20 \
|
|
||||||
--avg 10
|
|
||||||
|
|
||||||
Usage of this script:
|
|
||||||
|
|
||||||
(1) greedy search
|
|
||||||
./pruned_transducer_stateless7/pretrained.py \
|
|
||||||
--checkpoint ./pruned_transducer_stateless7/exp/pretrained.pt \
|
|
||||||
--lang-dir ./data/lang_char \
|
|
||||||
--method greedy_search \
|
|
||||||
/path/to/foo.wav \
|
|
||||||
/path/to/bar.wav
|
|
||||||
|
|
||||||
(2) beam search
|
|
||||||
./pruned_transducer_stateless7/pretrained.py \
|
|
||||||
--checkpoint ./pruned_transducer_stateless7/exp/pretrained.pt \
|
|
||||||
--lang-dir ./data/lang_char \
|
|
||||||
--method beam_search \
|
|
||||||
--beam-size 4 \
|
|
||||||
/path/to/foo.wav \
|
|
||||||
/path/to/bar.wav
|
|
||||||
|
|
||||||
(3) modified beam search
|
|
||||||
./pruned_transducer_stateless7/pretrained.py \
|
|
||||||
--checkpoint ./pruned_transducer_stateless7/exp/pretrained.pt \
|
|
||||||
--lang-dir ./data/lang_char \
|
|
||||||
--method modified_beam_search \
|
|
||||||
--beam-size 4 \
|
|
||||||
/path/to/foo.wav \
|
|
||||||
/path/to/bar.wav
|
|
||||||
|
|
||||||
(4) fast beam search
|
|
||||||
./pruned_transducer_stateless7/pretrained.py \
|
|
||||||
--checkpoint ./pruned_transducer_stateless7/exp/pretrained.pt \
|
|
||||||
--lang-dir ./data/lang_char \
|
|
||||||
--method fast_beam_search \
|
|
||||||
--beam-size 4 \
|
|
||||||
/path/to/foo.wav \
|
|
||||||
/path/to/bar.wav
|
|
||||||
|
|
||||||
You can also use `./pruned_transducer_stateless7/exp/epoch-xx.pt`.
|
|
||||||
|
|
||||||
Note: ./pruned_transducer_stateless7/exp/pretrained.pt is generated by
|
|
||||||
./pruned_transducer_stateless7/export.py
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import logging
|
|
||||||
import math
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
import k2
|
|
||||||
import kaldifeat
|
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
|
||||||
import torchaudio
|
|
||||||
from beam_search import (
|
|
||||||
beam_search,
|
|
||||||
fast_beam_search_one_best,
|
|
||||||
greedy_search,
|
|
||||||
greedy_search_batch,
|
|
||||||
modified_beam_search,
|
|
||||||
)
|
|
||||||
from torch.nn.utils.rnn import pad_sequence
|
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
|
||||||
|
|
||||||
from icefall.lexicon import Lexicon
|
|
||||||
from icefall.utils import str2bool
|
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--checkpoint",
|
|
||||||
type=str,
|
|
||||||
required=True,
|
|
||||||
help="Path to the checkpoint. "
|
|
||||||
"The checkpoint is assumed to be saved by "
|
|
||||||
"icefall.checkpoint.save_checkpoint().",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--lang-dir",
|
|
||||||
type=str,
|
|
||||||
help="""The lang dir
|
|
||||||
It contains language related input files such as
|
|
||||||
"lexicon.txt"
|
|
||||||
""",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--method",
|
|
||||||
type=str,
|
|
||||||
default="greedy_search",
|
|
||||||
help="""Possible values are:
|
|
||||||
- greedy_search
|
|
||||||
- beam_search
|
|
||||||
- modified_beam_search
|
|
||||||
- fast_beam_search
|
|
||||||
""",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"sound_files",
|
|
||||||
type=str,
|
|
||||||
nargs="+",
|
|
||||||
help="The input sound file(s) to transcribe. "
|
|
||||||
"Supported formats are those supported by torchaudio.load(). "
|
|
||||||
"For example, wav and flac are supported. "
|
|
||||||
"The sample rate has to be 16kHz.",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--sample-rate",
|
|
||||||
type=int,
|
|
||||||
default=16000,
|
|
||||||
help="The sample rate of the input sound file",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--beam-size",
|
|
||||||
type=int,
|
|
||||||
default=4,
|
|
||||||
help="""An integer indicating how many candidates we will keep for each
|
|
||||||
frame. Used only when --method is beam_search or
|
|
||||||
modified_beam_search.""",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--beam",
|
|
||||||
type=float,
|
|
||||||
default=4,
|
|
||||||
help="""A floating point value to calculate the cutoff score during beam
|
|
||||||
search (i.e., `cutoff = max-score - beam`), which is the same as the
|
|
||||||
`beam` in Kaldi.
|
|
||||||
Used only when --method is fast_beam_search""",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--max-contexts",
|
|
||||||
type=int,
|
|
||||||
default=4,
|
|
||||||
help="""Used only when --method is fast_beam_search""",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--max-states",
|
|
||||||
type=int,
|
|
||||||
default=8,
|
|
||||||
help="""Used only when --method is fast_beam_search""",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--context-size",
|
|
||||||
type=int,
|
|
||||||
default=1,
|
|
||||||
help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--max-sym-per-frame",
|
|
||||||
type=int,
|
|
||||||
default=1,
|
|
||||||
help="""Maximum number of symbols per frame. Used only when
|
|
||||||
--method is greedy_search.
|
|
||||||
""",
|
|
||||||
)
|
|
||||||
|
|
||||||
add_model_arguments(parser)
|
|
||||||
|
|
||||||
return parser
|
|
||||||
|
|
||||||
|
|
||||||
def read_sound_files(
|
|
||||||
filenames: List[str], expected_sample_rate: float
|
|
||||||
) -> List[torch.Tensor]:
|
|
||||||
"""Read a list of sound files into a list 1-D float32 torch tensors.
|
|
||||||
Args:
|
|
||||||
filenames:
|
|
||||||
A list of sound filenames.
|
|
||||||
expected_sample_rate:
|
|
||||||
The expected sample rate of the sound files.
|
|
||||||
Returns:
|
|
||||||
Return a list of 1-D float32 torch tensors.
|
|
||||||
"""
|
|
||||||
ans = []
|
|
||||||
for f in filenames:
|
|
||||||
wave, sample_rate = torchaudio.load(f)
|
|
||||||
assert (
|
|
||||||
sample_rate == expected_sample_rate
|
|
||||||
), f"expected sample rate: {expected_sample_rate}. Given: {sample_rate}"
|
|
||||||
# We use only the first channel
|
|
||||||
ans.append(wave[0])
|
|
||||||
return ans
|
|
||||||
|
|
||||||
|
|
||||||
@torch.no_grad()
|
|
||||||
def main():
|
|
||||||
parser = get_parser()
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
params = get_params()
|
|
||||||
|
|
||||||
params.update(vars(args))
|
|
||||||
|
|
||||||
lexicon = Lexicon(params.lang_dir)
|
|
||||||
params.blank_id = 0
|
|
||||||
params.vocab_size = max(lexicon.tokens) + 1
|
|
||||||
token_table = lexicon.token_table
|
|
||||||
|
|
||||||
logging.info(f"{params}")
|
|
||||||
|
|
||||||
device = torch.device("cpu")
|
|
||||||
if torch.cuda.is_available():
|
|
||||||
device = torch.device("cuda", 0)
|
|
||||||
|
|
||||||
logging.info(f"device: {device}")
|
|
||||||
|
|
||||||
logging.info("Creating model")
|
|
||||||
model = get_transducer_model(params)
|
|
||||||
|
|
||||||
num_param = sum([p.numel() for p in model.parameters()])
|
|
||||||
logging.info(f"Number of model parameters: {num_param}")
|
|
||||||
|
|
||||||
checkpoint = torch.load(args.checkpoint, map_location="cpu")
|
|
||||||
model.load_state_dict(checkpoint["model"], strict=False)
|
|
||||||
model.to(device)
|
|
||||||
model.eval()
|
|
||||||
model.device = device
|
|
||||||
|
|
||||||
logging.info("Constructing Fbank computer")
|
|
||||||
opts = kaldifeat.FbankOptions()
|
|
||||||
opts.device = device
|
|
||||||
opts.frame_opts.dither = 0
|
|
||||||
opts.frame_opts.snip_edges = False
|
|
||||||
opts.frame_opts.samp_freq = params.sample_rate
|
|
||||||
opts.mel_opts.num_bins = params.feature_dim
|
|
||||||
|
|
||||||
fbank = kaldifeat.Fbank(opts)
|
|
||||||
|
|
||||||
logging.info(f"Reading sound files: {params.sound_files}")
|
|
||||||
waves = read_sound_files(
|
|
||||||
filenames=params.sound_files, expected_sample_rate=params.sample_rate
|
|
||||||
)
|
|
||||||
waves = [w.to(device) for w in waves]
|
|
||||||
|
|
||||||
logging.info("Decoding started")
|
|
||||||
features = fbank(waves)
|
|
||||||
feature_lengths = [f.size(0) for f in features]
|
|
||||||
|
|
||||||
features = pad_sequence(features, batch_first=True, padding_value=math.log(1e-10))
|
|
||||||
|
|
||||||
feature_lengths = torch.tensor(feature_lengths, device=device)
|
|
||||||
|
|
||||||
encoder_out, encoder_out_lens = model.encoder(x=features, x_lens=feature_lengths)
|
|
||||||
|
|
||||||
num_waves = encoder_out.size(0)
|
|
||||||
hyps = []
|
|
||||||
msg = f"Using {params.method}"
|
|
||||||
if params.method == "beam_search":
|
|
||||||
msg += f" with beam size {params.beam_size}"
|
|
||||||
logging.info(msg)
|
|
||||||
|
|
||||||
if params.method == "fast_beam_search":
|
|
||||||
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
|
||||||
hyp_tokens = fast_beam_search_one_best(
|
|
||||||
model=model,
|
|
||||||
decoding_graph=decoding_graph,
|
|
||||||
encoder_out=encoder_out,
|
|
||||||
encoder_out_lens=encoder_out_lens,
|
|
||||||
beam=params.beam,
|
|
||||||
max_contexts=params.max_contexts,
|
|
||||||
max_states=params.max_states,
|
|
||||||
)
|
|
||||||
elif params.method == "modified_beam_search":
|
|
||||||
hyp_tokens = modified_beam_search(
|
|
||||||
model=model,
|
|
||||||
encoder_out=encoder_out,
|
|
||||||
encoder_out_lens=encoder_out_lens,
|
|
||||||
beam=params.beam_size,
|
|
||||||
)
|
|
||||||
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
|
||||||
hyp_tokens = greedy_search_batch(
|
|
||||||
model=model,
|
|
||||||
encoder_out=encoder_out,
|
|
||||||
encoder_out_lens=encoder_out_lens,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
for i in range(num_waves):
|
|
||||||
# fmt: off
|
|
||||||
encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
|
|
||||||
# fmt: on
|
|
||||||
if params.method == "greedy_search":
|
|
||||||
hyp_tokens = greedy_search(
|
|
||||||
model=model,
|
|
||||||
encoder_out=encoder_out_i,
|
|
||||||
max_sym_per_frame=params.max_sym_per_frame,
|
|
||||||
)
|
|
||||||
elif params.method == "beam_search":
|
|
||||||
hyp_tokens = beam_search(
|
|
||||||
model=model,
|
|
||||||
encoder_out=encoder_out_i,
|
|
||||||
beam=params.beam_size,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
raise ValueError(f"Unsupported method: {params.method}")
|
|
||||||
|
|
||||||
hyps = [[token_table[t] for t in tokens] for tokens in hyp_tokens]
|
|
||||||
s = "\n"
|
|
||||||
for filename, hyp in zip(params.sound_files, hyps):
|
|
||||||
words = " ".join(hyp)
|
|
||||||
s += f"{filename}:\n{words}\n\n"
|
|
||||||
logging.info(s)
|
|
||||||
|
|
||||||
logging.info("Decoding Done")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
|
||||||
|
|
||||||
logging.basicConfig(format=formatter, level=logging.INFO)
|
|
||||||
main()
|
|
||||||
1
egs/aishell/ASR/pruned_transducer_stateless7/pretrained.py
Symbolic link
1
egs/aishell/ASR/pruned_transducer_stateless7/pretrained.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../librispeech/ASR/pruned_transducer_stateless7/pretrained.py
|
||||||
@ -23,12 +23,13 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
import k2
|
||||||
import torch
|
import torch
|
||||||
from conformer import Conformer
|
from conformer import Conformer
|
||||||
|
|
||||||
from icefall.checkpoint import average_checkpoints, load_checkpoint
|
from icefall.checkpoint import average_checkpoints, load_checkpoint
|
||||||
from icefall.lexicon import Lexicon
|
from icefall.lexicon import Lexicon
|
||||||
from icefall.utils import AttributeDict, str2bool
|
from icefall.utils import AttributeDict, num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -63,11 +64,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--lang-dir",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500",
|
required=True,
|
||||||
help="""It contains language related input files such as "lexicon.txt"
|
help="Path to the tokens.txt.",
|
||||||
""",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -98,16 +98,16 @@ def get_params() -> AttributeDict:
|
|||||||
def main():
|
def main():
|
||||||
args = get_parser().parse_args()
|
args = get_parser().parse_args()
|
||||||
args.exp_dir = Path(args.exp_dir)
|
args.exp_dir = Path(args.exp_dir)
|
||||||
args.lang_dir = Path(args.lang_dir)
|
|
||||||
|
|
||||||
params = get_params()
|
params = get_params()
|
||||||
params.update(vars(args))
|
params.update(vars(args))
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
lexicon = Lexicon(params.lang_dir)
|
# Load tokens.txt here
|
||||||
max_token_id = max(lexicon.tokens)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
num_classes = max_token_id + 1 # +1 for the blank
|
|
||||||
|
num_classes = num_tokens(token_table) + 1 # +1 for the blank
|
||||||
|
|
||||||
device = torch.device("cpu")
|
device = torch.device("cpu")
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
|
|||||||
@ -24,7 +24,6 @@ from typing import List
|
|||||||
|
|
||||||
import k2
|
import k2
|
||||||
import kaldifeat
|
import kaldifeat
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
from conformer import Conformer
|
from conformer import Conformer
|
||||||
@ -70,11 +69,9 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
help="""Path to bpe.model.
|
help="Path to the tokens.txt.",
|
||||||
Used only when method is ctc-decoding.
|
|
||||||
""",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -83,10 +80,9 @@ def get_parser():
|
|||||||
default="1best",
|
default="1best",
|
||||||
help="""Decoding method.
|
help="""Decoding method.
|
||||||
Possible values are:
|
Possible values are:
|
||||||
(0) ctc-decoding - Use CTC decoding. It uses a sentence
|
(0) ctc-decoding - Use CTC decoding. It uses a tokens.txt file
|
||||||
piece model, i.e., lang_dir/bpe.model, to convert
|
to convert tokens to actual words or characters. It needs
|
||||||
word pieces to words. It needs neither a lexicon
|
neither a lexicon nor an n-gram LM.
|
||||||
nor an n-gram LM.
|
|
||||||
(1) 1best - Use the best path as decoding output. Only
|
(1) 1best - Use the best path as decoding output. Only
|
||||||
the transformer encoder output is used for decoding.
|
the transformer encoder output is used for decoding.
|
||||||
We call it HLG decoding.
|
We call it HLG decoding.
|
||||||
@ -297,6 +293,7 @@ def main():
|
|||||||
waves = [w.to(device) for w in waves]
|
waves = [w.to(device) for w in waves]
|
||||||
|
|
||||||
logging.info("Decoding started")
|
logging.info("Decoding started")
|
||||||
|
hyps = []
|
||||||
features = fbank(waves)
|
features = fbank(waves)
|
||||||
|
|
||||||
features = pad_sequence(features, batch_first=True, padding_value=math.log(1e-10))
|
features = pad_sequence(features, batch_first=True, padding_value=math.log(1e-10))
|
||||||
@ -313,10 +310,17 @@ def main():
|
|||||||
|
|
||||||
if params.method == "ctc-decoding":
|
if params.method == "ctc-decoding":
|
||||||
logging.info("Use CTC decoding")
|
logging.info("Use CTC decoding")
|
||||||
bpe_model = spm.SentencePieceProcessor()
|
|
||||||
bpe_model.load(params.bpe_model)
|
|
||||||
max_token_id = params.num_classes - 1
|
max_token_id = params.num_classes - 1
|
||||||
|
|
||||||
|
# Load tokens.txt here
|
||||||
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
def token_ids_to_words(token_ids: List[int]) -> str:
|
||||||
|
text = ""
|
||||||
|
for i in token_ids:
|
||||||
|
text += token_table[i]
|
||||||
|
return text.replace("▁", " ").strip()
|
||||||
|
|
||||||
H = k2.ctc_topo(
|
H = k2.ctc_topo(
|
||||||
max_token=max_token_id,
|
max_token=max_token_id,
|
||||||
modified=params.num_classes > 500,
|
modified=params.num_classes > 500,
|
||||||
@ -337,9 +341,9 @@ def main():
|
|||||||
best_path = one_best_decoding(
|
best_path = one_best_decoding(
|
||||||
lattice=lattice, use_double_scores=params.use_double_scores
|
lattice=lattice, use_double_scores=params.use_double_scores
|
||||||
)
|
)
|
||||||
token_ids = get_texts(best_path)
|
hyp_tokens = get_texts(best_path)
|
||||||
hyps = bpe_model.decode(token_ids)
|
for hyp in hyp_tokens:
|
||||||
hyps = [s.split() for s in hyps]
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method in [
|
elif params.method in [
|
||||||
"1best",
|
"1best",
|
||||||
"whole-lattice-rescoring",
|
"whole-lattice-rescoring",
|
||||||
@ -408,16 +412,16 @@ def main():
|
|||||||
)
|
)
|
||||||
best_path = next(iter(best_path_dict.values()))
|
best_path = next(iter(best_path_dict.values()))
|
||||||
|
|
||||||
hyps = get_texts(best_path)
|
|
||||||
word_sym_table = k2.SymbolTable.from_file(params.words_file)
|
word_sym_table = k2.SymbolTable.from_file(params.words_file)
|
||||||
hyps = [[word_sym_table[i] for i in ids] for ids in hyps]
|
hyp_tokens = get_texts(best_path)
|
||||||
|
for hyp in hyp_tokens:
|
||||||
|
hyps.append(" ".join([word_sym_table[i] for i in hyp]))
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported decoding method: {params.method}")
|
raise ValueError(f"Unsupported decoding method: {params.method}")
|
||||||
|
|
||||||
s = "\n"
|
s = "\n"
|
||||||
for filename, hyp in zip(params.sound_files, hyps):
|
for filename, hyp in zip(params.sound_files, hyps):
|
||||||
words = " ".join(hyp)
|
s += f"{filename}:\n{hyp}\n\n"
|
||||||
s += f"{filename}:\n{words}\n\n"
|
|
||||||
logging.info(s)
|
logging.info(s)
|
||||||
|
|
||||||
logging.info("Decoding Done")
|
logging.info("Decoding Done")
|
||||||
|
|||||||
@ -23,6 +23,7 @@
|
|||||||
Usage:
|
Usage:
|
||||||
./conformer_ctc2/export.py \
|
./conformer_ctc2/export.py \
|
||||||
--exp-dir ./conformer_ctc2/exp \
|
--exp-dir ./conformer_ctc2/exp \
|
||||||
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
@ -46,6 +47,7 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
import k2
|
||||||
import torch
|
import torch
|
||||||
from conformer import Conformer
|
from conformer import Conformer
|
||||||
from decode import get_params
|
from decode import get_params
|
||||||
@ -56,8 +58,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.lexicon import Lexicon
|
from icefall.utils import num_tokens, str2bool
|
||||||
from icefall.utils import str2bool
|
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -123,10 +124,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--lang-dir",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500",
|
required=True,
|
||||||
help="The lang dir",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -143,14 +144,14 @@ def get_parser():
|
|||||||
def main():
|
def main():
|
||||||
args = get_parser().parse_args()
|
args = get_parser().parse_args()
|
||||||
args.exp_dir = Path(args.exp_dir)
|
args.exp_dir = Path(args.exp_dir)
|
||||||
args.lang_dir = Path(args.lang_dir)
|
|
||||||
|
|
||||||
params = get_params()
|
params = get_params()
|
||||||
params.update(vars(args))
|
params.update(vars(args))
|
||||||
|
|
||||||
lexicon = Lexicon(params.lang_dir)
|
# Load tokens.txt here
|
||||||
max_token_id = max(lexicon.tokens)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
num_classes = max_token_id + 1 # +1 for the blank
|
|
||||||
|
num_classes = num_tokens(token_table) + 1 # +1 for the blank
|
||||||
|
|
||||||
device = torch.device("cpu")
|
device = torch.device("cpu")
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
|
|||||||
@ -25,7 +25,7 @@ Usage:
|
|||||||
|
|
||||||
./conformer_ctc3/export.py \
|
./conformer_ctc3/export.py \
|
||||||
--exp-dir ./conformer_ctc3/exp \
|
--exp-dir ./conformer_ctc3/exp \
|
||||||
--lang-dir data/lang_bpe_500 \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10 \
|
--avg 10 \
|
||||||
--jit-trace 1
|
--jit-trace 1
|
||||||
@ -36,7 +36,7 @@ It will generates the file: `jit_trace.pt`.
|
|||||||
|
|
||||||
./conformer_ctc3/export.py \
|
./conformer_ctc3/export.py \
|
||||||
--exp-dir ./conformer_ctc3/exp \
|
--exp-dir ./conformer_ctc3/exp \
|
||||||
--lang-dir data/lang_bpe_500 \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
@ -62,6 +62,7 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
import k2
|
||||||
import torch
|
import torch
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
from scaling_converter import convert_scaled_to_non_scaled
|
||||||
from train import add_model_arguments, get_ctc_model, get_params
|
from train import add_model_arguments, get_ctc_model, get_params
|
||||||
@ -72,8 +73,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.lexicon import Lexicon
|
from icefall.utils import num_tokens, str2bool
|
||||||
from icefall.utils import str2bool
|
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -130,10 +130,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--lang-dir",
|
"--tokens",
|
||||||
type=Path,
|
type=str,
|
||||||
default="data/lang_bpe_500",
|
required=True,
|
||||||
help="The lang dir containing word table and LG graph",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -171,9 +171,10 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
lexicon = Lexicon(params.lang_dir)
|
# Load tokens.txt here
|
||||||
max_token_id = max(lexicon.tokens)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
num_classes = max_token_id + 1 # +1 for the blank
|
|
||||||
|
num_classes = num_tokens(token_table) + 1 # +1 for the blank
|
||||||
params.vocab_size = num_classes
|
params.vocab_size = num_classes
|
||||||
|
|
||||||
if params.streaming_model:
|
if params.streaming_model:
|
||||||
|
|||||||
@ -24,7 +24,7 @@ Usage (for non-streaming mode):
|
|||||||
(1) ctc-decoding
|
(1) ctc-decoding
|
||||||
./conformer_ctc3/pretrained.py \
|
./conformer_ctc3/pretrained.py \
|
||||||
--checkpoint conformer_ctc3/exp/pretrained.pt \
|
--checkpoint conformer_ctc3/exp/pretrained.pt \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method ctc-decoding \
|
--method ctc-decoding \
|
||||||
--sample-rate 16000 \
|
--sample-rate 16000 \
|
||||||
test_wavs/1089-134686-0001.wav
|
test_wavs/1089-134686-0001.wav
|
||||||
@ -67,7 +67,6 @@ from typing import List
|
|||||||
|
|
||||||
import k2
|
import k2
|
||||||
import kaldifeat
|
import kaldifeat
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
from decode import get_decoding_params
|
from decode import get_decoding_params
|
||||||
@ -114,11 +113,9 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
help="""Path to bpe.model.
|
help="Path to the tokens.txt.",
|
||||||
Used only when method is ctc-decoding.
|
|
||||||
""",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -127,10 +124,9 @@ def get_parser():
|
|||||||
default="1best",
|
default="1best",
|
||||||
help="""Decoding method.
|
help="""Decoding method.
|
||||||
Possible values are:
|
Possible values are:
|
||||||
(0) ctc-decoding - Use CTC decoding. It uses a sentence
|
(0) ctc-decoding - Use CTC decoding. It uses a tokens.txt file
|
||||||
piece model, i.e., lang_dir/bpe.model, to convert
|
to convert tokens to actual words or characters. It needs
|
||||||
word pieces to words. It needs neither a lexicon
|
neither a lexicon nor an n-gram LM.
|
||||||
nor an n-gram LM.
|
|
||||||
(1) 1best - Use the best path as decoding output. Only
|
(1) 1best - Use the best path as decoding output. Only
|
||||||
the transformer encoder output is used for decoding.
|
the transformer encoder output is used for decoding.
|
||||||
We call it HLG decoding.
|
We call it HLG decoding.
|
||||||
@ -316,6 +312,7 @@ def main():
|
|||||||
waves = [w.to(device) for w in waves]
|
waves = [w.to(device) for w in waves]
|
||||||
|
|
||||||
logging.info("Decoding started")
|
logging.info("Decoding started")
|
||||||
|
hyps = []
|
||||||
features = fbank(waves)
|
features = fbank(waves)
|
||||||
feature_lengths = [f.size(0) for f in features]
|
feature_lengths = [f.size(0) for f in features]
|
||||||
|
|
||||||
@ -348,10 +345,17 @@ def main():
|
|||||||
|
|
||||||
if params.method == "ctc-decoding":
|
if params.method == "ctc-decoding":
|
||||||
logging.info("Use CTC decoding")
|
logging.info("Use CTC decoding")
|
||||||
bpe_model = spm.SentencePieceProcessor()
|
|
||||||
bpe_model.load(params.bpe_model)
|
|
||||||
max_token_id = params.num_classes - 1
|
max_token_id = params.num_classes - 1
|
||||||
|
|
||||||
|
# Load tokens.txt here
|
||||||
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
def token_ids_to_words(token_ids: List[int]) -> str:
|
||||||
|
text = ""
|
||||||
|
for i in token_ids:
|
||||||
|
text += token_table[i]
|
||||||
|
return text.replace("▁", " ").strip()
|
||||||
|
|
||||||
H = k2.ctc_topo(
|
H = k2.ctc_topo(
|
||||||
max_token=max_token_id,
|
max_token=max_token_id,
|
||||||
modified=False,
|
modified=False,
|
||||||
@ -372,9 +376,9 @@ def main():
|
|||||||
best_path = one_best_decoding(
|
best_path = one_best_decoding(
|
||||||
lattice=lattice, use_double_scores=params.use_double_scores
|
lattice=lattice, use_double_scores=params.use_double_scores
|
||||||
)
|
)
|
||||||
token_ids = get_texts(best_path)
|
hyp_tokens = get_texts(best_path)
|
||||||
hyps = bpe_model.decode(token_ids)
|
for hyp in hyp_tokens:
|
||||||
hyps = [s.split() for s in hyps]
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method in [
|
elif params.method in [
|
||||||
"1best",
|
"1best",
|
||||||
"nbest-rescoring",
|
"nbest-rescoring",
|
||||||
@ -439,16 +443,16 @@ def main():
|
|||||||
)
|
)
|
||||||
best_path = next(iter(best_path_dict.values()))
|
best_path = next(iter(best_path_dict.values()))
|
||||||
|
|
||||||
hyps = get_texts(best_path)
|
|
||||||
word_sym_table = k2.SymbolTable.from_file(params.words_file)
|
word_sym_table = k2.SymbolTable.from_file(params.words_file)
|
||||||
hyps = [[word_sym_table[i] for i in ids] for ids in hyps]
|
hyp_tokens = get_texts(best_path)
|
||||||
|
for hyp in hyp_tokens:
|
||||||
|
hyps.append(" ".join([word_sym_table[i] for i in hyp]))
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported decoding method: {params.method}")
|
raise ValueError(f"Unsupported decoding method: {params.method}")
|
||||||
|
|
||||||
s = "\n"
|
s = "\n"
|
||||||
for filename, hyp in zip(params.sound_files, hyps):
|
for filename, hyp in zip(params.sound_files, hyps):
|
||||||
words = " ".join(hyp)
|
s += f"{filename}:\n{hyp}\n\n"
|
||||||
s += f"{filename}:\n{words}\n\n"
|
|
||||||
logging.info(s)
|
logging.info(s)
|
||||||
|
|
||||||
logging.info("Decoding Done")
|
logging.info("Decoding Done")
|
||||||
|
|||||||
@ -22,7 +22,7 @@
|
|||||||
Usage:
|
Usage:
|
||||||
./conv_emformer_transducer_stateless/export.py \
|
./conv_emformer_transducer_stateless/export.py \
|
||||||
--exp-dir ./conv_emformer_transducer_stateless/exp \
|
--exp-dir ./conv_emformer_transducer_stateless/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 30 \
|
--epoch 30 \
|
||||||
--avg 10 \
|
--avg 10 \
|
||||||
--use-averaged-model=True \
|
--use-averaged-model=True \
|
||||||
@ -62,7 +62,7 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
|
|
||||||
@ -72,7 +72,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -118,10 +118,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
required=True,
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -166,12 +166,12 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# Load id of the <blk> token and the vocab size
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
|
|||||||
@ -8,7 +8,7 @@ for more details about how to use this file.
|
|||||||
Usage:
|
Usage:
|
||||||
./conv_emformer_transducer_stateless2/export-for-ncnn.py \
|
./conv_emformer_transducer_stateless2/export-for-ncnn.py \
|
||||||
--exp-dir ./conv_emformer_transducer_stateless2/exp \
|
--exp-dir ./conv_emformer_transducer_stateless2/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 30 \
|
--epoch 30 \
|
||||||
--avg 10 \
|
--avg 10 \
|
||||||
--use-averaged-model=True \
|
--use-averaged-model=True \
|
||||||
@ -37,7 +37,7 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
from scaling_converter import convert_scaled_to_non_scaled
|
||||||
from train2 import add_model_arguments, get_params, get_transducer_model
|
from train2 import add_model_arguments, get_params, get_transducer_model
|
||||||
@ -48,7 +48,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import setup_logger, str2bool
|
from icefall.utils import num_tokens, setup_logger, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -94,10 +94,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
required=True,
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -217,12 +217,12 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# Load id of the <blk> token and the vocab size
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
|
|||||||
@ -18,7 +18,6 @@ GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
|||||||
repo=$(basename $repo_url)
|
repo=$(basename $repo_url)
|
||||||
|
|
||||||
pushd $repo
|
pushd $repo
|
||||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
|
||||||
git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt"
|
git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt"
|
||||||
|
|
||||||
cd exp
|
cd exp
|
||||||
@ -28,7 +27,7 @@ popd
|
|||||||
2. Export the model to ONNX
|
2. Export the model to ONNX
|
||||||
|
|
||||||
./conv_emformer_transducer_stateless2/export-onnx.py \
|
./conv_emformer_transducer_stateless2/export-onnx.py \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
@ -55,14 +54,14 @@ import logging
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Tuple
|
from typing import Dict, Tuple
|
||||||
|
|
||||||
|
import k2
|
||||||
import onnx
|
import onnx
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from decoder import Decoder
|
from decoder import Decoder
|
||||||
|
from emformer import Emformer
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
from scaling_converter import convert_scaled_to_non_scaled
|
||||||
from train2 import add_model_arguments, get_params, get_transducer_model
|
from train2 import add_model_arguments, get_params, get_transducer_model
|
||||||
from emformer import Emformer
|
|
||||||
|
|
||||||
from icefall.checkpoint import (
|
from icefall.checkpoint import (
|
||||||
average_checkpoints,
|
average_checkpoints,
|
||||||
@ -70,7 +69,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import setup_logger, str2bool
|
from icefall.utils import num_tokens, setup_logger, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -127,10 +126,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
required=True,
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -484,12 +483,12 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# Load id of the <blk> token and the vocab size
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
|
|||||||
@ -22,7 +22,7 @@
|
|||||||
Usage:
|
Usage:
|
||||||
./conv_emformer_transducer_stateless2/export.py \
|
./conv_emformer_transducer_stateless2/export.py \
|
||||||
--exp-dir ./conv_emformer_transducer_stateless2/exp \
|
--exp-dir ./conv_emformer_transducer_stateless2/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 30 \
|
--epoch 30 \
|
||||||
--avg 10 \
|
--avg 10 \
|
||||||
--use-averaged-model=True \
|
--use-averaged-model=True \
|
||||||
@ -62,7 +62,7 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
from scaling_converter import convert_scaled_to_non_scaled
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
@ -73,7 +73,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -119,10 +119,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
required=True,
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -167,12 +167,12 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# Load id of the <blk> token and the vocab size
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
|
|||||||
@ -28,7 +28,7 @@ popd
|
|||||||
2. Export the model to ONNX
|
2. Export the model to ONNX
|
||||||
|
|
||||||
./conv_emformer_transducer_stateless2/export-onnx.py \
|
./conv_emformer_transducer_stateless2/export-onnx.py \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
|
|||||||
@ -26,7 +26,7 @@ Usage:
|
|||||||
|
|
||||||
./lstm_transducer_stateless/export.py \
|
./lstm_transducer_stateless/export.py \
|
||||||
--exp-dir ./lstm_transducer_stateless/exp \
|
--exp-dir ./lstm_transducer_stateless/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 35 \
|
--epoch 35 \
|
||||||
--avg 10 \
|
--avg 10 \
|
||||||
--jit-trace 1
|
--jit-trace 1
|
||||||
@ -38,7 +38,7 @@ It will generate 3 files: `encoder_jit_trace.pt`,
|
|||||||
|
|
||||||
./lstm_transducer_stateless/export.py \
|
./lstm_transducer_stateless/export.py \
|
||||||
--exp-dir ./lstm_transducer_stateless/exp \
|
--exp-dir ./lstm_transducer_stateless/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 35 \
|
--epoch 35 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
@ -79,7 +79,7 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
from scaling_converter import convert_scaled_to_non_scaled
|
||||||
@ -91,7 +91,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -148,10 +148,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -266,12 +266,13 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# Load id of the <blk> token and the vocab size, <blk> is
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
# defined in local/train_bpe_model.py
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.blank_id = token_table["<blk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
|
|||||||
@ -20,7 +20,7 @@ Usage:
|
|||||||
(1) greedy search
|
(1) greedy search
|
||||||
./lstm_transducer_stateless/pretrained.py \
|
./lstm_transducer_stateless/pretrained.py \
|
||||||
--checkpoint ./lstm_transducer_stateless/exp/pretrained.pt \
|
--checkpoint ./lstm_transducer_stateless/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
/path/to/bar.wav
|
/path/to/bar.wav
|
||||||
@ -28,7 +28,7 @@ Usage:
|
|||||||
(2) beam search
|
(2) beam search
|
||||||
./lstm_transducer_stateless/pretrained.py \
|
./lstm_transducer_stateless/pretrained.py \
|
||||||
--checkpoint ./lstm_transducer_stateless/exp/pretrained.pt \
|
--checkpoint ./lstm_transducer_stateless/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method beam_search \
|
--method beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -37,7 +37,7 @@ Usage:
|
|||||||
(3) modified beam search
|
(3) modified beam search
|
||||||
./lstm_transducer_stateless/pretrained.py \
|
./lstm_transducer_stateless/pretrained.py \
|
||||||
--checkpoint ./lstm_transducer_stateless/exp/pretrained.pt \
|
--checkpoint ./lstm_transducer_stateless/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method modified_beam_search \
|
--method modified_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -46,7 +46,7 @@ Usage:
|
|||||||
(4) fast beam search
|
(4) fast beam search
|
||||||
./lstm_transducer_stateless/pretrained.py \
|
./lstm_transducer_stateless/pretrained.py \
|
||||||
--checkpoint ./lstm_transducer_stateless/exp/pretrained.pt \
|
--checkpoint ./lstm_transducer_stateless/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method fast_beam_search \
|
--method fast_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -66,7 +66,6 @@ from typing import List
|
|||||||
|
|
||||||
import k2
|
import k2
|
||||||
import kaldifeat
|
import kaldifeat
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
from beam_search import (
|
from beam_search import (
|
||||||
@ -79,6 +78,8 @@ from beam_search import (
|
|||||||
from torch.nn.utils.rnn import pad_sequence
|
from torch.nn.utils.rnn import pad_sequence
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
|
|
||||||
|
from icefall.utils import num_tokens
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
@ -95,9 +96,9 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
help="""Path to bpe.model.""",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -214,13 +215,14 @@ def main():
|
|||||||
|
|
||||||
params.update(vars(args))
|
params.update(vars(args))
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.unk_id = sp.piece_to_id("<unk>")
|
params.unk_id = token_table["<unk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(f"{params}")
|
logging.info(f"{params}")
|
||||||
|
|
||||||
@ -275,6 +277,12 @@ def main():
|
|||||||
msg += f" with beam size {params.beam_size}"
|
msg += f" with beam size {params.beam_size}"
|
||||||
logging.info(msg)
|
logging.info(msg)
|
||||||
|
|
||||||
|
def token_ids_to_words(token_ids: List[int]) -> str:
|
||||||
|
text = ""
|
||||||
|
for i in token_ids:
|
||||||
|
text += token_table[i]
|
||||||
|
return text.replace("▁", " ").strip()
|
||||||
|
|
||||||
if params.method == "fast_beam_search":
|
if params.method == "fast_beam_search":
|
||||||
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
||||||
hyp_tokens = fast_beam_search_one_best(
|
hyp_tokens = fast_beam_search_one_best(
|
||||||
@ -286,8 +294,8 @@ def main():
|
|||||||
max_contexts=params.max_contexts,
|
max_contexts=params.max_contexts,
|
||||||
max_states=params.max_states,
|
max_states=params.max_states,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "modified_beam_search":
|
elif params.method == "modified_beam_search":
|
||||||
hyp_tokens = modified_beam_search(
|
hyp_tokens = modified_beam_search(
|
||||||
model=model,
|
model=model,
|
||||||
@ -296,16 +304,16 @@ def main():
|
|||||||
beam=params.beam_size,
|
beam=params.beam_size,
|
||||||
)
|
)
|
||||||
|
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
||||||
hyp_tokens = greedy_search_batch(
|
hyp_tokens = greedy_search_batch(
|
||||||
model=model,
|
model=model,
|
||||||
encoder_out=encoder_out,
|
encoder_out=encoder_out,
|
||||||
encoder_out_lens=encoder_out_lens,
|
encoder_out_lens=encoder_out_lens,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
else:
|
else:
|
||||||
for i in range(num_waves):
|
for i in range(num_waves):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
@ -326,12 +334,11 @@ def main():
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported method: {params.method}")
|
raise ValueError(f"Unsupported method: {params.method}")
|
||||||
|
|
||||||
hyps.append(sp.decode(hyp).split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
|
|
||||||
s = "\n"
|
s = "\n"
|
||||||
for filename, hyp in zip(params.sound_files, hyps):
|
for filename, hyp in zip(params.sound_files, hyps):
|
||||||
words = " ".join(hyp)
|
s += f"{filename}:\n{hyp}\n\n"
|
||||||
s += f"{filename}:\n{words}\n\n"
|
|
||||||
logging.info(s)
|
logging.info(s)
|
||||||
|
|
||||||
logging.info("Decoding Done")
|
logging.info("Decoding Done")
|
||||||
|
|||||||
@ -29,7 +29,7 @@ popd
|
|||||||
|
|
||||||
./lstm_transducer_stateless2/export-for-ncnn.py \
|
./lstm_transducer_stateless2/export-for-ncnn.py \
|
||||||
--exp-dir $repo/exp \
|
--exp-dir $repo/exp \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
@ -49,7 +49,7 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
from scaling_converter import convert_scaled_to_non_scaled
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
@ -60,7 +60,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import setup_logger, str2bool
|
from icefall.utils import num_tokens, setup_logger, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -106,10 +106,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -221,12 +221,13 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# Load id of the <blk> token and the vocab size, <blk> is
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
# defined in local/train_bpe_model.py
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.blank_id = token_table["<blk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
|
|||||||
@ -613,7 +613,7 @@ def main():
|
|||||||
quantize_dynamic(
|
quantize_dynamic(
|
||||||
model_input=decoder_filename,
|
model_input=decoder_filename,
|
||||||
model_output=decoder_filename_int8,
|
model_output=decoder_filename_int8,
|
||||||
op_types_to_quantize=["MatMul"],
|
op_types_to_quantize=["MatMul", "Gather"],
|
||||||
weight_type=QuantType.QInt8,
|
weight_type=QuantType.QInt8,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -28,7 +28,7 @@ popd
|
|||||||
2. Export the model to ONNX
|
2. Export the model to ONNX
|
||||||
|
|
||||||
./lstm_transducer_stateless2/export-onnx.py \
|
./lstm_transducer_stateless2/export-onnx.py \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
@ -52,8 +52,8 @@ import logging
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Optional, Tuple
|
from typing import Dict, Optional, Tuple
|
||||||
|
|
||||||
|
import k2
|
||||||
import onnx
|
import onnx
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from decoder import Decoder
|
from decoder import Decoder
|
||||||
@ -68,7 +68,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import setup_logger, str2bool
|
from icefall.utils import num_tokens, setup_logger, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -125,10 +125,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -437,12 +437,13 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# Load id of the <blk> token and the vocab size, <blk> is
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
# defined in local/train_bpe_model.py
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.blank_id = token_table["<blk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
@ -607,7 +608,7 @@ def main():
|
|||||||
quantize_dynamic(
|
quantize_dynamic(
|
||||||
model_input=decoder_filename,
|
model_input=decoder_filename,
|
||||||
model_output=decoder_filename_int8,
|
model_output=decoder_filename_int8,
|
||||||
op_types_to_quantize=["MatMul"],
|
op_types_to_quantize=["MatMul", "Gather"],
|
||||||
weight_type=QuantType.QInt8,
|
weight_type=QuantType.QInt8,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -27,7 +27,7 @@ Usage:
|
|||||||
|
|
||||||
./lstm_transducer_stateless2/export.py \
|
./lstm_transducer_stateless2/export.py \
|
||||||
--exp-dir ./lstm_transducer_stateless2/exp \
|
--exp-dir ./lstm_transducer_stateless2/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 35 \
|
--epoch 35 \
|
||||||
--avg 10 \
|
--avg 10 \
|
||||||
--jit-trace 1
|
--jit-trace 1
|
||||||
@ -39,7 +39,7 @@ It will generate 3 files: `encoder_jit_trace.pt`,
|
|||||||
|
|
||||||
./lstm_transducer_stateless2/export.py \
|
./lstm_transducer_stateless2/export.py \
|
||||||
--exp-dir ./lstm_transducer_stateless2/exp \
|
--exp-dir ./lstm_transducer_stateless2/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 35 \
|
--epoch 35 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
@ -80,7 +80,7 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
from scaling_converter import convert_scaled_to_non_scaled
|
||||||
@ -92,7 +92,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -149,10 +149,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -267,12 +267,13 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# Load id of the <blk> token and the vocab size, <blk> is
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
# defined in local/train_bpe_model.py
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.blank_id = token_table["<blk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
|
|||||||
@ -20,7 +20,7 @@ Usage:
|
|||||||
(1) greedy search
|
(1) greedy search
|
||||||
./lstm_transducer_stateless2/pretrained.py \
|
./lstm_transducer_stateless2/pretrained.py \
|
||||||
--checkpoint ./lstm_transducer_stateless2/exp/pretrained.pt \
|
--checkpoint ./lstm_transducer_stateless2/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
/path/to/bar.wav
|
/path/to/bar.wav
|
||||||
@ -28,7 +28,7 @@ Usage:
|
|||||||
(2) beam search
|
(2) beam search
|
||||||
./lstm_transducer_stateless2/pretrained.py \
|
./lstm_transducer_stateless2/pretrained.py \
|
||||||
--checkpoint ./lstm_transducer_stateless2/exp/pretrained.pt \
|
--checkpoint ./lstm_transducer_stateless2/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method beam_search \
|
--method beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -37,7 +37,7 @@ Usage:
|
|||||||
(3) modified beam search
|
(3) modified beam search
|
||||||
./lstm_transducer_stateless2/pretrained.py \
|
./lstm_transducer_stateless2/pretrained.py \
|
||||||
--checkpoint ./lstm_transducer_stateless2/exp/pretrained.pt \
|
--checkpoint ./lstm_transducer_stateless2/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method modified_beam_search \
|
--method modified_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -46,7 +46,7 @@ Usage:
|
|||||||
(4) fast beam search
|
(4) fast beam search
|
||||||
./lstm_transducer_stateless2/pretrained.py \
|
./lstm_transducer_stateless2/pretrained.py \
|
||||||
--checkpoint ./lstm_transducer_stateless2/exp/pretrained.pt \
|
--checkpoint ./lstm_transducer_stateless2/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method fast_beam_search \
|
--method fast_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -69,7 +69,6 @@ from typing import List
|
|||||||
|
|
||||||
import k2
|
import k2
|
||||||
import kaldifeat
|
import kaldifeat
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
from beam_search import (
|
from beam_search import (
|
||||||
@ -82,6 +81,8 @@ from beam_search import (
|
|||||||
from torch.nn.utils.rnn import pad_sequence
|
from torch.nn.utils.rnn import pad_sequence
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
|
|
||||||
|
from icefall.utils import num_tokens
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
@ -98,9 +99,9 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
help="""Path to bpe.model.""",
|
help="""Path to tokens.txt.""",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -217,13 +218,14 @@ def main():
|
|||||||
|
|
||||||
params.update(vars(args))
|
params.update(vars(args))
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.unk_id = sp.piece_to_id("<unk>")
|
params.unk_id = token_table["<unk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(f"{params}")
|
logging.info(f"{params}")
|
||||||
|
|
||||||
@ -278,6 +280,12 @@ def main():
|
|||||||
msg += f" with beam size {params.beam_size}"
|
msg += f" with beam size {params.beam_size}"
|
||||||
logging.info(msg)
|
logging.info(msg)
|
||||||
|
|
||||||
|
def token_ids_to_words(token_ids: List[int]) -> str:
|
||||||
|
text = ""
|
||||||
|
for i in token_ids:
|
||||||
|
text += token_table[i]
|
||||||
|
return text.replace("▁", " ").strip()
|
||||||
|
|
||||||
if params.method == "fast_beam_search":
|
if params.method == "fast_beam_search":
|
||||||
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
||||||
hyp_tokens = fast_beam_search_one_best(
|
hyp_tokens = fast_beam_search_one_best(
|
||||||
@ -289,8 +297,8 @@ def main():
|
|||||||
max_contexts=params.max_contexts,
|
max_contexts=params.max_contexts,
|
||||||
max_states=params.max_states,
|
max_states=params.max_states,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "modified_beam_search":
|
elif params.method == "modified_beam_search":
|
||||||
hyp_tokens = modified_beam_search(
|
hyp_tokens = modified_beam_search(
|
||||||
model=model,
|
model=model,
|
||||||
@ -299,16 +307,16 @@ def main():
|
|||||||
beam=params.beam_size,
|
beam=params.beam_size,
|
||||||
)
|
)
|
||||||
|
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
||||||
hyp_tokens = greedy_search_batch(
|
hyp_tokens = greedy_search_batch(
|
||||||
model=model,
|
model=model,
|
||||||
encoder_out=encoder_out,
|
encoder_out=encoder_out,
|
||||||
encoder_out_lens=encoder_out_lens,
|
encoder_out_lens=encoder_out_lens,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
else:
|
else:
|
||||||
for i in range(num_waves):
|
for i in range(num_waves):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
@ -329,12 +337,11 @@ def main():
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported method: {params.method}")
|
raise ValueError(f"Unsupported method: {params.method}")
|
||||||
|
|
||||||
hyps.append(sp.decode(hyp).split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
|
|
||||||
s = "\n"
|
s = "\n"
|
||||||
for filename, hyp in zip(params.sound_files, hyps):
|
for filename, hyp in zip(params.sound_files, hyps):
|
||||||
words = " ".join(hyp)
|
s += f"{filename}:\n{hyp}\n\n"
|
||||||
s += f"{filename}:\n{words}\n\n"
|
|
||||||
logging.info(s)
|
logging.info(s)
|
||||||
|
|
||||||
logging.info("Decoding Done")
|
logging.info("Decoding Done")
|
||||||
|
|||||||
@ -26,7 +26,7 @@ Usage:
|
|||||||
|
|
||||||
./lstm_transducer_stateless3/export.py \
|
./lstm_transducer_stateless3/export.py \
|
||||||
--exp-dir ./lstm_transducer_stateless3/exp \
|
--exp-dir ./lstm_transducer_stateless3/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 40 \
|
--epoch 40 \
|
||||||
--avg 20 \
|
--avg 20 \
|
||||||
--jit-trace 1
|
--jit-trace 1
|
||||||
@ -38,7 +38,7 @@ It will generate 3 files: `encoder_jit_trace.pt`,
|
|||||||
|
|
||||||
./lstm_transducer_stateless3/export.py \
|
./lstm_transducer_stateless3/export.py \
|
||||||
--exp-dir ./lstm_transducer_stateless3/exp \
|
--exp-dir ./lstm_transducer_stateless3/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 40 \
|
--epoch 40 \
|
||||||
--avg 20
|
--avg 20
|
||||||
|
|
||||||
@ -79,7 +79,7 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
from scaling_converter import convert_scaled_to_non_scaled
|
||||||
@ -91,7 +91,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -148,10 +148,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -266,12 +266,13 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# Load id of the <blk> token and the vocab size, <blk> is
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
# defined in local/train_bpe_model.py
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.blank_id = token_table["<blk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
|
|||||||
@ -20,7 +20,7 @@ Usage:
|
|||||||
(1) greedy search
|
(1) greedy search
|
||||||
./lstm_transducer_stateless3/pretrained.py \
|
./lstm_transducer_stateless3/pretrained.py \
|
||||||
--checkpoint ./lstm_transducer_stateless3/exp/pretrained.pt \
|
--checkpoint ./lstm_transducer_stateless3/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
/path/to/bar.wav
|
/path/to/bar.wav
|
||||||
@ -28,7 +28,7 @@ Usage:
|
|||||||
(2) beam search
|
(2) beam search
|
||||||
./lstm_transducer_stateless3/pretrained.py \
|
./lstm_transducer_stateless3/pretrained.py \
|
||||||
--checkpoint ./lstm_transducer_stateless3/exp/pretrained.pt \
|
--checkpoint ./lstm_transducer_stateless3/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method beam_search \
|
--method beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -37,7 +37,7 @@ Usage:
|
|||||||
(3) modified beam search
|
(3) modified beam search
|
||||||
./lstm_transducer_stateless3/pretrained.py \
|
./lstm_transducer_stateless3/pretrained.py \
|
||||||
--checkpoint ./lstm_transducer_stateless3/exp/pretrained.pt \
|
--checkpoint ./lstm_transducer_stateless3/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method modified_beam_search \
|
--method modified_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -79,6 +79,8 @@ from beam_search import (
|
|||||||
from torch.nn.utils.rnn import pad_sequence
|
from torch.nn.utils.rnn import pad_sequence
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
|
|
||||||
|
from icefall.utils import num_tokens
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
@ -95,9 +97,9 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
help="""Path to bpe.model.""",
|
help="""Path to tokens.txt.""",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -214,13 +216,14 @@ def main():
|
|||||||
|
|
||||||
params.update(vars(args))
|
params.update(vars(args))
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.unk_id = sp.piece_to_id("<unk>")
|
params.unk_id = token_table["<unk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(f"{params}")
|
logging.info(f"{params}")
|
||||||
|
|
||||||
@ -275,6 +278,12 @@ def main():
|
|||||||
msg += f" with beam size {params.beam_size}"
|
msg += f" with beam size {params.beam_size}"
|
||||||
logging.info(msg)
|
logging.info(msg)
|
||||||
|
|
||||||
|
def token_ids_to_words(token_ids: List[int]) -> str:
|
||||||
|
text = ""
|
||||||
|
for i in token_ids:
|
||||||
|
text += token_table[i]
|
||||||
|
return text.replace("▁", " ").strip()
|
||||||
|
|
||||||
if params.method == "fast_beam_search":
|
if params.method == "fast_beam_search":
|
||||||
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
||||||
hyp_tokens = fast_beam_search_one_best(
|
hyp_tokens = fast_beam_search_one_best(
|
||||||
@ -286,8 +295,8 @@ def main():
|
|||||||
max_contexts=params.max_contexts,
|
max_contexts=params.max_contexts,
|
||||||
max_states=params.max_states,
|
max_states=params.max_states,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "modified_beam_search":
|
elif params.method == "modified_beam_search":
|
||||||
hyp_tokens = modified_beam_search(
|
hyp_tokens = modified_beam_search(
|
||||||
model=model,
|
model=model,
|
||||||
@ -296,16 +305,16 @@ def main():
|
|||||||
beam=params.beam_size,
|
beam=params.beam_size,
|
||||||
)
|
)
|
||||||
|
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
||||||
hyp_tokens = greedy_search_batch(
|
hyp_tokens = greedy_search_batch(
|
||||||
model=model,
|
model=model,
|
||||||
encoder_out=encoder_out,
|
encoder_out=encoder_out,
|
||||||
encoder_out_lens=encoder_out_lens,
|
encoder_out_lens=encoder_out_lens,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
else:
|
else:
|
||||||
for i in range(num_waves):
|
for i in range(num_waves):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
@ -326,12 +335,11 @@ def main():
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported method: {params.method}")
|
raise ValueError(f"Unsupported method: {params.method}")
|
||||||
|
|
||||||
hyps.append(sp.decode(hyp).split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
|
|
||||||
s = "\n"
|
s = "\n"
|
||||||
for filename, hyp in zip(params.sound_files, hyps):
|
for filename, hyp in zip(params.sound_files, hyps):
|
||||||
words = " ".join(hyp)
|
s += f"{filename}:\n{hyp}\n\n"
|
||||||
s += f"{filename}:\n{words}\n\n"
|
|
||||||
logging.info(s)
|
logging.info(s)
|
||||||
|
|
||||||
logging.info("Decoding Done")
|
logging.info("Decoding Done")
|
||||||
|
|||||||
@ -22,7 +22,7 @@
|
|||||||
Usage:
|
Usage:
|
||||||
./prunted_stateless_emformer_rnnt/export.py \
|
./prunted_stateless_emformer_rnnt/export.py \
|
||||||
--exp-dir ./prunted_stateless_emformer_rnnt/exp \
|
--exp-dir ./prunted_stateless_emformer_rnnt/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
@ -48,7 +48,7 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
|
|
||||||
@ -58,7 +58,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -115,10 +115,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -154,13 +154,12 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
# <blk> and <unk> are defined in local/train_bpe_model.py
|
# Load id of the <blk> token and the vocab size
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.unk_id = sp.piece_to_id("<unk>")
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
params.vocab_size = sp.get_piece_size()
|
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
|
|||||||
@ -508,7 +508,7 @@ def main():
|
|||||||
quantize_dynamic(
|
quantize_dynamic(
|
||||||
model_input=decoder_filename,
|
model_input=decoder_filename,
|
||||||
model_output=decoder_filename_int8,
|
model_output=decoder_filename_int8,
|
||||||
op_types_to_quantize=["MatMul"],
|
op_types_to_quantize=["MatMul", "Gather"],
|
||||||
weight_type=QuantType.QInt8,
|
weight_type=QuantType.QInt8,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -22,7 +22,7 @@
|
|||||||
Usage:
|
Usage:
|
||||||
./pruned_transducer_stateless/export.py \
|
./pruned_transducer_stateless/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless/exp \
|
--exp-dir ./pruned_transducer_stateless/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
@ -47,12 +47,12 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
|
|
||||||
from icefall.checkpoint import average_checkpoints, load_checkpoint
|
from icefall.checkpoint import average_checkpoints, load_checkpoint
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -87,10 +87,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -135,13 +135,13 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# Load id of the <blk> token and the vocab size, <blk> is
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
# defined in local/train_bpe_model.py
|
||||||
params.unk_id = sp.piece_to_id("<unk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
if params.streaming_model:
|
if params.streaming_model:
|
||||||
assert params.causal_convolution
|
assert params.causal_convolution
|
||||||
|
|||||||
@ -20,7 +20,7 @@ Usage:
|
|||||||
(1) greedy search
|
(1) greedy search
|
||||||
./pruned_transducer_stateless/pretrained.py \
|
./pruned_transducer_stateless/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
/path/to/bar.wav
|
/path/to/bar.wav
|
||||||
@ -28,7 +28,7 @@ Usage:
|
|||||||
(2) beam search
|
(2) beam search
|
||||||
./pruned_transducer_stateless/pretrained.py \
|
./pruned_transducer_stateless/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method beam_search \
|
--method beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -37,7 +37,7 @@ Usage:
|
|||||||
(3) modified beam search
|
(3) modified beam search
|
||||||
./pruned_transducer_stateless/pretrained.py \
|
./pruned_transducer_stateless/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method modified_beam_search \
|
--method modified_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -46,7 +46,7 @@ Usage:
|
|||||||
(4) fast beam search
|
(4) fast beam search
|
||||||
./pruned_transducer_stateless/pretrained.py \
|
./pruned_transducer_stateless/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method fast_beam_search \
|
--method fast_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -66,7 +66,6 @@ from typing import List
|
|||||||
|
|
||||||
import k2
|
import k2
|
||||||
import kaldifeat
|
import kaldifeat
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
from beam_search import (
|
from beam_search import (
|
||||||
@ -79,7 +78,7 @@ from beam_search import (
|
|||||||
from torch.nn.utils.rnn import pad_sequence
|
from torch.nn.utils.rnn import pad_sequence
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
|
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -97,9 +96,9 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
help="""Path to bpe.model.""",
|
help="""Path to tokens.txt.""",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -237,13 +236,14 @@ def main():
|
|||||||
|
|
||||||
params.update(vars(args))
|
params.update(vars(args))
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.unk_id = sp.piece_to_id("<unk>")
|
params.unk_id = token_table["<unk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
if params.simulate_streaming:
|
if params.simulate_streaming:
|
||||||
assert (
|
assert (
|
||||||
@ -314,6 +314,12 @@ def main():
|
|||||||
msg += f" with beam size {params.beam_size}"
|
msg += f" with beam size {params.beam_size}"
|
||||||
logging.info(msg)
|
logging.info(msg)
|
||||||
|
|
||||||
|
def token_ids_to_words(token_ids: List[int]) -> str:
|
||||||
|
text = ""
|
||||||
|
for i in token_ids:
|
||||||
|
text += token_table[i]
|
||||||
|
return text.replace("▁", " ").strip()
|
||||||
|
|
||||||
if params.method == "fast_beam_search":
|
if params.method == "fast_beam_search":
|
||||||
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
||||||
hyp_tokens = fast_beam_search_one_best(
|
hyp_tokens = fast_beam_search_one_best(
|
||||||
@ -325,8 +331,8 @@ def main():
|
|||||||
max_contexts=params.max_contexts,
|
max_contexts=params.max_contexts,
|
||||||
max_states=params.max_states,
|
max_states=params.max_states,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "modified_beam_search":
|
elif params.method == "modified_beam_search":
|
||||||
hyp_tokens = modified_beam_search(
|
hyp_tokens = modified_beam_search(
|
||||||
model=model,
|
model=model,
|
||||||
@ -335,16 +341,16 @@ def main():
|
|||||||
beam=params.beam_size,
|
beam=params.beam_size,
|
||||||
)
|
)
|
||||||
|
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
||||||
hyp_tokens = greedy_search_batch(
|
hyp_tokens = greedy_search_batch(
|
||||||
model=model,
|
model=model,
|
||||||
encoder_out=encoder_out,
|
encoder_out=encoder_out,
|
||||||
encoder_out_lens=encoder_out_lens,
|
encoder_out_lens=encoder_out_lens,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
else:
|
else:
|
||||||
for i in range(num_waves):
|
for i in range(num_waves):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
@ -365,12 +371,11 @@ def main():
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported method: {params.method}")
|
raise ValueError(f"Unsupported method: {params.method}")
|
||||||
|
|
||||||
hyps.append(sp.decode(hyp).split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
|
|
||||||
s = "\n"
|
s = "\n"
|
||||||
for filename, hyp in zip(params.sound_files, hyps):
|
for filename, hyp in zip(params.sound_files, hyps):
|
||||||
words = " ".join(hyp)
|
s += f"{filename}:\n{hyp}\n\n"
|
||||||
s += f"{filename}:\n{words}\n\n"
|
|
||||||
logging.info(s)
|
logging.info(s)
|
||||||
|
|
||||||
logging.info("Decoding Done")
|
logging.info("Decoding Done")
|
||||||
|
|||||||
@ -1008,7 +1008,7 @@ def modified_beam_search(
|
|||||||
for i in range(N):
|
for i in range(N):
|
||||||
B[i].add(
|
B[i].add(
|
||||||
Hypothesis(
|
Hypothesis(
|
||||||
ys=[blank_id] * context_size,
|
ys=[-1] * (context_size - 1) + [blank_id],
|
||||||
log_prob=torch.zeros(1, dtype=torch.float32, device=device),
|
log_prob=torch.zeros(1, dtype=torch.float32, device=device),
|
||||||
context_state=None if context_graph is None else context_graph.root,
|
context_state=None if context_graph is None else context_graph.root,
|
||||||
timestamp=[],
|
timestamp=[],
|
||||||
@ -1217,7 +1217,7 @@ def modified_beam_search_lm_rescore(
|
|||||||
for i in range(N):
|
for i in range(N):
|
||||||
B[i].add(
|
B[i].add(
|
||||||
Hypothesis(
|
Hypothesis(
|
||||||
ys=[blank_id] * context_size,
|
ys=[-1] * (context_size - 1) + [blank_id],
|
||||||
log_prob=torch.zeros(1, dtype=torch.float32, device=device),
|
log_prob=torch.zeros(1, dtype=torch.float32, device=device),
|
||||||
timestamp=[],
|
timestamp=[],
|
||||||
)
|
)
|
||||||
@ -1417,7 +1417,7 @@ def modified_beam_search_lm_rescore_LODR(
|
|||||||
for i in range(N):
|
for i in range(N):
|
||||||
B[i].add(
|
B[i].add(
|
||||||
Hypothesis(
|
Hypothesis(
|
||||||
ys=[blank_id] * context_size,
|
ys=[-1] * (context_size - 1) + [blank_id],
|
||||||
log_prob=torch.zeros(1, dtype=torch.float32, device=device),
|
log_prob=torch.zeros(1, dtype=torch.float32, device=device),
|
||||||
timestamp=[],
|
timestamp=[],
|
||||||
)
|
)
|
||||||
@ -1617,7 +1617,7 @@ def _deprecated_modified_beam_search(
|
|||||||
B = HypothesisList()
|
B = HypothesisList()
|
||||||
B.add(
|
B.add(
|
||||||
Hypothesis(
|
Hypothesis(
|
||||||
ys=[blank_id] * context_size,
|
ys=[-1] * (context_size - 1) + [blank_id],
|
||||||
log_prob=torch.zeros(1, dtype=torch.float32, device=device),
|
log_prob=torch.zeros(1, dtype=torch.float32, device=device),
|
||||||
timestamp=[],
|
timestamp=[],
|
||||||
)
|
)
|
||||||
@ -1753,7 +1753,11 @@ def beam_search(
|
|||||||
t = 0
|
t = 0
|
||||||
|
|
||||||
B = HypothesisList()
|
B = HypothesisList()
|
||||||
B.add(Hypothesis(ys=[blank_id] * context_size, log_prob=0.0, timestamp=[]))
|
B.add(
|
||||||
|
Hypothesis(
|
||||||
|
ys=[-1] * (context_size - 1) + [blank_id], log_prob=0.0, timestamp=[]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
max_sym_per_utt = 20000
|
max_sym_per_utt = 20000
|
||||||
|
|
||||||
@ -2265,7 +2269,7 @@ def modified_beam_search_ngram_rescoring(
|
|||||||
for i in range(N):
|
for i in range(N):
|
||||||
B[i].add(
|
B[i].add(
|
||||||
Hypothesis(
|
Hypothesis(
|
||||||
ys=[blank_id] * context_size,
|
ys=[-1] * (context_size - 1) + [blank_id],
|
||||||
log_prob=torch.zeros(1, dtype=torch.float32, device=device),
|
log_prob=torch.zeros(1, dtype=torch.float32, device=device),
|
||||||
state_cost=NgramLmStateCost(ngram_lm),
|
state_cost=NgramLmStateCost(ngram_lm),
|
||||||
)
|
)
|
||||||
@ -2446,7 +2450,7 @@ def modified_beam_search_LODR(
|
|||||||
for i in range(N):
|
for i in range(N):
|
||||||
B[i].add(
|
B[i].add(
|
||||||
Hypothesis(
|
Hypothesis(
|
||||||
ys=[blank_id] * context_size,
|
ys=[-1] * (context_size - 1) + [blank_id],
|
||||||
log_prob=torch.zeros(1, dtype=torch.float32, device=device),
|
log_prob=torch.zeros(1, dtype=torch.float32, device=device),
|
||||||
state=init_states, # state of the NN LM
|
state=init_states, # state of the NN LM
|
||||||
lm_score=init_score.reshape(-1),
|
lm_score=init_score.reshape(-1),
|
||||||
@ -2709,7 +2713,7 @@ def modified_beam_search_lm_shallow_fusion(
|
|||||||
for i in range(N):
|
for i in range(N):
|
||||||
B[i].add(
|
B[i].add(
|
||||||
Hypothesis(
|
Hypothesis(
|
||||||
ys=[blank_id] * context_size,
|
ys=[-1] * (context_size - 1) + [blank_id],
|
||||||
log_prob=torch.zeros(1, dtype=torch.float32, device=device),
|
log_prob=torch.zeros(1, dtype=torch.float32, device=device),
|
||||||
state=init_states,
|
state=init_states,
|
||||||
lm_score=init_score.reshape(-1),
|
lm_score=init_score.reshape(-1),
|
||||||
|
|||||||
@ -22,7 +22,7 @@
|
|||||||
Usage:
|
Usage:
|
||||||
./pruned_transducer_stateless2/export.py \
|
./pruned_transducer_stateless2/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless2/exp \
|
--exp-dir ./pruned_transducer_stateless2/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
@ -47,12 +47,12 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
|
|
||||||
from icefall.checkpoint import average_checkpoints, find_checkpoints, load_checkpoint
|
from icefall.checkpoint import average_checkpoints, find_checkpoints, load_checkpoint
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -98,10 +98,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -145,12 +145,14 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.unk_id = token_table["<unk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
if params.streaming_model:
|
if params.streaming_model:
|
||||||
assert params.causal_convolution
|
assert params.causal_convolution
|
||||||
|
|||||||
@ -20,7 +20,7 @@ Usage:
|
|||||||
(1) greedy search
|
(1) greedy search
|
||||||
./pruned_transducer_stateless2/pretrained.py \
|
./pruned_transducer_stateless2/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless2/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless2/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
/path/to/bar.wav
|
/path/to/bar.wav
|
||||||
@ -28,7 +28,7 @@ Usage:
|
|||||||
(2) beam search
|
(2) beam search
|
||||||
./pruned_transducer_stateless2/pretrained.py \
|
./pruned_transducer_stateless2/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless2/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless2/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method beam_search \
|
--method beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -37,7 +37,7 @@ Usage:
|
|||||||
(3) modified beam search
|
(3) modified beam search
|
||||||
./pruned_transducer_stateless2/pretrained.py \
|
./pruned_transducer_stateless2/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless2/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless2/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method modified_beam_search \
|
--method modified_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -46,7 +46,7 @@ Usage:
|
|||||||
(4) fast beam search
|
(4) fast beam search
|
||||||
./pruned_transducer_stateless2/pretrained.py \
|
./pruned_transducer_stateless2/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless2/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless2/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method fast_beam_search \
|
--method fast_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -66,7 +66,6 @@ from typing import List
|
|||||||
|
|
||||||
import k2
|
import k2
|
||||||
import kaldifeat
|
import kaldifeat
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
from beam_search import (
|
from beam_search import (
|
||||||
@ -79,7 +78,7 @@ from beam_search import (
|
|||||||
from torch.nn.utils.rnn import pad_sequence
|
from torch.nn.utils.rnn import pad_sequence
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
|
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -97,9 +96,9 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
help="""Path to bpe.model.""",
|
help="""Path to tokens.txt.""",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -238,13 +237,14 @@ def main():
|
|||||||
|
|
||||||
params.update(vars(args))
|
params.update(vars(args))
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.unk_id = sp.piece_to_id("<unk>")
|
params.unk_id = token_table["<unk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
if params.simulate_streaming:
|
if params.simulate_streaming:
|
||||||
assert (
|
assert (
|
||||||
@ -315,6 +315,12 @@ def main():
|
|||||||
msg += f" with beam size {params.beam_size}"
|
msg += f" with beam size {params.beam_size}"
|
||||||
logging.info(msg)
|
logging.info(msg)
|
||||||
|
|
||||||
|
def token_ids_to_words(token_ids: List[int]) -> str:
|
||||||
|
text = ""
|
||||||
|
for i in token_ids:
|
||||||
|
text += token_table[i]
|
||||||
|
return text.replace("▁", " ").strip()
|
||||||
|
|
||||||
if params.method == "fast_beam_search":
|
if params.method == "fast_beam_search":
|
||||||
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
||||||
hyp_tokens = fast_beam_search_one_best(
|
hyp_tokens = fast_beam_search_one_best(
|
||||||
@ -326,8 +332,8 @@ def main():
|
|||||||
max_contexts=params.max_contexts,
|
max_contexts=params.max_contexts,
|
||||||
max_states=params.max_states,
|
max_states=params.max_states,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "modified_beam_search":
|
elif params.method == "modified_beam_search":
|
||||||
hyp_tokens = modified_beam_search(
|
hyp_tokens = modified_beam_search(
|
||||||
model=model,
|
model=model,
|
||||||
@ -336,16 +342,16 @@ def main():
|
|||||||
beam=params.beam_size,
|
beam=params.beam_size,
|
||||||
)
|
)
|
||||||
|
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
||||||
hyp_tokens = greedy_search_batch(
|
hyp_tokens = greedy_search_batch(
|
||||||
model=model,
|
model=model,
|
||||||
encoder_out=encoder_out,
|
encoder_out=encoder_out,
|
||||||
encoder_out_lens=encoder_out_lens,
|
encoder_out_lens=encoder_out_lens,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
else:
|
else:
|
||||||
for i in range(num_waves):
|
for i in range(num_waves):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
@ -366,12 +372,11 @@ def main():
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported method: {params.method}")
|
raise ValueError(f"Unsupported method: {params.method}")
|
||||||
|
|
||||||
hyps.append(sp.decode(hyp).split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
|
|
||||||
s = "\n"
|
s = "\n"
|
||||||
for filename, hyp in zip(params.sound_files, hyps):
|
for filename, hyp in zip(params.sound_files, hyps):
|
||||||
words = " ".join(hyp)
|
s += f"{filename}:\n{hyp}\n\n"
|
||||||
s += f"{filename}:\n{words}\n\n"
|
|
||||||
logging.info(s)
|
logging.info(s)
|
||||||
|
|
||||||
logging.info("Decoding Done")
|
logging.info("Decoding Done")
|
||||||
|
|||||||
@ -28,7 +28,7 @@ popd
|
|||||||
2. Export the model to ONNX
|
2. Export the model to ONNX
|
||||||
|
|
||||||
./pruned_transducer_stateless3/export-onnx.py \
|
./pruned_transducer_stateless3/export-onnx.py \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 9999 \
|
--epoch 9999 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
--exp-dir $repo/exp/
|
--exp-dir $repo/exp/
|
||||||
@ -48,8 +48,8 @@ import logging
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Tuple
|
from typing import Dict, Tuple
|
||||||
|
|
||||||
|
import k2
|
||||||
import onnx
|
import onnx
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from conformer import Conformer
|
from conformer import Conformer
|
||||||
@ -59,7 +59,7 @@ from scaling_converter import convert_scaled_to_non_scaled
|
|||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
|
|
||||||
from icefall.checkpoint import average_checkpoints, find_checkpoints, load_checkpoint
|
from icefall.checkpoint import average_checkpoints, find_checkpoints, load_checkpoint
|
||||||
from icefall.utils import setup_logger
|
from icefall.utils import num_tokens, setup_logger
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -105,10 +105,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -393,12 +393,14 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.unk_id = token_table["<unk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
@ -518,7 +520,7 @@ def main():
|
|||||||
quantize_dynamic(
|
quantize_dynamic(
|
||||||
model_input=decoder_filename,
|
model_input=decoder_filename,
|
||||||
model_output=decoder_filename_int8,
|
model_output=decoder_filename_int8,
|
||||||
op_types_to_quantize=["MatMul"],
|
op_types_to_quantize=["MatMul", "Gather"],
|
||||||
weight_type=QuantType.QInt8,
|
weight_type=QuantType.QInt8,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -26,7 +26,7 @@ Usage:
|
|||||||
|
|
||||||
./pruned_transducer_stateless3/export.py \
|
./pruned_transducer_stateless3/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless3/exp \
|
--exp-dir ./pruned_transducer_stateless3/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10 \
|
--avg 10 \
|
||||||
--jit 1
|
--jit 1
|
||||||
@ -44,7 +44,7 @@ It will also generate 3 other files: `encoder_jit_script.pt`,
|
|||||||
|
|
||||||
./pruned_transducer_stateless3/export.py \
|
./pruned_transducer_stateless3/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless3/exp \
|
--exp-dir ./pruned_transducer_stateless3/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10 \
|
--avg 10 \
|
||||||
--jit-trace 1
|
--jit-trace 1
|
||||||
@ -56,7 +56,7 @@ It will generates 3 files: `encoder_jit_trace.pt`,
|
|||||||
|
|
||||||
./pruned_transducer_stateless3/export.py \
|
./pruned_transducer_stateless3/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless3/exp \
|
--exp-dir ./pruned_transducer_stateless3/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
@ -97,14 +97,14 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
from scaling_converter import convert_scaled_to_non_scaled
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
|
|
||||||
from icefall.checkpoint import average_checkpoints, find_checkpoints, load_checkpoint
|
from icefall.checkpoint import average_checkpoints, find_checkpoints, load_checkpoint
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -150,10 +150,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -342,12 +342,14 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.unk_id = token_table["<unk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
if params.streaming_model:
|
if params.streaming_model:
|
||||||
assert params.causal_convolution
|
assert params.causal_convolution
|
||||||
|
|||||||
@ -20,7 +20,7 @@ You can generate the checkpoint with the following command:
|
|||||||
|
|
||||||
./pruned_transducer_stateless3/export.py \
|
./pruned_transducer_stateless3/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless3/exp \
|
--exp-dir ./pruned_transducer_stateless3/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
@ -29,7 +29,7 @@ Usage of this script:
|
|||||||
(1) greedy search
|
(1) greedy search
|
||||||
./pruned_transducer_stateless3/pretrained.py \
|
./pruned_transducer_stateless3/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless3/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless3/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
/path/to/bar.wav
|
/path/to/bar.wav
|
||||||
@ -37,7 +37,7 @@ Usage of this script:
|
|||||||
(2) beam search
|
(2) beam search
|
||||||
./pruned_transducer_stateless3/pretrained.py \
|
./pruned_transducer_stateless3/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless3/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless3/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method beam_search \
|
--method beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -46,7 +46,7 @@ Usage of this script:
|
|||||||
(3) modified beam search
|
(3) modified beam search
|
||||||
./pruned_transducer_stateless3/pretrained.py \
|
./pruned_transducer_stateless3/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless3/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless3/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method modified_beam_search \
|
--method modified_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -55,7 +55,7 @@ Usage of this script:
|
|||||||
(4) fast beam search
|
(4) fast beam search
|
||||||
./pruned_transducer_stateless3/pretrained.py \
|
./pruned_transducer_stateless3/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless3/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless3/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method fast_beam_search \
|
--method fast_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -75,7 +75,6 @@ from typing import List
|
|||||||
|
|
||||||
import k2
|
import k2
|
||||||
import kaldifeat
|
import kaldifeat
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
from beam_search import (
|
from beam_search import (
|
||||||
@ -88,7 +87,7 @@ from beam_search import (
|
|||||||
from torch.nn.utils.rnn import pad_sequence
|
from torch.nn.utils.rnn import pad_sequence
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
|
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -106,9 +105,9 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
help="""Path to bpe.model.""",
|
help="""Path to tokens.txt.""",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -247,13 +246,14 @@ def main():
|
|||||||
|
|
||||||
params.update(vars(args))
|
params.update(vars(args))
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.unk_id = sp.piece_to_id("<unk>")
|
params.unk_id = token_table["<unk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
if params.simulate_streaming:
|
if params.simulate_streaming:
|
||||||
assert (
|
assert (
|
||||||
@ -324,6 +324,12 @@ def main():
|
|||||||
msg += f" with beam size {params.beam_size}"
|
msg += f" with beam size {params.beam_size}"
|
||||||
logging.info(msg)
|
logging.info(msg)
|
||||||
|
|
||||||
|
def token_ids_to_words(token_ids: List[int]) -> str:
|
||||||
|
text = ""
|
||||||
|
for i in token_ids:
|
||||||
|
text += token_table[i]
|
||||||
|
return text.replace("▁", " ").strip()
|
||||||
|
|
||||||
if params.method == "fast_beam_search":
|
if params.method == "fast_beam_search":
|
||||||
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
||||||
hyp_tokens = fast_beam_search_one_best(
|
hyp_tokens = fast_beam_search_one_best(
|
||||||
@ -335,8 +341,8 @@ def main():
|
|||||||
max_contexts=params.max_contexts,
|
max_contexts=params.max_contexts,
|
||||||
max_states=params.max_states,
|
max_states=params.max_states,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "modified_beam_search":
|
elif params.method == "modified_beam_search":
|
||||||
hyp_tokens = modified_beam_search(
|
hyp_tokens = modified_beam_search(
|
||||||
model=model,
|
model=model,
|
||||||
@ -345,16 +351,16 @@ def main():
|
|||||||
beam=params.beam_size,
|
beam=params.beam_size,
|
||||||
)
|
)
|
||||||
|
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
||||||
hyp_tokens = greedy_search_batch(
|
hyp_tokens = greedy_search_batch(
|
||||||
model=model,
|
model=model,
|
||||||
encoder_out=encoder_out,
|
encoder_out=encoder_out,
|
||||||
encoder_out_lens=encoder_out_lens,
|
encoder_out_lens=encoder_out_lens,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
else:
|
else:
|
||||||
for i in range(num_waves):
|
for i in range(num_waves):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
@ -375,12 +381,11 @@ def main():
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported method: {params.method}")
|
raise ValueError(f"Unsupported method: {params.method}")
|
||||||
|
|
||||||
hyps.append(sp.decode(hyp).split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
|
|
||||||
s = "\n"
|
s = "\n"
|
||||||
for filename, hyp in zip(params.sound_files, hyps):
|
for filename, hyp in zip(params.sound_files, hyps):
|
||||||
words = " ".join(hyp)
|
s += f"{filename}:\n{hyp}\n\n"
|
||||||
s += f"{filename}:\n{words}\n\n"
|
|
||||||
logging.info(s)
|
logging.info(s)
|
||||||
|
|
||||||
logging.info("Decoding Done")
|
logging.info("Decoding Done")
|
||||||
|
|||||||
@ -22,7 +22,7 @@
|
|||||||
Usage:
|
Usage:
|
||||||
./pruned_transducer_stateless4/export.py \
|
./pruned_transducer_stateless4/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless4/exp \
|
--exp-dir ./pruned_transducer_stateless4/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
@ -48,7 +48,7 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
from scaling_converter import convert_scaled_to_non_scaled
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
@ -59,7 +59,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -116,10 +116,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -164,12 +164,14 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.unk_id = token_table["<unk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
if params.streaming_model:
|
if params.streaming_model:
|
||||||
assert params.causal_convolution
|
assert params.causal_convolution
|
||||||
|
|||||||
@ -28,7 +28,7 @@ popd
|
|||||||
2. Export the model to ONNX
|
2. Export the model to ONNX
|
||||||
|
|
||||||
./pruned_transducer_stateless5/export-onnx-streaming.py \
|
./pruned_transducer_stateless5/export-onnx-streaming.py \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
@ -58,13 +58,13 @@ import logging
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Tuple
|
from typing import Dict, Tuple
|
||||||
|
|
||||||
|
import k2
|
||||||
import onnx
|
import onnx
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from conformer import Conformer
|
from conformer import Conformer
|
||||||
from onnxruntime.quantization import QuantType, quantize_dynamic
|
|
||||||
from decoder import Decoder
|
from decoder import Decoder
|
||||||
|
from onnxruntime.quantization import QuantType, quantize_dynamic
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
from scaling_converter import convert_scaled_to_non_scaled
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
|
|
||||||
@ -74,7 +74,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import setup_logger, str2bool
|
from icefall.utils import num_tokens, setup_logger, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -131,10 +131,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -489,12 +489,14 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.unk_id = token_table["<unk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
@ -662,7 +664,7 @@ def main():
|
|||||||
quantize_dynamic(
|
quantize_dynamic(
|
||||||
model_input=decoder_filename,
|
model_input=decoder_filename,
|
||||||
model_output=decoder_filename_int8,
|
model_output=decoder_filename_int8,
|
||||||
op_types_to_quantize=["MatMul"],
|
op_types_to_quantize=["MatMul", "Gather"],
|
||||||
weight_type=QuantType.QInt8,
|
weight_type=QuantType.QInt8,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -28,7 +28,7 @@ popd
|
|||||||
2. Export the model to ONNX
|
2. Export the model to ONNX
|
||||||
|
|
||||||
./pruned_transducer_stateless5/export-onnx.py \
|
./pruned_transducer_stateless5/export-onnx.py \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
@ -55,13 +55,13 @@ import logging
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Tuple
|
from typing import Dict, Tuple
|
||||||
|
|
||||||
|
import k2
|
||||||
import onnx
|
import onnx
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from conformer import Conformer
|
from conformer import Conformer
|
||||||
from onnxruntime.quantization import QuantType, quantize_dynamic
|
|
||||||
from decoder import Decoder
|
from decoder import Decoder
|
||||||
|
from onnxruntime.quantization import QuantType, quantize_dynamic
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
from scaling_converter import convert_scaled_to_non_scaled
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
|
|
||||||
@ -71,7 +71,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import setup_logger, str2bool
|
from icefall.utils import num_tokens, setup_logger, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -128,10 +128,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -416,12 +416,14 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.unk_id = token_table["<unk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
@ -586,7 +588,7 @@ def main():
|
|||||||
quantize_dynamic(
|
quantize_dynamic(
|
||||||
model_input=decoder_filename,
|
model_input=decoder_filename,
|
||||||
model_output=decoder_filename_int8,
|
model_output=decoder_filename_int8,
|
||||||
op_types_to_quantize=["MatMul"],
|
op_types_to_quantize=["MatMul", "Gather"],
|
||||||
weight_type=QuantType.QInt8,
|
weight_type=QuantType.QInt8,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -22,7 +22,7 @@
|
|||||||
Usage:
|
Usage:
|
||||||
./pruned_transducer_stateless5/export.py \
|
./pruned_transducer_stateless5/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless5/exp \
|
--exp-dir ./pruned_transducer_stateless5/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
@ -48,7 +48,7 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
from scaling_converter import convert_scaled_to_non_scaled
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
@ -59,7 +59,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -116,10 +116,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -164,12 +164,14 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.unk_id = token_table["<unk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
if params.streaming_model:
|
if params.streaming_model:
|
||||||
assert params.causal_convolution
|
assert params.causal_convolution
|
||||||
|
|||||||
@ -20,7 +20,7 @@ Usage:
|
|||||||
(1) greedy search
|
(1) greedy search
|
||||||
./pruned_transducer_stateless5/pretrained.py \
|
./pruned_transducer_stateless5/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless5/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless5/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
/path/to/bar.wav
|
/path/to/bar.wav
|
||||||
@ -28,7 +28,7 @@ Usage:
|
|||||||
(2) beam search
|
(2) beam search
|
||||||
./pruned_transducer_stateless5/pretrained.py \
|
./pruned_transducer_stateless5/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless5/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless5/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method beam_search \
|
--method beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -37,7 +37,7 @@ Usage:
|
|||||||
(3) modified beam search
|
(3) modified beam search
|
||||||
./pruned_transducer_stateless5/pretrained.py \
|
./pruned_transducer_stateless5/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless5/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless5/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method modified_beam_search \
|
--method modified_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -46,7 +46,7 @@ Usage:
|
|||||||
(4) fast beam search
|
(4) fast beam search
|
||||||
./pruned_transducer_stateless5/pretrained.py \
|
./pruned_transducer_stateless5/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless5/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless5/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method fast_beam_search \
|
--method fast_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -66,7 +66,6 @@ from typing import List
|
|||||||
|
|
||||||
import k2
|
import k2
|
||||||
import kaldifeat
|
import kaldifeat
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
from beam_search import (
|
from beam_search import (
|
||||||
@ -79,6 +78,8 @@ from beam_search import (
|
|||||||
from torch.nn.utils.rnn import pad_sequence
|
from torch.nn.utils.rnn import pad_sequence
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
|
|
||||||
|
from icefall.utils import num_tokens
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
@ -95,9 +96,9 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
help="""Path to bpe.model.""",
|
help="""Path to tokens.txt.""",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -214,13 +215,14 @@ def main():
|
|||||||
|
|
||||||
params.update(vars(args))
|
params.update(vars(args))
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.unk_id = sp.piece_to_id("<unk>")
|
params.unk_id = token_table["<unk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(f"{params}")
|
logging.info(f"{params}")
|
||||||
|
|
||||||
@ -275,6 +277,12 @@ def main():
|
|||||||
msg += f" with beam size {params.beam_size}"
|
msg += f" with beam size {params.beam_size}"
|
||||||
logging.info(msg)
|
logging.info(msg)
|
||||||
|
|
||||||
|
def token_ids_to_words(token_ids: List[int]) -> str:
|
||||||
|
text = ""
|
||||||
|
for i in token_ids:
|
||||||
|
text += token_table[i]
|
||||||
|
return text.replace("▁", " ").strip()
|
||||||
|
|
||||||
if params.method == "fast_beam_search":
|
if params.method == "fast_beam_search":
|
||||||
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
||||||
hyp_tokens = fast_beam_search_one_best(
|
hyp_tokens = fast_beam_search_one_best(
|
||||||
@ -286,8 +294,8 @@ def main():
|
|||||||
max_contexts=params.max_contexts,
|
max_contexts=params.max_contexts,
|
||||||
max_states=params.max_states,
|
max_states=params.max_states,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "modified_beam_search":
|
elif params.method == "modified_beam_search":
|
||||||
hyp_tokens = modified_beam_search(
|
hyp_tokens = modified_beam_search(
|
||||||
model=model,
|
model=model,
|
||||||
@ -296,16 +304,16 @@ def main():
|
|||||||
beam=params.beam_size,
|
beam=params.beam_size,
|
||||||
)
|
)
|
||||||
|
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
||||||
hyp_tokens = greedy_search_batch(
|
hyp_tokens = greedy_search_batch(
|
||||||
model=model,
|
model=model,
|
||||||
encoder_out=encoder_out,
|
encoder_out=encoder_out,
|
||||||
encoder_out_lens=encoder_out_lens,
|
encoder_out_lens=encoder_out_lens,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
else:
|
else:
|
||||||
for i in range(num_waves):
|
for i in range(num_waves):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
@ -326,12 +334,11 @@ def main():
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported method: {params.method}")
|
raise ValueError(f"Unsupported method: {params.method}")
|
||||||
|
|
||||||
hyps.append(sp.decode(hyp).split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
|
|
||||||
s = "\n"
|
s = "\n"
|
||||||
for filename, hyp in zip(params.sound_files, hyps):
|
for filename, hyp in zip(params.sound_files, hyps):
|
||||||
words = " ".join(hyp)
|
s += f"{filename}:\n{hyp}\n\n"
|
||||||
s += f"{filename}:\n{words}\n\n"
|
|
||||||
logging.info(s)
|
logging.info(s)
|
||||||
|
|
||||||
logging.info("Decoding Done")
|
logging.info("Decoding Done")
|
||||||
|
|||||||
@ -22,7 +22,7 @@
|
|||||||
Usage:
|
Usage:
|
||||||
./pruned_transducer_stateless6/export.py \
|
./pruned_transducer_stateless6/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless6/exp \
|
--exp-dir ./pruned_transducer_stateless6/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
@ -47,12 +47,12 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
from train import get_params, get_transducer_model
|
from train import get_params, get_transducer_model
|
||||||
|
|
||||||
from icefall.checkpoint import average_checkpoints, find_checkpoints, load_checkpoint
|
from icefall.checkpoint import average_checkpoints, find_checkpoints, load_checkpoint
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -98,10 +98,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -135,12 +135,14 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.unk_id = token_table["<unk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
#
|
#
|
||||||
# Copyright 2023 Xiaomi Corporation (Author: Fangjun Kuang)
|
# Copyright 2023 Xiaomi Corporation (Author: Fangjun Kuang
|
||||||
|
# Zengrui Jin)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
This script exports a transducer model from PyTorch to ONNX.
|
This script exports a transducer model from PyTorch to ONNX.
|
||||||
@ -18,7 +19,6 @@ GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
|||||||
repo=$(basename $repo_url)
|
repo=$(basename $repo_url)
|
||||||
|
|
||||||
pushd $repo
|
pushd $repo
|
||||||
git lfs pull --include "data/lang_bpe_500/bpe.model"
|
|
||||||
git lfs pull --include "exp/pretrained-epoch-30-avg-9.pt"
|
git lfs pull --include "exp/pretrained-epoch-30-avg-9.pt"
|
||||||
|
|
||||||
cd exp
|
cd exp
|
||||||
@ -28,7 +28,7 @@ popd
|
|||||||
2. Export the model to ONNX
|
2. Export the model to ONNX
|
||||||
|
|
||||||
./pruned_transducer_stateless7/export-onnx.py \
|
./pruned_transducer_stateless7/export-onnx.py \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
@ -50,8 +50,8 @@ import logging
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Tuple
|
from typing import Dict, Tuple
|
||||||
|
|
||||||
|
import k2
|
||||||
import onnx
|
import onnx
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from decoder import Decoder
|
from decoder import Decoder
|
||||||
@ -66,7 +66,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import setup_logger, str2bool
|
from icefall.utils import num_tokens, setup_logger, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -123,10 +123,9 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
help="Path to the tokens.txt.",
|
||||||
help="Path to the BPE model",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -411,12 +410,12 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# Load id of the <blk> token and the vocab size
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
@ -581,7 +580,7 @@ def main():
|
|||||||
quantize_dynamic(
|
quantize_dynamic(
|
||||||
model_input=decoder_filename,
|
model_input=decoder_filename,
|
||||||
model_output=decoder_filename_int8,
|
model_output=decoder_filename_int8,
|
||||||
op_types_to_quantize=["MatMul"],
|
op_types_to_quantize=["MatMul", "Gather"],
|
||||||
weight_type=QuantType.QInt8,
|
weight_type=QuantType.QInt8,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
#
|
#
|
||||||
# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
|
# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang
|
||||||
|
# Zengrui Jin)
|
||||||
#
|
#
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
#
|
#
|
||||||
@ -26,7 +27,7 @@ Usage:
|
|||||||
|
|
||||||
./pruned_transducer_stateless7/export.py \
|
./pruned_transducer_stateless7/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless7/exp \
|
--exp-dir ./pruned_transducer_stateless7/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 30 \
|
--epoch 30 \
|
||||||
--avg 9 \
|
--avg 9 \
|
||||||
--jit 1
|
--jit 1
|
||||||
@ -45,7 +46,7 @@ for how to use the exported models outside of icefall.
|
|||||||
|
|
||||||
./pruned_transducer_stateless7/export.py \
|
./pruned_transducer_stateless7/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless7/exp \
|
--exp-dir ./pruned_transducer_stateless7/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
@ -65,7 +66,7 @@ you can do:
|
|||||||
--avg 1 \
|
--avg 1 \
|
||||||
--max-duration 600 \
|
--max-duration 600 \
|
||||||
--decoding-method greedy_search \
|
--decoding-method greedy_search \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
|
|
||||||
Check ./pretrained.py for its usage.
|
Check ./pretrained.py for its usage.
|
||||||
|
|
||||||
@ -86,7 +87,7 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
from scaling_converter import convert_scaled_to_non_scaled
|
||||||
@ -98,7 +99,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -155,10 +156,9 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
help="Path to the tokens.txt.",
|
||||||
help="Path to the BPE model",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -198,12 +198,12 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# Load id of the <blk> token and the vocab size
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
@ -292,7 +292,7 @@ def main():
|
|||||||
model.to("cpu")
|
model.to("cpu")
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
if params.jit is True:
|
if params.jit:
|
||||||
convert_scaled_to_non_scaled(model, inplace=True)
|
convert_scaled_to_non_scaled(model, inplace=True)
|
||||||
# We won't use the forward() method of the model in C++, so just ignore
|
# We won't use the forward() method of the model in C++, so just ignore
|
||||||
# it here.
|
# it here.
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
|
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang
|
||||||
|
# Zengrui Jin)
|
||||||
#
|
#
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
#
|
#
|
||||||
@ -20,7 +21,7 @@ You can generate the checkpoint with the following command:
|
|||||||
|
|
||||||
./pruned_transducer_stateless7/export.py \
|
./pruned_transducer_stateless7/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless7/exp \
|
--exp-dir ./pruned_transducer_stateless7/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
@ -29,7 +30,7 @@ Usage of this script:
|
|||||||
(1) greedy search
|
(1) greedy search
|
||||||
./pruned_transducer_stateless7/pretrained.py \
|
./pruned_transducer_stateless7/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
/path/to/bar.wav
|
/path/to/bar.wav
|
||||||
@ -37,7 +38,7 @@ Usage of this script:
|
|||||||
(2) beam search
|
(2) beam search
|
||||||
./pruned_transducer_stateless7/pretrained.py \
|
./pruned_transducer_stateless7/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method beam_search \
|
--method beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -46,7 +47,7 @@ Usage of this script:
|
|||||||
(3) modified beam search
|
(3) modified beam search
|
||||||
./pruned_transducer_stateless7/pretrained.py \
|
./pruned_transducer_stateless7/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method modified_beam_search \
|
--method modified_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -55,7 +56,7 @@ Usage of this script:
|
|||||||
(4) fast beam search
|
(4) fast beam search
|
||||||
./pruned_transducer_stateless7/pretrained.py \
|
./pruned_transducer_stateless7/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens ./data/lang_bpe_500/tokens.txt \
|
||||||
--method fast_beam_search \
|
--method fast_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -75,7 +76,6 @@ from typing import List
|
|||||||
|
|
||||||
import k2
|
import k2
|
||||||
import kaldifeat
|
import kaldifeat
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
from beam_search import (
|
from beam_search import (
|
||||||
@ -88,7 +88,7 @@ from beam_search import (
|
|||||||
from torch.nn.utils.rnn import pad_sequence
|
from torch.nn.utils.rnn import pad_sequence
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
|
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -106,9 +106,9 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
help="""Path to bpe.model.""",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -225,13 +225,13 @@ def main():
|
|||||||
|
|
||||||
params.update(vars(args))
|
params.update(vars(args))
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# Load the ids of the <blk> and <unk> tokens and the vocab size
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.unk_id = sp.piece_to_id("<unk>")
|
params.unk_id = token_table["<unk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(f"{params}")
|
logging.info(f"{params}")
|
||||||
|
|
||||||
@ -286,6 +286,12 @@ def main():
|
|||||||
msg += f" with beam size {params.beam_size}"
|
msg += f" with beam size {params.beam_size}"
|
||||||
logging.info(msg)
|
logging.info(msg)
|
||||||
|
|
||||||
|
def token_ids_to_words(token_ids: List[int]) -> str:
|
||||||
|
text = ""
|
||||||
|
for i in token_ids:
|
||||||
|
text += token_table[i]
|
||||||
|
return text.replace("▁", " ").strip()
|
||||||
|
|
||||||
if params.method == "fast_beam_search":
|
if params.method == "fast_beam_search":
|
||||||
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
||||||
hyp_tokens = fast_beam_search_one_best(
|
hyp_tokens = fast_beam_search_one_best(
|
||||||
@ -297,8 +303,8 @@ def main():
|
|||||||
max_contexts=params.max_contexts,
|
max_contexts=params.max_contexts,
|
||||||
max_states=params.max_states,
|
max_states=params.max_states,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "modified_beam_search":
|
elif params.method == "modified_beam_search":
|
||||||
hyp_tokens = modified_beam_search(
|
hyp_tokens = modified_beam_search(
|
||||||
model=model,
|
model=model,
|
||||||
@ -307,16 +313,16 @@ def main():
|
|||||||
beam=params.beam_size,
|
beam=params.beam_size,
|
||||||
)
|
)
|
||||||
|
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
||||||
hyp_tokens = greedy_search_batch(
|
hyp_tokens = greedy_search_batch(
|
||||||
model=model,
|
model=model,
|
||||||
encoder_out=encoder_out,
|
encoder_out=encoder_out,
|
||||||
encoder_out_lens=encoder_out_lens,
|
encoder_out_lens=encoder_out_lens,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
else:
|
else:
|
||||||
for i in range(num_waves):
|
for i in range(num_waves):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
@ -337,12 +343,11 @@ def main():
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported method: {params.method}")
|
raise ValueError(f"Unsupported method: {params.method}")
|
||||||
|
|
||||||
hyps.append(sp.decode(hyp).split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
|
|
||||||
s = "\n"
|
s = "\n"
|
||||||
for filename, hyp in zip(params.sound_files, hyps):
|
for filename, hyp in zip(params.sound_files, hyps):
|
||||||
words = " ".join(hyp)
|
s += f"{filename}:\n{hyp}\n\n"
|
||||||
s += f"{filename}:\n{words}\n\n"
|
|
||||||
logging.info(s)
|
logging.info(s)
|
||||||
|
|
||||||
logging.info("Decoding Done")
|
logging.info("Decoding Done")
|
||||||
|
|||||||
@ -26,7 +26,7 @@ Usage:
|
|||||||
|
|
||||||
./pruned_transducer_stateless7_ctc/export.py \
|
./pruned_transducer_stateless7_ctc/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless7_ctc/exp \
|
--exp-dir ./pruned_transducer_stateless7_ctc/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 30 \
|
--epoch 30 \
|
||||||
--avg 9 \
|
--avg 9 \
|
||||||
--jit 1
|
--jit 1
|
||||||
@ -45,7 +45,7 @@ for how to use the exported models outside of icefall.
|
|||||||
|
|
||||||
./pruned_transducer_stateless7_ctc/export.py \
|
./pruned_transducer_stateless7_ctc/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless7_ctc/exp \
|
--exp-dir ./pruned_transducer_stateless7_ctc/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
@ -86,7 +86,7 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
from scaling_converter import convert_scaled_to_non_scaled
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
@ -97,7 +97,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -154,10 +154,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -197,12 +197,14 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load the ids of the <blk> and <unk> tokens and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.unk_id = token_table["<unk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
|
|||||||
@ -20,7 +20,7 @@ You can generate the checkpoint with the following command:
|
|||||||
|
|
||||||
./pruned_transducer_stateless7_ctc/export.py \
|
./pruned_transducer_stateless7_ctc/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless7_ctc/exp \
|
--exp-dir ./pruned_transducer_stateless7_ctc/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
@ -29,7 +29,7 @@ Usage of this script:
|
|||||||
(1) greedy search
|
(1) greedy search
|
||||||
./pruned_transducer_stateless7_ctc/pretrained.py \
|
./pruned_transducer_stateless7_ctc/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7_ctc/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7_ctc/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
/path/to/bar.wav
|
/path/to/bar.wav
|
||||||
@ -37,7 +37,7 @@ Usage of this script:
|
|||||||
(2) beam search
|
(2) beam search
|
||||||
./pruned_transducer_stateless7_ctc/pretrained.py \
|
./pruned_transducer_stateless7_ctc/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7_ctc/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7_ctc/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method beam_search \
|
--method beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -46,7 +46,7 @@ Usage of this script:
|
|||||||
(3) modified beam search
|
(3) modified beam search
|
||||||
./pruned_transducer_stateless7_ctc/pretrained.py \
|
./pruned_transducer_stateless7_ctc/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7_ctc/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7_ctc/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method modified_beam_search \
|
--method modified_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -55,7 +55,7 @@ Usage of this script:
|
|||||||
(4) fast beam search
|
(4) fast beam search
|
||||||
./pruned_transducer_stateless7_ctc/pretrained.py \
|
./pruned_transducer_stateless7_ctc/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7_ctc/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7_ctc/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method fast_beam_search \
|
--method fast_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -75,7 +75,6 @@ from typing import List
|
|||||||
|
|
||||||
import k2
|
import k2
|
||||||
import kaldifeat
|
import kaldifeat
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
from beam_search import (
|
from beam_search import (
|
||||||
@ -88,6 +87,8 @@ from beam_search import (
|
|||||||
from torch.nn.utils.rnn import pad_sequence
|
from torch.nn.utils.rnn import pad_sequence
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
|
|
||||||
|
from icefall.utils import num_tokens
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
@ -104,9 +105,9 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
help="""Path to bpe.model.""",
|
help="""Path to tokens.txt.""",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -223,13 +224,14 @@ def main():
|
|||||||
|
|
||||||
params.update(vars(args))
|
params.update(vars(args))
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load the ids of the <blk> and <unk> tokens and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.unk_id = sp.piece_to_id("<unk>")
|
params.unk_id = token_table["<unk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(f"{params}")
|
logging.info(f"{params}")
|
||||||
|
|
||||||
@ -284,6 +286,12 @@ def main():
|
|||||||
msg += f" with beam size {params.beam_size}"
|
msg += f" with beam size {params.beam_size}"
|
||||||
logging.info(msg)
|
logging.info(msg)
|
||||||
|
|
||||||
|
def token_ids_to_words(token_ids: List[int]) -> str:
|
||||||
|
text = ""
|
||||||
|
for i in token_ids:
|
||||||
|
text += token_table[i]
|
||||||
|
return text.replace("▁", " ").strip()
|
||||||
|
|
||||||
if params.method == "fast_beam_search":
|
if params.method == "fast_beam_search":
|
||||||
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
||||||
hyp_tokens = fast_beam_search_one_best(
|
hyp_tokens = fast_beam_search_one_best(
|
||||||
@ -295,8 +303,8 @@ def main():
|
|||||||
max_contexts=params.max_contexts,
|
max_contexts=params.max_contexts,
|
||||||
max_states=params.max_states,
|
max_states=params.max_states,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "modified_beam_search":
|
elif params.method == "modified_beam_search":
|
||||||
hyp_tokens = modified_beam_search(
|
hyp_tokens = modified_beam_search(
|
||||||
model=model,
|
model=model,
|
||||||
@ -305,16 +313,16 @@ def main():
|
|||||||
beam=params.beam_size,
|
beam=params.beam_size,
|
||||||
)
|
)
|
||||||
|
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
||||||
hyp_tokens = greedy_search_batch(
|
hyp_tokens = greedy_search_batch(
|
||||||
model=model,
|
model=model,
|
||||||
encoder_out=encoder_out,
|
encoder_out=encoder_out,
|
||||||
encoder_out_lens=encoder_out_lens,
|
encoder_out_lens=encoder_out_lens,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
else:
|
else:
|
||||||
for i in range(num_waves):
|
for i in range(num_waves):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
@ -335,12 +343,11 @@ def main():
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported method: {params.method}")
|
raise ValueError(f"Unsupported method: {params.method}")
|
||||||
|
|
||||||
hyps.append(sp.decode(hyp).split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
|
|
||||||
s = "\n"
|
s = "\n"
|
||||||
for filename, hyp in zip(params.sound_files, hyps):
|
for filename, hyp in zip(params.sound_files, hyps):
|
||||||
words = " ".join(hyp)
|
s += f"{filename}:\n{hyp}\n\n"
|
||||||
s += f"{filename}:\n{words}\n\n"
|
|
||||||
logging.info(s)
|
logging.info(s)
|
||||||
|
|
||||||
logging.info("Decoding Done")
|
logging.info("Decoding Done")
|
||||||
|
|||||||
@ -22,14 +22,14 @@ You can use the following command to get the exported models:
|
|||||||
|
|
||||||
./pruned_transducer_stateless7_ctc/export.py \
|
./pruned_transducer_stateless7_ctc/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless7_ctc/exp \
|
--exp-dir ./pruned_transducer_stateless7_ctc/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
Usage of this script:
|
Usage of this script:
|
||||||
|
|
||||||
(1) ctc-decoding
|
(1) ctc-decoding
|
||||||
./pruned_transducer_stateless7_ctc/jit_pretrained_ctc.py \
|
./pruned_transducer_stateless7_ctc/pretrained_ctc.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7_ctc/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7_ctc/exp/pretrained.pt \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--bpe-model data/lang_bpe_500/bpe.model \
|
||||||
--method ctc-decoding \
|
--method ctc-decoding \
|
||||||
@ -38,7 +38,7 @@ Usage of this script:
|
|||||||
/path/to/bar.wav
|
/path/to/bar.wav
|
||||||
|
|
||||||
(2) 1best
|
(2) 1best
|
||||||
./pruned_transducer_stateless7_ctc/jit_pretrained_ctc.py \
|
./pruned_transducer_stateless7_ctc/pretrained_ctc.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7_ctc/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7_ctc/exp/pretrained.pt \
|
||||||
--HLG data/lang_bpe_500/HLG.pt \
|
--HLG data/lang_bpe_500/HLG.pt \
|
||||||
--words-file data/lang_bpe_500/words.txt \
|
--words-file data/lang_bpe_500/words.txt \
|
||||||
@ -48,7 +48,7 @@ Usage of this script:
|
|||||||
/path/to/bar.wav
|
/path/to/bar.wav
|
||||||
|
|
||||||
(3) nbest-rescoring
|
(3) nbest-rescoring
|
||||||
./pruned_transducer_stateless7_ctc/jit_pretrained_ctc.py \
|
./pruned_transducer_stateless7_ctc/pretrained_ctc.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7_ctc/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7_ctc/exp/pretrained.pt \
|
||||||
--HLG data/lang_bpe_500/HLG.pt \
|
--HLG data/lang_bpe_500/HLG.pt \
|
||||||
--words-file data/lang_bpe_500/words.txt \
|
--words-file data/lang_bpe_500/words.txt \
|
||||||
@ -60,7 +60,7 @@ Usage of this script:
|
|||||||
|
|
||||||
|
|
||||||
(4) whole-lattice-rescoring
|
(4) whole-lattice-rescoring
|
||||||
./pruned_transducer_stateless7_ctc/jit_pretrained_ctc.py \
|
./pruned_transducer_stateless7_ctc/pretrained_ctc.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7_ctc/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7_ctc/exp/pretrained.pt \
|
||||||
--HLG data/lang_bpe_500/HLG.pt \
|
--HLG data/lang_bpe_500/HLG.pt \
|
||||||
--words-file data/lang_bpe_500/words.txt \
|
--words-file data/lang_bpe_500/words.txt \
|
||||||
|
|||||||
@ -26,7 +26,7 @@ Usage:
|
|||||||
|
|
||||||
./pruned_transducer_stateless7_ctc_bs/export.py \
|
./pruned_transducer_stateless7_ctc_bs/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
|
--exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 30 \
|
--epoch 30 \
|
||||||
--avg 13 \
|
--avg 13 \
|
||||||
--jit 1
|
--jit 1
|
||||||
@ -45,7 +45,7 @@ for how to use the exported models outside of icefall.
|
|||||||
|
|
||||||
./pruned_transducer_stateless7_ctc_bs/export.py \
|
./pruned_transducer_stateless7_ctc_bs/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
|
--exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 30 \
|
--epoch 30 \
|
||||||
--avg 13
|
--avg 13
|
||||||
|
|
||||||
@ -86,7 +86,7 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
from scaling_converter import convert_scaled_to_non_scaled
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
@ -97,7 +97,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -154,10 +154,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -197,12 +197,14 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load the ids of the <blk> and <unk> tokens and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.unk_id = token_table["<unk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
|
|||||||
@ -28,7 +28,7 @@ Usage:
|
|||||||
|
|
||||||
./pruned_transducer_stateless7_ctc_bs/export_onnx.py \
|
./pruned_transducer_stateless7_ctc_bs/export_onnx.py \
|
||||||
--exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
|
--exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 30 \
|
--epoch 30 \
|
||||||
--avg 13 \
|
--avg 13 \
|
||||||
--onnx 1
|
--onnx 1
|
||||||
@ -48,7 +48,7 @@ Check `onnx_check.py` for how to use them.
|
|||||||
(2) Export to ONNX format which can be used in Triton Server
|
(2) Export to ONNX format which can be used in Triton Server
|
||||||
./pruned_transducer_stateless7_ctc_bs/export_onnx.py \
|
./pruned_transducer_stateless7_ctc_bs/export_onnx.py \
|
||||||
--exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
|
--exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 30 \
|
--epoch 30 \
|
||||||
--avg 13 \
|
--avg 13 \
|
||||||
--onnx-triton 1
|
--onnx-triton 1
|
||||||
@ -86,9 +86,10 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
|
from onnx_wrapper import TritonOnnxDecoder, TritonOnnxJoiner, TritonOnnxLconv
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
from scaling_converter import convert_scaled_to_non_scaled
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
|
|
||||||
@ -98,8 +99,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
from onnx_wrapper import TritonOnnxDecoder, TritonOnnxJoiner, TritonOnnxLconv
|
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -156,10 +156,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -728,12 +728,14 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load the ids of the <blk> and <unk> tokens and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.unk_id = token_table["<unk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
|
|||||||
@ -20,7 +20,7 @@ You can generate the checkpoint with the following command:
|
|||||||
|
|
||||||
./pruned_transducer_stateless7_ctc_bs/export.py \
|
./pruned_transducer_stateless7_ctc_bs/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
|
--exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 30 \
|
--epoch 30 \
|
||||||
--avg 13
|
--avg 13
|
||||||
|
|
||||||
@ -29,7 +29,7 @@ Usage of this script:
|
|||||||
(1) greedy search
|
(1) greedy search
|
||||||
./pruned_transducer_stateless7_ctc_bs/pretrained.py \
|
./pruned_transducer_stateless7_ctc_bs/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
/path/to/bar.wav
|
/path/to/bar.wav
|
||||||
@ -37,7 +37,7 @@ Usage of this script:
|
|||||||
(2) beam search
|
(2) beam search
|
||||||
./pruned_transducer_stateless7_ctc_bs/pretrained.py \
|
./pruned_transducer_stateless7_ctc_bs/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method beam_search \
|
--method beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -46,7 +46,7 @@ Usage of this script:
|
|||||||
(3) modified beam search
|
(3) modified beam search
|
||||||
./pruned_transducer_stateless7_ctc_bs/pretrained.py \
|
./pruned_transducer_stateless7_ctc_bs/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method modified_beam_search \
|
--method modified_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -55,7 +55,7 @@ Usage of this script:
|
|||||||
(4) fast beam search
|
(4) fast beam search
|
||||||
./pruned_transducer_stateless7_ctc_bs/pretrained.py \
|
./pruned_transducer_stateless7_ctc_bs/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method fast_beam_search \
|
--method fast_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -75,7 +75,6 @@ from typing import List
|
|||||||
|
|
||||||
import k2
|
import k2
|
||||||
import kaldifeat
|
import kaldifeat
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
from beam_search import (
|
from beam_search import (
|
||||||
@ -88,6 +87,8 @@ from beam_search import (
|
|||||||
from torch.nn.utils.rnn import pad_sequence
|
from torch.nn.utils.rnn import pad_sequence
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
|
|
||||||
|
from icefall.utils import num_tokens
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
@ -104,9 +105,9 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
help="""Path to bpe.model.""",
|
help="""Path to tokens.txt.""",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -223,13 +224,14 @@ def main():
|
|||||||
|
|
||||||
params.update(vars(args))
|
params.update(vars(args))
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.unk_id = sp.piece_to_id("<unk>")
|
params.unk_id = token_table["<unk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(f"{params}")
|
logging.info(f"{params}")
|
||||||
|
|
||||||
@ -284,6 +286,12 @@ def main():
|
|||||||
msg += f" with beam size {params.beam_size}"
|
msg += f" with beam size {params.beam_size}"
|
||||||
logging.info(msg)
|
logging.info(msg)
|
||||||
|
|
||||||
|
def token_ids_to_words(token_ids: List[int]) -> str:
|
||||||
|
text = ""
|
||||||
|
for i in token_ids:
|
||||||
|
text += token_table[i]
|
||||||
|
return text.replace("▁", " ").strip()
|
||||||
|
|
||||||
if params.method == "fast_beam_search":
|
if params.method == "fast_beam_search":
|
||||||
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
||||||
hyp_tokens = fast_beam_search_one_best(
|
hyp_tokens = fast_beam_search_one_best(
|
||||||
@ -295,8 +303,8 @@ def main():
|
|||||||
max_contexts=params.max_contexts,
|
max_contexts=params.max_contexts,
|
||||||
max_states=params.max_states,
|
max_states=params.max_states,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "modified_beam_search":
|
elif params.method == "modified_beam_search":
|
||||||
hyp_tokens = modified_beam_search(
|
hyp_tokens = modified_beam_search(
|
||||||
model=model,
|
model=model,
|
||||||
@ -305,16 +313,16 @@ def main():
|
|||||||
beam=params.beam_size,
|
beam=params.beam_size,
|
||||||
)
|
)
|
||||||
|
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
||||||
hyp_tokens = greedy_search_batch(
|
hyp_tokens = greedy_search_batch(
|
||||||
model=model,
|
model=model,
|
||||||
encoder_out=encoder_out,
|
encoder_out=encoder_out,
|
||||||
encoder_out_lens=encoder_out_lens,
|
encoder_out_lens=encoder_out_lens,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
else:
|
else:
|
||||||
for i in range(num_waves):
|
for i in range(num_waves):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
@ -335,12 +343,11 @@ def main():
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported method: {params.method}")
|
raise ValueError(f"Unsupported method: {params.method}")
|
||||||
|
|
||||||
hyps.append(sp.decode(hyp).split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
|
|
||||||
s = "\n"
|
s = "\n"
|
||||||
for filename, hyp in zip(params.sound_files, hyps):
|
for filename, hyp in zip(params.sound_files, hyps):
|
||||||
words = " ".join(hyp)
|
s += f"{filename}:\n{hyp}\n\n"
|
||||||
s += f"{filename}:\n{words}\n\n"
|
|
||||||
logging.info(s)
|
logging.info(s)
|
||||||
|
|
||||||
logging.info("Decoding Done")
|
logging.info("Decoding Done")
|
||||||
|
|||||||
@ -22,14 +22,14 @@ You can use the following command to get the exported models:
|
|||||||
|
|
||||||
./pruned_transducer_stateless7_ctc_bs/export.py \
|
./pruned_transducer_stateless7_ctc_bs/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
|
--exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
Usage of this script:
|
Usage of this script:
|
||||||
|
|
||||||
(1) ctc-decoding
|
(1) ctc-decoding
|
||||||
./pruned_transducer_stateless7_ctc_bs/jit_pretrained_ctc.py \
|
./pruned_transducer_stateless7_ctc_bs/pretrained_ctc.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--bpe-model data/lang_bpe_500/bpe.model \
|
||||||
--method ctc-decoding \
|
--method ctc-decoding \
|
||||||
@ -38,7 +38,7 @@ Usage of this script:
|
|||||||
/path/to/bar.wav
|
/path/to/bar.wav
|
||||||
|
|
||||||
(2) 1best
|
(2) 1best
|
||||||
./pruned_transducer_stateless7_ctc_bs/jit_pretrained_ctc.py \
|
./pruned_transducer_stateless7_ctc_bs/pretrained_ctc.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt \
|
||||||
--HLG data/lang_bpe_500/HLG.pt \
|
--HLG data/lang_bpe_500/HLG.pt \
|
||||||
--words-file data/lang_bpe_500/words.txt \
|
--words-file data/lang_bpe_500/words.txt \
|
||||||
@ -48,7 +48,7 @@ Usage of this script:
|
|||||||
/path/to/bar.wav
|
/path/to/bar.wav
|
||||||
|
|
||||||
(3) nbest-rescoring
|
(3) nbest-rescoring
|
||||||
./bruned_transducer_stateless7_ctc/jit_pretrained_ctc.py \
|
./pruned_transducer_stateless7_ctc_bs/pretrained_ctc.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt \
|
||||||
--HLG data/lang_bpe_500/HLG.pt \
|
--HLG data/lang_bpe_500/HLG.pt \
|
||||||
--words-file data/lang_bpe_500/words.txt \
|
--words-file data/lang_bpe_500/words.txt \
|
||||||
@ -60,7 +60,7 @@ Usage of this script:
|
|||||||
|
|
||||||
|
|
||||||
(4) whole-lattice-rescoring
|
(4) whole-lattice-rescoring
|
||||||
./pruned_transducer_stateless7_ctc_bs/jit_pretrained_ctc.py \
|
./pruned_transducer_stateless7_ctc_bs/pretrained_ctc.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt \
|
||||||
--HLG data/lang_bpe_500/HLG.pt \
|
--HLG data/lang_bpe_500/HLG.pt \
|
||||||
--words-file data/lang_bpe_500/words.txt \
|
--words-file data/lang_bpe_500/words.txt \
|
||||||
|
|||||||
@ -66,6 +66,7 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
import k2
|
||||||
import torch
|
import torch
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
from scaling_converter import convert_scaled_to_non_scaled
|
||||||
from train2 import add_model_arguments, get_params, get_transducer_model
|
from train2 import add_model_arguments, get_params, get_transducer_model
|
||||||
@ -76,8 +77,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.lexicon import Lexicon
|
from icefall.utils import num_tokens, setup_logger, str2bool
|
||||||
from icefall.utils import setup_logger, str2bool
|
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -123,10 +123,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--lang-dir",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_char",
|
default="data/lang_char/tokens.txt",
|
||||||
help="The lang dir",
|
help="The tokens.txt file",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -246,9 +246,14 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
lexicon = Lexicon(params.lang_dir)
|
# Load tokens.txt here
|
||||||
params.blank_id = 0
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
params.vocab_size = max(lexicon.tokens) + 1
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
|
params.blank_id = token_table["<blk>"]
|
||||||
|
params.unk_id = token_table["<unk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
|
|||||||
@ -28,7 +28,7 @@ popd
|
|||||||
2. Export to ncnn
|
2. Export to ncnn
|
||||||
|
|
||||||
./pruned_transducer_stateless7_streaming/export-for-ncnn.py \
|
./pruned_transducer_stateless7_streaming/export-for-ncnn.py \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--exp-dir $repo/exp \
|
--exp-dir $repo/exp \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
@ -64,7 +64,7 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
from scaling_converter import convert_scaled_to_non_scaled
|
||||||
from train2 import add_model_arguments, get_params, get_transducer_model
|
from train2 import add_model_arguments, get_params, get_transducer_model
|
||||||
@ -75,7 +75,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import setup_logger, str2bool
|
from icefall.utils import num_tokens, setup_logger, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -121,10 +121,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -244,12 +244,14 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.unk_id = token_table["<unk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
|
|||||||
@ -29,7 +29,7 @@ popd
|
|||||||
2. Export the model to ONNX
|
2. Export the model to ONNX
|
||||||
|
|
||||||
./pruned_transducer_stateless7_streaming/export-onnx-zh.py \
|
./pruned_transducer_stateless7_streaming/export-onnx-zh.py \
|
||||||
--lang-dir $repo/data/lang_char_bpe \
|
--tokens $repo/data/lang_char_bpe/tokens.txt \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
@ -60,6 +60,7 @@ import logging
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, List, Tuple
|
from typing import Dict, List, Tuple
|
||||||
|
|
||||||
|
import k2
|
||||||
import onnx
|
import onnx
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
@ -76,8 +77,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.lexicon import Lexicon
|
from icefall.utils import num_tokens, setup_logger, str2bool
|
||||||
from icefall.utils import setup_logger, str2bool
|
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -134,10 +134,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--lang-dir",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_char",
|
default="data/lang_char/tokens.txt",
|
||||||
help="The lang dir",
|
help="The tokens.txt file",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -493,9 +493,14 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
lexicon = Lexicon(params.lang_dir)
|
# Load tokens.txt here
|
||||||
params.blank_id = 0
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
params.vocab_size = max(lexicon.tokens) + 1
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
|
params.blank_id = token_table["<blk>"]
|
||||||
|
params.unk_id = token_table["<unk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
@ -661,7 +666,7 @@ def main():
|
|||||||
quantize_dynamic(
|
quantize_dynamic(
|
||||||
model_input=decoder_filename,
|
model_input=decoder_filename,
|
||||||
model_output=decoder_filename_int8,
|
model_output=decoder_filename_int8,
|
||||||
op_types_to_quantize=["MatMul"],
|
op_types_to_quantize=["MatMul", "Gather"],
|
||||||
weight_type=QuantType.QInt8,
|
weight_type=QuantType.QInt8,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -27,7 +27,7 @@ popd
|
|||||||
2. Export the model to ONNX
|
2. Export the model to ONNX
|
||||||
|
|
||||||
./pruned_transducer_stateless7_streaming/export-onnx.py \
|
./pruned_transducer_stateless7_streaming/export-onnx.py \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
--avg 1 \
|
--avg 1 \
|
||||||
@ -48,8 +48,8 @@ import logging
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, List, Tuple
|
from typing import Dict, List, Tuple
|
||||||
|
|
||||||
|
import k2
|
||||||
import onnx
|
import onnx
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from decoder import Decoder
|
from decoder import Decoder
|
||||||
@ -65,7 +65,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import setup_logger, str2bool
|
from icefall.utils import num_tokens, setup_logger, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -122,10 +122,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -481,12 +481,14 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.unk_id = token_table["<unk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
@ -652,7 +654,7 @@ def main():
|
|||||||
quantize_dynamic(
|
quantize_dynamic(
|
||||||
model_input=decoder_filename,
|
model_input=decoder_filename,
|
||||||
model_output=decoder_filename_int8,
|
model_output=decoder_filename_int8,
|
||||||
op_types_to_quantize=["MatMul"],
|
op_types_to_quantize=["MatMul", "Gather"],
|
||||||
weight_type=QuantType.QInt8,
|
weight_type=QuantType.QInt8,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -139,8 +139,8 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
import k2
|
||||||
import onnxruntime
|
import onnxruntime
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from onnx_model_wrapper import OnnxStreamingEncoder, TritonOnnxDecoder, TritonOnnxJoiner
|
from onnx_model_wrapper import OnnxStreamingEncoder, TritonOnnxDecoder, TritonOnnxJoiner
|
||||||
@ -154,7 +154,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -211,10 +211,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -675,12 +675,14 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.unk_id = token_table["<unk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
|
|||||||
@ -20,7 +20,7 @@ You can generate the checkpoint with the following command:
|
|||||||
|
|
||||||
./pruned_transducer_stateless7_streaming/export.py \
|
./pruned_transducer_stateless7_streaming/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless7_streaming/exp \
|
--exp-dir ./pruned_transducer_stateless7_streaming/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
@ -29,7 +29,7 @@ Usage of this script:
|
|||||||
(1) greedy search
|
(1) greedy search
|
||||||
./pruned_transducer_stateless7_streaming/pretrained.py \
|
./pruned_transducer_stateless7_streaming/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7_streaming/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7_streaming/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
/path/to/bar.wav
|
/path/to/bar.wav
|
||||||
@ -37,7 +37,7 @@ Usage of this script:
|
|||||||
(2) beam search
|
(2) beam search
|
||||||
./pruned_transducer_stateless7_streaming/pretrained.py \
|
./pruned_transducer_stateless7_streaming/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7_streaming/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7_streaming/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method beam_search \
|
--method beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -46,7 +46,7 @@ Usage of this script:
|
|||||||
(3) modified beam search
|
(3) modified beam search
|
||||||
./pruned_transducer_stateless7_streaming/pretrained.py \
|
./pruned_transducer_stateless7_streaming/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7_streaming/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7_streaming/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method modified_beam_search \
|
--method modified_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -55,7 +55,7 @@ Usage of this script:
|
|||||||
(4) fast beam search
|
(4) fast beam search
|
||||||
./pruned_transducer_stateless7_streaming/pretrained.py \
|
./pruned_transducer_stateless7_streaming/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless7_streaming/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless7_streaming/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method fast_beam_search \
|
--method fast_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -75,7 +75,6 @@ from typing import List
|
|||||||
|
|
||||||
import k2
|
import k2
|
||||||
import kaldifeat
|
import kaldifeat
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
from beam_search import (
|
from beam_search import (
|
||||||
@ -88,7 +87,7 @@ from beam_search import (
|
|||||||
from torch.nn.utils.rnn import pad_sequence
|
from torch.nn.utils.rnn import pad_sequence
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
|
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -106,9 +105,9 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
help="""Path to bpe.model.""",
|
help="""Path to tokens.txt.""",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -225,13 +224,14 @@ def main():
|
|||||||
|
|
||||||
params.update(vars(args))
|
params.update(vars(args))
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.unk_id = sp.piece_to_id("<unk>")
|
params.unk_id = token_table["<unk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(f"{params}")
|
logging.info(f"{params}")
|
||||||
|
|
||||||
@ -286,6 +286,12 @@ def main():
|
|||||||
msg += f" with beam size {params.beam_size}"
|
msg += f" with beam size {params.beam_size}"
|
||||||
logging.info(msg)
|
logging.info(msg)
|
||||||
|
|
||||||
|
def token_ids_to_words(token_ids: List[int]) -> str:
|
||||||
|
text = ""
|
||||||
|
for i in token_ids:
|
||||||
|
text += token_table[i]
|
||||||
|
return text.replace("▁", " ").strip()
|
||||||
|
|
||||||
if params.method == "fast_beam_search":
|
if params.method == "fast_beam_search":
|
||||||
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
||||||
hyp_tokens = fast_beam_search_one_best(
|
hyp_tokens = fast_beam_search_one_best(
|
||||||
@ -297,8 +303,8 @@ def main():
|
|||||||
max_contexts=params.max_contexts,
|
max_contexts=params.max_contexts,
|
||||||
max_states=params.max_states,
|
max_states=params.max_states,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "modified_beam_search":
|
elif params.method == "modified_beam_search":
|
||||||
hyp_tokens = modified_beam_search(
|
hyp_tokens = modified_beam_search(
|
||||||
model=model,
|
model=model,
|
||||||
@ -307,16 +313,16 @@ def main():
|
|||||||
beam=params.beam_size,
|
beam=params.beam_size,
|
||||||
)
|
)
|
||||||
|
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
||||||
hyp_tokens = greedy_search_batch(
|
hyp_tokens = greedy_search_batch(
|
||||||
model=model,
|
model=model,
|
||||||
encoder_out=encoder_out,
|
encoder_out=encoder_out,
|
||||||
encoder_out_lens=encoder_out_lens,
|
encoder_out_lens=encoder_out_lens,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
else:
|
else:
|
||||||
for i in range(num_waves):
|
for i in range(num_waves):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
@ -337,12 +343,11 @@ def main():
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported method: {params.method}")
|
raise ValueError(f"Unsupported method: {params.method}")
|
||||||
|
|
||||||
hyps.append(sp.decode(hyp).split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
|
|
||||||
s = "\n"
|
s = "\n"
|
||||||
for filename, hyp in zip(params.sound_files, hyps):
|
for filename, hyp in zip(params.sound_files, hyps):
|
||||||
words = " ".join(hyp)
|
s += f"{filename}:\n{hyp}\n\n"
|
||||||
s += f"{filename}:\n{words}\n\n"
|
|
||||||
logging.info(s)
|
logging.info(s)
|
||||||
|
|
||||||
logging.info("Decoding Done")
|
logging.info("Decoding Done")
|
||||||
|
|||||||
@ -28,7 +28,7 @@ popd
|
|||||||
2. Export to ncnn
|
2. Export to ncnn
|
||||||
|
|
||||||
./pruned_transducer_stateless7_streaming/export-for-ncnn.py \
|
./pruned_transducer_stateless7_streaming/export-for-ncnn.py \
|
||||||
--bpe-model $repo/data/lang_bpe_500/bpe.model \
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
--exp-dir $repo/exp \
|
--exp-dir $repo/exp \
|
||||||
--use-averaged-model 0 \
|
--use-averaged-model 0 \
|
||||||
--epoch 99 \
|
--epoch 99 \
|
||||||
@ -64,7 +64,7 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
from scaling_converter import convert_scaled_to_non_scaled
|
||||||
from train2 import add_model_arguments, get_params, get_transducer_model
|
from train2 import add_model_arguments, get_params, get_transducer_model
|
||||||
@ -75,7 +75,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import setup_logger, str2bool
|
from icefall.utils import num_tokens, setup_logger, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -121,10 +121,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -244,12 +244,14 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.unk_id = token_table["<unk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
|
|||||||
@ -26,7 +26,7 @@ Usage:
|
|||||||
|
|
||||||
./pruned_transducer_stateless8/export.py \
|
./pruned_transducer_stateless8/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless8/exp \
|
--exp-dir ./pruned_transducer_stateless8/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 30 \
|
--epoch 30 \
|
||||||
--avg 9 \
|
--avg 9 \
|
||||||
--jit 1
|
--jit 1
|
||||||
@ -45,7 +45,7 @@ for how to use the exported models outside of icefall.
|
|||||||
|
|
||||||
./pruned_transducer_stateless8/export.py \
|
./pruned_transducer_stateless8/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless8/exp \
|
--exp-dir ./pruned_transducer_stateless8/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
@ -86,7 +86,7 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from scaling_converter import convert_scaled_to_non_scaled
|
from scaling_converter import convert_scaled_to_non_scaled
|
||||||
@ -98,7 +98,7 @@ from icefall.checkpoint import (
|
|||||||
find_checkpoints,
|
find_checkpoints,
|
||||||
load_checkpoint,
|
load_checkpoint,
|
||||||
)
|
)
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -155,10 +155,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -198,12 +198,14 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.unk_id = token_table["<unk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
|
|||||||
@ -20,7 +20,7 @@ You can generate the checkpoint with the following command:
|
|||||||
|
|
||||||
./pruned_transducer_stateless8/export.py \
|
./pruned_transducer_stateless8/export.py \
|
||||||
--exp-dir ./pruned_transducer_stateless8/exp \
|
--exp-dir ./pruned_transducer_stateless8/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
@ -29,7 +29,7 @@ Usage of this script:
|
|||||||
(1) greedy search
|
(1) greedy search
|
||||||
./pruned_transducer_stateless8/pretrained.py \
|
./pruned_transducer_stateless8/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless8/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless8/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
/path/to/bar.wav
|
/path/to/bar.wav
|
||||||
@ -37,7 +37,7 @@ Usage of this script:
|
|||||||
(2) beam search
|
(2) beam search
|
||||||
./pruned_transducer_stateless8/pretrained.py \
|
./pruned_transducer_stateless8/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless8/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless8/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method beam_search \
|
--method beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -46,7 +46,7 @@ Usage of this script:
|
|||||||
(3) modified beam search
|
(3) modified beam search
|
||||||
./pruned_transducer_stateless8/pretrained.py \
|
./pruned_transducer_stateless8/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless8/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless8/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method modified_beam_search \
|
--method modified_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -55,7 +55,7 @@ Usage of this script:
|
|||||||
(4) fast beam search
|
(4) fast beam search
|
||||||
./pruned_transducer_stateless8/pretrained.py \
|
./pruned_transducer_stateless8/pretrained.py \
|
||||||
--checkpoint ./pruned_transducer_stateless8/exp/pretrained.pt \
|
--checkpoint ./pruned_transducer_stateless8/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method fast_beam_search \
|
--method fast_beam_search \
|
||||||
--beam-size 4 \
|
--beam-size 4 \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
@ -75,7 +75,6 @@ from typing import List
|
|||||||
|
|
||||||
import k2
|
import k2
|
||||||
import kaldifeat
|
import kaldifeat
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
from beam_search import (
|
from beam_search import (
|
||||||
@ -88,7 +87,7 @@ from beam_search import (
|
|||||||
from torch.nn.utils.rnn import pad_sequence
|
from torch.nn.utils.rnn import pad_sequence
|
||||||
from train import add_model_arguments, get_params, get_transducer_model
|
from train import add_model_arguments, get_params, get_transducer_model
|
||||||
|
|
||||||
from icefall.utils import str2bool
|
from icefall.utils import num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -106,9 +105,9 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
help="""Path to bpe.model.""",
|
help="""Path to tokens.txt.""",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -225,13 +224,14 @@ def main():
|
|||||||
|
|
||||||
params.update(vars(args))
|
params.update(vars(args))
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.unk_id = sp.piece_to_id("<unk>")
|
params.unk_id = token_table["<unk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(f"{params}")
|
logging.info(f"{params}")
|
||||||
|
|
||||||
@ -286,6 +286,12 @@ def main():
|
|||||||
msg += f" with beam size {params.beam_size}"
|
msg += f" with beam size {params.beam_size}"
|
||||||
logging.info(msg)
|
logging.info(msg)
|
||||||
|
|
||||||
|
def token_ids_to_words(token_ids: List[int]) -> str:
|
||||||
|
text = ""
|
||||||
|
for i in token_ids:
|
||||||
|
text += token_table[i]
|
||||||
|
return text.replace("▁", " ").strip()
|
||||||
|
|
||||||
if params.method == "fast_beam_search":
|
if params.method == "fast_beam_search":
|
||||||
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
|
||||||
hyp_tokens = fast_beam_search_one_best(
|
hyp_tokens = fast_beam_search_one_best(
|
||||||
@ -297,8 +303,8 @@ def main():
|
|||||||
max_contexts=params.max_contexts,
|
max_contexts=params.max_contexts,
|
||||||
max_states=params.max_states,
|
max_states=params.max_states,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "modified_beam_search":
|
elif params.method == "modified_beam_search":
|
||||||
hyp_tokens = modified_beam_search(
|
hyp_tokens = modified_beam_search(
|
||||||
model=model,
|
model=model,
|
||||||
@ -307,16 +313,16 @@ def main():
|
|||||||
beam=params.beam_size,
|
beam=params.beam_size,
|
||||||
)
|
)
|
||||||
|
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
|
||||||
hyp_tokens = greedy_search_batch(
|
hyp_tokens = greedy_search_batch(
|
||||||
model=model,
|
model=model,
|
||||||
encoder_out=encoder_out,
|
encoder_out=encoder_out,
|
||||||
encoder_out_lens=encoder_out_lens,
|
encoder_out_lens=encoder_out_lens,
|
||||||
)
|
)
|
||||||
for hyp in sp.decode(hyp_tokens):
|
for hyp in hyp_tokens:
|
||||||
hyps.append(hyp.split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
else:
|
else:
|
||||||
for i in range(num_waves):
|
for i in range(num_waves):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
@ -337,12 +343,11 @@ def main():
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported method: {params.method}")
|
raise ValueError(f"Unsupported method: {params.method}")
|
||||||
|
|
||||||
hyps.append(sp.decode(hyp).split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
|
|
||||||
s = "\n"
|
s = "\n"
|
||||||
for filename, hyp in zip(params.sound_files, hyps):
|
for filename, hyp in zip(params.sound_files, hyps):
|
||||||
words = " ".join(hyp)
|
s += f"{filename}:\n{hyp}\n\n"
|
||||||
s += f"{filename}:\n{words}\n\n"
|
|
||||||
logging.info(s)
|
logging.info(s)
|
||||||
|
|
||||||
logging.info("Decoding Done")
|
logging.info("Decoding Done")
|
||||||
|
|||||||
@ -22,7 +22,7 @@
|
|||||||
Usage:
|
Usage:
|
||||||
./transducer/export.py \
|
./transducer/export.py \
|
||||||
--exp-dir ./transducer/exp \
|
--exp-dir ./transducer/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 34 \
|
--epoch 34 \
|
||||||
--avg 11
|
--avg 11
|
||||||
|
|
||||||
@ -46,7 +46,7 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
from conformer import Conformer
|
from conformer import Conformer
|
||||||
from decoder import Decoder
|
from decoder import Decoder
|
||||||
@ -55,7 +55,7 @@ from model import Transducer
|
|||||||
|
|
||||||
from icefall.checkpoint import average_checkpoints, load_checkpoint
|
from icefall.checkpoint import average_checkpoints, load_checkpoint
|
||||||
from icefall.env import get_env_info
|
from icefall.env import get_env_info
|
||||||
from icefall.utils import AttributeDict, str2bool
|
from icefall.utils import AttributeDict, num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -90,10 +90,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -191,12 +191,14 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.unk_id = token_table["<unk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
|
|||||||
@ -19,7 +19,7 @@ Usage:
|
|||||||
|
|
||||||
./transducer/pretrained.py \
|
./transducer/pretrained.py \
|
||||||
--checkpoint ./transducer/exp/pretrained.pt \
|
--checkpoint ./transducer/exp/pretrained.pt \
|
||||||
--bpe-model ./data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--method greedy_search \
|
--method greedy_search \
|
||||||
/path/to/foo.wav \
|
/path/to/foo.wav \
|
||||||
/path/to/bar.wav \
|
/path/to/bar.wav \
|
||||||
@ -36,8 +36,8 @@ import logging
|
|||||||
import math
|
import math
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
import k2
|
||||||
import kaldifeat
|
import kaldifeat
|
||||||
import sentencepiece as spm
|
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
from beam_search import beam_search, greedy_search
|
from beam_search import beam_search, greedy_search
|
||||||
@ -48,7 +48,7 @@ from model import Transducer
|
|||||||
from torch.nn.utils.rnn import pad_sequence
|
from torch.nn.utils.rnn import pad_sequence
|
||||||
|
|
||||||
from icefall.env import get_env_info
|
from icefall.env import get_env_info
|
||||||
from icefall.utils import AttributeDict
|
from icefall.utils import AttributeDict, num_tokens
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -66,11 +66,9 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
help="""Path to bpe.model.
|
help="Path to tokens.txt.",
|
||||||
Used only when method is ctc-decoding.
|
|
||||||
""",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -204,12 +202,14 @@ def main():
|
|||||||
|
|
||||||
params.update(vars(args))
|
params.update(vars(args))
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.unk_id = token_table["<unk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(f"{params}")
|
logging.info(f"{params}")
|
||||||
|
|
||||||
@ -257,6 +257,12 @@ def main():
|
|||||||
x=features, x_lens=feature_lengths
|
x=features, x_lens=feature_lengths
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def token_ids_to_words(token_ids: List[int]) -> str:
|
||||||
|
text = ""
|
||||||
|
for i in token_ids:
|
||||||
|
text += token_table[i]
|
||||||
|
return text.replace("▁", " ").strip()
|
||||||
|
|
||||||
num_waves = encoder_out.size(0)
|
num_waves = encoder_out.size(0)
|
||||||
hyps = []
|
hyps = []
|
||||||
for i in range(num_waves):
|
for i in range(num_waves):
|
||||||
@ -272,12 +278,11 @@ def main():
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported method: {params.method}")
|
raise ValueError(f"Unsupported method: {params.method}")
|
||||||
|
|
||||||
hyps.append(sp.decode(hyp).split())
|
hyps.append(token_ids_to_words(hyp))
|
||||||
|
|
||||||
s = "\n"
|
s = "\n"
|
||||||
for filename, hyp in zip(params.sound_files, hyps):
|
for filename, hyp in zip(params.sound_files, hyps):
|
||||||
words = " ".join(hyp)
|
s += f"{filename}:\n{hyp}\n\n"
|
||||||
s += f"{filename}:\n{words}\n\n"
|
|
||||||
logging.info(s)
|
logging.info(s)
|
||||||
|
|
||||||
logging.info("Decoding Done")
|
logging.info("Decoding Done")
|
||||||
|
|||||||
@ -22,7 +22,7 @@
|
|||||||
Usage:
|
Usage:
|
||||||
./transducer_stateless/export.py \
|
./transducer_stateless/export.py \
|
||||||
--exp-dir ./transducer_stateless/exp \
|
--exp-dir ./transducer_stateless/exp \
|
||||||
--bpe-model data/lang_bpe_500/bpe.model \
|
--tokens data/lang_bpe_500/tokens.txt \
|
||||||
--epoch 20 \
|
--epoch 20 \
|
||||||
--avg 10
|
--avg 10
|
||||||
|
|
||||||
@ -46,7 +46,7 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import k2
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from conformer import Conformer
|
from conformer import Conformer
|
||||||
@ -56,7 +56,7 @@ from model import Transducer
|
|||||||
|
|
||||||
from icefall.checkpoint import average_checkpoints, load_checkpoint
|
from icefall.checkpoint import average_checkpoints, load_checkpoint
|
||||||
from icefall.env import get_env_info
|
from icefall.env import get_env_info
|
||||||
from icefall.utils import AttributeDict, str2bool
|
from icefall.utils import AttributeDict, num_tokens, str2bool
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -91,10 +91,10 @@ def get_parser():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bpe-model",
|
"--tokens",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/lang_bpe_500/bpe.model",
|
default="data/lang_bpe_500/tokens.txt",
|
||||||
help="Path to the BPE model",
|
help="Path to the tokens.txt.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -191,12 +191,14 @@ def main():
|
|||||||
|
|
||||||
logging.info(f"device: {device}")
|
logging.info(f"device: {device}")
|
||||||
|
|
||||||
sp = spm.SentencePieceProcessor()
|
# Load tokens.txt here
|
||||||
sp.load(params.bpe_model)
|
token_table = k2.SymbolTable.from_file(params.tokens)
|
||||||
|
|
||||||
|
# Load id of the <blk> token and the vocab size
|
||||||
# <blk> is defined in local/train_bpe_model.py
|
# <blk> is defined in local/train_bpe_model.py
|
||||||
params.blank_id = sp.piece_to_id("<blk>")
|
params.blank_id = token_table["<blk>"]
|
||||||
params.vocab_size = sp.get_piece_size()
|
params.unk_id = token_table["<unk>"]
|
||||||
|
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
|
||||||
|
|
||||||
logging.info(params)
|
logging.info(params)
|
||||||
|
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user