diff --git a/egs/gigaspeech/KWS/RESULTS.md b/egs/gigaspeech/KWS/RESULTS.md
new file mode 100644
index 000000000..992240e14
--- /dev/null
+++ b/egs/gigaspeech/KWS/RESULTS.md
@@ -0,0 +1,49 @@
+# Results
+
+## zipformer transducer model
+
+This is a tiny general ASR model with around 3.3M parameters; see this PR https://github.com/k2-fsa/icefall/pull/1428 for how to train it and other details.
+
+The modeling units are 500 BPE tokens trained on the gigaspeech transcripts.
+
+The positive test sets are from https://github.com/pkufool/open-commands and the negative test set is the gigaspeech test set (40 hours of audio).
+
+We put the whole pipeline in `run.sh`, containing the training, decoding, finetuning and export commands (e.g. `./run.sh --stage 0 --stop-stage 5` runs all of them).
+
+The models have been uploaded to [github](https://github.com/pkufool/keyword-spotting-models/releases/download/v0.11/icefall-kws-zipformer-gigaspeech-20240219.tar.gz).
+
+Here are the results on a small test set of 20 commands. We list the results for every command; for
+each metric there are two columns, one for the original model trained on the gigaspeech XL subset, the other
+for the model finetuned on the commands dataset.
+
+Commands | FN in positive set | FN in positive set | Recall | Recall | FP in negative set | FP in negative set | False alarms per hour (40 hours) | False alarms per hour (40 hours)
+-- | -- | -- | -- | -- | -- | -- | -- | --
+  | original | finetune | original | finetune | original | finetune | original | finetune
+All | 43/307 | 4/307 | 86% | 98.7% | 1 | 24 | 0.025 | 0.6
+Lights on | 6/17 | 0/17 | 64.7% | 100% | 1 | 9 | 0.025 | 0.225
+Heat up | 5/14 | 1/14 | 64.3% | 92.9% | 0 | 1 | 0 | 0.025
+Volume down | 4/18 | 0/18 | 77.8% | 100% | 0 | 2 | 0 | 0.05
+Volume max | 4/17 | 0/17 | 76.5% | 100% | 0 | 0 | 0 | 0
+Volume mute | 4/16 | 0/16 | 75.0% | 100% | 0 | 0 | 0 | 0
+Too quiet | 3/17 | 0/17 | 82.4% | 100% | 0 | 4 | 0 | 0.1
+Lights off | 3/17 | 0/17 | 82.4% | 100% | 0 | 2 | 0 | 0.05
+Play music | 2/14 | 0/14 | 85.7% | 100% | 0 | 0 | 0 | 0
+Bring newspaper | 2/13 | 1/13 | 84.6% | 92.3% | 0 | 0 | 0 | 0
+Heat down | 2/16 | 2/16 | 87.5% | 87.5% | 0 | 1 | 0 | 0.025
+Volume up | 2/18 | 0/18 | 88.9% | 100% | 0 | 1 | 0 | 0.025
+Too loud | 1/13 | 0/13 | 92.3% | 100% | 0 | 0 | 0 | 0
+Resume music | 1/14 | 0/14 | 92.9% | 100% | 0 | 0 | 0 | 0
+Bring shoes | 1/15 | 0/15 | 93.3% | 100% | 0 | 0 | 0 | 0
+Switch language | 1/15 | 0/15 | 93.3% | 100% | 0 | 0 | 0 | 0
+Pause music | 1/15 | 0/15 | 93.3% | 100% | 0 | 0 | 0 | 0
+Bring socks | 1/12 | 0/12 | 91.7% | 100% | 0 | 0 | 0 | 0
+Stop music | 0/15 | 0/15 | 100% | 100% | 0 | 0 | 0 | 0
+Turn it up | 0/15 | 0/15 | 100% | 100% | 0 | 3 | 0 | 0.075
+Turn it down | 0/16 | 0/16 | 100% | 100% | 0 | 1 | 0 | 0.025
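+
+To read the tables: recall is 1 - FN/total, and the false-alarm rate is the FP count divided by the duration of the negative set. A minimal sketch of the arithmetic (ours, for illustration only), using the "All" row of the original model:
+
+```python
+# How the table columns relate (illustration only, not part of the recipe).
+fn, total = 43, 307              # "FN in positive set", original model, "All" row
+recall = 1 - fn / total          # 0.8599... -> reported as 86%
+
+fp, hours = 1, 40                # 1 false positive over the 40-hour negative set
+false_alarm_rate = fp / hours    # -> 0.025 false alarms per hour
+
+print(f"recall = {recall:.1%}, false alarms/hour = {false_alarm_rate}")
+```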
+
+Here is the result on the large test set. It has more than 200 commands, too many to list individually, so we only report the overall result.
+
+Commands | FN in positive set | FN in positive set | Recall | Recall | FP in negative set | FP in negative set | False alarms per hour (40 hours) | False alarms per hour (40 hours)
+-- | -- | -- | -- | -- | -- | -- | -- | --
+  | original | finetune | original | finetune | original | finetune | original | finetune
+All | 622/3994 | 79/3994 | 83.6% | 97.9% | 18/19930 | 52/19930 | 0.45 | 1.3
diff --git a/egs/gigaspeech/KWS/run.sh b/egs/gigaspeech/KWS/run.sh
old mode 100644
new mode 100755
index e13a78964..ea04c7c9b
--- a/egs/gigaspeech/KWS/run.sh
+++ b/egs/gigaspeech/KWS/run.sh
@@ -11,8 +11,6 @@ export PYTHONPATH=../../../:$PYTHONPATH
 stage=0
 stop_stage=100
 
-pre_trained_model_host=github
-
 . shared/parse_options.sh || exit 1
 
 log() {
@@ -21,20 +19,6 @@ log() {
   echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 
-
-if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
-  log "Stage -1: Download a pre-trained model."
-  if [ $pre_trained_model_host -eq "github" ]; then
-
-  elif [$pre_trained_model_host -eq "modelscope" ]; then
-
-  else
-    log "Pretrained model host : $pre_trained_model_host not support."
-    exit -1;
-  fi
-fi
-
-
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   log "Stage 0: Train a model."
   if [ ! -e data/fbank/.gigaspeech.done ]; then
@@ -51,14 +35,13 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
       --feedforward-dim 192,192,192,192,192,192 \
      --encoder-dim 128,128,128,128,128,128 \
      --encoder-unmasked-dim 128,128,128,128,128,128 \
-      --num-epochs 15 \
+      --num-epochs 12 \
      --lr-epochs 1.5 \
      --use-fp16 1 \
      --start-epoch 1 \
-      --training-subset L \
-      --pinyin-type partial_with_tone \
+      --subset XL \
+      --bpe-model data/lang_bpe_500/bpe.model \
      --causal 1 \
-      --lang-dir data/lang_partial_tone \
      --max-duration 1000
 fi
@@ -66,11 +49,10 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
   log "Stage 1: Decode the model."
   for t in small, large; do
     python ./zipformer/decode.py \
-        --epoch 15 \
+        --epoch 12 \
         --avg 2 \
         --exp-dir ./zipformer/exp \
-        --lang-dir ./data/lang_partial_tone \
-        --pinyin-type partial_with_tone \
+        --bpe-model data/lang_bpe_500/bpe.model \
         --causal 1 \
         --chunk-size 16 \
         --left-context-frames 64 \
@@ -92,10 +74,10 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
   log "Stage 2: Export the model."
 
   python ./zipformer/export.py \
-    --epoch 15 \
+    --epoch 12 \
     --avg 2 \
     --exp-dir ./zipformer/exp \
-    --tokens data/lang_partial_tone/tokens.txt \
+    --tokens data/lang_bpe_500/tokens.txt \
     --causal 1 \
     --chunk-size 16 \
     --left-context-frames 64 \
@@ -108,8 +90,8 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
 
   python ./zipformer/export_onnx_streaming.py \
     --exp-dir zipformer/exp \
-    --tokens data/lang_partial_tone/tokens.txt \
-    --epoch 15 \
+    --tokens data/lang_bpe_500/tokens.txt \
+    --epoch 12 \
     --avg 2 \
     --chunk-size 16 \
     --left-context-frames 128 \
@@ -138,9 +120,8 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
     --world-size 4 \
     --num-epochs 10 \
     --start-epoch 1 \
-    --exp-dir zipformer/exp_finetune
-    --lang-dir ./data/lang_partial_tone \
-    --pinyin-type partial_with_tone \
+    --exp-dir zipformer/exp_finetune \
+    --bpe-model data/lang_bpe_500/bpe.model \
     --use-fp16 1 \
     --decoder-dim 320 \
     --joiner-dim 320 \
@@ -160,11 +141,10 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
   log "Stage 1: Decode the finetuned model."
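+  # "small" and "large" below are the two positive command test sets from
+  # https://github.com/pkufool/open-commands (see RESULTS.md).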
   for t in small, large; do
     python ./zipformer/decode.py \
-        --epoch 15 \
+        --epoch 10 \
         --avg 2 \
         --exp-dir ./zipformer/exp_finetune \
-        --lang-dir ./data/lang_partial_tone \
-        --pinyin-type partial_with_tone \
+        --bpe-model data/lang_bpe_500/bpe.model \
         --causal 1 \
         --chunk-size 16 \
         --left-context-frames 64 \
@@ -185,10 +165,25 @@ fi
 
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   log "Stage 2: Export the finetuned model."
+  python ./zipformer/export.py \
+    --epoch 10 \
+    --avg 2 \
+    --exp-dir ./zipformer/exp_finetune \
+    --tokens data/lang_bpe_500/tokens.txt \
+    --causal 1 \
+    --chunk-size 16 \
+    --left-context-frames 64 \
+    --decoder-dim 320 \
+    --joiner-dim 320 \
+    --num-encoder-layers 1,1,1,1,1,1 \
+    --feedforward-dim 192,192,192,192,192,192 \
+    --encoder-dim 128,128,128,128,128,128 \
+    --encoder-unmasked-dim 128,128,128,128,128,128
+
   python ./zipformer/export_onnx_streaming.py \
     --exp-dir zipformer/exp_finetune \
-    --tokens data/lang_partial_tone/tokens.txt \
-    --epoch 15 \
+    --tokens data/lang_bpe_500/tokens.txt \
+    --epoch 10 \
     --avg 2 \
     --chunk-size 16 \
     --left-context-frames 128 \
diff --git a/egs/wenetspeech/KWS/RESULTS.md b/egs/wenetspeech/KWS/RESULTS.md
new file mode 100644
index 000000000..5ff2f4131
--- /dev/null
+++ b/egs/wenetspeech/KWS/RESULTS.md
@@ -0,0 +1,58 @@
+# Results
+
+## zipformer transducer model
+
+This is a tiny general ASR model with around 3.3M parameters; see this PR https://github.com/k2-fsa/icefall/pull/1428 for how to train it and other details.
+
+The modeling units are partial pinyin (i.e. initials and finals) with tone; see the sketch below for what these units look like.
+
+The positive test sets are from https://github.com/pkufool/open-commands and the negative test set is the test_net subset of wenetspeech (23 hours of audio).
+
+We put the whole pipeline in `run.sh`, containing the training, decoding, finetuning and export commands.
+
+The models have been uploaded to [github](https://github.com/pkufool/keyword-spotting-models/releases/download/v0.11/icefall-kws-zipformer-wenetspeech-20240219.tar.gz).
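+
+As a sketch of the modeling units (ours, for illustration; we assume `text_to_pinyin`, the helper that `decode.py` uses, is importable from `icefall.utils` in your icefall version):
+
+```python
+# Convert a command to "partial pinyin with tone" units (illustration only).
+from icefall.utils import text_to_pinyin
+
+# "partial_with_tone" splits each character into initial + final, keeping tone.
+units = text_to_pinyin("开灯", mode="partial_with_tone")
+print(units)  # a list of initial/final tokens, roughly ['k', 'āi', 'd', 'ēng']
+              # (the exact token form depends on the helper)
+```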
+
+Here are the results on a small test set of 20 commands. We list the results for every command; for
+each metric there are two columns, one for the original model trained on the wenetspeech L subset, the other
+for the model finetuned on an in-house commands dataset (90 hours of audio).
+
+> You can see that the performance of the original model is very poor; we think the reason is that the test commands were all collected from real product scenarios, which are very different from the scenarios in which the wenetspeech data was collected. After finetuning, the performance improves a lot.
+
+Commands | FN in positive set | FN in positive set | Recall | Recall | FP in negative set | FP in negative set | False alarms per hour (23 hours) | False alarms per hour (23 hours)
+-- | -- | -- | -- | -- | -- | -- | -- | --
+  | original | finetune | original | finetune | original | finetune | original | finetune
+All | 426/985 | 40/985 | 56.8% | 95.9% | 7 | 1 | 0.3 | 0.04
+下一个 | 5/50 | 0/50 | 90% | 100% | 3 | 0 | 0.13 | 0
+开灯 | 19/49 | 2/49 | 61.2% | 95.9% | 0 | 0 | 0 | 0
+第一个 | 11/50 | 3/50 | 78% | 94% | 3 | 0 | 0.13 | 0
+声音调到最大 | 39/50 | 7/50 | 22% | 86% | 0 | 0 | 0 | 0
+暂停音乐 | 36/49 | 1/49 | 26.5% | 98% | 0 | 0 | 0 | 0
+暂停播放 | 33/49 | 2/49 | 32.7% | 95.9% | 0 | 0 | 0 | 0
+打开卧室灯 | 33/49 | 1/49 | 32.7% | 98% | 0 | 0 | 0 | 0
+关闭所有灯 | 27/50 | 0/50 | 46% | 100% | 0 | 0 | 0 | 0
+关灯 | 25/48 | 2/48 | 47.9% | 95.8% | 1 | 1 | 0.04 | 0.04
+关闭导航 | 25/48 | 1/48 | 47.9% | 97.9% | 0 | 0 | 0 | 0
+打开蓝牙 | 24/47 | 0/47 | 48.9% | 100% | 0 | 0 | 0 | 0
+下一首歌 | 21/50 | 1/50 | 58% | 98% | 0 | 0 | 0 | 0
+换一首歌 | 19/50 | 5/50 | 62% | 90% | 0 | 0 | 0 | 0
+继续播放 | 19/50 | 2/50 | 62% | 96% | 0 | 0 | 0 | 0
+打开闹钟 | 18/49 | 2/49 | 63.3% | 95.9% | 0 | 0 | 0 | 0
+打开音乐 | 17/49 | 0/49 | 65.3% | 100% | 0 | 0 | 0 | 0
+打开导航 | 17/48 | 0/48 | 64.6% | 100% | 0 | 0 | 0 | 0
+打开电视 | 15/50 | 0/50 | 70% | 100% | 0 | 0 | 0 | 0
+大点声 | 12/50 | 5/50 | 76% | 90% | 0 | 0 | 0 | 0
+小点声 | 11/50 | 6/50 | 78% | 88% | 0 | 0 | 0 | 0
+
+
+Here is the result on the large test set, which has more than 100 commands, too many to list individually, so we only report the overall result. We also list the results for two wake-up words: 小云小云 (test set only) and 你好问问 (both training and test sets). For 你好问问 we have two finetuned models: one finetuned on 你好问问 plus our in-house commands data, the other finetuned only on 你好问问. Both perform much better than the original model, and the one finetuned only on 你好问问 does slightly better than the other.
+
+> The 小云小云 test set and the 你好问问 training, dev and test sets are available at https://github.com/pkufool/open-commands
+
+Commands | FN in positive set | FN in positive set | Recall | Recall | FP in negative set | FP in negative set | False alarms per hour (23 hours) | False alarms per hour (23 hours)
+-- | -- | -- | -- | -- | -- | -- | -- | --
+  | baseline | finetune | baseline | finetune | baseline | finetune | baseline | finetune
+large | 2429/4505 | 477/4505 | 46.1% | 89.4% | 50 | 41 | 2.17 | 1.78
+小云小云(clean) | 30/100 | 40/100 | 70% | 60% | 0 | 0 | 0 | 0
+小云小云(noisy) | 118/350 | 154/350 | 66.3% | 56% | 0 | 0 | 0 | 0
+你好问问(finetune with all keywords data) | 2236/10641 | 678/10641 | 79% | 93.6% | 0 | 0 | 0 | 0
+你好问问(finetune with only 你好问问) | 2236/10641 | 249/10641 | 79% | 97.7% | 0 | 0 | 0 | 0
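+
+As a quick sanity check: for the 20-command table above, the "All" row is just the column-wise sum of the per-command rows. A small sketch (ours), for the original model:
+
+```python
+# Recompute the "All" row of the small test set from the per-command rows.
+fns    = [5, 19, 11, 39, 36, 33, 33, 27, 25, 25, 24, 21, 19, 19, 18, 17, 17, 15, 12, 11]
+totals = [50, 49, 50, 50, 49, 49, 49, 50, 48, 48, 47, 50, 50, 50, 49, 49, 48, 50, 50, 50]
+
+fn, total = sum(fns), sum(totals)        # -> 426, 985, matching "All | 426/985"
+print(f"recall = {1 - fn / total:.1%}")  # -> 56.8%
+```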
diff --git a/egs/wenetspeech/KWS/run.sh b/egs/wenetspeech/KWS/run.sh
old mode 100644
new mode 100755
index 971f54e29..2bdd6a5f3
--- a/egs/wenetspeech/KWS/run.sh
+++ b/egs/wenetspeech/KWS/run.sh
@@ -11,8 +11,6 @@ export PYTHONPATH=../../../:$PYTHONPATH
 stage=0
 stop_stage=100
 
-pre_trained_model_host=github
-
 . shared/parse_options.sh || exit 1
 
 log() {
@@ -21,20 +19,6 @@ log() {
   echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 
-
-if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
-  log "Stage -1: Download a pre-trained model."
-  if [ $pre_trained_model_host -eq "github" ]; then
-
-  elif [$pre_trained_model_host -eq "modelscope" ]; then
-
-  else
-    log "Pretrained model host : $pre_trained_model_host not support."
-    exit -1;
-  fi
-fi
-
-
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   log "Stage 0: Train a model."
   if [ ! -e data/fbank/.wenetspeech.done ]; then
@@ -51,7 +35,7 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
      --feedforward-dim 192,192,192,192,192,192 \
      --encoder-dim 128,128,128,128,128,128 \
      --encoder-unmasked-dim 128,128,128,128,128,128 \
-      --num-epochs 15 \
+      --num-epochs 18 \
      --lr-epochs 1.5 \
      --use-fp16 1 \
      --start-epoch 1 \
@@ -66,10 +50,10 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
   log "Stage 1: Decode the model."
   for t in small, large; do
     python ./zipformer/decode.py \
-        --epoch 15 \
+        --epoch 18 \
         --avg 2 \
         --exp-dir ./zipformer/exp \
-        --lang-dir ./data/lang_partial_tone \
+        --tokens ./data/lang_partial_tone/tokens.txt \
         --pinyin-type partial_with_tone \
         --causal 1 \
         --chunk-size 16 \
@@ -81,8 +65,8 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
         --encoder-dim 128,128,128,128,128,128 \
         --encoder-unmasked-dim 128,128,128,128,128,128 \
         --test-set $t \
-        --keywords-score 1.0 \
-        --keywords-threshold 0.35 \
+        --keywords-score 1.5 \
+        --keywords-threshold 0.1 \
         --keywords-file ./data/commands_${t}.txt \
         --max-duration 3000
   done
@@ -92,7 +76,7 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
   log "Stage 2: Export the model."
 
   python ./zipformer/export.py \
-    --epoch 15 \
+    --epoch 18 \
     --avg 2 \
     --exp-dir ./zipformer/exp \
     --tokens data/lang_partial_tone/tokens.txt \
@@ -109,7 +93,7 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
   python ./zipformer/export_onnx_streaming.py \
     --exp-dir zipformer/exp \
     --tokens data/lang_partial_tone/tokens.txt \
-    --epoch 15 \
+    --epoch 18 \
     --avg 2 \
     --chunk-size 16 \
     --left-context-frames 128 \
@@ -160,10 +144,10 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
   log "Stage 1: Decode the finetuned model."
   for t in small, large; do
     python ./zipformer/decode.py \
-        --epoch 15 \
+        --epoch 10 \
         --avg 2 \
         --exp-dir ./zipformer/exp_finetune \
-        --lang-dir ./data/lang_partial_tone \
+        --tokens ./data/lang_partial_tone/tokens.txt \
         --pinyin-type partial_with_tone \
         --causal 1 \
         --chunk-size 16 \
@@ -175,7 +159,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
         --encoder-dim 128,128,128,128,128,128 \
         --encoder-unmasked-dim 128,128,128,128,128,128 \
         --test-set $t \
-        --keywords-score 1.0 \
+        --keywords-score 0.000001 \
         --keywords-threshold 0.35 \
         --keywords-file ./data/commands_${t}.txt \
         --max-duration 3000
@@ -185,10 +169,25 @@ fi
 
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   log "Stage 2: Export the finetuned model."
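+  # export.py writes a PyTorch checkpoint of the averaged model, while
+  # export_onnx_streaming.py (below) writes streaming ONNX models,
+  # e.g. for deployment with sherpa-onnx.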
+  python ./zipformer/export.py \
+    --epoch 10 \
+    --avg 2 \
+    --exp-dir ./zipformer/exp_finetune \
+    --tokens data/lang_partial_tone/tokens.txt \
+    --causal 1 \
+    --chunk-size 16 \
+    --left-context-frames 64 \
+    --decoder-dim 320 \
+    --joiner-dim 320 \
+    --num-encoder-layers 1,1,1,1,1,1 \
+    --feedforward-dim 192,192,192,192,192,192 \
+    --encoder-dim 128,128,128,128,128,128 \
+    --encoder-unmasked-dim 128,128,128,128,128,128
+
   python ./zipformer/export_onnx_streaming.py \
     --exp-dir zipformer/exp_finetune \
     --tokens data/lang_partial_tone/tokens.txt \
-    --epoch 15 \
+    --epoch 10 \
     --avg 2 \
     --chunk-size 16 \
     --left-context-frames 128 \
diff --git a/egs/wenetspeech/KWS/zipformer/decode.py b/egs/wenetspeech/KWS/zipformer/decode.py
index 50316b402..5ed3c6c2c 100755
--- a/egs/wenetspeech/KWS/zipformer/decode.py
+++ b/egs/wenetspeech/KWS/zipformer/decode.py
@@ -44,10 +44,10 @@ from icefall.checkpoint import (
     find_checkpoints,
     load_checkpoint,
 )
-from icefall.lexicon import Lexicon
 from icefall.utils import (
     AttributeDict,
     make_pad_mask,
+    num_tokens,
     setup_logger,
     store_transcripts,
     str2bool,
@@ -124,10 +124,10 @@ def get_parser():
     )
 
     parser.add_argument(
-        "--lang-dir",
+        "--tokens",
         type=Path,
-        default="data/lang_char",
-        help="The lang dir containing word table and LG graph",
+        default="data/lang_partial_tone/tokens.txt",
+        help="The path to the tokens.txt",
     )
 
     parser.add_argument(
@@ -209,7 +209,6 @@ def get_parser():
 def decode_one_batch(
     params: AttributeDict,
     model: nn.Module,
-    lexicon: Lexicon,
     batch: dict,
     keywords_graph: ContextGraph,
 ) -> Dict[str, List[List[str]]]:
@@ -296,7 +295,6 @@ def decode_dataset(
     dl: torch.utils.data.DataLoader,
     params: AttributeDict,
     model: nn.Module,
-    lexicon: Lexicon,
     keywords_graph: ContextGraph,
     keywords: Set[str],
     test_only_keywords: bool,
@@ -342,7 +340,6 @@ def decode_dataset(
         hyps = decode_one_batch(
             params=params,
             model=model,
-            lexicon=lexicon,
             keywords_graph=keywords_graph,
             batch=batch,
         )
@@ -516,9 +513,9 @@ def main():
 
     logging.info(f"Device: {device}")
 
-    lexicon = Lexicon(params.lang_dir)
-    params.blank_id = lexicon.token_table["<blk>"]
-    params.vocab_size = max(lexicon.tokens) + 1
+    token_table = k2.SymbolTable.from_file(params.tokens)
+    params.blank_id = token_table["<blk>"]
+    params.vocab_size = num_tokens(token_table) + 1
 
     logging.info(params)
 
@@ -547,8 +544,8 @@ def main():
         tmp_ids = []
         kws_py = text_to_pinyin(keyword, mode=params.pinyin_type)
         for k in kws_py:
-            if k in lexicon.token_table:
-                tmp_ids.append(lexicon.token_table[k])
+            if k in token_table:
+                tmp_ids.append(token_table[k])
             else:
                 logging.warning(f"Containing OOV tokens, skipping line : {line}")
                 tmp_ids = []
@@ -721,7 +718,6 @@ def main():
         dl=test_dl,
         params=params,
         model=model,
-        lexicon=lexicon,
         keywords_graph=keywords_graph,
         keywords=keywords,
         test_only_keywords="test_net" not in test_set,
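
For reference, the token handling this patch introduces in `decode.py` can be exercised on its own. A minimal sketch (ours, not part of the patch; it assumes icefall is on `PYTHONPATH`, that `text_to_pinyin` is importable from `icefall.utils`, and that `tokens.txt` follows the usual k2 `SymbolTable` format):

```python
# Mirror decode.py above: load tokens.txt, then map a keyword to token ids.
import k2
from icefall.utils import num_tokens, text_to_pinyin

token_table = k2.SymbolTable.from_file("data/lang_partial_tone/tokens.txt")
blank_id = token_table["<blk>"]           # id of the blank symbol
vocab_size = num_tokens(token_table) + 1  # as set in main() above

ids = []
for tok in text_to_pinyin("你好问问", mode="partial_with_tone"):
    if tok in token_table:
        ids.append(token_table[tok])
    else:
        # decode.py skips keywords containing OOV tokens; do the same here.
        ids = []
        break

print(blank_id, vocab_size, ids)
```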