From 5df6040df00f60ff1b26a8087984dcde3339703b Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Mon, 7 Mar 2022 16:03:39 +0800 Subject: [PATCH] Update README to include force alignment information. --- .../ASR/transducer_stateless/README.md | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/egs/librispeech/ASR/transducer_stateless/README.md b/egs/librispeech/ASR/transducer_stateless/README.md index 964bddfab..978fa2ada 100644 --- a/egs/librispeech/ASR/transducer_stateless/README.md +++ b/egs/librispeech/ASR/transducer_stateless/README.md @@ -20,3 +20,120 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3" --max-duration 250 \ --lr-factor 2.5 ``` + +## How to get framewise token alignment + +Assume that you already have a trained model. If not, you can either +train one yourself or download a pre-trained model from Hugging Face: +<https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01> + +**Caution**: If you are going to use your own trained model, remember +to set `--modified-transducer-prob` to a nonzero value since the +force alignment code assumes that `--max-sym-per-frame` is 1. + + +The following shows how to get framewise token alignment using the above +pre-trained model. 
+ +```bash +git clone https://github.com/k2-fsa/icefall +cd icefall/egs/librispeech/ASR +mkdir tmp +sudo apt-get install git-lfs +git lfs install +git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01 ./tmp/ + +ln -s $PWD/tmp/exp/pretrained.pt $PWD/tmp/exp/epoch-999.pt + +./transducer_stateless/compute_ali.py \ + --exp-dir ./tmp/exp \ + --bpe-model ./tmp/data/lang_bpe_500/bpe.model \ + --epoch 999 \ + --avg 1 \ + --max-duration 100 \ + --dataset dev-clean \ + --out-dir data/ali +``` + +After running the above commands, you will find the following two files +in the folder `./data/ali`: + +``` +-rw-r--r-- 1 xxx xxx 412K Mar 7 15:45 cuts_dev-clean.json.gz +-rw-r--r-- 1 xxx xxx 2.9M Mar 7 15:45 token_ali_dev-clean.h5 +``` + +You can find usage examples in `./transducer_stateless/test_compute_ali.py` about +extracting framewise token alignment information from the above +two files. + +## How to get word starting time from framewise token alignment + +Assume you have run the above commands to get framewise token alignment +using a pre-trained model from `tmp/exp/epoch-999.pt`. You can use the following +commands to obtain word starting time. + +```bash +./transducer_stateless/test_compute_ali.py \ + --bpe-model ./tmp/data/lang_bpe_500/bpe.model \ + --ali-dir data/ali \ + --dataset dev-clean +``` + +**Caution**: Since the frame shift is 10 ms and the subsampling factor +of the model is 4, the time resolution is 0.04 seconds. + +**Note**: The script `test_compute_ali.py` is for illustration only +and it processes only one batch and then exits. 
+ +You will get the following output: + +``` +5694-64029-0022-1998-0 +[('THE', '0.20'), ('LEADEN', '0.36'), ('HAIL', '0.72'), ('STORM', '1.00'), ('SWEPT', '1.48'), ('THEM', '1.88'), ('OFF', '2.00'), ('THE', '2.24'), ('FIELD', '2.36'), ('THEY', '3.20'), ('FELL', '3.36'), ('BACK', '3.64'), ('AND', '3.92'), ('RE', '4.04'), ('FORMED', '4.20')] + +3081-166546-0040-308-0 +[('IN', '0.32'), ('OLDEN', '0.60'), ('DAYS', '1.00'), ('THEY', '1.40'), ('WOULD', '1.56'), ('HAVE', '1.76'), ('SAID', '1.92'), ('STRUCK', '2.60'), ('BY', '3.16'), ('A', '3.36'), ('BOLT', '3.44'), ('FROM', '3.84'), ('HEAVEN', '4.04')] + +2035-147960-0016-1283-0 +[('A', '0.44'), ('SNAKE', '0.52'), ('OF', '0.84'), ('HIS', '0.96'), ('SIZE', '1.12'), ('IN', '1.60'), ('FIGHTING', '1.72'), ('TRIM', '2.12'), ('WOULD', '2.56'), ('BE', '2.76'), ('MORE', '2.88'), ('THAN', '3.08'), ('ANY', '3.28'), ('BOY', '3.56'), ('COULD', '3.88'), ('HANDLE', '4.04')] + +2428-83699-0020-1734-0 +[('WHEN', '0.28'), ('THE', '0.48'), ('TRAP', '0.60'), ('DID', '0.88'), ('APPEAR', '1.08'), ('IT', '1.80'), ('LOOKED', '1.96'), ('TO', +'2.24'), ('ME', '2.36'), ('UNCOMMONLY', '2.52'), ('LIKE', '3.16'), ('AN', '3.40'), ('OPEN', '3.56'), ('SPRING', '3.92'), ('CART', '4.28')] + +8297-275154-0026-2108-0 +[('LET', '0.44'), ('ME', '0.72'), ('REST', '0.92'), ('A', '1.32'), ('LITTLE', '1.40'), ('HE', '1.80'), ('PLEADED', '2.00'), ('IF', '3.04'), ("I'M", '3.28'), ('NOT', '3.52'), ('IN', '3.76'), ('THE', '3.88'), ('WAY', '4.00')] + +652-129742-0007-1002-0 +[('SURROUND', '0.28'), ('WITH', '0.80'), ('A', '0.92'), ('GARNISH', '1.00'), ('OF', '1.44'), ('COOKED', '1.56'), ('AND', '1.88'), ('DICED', '4.16'), ('CARROTS', '4.28'), ('TURNIPS', '4.44'), ('GREEN', '4.60'), ('PEAS', '4.72')] +``` + + +For the row: +``` +5694-64029-0022-1998-0 +[('THE', '0.20'), ('LEADEN', '0.36'), ('HAIL', '0.72'), ('STORM', '1.00'), ('SWEPT', '1.48'), +('THEM', '1.88'), ('OFF', '2.00'), ('THE', '2.24'), ('FIELD', '2.36'), ('THEY', '3.20'), ('FELL', '3.36'), +('BACK', '3.64'), 
('AND', '3.92'), ('RE', '4.04'), ('FORMED', '4.20')] +``` + +- `5694-64029-0022-1998-0` is the cut ID. +- `('THE', '0.20')` means the word `THE` starts at 0.20 seconds. +- `('LEADEN', '0.36')` means the word `LEADEN` starts at 0.36 seconds. + + +You can compare the above word start times with the ones +from <https://github.com/CorentinJ/librispeech-alignments> + +``` +5694-64029-0022 ",THE,LEADEN,HAIL,STORM,SWEPT,THEM,OFF,THE,FIELD,,THEY,FELL,BACK,AND,RE,FORMED," "0.230,0.360,0.670,1.010,1.440,1.860,1.990,2.230,2.350,2.870,3.230,3.390,3.660,3.960,4.060,4.160,4.850,4.9" +``` + +We reformat it below for readability: + +``` +5694-64029-0022 ",THE,LEADEN,HAIL,STORM,SWEPT,THEM,OFF,THE,FIELD,,THEY,FELL,BACK,AND,RE,FORMED," +"0.230,0.360,0.670,1.010,1.440,1.860,1.990,2.230,2.350,2.870,3.230,3.390,3.660,3.960,4.060,4.160,4.850,4.9" + the leaden hail storm swept them off the field sil they fell back and re formed sil +```