mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-08 09:32:20 +00:00
* support long file transcription * rename recipe as long_file_recog * add docs * support multi-gpu decoding * style fix
95 lines
3.4 KiB
Bash
Executable File
95 lines
3.4 KiB
Bash
Executable File
#!/usr/bin/env bash

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

# GPUs used for decoding; stage 3 runs one process per GPU, so the list
# should provide at least $world_size devices.
export CUDA_VISIBLE_DEVICES="0,1,2,3"

# Exit on error (-e), error on unset variables (-u), and make a pipeline
# fail if any stage of it fails (-o pipefail).
set -eou pipefail
# This script is used to recognize long audios. The process is as follows:
# 1) Split long audios into chunks with overlaps.
# 2) Perform speech recognition on chunks, getting tokens and timestamps.
# 3) Merge the overlapped chunks into utterances according to the timestamps.

# Each chunk (except the first and the last) is padded with extra left side and right side.
# The chunk length is: left_side + chunk_size + right_side.
chunk=30.0  # core chunk size in seconds
extra=2.0   # padding (in seconds) added on each side of a chunk

# First and last stage to run; adjust to resume a partial run.
stage=1
stop_stage=4

# We assume that you have downloaded the LibriLight dataset
# with audio files in $corpus_dir and texts in $text_dir
corpus_dir=$PWD/download/libri-light
text_dir=$PWD/download/librilight_text
# Path to save the manifests
output_dir=$PWD/data/librilight

# Number of parallel decoding processes (one per GPU) used in stage 3.
world_size=4
|
log() {
  # Emit a timestamped message prefixed with the caller's file name, line
  # number and function name. (Borrowed from espnet.)
  local caller_file
  caller_file=${BASH_SOURCE[1]##*/}
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${stamp} (${caller_file}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
|
|
|
|
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  # We will get librilight_recordings_{subset}.jsonl.gz and
  # librilight_supervisions_{subset}.jsonl.gz saved in $output_dir/manifests
  log "Stage 1: Prepare LibriLight manifest"
  # Quote the paths so the recipe also works when $PWD contains spaces
  # (ShellCheck SC2086).
  lhotse prepare librilight "$corpus_dir" "$text_dir" "$output_dir/manifests" -j 10
fi
|
|
|
|
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  # Chunk manifests are saved to $output_dir/manifests_chunk/librilight_cuts_{subset}.jsonl.gz
  log "Stage 2: Split long audio into chunks"
  # Expansions are quoted so paths with spaces survive word-splitting
  # (ShellCheck SC2086).
  ./long_file_recog/split_into_chunks.py \
    --manifest-in-dir "$output_dir/manifests" \
    --manifest-out-dir "$output_dir/manifests_chunk" \
    --chunk "$chunk" \
    --extra "$extra"  # Extra duration (in seconds) at both sides
fi
|
|
|
|
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  # Recognized tokens and timestamps are saved to
  # $output_dir/manifests_chunk_recog/librilight_cuts_{subset}.jsonl.gz

  # This script loads torchscript models, exported by `torch.jit.script()`,
  # and uses it to decode waves.
  # You can download the jit model from
  # https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11

  log "Stage 3: Perform speech recognition on splitted chunks"
  # NOTE(review): LibriLight subsets are usually named small/medium/large;
  # confirm that "median" matches the subset names produced by stage 1.
  for subset in small median large; do
    # BUGFIX: the original was missing the line continuation after
    # `--decoding-method greedy_search`, so `--master 12345` was executed
    # as a separate (nonexistent) command instead of being passed as the
    # DDP master-port argument.
    ./long_file_recog/recognize.py \
      --world-size "$world_size" \
      --num-workers 8 \
      --subset "$subset" \
      --manifest-in-dir "$output_dir/manifests_chunk" \
      --manifest-out-dir "$output_dir/manifests_chunk_recog" \
      --nn-model-filename long_file_recog/exp/jit_model.pt \
      --bpe-model data/lang_bpe_500/bpe.model \
      --max-duration 2400 \
      --decoding-method greedy_search \
      --master 12345

    if [ $world_size -gt 1 ]; then
      # Combine the per-job manifests produced by multi-GPU decoding.
      # The -name pattern is quoted so the shell cannot glob-expand it in
      # the current directory before find sees it; the $(find ...) result
      # is deliberately left unquoted so each file becomes a separate
      # argument to `lhotse combine`.
      lhotse combine $(find "$output_dir/manifests_chunk_recog" -name "librilight_cuts_${subset}_job_*.jsonl.gz" | tr "\n" " ") "$output_dir/manifests_chunk_recog/librilight_cuts_${subset}.jsonl.gz"
    fi
  done
fi
|
|
|
|
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  # Final results are saved in $output_dir/manifests/librilight_cuts_{subset}.jsonl.gz
  log "Stage 4: Merge splitted chunks into utterances."
  # Expansions are quoted so paths with spaces survive word-splitting
  # (ShellCheck SC2086).
  ./long_file_recog/merge_chunks.py \
    --manifest-in-dir "$output_dir/manifests_chunk_recog" \
    --manifest-out-dir "$output_dir/manifests" \
    --bpe-model data/lang_bpe_500/bpe.model \
    --extra "$extra"
fi
|
|
|
|
|