From 2cbba6901eb19e181a6ece1fbcdae0be0ee3fdf8 Mon Sep 17 00:00:00 2001 From: luomingshuang <739314837@qq.com> Date: Mon, 7 Mar 2022 10:13:03 +0800 Subject: [PATCH] fix comments --- .../ASR/local/compute_fbank_tedlium.py | 4 +- .../convert_transcript_words_to_bpe_ids.py | 4 +- egs/tedlium3/ASR/local/prepare_transcripts.py | 95 +++++++++++++++++++ egs/tedlium3/ASR/prepare.sh | 85 ++--------------- .../ASR/transducer_stateless/README.md | 2 +- .../ASR/transducer_stateless/decode.py | 9 ++ .../ASR/transducer_stateless/export.py | 2 +- .../ASR/transducer_stateless/pretrained.py | 6 +- .../ASR/transducer_stateless/train.py | 1 - 9 files changed, 124 insertions(+), 84 deletions(-) create mode 100755 egs/tedlium3/ASR/local/prepare_transcripts.py diff --git a/egs/tedlium3/ASR/local/compute_fbank_tedlium.py b/egs/tedlium3/ASR/local/compute_fbank_tedlium.py index c1d68a07b..915197594 100644 --- a/egs/tedlium3/ASR/local/compute_fbank_tedlium.py +++ b/egs/tedlium3/ASR/local/compute_fbank_tedlium.py @@ -70,7 +70,7 @@ def compute_fbank_tedlium(): cut_set = CutSet.from_manifests( recordings=m["recordings"], supervisions=m["supervisions"], - ).trim_to_supervisions(keep_overlapping=False) + ) if "train" in partition: cut_set = ( cut_set @@ -85,6 +85,8 @@ def compute_fbank_tedlium(): executor=ex, storage_type=ChunkedLilcomHdf5Writer, ) + # Split long cuts into many short and un-overlapping cuts + cut_set = cut_set.trim_to_supervisions(keep_overlapping=False) cut_set.to_json(output_dir / f"cuts_{partition}.json.gz") diff --git a/egs/tedlium3/ASR/local/convert_transcript_words_to_bpe_ids.py b/egs/tedlium3/ASR/local/convert_transcript_words_to_bpe_ids.py index bbd1b838c..49544ccb3 100644 --- a/egs/tedlium3/ASR/local/convert_transcript_words_to_bpe_ids.py +++ b/egs/tedlium3/ASR/local/convert_transcript_words_to_bpe_ids.py @@ -42,7 +42,7 @@ def convert_texts_into_ids( texts: List[str], unk_id: int, sp: spm.SentencePieceProcessor, -) -> List[int]: +) -> List[List[int]]: """ Args: texts: @@ 
-50,7 +50,7 @@ def convert_texts_into_ids( unk_id: A number id for the token '<unk>'. Returns: - Return a integer list of bpe ids. + Return an integer list of bpe ids. """ y = [] for text in texts: diff --git a/egs/tedlium3/ASR/local/prepare_transcripts.py b/egs/tedlium3/ASR/local/prepare_transcripts.py new file mode 100755 index 000000000..44dc3891f --- /dev/null +++ b/egs/tedlium3/ASR/local/prepare_transcripts.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. (authors: Mingshuang Luo) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This script takes as input supervisions json dir "data/manifests" +consisting of supervisions_TRAIN.json and does the following: + +1. Generate train.text. + +""" +import argparse +import json +import logging +from pathlib import Path + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--manifests-dir", + type=str, + help="""Input directory. + """, + ) + parser.add_argument( + "--lang-dir", + type=str, + help="""Output directory. + """, + ) + + return parser.parse_args() + + +def prepare_transcripts(manifests_dir: str, lang_dir: str): + """ + Args: + manifests_dir: + The manifests directory, e.g., data/manifests. + lang_dir: + The language directory, e.g., data/lang_phone. + + Return: + The train.text in lang_dir.
+ """ + texts = [] + + supervisions_train = Path(manifests_dir) / "supervisions_train.json" + train_text = Path(lang_dir) / "train.text" + + logging.info(f"Loading {supervisions_train}!") + with open(supervisions_train, "r") as load_f: + load_dicts = json.load(load_f) + for load_dict in load_dicts: + text = load_dict["text"] + texts.append(text) + + with open(train_text, "w") as f: + for text in texts: + f.write(text) + f.write("\n") + + +def main(): + args = get_args() + manifests_dir = Path(args.manifests_dir) + lang_dir = Path(args.lang_dir) + + logging.info("Generating train.text") + prepare_transcripts(manifests_dir, lang_dir) + + +if __name__ == "__main__": + formatter = ( + "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + ) + + logging.basicConfig(format=formatter, level=logging.INFO) + + main() diff --git a/egs/tedlium3/ASR/prepare.sh b/egs/tedlium3/ASR/prepare.sh index fff1a0967..9ac73b20d 100644 --- a/egs/tedlium3/ASR/prepare.sh +++ b/egs/tedlium3/ASR/prepare.sh @@ -71,13 +71,14 @@ fi if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then log "Stage 0: Download data" - # If you have pre-downloaded it to /path/to/LibriSpeech, + # If you have pre-downloaded it to /path/to/tedlium3, # you can create a symlink # # ln -sfv /path/to/tedlium3 $dl_dir/tedlium3 # - if [ ! -d $dl_dir/tedlium ]; then + if [ ! 
-d $dl_dir/tedlium3 ]; then lhotse download tedlium $dl_dir + mv $dl_dir/TEDLIUM_release-3 $dl_dir/tedlium3 fi # If you have pre-downloaded it to /path/to/musan, @@ -127,13 +128,13 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then ./local/prepare_transcripts.py \ --lang-dir $lang_dir \ --manifests-dir data/manifests - - cat download/tedlium3/TEDLIUM.152k.dic | - grep -v -w "" | - grep -v -w "" | - grep -v -w "" | - LANG= LC_ALL= sort | - sed 's:([0-9])::g' > $lang_dir/lexicon_words.txt + fi + cat download/tedlium3/TEDLIUM.152k.dic | \ + grep -v -w "" | \ + grep -v -w "" | \ + grep -v -w "" | \ + LANG= LC_ALL= sort | \ + sed 's:([0-9])::g' > $lang_dir/lexicon_words.txt (echo ' '; ) | cat - $lang_dir/lexicon_words.txt | @@ -174,69 +175,3 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then fi done fi - -if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then - log "Stage 7: Prepare bigram P" - - for vocab_size in ${vocab_sizes[@]}; do - lang_dir=data/lang_bpe_${vocab_size} - - if [ ! -f $lang_dir/transcript_tokens.txt ]; then - ./local/convert_transcript_words_to_tokens.py \ - --lexicon $lang_dir/lexicon.txt \ - --transcript $lang_dir/transcript_words.txt \ - --oov "" \ - > $lang_dir/transcript_tokens.txt - fi - - if [ ! -f $lang_dir/P.arpa ]; then - ./shared/make_kn_lm.py \ - -ngram-order 2 \ - -text $lang_dir/transcript_tokens.txt \ - -lm $lang_dir/P.arpa - fi - - if [ ! -f $lang_dir/P.fst.txt ]; then - python3 -m kaldilm \ - --read-symbol-table="$lang_dir/tokens.txt" \ - --disambig-symbol='#0' \ - --max-order=2 \ - $lang_dir/P.arpa > $lang_dir/P.fst.txt - fi - done -fi - -if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then - log "Stage 8: Prepare G" - # We assume you have install kaldilm, if not, please install - # it using: pip install kaldilm - - mkdir -p data/lm - if [ ! 
-f data/lm/G_3_gram.fst.txt ]; then - # It is used in building HLG - python3 -m kaldilm \ - --read-symbol-table="data/lang_phone/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - data/lm/lm_3_gram.arpa > data/lm/G_3_gram.fst.txt - fi - - if [ ! -f data/lm/G_4_gram.fst.txt ]; then - # It is used for LM rescoring - python3 -m kaldilm \ - --read-symbol-table="data/lang_phone/words.txt" \ - --disambig-symbol='#0' \ - --max-order=4 \ - data/lm/lm_4_gram.arpa > data/lm/G_4_gram.fst.txt - fi -fi -echo 'completing the G building....' -if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then - log "Stage 9: Compile HLG" - ./local/compile_hlg.py --lang-dir data/lang_phone - - for vocab_size in ${vocab_sizes[@]}; do - lang_dir=data/lang_bpe_${vocab_size} - ./local/compile_hlg.py --lang-dir $lang_dir - done -fi diff --git a/egs/tedlium3/ASR/transducer_stateless/README.md b/egs/tedlium3/ASR/transducer_stateless/README.md index 90c08c848..93af553ec 100644 --- a/egs/tedlium3/ASR/transducer_stateless/README.md +++ b/egs/tedlium3/ASR/transducer_stateless/README.md @@ -16,5 +16,5 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3" --num-epochs 30 \ --start-epoch 0 \ --exp-dir transducer_stateless/exp \ - --max-duration 200 \ + --max-duration 200 ``` diff --git a/egs/tedlium3/ASR/transducer_stateless/decode.py b/egs/tedlium3/ASR/transducer_stateless/decode.py index 78d1d52ae..c566132b0 100755 --- a/egs/tedlium3/ASR/transducer_stateless/decode.py +++ b/egs/tedlium3/ASR/transducer_stateless/decode.py @@ -34,6 +34,15 @@ Usage: --max-duration 100 \ --decoding-method beam_search \ --beam-size 4 + +(3) modified beam search +./transducer_stateless/decode.py \ + --epoch 29 \ + --avg 16 \ + --exp-dir ./transducer_stateless/exp \ + --max-duration 100 \ + --decoding-method modified_beam_search \ + --beam-size 4 """ diff --git a/egs/tedlium3/ASR/transducer_stateless/export.py b/egs/tedlium3/ASR/transducer_stateless/export.py index 44392c870..6a40a1b4f 100644 --- 
a/egs/tedlium3/ASR/transducer_stateless/export.py +++ b/egs/tedlium3/ASR/transducer_stateless/export.py @@ -39,7 +39,7 @@ To use the generated file with `transducer_stateless/decode.py`, you can do: --exp-dir ./transducer_stateless/exp \ --epoch 9999 \ --avg 1 \ - --max-duration 1 \ + --max-duration 100 \ --bpe-model data/lang_bpe_500/bpe.model """ diff --git a/egs/tedlium3/ASR/transducer_stateless/pretrained.py b/egs/tedlium3/ASR/transducer_stateless/pretrained.py index 77bdd0ca6..c0e3bb844 100644 --- a/egs/tedlium3/ASR/transducer_stateless/pretrained.py +++ b/egs/tedlium3/ASR/transducer_stateless/pretrained.py @@ -25,7 +25,7 @@ Usage: --method greedy_search \ --max-sym-per-frame 1 \ /path/to/foo.wav \ - /path/to/bar.wav \ + /path/to/bar.wav (2) beam search ./transducer_stateless/pretrained.py \ @@ -34,7 +34,7 @@ Usage: --method beam_search \ --beam-size 4 \ /path/to/foo.wav \ - /path/to/bar.wav \ + /path/to/bar.wav (3) modified beam search ./transducer_stateless/pretrained.py \ @@ -43,7 +43,7 @@ Usage: --method modified_beam_search \ --beam-size 4 \ /path/to/foo.wav \ - /path/to/bar.wav \ + /path/to/bar.wav You can also use `./transducer_stateless/exp/epoch-xx.pt`. diff --git a/egs/tedlium3/ASR/transducer_stateless/train.py b/egs/tedlium3/ASR/transducer_stateless/train.py index b3dba4409..52e175273 100755 --- a/egs/tedlium3/ASR/transducer_stateless/train.py +++ b/egs/tedlium3/ASR/transducer_stateless/train.py @@ -397,7 +397,6 @@ def compute_loss( feature_lens = supervisions["num_frames"].to(device) texts = batch["supervisions"]["text"] - unk_id = params.unk_id y = convert_texts_into_ids(texts, unk_id, sp=sp) y = k2.RaggedTensor(y).to(device)