fix comments

luomingshuang 2022-03-07 10:13:03 +08:00
parent 973149d3bc
commit 2cbba6901e
9 changed files with 124 additions and 84 deletions

View File

@ -70,7 +70,7 @@ def compute_fbank_tedlium():
cut_set = CutSet.from_manifests(
recordings=m["recordings"],
supervisions=m["supervisions"],
).trim_to_supervisions(keep_overlapping=False)
)
if "train" in partition:
cut_set = (
cut_set
@ -85,6 +85,8 @@ def compute_fbank_tedlium():
executor=ex,
storage_type=ChunkedLilcomHdf5Writer,
)
# Split long cuts into many short and non-overlapping cuts
cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
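The reordering matters: fbank features are now computed on the full, untrimmed cuts first, and only afterwards are the cuts split at supervision boundaries. A condensed sketch of the resulting flow inside `compute_fbank_tedlium()` (here `m`, `output_dir`, and `partition` come from the surrounding script, and the mel-bin count is assumed):

```python
from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig

# Build cuts that still span the whole recordings.
cut_set = CutSet.from_manifests(
    recordings=m["recordings"],
    supervisions=m["supervisions"],
)
# Compute and store fbank features on the full cuts first ...
cut_set = cut_set.compute_and_store_features(
    extractor=Fbank(FbankConfig(num_mel_bins=80)),  # 80 bins assumed
    storage_path=f"{output_dir}/feats_{partition}",
    storage_type=ChunkedLilcomHdf5Writer,
)
# ... then split long cuts into short, non-overlapping
# per-supervision cuts that reference the stored features.
cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
```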

View File

@ -42,7 +42,7 @@ def convert_texts_into_ids(
texts: List[str],
unk_id: int,
sp: spm.SentencePieceProcessor,
) -> List[int]:
) -> List[List[int]]:
"""
Args:
texts:
@ -50,7 +50,7 @@ def convert_texts_into_ids(
unk_id:
The integer id of the token '<unk>'.
Returns:
Return a integer list of bpe ids.
Return a list of integer lists, one list of BPE ids per input text.
"""
y = []
for text in texts:
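For context, here is a minimal sketch consistent with the corrected annotation: each transcript is encoded on its own, so the function returns one id list per text rather than a flat list. The `<unk>` handling shown (mapping the literal token to `unk_id`) is an assumption for illustration, not necessarily the recipe's exact logic:

```python
from typing import List

import sentencepiece as spm


def convert_texts_into_ids_sketch(
    texts: List[str],
    unk_id: int,
    sp: spm.SentencePieceProcessor,
) -> List[List[int]]:
    y = []
    for text in texts:
        ids: List[int] = []
        for word in text.split():
            if word == "<unk>":
                # Assumed: map the literal '<unk>' token to unk_id.
                ids.append(unk_id)
            else:
                ids.extend(sp.encode(word, out_type=int))
        y.append(ids)
    return y
```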

View File

@ -0,0 +1,95 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Mingshuang Luo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script takes as input the supervisions JSON dir "data/manifests"
containing supervisions_train.json and does the following:
1. Generate train.text.
"""
import argparse
import json
import logging
from pathlib import Path


def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--manifests-dir",
type=str,
help="""Input directory.
""",
)
parser.add_argument(
"--lang-dir",
type=str,
help="""Output directory.
""",
)
return parser.parse_args()


def prepare_transcripts(manifests_dir: Path, lang_dir: Path) -> None:
"""
Args:
manifests_dir:
The manifests directory, e.g., data/manifests.
lang_dir:
The language directory, e.g., data/lang_phone.
Returns:
None. The transcripts are written to train.text in lang_dir.
"""
texts = []
supervisions_train = Path(manifests_dir) / "supervisions_train.json"
train_text = Path(lang_dir) / "train.text"
logging.info(f"Loading {supervisions_train}!")
with open(supervisions_train, "r") as load_f:
load_dicts = json.load(load_f)
for load_dict in load_dicts:
text = load_dict["text"]
texts.append(text)
with open(train_text, "w") as f:
for text in texts:
f.write(text)
f.write("\n")


def main():
args = get_args()
manifests_dir = Path(args.manifests_dir)
lang_dir = Path(args.lang_dir)
logging.info("Generating train.text")
prepare_transcripts(manifests_dir, lang_dir)


if __name__ == "__main__":
formatter = (
"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
)
logging.basicConfig(format=formatter, level=logging.INFO)
main()
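A toy illustration of the data this script consumes and produces (file contents below are hypothetical):

```python
import json

# Miniature stand-in for data/manifests/supervisions_train.json:
# a JSON list of dicts, each carrying a "text" field.
toy_supervisions = [
    {"id": "talk1-0001", "text": "hello world"},
    {"id": "talk1-0002", "text": "good morning"},
]
with open("supervisions_train.json", "w") as f:
    json.dump(toy_supervisions, f)

# prepare_transcripts() would then write a train.text containing:
#   hello world
#   good morning
```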

View File

@ -71,13 +71,14 @@ fi
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
log "Stage 0: Download data"
# If you have pre-downloaded it to /path/to/LibriSpeech,
# If you have pre-downloaded it to /path/to/tedlium3,
# you can create a symlink
#
# ln -sfv /path/to/tedlium3 $dl_dir/tedlium3
#
if [ ! -d $dl_dir/tedlium ]; then
if [ ! -d $dl_dir/tedlium3 ]; then
lhotse download tedlium $dl_dir
mv $dl_dir/TEDLIUM_release-3 $dl_dir/tedlium3
fi
# If you have pre-downloaded it to /path/to/musan,
@ -127,13 +128,13 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
./local/prepare_transcripts.py \
--lang-dir $lang_dir \
--manifests-dir data/manifests
cat download/tedlium3/TEDLIUM.152k.dic |
grep -v -w "<s>" |
grep -v -w "</s>" |
grep -v -w "<unk>" |
LANG= LC_ALL= sort |
sed 's:([0-9])::g' > $lang_dir/lexicon_words.txt
fi
cat download/tedlium3/TEDLIUM.152k.dic | \
grep -v -w "<s>" | \
grep -v -w "</s>" | \
grep -v -w "<unk>" | \
LANG= LC_ALL= sort | \
sed 's:([0-9])::g' > $lang_dir/lexicon_words.txt
(echo '<UNK> <UNK>'; ) |
cat - $lang_dir/lexicon_words.txt |
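The pipeline above removes the `<s>`, `</s>`, and `<unk>` entries from the dictionary, sorts byte-wise (empty `LANG`/`LC_ALL` select the C locale), and strips pronunciation-variant markers such as `(2)`. A rough per-line Python equivalent, for illustration only:

```python
import re
from typing import Optional

EXCLUDED = {"<s>", "</s>", "<unk>"}


def clean_dict_line(line: str) -> Optional[str]:
    """Mimic the grep/sed filtering for one TEDLIUM.152k.dic line."""
    parts = line.split(maxsplit=1)
    # grep -v -w drops the sentence-boundary and unknown-word entries.
    if not parts or parts[0] in EXCLUDED:
        return None
    # sed 's:([0-9])::g' strips variant markers like "(2)".
    return re.sub(r"\([0-9]\)", "", line)
```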
@ -174,69 +175,3 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
fi
done
fi
if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
log "Stage 7: Prepare bigram P"
for vocab_size in ${vocab_sizes[@]}; do
lang_dir=data/lang_bpe_${vocab_size}
if [ ! -f $lang_dir/transcript_tokens.txt ]; then
./local/convert_transcript_words_to_tokens.py \
--lexicon $lang_dir/lexicon.txt \
--transcript $lang_dir/transcript_words.txt \
--oov "<UNK>" \
> $lang_dir/transcript_tokens.txt
fi
if [ ! -f $lang_dir/P.arpa ]; then
./shared/make_kn_lm.py \
-ngram-order 2 \
-text $lang_dir/transcript_tokens.txt \
-lm $lang_dir/P.arpa
fi
if [ ! -f $lang_dir/P.fst.txt ]; then
python3 -m kaldilm \
--read-symbol-table="$lang_dir/tokens.txt" \
--disambig-symbol='#0' \
--max-order=2 \
$lang_dir/P.arpa > $lang_dir/P.fst.txt
fi
done
fi
if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
log "Stage 8: Prepare G"
# We assume you have installed kaldilm. If not, please install
# it using: pip install kaldilm
mkdir -p data/lm
if [ ! -f data/lm/G_3_gram.fst.txt ]; then
# It is used in building HLG
python3 -m kaldilm \
--read-symbol-table="data/lang_phone/words.txt" \
--disambig-symbol='#0' \
--max-order=3 \
data/lm/lm_3_gram.arpa > data/lm/G_3_gram.fst.txt
fi
if [ ! -f data/lm/G_4_gram.fst.txt ]; then
# It is used for LM rescoring
python3 -m kaldilm \
--read-symbol-table="data/lang_phone/words.txt" \
--disambig-symbol='#0' \
--max-order=4 \
data/lm/lm_4_gram.arpa > data/lm/G_4_gram.fst.txt
fi
fi
echo 'completing the G building....'
if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
log "Stage 9: Compile HLG"
./local/compile_hlg.py --lang-dir data/lang_phone
for vocab_size in ${vocab_sizes[@]}; do
lang_dir=data/lang_bpe_${vocab_size}
./local/compile_hlg.py --lang-dir $lang_dir
done
fi

View File

@ -16,5 +16,5 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3"
--num-epochs 30 \
--start-epoch 0 \
--exp-dir transducer_stateless/exp \
--max-duration 200 \
--max-duration 200
```

View File

@ -34,6 +34,15 @@ Usage:
--max-duration 100 \
--decoding-method beam_search \
--beam-size 4
(3) modified beam search
./transducer_stateless/decode.py \
--epoch 29 \
--avg 16 \
--exp-dir ./transducer_stateless/exp \
--max-duration 100 \
--decoding-method modified_beam_search \
--beam-size 4
"""

View File

@ -39,7 +39,7 @@ To use the generated file with `transducer_stateless/decode.py`, you can do:
--exp-dir ./transducer_stateless/exp \
--epoch 9999 \
--avg 1 \
--max-duration 1 \
--max-duration 100 \
--bpe-model data/lang_bpe_500/bpe.model
"""

View File

@ -25,7 +25,7 @@ Usage:
--method greedy_search \
--max-sym-per-frame 1 \
/path/to/foo.wav \
/path/to/bar.wav \
/path/to/bar.wav
(2) beam search
./transducer_stateless/pretrained.py \
@ -34,7 +34,7 @@ Usage:
--method beam_search \
--beam-size 4 \
/path/to/foo.wav \
/path/to/bar.wav \
/path/to/bar.wav
(3) modified beam search
./transducer_stateless/pretrained.py \
@ -43,7 +43,7 @@ Usage:
--method modified_beam_search \
--beam-size 4 \
/path/to/foo.wav \
/path/to/bar.wav \
/path/to/bar.wav
You can also use `./transducer_stateless/exp/epoch-xx.pt`.

View File

@ -397,7 +397,6 @@ def compute_loss(
feature_lens = supervisions["num_frames"].to(device)
texts = batch["supervisions"]["text"]
unk_id = params.unk_id
y = convert_texts_into_ids(texts, unk_id, sp=sp)
y = k2.RaggedTensor(y).to(device)
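The nested id lists returned by `convert_texts_into_ids` map directly onto a 2-axis ragged tensor; a toy example with made-up ids:

```python
import k2

# Two utterances with different numbers of BPE ids.
y = [[5, 9, 13], [7, 2]]
ragged = k2.RaggedTensor(y)  # axes: [utterance][token]
```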