diff --git a/egs/icmcasr/ASR/local/compile_lg.py b/egs/icmcasr/ASR/local/compile_lg.py
new file mode 100644
index 000000000..709b14070
--- /dev/null
+++ b/egs/icmcasr/ASR/local/compile_lg.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, Wei Kang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This script takes as input lang_dir and generates LG from
+
+    - L, the lexicon, built from lang_dir/L_disambig.pt
+
+      Caution: We use a lexicon that contains disambiguation symbols
+
+    - G, the LM, built from data/lm/G_3_gram.fst.txt
+
+The generated LG is saved in $lang_dir/LG.pt
+"""
+import argparse
+import logging
+from pathlib import Path
+
+import k2
+import torch
+
+from icefall.lexicon import Lexicon
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        """,
+    )
+    parser.add_argument(
+        "--lm",
+        type=str,
+        default="G_3_gram",
+        help="""Stem name for the LM used in LG compiling.
+        """,
+    )
+
+    return parser.parse_args()
+
+
+def compile_LG(lang_dir: str, lm: str = "G_3_gram") -> k2.Fsa:
+    """
+    Args:
+      lang_dir:
+        The language directory, e.g., data/lang_phone or data/lang_bpe_5000.
+
+    Return:
+      An FSA representing LG.
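+
+      Note: the construction below is compose(L, G) -> connect ->
+      determinize -> remove disambiguation symbols -> remove_epsilon
+      -> connect -> arc_sort.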
+ """ + lexicon = Lexicon(lang_dir) + L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt")) + + if Path(f"data/lm/{lm}.pt").is_file(): + logging.info(f"Loading pre-compiled {lm}") + d = torch.load(f"data/lm/{lm}.pt") + G = k2.Fsa.from_dict(d) + else: + logging.info(f"Loading {lm}.fst.txt") + with open(f"data/lm/{lm}.fst.txt") as f: + G = k2.Fsa.from_openfst(f.read(), acceptor=False) + torch.save(G.as_dict(), f"data/lm/{lm}.pt") + + first_token_disambig_id = lexicon.token_table["#0"] + first_word_disambig_id = lexicon.word_table["#0"] + + L = k2.arc_sort(L) + G = k2.arc_sort(G) + + logging.info("Intersecting L and G") + LG = k2.compose(L, G) + logging.info(f"LG shape: {LG.shape}") + + logging.info("Connecting LG") + LG = k2.connect(LG) + logging.info(f"LG shape after k2.connect: {LG.shape}") + + logging.info(type(LG.aux_labels)) + logging.info("Determinizing LG") + + LG = k2.determinize(LG, k2.DeterminizeWeightPushingType.kLogWeightPushing) + logging.info(type(LG.aux_labels)) + + logging.info("Connecting LG after k2.determinize") + LG = k2.connect(LG) + + logging.info("Removing disambiguation symbols on LG") + + # LG.labels[LG.labels >= first_token_disambig_id] = 0 + # see https://github.com/k2-fsa/k2/pull/1140 + labels = LG.labels + labels[labels >= first_token_disambig_id] = 0 + LG.labels = labels + + assert isinstance(LG.aux_labels, k2.RaggedTensor) + LG.aux_labels.values[LG.aux_labels.values >= first_word_disambig_id] = 0 + + LG = k2.remove_epsilon(LG) + logging.info(f"LG shape after k2.remove_epsilon: {LG.shape}") + + LG = k2.connect(LG) + LG.aux_labels = LG.aux_labels.remove_values_eq(0) + + logging.info("Arc sorting LG") + LG = k2.arc_sort(LG) + + return LG + + +def main(): + args = get_args() + lang_dir = Path(args.lang_dir) + + if (lang_dir / "LG.pt").is_file(): + logging.info(f"{lang_dir}/LG.pt already exists - skipping") + return + + logging.info(f"Processing {lang_dir}") + + LG = compile_LG(lang_dir, args.lm) + logging.info(f"Saving LG.pt to {lang_dir}") + torch.save(LG.as_dict(), f"{lang_dir}/LG.pt") + + +if __name__ == "__main__": + formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + + logging.basicConfig(format=formatter, level=logging.INFO) + + main() diff --git a/egs/icmcasr/ASR/local/text2segments.py b/egs/icmcasr/ASR/local/text2segments.py new file mode 120000 index 000000000..7d68a39c3 --- /dev/null +++ b/egs/icmcasr/ASR/local/text2segments.py @@ -0,0 +1 @@ +../../../wenetspeech/ASR/local/text2segments.py \ No newline at end of file diff --git a/egs/icmcasr/ASR/local/text2token.py b/egs/icmcasr/ASR/local/text2token.py new file mode 120000 index 000000000..ce5cfd537 --- /dev/null +++ b/egs/icmcasr/ASR/local/text2token.py @@ -0,0 +1 @@ +../../../wenetspeech/ASR/local/text2token.py \ No newline at end of file diff --git a/egs/icmcasr/ASR/prepare.sh b/egs/icmcasr/ASR/prepare.sh index 1de9562a9..77bad4adb 100755 --- a/egs/icmcasr/ASR/prepare.sh +++ b/egs/icmcasr/ASR/prepare.sh @@ -6,8 +6,8 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python set -eou pipefail nj=15 -stage=4 -stop_stage=4 +stage=8 +stop_stage=8 # We assume dl_dir (download dir) contains the following # directories and files. If not, they will be downloaded @@ -34,9 +34,9 @@ dl_dir=$PWD/download # It will generate data/lang_bbpe_xxx, # data/lang_bbpe_yyy if the array contains xxx, yyy vocab_sizes=( - # 2000 + 2000 # 1000 - 500 + # 500 ) # All files generated by this script are saved in "data". 
@@ -103,19 +103,94 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   fi
 fi
 
-lang_phone_dir=data/lang_phone
+lang_char_dir=data/lang_char
 if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "Stage 6: Prepare G.fst"
-  mkdir -p $lang_phone_dir
+  log "Stage 6: Prepare char based lang"
+  mkdir -p $lang_char_dir
 
-  (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
-    cat - $dl_dir/icmcasr/resource_icmcasr/lexicon.txt |
-    sort | uniq > $lang_phone_dir/lexicon.txt
+  if ! which jq; then
+    echo "This script requires jq, but jq is not installed.
+    Note: on Linux, you can install jq with the following commands:
+    1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
+    2. chmod +x ./jq
+    3. cp jq /usr/bin" && exit 1
+  fi
+  if [ ! -f $lang_char_dir/text ] || [ ! -s $lang_char_dir/text ]; then
+    log "Prepare text."
+    gunzip -c data/manifests/icmcasr-ihm_supervisions_train.jsonl.gz \
+      | jq '.text' | sed 's/"//g' \
+      | ./local/text2token.py -t "char" > $lang_char_dir/text
+  fi
 
-  ./local/generate_unique_lexicon.py --lang-dir $lang_phone_dir
-
-  if [ ! -f $lang_phone_dir/L_disambig.pt ]; then
-    ./local/prepare_lang.py --lang-dir $lang_phone_dir
+  # Chinese word segmentation for the text;
+  # it takes about 15 minutes.
+  if [ ! -f $lang_char_dir/text_words_segmentation ]; then
+    python3 ./local/text2segments.py \
+      --num-process $nj \
+      --input-file $lang_char_dir/text \
+      --output-file $lang_char_dir/text_words_segmentation
+  fi
+
+  if [ -f ../../wenetspeech/ASR/data/lang_char/words.txt ]; then
+    cd $lang_char_dir
+    ln -sf ../../../../wenetspeech/ASR/data/lang_char/words.txt .
+    cd ../..
+  else
+    log "Abort! Please run ../../wenetspeech/ASR/prepare.sh"
+    exit 1
   fi
 fi
 
+if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
+  log "Stage 7: Prepare G"
+  if [ ! -f $lang_char_dir/3-gram.unpruned.arpa ]; then
+    python3 ./shared/make_kn_lm.py \
+      -ngram-order 3 \
+      -text $lang_char_dir/text_words_segmentation \
+      -lm $lang_char_dir/3-gram.unpruned.arpa
+  fi
+
+  mkdir -p data/lm
+  if [ ! -f data/lm/G_3_gram.fst.txt ]; then
+    # It is used in building LG
+    python3 -m kaldilm \
+      --read-symbol-table="$lang_char_dir/words.txt" \
+      --disambig-symbol='#0' \
+      --max-order=3 \
+      $lang_char_dir/3-gram.unpruned.arpa > data/lm/G_3_gram.fst.txt
+  fi
+
+  if [ ! -f $lang_char_dir/5-gram.unpruned.arpa ]; then
+    python3 ./shared/make_kn_lm.py \
+      -ngram-order 5 \
+      -text $lang_char_dir/text_words_segmentation \
+      -lm $lang_char_dir/5-gram.unpruned.arpa
+  fi
+
+  if [ ! -f data/lm/G_5_gram.fst.txt ]; then
+    # It is used in building LG
+    python3 -m kaldilm \
+      --read-symbol-table="$lang_char_dir/words.txt" \
+      --disambig-symbol='#0' \
+      --max-order=5 \
+      $lang_char_dir/5-gram.unpruned.arpa > data/lm/G_5_gram.fst.txt
+  fi
+fi
+
+if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
+  log "Stage 8: Compile LG"
+  if [ ! -d data/lang_bpe_2000/ ]; then
+    if [ ! -d ../../multi_zh-hans/ASR/data/lang_bpe_2000 ]; then
+      log "Abort! Please run ../../multi_zh-hans/ASR/prepare.sh"
+      exit 1
+    fi
+    cd data
+    ln -s ../../../../multi_zh-hans/ASR/data/lang_bpe_2000 .
+    cd ..
+  else
+    log "data/lang_bpe_2000/ exists"
+  fi
+  lang_dir=data/lang_bpe_2000
+  python3 ./local/compile_lg.py --lang-dir $lang_dir
+  #python3 ./local/compile_lg.py --lang-dir $lang_dir --lm G_5_gram
+fi
\ No newline at end of file
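
For reference, the LG graph produced in stage 8 can be sanity-checked from Python. The sketch below is illustrative only and is not part of the patch; it assumes compile_lg.py has been run with --lang-dir data/lang_bpe_2000, so that data/lang_bpe_2000/LG.pt exists, and it uses only the k2/torch calls already seen in compile_lg.py plus standard k2.Fsa properties:

#!/usr/bin/env python3
# Illustrative sketch, not part of the patch: load and inspect the LG
# graph written by ./local/compile_lg.py. The path assumes the script
# was run with --lang-dir data/lang_bpe_2000.
import k2
import torch

d = torch.load("data/lang_bpe_2000/LG.pt")
LG = k2.Fsa.from_dict(d)

# compile_LG() returns a connected, epsilon-free, arc-sorted FSA,
# so a quick look at its size and properties is a cheap sanity check.
print("shape:", LG.shape)
print("num_arcs:", LG.num_arcs)
print("properties:", LG.properties_str)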