add lm preparation

2025-08-12 03:22:19 +00:00 · 2023-12-19 19:05:48 +08:00 · 2023-12-19 19:05:48 +08:00 · 77d8a15288
commit 77d8a15288
parent eb79f1eceb
4 changed files with 235 additions and 14 deletions
--- a/egs/icmcasr/ASR/local/compile_lg.py
+++ b/egs/icmcasr/ASR/local/compile_lg.py
@ -0,0 +1,147 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang, Wei Kang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This script takes as input lang_dir and generates LG from
    - L, the lexicon, built from lang_dir/L_disambig.pt
        Caution: We use a lexicon that contains disambiguation symbols
    - G, the LM, built from data/lm/G_3_gram.fst.txt
        (--lm selects another stem; data/lm/<stem>.pt is used when present)
 The generated LG is saved in $lang_dir/LG.pt
 """
 import argparse
 import logging
 from pathlib import Path
 import k2
 import torch
 from icefall.lexicon import Lexicon
 def get_args() -> argparse.Namespace:
    """Parse the command-line options of this script.

    Returns:
      The parsed namespace with two attributes:
        - lang_dir: value of the required ``--lang-dir`` option.
        - lm: value of ``--lm``; ``"G_3_gram"`` when the option is omitted.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--lang-dir",
        type=str,
        # Without a value main() would call Path(None) and die with a
        # TypeError; let argparse report the missing option instead.
        required=True,
        help="""Input and output directory.
        """,
    )
    parser.add_argument(
        "--lm",
        type=str,
        default="G_3_gram",
        help="""Stem name for LM used in LG compiling.
        """,
    )
    return parser.parse_args()
 def compile_LG(lang_dir: str, lm: str = "G_3_gram") -> k2.Fsa:
    """Build the LG graph by composing the lexicon L with the LM G.

    Args:
      lang_dir:
        The language directory, e.g., data/lang_phone or data/lang_bpe_5000.
        L_disambig.pt is loaded from it.
      lm:
        Stem name of the LM. data/lm/{lm}.pt is loaded when that file
        exists; otherwise data/lm/{lm}.fst.txt is parsed and the result is
        saved to data/lm/{lm}.pt.
    Return:
      An FSA representing LG.
    """
    lexicon = Lexicon(lang_dir)
    lexicon_fsa = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt"))

    cached_lm = f"data/lm/{lm}.pt"
    if not Path(cached_lm).is_file():
        logging.info(f"Loading {lm}.fst.txt")
        with open(f"data/lm/{lm}.fst.txt") as fst_txt:
            grammar_fsa = k2.Fsa.from_openfst(fst_txt.read(), acceptor=False)
            # Keep a .pt copy so that the next call takes the other branch.
            torch.save(grammar_fsa.as_dict(), cached_lm)
    else:
        logging.info(f"Loading pre-compiled {lm}")
        grammar_fsa = k2.Fsa.from_dict(torch.load(cached_lm))

    # IDs of "#0" in the token and word tables; every label at or above
    # them is zeroed further down.
    token_disambig_start = lexicon.token_table["#0"]
    word_disambig_start = lexicon.word_table["#0"]

    lexicon_fsa = k2.arc_sort(lexicon_fsa)
    grammar_fsa = k2.arc_sort(grammar_fsa)

    logging.info("Intersecting L and G")
    lg = k2.compose(lexicon_fsa, grammar_fsa)
    logging.info(f"LG shape: {lg.shape}")

    logging.info("Connecting LG")
    lg = k2.connect(lg)
    logging.info(f"LG shape after k2.connect: {lg.shape}")

    logging.info(type(lg.aux_labels))
    logging.info("Determinizing LG")
    lg = k2.determinize(lg, k2.DeterminizeWeightPushingType.kLogWeightPushing)
    logging.info(type(lg.aux_labels))

    logging.info("Connecting LG after k2.determinize")
    lg = k2.connect(lg)

    logging.info("Removing disambiguation symbols on LG")
    # The labels are read into a local, masked and assigned back instead of
    # indexing the attribute in one expression;
    # see https://github.com/k2-fsa/k2/pull/1140
    token_labels = lg.labels
    token_labels[token_labels >= token_disambig_start] = 0
    lg.labels = token_labels

    assert isinstance(lg.aux_labels, k2.RaggedTensor)
    word_labels = lg.aux_labels.values
    word_labels[word_labels >= word_disambig_start] = 0

    lg = k2.remove_epsilon(lg)
    logging.info(f"LG shape after k2.remove_epsilon: {lg.shape}")

    lg = k2.connect(lg)
    lg.aux_labels = lg.aux_labels.remove_values_eq(0)

    logging.info("Arc sorting LG")
    return k2.arc_sort(lg)
 def main():
    """Compile LG for the --lang-dir directory unless LG.pt already exists."""
    args = get_args()
    lang_dir = Path(args.lang_dir)

    if not (lang_dir / "LG.pt").is_file():
        logging.info(f"Processing {lang_dir}")
        lg_graph = compile_LG(lang_dir, args.lm)

        logging.info(f"Saving LG.pt to {lang_dir}")
        torch.save(lg_graph.as_dict(), f"{lang_dir}/LG.pt")
        return

    # An existing LG.pt is never overwritten.
    logging.info(f"{lang_dir}/LG.pt already exists - skipping")
 if __name__ == "__main__":
    # Every record carries timestamp, level and the emitting file:line.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )
    main()
--- a/egs/icmcasr/ASR/local/text2segments.py
+++ b/egs/icmcasr/ASR/local/text2segments.py
@ -0,0 +1 @@
 ../../../wenetspeech/ASR/local/text2segments.py
--- a/egs/icmcasr/ASR/local/text2token.py
+++ b/egs/icmcasr/ASR/local/text2token.py
@ -0,0 +1 @@
 ../../../wenetspeech/ASR/local/text2token.py
--- a/egs/icmcasr/ASR/prepare.sh
+++ b/egs/icmcasr/ASR/prepare.sh
@ -6,8 +6,8 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 set -eou pipefail
 nj=15
-stage=4
+stage=8
-stop_stage=4
+stop_stage=8
 # We assume dl_dir (download dir) contains the following
 # directories and files. If not, they will be downloaded
@ -34,9 +34,9 @@ dl_dir=$PWD/download
 # It will generate data/lang_bbpe_xxx,
 # data/lang_bbpe_yyy if the array contains xxx, yyy
 vocab_sizes=(
-  # 2000
+  2000
  # 1000
-  500
+  # 500
 )
 # All files generated by this script are saved in "data".
@ -103,19 +103,91 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  fi
 fi
-lang_phone_dir=data/lang_phone
+lang_char_dir=data/lang_char
 if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "Stage 6: Prepare G.fst"
+  log "Stage 6: Prepare char based lang"
-  mkdir -p $lang_phone_dir
+  mkdir -p $lang_char_dir
-  (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
+  if ! which jq; then
-    cat - $dl_dir/icmcasr/resource_icmcasr/lexicon.txt |
+      echo "This script is intended to be used with jq but you have not installed jq
-    sort | uniq > $lang_phone_dir/lexicon.txt
+      Note: in Linux, you can install jq with the following command:
      1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
      2. chmod +x ./jq
      3. cp jq /usr/bin" && exit 1
  fi
  if [ ! -f $lang_char_dir/text ] || [ ! -s $lang_char_dir/text ]; then
    log "Prepare text."
    gunzip -c data/manifests/icmcasr-ihm_supervisions_train.jsonl.gz \
      | jq '.text' | sed 's/"//g' \
      | ./local/text2token.py -t "char" > $lang_char_dir/text
  fi
-  ./local/generate_unique_lexicon.py --lang-dir $lang_phone_dir
+  # Run Chinese word segmentation on the text.
-
+  # This takes about 15 minutes.
-  if [ ! -f $lang_phone_dir/L_disambig.pt ]; then
+  if [ ! -f $lang_char_dir/text_words_segmentation ]; then
-    ./local/prepare_lang.py --lang-dir $lang_phone_dir
+    python3 ./local/text2segments.py \
      --num-process $nj \
      --input-file $lang_char_dir/text \
      --output-file $lang_char_dir/text_words_segmentation
  fi
  # words.txt is linked from the wenetspeech recipe, so test for the file
  # there; testing the link destination aborts on every fresh run.
  wenetspeech_words=../../wenetspeech/ASR/data/lang_char/words.txt
  if [ ! -f $wenetspeech_words ]; then
    log "Abort! Please run ../../wenetspeech/ASR/prepare.sh"
    exit 1
  fi
  if [ ! -e $lang_char_dir/words.txt ]; then
    # The subshell leaves the working directory of the script untouched, so
    # later ./local and ./shared invocations still resolve. The link target
    # is relative to $lang_char_dir.
    (cd $lang_char_dir && ln -sf ../../../../wenetspeech/ASR/data/lang_char/words.txt .)
  fi
 fi
 if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
  log "Stage 7: Prepare G"
  mkdir -p data/lm
  # The same two steps are run for the 3-gram and the 5-gram LM:
  #   1. estimate an unpruned ARPA LM from the segmented text;
  #   2. convert it to a text FST (it is used in building LG).
  for order in 3 5; do
    arpa=$lang_char_dir/${order}-gram.unpruned.arpa
    fst_txt=data/lm/G_${order}_gram.fst.txt
    if [ ! -f $arpa ]; then
      python3 ./shared/make_kn_lm.py \
        -ngram-order $order \
        -text $lang_char_dir/text_words_segmentation \
        -lm $arpa
    fi
    if [ ! -f $fst_txt ]; then
      # Redirect into a temporary file and rename on success. Otherwise a
      # failed kaldilm run leaves an empty/partial $fst_txt behind and the
      # -f test above would skip regenerating it on the next run.
      python3 -m kaldilm \
        --read-symbol-table="$lang_char_dir/words.txt" \
        --disambig-symbol='#0' \
        --max-order=$order \
        $arpa > $fst_txt.tmp
      mv $fst_txt.tmp $fst_txt
    fi
  done
 fi
 if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
  log "Stage 8: Compile LG"
  # data/lang_bpe_2000 is not created by this script; it has to be present
  # already (the abort message points to the multi_zh-hans recipe, e.g. as
  # a symlink to that recipe's data/lang_bpe_2000).
  if [ ! -d data/lang_bpe_2000/ ]; then
    log "Abort! Please run ../../multi_zh-hans/ASR/prepare.sh"
    exit 1
  else
    log "data/lang_bpe_2000/ exists"
  fi
  lang_dir=data/lang_bpe_2000
  # Uses the default LM stem (G_3_gram).
  python3 ./local/compile_lg.py --lang-dir $lang_dir
  #python3 ./local/compile_lg.py --lang-dir $lang_dir --lm G_5_gram
 fi
		`@ -0,0 +1 @@`
							`../../../wenetspeech/ASR/local/text2segments.py`
		`@ -0,0 +1 @@`
							`../../../wenetspeech/ASR/local/text2token.py`