Add char-based language model training process for aishell. (#945)

* Add char-based language model training process for aishell. Add soft link from librispeech/ASR/local/sort_lm_training_data.py to aishell/ASR/local/ --------- Co-authored-by: lichao <www.563042811@qq.com>
2023-03-16 09:52:11 +08:00 · 2023-03-16 09:52:11 +08:00 · 6196b4a407
commit 6196b4a407
parent a48812ddb3
2 changed files with 255 additions and 1 deletions
--- a/egs/aishell/ASR/local/prepare_char_lm_training_data.py
+++ b/egs/aishell/ASR/local/prepare_char_lm_training_data.py
@ -0,0 +1,164 @@
+#!/usr/bin/env python3
+
+# Copyright (c)  2021  Xiaomi Corporation (authors: Daniel Povey
+#                                                   Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script takes a `tokens.txt` and a text file such as
+./download/lm/aishell-transcript.txt
+and writes the LM training data to a supplied archive path such as
+data/lm_training_char/lm_data.pt.  The output is a PyTorch archive (.pt file)
+that stores a dict in the same format as the one used by the librispeech recipe.
+"""
+
+import argparse
+import logging
+from pathlib import Path
+
+import k2
+import torch
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lang-char",
+        type=str,
+        help="""Lang dir of asr model, e.g. data/lang_char""",
+    )
+    parser.add_argument(
+        "--lm-data",
+        type=str,
+        help="""Input LM training data as text, e.g.
+        download/lm/aishell-train-word.txt""",
+    )
+    parser.add_argument(
+        "--lm-archive",
+        type=str,
+        help="""Path to output archive, e.g. data/lm_training_char/lm_data.pt;
+        look at the source of this script to see the format.""",
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+
+    if Path(args.lm_archive).exists():
+        logging.warning(f"{args.lm_archive} exists - skipping")
+        return
+
+    # make token_dict from tokens.txt in order to map characters to tokens.
+    token_dict = {}
+    token_file = args.lang_char + "/tokens.txt"
+
+    with open(token_file, "r") as f:
+        for line in f.readlines():
+            line_list = line.split()
+            token_dict[line_list[0]] = int(line_list[1])
+
+    # word2index is a dictionary from words to integer ids.  No need to reserve
+    # space for epsilon, etc.; the words are just used as a convenient way to
+    # compress the sequences of tokens.
+    word2index = dict()
+
+    word2token = []  # Will be a list-of-list-of-int, representing tokens.
+    sentences = []  # Will be a list-of-list-of-int, representing word-ids.
+
+    if "aishell-lm" in args.lm_data:
+        num_lines_in_total = 120098.0
+        step = 50000
+    elif "valid" in args.lm_data:
+        num_lines_in_total = 14326.0
+        step = 3000
+    elif "test" in args.lm_data:
+        num_lines_in_total = 7176.0
+        step = 3000
+    else:
+        num_lines_in_total = None
+        step = None
+
+    processed = 0
+
+    with open(args.lm_data) as f:
+        while True:
+            line = f.readline()
+            if line == "":
+                break
+
+            if step and processed % step == 0:
+                logging.info(
+                    f"Processed number of lines: {processed} "
+                    f"({processed / num_lines_in_total * 100: .3f}%)"
+                )
+            processed += 1
+
+            line_words = line.split()
+            for w in line_words:
+                if w not in word2index:
+                    w_token = []
+                    for t in w:
+                        if t in token_dict:
+                            w_token.append(token_dict[t])
+                        else:
+                            w_token.append(token_dict["<unk>"])
+                    word2index[w] = len(word2token)
+                    word2token.append(w_token)
+            sentences.append([word2index[w] for w in line_words])
+
+    logging.info("Constructing ragged tensors")
+    words = k2.ragged.RaggedTensor(word2token)
+    sentences = k2.ragged.RaggedTensor(sentences)
+
+    output = dict(words=words, sentences=sentences)
+
+    num_sentences = sentences.dim0
+    logging.info(f"Computing sentence lengths, num_sentences: {num_sentences}")
+    sentence_lengths = [0] * num_sentences
+    for i in range(num_sentences):
+        if step and i % step == 0:
+            logging.info(
+                f"Processed number of lines: {i} ({i / num_sentences * 100: .3f}%)"
+            )
+
+        word_ids = sentences[i]
+
+        # NOTE: If word_ids is a tensor with only 1 entry,
+        # token_ids is a torch.Tensor
+        token_ids = words[word_ids]
+        if isinstance(token_ids, k2.RaggedTensor):
+            token_ids = token_ids.values
+
+        # token_ids is a 1-D tensor containing the BPE tokens
+        # of the current sentence
+
+        sentence_lengths[i] = token_ids.numel()
+
+    output["sentence_lengths"] = torch.tensor(sentence_lengths, dtype=torch.int32)
+
+    torch.save(output, args.lm_archive)
+    logging.info(f"Saved to {args.lm_archive}")
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    main()
--- a/egs/aishell/ASR/prepare.sh
+++ b/egs/aishell/ASR/prepare.sh
@ -7,7 +7,7 @@ set -eou pipefail

 nj=15
 stage=-1
-stop_stage=10
+stop_stage=11

 # We assume dl_dir (download dir) contains the following
 # directories and files. If not, they will be downloaded
@ -219,3 +219,93 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
  ./local/compile_hlg.py --lang-dir $lang_phone_dir
  ./local/compile_hlg.py --lang-dir $lang_char_dir
 fi
+
+if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
+  log "Stage 9: Generate LM training data"
+
+  log "Processing char based data"
+  out_dir=data/lm_training_char
+  mkdir -p "$out_dir" "$dl_dir/lm"
+
+  # The training text is a copy of $lang_phone_dir/transcript_words.txt.
+  if [ ! -f "$dl_dir/lm/aishell-train-word.txt" ]; then
+    cp "$lang_phone_dir/transcript_words.txt" "$dl_dir/lm/aishell-train-word.txt"
+  fi
+
+  ./local/prepare_char_lm_training_data.py \
+    --lang-char data/lang_char \
+    --lm-data "$dl_dir/lm/aishell-train-word.txt" \
+    --lm-archive "$out_dir/lm_data.pt"
+
+  # The valid and test texts are cut out of the full transcript: collect the
+  # utterance ids of the wav files of a subset, keep the transcript lines
+  # whose first field is one of those ids, then drop the id column.
+  transcript_dir=$dl_dir/aishell/data_aishell/transcript
+  aishell_text=$transcript_dir/aishell_transcript_v0.8.txt
+
+  # Each entry is <wav sub-directory>:<subset name used in the output files>.
+  for pair in dev:valid test:test; do
+    wav_subdir=${pair%%:*}
+    subset=${pair##*:}
+    lm_text=$dl_dir/lm/aishell-$subset-word.txt
+
+    if [ ! -f "$lm_text" ]; then
+      uid_file=$transcript_dir/aishell_${subset}_uid
+      find "$dl_dir/aishell/data_aishell/wav/$wav_subdir" -name "*.wav" |
+        sed 's/\.wav//g' | awk -F '/' '{print $NF}' > "$uid_file"
+      # Write to a temporary name first: if the pipeline fails, no partial
+      # file is left behind that would make later runs skip this step.
+      awk 'NR==FNR{uid[$1]=$1} NR!=FNR{if($1 in uid) print $0}' "$uid_file" "$aishell_text" |
+        cut -d " " -f 2- > "$lm_text.tmp"
+      mv "$lm_text.tmp" "$lm_text"
+    fi
+
+    ./local/prepare_char_lm_training_data.py \
+      --lang-char data/lang_char \
+      --lm-data "$lm_text" \
+      --lm-archive "$out_dir/lm_data_$subset.pt"
+  done
+fi
+
+
+if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
+  log "Stage 10: Sort LM training data"
+  # Sort the LM training data by sentence length, longest first, which
+  # makes training easier. The length of a sentence is its number of tokens.
+
+  out_dir=data/lm_training_char
+  mkdir -p $out_dir
+  ln -snf ../../../librispeech/ASR/local/sort_lm_training_data.py local/
+
+  # The train files carry no subset suffix. For valid and test the input
+  # archive uses "_<subset>" while both outputs use "-<subset>".
+  for subset in "" valid test; do
+    ./local/sort_lm_training_data.py \
+      --in-lm-data $out_dir/lm_data${subset:+_$subset}.pt \
+      --out-lm-data $out_dir/sorted_lm_data${subset:+-$subset}.pt \
+      --out-statistics $out_dir/statistics${subset:+-$subset}.txt
+  done
+fi
+
+if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
+  log "Stage 11: Train RNN LM model"
+
+  # Options handed to the RNN LM training script, one per line.
+  # NOTE(review): --vocab-size is hard-coded; presumably it has to agree with
+  # data/lang_char/tokens.txt - confirm when the lang dir changes.
+  rnn_lm_opts=(
+    --start-epoch 0
+    --world-size 1
+    --num-epochs 20
+    --use-fp16 0
+    --embedding-dim 512
+    --hidden-dim 512
+    --num-layers 2
+    --batch-size 400
+    --exp-dir rnnlm_char/exp
+    --lm-data data/lm_training_char/sorted_lm_data.pt
+    --lm-data-valid data/lm_training_char/sorted_lm_data-valid.pt
+    --vocab-size 4336
+    --master-port 12345
+  )
+
+  python ../../../icefall/rnn_lm/train.py "${rnn_lm_opts[@]}"
+fi