init commit

2024-03-08 16:49:43 +08:00 · 2024-03-08 16:49:43 +08:00 · 821ec9db13
commit 821ec9db13
parent ae61bd4090
14 changed files with 684 additions and 2 deletions
--- a/egs/aishell/ASR/prepare.sh
+++ b/egs/aishell/ASR/prepare.sh
@ -360,7 +360,7 @@ if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
 fi

 if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
-  log "Stage 11: Train RNN LM model"
+  log "Stage 12: Train RNN LM model"
  python ../../../icefall/rnn_lm/train.py \
    --start-epoch 0 \
    --world-size 1 \
--- a/egs/mdcc/ASR/local/compile_hlg.py
+++ b/egs/mdcc/ASR/local/compile_hlg.py
@ -0,0 +1 @@
+../../../librispeech/ASR/local/compile_hlg.py
--- a/egs/mdcc/ASR/local/compile_hlg_using_openfst.py
+++ b/egs/mdcc/ASR/local/compile_hlg_using_openfst.py
@ -0,0 +1 @@
+../../../librispeech/ASR/local/compile_hlg_using_openfst.py
--- a/egs/mdcc/ASR/local/compile_lg.py
+++ b/egs/mdcc/ASR/local/compile_lg.py
@ -0,0 +1 @@
+../../../librispeech/ASR/local/compile_lg.py
--- a/egs/mdcc/ASR/local/compute_fbank_mdcc.py
+++ b/egs/mdcc/ASR/local/compute_fbank_mdcc.py
@ -0,0 +1,157 @@
+#!/usr/bin/env python3
+# Copyright    2021-2024  Xiaomi Corp.   (authors: Fangjun Kuang,
+#                                                  Zengrui Jin,)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This file computes fbank features of the MDCC dataset.
+It looks for manifests in the directory data/manifests.
+
+The generated fbank features are saved in data/fbank.
+"""
+
+import argparse
+import logging
+import os
+from pathlib import Path
+
+import torch
+from lhotse import (
+    CutSet,
+    Fbank,
+    FbankConfig,
+    LilcomChunkyWriter,
+    WhisperFbank,
+    WhisperFbankConfig,
+)
+from lhotse.recipes.utils import read_manifests_if_cached
+
+from icefall.utils import get_executor, str2bool
+
+# Torch's multithreaded behavior needs to be disabled or
+# it wastes a lot of CPU and slow things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking the main (e.g. when spawning subprocesses).
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
+
+def compute_fbank_mdcc(
+    num_mel_bins: int = 80,
+    perturb_speed: bool = False,
+    whisper_fbank: bool = False,
+    output_dir: str = "data/fbank",
+):
+    """Compute and store fbank features for the MDCC train/valid/test splits.
+
+    Manifests are loaded from ``data/manifests`` (prefix ``mdcc``, suffix
+    ``jsonl.gz``). For every split the features are written to
+    ``{output_dir}/mdcc_feats_{split}`` and the resulting cuts to
+    ``{output_dir}/mdcc_cuts_{split}.jsonl.gz``. A split whose cuts file
+    already exists is skipped.
+
+    Args:
+      num_mel_bins:
+        Number of mel bins; passed as ``num_filters`` to WhisperFbank or as
+        ``num_mel_bins`` to Fbank.
+      perturb_speed:
+        If True, splits whose name contains "train" are extended with
+        0.9x and 1.1x speed-perturbed copies before feature extraction.
+      whisper_fbank:
+        If True, use WhisperFbank (configured with device="cuda") instead
+        of Fbank.
+      output_dir:
+        Directory receiving the features and cut manifests. It is created
+        if it does not exist yet.
+    """
+    src_dir = Path("data/manifests")
+    output_dir = Path(output_dir)
+    # A custom --output-dir may not exist yet; create it up front instead of
+    # failing when the first file is written.
+    output_dir.mkdir(parents=True, exist_ok=True)
+    # os.cpu_count() returns None when the count cannot be determined;
+    # fall back to a single job in that case.
+    num_jobs = min(15, os.cpu_count() or 1)
+
+    dataset_parts = (
+        "train",
+        "valid",
+        "test",
+    )
+    prefix = "mdcc"
+    suffix = "jsonl.gz"
+    manifests = read_manifests_if_cached(
+        dataset_parts=dataset_parts,
+        output_dir=src_dir,
+        prefix=prefix,
+        suffix=suffix,
+    )
+    assert manifests is not None
+
+    # Every requested split must have been found in src_dir.
+    assert len(manifests) == len(dataset_parts), (
+        len(manifests),
+        len(dataset_parts),
+        list(manifests.keys()),
+        dataset_parts,
+    )
+    if whisper_fbank:
+        extractor = WhisperFbank(
+            WhisperFbankConfig(num_filters=num_mel_bins, device="cuda")
+        )
+    else:
+        extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
+
+    with get_executor() as ex:  # Initialize the executor only once.
+        for partition, m in manifests.items():
+            cuts_path = output_dir / f"{prefix}_cuts_{partition}.{suffix}"
+            if cuts_path.is_file():
+                logging.info(f"{partition} already exists - skipping.")
+                continue
+            logging.info(f"Processing {partition}")
+            cut_set = CutSet.from_manifests(
+                recordings=m["recordings"],
+                supervisions=m["supervisions"],
+            )
+            if "train" in partition and perturb_speed:
+                logging.info("Doing speed perturb")
+                cut_set = (
+                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
+                )
+            cut_set = cut_set.compute_and_store_features(
+                extractor=extractor,
+                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
+                # when an executor is specified, make more partitions
+                num_jobs=num_jobs if ex is None else 80,
+                executor=ex,
+                storage_type=LilcomChunkyWriter,
+            )
+            cut_set.to_file(cuts_path)
+
+
+def get_args():
+    """Declare this script's command-line options and return the parsed namespace."""
+    option_table = (
+        (
+            "--num-mel-bins",
+            {
+                "type": int,
+                "default": 80,
+                "help": """The number of mel bins for Fbank""",
+            },
+        ),
+        (
+            "--perturb-speed",
+            {
+                "type": str2bool,
+                "default": False,
+                "help": "Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
+            },
+        ),
+        (
+            "--whisper-fbank",
+            {
+                "type": str2bool,
+                "default": False,
+                "help": "Use WhisperFbank instead of Fbank. Default: False.",
+            },
+        ),
+        (
+            "--output-dir",
+            {
+                "type": str,
+                "default": "data/fbank",
+                "help": "Output directory. Default: data/fbank.",
+            },
+        ),
+    )
+
+    cli = argparse.ArgumentParser()
+    # Register the options in the same order as listed above.
+    for flag, spec in option_table:
+        cli.add_argument(flag, **spec)
+    return cli.parse_args()
+
+
+if __name__ == "__main__":
+    # Log at INFO level; each record shows time, level and the emitting file:line.
+    logging.basicConfig(
+        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
+        level=logging.INFO,
+    )
+
+    cli_args = get_args()
+    compute_fbank_mdcc(
+        output_dir=cli_args.output_dir,
+        whisper_fbank=cli_args.whisper_fbank,
+        perturb_speed=cli_args.perturb_speed,
+        num_mel_bins=cli_args.num_mel_bins,
+    )
--- a/egs/mdcc/ASR/local/prepare_char.py
+++ b/egs/mdcc/ASR/local/prepare_char.py
@ -0,0 +1 @@
+../../../aishell/ASR/local/prepare_char.py
--- a/egs/mdcc/ASR/local/prepare_char_lm_training_data.py
+++ b/egs/mdcc/ASR/local/prepare_char_lm_training_data.py
@ -0,0 +1 @@
+../../../aishell/ASR/local/prepare_char_lm_training_data.py
--- a/egs/mdcc/ASR/local/prepare_lang.py
+++ b/egs/mdcc/ASR/local/prepare_lang.py
@ -0,0 +1 @@
+../../../aishell/ASR/local/prepare_lang.py
--- a/egs/mdcc/ASR/local/preprocess_mdcc.py
+++ b/egs/mdcc/ASR/local/preprocess_mdcc.py
@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+# Copyright    2024  Xiaomi Corp.        (authors: Zengrui Jin)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script takes a text file "data/lang_char/text" as input, which consists of
+lines each containing a transcript, applies text norm and generates the following
+files in the directory "data/lang_char":
+    - text_norm
+    - words.txt
+    - words_no_ids.txt
+    - text_words_segmentation
+"""
+
+import argparse
+from pathlib import Path
+from typing import List
+
+import pycantonese
+from tqdm.auto import tqdm
+
+
+def get_parser():
+    """Return the argument parser: an input text file and an output directory."""
+    cli = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description="Prepare char lexicon",
+    )
+    # (long flag, short flag, default, help) for each string-valued option.
+    for long_flag, short_flag, default_path, help_text in (
+        ("--input-file", "-i", "data/lang_char/text", "The input text file"),
+        ("--output-dir", "-o", "data/lang_char", "The output directory"),
+    ):
+        cli.add_argument(
+            long_flag,
+            short_flag,
+            type=str,
+            default=default_path,
+            help=help_text,
+        )
+    return cli
+
+
+def get_norm_lines(lines: List[str]) -> List[str]:
+    """Strip each transcript and remove the annotation markers it may carry.
+
+    The markers follow the transcription protocol:
+      1) Pure-music audio is labelled "(music)" in the transcript file name.
+      2) Sentences with background music or noise are preceded by "(music)".
+      3) Words the annotators were unsure about are wrapped in {}, for
+         example, {梁佳佳}，我是{}人.
+    """
+    # Removed in this order; "(music" catches a label missing its closing paren.
+    markers = ("(music)", "(music", "{", "}")
+
+    def _text_norm(text: str) -> str:
+        cleaned = text.strip()
+        for marker in markers:
+            cleaned = cleaned.replace(marker, "")
+        return cleaned
+
+    return list(map(_text_norm, lines))
+
+
+def get_word_segments(lines: List[str]) -> List[str]:
+    """Segment each line with pycantonese; return space-joined, newline-terminated lines."""
+    segmented = []
+    for sentence in tqdm(lines, desc="Segmenting lines"):
+        tokens = pycantonese.segment(sentence)
+        segmented.append(" ".join(tokens) + "\n")
+    return segmented
+
+
+def get_words(lines: List[str]) -> List[str]:
+    """Return the distinct segments pycantonese yields over all lines (unordered)."""
+    vocabulary = {
+        token
+        for sentence in tqdm(lines, desc="Getting words")
+        for token in pycantonese.segment(sentence)
+    }
+    return list(vocabulary)
+
+
+if __name__ == "__main__":
+    parser = get_parser()
+    args = parser.parse_args()
+
+    input_file = Path(args.input_file)
+    output_dir = Path(args.output_dir)
+
+    assert output_dir.is_dir(), f"{output_dir} does not exist"
+    assert input_file.is_file(), f"{input_file} does not exist"
+
+    # One transcript per line; leading/trailing whitespace of the whole file
+    # is dropped before splitting.
+    lines = input_file.read_text(encoding="utf-8").strip().split("\n")
+
+    # text_norm: the normalized transcripts, one per line.
+    norm_lines = get_norm_lines(lines)
+    with open(output_dir / "text_norm", "w+", encoding="utf-8") as f:
+        f.writelines([line + "\n" for line in norm_lines])
+
+    # words_no_ids.txt: the sorted set of segmented words, one per line.
+    words = get_words(norm_lines)
+    with open(output_dir / "words_no_ids.txt", "w+", encoding="utf-8") as f:
+        f.writelines([word + "\n" for word in sorted(words)])
+
+    # words.txt: "<word> <id>" lines. Special symbols come first, then the
+    # sorted words, then the disambiguation and sentence-boundary symbols.
+    # The end-of-sentence symbol is "</s>"; the previous "<\s>" was an invalid
+    # escape sequence that put a literal backslash into the symbol table.
+    words = (
+        ["<eps>", "!SIL", "<SPOKEN_NOISE>", "<UNK>"]
+        + sorted(words)
+        + ["#0", "<s>", "</s>"]
+    )
+
+    with open(output_dir / "words.txt", "w+", encoding="utf-8") as f:
+        f.writelines([f"{word} {i}\n" for i, word in enumerate(words)])
+
+    # text_words_segmentation: every normalized transcript split into words.
+    text_words_segments = get_word_segments(norm_lines)
+    with open(output_dir / "text_words_segmentation", "w+", encoding="utf-8") as f:
+        f.writelines(text_words_segments)
--- a/egs/mdcc/ASR/local/text2segments.py
+++ b/egs/mdcc/ASR/local/text2segments.py
@ -0,0 +1,86 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright    2021  Xiaomi Corp.        (authors: Mingshuang Luo)
+#              2022  Xiaomi Corp.        (authors: Weiji Zhuang)
+#              2024  Xiaomi Corp.        (authors: Zengrui Jin)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This script takes as input "text", which refers to the transcript file for
+MDCC:
+    - text
+and generates the output file text_words_segmentation, in which every
+line has been split into words:
+    - text_words_segmentation
+"""
+
+import argparse
+from typing import List
+
+import pycantonese
+from tqdm.auto import tqdm
+
+
+def get_parser():
+    """Return the argument parser: an input text file and an output file."""
+    cli = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description="Cantonese Word Segmentation for text",
+    )
+    # (long flag, short flag, default, help) for each string-valued option.
+    option_table = (
+        (
+            "--input-file",
+            "-i",
+            "data/lang_char/text",
+            "the input text file for MDCC",
+        ),
+        (
+            "--output-file",
+            "-o",
+            "data/lang_char/text_words_segmentation",
+            "the text implemented with words segmenting for MDCC",
+        ),
+    )
+    for long_flag, short_flag, default_path, help_text in option_table:
+        cli.add_argument(
+            long_flag,
+            short_flag,
+            type=str,
+            default=default_path,
+            help=help_text,
+        )
+
+    return cli
+
+
+def get_word_segments(lines: List[str]) -> List[str]:
+    """Segment each line with pycantonese into one space-joined output line.
+
+    Args:
+      lines:
+        Input lines; they may still carry their line terminator (e.g. when
+        they come straight from ``readlines()``).
+    Returns:
+      One string per input line: the segments joined by single spaces and
+      terminated by exactly one newline.
+    """
+    return [
+        # Strip first so the input's own newline is not passed to the
+        # segmenter and then followed by a second "\n" in the output.
+        " ".join(pycantonese.segment(line.strip())) + "\n"
+        for line in tqdm(lines, desc="Segmenting lines")
+    ]
+
+
+def main():
+    """Read the input file, word-segment every line and write the output file."""
+    cli_args = get_parser().parse_args()
+
+    with open(cli_args.input_file, "r", encoding="utf-8") as source:
+        raw_lines = source.readlines()
+
+    segmented = get_word_segments(raw_lines)
+
+    with open(cli_args.output_file, "w", encoding="utf-8") as sink:
+        sink.writelines(segmented)
+
+
+if __name__ == "__main__":
+    main()
--- a/egs/mdcc/ASR/local/text2token.py
+++ b/egs/mdcc/ASR/local/text2token.py
@ -0,0 +1 @@
+../../../aidatatang_200zh/ASR/local/text2token.py
--- a/egs/mdcc/ASR/prepare.sh
+++ b/egs/mdcc/ASR/prepare.sh
@ -0,0 +1,304 @@
+#!/usr/bin/env bash
+
+# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+
+set -eou pipefail
+
+stage=-1
+stop_stage=100
+perturb_speed=true
+
+
+# We assume dl_dir (download dir) contains the following
+# directories and files. If not, they will be downloaded
+# by this script automatically.
+#
+#  - $dl_dir/mdcc
+#       |-- README.md
+#       |-- audio/
+#       |-- clip_info_rthk.csv
+#       |-- cnt_asr_metadata_full.csv
+#       |-- cnt_asr_test_metadata.csv
+#       |-- cnt_asr_train_metadata.csv
+#       |-- cnt_asr_valid_metadata.csv
+#       |-- data_statistic.py
+#       |-- length
+#       |-- podcast_447_2021.csv
+#       |-- test.txt
+#       |-- transcription/
+#       `-- words_length
+#      You can download them from:
+#      https://drive.google.com/file/d/1epfYMMhXdBKA6nxPgUugb2Uj4DllSxkn/view?usp=drive_link
+#
+#  - $dl_dir/musan
+#      This directory contains the following directories downloaded from
+#       http://www.openslr.org/17/
+#
+#     - music
+#     - noise
+#     - speech
+
+dl_dir=$PWD/download
+
+. shared/parse_options.sh || exit 1
+
+# All files generated by this script are saved in "data".
+# You can safely remove "data" and rerun this script to regenerate it.
+mkdir -p data
+
+log() {
+  # This function is from espnet
+  # Prints "$*" prefixed with a timestamp and the caller's location:
+  # file name (directory part stripped), line number of the log call,
+  # and the name of the calling function.
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+log "dl_dir: $dl_dir"
+
+if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
+  log "stage 0: Download data"
+
+  # If you have pre-downloaded it to /path/to/mdcc,
+  # you can create a symlink
+  #
+  #   ln -sfv /path/to/mdcc $dl_dir/mdcc
+  #
+  # The directory structure is
+  # mdcc/
+  #  |-- README.md
+  #  |-- audio/
+  #  |-- clip_info_rthk.csv
+  #  |-- cnt_asr_metadata_full.csv
+  #  |-- cnt_asr_test_metadata.csv
+  #  |-- cnt_asr_train_metadata.csv
+  #  |-- cnt_asr_valid_metadata.csv
+  #  |-- data_statistic.py
+  #  |-- length
+  #  |-- podcast_447_2021.csv
+  #  |-- test.txt
+  #  |-- transcription/
+  #  `-- words_length
+
+  # The presence of mdcc/audio is used as the "already downloaded" check.
+  if [ ! -d $dl_dir/mdcc/audio ]; then
+    lhotse download mdcc $dl_dir
+
+    # this will download and unzip dataset.zip to $dl_dir/
+
+    # The archive unpacks as "dataset"; rename it to the "mdcc" directory
+    # that the rest of this script expects.
+    mv $dl_dir/dataset $dl_dir/mdcc
+  fi
+
+  # If you have pre-downloaded it to /path/to/musan,
+  # you can create a symlink
+  #
+  #   ln -sfv /path/to/musan $dl_dir/musan
+  #
+  if [ ! -d $dl_dir/musan ]; then
+    lhotse download musan $dl_dir
+  fi
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  log "Stage 1: Prepare MDCC manifest"
+  # We assume that you have downloaded the MDCC corpus
+  # to $dl_dir/mdcc
+  # The .mdcc_manifests.done marker makes reruns skip this step; delete it
+  # to regenerate the manifests.
+  if [ ! -f data/manifests/.mdcc_manifests.done ]; then
+    mkdir -p data/manifests
+    lhotse prepare mdcc $dl_dir/mdcc data/manifests
+    touch data/manifests/.mdcc_manifests.done
+  fi
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  log "Stage 2: Prepare musan manifest"
+  # We assume that you have downloaded the musan corpus
+  # to $dl_dir/musan
+  # The .musan_manifests.done marker makes reruns skip this step.
+  if [ ! -f data/manifests/.musan_manifests.done ]; then
+    log "It may take 6 minutes"
+    mkdir -p data/manifests
+    lhotse prepare musan $dl_dir/musan data/manifests
+    touch data/manifests/.musan_manifests.done
+  fi
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  log "Stage 3: Compute fbank for MDCC"
+  # Skipped once data/fbank/.mdcc.done exists.
+  if [ ! -f data/fbank/.mdcc.done ]; then
+    mkdir -p data/fbank
+    # perturb_speed is set at the top of this script (default: true) and can
+    # be overridden on the command line via shared/parse_options.sh.
+    ./local/compute_fbank_mdcc.py --perturb-speed ${perturb_speed}
+    touch data/fbank/.mdcc.done
+  fi
+fi
+
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+  log "Stage 4: Compute fbank for musan"
+  # NOTE: the marker is spelled ".msuan.done"; the spelling is kept so that
+  # data directories produced earlier are still recognized as done.
+  if [ ! -f data/fbank/.msuan.done ]; then
+    mkdir -p data/fbank
+    # This recipe has no compute_fbank_musan.py of its own; reuse the
+    # librispeech one through a symlink, the same way stage 9 links
+    # sort_lm_training_data.py.
+    # NOTE(review): confirm the script exists at this librispeech path.
+    if [ ! -e local/compute_fbank_musan.py ]; then
+      ln -snf ../../../librispeech/ASR/local/compute_fbank_musan.py local/
+    fi
+    ./local/compute_fbank_musan.py
+    touch data/fbank/.msuan.done
+  fi
+fi
+
+lang_char_dir=data/lang_char
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+  log "Stage 5: Prepare char based lang"
+  mkdir -p $lang_char_dir
+
+  # Prepare text.
+  # Note: in Linux, you can install jq with the following command:
+  # 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
+  # 2. chmod +x ./jq
+  # 3. cp jq /usr/bin
+  if [ ! -f $lang_char_dir/text ]; then
+    # Extract the transcripts of every split, tokenize them with
+    # text2token.py in "char" mode and store them as <split>_text.
+    # Each split reads its own supervisions manifest, so valid_text and
+    # test_text (consumed again in stage 8) contain what their names say.
+    for part in train valid test; do
+      gunzip -c data/manifests/mdcc_supervisions_${part}.jsonl.gz \
+        | jq '.text' | sed 's/"//g' | ./local/text2token.py -t "char" \
+        > $lang_char_dir/${part}_text
+    done
+
+    # "text" is the concatenation of all three splits.
+    cat $lang_char_dir/train_text $lang_char_dir/valid_text \
+      $lang_char_dir/test_text > $lang_char_dir/text
+  fi
+
+  # Writes text_norm, words.txt, words_no_ids.txt and text_words_segmentation.
+  if [ ! -f $lang_char_dir/text_words_segmentation ]; then
+    ./local/preprocess_mdcc.py --input-file $lang_char_dir/text \
+      --output-dir $lang_char_dir
+  fi
+
+  if [ ! -f $lang_char_dir/tokens.txt ]; then
+    ./local/prepare_char.py --lang-dir $lang_char_dir
+  fi
+fi
+
+if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
+  log "Stage 6: Prepare G"
+
+  mkdir -p data/lm
+
+  # Train LM on transcripts
+  if [ ! -f data/lm/3-gram.unpruned.arpa ]; then
+    python3 ./shared/make_kn_lm.py \
+      -ngram-order 3 \
+      -text $lang_char_dir/text_words_segmentation \
+      -lm data/lm/3-gram.unpruned.arpa
+  fi
+
+  # We assume you have installed kaldilm, if not, please install
+  # it using: pip install kaldilm
+  if [ ! -f data/lm/G_3_gram_char.fst.txt ]; then
+    # It is used in building HLG
+    python3 -m kaldilm \
+      --read-symbol-table="$lang_char_dir/words.txt" \
+      --disambig-symbol='#0' \
+      --max-order=3 \
+      data/lm/3-gram.unpruned.arpa > data/lm/G_3_gram_char.fst.txt
+  fi
+
+  if [ ! -f $lang_char_dir/HLG.fst ]; then
+    # This recipe has no prepare_lang_fst.py of its own; reuse the
+    # librispeech one through a symlink, the same way stage 9 links
+    # sort_lm_training_data.py.
+    # NOTE(review): confirm the script exists at this librispeech path.
+    if [ ! -e local/prepare_lang_fst.py ]; then
+      ln -snf ../../../librispeech/ASR/local/prepare_lang_fst.py local/
+    fi
+    ./local/prepare_lang_fst.py  \
+      --lang-dir $lang_char_dir \
+      --ngram-G ./data/lm/G_3_gram_char.fst.txt
+  fi
+fi
+
+if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
+  log "Stage 7: Compile LG & HLG"
+
+  # Unlike the earlier stages there is no done-marker here, so both
+  # compilations run every time this stage is selected.
+  # NOTE(review): "G_3_gram_char" presumably refers to
+  # data/lm/G_3_gram_char.fst.txt from stage 6 - verify in compile_hlg.py.
+  ./local/compile_hlg.py --lang-dir $lang_char_dir --lm G_3_gram_char
+  ./local/compile_lg.py --lang-dir $lang_char_dir --lm G_3_gram_char
+fi
+
+if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
+  log "Stage 8: Generate LM training data"
+
+  log "Processing char based data"
+  out_dir=data/lm_training_char
+  mkdir -p $out_dir $dl_dir/lm
+
+  # Same two steps for every split, in the order train, valid, test:
+  #   1. word-segment <split>_text (skipped if the word file already exists)
+  #   2. convert the word file into an LM archive
+  for part in train valid test; do
+    word_file=$dl_dir/lm/mdcc-${part}-word.txt
+
+    # The training archive carries no split suffix.
+    if [ $part = train ]; then
+      lm_archive=$out_dir/lm_data.pt
+    else
+      lm_archive=$out_dir/lm_data_${part}.pt
+    fi
+
+    if [ ! -f $word_file ]; then
+      ./local/text2segments.py --input-file $lang_char_dir/${part}_text \
+          --output-file $word_file
+    fi
+
+    ./local/prepare_char_lm_training_data.py \
+      --lang-char data/lang_char \
+      --lm-data $word_file \
+      --lm-archive $lm_archive
+  done
+fi
+
+if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
+  log "Stage 9: Sort LM training data"
+  # The LM archives are sorted by sentence length (number of tokens in a
+  # sentence), longest first, which makes training easier.
+
+  out_dir=data/lm_training_char
+  mkdir -p $out_dir
+  ln -snf ../../../librispeech/ASR/local/sort_lm_training_data.py local/
+
+  # Inputs use "_<split>" and outputs "-<split>"; the training split has
+  # no suffix on either side.
+  for part in train valid test; do
+    if [ $part = train ]; then
+      in_tag=
+      out_tag=
+    else
+      in_tag=_${part}
+      out_tag=-${part}
+    fi
+
+    ./local/sort_lm_training_data.py \
+      --in-lm-data $out_dir/lm_data${in_tag}.pt \
+      --out-lm-data $out_dir/sorted_lm_data${out_tag}.pt \
+      --out-statistics $out_dir/statistics${out_tag}.txt
+  done
+fi
+
+if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
+  log "Stage 12: Train RNN LM model"
+  # out_dir is otherwise only assigned in stages 8 and 9. Because this script
+  # runs with `set -u`, starting directly at stage 12 would abort with an
+  # unbound-variable error, so set it here as well.
+  out_dir=data/lm_training_char
+  # NOTE(review): --vocab-size is hard-coded; confirm it matches the token
+  # table generated for this corpus in stage 5.
+  python ../../../icefall/rnn_lm/train.py \
+    --start-epoch 0 \
+    --world-size 1 \
+    --num-epochs 20 \
+    --use-fp16 0 \
+    --embedding-dim 512 \
+    --hidden-dim 512 \
+    --num-layers 2 \
+    --batch-size 400 \
+    --exp-dir rnnlm_char/exp \
+    --lm-data $out_dir/sorted_lm_data.pt \
+    --lm-data-valid $out_dir/sorted_lm_data-valid.pt \
+    --vocab-size 4336 \
+    --master-port 12345
+fi
--- a/egs/mdcc/ASR/shared
+++ b/egs/mdcc/ASR/shared
@ -0,0 +1 @@
+../../../icefall/shared/
--- a/requirements.txt
+++ b/requirements.txt
@ -14,4 +14,7 @@ onnxruntime==1.16.3
 # style check session:
 black==22.3.0
 isort==5.10.1
-flake8==5.0.4 
+flake8==5.0.4 
+
+# cantonese word segment support
+pycantonese==3.4.0
				`@ -0,0 +1 @@`
				`../../../librispeech/ASR/local/compile_hlg.py`
				`@ -0,0 +1 @@`
				`../../../aishell/ASR/local/prepare_char_lm_training_data.py`
				`@ -0,0 +1 @@`
				`../../../aidatatang_200zh/ASR/local/text2token.py`