Mirror of https://github.com/k2-fsa/icefall.git
add some files
This commit is contained in: parent e48eeb143e, commit 80b2cfee23
egs/wenetspeech/ASR/local/compute_fbank_musan.py (new symbolic link, 1 line)
@@ -0,0 +1 @@
../../../librispeech/ASR/local/compute_fbank_musan.py
egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_dev_test.py (new executable file, 93 lines)
@@ -0,0 +1,93 @@
#!/usr/bin/env python3
# Copyright    2021  Johns Hopkins University (Piotr Żelasko)
# Copyright    2021  Xiaomi Corp.             (Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from pathlib import Path

import torch
from lhotse import (
    CutSet,
    KaldifeatFbank,
    KaldifeatFbankConfig,
    LilcomHdf5Writer,
)

# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slow things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)


def compute_fbank_wenetspeech_dev_test():
    in_out_dir = Path("data/fbank")
    # number of workers in dataloader
    num_workers = 20

    # number of seconds in a batch
    batch_duration = 600

    subsets = ("DEV", "TEST_NET", "TEST_MEETING")

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 1)
    extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))

    logging.info(f"device: {device}")

    for partition in subsets:
        cuts_path = in_out_dir / f"cuts_{partition}.jsonl.gz"
        if cuts_path.is_file():
            logging.info(f"{cuts_path} exists - skipping")
            continue

        raw_cuts_path = in_out_dir / f"cuts_{partition}_raw.jsonl.gz"

        logging.info(f"Loading {raw_cuts_path}")
        cut_set = CutSet.from_file(raw_cuts_path)

        logging.info("Computing features")

        cut_set = cut_set.compute_and_store_features_batch(
            extractor=extractor,
            storage_path=f"{in_out_dir}/feats_{partition}",
            num_workers=num_workers,
            batch_duration=batch_duration,
            storage_type=LilcomHdf5Writer,
        )
        cut_set = cut_set.trim_to_supervisions(
            keep_overlapping=False, min_duration=None
        )

        logging.info(f"Saving to {cuts_path}")
        cut_set.to_file(cuts_path)


def main():
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO)

    compute_fbank_wenetspeech_dev_test()


if __name__ == "__main__":
    main()
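For reference, this script takes no command-line options: it reads the cuts_{DEV,TEST_NET,TEST_MEETING}_raw.jsonl.gz manifests that local/preprocess_wenetspeech.py writes to data/fbank and stores the extracted features and trimmed cut manifests back into the same directory. It is invoked from Stage 4 of prepare.sh below as:

    python3 ./local/compute_fbank_wenetspeech_dev_test.py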
egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_splits.py (new executable file, 171 lines)
@@ -0,0 +1,171 @@
#!/usr/bin/env python3
# Copyright    2021  Johns Hopkins University (Piotr Żelasko)
# Copyright    2021  Xiaomi Corp.             (Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
from datetime import datetime
from pathlib import Path

import torch
from lhotse import (
    ChunkedLilcomHdf5Writer,
    CutSet,
    KaldifeatFbank,
    KaldifeatFbankConfig,
    set_audio_duration_mismatch_tolerance,
    set_caching_enabled,
)

# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slow things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)


def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--num-workers",
        type=int,
        default=20,
        help="Number of dataloading workers used for reading the audio.",
    )
    parser.add_argument(
        "--batch-duration",
        type=float,
        default=600.0,
        help="The maximum number of audio seconds in a batch."
        "Determines batch size dynamically.",
    )

    parser.add_argument(
        "--num-splits",
        type=int,
        required=True,
        help="The number of splits of the L subset",
    )

    parser.add_argument(
        "--start",
        type=int,
        default=0,
        help="Process pieces starting from this number (inclusive).",
    )

    parser.add_argument(
        "--stop",
        type=int,
        default=-1,
        help="Stop processing pieces until this number (exclusive).",
    )
    return parser


def compute_fbank_wenetspeech_splits(args):
    num_splits = args.num_splits
    output_dir = f"data/fbank/L_split_{num_splits}"
    output_dir = Path(output_dir)
    assert output_dir.exists(), f"{output_dir} does not exist!"

    num_digits = len(str(num_splits))

    start = args.start
    stop = args.stop
    if stop < start:
        stop = num_splits

    stop = min(stop, num_splits)

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 5)
    extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
    logging.info(f"device: {device}")

    set_audio_duration_mismatch_tolerance(0.01)  # 10ms tolerance
    set_caching_enabled(False)
    for i in range(start, stop):
        idx = f"{i + 1}".zfill(num_digits)
        logging.info(f"Processing {idx}/{num_splits}")

        cuts_path = output_dir / f"cuts_L.{idx}.jsonl.gz"
        if cuts_path.is_file():
            logging.info(f"{cuts_path} exists - skipping")
            continue

        raw_cuts_path = output_dir / f"cuts_L_raw.{idx}.jsonl.gz"

        logging.info(f"Loading {raw_cuts_path}")
        cut_set = CutSet.from_file(raw_cuts_path)

        logging.info("Computing features")

        cut_set = cut_set.compute_and_store_features_batch(
            extractor=extractor,
            storage_path=f"{output_dir}/feats_L_{idx}",
            num_workers=args.num_workers,
            batch_duration=args.batch_duration,
            storage_type=ChunkedLilcomHdf5Writer,
        )

        logging.info("About to split cuts into smaller chunks.")
        cut_set = cut_set.trim_to_supervisions(
            keep_overlapping=False, min_duration=None
        )

        logging.info(f"Saving to {cuts_path}")
        cut_set.to_file(cuts_path)
        logging.info(f"Saved to {cuts_path}")


def main():
    now = datetime.now()
    date_time = now.strftime("%Y-%m-%d-%H-%M-%S")

    log_filename = "log-compute_fbank_wenetspeech_splits"
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )
    log_filename = f"{log_filename}-{date_time}"

    logging.basicConfig(
        filename=log_filename,
        format=formatter,
        level=logging.INFO,
        filemode="w",
    )

    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    console.setFormatter(logging.Formatter(formatter))
    logging.getLogger("").addHandler(console)

    parser = get_parser()
    args = parser.parse_args()
    logging.info(vars(args))

    compute_fbank_wenetspeech_splits(args)


if __name__ == "__main__":
    main()
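A usage sketch, matching Stage 6 of prepare.sh below (which sets num_splits=1000): the script is driven entirely by its command-line options, and --num-splits is required and must equal the number of pieces produced by `lhotse split` in Stage 5:

    python3 ./local/compute_fbank_wenetspeech_splits.py \
      --num-workers 20 \
      --batch-duration 600 \
      --start 0 \
      --num-splits 1000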
egs/wenetspeech/ASR/local/prepare_lang.py (new symbolic link, 1 line)
@@ -0,0 +1 @@
../../../librispeech/ASR/local/prepare_lang.py
egs/wenetspeech/ASR/local/prepare_lang_wenetspeech.py (new executable file, 253 lines)
@@ -0,0 +1,253 @@
#!/usr/bin/env python3
# Copyright    2021  Xiaomi Corp.   (authors: Fangjun Kuang,
#                                             Wei Kang,
#                                             Mingshuang Luo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""
This script takes as input `lang_dir`, which should contain::

    - lang_dir/text,
    - lang_dir/words.txt

and generates the following files in the directory `lang_dir`:

    - lexicon.txt
    - lexicon_disambig.txt
    - L.pt
    - L_disambig.pt
    - tokens.txt
"""

import argparse
import re
from pathlib import Path
from typing import Dict, List

import k2
import torch
from prepare_lang import (
    Lexicon,
    add_disambig_symbols,
    add_self_loops,
    write_lexicon,
    write_mapping,
)


def lexicon_to_fst_no_sil(
    lexicon: Lexicon,
    token2id: Dict[str, int],
    word2id: Dict[str, int],
    need_self_loops: bool = False,
) -> k2.Fsa:
    """Convert a lexicon to an FST (in k2 format).

    Args:
      lexicon:
        The input lexicon. See also :func:`read_lexicon`
      token2id:
        A dict mapping tokens to IDs.
      word2id:
        A dict mapping words to IDs.
      need_self_loops:
        If True, add self-loop to states with non-epsilon output symbols
        on at least one arc out of the state. The input label for this
        self loop is `token2id["#0"]` and the output label is `word2id["#0"]`.
    Returns:
      Return an instance of `k2.Fsa` representing the given lexicon.
    """
    loop_state = 0  # words enter and leave from here
    next_state = 1  # the next un-allocated state, will be incremented as we go

    arcs = []

    # The blank symbol <blk> is defined in local/train_bpe_model.py
    assert token2id["<blk>"] == 0
    assert word2id["<eps>"] == 0

    eps = 0

    for word, pieces in lexicon:
        assert len(pieces) > 0, f"{word} has no pronunciations"
        cur_state = loop_state

        word = word2id[word]
        pieces = [
            token2id[i] if i in token2id else token2id["<unk>"] for i in pieces
        ]

        for i in range(len(pieces) - 1):
            w = word if i == 0 else eps
            arcs.append([cur_state, next_state, pieces[i], w, 0])

            cur_state = next_state
            next_state += 1

        # now for the last piece of this word
        i = len(pieces) - 1
        w = word if i == 0 else eps
        arcs.append([cur_state, loop_state, pieces[i], w, 0])

    if need_self_loops:
        disambig_token = token2id["#0"]
        disambig_word = word2id["#0"]
        arcs = add_self_loops(
            arcs,
            disambig_token=disambig_token,
            disambig_word=disambig_word,
        )

    final_state = next_state
    arcs.append([loop_state, final_state, -1, -1, 0])
    arcs.append([final_state])

    arcs = sorted(arcs, key=lambda arc: arc[0])
    arcs = [[str(i) for i in arc] for arc in arcs]
    arcs = [" ".join(arc) for arc in arcs]
    arcs = "\n".join(arcs)

    fsa = k2.Fsa.from_str(arcs, acceptor=False)
    return fsa


def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool:
    """Check if all the given tokens are in token symbol table.

    Args:
      token_sym_table:
        Token symbol table that contains all the valid tokens.
      tokens:
        A list of tokens.
    Returns:
      Return True if there is any token not in the token_sym_table,
      otherwise False.
    """
    for tok in tokens:
        if tok not in token_sym_table:
            return True
    return False


def generate_lexicon(
    token_sym_table: Dict[str, int], words: List[str]
) -> Lexicon:
    """Generate a lexicon from a word list and token_sym_table.

    Args:
      token_sym_table:
        Token symbol table that mapping token to token ids.
      words:
        A list of strings representing words.
    Returns:
      Return a dict whose keys are words and values are the corresponding
      tokens.
    """
    lexicon = []
    for word in words:
        chars = list(word.strip(" \t"))
        if contain_oov(token_sym_table, chars):
            continue
        lexicon.append((word, chars))

    # The OOV word is <UNK>
    lexicon.append(("<UNK>", ["<unk>"]))
    return lexicon


def generate_tokens(text_file: str, token_type: str) -> Dict[str, int]:
    """Generate tokens from the given text file.

    Args:
      text_file:
        A file that contains text lines to generate tokens.
      token_type:
        The type of token, such as "char", "pinyin" and "lazy_pinyin".
    Returns:
      Return a dict whose keys are tokens and values are token ids ranged
      from 0 to len(keys) - 1.
    """
    tokens: Dict[str, int] = dict()
    tokens["<blk>"] = 0
    tokens["<sos/eos>"] = 1
    tokens["<unk>"] = 2
    whitespace = re.compile(r"([ \t\r\n]+)")
    with open(text_file, "r", encoding="utf-8") as f:
        for line in f:
            if token_type == "char":
                line = re.sub(whitespace, "", line)
                tokens_list = list(line)
            else:
                tokens_list = line.strip().split(" ")
            for token in tokens_list:
                if token not in tokens:
                    tokens[token] = len(tokens)
    return tokens


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--lang-dir", type=str, help="The lang directory.")
    parser.add_argument("--token-type", type=str, help="The type of token.")
    args = parser.parse_args()

    lang_dir = Path(args.lang_dir)
    token_type = args.token_type
    text_file = lang_dir / "text"

    word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt")

    words = word_sym_table.symbols

    excluded = ["<eps>", "!SIL", "<SPOKEN_NOISE>", "<UNK>", "#0", "<s>", "</s>"]
    for w in excluded:
        if w in words:
            words.remove(w)

    token_sym_table = generate_tokens(text_file, token_type)

    lexicon = generate_lexicon(token_sym_table, words)

    lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)

    next_token_id = max(token_sym_table.values()) + 1
    for i in range(max_disambig + 1):
        disambig = f"#{i}"
        assert disambig not in token_sym_table
        token_sym_table[disambig] = next_token_id
        next_token_id += 1

    word_sym_table.add("#0")
    word_sym_table.add("<s>")
    word_sym_table.add("</s>")

    write_mapping(lang_dir / "tokens.txt", token_sym_table)

    write_lexicon(lang_dir / "lexicon.txt", lexicon)
    write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig)

    L = lexicon_to_fst_no_sil(
        lexicon,
        token2id=token_sym_table,
        word2id=word_sym_table,
    )

    L_disambig = lexicon_to_fst_no_sil(
        lexicon_disambig,
        token2id=token_sym_table,
        word2id=word_sym_table,
        need_self_loops=True,
    )
    torch.save(L.as_dict(), lang_dir / "L.pt")
    torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt")


if __name__ == "__main__":
    main()
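A hypothetical invocation, based only on the two arguments the script defines; note that prepare.sh below calls ./local/prepare_char.py and ./local/prepare_pinyin.py in Stages 10 and 11 rather than this script, so the directory name here is an assumption. The lang directory must already contain `text` and `words.txt`, as described in the module docstring:

    python3 ./local/prepare_lang_wenetspeech.py \
      --lang-dir data/lang_char \
      --token-type char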
egs/wenetspeech/ASR/local/preprocess_wenetspeech.py (new executable file, 120 lines)
@@ -0,0 +1,120 @@
#!/usr/bin/env python3
# Copyright    2021  Johns Hopkins University (Piotr Żelasko)
# Copyright    2021  Xiaomi Corp.             (Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import re
from pathlib import Path

from lhotse import CutSet, SupervisionSegment
from lhotse.recipes.utils import read_manifests_if_cached

# Similar text filtering and normalization procedure as in:
# https://github.com/SpeechColab/WenetSpeech/blob/main/toolkits/kaldi/wenetspeech_data_prep.sh


def normalize_text(
    utt: str,
    # punct_pattern=re.compile(r"<(COMMA|PERIOD|QUESTIONMARK|EXCLAMATIONPOINT)>"),
    punct_pattern=re.compile(r"<(PERIOD|QUESTIONMARK|EXCLAMATIONPOINT)>"),
    whitespace_pattern=re.compile(r"\s\s+"),
) -> str:
    return whitespace_pattern.sub(" ", punct_pattern.sub("", utt))


def has_no_oov(
    sup: SupervisionSegment,
    oov_pattern=re.compile(r"<(SIL|MUSIC|NOISE|OTHER)>"),
) -> bool:
    return oov_pattern.search(sup.text) is None


def preprocess_wenet_speech():
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    output_dir.mkdir(exist_ok=True)

    dataset_parts = (
        "L",
        "M",
        "S",
        "DEV",
        "TEST_NET",
        "TEST_MEETING",
    )

    logging.info("Loading manifest (may take 10 minutes)")
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
        output_dir=src_dir,
        suffix="jsonl.gz",
    )
    assert manifests is not None

    for partition, m in manifests.items():
        logging.info(f"Processing {partition}")
        raw_cuts_path = output_dir / f"cuts_{partition}_raw.jsonl.gz"
        if raw_cuts_path.is_file():
            logging.info(f"{partition} already exists - skipping")
            continue

        # Note this step makes the recipe different than LibriSpeech:
        # We must filter out some utterances and remove punctuation
        # to be consistent with Kaldi.
        logging.info("Filtering OOV utterances from supervisions")
        m["supervisions"] = m["supervisions"].filter(has_no_oov)
        logging.info(f"Normalizing text in {partition}")
        for sup in m["supervisions"]:
            text = str(sup.text)
            logging.info(f"Original text: {text}")
            sup.text = normalize_text(sup.text)
            text = str(sup.text)
            logging.info(f"Normalize text: {text}")

        # Create long-recording cut manifests.
        logging.info(f"Processing {partition}")
        cut_set = CutSet.from_manifests(
            recordings=m["recordings"],
            supervisions=m["supervisions"],
        )
        # Run data augmentation that needs to be done in the
        # time domain.
        if partition not in ["DEV", "TEST_NET", "TEST_MEETING"]:
            logging.info(
                f"Speed perturb for {partition} with factors 0.9 and 1.1 "
                "(Perturbing may take 8 minutes and saving may take 20 minutes)"
            )
            cut_set = (
                cut_set
                + cut_set.perturb_speed(0.9)
                + cut_set.perturb_speed(1.1)
            )
        logging.info(f"Saving to {raw_cuts_path}")
        cut_set.to_file(raw_cuts_path)


def main():
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO)

    preprocess_wenet_speech()


if __name__ == "__main__":
    main()
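This script also takes no arguments; it expects the lhotse manifests from Stage 1 in data/manifests and writes cuts_*_raw.jsonl.gz to data/fbank. Stage 3 of prepare.sh below runs it once and touches data/fbank/.preprocess_complete so that reruns skip it:

    python3 ./local/preprocess_wenetspeech.py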
egs/wenetspeech/ASR/local/text2token.py (new executable file, 196 lines)
@@ -0,0 +1,196 @@
#!/usr/bin/env python3
# Copyright    2017  Johns Hopkins University (authors: Shinji Watanabe)
#              2022  Xiaomi Corp.             (authors: Mingshuang Luo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import argparse
import codecs
import re
import sys
from typing import List

from pypinyin import lazy_pinyin, pinyin

is_python2 = sys.version_info[0] == 2


def exist_or_not(i, match_pos):
    start_pos = None
    end_pos = None
    for pos in match_pos:
        if pos[0] <= i < pos[1]:
            start_pos = pos[0]
            end_pos = pos[1]
            break

    return start_pos, end_pos


def get_parser():
    parser = argparse.ArgumentParser(
        description="convert raw text to tokenized text",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--nchar",
        "-n",
        default=1,
        type=int,
        help="number of characters to split, i.e., \
        aabb -> a a b b with -n 1 and aa bb with -n 2",
    )
    parser.add_argument(
        "--skip-ncols", "-s", default=0, type=int, help="skip first n columns"
    )
    parser.add_argument(
        "--space", default="<space>", type=str, help="space symbol"
    )
    parser.add_argument(
        "--non-lang-syms",
        "-l",
        default=None,
        type=str,
        help="list of non-linguistic symobles, e.g., <NOISE> etc.",
    )
    parser.add_argument(
        "text", type=str, default=False, nargs="?", help="input text"
    )
    parser.add_argument(
        "--trans_type",
        "-t",
        type=str,
        default="char",
        choices=["char", "pinyin", "lazy_pinyin"],
        help="""Transcript type. char/pinyin/lazy_pinyin""",
    )
    return parser


def token2id(
    texts, token_table, token_type: str = "lazy_pinyin", oov: str = "<unk>"
) -> List[List[int]]:
    """Convert token to id.
    Args:
      texts:
        The input texts, it refers to the chinese text here.
      token_table:
        The token table is built based on "data/lang_xxx/token.txt"
      token_type:
        The type of token, such as "pinyin" and "lazy_pinyin".
      oov:
        Out of vocabulary token. When a word(token) in the transcript
        does not exist in the token list, it is replaced with `oov`.

    Returns:
      The list of ids for the input texts.
    """
    if texts is None:
        raise ValueError("texts can't be None!")
    else:
        oov_id = token_table[oov]
        ids: List[List[int]] = []
        for text in texts:
            chars_list = list(str(text))
            if token_type == "lazy_pinyin":
                text = lazy_pinyin(chars_list)
                sub_ids = [
                    token_table[txt] if txt in token_table else oov_id
                    for txt in text
                ]
                ids.append(sub_ids)
            else:  # token_type = "pinyin"
                text = pinyin(chars_list)
                sub_ids = [
                    token_table[txt[0]] if txt[0] in token_table else oov_id
                    for txt in text
                ]
                ids.append(sub_ids)
        return ids


def main():
    parser = get_parser()
    args = parser.parse_args()

    rs = []
    if args.non_lang_syms is not None:
        with codecs.open(args.non_lang_syms, "r", encoding="utf-8") as f:
            nls = [x.rstrip() for x in f.readlines()]
            rs = [re.compile(re.escape(x)) for x in nls]

    if args.text:
        f = codecs.open(args.text, encoding="utf-8")
    else:
        f = codecs.getreader("utf-8")(
            sys.stdin if is_python2 else sys.stdin.buffer
        )

    sys.stdout = codecs.getwriter("utf-8")(
        sys.stdout if is_python2 else sys.stdout.buffer
    )
    line = f.readline()
    n = args.nchar
    while line:
        x = line.split()
        print(" ".join(x[: args.skip_ncols]), end=" ")
        a = " ".join(x[args.skip_ncols :])  # noqa E203

        # get all matched positions
        match_pos = []
        for r in rs:
            i = 0
            while i >= 0:
                m = r.search(a, i)
                if m:
                    match_pos.append([m.start(), m.end()])
                    i = m.end()
                else:
                    break
        if len(match_pos) > 0:
            chars = []
            i = 0
            while i < len(a):
                start_pos, end_pos = exist_or_not(i, match_pos)
                if start_pos is not None:
                    chars.append(a[start_pos:end_pos])
                    i = end_pos
                else:
                    chars.append(a[i])
                    i += 1
            a = chars

        if args.trans_type == "pinyin":
            a = pinyin(list(str(a)))
            a = [one[0] for one in a]

        if args.trans_type == "lazy_pinyin":
            a = lazy_pinyin(list(str(a)))

        a = [a[j : j + n] for j in range(0, len(a), n)]  # noqa E203

        a_flat = []
        for z in a:
            a_flat.append("".join(z))

        a_chars = [z.replace(" ", args.space) for z in a_flat]

        print(" ".join(a_chars))
        line = f.readline()


if __name__ == "__main__":
    main()
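Stage 9 of prepare.sh below uses this script as a stdin filter to turn the L-subset transcripts into character sequences (here with $lang_char_dir expanded to data/lang_char):

    gunzip -c data/manifests/supervisions_L.jsonl.gz \
      | jq '.text' | sed 's/"//g' \
      | ./local/text2token.py -t "char" > data/lang_char/text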
egs/wenetspeech/ASR/prepare.sh (new executable file, 183 lines)
@@ -0,0 +1,183 @@
#!/usr/bin/env bash

set -eou pipefail

nj=15
stage=0
stop_stage=100

# Split L subset to this number of pieces
# This is to avoid OOM during feature extraction.
num_splits=1000

# We assume dl_dir (download dir) contains the following
# directories and files. If not, they will be downloaded
# by this script automatically.
#
#  - $dl_dir/WenetSpeech
#      You can find audio, WenetSpeech.json inside it.
#      You can apply for the download credentials by following
#      https://github.com/wenet-e2e/WenetSpeech#download
#
#  - $dl_dir/musan
#      This directory contains the following directories downloaded from
#      http://www.openslr.org/17/
#
#      - music
#      - noise
#      - speech

dl_dir=$PWD/download

. shared/parse_options.sh || exit 1

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "dl_dir: $dl_dir"

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Download data"

  [ ! -e $dl_dir/WenetSpeech ] && mkdir -p $dl_dir/WenetSpeech

  # If you have pre-downloaded it to /path/to/WenetSpeech,
  # you can create a symlink
  #
  #   ln -sfv /path/to/WenetSpeech $dl_dir/WenetSpeech
  #
  if [ ! -d $dl_dir/WenetSpeech/wenet_speech ] && [ ! -f $dl_dir/WenetSpeech/metadata/v1.list ]; then
    log "Stage 0: should download WenetSpeech first"
    exit 1;
  fi

  # If you have pre-downloaded it to /path/to/musan,
  # you can create a symlink
  #
  #   ln -sfv /path/to/musan $dl_dir/musan

  if [ ! -d $dl_dir/musan ]; then
    lhotse download musan $dl_dir
  fi
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare WenetSpeech manifest"
  # We assume that you have downloaded the WenetSpeech corpus
  # to $dl_dir/WenetSpeech
  mkdir -p data/manifests
  lhotse prepare wenet-speech $dl_dir/WenetSpeech data/manifests -j $nj
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Prepare musan manifest"
  # We assume that you have downloaded the musan corpus
  # to data/musan
  mkdir -p data/manifests
  lhotse prepare musan $dl_dir/musan data/manifests
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Preprocess WenetSpeech manifest"
  if [ ! -f data/fbank/.preprocess_complete ]; then
    python3 ./local/preprocess_wenetspeech.py
    touch data/fbank/.preprocess_complete
  fi
fi

if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Compute features for DEV and TEST subsets of WenetSpeech (may take 2 minutes)"
  python3 ./local/compute_fbank_wenetspeech_dev_test.py
fi

if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Split L subset into ${num_splits} pieces (may take 30 minutes)"
  split_dir=data/fbank/L_split_${num_splits}
  if [ ! -f $split_dir/.split_completed ]; then
    lhotse split $num_splits ./data/fbank/cuts_L_raw.jsonl.gz $split_dir
    touch $split_dir/.split_completed
  fi
fi

if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  log "Stage 6: Compute features for L"
  python3 ./local/compute_fbank_wenetspeech_splits.py \
    --num-workers 20 \
    --batch-duration 600 \
    --start 0 \
    --num-splits $num_splits
fi

if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
  log "Stage 7: Combine features for L"
  if [ ! -f data/fbank/cuts_L_50.jsonl.gz ]; then
    pieces=$(find data/fbank/L_split_50 -name "cuts_L.*.jsonl.gz")
    lhotse combine $pieces data/fbank/cuts_L_50.jsonl.gz
  fi
fi

if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
  log "Stage 8: Compute fbank for musan"
  mkdir -p data/fbank
  ./local/compute_fbank_musan.py
fi

if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
  log "Stage 9: Prepare char based lang"
  lang_char_dir=data/lang_char
  mkdir -p $lang_char_dir

  # Prepare text.
  if [ ! -f $lang_char_dir/text ]; then
    gunzip -c data/manifests/supervisions_L.jsonl.gz \
      | jq '.text' | sed 's/"//g' \
      | ./local/text2token.py -t "char" > $lang_char_dir/text
  fi

  # The implementation of chinese word segmentation for text,
  # and it will take about 15 minutes.
  if [ ! -f $lang_char_dir/text_words_segmentation ]; then
    python ./local/text2segments.py \
      --input $lang_char_dir/text \
      --output $lang_char_dir/text_words_segmentation
  fi

  cat $lang_char_dir/text_words_segmentation | sed 's/ /\n/g' \
    | sort -u | sed '/^$/d' | uniq > $lang_char_dir/words_no_ids.txt

  if [ ! -f $lang_char_dir/words.txt ]; then
    python ./local/prepare_words.py \
      --input-file $lang_char_dir/words_no_ids.txt \
      --output-file $lang_char_dir/words.txt
  fi
fi

if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
  log "Stage 10: Prepare char based L_disambig.pt"
  if [ ! -f data/lang_char/L_disambig.pt ]; then
    python ./local/prepare_char.py \
      --lang-dir data/lang_char
  fi
fi

if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
  log "Stage 11: Prepare pinyin based L_disambig.pt"
  lang_pinyin_dir=data/lang_pinyin
  mkdir -p $lang_pinyin_dir

  cp -r data/lang_char/words.txt $lang_pinyin_dir/
  cp -r data/lang_char/text $lang_pinyin_dir/
  cp -r data/lang_char/text_words_segmentation $lang_pinyin_dir/

  if [ ! -f data/lang_pinyin/L_disambig.pt ]; then
    python ./local/prepare_pinyin.py \
      --lang-dir data/lang_pinyin
  fi
fi
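Because the script sources shared/parse_options.sh (the Kaldi/espnet option parser), the variables defined at the top can presumably be overridden from the command line; for example, to rerun only the feature-extraction stages:

    ./prepare.sh --stage 4 --stop-stage 6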
egs/wenetspeech/ASR/shared (new symbolic link, 1 line)
@@ -0,0 +1 @@
../../librispeech/ASR/shared