updated

2025-12-11 06:55:27 +00:00 · 2023-11-09 10:51:41 +08:00 · 2023-11-09 10:51:41 +08:00 · 91da99ff52
commit 91da99ff52
parent 817413f899
1 changed files with 144 additions and 0 deletions
--- a/egs/multi_zh-hans/ASR/local/tokenize_for_lm_training.py
+++ b/egs/multi_zh-hans/ASR/local/tokenize_for_lm_training.py
@ -0,0 +1,144 @@
 #!/usr/bin/env python3
 # Copyright    2017  Johns Hopkins University   (authors: Shinji Watanabe)
 #              2022  Xiaomi Corp.               (authors: Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import codecs
 import re
 import sys
 from typing import List
 from pypinyin import lazy_pinyin, pinyin
 from icefall.utils import str2bool, tokenize_by_CJK_char
 is_python2 = sys.version_info[0] == 2
 def exist_or_not(i, match_pos):
    start_pos = None
    end_pos = None
    for pos in match_pos:
        if pos[0] <= i < pos[1]:
            start_pos = pos[0]
            end_pos = pos[1]
            break
    return start_pos, end_pos
 def get_parser():
    parser = argparse.ArgumentParser(
        description="convert raw text to tokenized text",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--skip-ncols", "-s", default=0, type=int, help="skip first n columns"
    )
    parser.add_argument(
        "--non-lang-syms",
        "-l",
        default=None,
        type=str,
        help="list of non-linguistic symobles, e.g., <NOISE> etc.",
    )
    parser.add_argument("text", type=str, default=False, nargs="?", help="input text")
    parser.add_argument(
        "--trans_type",
        "-t",
        type=str,
        default="char",
        choices=["char", "pinyin", "lazy_pinyin"],
        help="""Transcript type. char/pinyin/lazy_pinyin""",
    )
    return parser
 def token2id(
    texts, token_table, token_type: str = "lazy_pinyin", oov: str = "<unk>"
 ) -> List[List[int]]:
    """Convert token to id.
    Args:
      texts:
        The input texts, it refers to the chinese text here.
      token_table:
        The token table is built based on "data/lang_xxx/token.txt"
      token_type:
        The type of token, such as "pinyin" and "lazy_pinyin".
      oov:
        Out of vocabulary token. When a word(token) in the transcript
        does not exist in the token list, it is replaced with `oov`.
    Returns:
      The list of ids for the input texts.
    """
    if texts is None:
        raise ValueError("texts can't be None!")
    else:
        oov_id = token_table[oov]
        ids: List[List[int]] = []
        for text in texts:
            chars_list = list(str(text))
            if token_type == "lazy_pinyin":
                text = lazy_pinyin(chars_list)
                sub_ids = [
                    token_table[txt] if txt in token_table else oov_id for txt in text
                ]
                ids.append(sub_ids)
            else:  # token_type = "pinyin"
                text = pinyin(chars_list)
                sub_ids = [
                    token_table[txt[0]] if txt[0] in token_table else oov_id
                    for txt in text
                ]
                ids.append(sub_ids)
        return ids
 def main():
    parser = get_parser()
    args = parser.parse_args()
    rs = []
    if args.non_lang_syms is not None:
        with codecs.open(args.non_lang_syms, "r", encoding="utf-8") as f:
            nls = [x.rstrip() for x in f.readlines()]
            rs = [re.compile(re.escape(x)) for x in nls]
    if args.text:
        f = codecs.open(args.text, encoding="utf-8")
    else:
        f = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer)
    sys.stdout = codecs.getwriter("utf-8")(
        sys.stdout if is_python2 else sys.stdout.buffer
    )
    line = f.readline()
    while line:
        x = line.split()
        print(" ".join(x[: args.skip_ncols]), end=" ")
        a = " ".join(x[args.skip_ncols :])  # noqa E203
        a_flat = tokenize_by_CJK_char(a)
        # print("".join(a_chars))
        print(a_flat)
        line = f.readline()
 if __name__ == "__main__":
    main()