mirror of https://github.com/k2-fsa/icefall.git
synced 2025-08-10 18:42:19 +00:00

fix repeated definition of tokenize_by_ja_char

The helper tokenize_by_ja_char had been copy-pasted into several recipe scripts; this commit keeps a single definition in icefall.utils, re-exports it from the icefall package, and switches every call site to import it instead.

This commit is contained in:
parent f4210013b7
commit 564b632eda
@@ -25,29 +25,7 @@ from pathlib import Path
 
 from tqdm.auto import tqdm
 
-
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-
-    Note: All non-Japanese characters will be upper case.
-
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-
-    Args:
-      line:
-        The input text.
-
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
+from icefall.utils import tokenize_by_ja_char
 
 
 def get_args():
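The character class in the helper covers hiragana (U+3040 to U+309F), katakana (U+30A0 to U+30FF), and the CJK Unified Ideographs block used for kanji (U+4E00 to U+9FFF). Below is a minimal, self-contained check of the documented behavior; the function body is copied verbatim from the diff, and only the import, the comments, and the assert are additions:

import re


def tokenize_by_ja_char(line: str) -> str:
    # Split on every hiragana/katakana/kanji character; the capturing
    # group keeps the matched characters in the list returned by split().
    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
    chars = pattern.split(line.strip())
    # Japanese characters pass through unchanged; every other fragment is
    # stripped and upper-cased, and empty fragments are dropped.
    return " ".join(
        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
    )


# Reproduces the example from the docstring.
assert (
    tokenize_by_ja_char("こんにちは世界は hello world の日本語")
    == "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
)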
@@ -50,31 +50,7 @@ from prepare_lang import (
 )
 
 from icefall.byte_utils import byte_encode
-from icefall.utils import str2bool
-
-
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-
-    Note: All non-Japanese characters will be upper case.
-
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-
-    Args:
-      line:
-        The input text.
-
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
+from icefall.utils import str2bool, tokenize_by_ja_char
 
 
 def lexicon_to_fst_no_sil(
@@ -33,30 +33,7 @@ from pathlib import Path
 import sentencepiece as spm
 
 from icefall import byte_encode
-
-
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-
-    Note: All non-Japanese characters will be upper case.
-
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-
-    Args:
-      line:
-        The input text.
-
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
+from icefall.utils import tokenize_by_ja_char
 
 
 def get_args():
@@ -96,36 +96,13 @@ from icefall.utils import (
     setup_logger,
     store_transcripts,
     str2bool,
+    tokenize_by_ja_char,
     write_error_stats,
 )
 
 LOG_EPS = math.log(1e-10)
-
-
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-
-    Note: All non-Japanese characters will be upper case.
-
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-
-    Args:
-      line:
-        The input text.
-
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
 
 
 def get_parser():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
@@ -101,35 +101,12 @@ from icefall.utils import (
     get_parameter_groups_with_lrs,
     setup_logger,
     str2bool,
+    tokenize_by_ja_char,
 )
 
 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
-
-
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-
-    Note: All non-Japanese characters will be upper case.
-
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-
-    Args:
-      line:
-        The input text.
-
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
 
 
 def get_adjusted_batch_count(params: AttributeDict) -> float:
     # returns the number of batches we would have used so far if we had used the reference
     # duration. This is for purposes of set_batch_count().
@@ -68,6 +68,7 @@ from .utils import (
     str2bool,
     subsequent_chunk_mask,
     tokenize_by_CJK_char,
+    tokenize_by_ja_char,
     write_error_stats,
 )
@@ -1746,6 +1746,30 @@ def tokenize_by_CJK_char(line: str) -> str:
     return " ".join([w.strip() for w in chars if w.strip()])
 
 
+def tokenize_by_ja_char(line: str) -> str:
+    """
+    Tokenize a line of text with Japanese characters.
+
+    Note: All non-Japanese characters will be upper case.
+
+    Example:
+      input = "こんにちは世界は hello world の日本語"
+      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
+
+    Args:
+      line:
+        The input text.
+
+    Return:
+      A new string tokenized by Japanese characters.
+    """
+    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
+    chars = pattern.split(line.strip())
+    return " ".join(
+        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
+    )
+
+
 def display_and_save_batch(
     batch: dict,
     params: AttributeDict,
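With the shared definition in place, a call site only needs the import. A hypothetical usage sketch (the input file name is illustrative, not part of the commit):

from icefall.utils import tokenize_by_ja_char

# Tokenize each transcript line before writing out training text.
with open("transcript.txt", encoding="utf-8") as f:  # illustrative path
    for line in f:
        print(tokenize_by_ja_char(line))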