From 564b632eda65ea678721b0de8ce42364befe0424 Mon Sep 17 00:00:00 2001
From: Bailey Hirota <baileyhirota@icloud.com>
Date: Tue, 7 Jan 2025 14:30:13 +0900
Subject: [PATCH] fix repeated definition of tokenize_by_ja_char

---
 .../ASR/local/prepare_for_bpe_model.py        | 24 +----------------
 .../ASR/local/prepare_lang_bbpe.py            | 26 +------------------
 egs/multi_ja_en/ASR/local/train_bbpe_model.py | 25 +-----------------
 egs/multi_ja_en/ASR/zipformer/decode.py       | 25 +-----------------
 egs/multi_ja_en/ASR/zipformer/train.py        | 25 +-----------------
 icefall/__init__.py                           |  1 +
 icefall/utils.py                              | 24 +++++++++++++++++
 7 files changed, 30 insertions(+), 120 deletions(-)

diff --git a/egs/multi_ja_en/ASR/local/prepare_for_bpe_model.py b/egs/multi_ja_en/ASR/local/prepare_for_bpe_model.py
index 70846140b..27832ad1b 100755
--- a/egs/multi_ja_en/ASR/local/prepare_for_bpe_model.py
+++ b/egs/multi_ja_en/ASR/local/prepare_for_bpe_model.py
@@ -25,29 +25,7 @@ from pathlib import Path
 
 from tqdm.auto import tqdm
 
-
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-
-    Note: All non-Japanese characters will be upper case.
-
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-
-    Args:
-      line:
-        The input text.
-
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
+from icefall.utils import tokenize_by_ja_char
 
 
 def get_args():
diff --git a/egs/multi_ja_en/ASR/local/prepare_lang_bbpe.py b/egs/multi_ja_en/ASR/local/prepare_lang_bbpe.py
index cbf6ccca5..6134710ad 100755
--- a/egs/multi_ja_en/ASR/local/prepare_lang_bbpe.py
+++ b/egs/multi_ja_en/ASR/local/prepare_lang_bbpe.py
@@ -50,31 +50,7 @@ from prepare_lang import (
 )
 
 from icefall.byte_utils import byte_encode
-from icefall.utils import str2bool
-
-
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-
-    Note: All non-Japanese characters will be upper case.
-
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-
-    Args:
-      line:
-        The input text.
-
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
+from icefall.utils import str2bool, tokenize_by_ja_char
 
 
 def lexicon_to_fst_no_sil(
diff --git a/egs/multi_ja_en/ASR/local/train_bbpe_model.py b/egs/multi_ja_en/ASR/local/train_bbpe_model.py
index 4d7dd1225..d104f2717 100755
--- a/egs/multi_ja_en/ASR/local/train_bbpe_model.py
+++ b/egs/multi_ja_en/ASR/local/train_bbpe_model.py
@@ -33,30 +33,7 @@ from pathlib import Path
 import sentencepiece as spm
 
 from icefall import byte_encode
-
-
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-
-    Note: All non-Japanese characters will be upper case.
-
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-
-    Args:
-      line:
-        The input text.
-
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
+from icefall.utils import tokenize_by_ja_char
 
 
 def get_args():
diff --git a/egs/multi_ja_en/ASR/zipformer/decode.py b/egs/multi_ja_en/ASR/zipformer/decode.py
index 591baa85d..26ce3e018 100755
--- a/egs/multi_ja_en/ASR/zipformer/decode.py
+++ b/egs/multi_ja_en/ASR/zipformer/decode.py
@@ -96,36 +96,13 @@ from icefall.utils import (
     setup_logger,
     store_transcripts,
     str2bool,
+    tokenize_by_ja_char,
     write_error_stats,
 )
 
 LOG_EPS = math.log(1e-10)
 
 
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-
-    Note: All non-Japanese characters will be upper case.
-
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-
-    Args:
-      line:
-        The input text.
-
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
-
-
 def get_parser():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
diff --git a/egs/multi_ja_en/ASR/zipformer/train.py b/egs/multi_ja_en/ASR/zipformer/train.py
index cfaa9bc92..bfb037f50 100755
--- a/egs/multi_ja_en/ASR/zipformer/train.py
+++ b/egs/multi_ja_en/ASR/zipformer/train.py
@@ -101,35 +101,12 @@ from icefall.utils import (
     get_parameter_groups_with_lrs,
     setup_logger,
     str2bool,
+    tokenize_by_ja_char,
 )
 
 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
 
 
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-
-    Note: All non-Japanese characters will be upper case.
-
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-
-    Args:
-      line:
-        The input text.
-
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
-
-
 def get_adjusted_batch_count(params: AttributeDict) -> float:
     # returns the number of batches we would have used so far if we had used the reference
     # duration.  This is for purposes of set_batch_count().
diff --git a/icefall/__init__.py b/icefall/__init__.py
index b1e4313e9..3077b8162 100644
--- a/icefall/__init__.py
+++ b/icefall/__init__.py
@@ -68,6 +68,7 @@ from .utils import (
     str2bool,
     subsequent_chunk_mask,
     tokenize_by_CJK_char,
+    tokenize_by_ja_char,
     write_error_stats,
 )
 
diff --git a/icefall/utils.py b/icefall/utils.py
index 9a25784cb..74ee9d427 100644
--- a/icefall/utils.py
+++ b/icefall/utils.py
@@ -1746,6 +1746,30 @@ def tokenize_by_CJK_char(line: str) -> str:
     return " ".join([w.strip() for w in chars if w.strip()])
 
 
+def tokenize_by_ja_char(line: str) -> str:
+    """
+    Separate each Japanese character with spaces; upper-case all other text.
+
+    Japanese here means hiragana and katakana (U+3040-U+30FF) plus CJK
+    ideographs (U+4E00-U+9FFF). Other runs are stripped but not split.
+
+    Example:
+      input = "こんにちは世界は hello world の日本語"
+      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
+
+    Args:
+      line: The input text; leading/trailing whitespace is ignored.
+
+    Return:
+      The Japanese characters and upper-cased runs joined by single spaces.
+    """
+    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
+    chars = pattern.split(line.strip())
+    return " ".join(
+        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
+    )
+
+
 def display_and_save_batch(
     batch: dict,
     params: AttributeDict,