From 564b632eda65ea678721b0de8ce42364befe0424 Mon Sep 17 00:00:00 2001 From: Bailey Hirota Date: Tue, 7 Jan 2025 14:30:13 +0900 Subject: [PATCH] fix repeated definition of tokenize_by_ja_char --- .../ASR/local/prepare_for_bpe_model.py | 24 +---------------- .../ASR/local/prepare_lang_bbpe.py | 26 +------------------ egs/multi_ja_en/ASR/local/train_bbpe_model.py | 25 +----------------- egs/multi_ja_en/ASR/zipformer/decode.py | 25 +----------------- egs/multi_ja_en/ASR/zipformer/train.py | 25 +----------------- icefall/__init__.py | 1 + icefall/utils.py | 24 +++++++++++++++++ 7 files changed, 30 insertions(+), 120 deletions(-) diff --git a/egs/multi_ja_en/ASR/local/prepare_for_bpe_model.py b/egs/multi_ja_en/ASR/local/prepare_for_bpe_model.py index 70846140b..27832ad1b 100755 --- a/egs/multi_ja_en/ASR/local/prepare_for_bpe_model.py +++ b/egs/multi_ja_en/ASR/local/prepare_for_bpe_model.py @@ -25,29 +25,7 @@ from pathlib import Path from tqdm.auto import tqdm - -def tokenize_by_ja_char(line: str) -> str: - """ - Tokenize a line of text with Japanese characters. - - Note: All non-Japanese characters will be upper case. - - Example: - input = "こんにちは世界は hello world の日本語" - output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語" - - Args: - line: - The input text. - - Return: - A new string tokenized by Japanese characters. - """ - pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])") - chars = pattern.split(line.strip()) - return " ".join( - [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()] - ) +from icefall.utils import tokenize_by_ja_char def get_args(): diff --git a/egs/multi_ja_en/ASR/local/prepare_lang_bbpe.py b/egs/multi_ja_en/ASR/local/prepare_lang_bbpe.py index cbf6ccca5..6134710ad 100755 --- a/egs/multi_ja_en/ASR/local/prepare_lang_bbpe.py +++ b/egs/multi_ja_en/ASR/local/prepare_lang_bbpe.py @@ -50,31 +50,7 @@ from prepare_lang import ( ) from icefall.byte_utils import byte_encode -from icefall.utils import str2bool - - -def tokenize_by_ja_char(line: str) -> str: - """ - Tokenize a line of text with Japanese characters. - - Note: All non-Japanese characters will be upper case. - - Example: - input = "こんにちは世界は hello world の日本語" - output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語" - - Args: - line: - The input text. - - Return: - A new string tokenized by Japanese characters. - """ - pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])") - chars = pattern.split(line.strip()) - return " ".join( - [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()] - ) +from icefall.utils import str2bool, tokenize_by_ja_char def lexicon_to_fst_no_sil( diff --git a/egs/multi_ja_en/ASR/local/train_bbpe_model.py b/egs/multi_ja_en/ASR/local/train_bbpe_model.py index 4d7dd1225..d104f2717 100755 --- a/egs/multi_ja_en/ASR/local/train_bbpe_model.py +++ b/egs/multi_ja_en/ASR/local/train_bbpe_model.py @@ -33,30 +33,7 @@ from pathlib import Path import sentencepiece as spm from icefall import byte_encode - - -def tokenize_by_ja_char(line: str) -> str: - """ - Tokenize a line of text with Japanese characters. - - Note: All non-Japanese characters will be upper case. - - Example: - input = "こんにちは世界は hello world の日本語" - output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語" - - Args: - line: - The input text. - - Return: - A new string tokenized by Japanese characters. - """ - pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])") - chars = pattern.split(line.strip()) - return " ".join( - [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()] - ) +from icefall.utils import tokenize_by_ja_char def get_args(): diff --git a/egs/multi_ja_en/ASR/zipformer/decode.py b/egs/multi_ja_en/ASR/zipformer/decode.py index 591baa85d..26ce3e018 100755 --- a/egs/multi_ja_en/ASR/zipformer/decode.py +++ b/egs/multi_ja_en/ASR/zipformer/decode.py @@ -96,36 +96,13 @@ from icefall.utils import ( setup_logger, store_transcripts, str2bool, + tokenize_by_ja_char, write_error_stats, ) LOG_EPS = math.log(1e-10) -def tokenize_by_ja_char(line: str) -> str: - """ - Tokenize a line of text with Japanese characters. - - Note: All non-Japanese characters will be upper case. - - Example: - input = "こんにちは世界は hello world の日本語" - output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語" - - Args: - line: - The input text. - - Return: - A new string tokenized by Japanese characters. - """ - pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])") - chars = pattern.split(line.strip()) - return " ".join( - [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()] - ) - - def get_parser(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter diff --git a/egs/multi_ja_en/ASR/zipformer/train.py b/egs/multi_ja_en/ASR/zipformer/train.py index cfaa9bc92..bfb037f50 100755 --- a/egs/multi_ja_en/ASR/zipformer/train.py +++ b/egs/multi_ja_en/ASR/zipformer/train.py @@ -101,35 +101,12 @@ from icefall.utils import ( get_parameter_groups_with_lrs, setup_logger, str2bool, + tokenize_by_ja_char, ) LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler] -def tokenize_by_ja_char(line: str) -> str: - """ - Tokenize a line of text with Japanese characters. - - Note: All non-Japanese characters will be upper case. - - Example: - input = "こんにちは世界は hello world の日本語" - output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語" - - Args: - line: - The input text. - - Return: - A new string tokenized by Japanese characters. - """ - pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])") - chars = pattern.split(line.strip()) - return " ".join( - [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()] - ) - - def get_adjusted_batch_count(params: AttributeDict) -> float: # returns the number of batches we would have used so far if we had used the reference # duration. This is for purposes of set_batch_count(). diff --git a/icefall/__init__.py b/icefall/__init__.py index b1e4313e9..3077b8162 100644 --- a/icefall/__init__.py +++ b/icefall/__init__.py @@ -68,6 +68,7 @@ from .utils import ( str2bool, subsequent_chunk_mask, tokenize_by_CJK_char, + tokenize_by_ja_char, write_error_stats, ) diff --git a/icefall/utils.py b/icefall/utils.py index 9a25784cb..74ee9d427 100644 --- a/icefall/utils.py +++ b/icefall/utils.py @@ -1746,6 +1746,30 @@ def tokenize_by_CJK_char(line: str) -> str: return " ".join([w.strip() for w in chars if w.strip()]) +def tokenize_by_ja_char(line: str) -> str: + """ + Tokenize a line of text with Japanese characters. + + Note: All non-Japanese characters will be upper case. + + Example: + input = "こんにちは世界は hello world の日本語" + output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語" + + Args: + line: + The input text. + + Return: + A new string tokenized by Japanese characters. + """ + pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])") + chars = pattern.split(line.strip()) + return " ".join( + [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()] + ) + + def display_and_save_batch( batch: dict, params: AttributeDict,