fix repeated definition of tokenize_by_ja_char

Bailey Hirota 2025-01-07 14:30:13 +09:00
parent f4210013b7
commit 564b632eda
7 changed files with 30 additions and 120 deletions


@@ -25,29 +25,7 @@ from pathlib import Path
 from tqdm.auto import tqdm
 
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-    Note: All non-Japanese characters will be upper case.
-
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-
-    Args:
-      line:
-        The input text.
-
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
+from icefall.utils import tokenize_by_ja_char
 
 def get_args():


@@ -50,31 +50,7 @@ from prepare_lang import (
 )
 
 from icefall.byte_utils import byte_encode
-from icefall.utils import str2bool
-
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-    Note: All non-Japanese characters will be upper case.
-
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-
-    Args:
-      line:
-        The input text.
-
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
+from icefall.utils import str2bool, tokenize_by_ja_char
 
 def lexicon_to_fst_no_sil(


@@ -33,30 +33,7 @@ from pathlib import Path
 import sentencepiece as spm
 
 from icefall import byte_encode
-
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-    Note: All non-Japanese characters will be upper case.
-
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-
-    Args:
-      line:
-        The input text.
-
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
+from icefall.utils import tokenize_by_ja_char
 
 def get_args():


@@ -96,36 +96,13 @@ from icefall.utils import (
     setup_logger,
     store_transcripts,
     str2bool,
+    tokenize_by_ja_char,
     write_error_stats,
 )
 
 LOG_EPS = math.log(1e-10)
 
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-    Note: All non-Japanese characters will be upper case.
-
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-
-    Args:
-      line:
-        The input text.
-
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
 
 def get_parser():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter


@@ -101,35 +101,12 @@ from icefall.utils import (
     get_parameter_groups_with_lrs,
     setup_logger,
     str2bool,
+    tokenize_by_ja_char,
 )
 
 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
 
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-    Note: All non-Japanese characters will be upper case.
-
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-
-    Args:
-      line:
-        The input text.
-
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
 
 def get_adjusted_batch_count(params: AttributeDict) -> float:
     # returns the number of batches we would have used so far if we had used the reference
     # duration. This is for purposes of set_batch_count().


@@ -68,6 +68,7 @@ from .utils import (
     str2bool,
     subsequent_chunk_mask,
     tokenize_by_CJK_char,
+    tokenize_by_ja_char,
     write_error_stats,
 )
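With the re-export above in place (this hunk appears to be the package's __init__.py, given the relative .utils import), downstream recipes can presumably reach the helper at either level; both forms below are illustrative, not taken from the diff:

from icefall import tokenize_by_ja_char        # via the package-level re-export
from icefall.utils import tokenize_by_ja_char  # direct, as the updated scripts do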


@@ -1746,6 +1746,30 @@ def tokenize_by_CJK_char(line: str) -> str:
     return " ".join([w.strip() for w in chars if w.strip()])
 
 
+def tokenize_by_ja_char(line: str) -> str:
+    """
+    Tokenize a line of text with Japanese characters.
+    Note: All non-Japanese characters will be upper case.
+
+    Example:
+      input = "こんにちは世界は hello world の日本語"
+      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
+
+    Args:
+      line:
+        The input text.
+
+    Return:
+      A new string tokenized by Japanese characters.
+    """
+    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
+    chars = pattern.split(line.strip())
+    return " ".join(
+        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
+    )
+
+
 def display_and_save_batch(
     batch: dict,
     params: AttributeDict,
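For quick verification of the now-canonical helper, here is a minimal standalone sketch of its behavior; it inlines the same regex rather than importing icefall, so the names here are illustrative only:

import re

# Hiragana (U+3040-309F), katakana (U+30A0-30FF), and CJK ideographs
# (U+4E00-9FFF): the same character classes tokenize_by_ja_char uses.
_JA_CHAR = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")

def tokenize_by_ja_char(line: str) -> str:
    # Splitting on a capturing group keeps each Japanese character as its
    # own token; non-Japanese chunks are stripped and upper-cased, and
    # empty pieces between adjacent Japanese characters are dropped.
    chars = _JA_CHAR.split(line.strip())
    return " ".join(
        w.strip().upper() if not _JA_CHAR.match(w) else w
        for w in chars
        if w.strip()
    )

print(tokenize_by_ja_char("こんにちは世界は hello world の日本語"))
# こ ん に ち は 世 界 は HELLO WORLD の 日 本 語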