mirror of https://github.com/k2-fsa/icefall.git
synced 2025-08-10 18:42:19 +00:00

fix repeated definition of tokenize_by_ja_char

The helper tokenize_by_ja_char had been copy-pasted into several recipe scripts; this commit keeps a single definition in icefall.utils, re-exports it from the icefall package, and switches every call site to import it instead.

This commit is contained in:
parent f4210013b7
commit 564b632eda
@@ -25,29 +25,7 @@ from pathlib import Path
 
 from tqdm.auto import tqdm
 
-
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-
-    Note: All non-Japanese characters will be upper case.
-
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-
-    Args:
-      line:
-        The input text.
-
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
+from icefall.utils import tokenize_by_ja_char
 
 
 def get_args():
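The character class in the helper covers hiragana (U+3040 to U+309F), katakana (U+30A0 to U+30FF), and the CJK Unified Ideographs block used for kanji (U+4E00 to U+9FFF). Below is a minimal, self-contained check of the documented behavior; the function body is copied verbatim from the diff, and only the import, the comments, and the assert are additions:

import re


def tokenize_by_ja_char(line: str) -> str:
    # Split on every hiragana/katakana/kanji character; the capturing
    # group keeps the matched characters in the list returned by split().
    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
    chars = pattern.split(line.strip())
    # Japanese characters pass through unchanged; every other fragment is
    # stripped and upper-cased, and empty fragments are dropped.
    return " ".join(
        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
    )


# Reproduces the example from the docstring.
assert (
    tokenize_by_ja_char("こんにちは世界は hello world の日本語")
    == "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
)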
@@ -50,31 +50,7 @@ from prepare_lang import (
 )
 
 from icefall.byte_utils import byte_encode
-from icefall.utils import str2bool
-
-
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-
-    Note: All non-Japanese characters will be upper case.
-
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-
-    Args:
-      line:
-        The input text.
-
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
+from icefall.utils import str2bool, tokenize_by_ja_char
 
 
 def lexicon_to_fst_no_sil(
@@ -33,30 +33,7 @@ from pathlib import Path
 import sentencepiece as spm
 
 from icefall import byte_encode
-
-
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-
-    Note: All non-Japanese characters will be upper case.
-
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-
-    Args:
-      line:
-        The input text.
-
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
+from icefall.utils import tokenize_by_ja_char
 
 
 def get_args():
@@ -96,36 +96,13 @@ from icefall.utils import (
     setup_logger,
     store_transcripts,
     str2bool,
+    tokenize_by_ja_char,
     write_error_stats,
 )
 
 LOG_EPS = math.log(1e-10)
-
-
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-
-    Note: All non-Japanese characters will be upper case.
-
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-
-    Args:
-      line:
-        The input text.
-
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
 
 
 def get_parser():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
@@ -101,35 +101,12 @@ from icefall.utils import (
     get_parameter_groups_with_lrs,
     setup_logger,
     str2bool,
+    tokenize_by_ja_char,
 )
 
 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
-
-
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-
-    Note: All non-Japanese characters will be upper case.
-
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-
-    Args:
-      line:
-        The input text.
-
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
 
 
 def get_adjusted_batch_count(params: AttributeDict) -> float:
     # returns the number of batches we would have used so far if we had used the reference
     # duration. This is for purposes of set_batch_count().
@@ -68,6 +68,7 @@ from .utils import (
     str2bool,
     subsequent_chunk_mask,
     tokenize_by_CJK_char,
+    tokenize_by_ja_char,
     write_error_stats,
 )
@@ -1746,6 +1746,30 @@ def tokenize_by_CJK_char(line: str) -> str:
     return " ".join([w.strip() for w in chars if w.strip()])
 
 
+def tokenize_by_ja_char(line: str) -> str:
+    """
+    Tokenize a line of text with Japanese characters.
+
+    Note: All non-Japanese characters will be upper case.
+
+    Example:
+      input = "こんにちは世界は hello world の日本語"
+      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
+
+    Args:
+      line:
+        The input text.
+
+    Return:
+      A new string tokenized by Japanese characters.
+    """
+    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
+    chars = pattern.split(line.strip())
+    return " ".join(
+        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
+    )
+
+
 def display_and_save_batch(
     batch: dict,
     params: AttributeDict,
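With the shared definition in place, a call site only needs the import. A hypothetical usage sketch (the input file name is illustrative, not part of the commit):

from icefall.utils import tokenize_by_ja_char

# Tokenize each transcript line before writing out training text.
with open("transcript.txt", encoding="utf-8") as f:  # illustrative path
    for line in f:
        print(tokenize_by_ja_char(line))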