Add a --character-coverage option to avoid having many rarely used tokens in the BPE vocabulary; add a --byte-fallback option to enable byte fallback when training BPE

This commit is contained in:
marcoyang1998 2023-07-19 10:55:57 +08:00
parent b53c0d1e5f
commit 0d1cd4f595

View File

@ -31,6 +31,8 @@ from pathlib import Path
import sentencepiece as spm import sentencepiece as spm
from icefall.utils import str2bool
def get_args(): def get_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
@ -54,11 +56,25 @@ def get_args():
help="Vocabulary size for BPE training", help="Vocabulary size for BPE training",
) )
parser.add_argument(
"--byte-fallback",
type=str2bool,
default=False,
)
parser.add_argument(
"--character-coverage",
type=float,
default=0.99,
help="Character coverage when training BPE",
)
return parser.parse_args() return parser.parse_args()
def main(): def main():
args = get_args() args = get_args()
print(args)
vocab_size = args.vocab_size vocab_size = args.vocab_size
lang_dir = Path(args.lang_dir) lang_dir = Path(args.lang_dir)
@ -83,12 +99,13 @@ def main():
model_type=model_type, model_type=model_type,
model_prefix=model_prefix, model_prefix=model_prefix,
input_sentence_size=input_sentence_size, input_sentence_size=input_sentence_size,
character_coverage=character_coverage, character_coverage=args.character_coverage,
user_defined_symbols=user_defined_symbols, user_defined_symbols=user_defined_symbols,
unk_id=unk_id, unk_id=unk_id,
bos_id=-1, bos_id=-1,
eos_id=-1, eos_id=-1,
train_extremely_large_corpus=False, train_extremely_large_corpus=False,
byte_fallback=args.byte_fallback,
) )
else: else:
print(f"{model_file} exists - skipping") print(f"{model_file} exists - skipping")