Check whether the bbpe model exists in advance. (#1277)

This commit is contained in:
yaguang 2023-09-27 17:35:26 +08:00 committed by GitHub
parent a5ba1133c4
commit 8181d19860
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -15,7 +15,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# You can install sentencepiece via: # You can install sentencepiece via:
# #
# pip install sentencepiece # pip install sentencepiece
@ -26,12 +25,12 @@
# Please install a version >=0.1.96 # Please install a version >=0.1.96
import argparse import argparse
import re
import shutil import shutil
import tempfile import tempfile
from pathlib import Path from pathlib import Path
import sentencepiece as spm import sentencepiece as spm
from icefall import byte_encode, tokenize_by_CJK_char from icefall import byte_encode, tokenize_by_CJK_char
@ -74,6 +73,11 @@ def main():
model_type = "unigram" model_type = "unigram"
model_prefix = f"{lang_dir}/{model_type}_{vocab_size}" model_prefix = f"{lang_dir}/{model_type}_{vocab_size}"
model_file = Path(model_prefix + ".model")
if model_file.is_file():
print(f"{model_file} exists - skipping")
return
character_coverage = 1.0 character_coverage = 1.0
input_sentence_size = 100000000 input_sentence_size = 100000000
@ -88,8 +92,6 @@ def main():
_convert_to_bchar(args.transcript, train_text) _convert_to_bchar(args.transcript, train_text)
model_file = Path(model_prefix + ".model")
if not model_file.is_file():
spm.SentencePieceTrainer.train( spm.SentencePieceTrainer.train(
input=train_text, input=train_text,
vocab_size=vocab_size, vocab_size=vocab_size,
@ -102,9 +104,6 @@ def main():
bos_id=-1, bos_id=-1,
eos_id=-1, eos_id=-1,
) )
else:
print(f"{model_file} exists - skipping")
return
shutil.copyfile(model_file, f"{lang_dir}/bbpe.model") shutil.copyfile(model_file, f"{lang_dir}/bbpe.model")