mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 01:52:41 +00:00
check bbpe model exists in advance. (#1277)
This commit is contained in:
parent
a5ba1133c4
commit
8181d19860
@ -15,7 +15,6 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
# You can install sentencepiece via:
|
# You can install sentencepiece via:
|
||||||
#
|
#
|
||||||
# pip install sentencepiece
|
# pip install sentencepiece
|
||||||
@ -26,12 +25,12 @@
|
|||||||
# Please install a version >=0.1.96
|
# Please install a version >=0.1.96
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import re
|
|
||||||
import shutil
|
import shutil
|
||||||
import tempfile
|
import tempfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sentencepiece as spm
|
import sentencepiece as spm
|
||||||
|
|
||||||
from icefall import byte_encode, tokenize_by_CJK_char
|
from icefall import byte_encode, tokenize_by_CJK_char
|
||||||
|
|
||||||
|
|
||||||
@ -74,6 +73,11 @@ def main():
|
|||||||
model_type = "unigram"
|
model_type = "unigram"
|
||||||
|
|
||||||
model_prefix = f"{lang_dir}/{model_type}_{vocab_size}"
|
model_prefix = f"{lang_dir}/{model_type}_{vocab_size}"
|
||||||
|
model_file = Path(model_prefix + ".model")
|
||||||
|
if model_file.is_file():
|
||||||
|
print(f"{model_file} exists - skipping")
|
||||||
|
return
|
||||||
|
|
||||||
character_coverage = 1.0
|
character_coverage = 1.0
|
||||||
input_sentence_size = 100000000
|
input_sentence_size = 100000000
|
||||||
|
|
||||||
@ -88,8 +92,6 @@ def main():
|
|||||||
|
|
||||||
_convert_to_bchar(args.transcript, train_text)
|
_convert_to_bchar(args.transcript, train_text)
|
||||||
|
|
||||||
model_file = Path(model_prefix + ".model")
|
|
||||||
if not model_file.is_file():
|
|
||||||
spm.SentencePieceTrainer.train(
|
spm.SentencePieceTrainer.train(
|
||||||
input=train_text,
|
input=train_text,
|
||||||
vocab_size=vocab_size,
|
vocab_size=vocab_size,
|
||||||
@ -102,9 +104,6 @@ def main():
|
|||||||
bos_id=-1,
|
bos_id=-1,
|
||||||
eos_id=-1,
|
eos_id=-1,
|
||||||
)
|
)
|
||||||
else:
|
|
||||||
print(f"{model_file} exists - skipping")
|
|
||||||
return
|
|
||||||
|
|
||||||
shutil.copyfile(model_file, f"{lang_dir}/bbpe.model")
|
shutil.copyfile(model_file, f"{lang_dir}/bbpe.model")
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user