def generate_tokens(lang_dir: Path):
    """Write ``tokens.txt`` for the BPE model stored in ``lang_dir``.

    Loads ``lang_dir / "bpe.model"`` with SentencePiece and writes one
    ``<piece> <id>`` line per vocabulary entry to ``lang_dir / "tokens.txt"``
    (UTF-8 encoded), overwriting the file if it already exists.

    Args:
      lang_dir:
        Directory containing ``bpe.model``; ``tokens.txt`` is written into
        the same directory.
    """
    processor = spm.SentencePieceProcessor()
    processor.load(str(lang_dir / "bpe.model"))

    # Collect the piece -> id mapping by walking the ids in ascending order.
    piece_to_id = {}
    for idx in range(processor.vocab_size()):
        piece_to_id[processor.id_to_piece(idx)] = idx

    tokens_path = lang_dir / "tokens.txt"
    with open(tokens_path, "w", encoding="utf-8") as out:
        out.writelines(
            f"{piece} {idx}\n" for piece, idx in piece_to_id.items()
        )