icefall/egs/mucs/ASR/local/prepare_lm_files.py
2023-04-19 16:17:06 +05:30

55 lines
1.7 KiB
Python
Executable File

#!/usr/bin/env python3
import argparse
import gzip
import logging
import os
import shutil
from pathlib import Path
from tqdm.auto import tqdm
def get_args():
    """Parse the command-line options of this script.

    Returns:
        An ``argparse.Namespace`` with the attributes ``out_dir``,
        ``data_path`` and ``mode``. Each one is a string, or ``None`` when
        the corresponding flag is not given on the command line.
    """
    cli = argparse.ArgumentParser()
    # (flag, help text) pairs; every option is a plain optional string.
    options = (
        ("--out-dir", "Output directory."),
        ("--data-path", "Input directory."),
        ("--mode", "Input split"),
    )
    for flag, description in options:
        cli.add_argument(flag, type=str, help=description)
    return cli.parse_args()
def read_text(path):
    """Read a transcript file and return the text of every line.

    Everything up to and including the first space of a line is discarded
    (presumably the utterance id -- confirm against the data layout); the
    remainder is returned unchanged, including any repeated spaces.

    Args:
        path: Path of the file to read.

    Returns:
        A list with one string per non-blank line, in file order. A line
        that contains no space yields an empty string. Blank lines
        (including the one implied by a trailing newline) are skipped.
    """
    transcripts = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for raw in f:
            line = raw.rstrip('\n')
            # A blank line carries no utterance; keeping it would add a
            # spurious empty transcript to the output.
            if not line.strip():
                continue
            # Same result as ' '.join(line.split(' ')[1:]): drop the first
            # space-delimited field and keep the rest verbatim.
            transcripts.append(line.partition(' ')[2])
    return transcripts
def create_files(text, out_dir=None):
    """Write the lexicon, vocabulary and text files derived from ``text``.

    Three UTF-8 files are created inside the output directory:

    * ``mucs_lexicon.txt`` -- ``word<TAB>c h a r s``: each distinct word
      followed by its characters joined with single spaces.
    * ``mucs_vocab.txt`` -- each distinct word on its own line.
    * ``mucs_vocab_text.txt`` -- every input line on its own line.

    Words are the non-blank pieces obtained by splitting a line on single
    spaces; they are listed in the order they are first seen.

    Args:
        text: Sequence of transcript strings. It is iterated twice, so it
            must not be a one-shot iterator.
        out_dir: Directory to write into. When ``None`` (the default) the
            module-level ``args.out_dir`` is used, as before.
    """
    if out_dir is None:
        out_dir = args.out_dir
    # Create the target directory up front instead of failing on first open.
    os.makedirs(out_dir, exist_ok=True)

    # dict keeps insertion order, so words stay in first-seen order.
    lexicon = {}
    for line in text:
        for word in line.split(' '):
            if not word.strip():
                continue
            if word not in lexicon:
                # Spell the word out character by character.
                lexicon[word] = ' '.join(word)

    # Write explicitly as UTF-8 so non-ASCII words do not depend on (or
    # fail under) the platform's default locale encoding.
    with open(os.path.join(out_dir, 'mucs_lexicon.txt'), 'w', encoding='utf-8') as f:
        for word, spelling in lexicon.items():
            f.write(word + '\t' + spelling + '\n')

    with open(os.path.join(out_dir, 'mucs_vocab.txt'), 'w', encoding='utf-8') as f:
        for word in lexicon:
            f.write(word + '\n')

    with open(os.path.join(out_dir, 'mucs_vocab_text.txt'), 'w', encoding='utf-8') as f:
        for line in text:
            f.write(line + '\n')
def main():
    """Generate the output files from the ``text`` file of the chosen split.

    Reads ``<data_path>/<mode>/text`` (both taken from the module-level
    ``args``) with ``read_text`` and hands the result to ``create_files``.
    """
    text_file = os.path.join(args.data_path, args.mode, "text")
    create_files(read_text(text_file))
if __name__ == "__main__":
    # Log lines carry a timestamp, the level and the emitting file:line.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )

    # `args` has to be module-level: main() and create_files() read it as a
    # global rather than receiving it as a parameter.
    args = get_args()

    logging.info(f"out_dir: {args.out_dir}")
    logging.info(f"in_dir: {args.data_path}")

    main()