mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 01:52:41 +00:00
55 lines
1.7 KiB
Python
Executable File
55 lines
1.7 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
import argparse
|
|
import gzip
|
|
import logging
|
|
import os
|
|
import shutil
|
|
from pathlib import Path
|
|
|
|
from tqdm.auto import tqdm
|
|
|
|
|
|
def get_args():
    """Parse the command-line options of this script.

    All three options are mandatory.  The rest of the script passes them
    straight to ``os.path.join``, so an omitted option (which argparse would
    otherwise leave as ``None``) used to surface later as an unrelated
    ``TypeError``.  Marking them required makes argparse print a usage
    message and exit instead.

    Returns:
        argparse.Namespace: namespace with the string attributes
        ``out_dir``, ``data_path`` and ``mode``.

    Raises:
        SystemExit: raised by argparse when an option is missing or invalid.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--out-dir", type=str, required=True, help="Output directory."
    )
    parser.add_argument(
        "--data-path", type=str, required=True, help="Input directory."
    )
    parser.add_argument(
        "--mode", type=str, required=True, help="Input split"
    )
    return parser.parse_args()
|
|
|
|
def read_text(path):
    """Read a transcript file and return the text part of every line.

    Each line is treated as ``<first-field> <text ...>``: everything up to
    and including the first space is dropped and the remainder is kept
    verbatim (inner spacing is preserved).  A line that contains no space
    yields an empty string.

    Blank lines, including the empty entry that a trailing newline would
    otherwise produce, are skipped so they do not end up as empty
    utterances in the result.

    Args:
        path: path of the text file to read; it is decoded as UTF-8.

    Returns:
        list[str]: one entry per non-blank line, in file order.
    """
    texts = []
    with open(path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.rstrip("\n")
            if not line.strip():
                continue
            # partition(" ")[2] is everything after the first space, which is
            # exactly what joining split(" ")[1:] back with spaces produces.
            texts.append(line.partition(" ")[2])
    return texts
|
|
|
|
def create_files(text, out_dir=None):
    """Write the lexicon, vocabulary and text files derived from *text*.

    Three files are written into the output directory:

    * ``mucs_lexicon.txt``    - ``<word>\\t<c1> <c2> ...`` per unique word,
      where the right-hand side is the word's characters joined by spaces.
    * ``mucs_vocab.txt``      - one unique word per line.
    * ``mucs_vocab_text.txt`` - every entry of *text*, one per line.

    Words are obtained by splitting each line on single spaces; pieces that
    are empty or whitespace-only are ignored.  Words keep the order in which
    they are first seen.

    Args:
        text: iterable of transcript lines (strings).
        out_dir: directory to write into.  When ``None`` the function falls
            back to ``args.out_dir``, i.e. the module-level ``args`` that the
            script entry point sets, which keeps existing one-argument calls
            working.
    """
    if out_dir is None:
        # Backward-compatible default for callers that rely on the
        # module-level `args` populated under `if __name__ == "__main__"`.
        out_dir = args.out_dir
    if out_dir:
        # An empty string means "current directory"; makedirs("") would fail.
        os.makedirs(out_dir, exist_ok=True)

    lexicon = {}
    for line in text:
        for word in line.split(" "):
            if not word.strip():
                continue
            if word not in lexicon:
                lexicon[word] = " ".join(word)

    lexicon_path = os.path.join(out_dir, "mucs_lexicon.txt")
    with open(lexicon_path, "w", encoding="utf-8") as f:
        f.writelines(f"{word}\t{chars}\n" for word, chars in lexicon.items())

    vocab_path = os.path.join(out_dir, "mucs_vocab.txt")
    with open(vocab_path, "w", encoding="utf-8") as f:
        f.writelines(f"{word}\n" for word in lexicon)

    vocab_text_path = os.path.join(out_dir, "mucs_vocab_text.txt")
    with open(vocab_text_path, "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in text)
|
|
|
|
def main():
    """Build the lexicon/vocab files for the split selected on the command line.

    Reads the module-level ``args`` (set by the script entry point): the
    transcript is loaded from ``<data_path>/<mode>/text`` and handed to
    ``create_files``.
    """
    split_dir = os.path.join(args.data_path, args.mode)
    transcript_file = os.path.join(split_dir, "text")
    create_files(read_text(transcript_file))
|
|
|
|
if __name__ == "__main__":
    # Log records carry timestamp, level and the emitting file:line.
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(level=logging.INFO, format=formatter)

    # `args` is deliberately module-level: main() and create_files() read it.
    args = get_args()

    logging.info(f"out_dir: {args.out_dir}")
    logging.info(f"in_dir: {args.data_path}")

    main()
|