mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-04 06:34:20 +00:00
47 lines
1.6 KiB
Python
Executable File
47 lines
1.6 KiB
Python
Executable File
import pandas, argparse
|
|
from tqdm import tqdm
|
|
|
|
def _write_word_table(path, words):
    """Write a word-to-id table to *path*, one ``<word> <id>`` pair per line.

    The three special symbols are written first with fixed ids
    (``<UNK>`` = 1, ``<s>`` = 2, ``</s>`` = 0); every entry of *words*
    then receives a consecutive id starting at 3.
    """
    with open(path, 'w') as table_file:
        table_file.write("<UNK> 1" + '\n')
        table_file.write("<s> 2" + '\n')
        table_file.write("</s> 0" + '\n')
        # Iterate in sorted order: the iteration order of a set of strings
        # changes from run to run (string hash randomization), which would
        # otherwise hand out different ids on every invocation.
        for word_id, word in enumerate(sorted(words), start=3):
            table_file.write(word + ' ' + str(word_id) + '\n')


def generate_lexicon(corpus_dir, lm_dir):
    """Build the transcript and frame word tables from the training CSV.

    Reads ``<corpus_dir>/data/train_data.csv`` (first column used as the
    index, first row as the header) and collects two vocabularies:

    * every whitespace-separated token of the ``transcription`` column;
    * every value of the ``action``, ``object`` and ``location`` columns,
      with internal whitespace replaced by ``_`` so that each value is a
      single token.

    The vocabularies are written to ``<lm_dir>/words_transcript.txt`` and
    ``<lm_dir>/words_frames.txt`` respectively.

    Args:
        corpus_dir: Root directory of the corpus (string or path-like).
        lm_dir: Existing directory that receives the two word tables
            (string or path-like).
    """
    data = pandas.read_csv(
        str(corpus_dir) + '/data/train_data.csv', index_col=0, header=0
    )

    transcripts = data['transcription'].tolist()
    frames = list(
        zip(
            data['action'].tolist(),
            data['object'].tolist(),
            data['location'].tolist(),
        )
    )

    vocab_transcript = set()
    for transcript in tqdm(transcripts):
        vocab_transcript.update(transcript.split())

    vocab_frames = set()
    for frame in tqdm(frames):
        for slot_value in frame:
            # Multi-word slot values become one underscore-joined token.
            vocab_frames.add('_'.join(slot_value.split()))

    # str() mirrors the handling of corpus_dir so path-like values work too.
    _write_word_table(str(lm_dir) + '/words_transcript.txt', vocab_transcript)
    _write_word_table(str(lm_dir) + '/words_frames.txt', vocab_frames)
|
|
|
|
|
|
# Command-line interface: two required positional arguments, in this order.
parser = argparse.ArgumentParser(
    description='Generate the word tables words_transcript.txt and '
                'words_frames.txt from the training CSV of a corpus.'
)
parser.add_argument(
    'corpus_dir',
    help='corpus root directory; <corpus_dir>/data/train_data.csv is read',
)
parser.add_argument(
    'lm_dir',
    help='directory where words_transcript.txt and words_frames.txt '
         'are written',
)
|
|
|
|
def main():
    """Parse the command line and generate the lexicon files.

    Takes the ``corpus_dir`` and ``lm_dir`` positionals from the
    module-level ``parser`` and forwards them to ``generate_lexicon``.
    """
    cli = parser.parse_args()
    corpus_dir, lm_dir = cli.corpus_dir, cli.lm_dir
    generate_lexicon(corpus_dir, lm_dir)
|
|
|
|
# Run the CLI only when this file is executed as a script, so that merely
# importing the module does not parse sys.argv or write any files.
if __name__ == "__main__":
    main()