Henry Li Xinyuan b07d5472c5
Implement recipe for Fluent Speech Commands dataset (#1469)

Signed-off-by: Xinyuan Li <xli257@c13.clsp.jhu.edu>
2024-01-31 22:53:36 +08:00


"""
Generate lexicon files for the Fluent Speech Commands recipe.

Reads the training metadata CSV shipped with the corpus and writes two
word-to-id mapping files into the LM directory:
  - words_transcript.txt: vocabulary of the spoken transcriptions
  - words_frames.txt: vocabulary of the (action, object, location) frame labels
"""

import argparse

import pandas
from tqdm import tqdm


def generate_lexicon(corpus_dir, lm_dir):
    data = pandas.read_csv(
        str(corpus_dir) + "/data/train_data.csv", index_col=0, header=0
    )
    vocab_transcript = set()
    vocab_frames = set()
    transcripts = data["transcription"].tolist()
    # Each frame is an (action, object, location) triple for one utterance.
    frames = list(
        zip(
            data["action"].tolist(), data["object"].tolist(), data["location"].tolist()
        )
    )

    # Collect the word-level vocabulary of the transcriptions.
    for transcript in tqdm(transcripts):
        for word in transcript.split():
            vocab_transcript.add(word)

    # Collect the frame-label vocabulary; multi-word labels are collapsed
    # into a single token by joining their parts with underscores.
    for frame in tqdm(frames):
        for word in frame:
            vocab_frames.add("_".join(word.split()))

    # Write the transcription vocabulary as "<symbol> <id>" lines.
    with open(lm_dir + "/words_transcript.txt", "w") as lexicon_transcript_file:
        lexicon_transcript_file.write("<UNK> 1" + "\n")
        lexicon_transcript_file.write("<s> 2" + "\n")
        lexicon_transcript_file.write("</s> 0" + "\n")
        id = 3
        for vocab in vocab_transcript:
            lexicon_transcript_file.write(vocab + " " + str(id) + "\n")
            id += 1

    # Write the frame-label vocabulary in the same format.
    with open(lm_dir + "/words_frames.txt", "w") as lexicon_frames_file:
        lexicon_frames_file.write("<UNK> 1" + "\n")
        lexicon_frames_file.write("<s> 2" + "\n")
        lexicon_frames_file.write("</s> 0" + "\n")
        id = 3
        for vocab in vocab_frames:
            lexicon_frames_file.write(vocab + " " + str(id) + "\n")
            id += 1


parser = argparse.ArgumentParser(
    description="Generate lexicon files for the Fluent Speech Commands recipe."
)
parser.add_argument("corpus_dir", help="Root directory of the downloaded corpus.")
parser.add_argument("lm_dir", help="Output directory for the lexicon files.")


def main():
    args = parser.parse_args()
    generate_lexicon(args.corpus_dir, args.lm_dir)


if __name__ == "__main__":
    main()
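

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original recipe): the two files written
# above use a plain "<symbol> <integer-id>" layout, one entry per line. A
# downstream stage could load such a file back into a symbol table as shown
# below; the helper name and the example path are hypothetical.
# ---------------------------------------------------------------------------
def read_symbol_table(path):
    """Read a words_*.txt file into a dict mapping symbol -> integer id."""
    table = {}
    with open(path) as f:
        for line in f:
            symbol, idx = line.split()
            table[symbol] = int(idx)
    return table


# Example usage (assuming lm_dir was "data/lm"):
#   table = read_symbol_table("data/lm/words_frames.txt")
#   table["<UNK>"]  # -> 1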