icefall/egs/fisher_swbd/ASR/local/extract_json_cuts.py
2022-07-28 07:58:20 -04:00

65 lines
1.8 KiB
Python

#!/usr/bin/env python3
# Script to extract the cuts that correspond to a list of source audio files.
# It takes three arguments: a list of audio (.sph) files, the cuts JSONL file,
# and the output JSONL file.
import sys, json
import ntpath
USAGE = "usage: extract_json_cuts.py <sph_list> <cuts_jsonl> <out_jsonl>"


def _read_sph_basenames(list_path):
    """Return the set of basenames of the audio paths listed in *list_path*.

    The file holds one path per line. Empty lines are ignored so that they
    cannot introduce an empty basename into the lookup set.
    """
    with open(list_path, encoding="utf-8") as list_file:
        paths = (line.rstrip("\n") for line in list_file)
        # ntpath.basename splits on both "/" and "\\", so the comparison works
        # for paths written with either separator.
        return {ntpath.basename(path) for path in paths if path}


def _cut_matches(cut, wanted_basenames):
    """Return True if any source of the cut's recording has a wanted basename.

    *cut* is one decoded JSON object; it must provide
    ``cut["recording"]["sources"]``, a list of objects with a ``"source"`` key.
    """
    sources = cut["recording"]["sources"]
    return any(
        ntpath.basename(src["source"]) in wanted_basenames for src in sources
    )


def main(argv=None):
    """Copy the cuts whose audio source is in the given list to a new JSONL file.

    Args:
        argv: ``[sph_list, cuts_jsonl, out_jsonl]``. Defaults to
            ``sys.argv[1:]``. Any extra arguments are ignored.

    Raises:
        SystemExit: if fewer than three arguments are supplied.
        KeyError: if a cut lacks the ``recording``/``sources``/``source`` keys.
        json.JSONDecodeError: if a non-blank line is not valid JSON.

    Each selected cut is written exactly once, byte-for-byte as it appeared in
    the input, even when several of its sources match. The input manifest is
    streamed line by line, so the output path must differ from the input path.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 3:
        raise SystemExit(USAGE)
    sph_list_path, cuts_path, out_path = args[:3]

    # A set gives O(1) membership tests; the manifest may hold many cuts.
    wanted_basenames = _read_sph_basenames(sph_list_path)

    # JSON manifests are UTF-8; read and write with the same explicit encoding
    # instead of depending on the platform locale.
    with open(cuts_path, encoding="utf-8") as cuts_file:
        with open(out_path, "w", encoding="utf-8") as out_file:
            for raw_line in cuts_file:
                line = raw_line.rstrip("\n")
                if not line.strip():
                    continue  # skip blank / whitespace-only lines
                if _cut_matches(json.loads(line), wanted_basenames):
                    out_file.write(line + "\n")


if __name__ == "__main__":
    main()