import os
import re
import json


class DatasetMerger:
    """Merge the ``part_<N>_dataset.json`` files of one generated version.

    Part files are looked up in ``<module dir>/../data/generated/<version>``
    and concatenated, in ascending part-number order, into a single
    ``merged_dataset_v<version>.json`` file written to that same directory.
    """

    # Only files named exactly ``part_<digits>_dataset.json`` are merged; the
    # captured digits give the part index used for ordering.
    _PART_PATTERN = re.compile(r"^part_(\d+)_dataset\.json$")

    def __init__(self):
        # Resolve to an absolute directory: a bare relative ``__file__`` would
        # otherwise yield "" and make the data path resolve against "/".
        self.file_path = os.path.dirname(os.path.abspath(__file__))

    def load_one_dataset(self, path):
        """Return the parsed JSON content of the file at *path*.

        The file is read as UTF-8 to match how ``save_merged_dataset`` writes.
        """
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    def save_merged_dataset(self, data, path, version):
        """Write *data* as pretty-printed UTF-8 JSON inside directory *path*.

        The output file is named ``merged_dataset_v<version>.json``.
        """
        # NOTE(review): a version that already starts with "v" (e.g. "v8")
        # produces "merged_dataset_vv8.json"; the name is kept unchanged here
        # in case other tooling depends on it -- confirm the intended name.
        save_path = os.path.join(
            path, "merged_dataset_v" + str(version) + ".json"
        )
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

    def merge(self, version):
        """Merge every part file of *version* and save the combined dataset.

        Args:
            version: Name of the sub-directory of ``data/generated`` that
                holds the part files (a string such as ``"v8"``).

        Raises:
            FileNotFoundError: If the version directory does not exist.
        """
        path = os.path.join(self.file_path, "..", "data", "generated", version)

        # Collect (part_number, filename) pairs so the sort is numeric:
        # a plain string sort would place "part_10" before "part_2".
        parts = []
        for name in os.listdir(path):
            match = self._PART_PATTERN.match(name)
            if match and os.path.isfile(os.path.join(path, name)):
                parts.append((int(match.group(1)), name))
        parts.sort()

        all_dataset = []
        for _, name in parts:
            # Assumes each part file holds a JSON array -- TODO confirm.
            all_dataset += self.load_one_dataset(os.path.join(path, name))

        self.save_merged_dataset(all_dataset, path, version)


def main():
    """Merge the part files of the hard-coded dataset version."""
    dataset_merger = DatasetMerger()

    version = "v8"
    dataset_merger.merge(version)


if __name__ == "__main__":
    main()