diff --git a/src/configuration.py b/src/configuration.py index 659522c..97cb2a1 100644 --- a/src/configuration.py +++ b/src/configuration.py @@ -160,8 +160,7 @@ Ensure to generate only the JSON output with content in English. # for key in data: # example[key] = data[key] - config["length"] = random.choice([10, 20, 40, 80, 150]) - + config["length"] = random.choice([5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 40, 60, 80, 100, 150]) return config diff --git a/src/pipline.py.py b/src/pipline.py.py index cc0932b..efe65af 100644 --- a/src/pipline.py.py +++ b/src/pipline.py.py @@ -5,7 +5,7 @@ import re import random import tqdm import pandas as pd - +import traceback def import_lib(path, file_name, package_name): file_path = path + "/" + file_name + ".py" @@ -72,11 +72,14 @@ class Pipline: def exec_function(self, passage): - config = self.configuration.run(passage) - generated_data = self.query_generator.run(passage, config) - one_data = config.copy() - one_data["document"] = passage - one_data["query"] = generated_data["query"] + try: + config = self.configuration.run(passage) + generated_data = self.query_generator.run(passage, config) + one_data = config.copy() + one_data["document"] = passage + one_data["query"] = generated_data["query"] + except Exception as e: + one_data = {"passage": passage, "error": traceback.format_exc()} return one_data @@ -128,7 +131,7 @@ class Pipline: data = self.load_data() chunk_data = self.pre_process(data) - num_data = 20 + num_data = 25000 num_threads = 5 parallel_requester = ParallelRequester()