From 9a446bca163dc05a8ddddf1afde26d4daeff1fd3 Mon Sep 17 00:00:00 2001 From: hediehloo Date: Sun, 30 Nov 2025 09:36:29 +0000 Subject: [PATCH] add pipline --- .vscode/launch.json | 16 ++++++++ src/configuration.py | 2 +- src/pipline.py.py | 93 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 .vscode/launch.json create mode 100644 src/pipline.py.py diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..7774467 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,16 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + + { + "name": "Python Debugger: Current File", + "type": "debugpy", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal" + } + ] +} \ No newline at end of file diff --git a/src/configuration.py b/src/configuration.py index 3acbbd1..c4bea68 100644 --- a/src/configuration.py +++ b/src/configuration.py @@ -162,7 +162,7 @@ Ensure to generate only the JSON output with content in English. # for key in data: # example[key] = data[key] - config["length"] = random.choice([20, 40, 80, 160]) + config["length"] = random.choice([10, 20, 40, 80]) return config diff --git a/src/pipline.py.py b/src/pipline.py.py new file mode 100644 index 0000000..9149841 --- /dev/null +++ b/src/pipline.py.py @@ -0,0 +1,93 @@ +import json +import os +import requests +import tqdm +import faiss +import numpy +import importlib +from openai import OpenAI +from dotenv import load_dotenv +import re +import random +import pandas as pd + + + +def import_lib(path, file_name, package_name): + file_path = path + "/" + file_name + ".py" + spec = importlib.util.spec_from_file_location(file_name, file_path) + imported_file = importlib.util.module_from_spec(spec) + spec.loader.exec_module(imported_file) + return getattr(imported_file, package_name) + + +Configuration = import_lib(os.path.dirname(__file__) , "configuration", "Configuration") +QueryGenerator = import_lib(os.path.dirname(__file__) , "query_generator", "QueryGenerator") + + +class Pipline: + def __init__(self): + self.file_path = os.path.dirname(__file__) + self.configuration = Configuration() + self.configuration.init_persona() + self.query_generator = QueryGenerator() + + def load_data(self): + df = pd.read_csv(self.file_path + "/../data/persian_blog/blogs.csv") + rows = df.values.tolist() + rows = [rows[i][0] for i in range(len(rows))] + return rows + + def save_dataset(self, data): + path = self.file_path + "/../data/generated" + if not os.path.exists(path): + os.makedirs(path) + + files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))] + + pattern = r"^v(\d+)_dataset\.json$" + + all_numbers = [] + + for f in files: + match = re.match(pattern, f) + if match: + num = int(match.group(1)) + all_numbers.append(num) + + if all_numbers: + number = max(all_numbers) + 1 + else: + number = 1 + + with open(path + "/v" + str(number) + "_dataset.json", "w", encoding="utf-8") as f: + json.dump(data, f, ensure_ascii=False, indent=2) + + + def run(self): + data = self.load_data() + + num_data = 10 + + dataset = [] + for i in range(num_data): + config = self.configuration.run(data[i]) + generated_data = self.query_generator.run(data[i], config) + one_data = config.copy() + one_data["document"] = data[i] + one_data["query"] = generated_data + dataset += [one_data] + + self.save_dataset(dataset) + + + + +def main(): + pipline = Pipline() + + pipline.run() + + +if __name__ == "__main__": + main() \ No newline at end of file