From 9a446bca163dc05a8ddddf1afde26d4daeff1fd3 Mon Sep 17 00:00:00 2001
From: hediehloo <alirezahediehloo@gmail.com>
Date: Sun, 30 Nov 2025 09:36:29 +0000
Subject: [PATCH] Add pipeline

---
 .vscode/launch.json  | 16 ++++++++
 src/configuration.py |  2 +-
 src/pipline.py.py    | 93 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 110 insertions(+), 1 deletion(-)
 create mode 100644 .vscode/launch.json
 create mode 100644 src/pipline.py.py

diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..7774467
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,16 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        
+        {
+            "name": "Python Debugger: Current File",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/src/configuration.py b/src/configuration.py
index 3acbbd1..c4bea68 100644
--- a/src/configuration.py
+++ b/src/configuration.py
@@ -162,7 +162,7 @@ Ensure to generate only the JSON output with content in English.
         # for key in data:
         #     example[key] = data[key]
 
-        config["length"] = random.choice([20, 40, 80, 160])
+        config["length"] = random.choice([10, 20, 40, 80])
 
 
         return config
diff --git a/src/pipline.py.py b/src/pipline.py.py
new file mode 100644
index 0000000..9149841
--- /dev/null
+++ b/src/pipline.py.py
@@ -0,0 +1,93 @@
+import json
+import os
+import requests
+import tqdm
+import faiss
+import numpy
+import importlib
+from openai import OpenAI
+from dotenv import load_dotenv
+import re
+import random
+import pandas as pd
+
+
+
+def import_lib(path, file_name, package_name):
+    """Load `<path>/<file_name>.py` by file location and return its `package_name` attribute."""
+    import importlib.util  # plain `import importlib` does not guarantee the `util` submodule
+    spec = importlib.util.spec_from_file_location(file_name, os.path.join(path, file_name + ".py"))
+    imported_file = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(imported_file)
+    return getattr(imported_file, package_name)
+
+Configuration = import_lib(os.path.dirname(__file__) , "configuration", "Configuration")  # from configuration.py beside this file
+QueryGenerator = import_lib(os.path.dirname(__file__) , "query_generator", "QueryGenerator")  # from query_generator.py beside this file
+
+
+class Pipline:
+    """Build a query dataset from blog documents and save it as versioned JSON."""
+
+    def __init__(self):
+        """Instantiate Configuration (calling its init_persona()) and QueryGenerator."""
+        # Data paths are resolved relative to the directory containing this file.
+        self.file_path = os.path.dirname(__file__)
+        self.configuration = Configuration()
+        self.configuration.init_persona()
+        self.query_generator = QueryGenerator()
+
+    def load_data(self):
+        """Return the first column of data/persian_blog/blogs.csv as a list."""
+        csv_path = os.path.join(self.file_path, "..", "data", "persian_blog", "blogs.csv")
+        return [row[0] for row in pd.read_csv(csv_path).values.tolist()]
+
+    def save_dataset(self, data):
+        """Dump `data` as JSON to data/generated/v<N>_dataset.json.
+
+        N is one more than the highest version already present, or 1 if there is none.
+        """
+        path = os.path.join(self.file_path, "..", "data", "generated")
+        # exist_ok avoids the check-then-create race of exists() followed by makedirs().
+        os.makedirs(path, exist_ok=True)
+
+        pattern = re.compile(r"^v(\d+)_dataset\.json$")
+        versions = []
+        for name in os.listdir(path):
+            found = pattern.match(name)
+            if found and os.path.isfile(os.path.join(path, name)):
+                versions.append(int(found.group(1)))
+        number = max(versions, default=0) + 1
+
+        out_path = os.path.join(path, "v" + str(number) + "_dataset.json")
+        with open(out_path, "w", encoding="utf-8") as f:
+            json.dump(data, f, ensure_ascii=False, indent=2)
+
+    def run(self, num_data=10):
+        """Generate a query for each of the first `num_data` documents and save the result.
+
+        The slice keeps this from raising IndexError when the CSV holds fewer than
+        `num_data` rows; in that case every available row is processed.
+        """
+        dataset = []
+        for document in self.load_data()[:num_data]:
+            config = self.configuration.run(document)
+            generated_data = self.query_generator.run(document, config)
+            # Each record is the config plus the source document and generated query.
+            one_data = config.copy()
+            one_data["document"] = document
+            one_data["query"] = generated_data
+            dataset.append(one_data)
+
+        self.save_dataset(dataset)
+
+
+
+
+def main():
+    """Construct a Pipline and run it once."""
+    runner = Pipline()
+    runner.run()
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file