add pipeline

This commit is contained in:
hediehloo 2025-11-30 09:36:29 +00:00
parent c9ac8b436e
commit 9a446bca16
3 changed files with 110 additions and 1 deletions

16
.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,16 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal"
}
]
}

View File

@ -162,7 +162,7 @@ Ensure to generate only the JSON output with content in English.
# for key in data:
# example[key] = data[key]
config["length"] = random.choice([20, 40, 80, 160])
config["length"] = random.choice([10, 20, 40, 80])
return config

93
src/pipline.py.py Normal file
View File

@ -0,0 +1,93 @@
import json
import os
import requests
import tqdm
import faiss
import numpy
import importlib
from openai import OpenAI
from dotenv import load_dotenv
import re
import random
import pandas as pd
def import_lib(path, file_name, package_name):
    """Load attribute *package_name* from the module file ``path/file_name.py``.

    Imports the file directly by location (no package install needed) and
    returns the requested attribute (a class, function, constant, ...).

    Raises FileNotFoundError if the file does not exist and AttributeError
    if the loaded module has no attribute *package_name*.
    """
    # A bare ``import importlib`` does not guarantee the ``util`` submodule is
    # loaded; import it explicitly so this function is self-contained.
    import importlib.util

    file_path = os.path.join(path, file_name + ".py")
    spec = importlib.util.spec_from_file_location(file_name, file_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return getattr(module, package_name)
# Load the sibling modules by file path so the script runs without the
# package being installed; each call returns the named class from that file.
Configuration = import_lib(os.path.dirname(__file__) , "configuration", "Configuration")
QueryGenerator = import_lib(os.path.dirname(__file__) , "query_generator", "QueryGenerator")
class Pipline:
    """End-to-end dataset pipeline: load documents, generate a configuration
    and a query for each, and save the result as a versioned JSON file."""

    def __init__(self):
        self.file_path = os.path.dirname(__file__)
        self.configuration = Configuration()
        self.configuration.init_persona()
        self.query_generator = QueryGenerator()

    def load_data(self):
        """Return the first CSV column (the blog texts) as a list of strings."""
        csv_path = os.path.join(
            self.file_path, "..", "data", "persian_blog", "blogs.csv"
        )
        df = pd.read_csv(csv_path)
        # Only the first column is used; keep the original row order.
        return df.iloc[:, 0].tolist()

    def save_dataset(self, data):
        """Write *data* to ``vN_dataset.json`` where N is one past the
        highest existing version number in ``../data/generated``."""
        path = os.path.join(self.file_path, "..", "data", "generated")
        os.makedirs(path, exist_ok=True)
        pattern = re.compile(r"^v(\d+)_dataset\.json$")
        versions = [
            int(m.group(1))
            for f in os.listdir(path)
            if os.path.isfile(os.path.join(path, f))
            for m in [pattern.match(f)]
            if m
        ]
        number = max(versions, default=0) + 1
        out_path = os.path.join(path, "v" + str(number) + "_dataset.json")
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

    def run(self):
        """Generate config/query pairs for up to 10 documents and save them."""
        data = self.load_data()
        num_data = 10  # cap on processed documents
        dataset = []
        # Slice instead of indexing range(num_data): the original raised
        # IndexError when the CSV held fewer than num_data rows.
        for document in data[:num_data]:
            config = self.configuration.run(document)
            generated_data = self.query_generator.run(document, config)
            one_data = config.copy()
            one_data["document"] = document
            one_data["query"] = generated_data
            dataset.append(one_data)
        self.save_dataset(dataset)
def main():
    """Entry point: build the pipeline and execute one full run."""
    Pipline().run()


if __name__ == "__main__":
    main()