add pipeline

This commit is contained in:
hediehloo 2025-11-30 09:36:29 +00:00
parent c9ac8b436e
commit 9a446bca16
3 changed files with 110 additions and 1 deletions

16
.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,16 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal"
}
]
}

View File

@ -162,7 +162,7 @@ Ensure to generate only the JSON output with content in English.
# for key in data:
# example[key] = data[key]
config["length"] = random.choice([20, 40, 80, 160])
config["length"] = random.choice([10, 20, 40, 80])
return config

93
src/pipline.py.py Normal file
View File

@ -0,0 +1,93 @@
import json
import os
import requests
import tqdm
import faiss
import numpy
import importlib
from openai import OpenAI
from dotenv import load_dotenv
import re
import random
import pandas as pd
def import_lib(path, file_name, package_name):
    """Load attribute *package_name* from the module file ``path/file_name.py``.

    Imports the file directly by location (no package install needed) and
    returns the requested attribute (a class, function, constant, ...).

    Raises FileNotFoundError if the file does not exist and AttributeError
    if the loaded module has no attribute *package_name*.
    """
    # A bare ``import importlib`` does not guarantee the ``util`` submodule is
    # loaded; import it explicitly so this function is self-contained.
    import importlib.util

    file_path = os.path.join(path, file_name + ".py")
    spec = importlib.util.spec_from_file_location(file_name, file_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return getattr(module, package_name)
# Load the sibling modules by file path so the script runs without the
# package being installed; each call returns the named class from that file.
Configuration = import_lib(os.path.dirname(__file__) , "configuration", "Configuration")
QueryGenerator = import_lib(os.path.dirname(__file__) , "query_generator", "QueryGenerator")
class Pipline:
    """End-to-end dataset pipeline: load documents, generate a configuration
    and a query for each, and save the result as a versioned JSON file."""

    def __init__(self):
        self.file_path = os.path.dirname(__file__)
        self.configuration = Configuration()
        self.configuration.init_persona()
        self.query_generator = QueryGenerator()

    def load_data(self):
        """Return the first CSV column (the blog texts) as a list of strings."""
        csv_path = os.path.join(
            self.file_path, "..", "data", "persian_blog", "blogs.csv"
        )
        df = pd.read_csv(csv_path)
        # Only the first column is used; keep the original row order.
        return df.iloc[:, 0].tolist()

    def save_dataset(self, data):
        """Write *data* to ``vN_dataset.json`` where N is one past the
        highest existing version number in ``../data/generated``."""
        path = os.path.join(self.file_path, "..", "data", "generated")
        os.makedirs(path, exist_ok=True)
        pattern = re.compile(r"^v(\d+)_dataset\.json$")
        versions = [
            int(m.group(1))
            for f in os.listdir(path)
            if os.path.isfile(os.path.join(path, f))
            for m in [pattern.match(f)]
            if m
        ]
        number = max(versions, default=0) + 1
        out_path = os.path.join(path, "v" + str(number) + "_dataset.json")
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

    def run(self):
        """Generate config/query pairs for up to 10 documents and save them."""
        data = self.load_data()
        num_data = 10  # cap on processed documents
        dataset = []
        # Slice instead of indexing range(num_data): the original raised
        # IndexError when the CSV held fewer than num_data rows.
        for document in data[:num_data]:
            config = self.configuration.run(document)
            generated_data = self.query_generator.run(document, config)
            one_data = config.copy()
            one_data["document"] = document
            one_data["query"] = generated_data
            dataset.append(one_data)
        self.save_dataset(dataset)
def main():
    """Entry point: build the pipeline and execute one full run."""
    Pipline().run()


if __name__ == "__main__":
    main()