add pipeline
This commit is contained in:
parent
c9ac8b436e
commit
9a446bca16
16
.vscode/launch.json
vendored
Normal file
16
.vscode/launch.json
vendored
Normal file
@ -0,0 +1,16 @@
|
||||
{
|
||||
// Use IntelliSense to learn about possible attributes.
|
||||
// Hover to view descriptions of existing attributes.
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
|
||||
{
|
||||
"name": "Python Debugger: Current File",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "${file}",
|
||||
"console": "integratedTerminal"
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -162,7 +162,7 @@ Ensure to generate only the JSON output with content in English.
|
||||
# for key in data:
|
||||
# example[key] = data[key]
|
||||
|
||||
config["length"] = random.choice([20, 40, 80, 160])
|
||||
config["length"] = random.choice([10, 20, 40, 80])
|
||||
|
||||
|
||||
return config
|
||||
|
||||
93
src/pipline.py.py
Normal file
93
src/pipline.py.py
Normal file
@ -0,0 +1,93 @@
|
||||
import importlib
import importlib.util
import json
import os
import random
import re

import faiss
import numpy
import pandas as pd
import requests
import tqdm
from dotenv import load_dotenv
from openai import OpenAI
|
||||
|
||||
|
||||
|
||||
def import_lib(path, file_name, package_name):
    """Load an attribute from a Python source file by path.

    Executes ``path/file_name.py`` as a standalone module (bypassing the
    normal package import machinery) and returns the named attribute.

    Args:
        path: Directory containing the module file.
        file_name: Module file name without the ``.py`` suffix.
        package_name: Name of the attribute (class/function) to extract.

    Returns:
        The requested attribute from the freshly executed module.

    Raises:
        ImportError: If the file cannot be loaded as a module.
        AttributeError: If ``package_name`` is not defined in the module.
    """
    file_path = os.path.join(path, file_name + ".py")
    # NOTE: requires `import importlib.util` at file level — a bare
    # `import importlib` does not guarantee the `util` submodule is bound.
    spec = importlib.util.spec_from_file_location(file_name, file_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot load module from {file_path}")
    imported_file = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(imported_file)
    return getattr(imported_file, package_name)
|
||||
|
||||
|
||||
# Dynamically load sibling modules by file path — avoids package-relative
# import issues when this file is executed directly as a script.
# NOTE(review): presumably `configuration.py` / `query_generator.py` live
# next to this file — confirm against the repository layout.
Configuration = import_lib(os.path.dirname(__file__) , "configuration", "Configuration")
QueryGenerator = import_lib(os.path.dirname(__file__) , "query_generator", "QueryGenerator")
|
||||
|
||||
|
||||
class Pipline:
    """End-to-end pipeline: load documents, generate query configs/queries,
    and persist the result as a versioned JSON dataset.

    Attributes:
        file_path: Directory of this source file; data paths resolve from it.
        configuration: Per-document config generator (project class).
        query_generator: Per-document query generator (project class).
    """

    def __init__(self):
        self.file_path = os.path.dirname(__file__)
        self.configuration = Configuration()
        # init_persona() is required before configuration.run() — see Configuration.
        self.configuration.init_persona()
        self.query_generator = QueryGenerator()

    def load_data(self):
        """Read the blog corpus CSV and return its first column as a list.

        Returns:
            list: Values of the first CSV column (the document texts).
        """
        csv_path = os.path.join(self.file_path, "..", "data", "persian_blog", "blogs.csv")
        df = pd.read_csv(csv_path)
        # Only the first column is used; remaining columns (if any) are ignored.
        return df.iloc[:, 0].tolist()

    def save_dataset(self, data):
        """Write ``data`` as JSON under data/generated with an auto-incremented name.

        Scans existing ``v<N>_dataset.json`` files and writes ``v<max(N)+1>``
        (or ``v1`` when none exist).

        Args:
            data: JSON-serializable object (list of per-document dicts).
        """
        path = os.path.join(self.file_path, "..", "data", "generated")
        os.makedirs(path, exist_ok=True)

        pattern = re.compile(r"^v(\d+)_dataset\.json$")
        numbers = []
        for name in os.listdir(path):
            if not os.path.isfile(os.path.join(path, name)):
                continue
            match = pattern.match(name)
            if match:
                numbers.append(int(match.group(1)))

        number = max(numbers) + 1 if numbers else 1

        out_path = os.path.join(path, "v" + str(number) + "_dataset.json")
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

    def run(self):
        """Generate a config and query for the first documents and save them."""
        data = self.load_data()

        num_data = 10  # number of documents to process per run

        dataset = []
        # Slice guards against a corpus smaller than num_data
        # (the original indexed loop would raise IndexError).
        for document in data[:num_data]:
            config = self.configuration.run(document)
            generated_data = self.query_generator.run(document, config)
            one_data = config.copy()
            one_data["document"] = document
            one_data["query"] = generated_data
            dataset.append(one_data)

        self.save_dataset(dataset)
|
||||
|
||||
|
||||
|
||||
|
||||
def main():
    """Entry point: construct the pipeline and execute one full run."""
    Pipline().run()
|
||||
|
||||
|
||||
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
||||
Loading…
x
Reference in New Issue
Block a user