add pipeline
This commit is contained in:
parent
c9ac8b436e
commit
9a446bca16
16
.vscode/launch.json
vendored
Normal file
16
.vscode/launch.json
vendored
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
{
|
||||||
|
// Use IntelliSense to learn about possible attributes.
|
||||||
|
// Hover to view descriptions of existing attributes.
|
||||||
|
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||||
|
"version": "0.2.0",
|
||||||
|
"configurations": [
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "Python Debugger: Current File",
|
||||||
|
"type": "debugpy",
|
||||||
|
"request": "launch",
|
||||||
|
"program": "${file}",
|
||||||
|
"console": "integratedTerminal"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@ -162,7 +162,7 @@ Ensure to generate only the JSON output with content in English.
|
|||||||
# for key in data:
|
# for key in data:
|
||||||
# example[key] = data[key]
|
# example[key] = data[key]
|
||||||
|
|
||||||
config["length"] = random.choice([20, 40, 80, 160])
|
config["length"] = random.choice([10, 20, 40, 80])
|
||||||
|
|
||||||
|
|
||||||
return config
|
return config
|
||||||
|
|||||||
93
src/pipline.py.py
Normal file
93
src/pipline.py.py
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
import requests
|
||||||
|
import tqdm
|
||||||
|
import faiss
|
||||||
|
import numpy
|
||||||
|
import importlib
|
||||||
|
from openai import OpenAI
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
import re
|
||||||
|
import random
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def import_lib(path, file_name, package_name):
    """Load ``<path>/<file_name>.py`` as a module and return one attribute of it.

    Args:
        path: Directory that contains the source file. An empty string is
            treated as the current working directory.
        file_name: File name without the ``.py`` extension; it is also used
            as the name of the loaded module.
        package_name: Name of the attribute (class, function, ...) to fetch
            from the loaded module.

    Returns:
        The attribute called ``package_name`` from the freshly executed module.

    Raises:
        ImportError: If no import spec or loader can be created for the file.
        AttributeError: If the module has no attribute ``package_name``.
    """
    # A bare ``import importlib`` does not guarantee that the ``util``
    # submodule is loaded, so import it explicitly here.
    import importlib.util

    # os.path.join keeps an empty ``path`` relative instead of producing
    # "/<file_name>.py" at the filesystem root.
    file_path = os.path.join(path, file_name + ".py")

    spec = importlib.util.spec_from_file_location(file_name, file_path)
    if spec is None or spec.loader is None:
        raise ImportError(
            "cannot load module {!r} from {!r}".format(file_name, file_path)
        )

    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return getattr(module, package_name)
|
||||||
|
|
||||||
|
|
||||||
|
# Bind the two collaborators by loading their source files from the directory
# that contains this file (see import_lib for the loading mechanics).
Configuration = import_lib(
    path=os.path.dirname(__file__),
    file_name="configuration",
    package_name="Configuration",
)
QueryGenerator = import_lib(
    path=os.path.dirname(__file__),
    file_name="query_generator",
    package_name="QueryGenerator",
)
|
||||||
|
|
||||||
|
|
||||||
|
class Pipline:
    """Build a query dataset from the documents in ``data/persian_blog/blogs.csv``.

    For every document a configuration is obtained from ``Configuration.run``
    and a query from ``QueryGenerator.run``; the combined records are written
    to a versioned JSON file under ``data/generated``. Both data directories
    are resolved relative to the parent of the directory holding this file.
    """

    def __init__(self):
        """Create the configuration and query-generator collaborators."""
        # Directory of this source file; data paths are resolved against it.
        self.file_path = os.path.dirname(__file__)
        self.configuration = Configuration()
        self.configuration.init_persona()
        self.query_generator = QueryGenerator()

    def load_data(self):
        """Return the first column of ``blogs.csv`` as a list, one item per row."""
        csv_path = os.path.join(
            self.file_path, "..", "data", "persian_blog", "blogs.csv"
        )
        df = pd.read_csv(csv_path)
        return [row[0] for row in df.values.tolist()]

    def save_dataset(self, data):
        """Write ``data`` as JSON to the next free ``v<N>_dataset.json`` file.

        The output directory is created when missing. ``N`` is one more than
        the highest version number already present there, or 1 when no
        versioned dataset file exists yet.

        Args:
            data: JSON-serialisable object to store.
        """
        out_dir = os.path.join(self.file_path, "..", "data", "generated")
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(out_dir, exist_ok=True)

        pattern = re.compile(r"^v(\d+)_dataset\.json$")
        versions = []
        for name in os.listdir(out_dir):
            match = pattern.match(name)
            # Only regular files count; a directory with a matching name is ignored.
            if match and os.path.isfile(os.path.join(out_dir, name)):
                versions.append(int(match.group(1)))

        number = max(versions, default=0) + 1

        out_path = os.path.join(out_dir, "v" + str(number) + "_dataset.json")
        with open(out_path, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps non-ASCII text readable in the output file.
            json.dump(data, f, ensure_ascii=False, indent=2)

    def run(self, num_data=10):
        """Generate records for the first ``num_data`` documents and save them.

        Args:
            num_data: Maximum number of documents to process (non-negative).
                When fewer documents are available, all of them are
                processed. ``None`` processes every document.
        """
        documents = self.load_data()

        dataset = []
        # Slicing (rather than indexing 0..num_data-1) tolerates a short corpus.
        for document in documents[:num_data]:
            config = self.configuration.run(document)
            generated_data = self.query_generator.run(document, config)

            # Copy so the record does not alias the object returned by run().
            one_data = config.copy()
            one_data["document"] = document
            one_data["query"] = generated_data
            dataset.append(one_data)

        self.save_dataset(dataset)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Script entry point: construct a Pipline and execute it once."""
    Pipline().run()


# Only run the pipeline when this file is executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||||
Loading…
x
Reference in New Issue
Block a user