add sub cluster
This commit is contained in:
parent
78656d9f4d
commit
104aef2eef
14
README.md
14
README.md
@ -32,9 +32,17 @@ A pipeline for clustering tweets
|
||||
By first clustering similar names (step 5), the input to GPT became more organized,
|
||||
which made step 6 much more effective.
|
||||
|
||||
## How to use
|
||||
## How to extract main cluster
|
||||
|
||||
You should give a excel file which has a column named "tweets" to this below command
|
||||
You should give an Excel file which has a column named "tweet" to the command below
|
||||
Overall, it will take about 15 hours for 40,000 tweets
|
||||
|
||||
python3 clustering_pipeline.py --input_file tweets_file.xlsx --output_file tweets_file_output.xlsx
|
||||
python3 clustering_pipeline.py --input_file tweets_file.xlsx --output_file tweets_file_cluster.xlsx
|
||||
|
||||
|
||||
## How to extract sub cluster
|
||||
|
||||
You should first run the above command, which will give you an Excel file that has the columns "topic" and "cluster_llm"
|
||||
|
||||
|
||||
python3 sub_clustering_pipeline.py --input_file tweets_file_cluster.xlsx --output_file tweets_file_sub_cluster.xlsx
|
||||
|
||||
@ -196,4 +196,4 @@ if __name__ == "__main__":
|
||||
|
||||
# apply clustering
|
||||
post_cluster = PostClusterLLM()
|
||||
post_cluster.start_process(topics_file, args.output_file)
|
||||
post_cluster.start_process(topic_file, titles_file, args.output_file)
|
||||
@ -153,11 +153,11 @@ class PostClusterLLM:
|
||||
|
||||
return df_copy
|
||||
|
||||
def start_process(self, input_path, output_path):
|
||||
def start_process(self, input_path, titles_path, output_path):
|
||||
df = pd.read_excel(input_path)
|
||||
df_copy = df.copy()
|
||||
|
||||
with open("titles_o3.txt", "r") as f:
|
||||
with open(titles_path, "r") as f:
|
||||
titles = f.readlines()
|
||||
|
||||
titles = [title.strip() for title in titles]
|
||||
@ -200,4 +200,4 @@ class PostClusterLLM:
|
||||
|
||||
if __name__ == "__main__":
|
||||
llm = PostClusterLLM()
|
||||
llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic_recreation.xlsx", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3.xlsx")
|
||||
llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic_recreation.xlsx", "titles_o3.txt", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3.xlsx")
|
||||
214
post_sub_cluster.py
Normal file
214
post_sub_cluster.py
Normal file
@ -0,0 +1,214 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import time
|
||||
import re
|
||||
import pandas as pd
|
||||
import json
|
||||
from tqdm import tqdm
|
||||
|
||||
class PostSubClusterLLM:
|
||||
def __init__(self):
|
||||
|
||||
self.instruction = f"""
|
||||
You will be given a title and a list of all cluster names.
|
||||
Your task is to find the best fit cluster name for the title.
|
||||
Go through the list of all cluster names and find the best fit cluster name for the title.
|
||||
If you found a good fit, return the cluster name.
|
||||
If you didn't find a good fit, return "outlier" is "yes".
|
||||
|
||||
#IMPORTANT:
|
||||
- if you found a good fit use its id : {{"cluster" : "id_i"}}
|
||||
- if the title is not related to any of the cluster names, return "outlier" is "yes" : {{"outlier" : "yes"}}
|
||||
|
||||
Example-1:
|
||||
- Input:
|
||||
- title: "کتاب و درس"
|
||||
- all_cluster_names: {{
|
||||
"1" : "کتابخوانی",
|
||||
"2" : "فوتبال جام جهانی",
|
||||
"3" : "ساختمان سازی شهری" }}
|
||||
- Output:
|
||||
- {{"cluster" : "1"}}
|
||||
|
||||
Example-2:
|
||||
- Input:
|
||||
- title: "لپتاب و کامپیوتر"
|
||||
- all_cluster_names: {{
|
||||
"1" : "کتابخوانی",
|
||||
"2" : "فوتبال جام جهانی",
|
||||
"3" : "ساختمان سازی شهری" }}
|
||||
- Output:
|
||||
- {{"outlier" : "yes"}}
|
||||
|
||||
Example-3:
|
||||
- Input:
|
||||
- title: "ساختمان"
|
||||
- all_cluster_names: {{
|
||||
"1" : "کتابخوانی",
|
||||
"2" : "فوتبال جام جهانی",
|
||||
"3" : "ساختمان سازی شهری" }}
|
||||
- Output:
|
||||
- {{"cluster" : "3"}}
|
||||
|
||||
write a small reason and give the final answer.
|
||||
"""
|
||||
|
||||
|
||||
async def run_llm(self, session, topic, cluster_name, cluster_sub_cluster_list):
|
||||
"""
|
||||
Run the LLM as reranker.
|
||||
Args:
|
||||
session: The session to use for the request.
|
||||
question: The question to rerank the documents.
|
||||
chunk: The chunk to rerank.
|
||||
Returns:
|
||||
The score of the chunk.
|
||||
"""
|
||||
if cluster_name == "متفرقه":
|
||||
return None
|
||||
|
||||
headers = {"Content-Type": "application/json",}
|
||||
|
||||
for cluster_sub_cluster in cluster_sub_cluster_list:
|
||||
if cluster_sub_cluster["cluster_name"] == cluster_name:
|
||||
sub_cluster_names = cluster_sub_cluster["sub_cluster_names"]
|
||||
break
|
||||
|
||||
sub_cluster_names_str = "{\n"
|
||||
for count, value in enumerate(sub_cluster_names):
|
||||
sub_cluster_names_str += f"{count} : {value},\n"
|
||||
|
||||
sub_cluster_names_str += "}"
|
||||
|
||||
input_message = f"""{{"all_cluster_names": "{sub_cluster_names_str}", "title": "{topic}"}}"""
|
||||
messages = [{"role": "system", "content": self.instruction}, {"role": "user", "content": input_message}]
|
||||
|
||||
payload = {
|
||||
"model": "google/gemma-3-27b-it",
|
||||
"messages": messages,
|
||||
"max_tokens": 500
|
||||
}
|
||||
try:
|
||||
async with session.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload) as resp:
|
||||
resp.raise_for_status()
|
||||
response = await resp.json()
|
||||
|
||||
out = response['choices'][0]['message']['content']
|
||||
print("--------------------------------")
|
||||
print(f"title: {topic}")
|
||||
print(f"cluster_name: {cluster_name}")
|
||||
print(out)
|
||||
pattern = r'(\{"cluster".*?\})'
|
||||
|
||||
matches = re.findall(pattern, out)
|
||||
|
||||
for m in matches:
|
||||
out_json = json.loads(m)
|
||||
print(f"out_json: {out_json}")
|
||||
if out_json.get("cluster") is not None:
|
||||
print(sub_cluster_names[int(out_json.get("cluster"))])
|
||||
return out_json
|
||||
|
||||
pattern = r'(\{"outlier".*?\})'
|
||||
|
||||
matches = re.findall(pattern, out)
|
||||
|
||||
for m in matches:
|
||||
out_json = json.loads(m)
|
||||
print(f"out_json: {out_json}")
|
||||
print("outlier")
|
||||
return out_json
|
||||
except Exception as e:
|
||||
print(f"Error in llm as reranker: {e}")
|
||||
return 0
|
||||
|
||||
|
||||
async def run_llm_async(self, topics, cluster_names, cluster_sub_cluster_dict):
|
||||
"""
|
||||
Send all chunk requests concurrently.
|
||||
Args:
|
||||
topics: The topics to rerank.
|
||||
cluster_names: The cluster names to rerank.
|
||||
cluster_sub_cluster_dict: The cluster sub cluster dictionary.
|
||||
Returns:
|
||||
The scores of the chunks.
|
||||
"""
|
||||
async with aiohttp.ClientSession() as session:
|
||||
tasks = [self.run_llm(session, topic, cluster_name, cluster_sub_cluster_dict) for topic, cluster_name in zip(topics, cluster_names)]
|
||||
scores_embed = await asyncio.gather(*tasks)
|
||||
return scores_embed
|
||||
|
||||
def sanitize_for_excel(self, df):
|
||||
def _sanitize_for_excel(text):
|
||||
"""Remove zero-width and bidi control characters that can confuse Excel rendering."""
|
||||
if text is None:
|
||||
return ""
|
||||
s = str(text)
|
||||
# Characters to remove: ZWNJ, ZWJ, RLM, LRM, RLE, LRE, PDF, BOM, Tatweel
|
||||
remove_chars = [
|
||||
"\u200c", # ZWNJ
|
||||
"\u200d", # ZWJ
|
||||
"\u200e", # LRM
|
||||
"\u200f", # RLM
|
||||
"\u202a", # LRE
|
||||
"\u202b", # RLE
|
||||
"\u202c", # PDF
|
||||
"\u202d", # LRO
|
||||
"\u202e", # RLO
|
||||
"\ufeff", # BOM
|
||||
"\u0640", # Tatweel
|
||||
]
|
||||
for ch in remove_chars:
|
||||
s = s.replace(ch, "")
|
||||
# Normalize whitespace
|
||||
s = re.sub(r"\s+", " ", s).strip()
|
||||
return s
|
||||
|
||||
df_copy = df.copy()
|
||||
for m in df.columns:
|
||||
for i in range(len(df_copy[m])):
|
||||
df_copy.loc[i, m] = _sanitize_for_excel(df_copy.loc[i, m])
|
||||
|
||||
return df_copy
|
||||
|
||||
def start_process(self, input_path, titles_path, output_path):
|
||||
df = pd.read_excel(input_path)
|
||||
df_copy = df.copy()
|
||||
|
||||
with open(titles_path, "r") as f:
|
||||
cluster_sub_cluster_list = json.load(f)
|
||||
|
||||
batch_size = 100
|
||||
for i in tqdm(range(0, len(df["topic"]), batch_size)):
|
||||
start_time = time.time()
|
||||
result_list = asyncio.run(self.run_llm_async(df["topic"][i:i+batch_size], df["cluster_llm"][i:i+batch_size], cluster_sub_cluster_list))
|
||||
end_time = time.time()
|
||||
print(f"Time taken for llm as reranker: {end_time - start_time} seconds")
|
||||
time.sleep(5)
|
||||
|
||||
for j, result in enumerate(result_list):
|
||||
try:
|
||||
if result is None:
|
||||
df_copy.at[i+j, "sub_cluster"] = "متفرقه"
|
||||
elif result.get("outlier") == "yes":
|
||||
df_copy.at[i+j, "sub_cluster"] = "موارد دیگر"
|
||||
elif result.get("cluster") is not None:
|
||||
for cluster_sub_cluster in cluster_sub_cluster_list:
|
||||
if cluster_sub_cluster["cluster_name"] == df["cluster_llm"][i+j]:
|
||||
sub_cluster_names = cluster_sub_cluster["sub_cluster_names"]
|
||||
break
|
||||
df_copy.at[i+j, "sub_cluster"] = sub_cluster_names[int(result["cluster"])]
|
||||
else:
|
||||
df_copy.at[i+j, "sub_cluster"] = "موارد دیگر"
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error in result_list: {e}")
|
||||
df_copy.at[i+j, "sub_cluster"] = "موارد دیگر"
|
||||
|
||||
print(df_copy.at[i+j, "sub_cluster"])
|
||||
df_copy = self.sanitize_for_excel(df_copy)
|
||||
df_copy.to_excel(output_path)
|
||||
|
||||
if __name__ == "__main__":
|
||||
llm = PostSubClusterLLM()
|
||||
llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3.xlsx", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3_subcategory.json", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3_subcategory.xlsx")
|
||||
@ -12,6 +12,8 @@ import random
|
||||
import re
|
||||
import json
|
||||
|
||||
from post_sub_cluster import PostSubClusterLLM
|
||||
|
||||
|
||||
START_K = 2
|
||||
END_K = 60
|
||||
@ -282,4 +284,9 @@ if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
|
||||
# extracting topics
|
||||
main(args.input_file, args.output_file)
|
||||
sub_cluster_file = args.output_file.replace(".xlsx", "_sub_cluster.json")
|
||||
main(args.input_file, sub_cluster_file)
|
||||
|
||||
# apply clustering
|
||||
post_sub_cluster = PostSubClusterLLM()
|
||||
post_sub_cluster.start_process(args.input_file, sub_cluster_file, args.output_file)
|
||||
Loading…
x
Reference in New Issue
Block a user