add sub cluster

This commit is contained in:
SFirouzi 2025-10-22 14:31:39 +03:30
parent 78656d9f4d
commit 104aef2eef
5 changed files with 237 additions and 8 deletions

View File

@ -32,9 +32,17 @@ A pipeline for clustering tweets
By first clustering similar names (step 5), the input to GPT became more organized, By first clustering similar names (step 5), the input to GPT became more organized,
which made step 6 much more effective. which made step 6 much more effective.
## How to use ## How to extract main cluster
You should give a excel file which has a column named "tweets" to this below command You should give a excel file which has a column named "tweet" to this below command
Overally it will take 15h time for 40,000 tweets Overally it will take 15h time for 40,000 tweets
python3 clustering_pipeline.py --input_file tweets_file.xlsx --output_file tweets_file_output.xlsx python3 clustering_pipeline.py --input_file tweets_file.xlsx --output_file tweets_file_cluster.xlsx
## How to extract sub cluster
You should first run the above code, which will give you an excel file which has a column of "topic" and "cluster_llm"
python3 sub_clustering_pipeline.py --input_file tweets_file_cluster.xlsx --output_file tweets_file_sub_cluster.xlsx

View File

@ -196,4 +196,4 @@ if __name__ == "__main__":
# apply clustering # apply clustering
post_cluster = PostClusterLLM() post_cluster = PostClusterLLM()
post_cluster.start_process(topics_file, args.output_file) post_cluster.start_process(topic_file, titles_file, args.output_file)

View File

@ -153,11 +153,11 @@ class PostClusterLLM:
return df_copy return df_copy
def start_process(self, input_path, output_path): def start_process(self, input_path, titles_path, output_path):
df = pd.read_excel(input_path) df = pd.read_excel(input_path)
df_copy = df.copy() df_copy = df.copy()
with open("titles_o3.txt", "r") as f: with open(titles_path, "r") as f:
titles = f.readlines() titles = f.readlines()
titles = [title.strip() for title in titles] titles = [title.strip() for title in titles]
@ -200,4 +200,4 @@ class PostClusterLLM:
if __name__ == "__main__": if __name__ == "__main__":
llm = PostClusterLLM() llm = PostClusterLLM()
llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic_recreation.xlsx", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3.xlsx") llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic_recreation.xlsx", "titles_o3.txt", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3.xlsx")

214
post_sub_cluster.py Normal file
View File

@ -0,0 +1,214 @@
import asyncio
import aiohttp
import time
import re
import pandas as pd
import json
from tqdm import tqdm
class PostSubClusterLLM:
    """Assign each row's topic to a sub-cluster of its main cluster via an LLM.

    Input: an Excel file containing ``topic`` and ``cluster_llm`` columns, plus a
    JSON file listing, per cluster, the candidate sub-cluster names
    (``[{"cluster_name": ..., "sub_cluster_names": [...]}, ...]``).
    Output: the same sheet with an added ``sub_cluster`` column.
    """

    def __init__(self):
        # System prompt sent with every request.  The doubled braces are literal
        # braces (the f-string performs no interpolation).
        self.instruction = f"""
You will be given a title and a list of all cluster names.
Your task is to find the best fit cluster name for the title.
Go through the list of all cluster names and find the best fit cluster name for the title.
If you found a good fit, return the cluster name.
If you didn't find a good fit, return "outlier" is "yes".
#IMPORTANT:
- if you found a good fit use its id : {{"cluster" : "id_i"}}
- if the title is not related to any of the cluster names, return "outlier" is "yes" : {{"outlier" : "yes"}}
Example-1:
- Input:
- title: "کتاب و درس"
- all_cluster_names: {{
"1" : "کتابخوانی",
"2" : "فوتبال جام جهانی",
"3" : "ساختمان سازی شهری" }}
- Output:
- {{"cluster" : "1"}}
Example-2:
- Input:
- title: "لپتاب و کامپیوتر"
- all_cluster_names: {{
"1" : "کتابخوانی",
"2" : "فوتبال جام جهانی",
"3" : "ساختمان سازی شهری" }}
- Output:
- {{"outlier" : "yes"}}
Example-3:
- Input:
- title: "ساختمان"
- all_cluster_names: {{
"1" : "کتابخوانی",
"2" : "فوتبال جام جهانی",
"3" : "ساختمان سازی شهری" }}
- Output:
- {{"cluster" : "3"}}
write a small reason and give the final answer.
"""

    async def run_llm(self, session, topic, cluster_name, cluster_sub_cluster_list):
        """Ask the LLM which sub-cluster of ``cluster_name`` best fits ``topic``.

        Args:
            session: Shared ``aiohttp.ClientSession`` used for the POST request.
            topic: The title/topic text to classify.
            cluster_name: Main cluster previously assigned to this topic.
            cluster_sub_cluster_list: List of dicts, each with ``cluster_name``
                and ``sub_cluster_names`` keys.

        Returns:
            The parsed ``{"cluster": ...}`` or ``{"outlier": "yes"}`` dict on
            success; ``None`` for the catch-all cluster, for a cluster with no
            sub-cluster entry, or when no recognizable JSON is found in the
            model output; ``0`` on a request/parsing exception (the caller's
            per-row try/except maps that to the fallback label).
        """
        # "متفرقه" (miscellaneous) has no sub-clusters — skip the LLM call.
        if cluster_name == "متفرقه":
            return None
        headers = {"Content-Type": "application/json",}
        # BUGFIX: originally a missing cluster entry left `sub_cluster_names`
        # unbound, and the resulting NameError (raised before the try block)
        # aborted the entire asyncio.gather batch.  Fall back to None instead.
        sub_cluster_names = next(
            (
                entry["sub_cluster_names"]
                for entry in cluster_sub_cluster_list
                if entry["cluster_name"] == cluster_name
            ),
            None,
        )
        if sub_cluster_names is None:
            return None
        # Render the candidate list as "{\n0 : name,\n1 : name,\n...}" so the
        # prompt can refer to sub-clusters by positional id.
        sub_cluster_names_str = "{\n"
        for count, value in enumerate(sub_cluster_names):
            sub_cluster_names_str += f"{count} : {value},\n"
        sub_cluster_names_str += "}"
        # NOTE(review): `topic` is interpolated unescaped — a quote inside the
        # title produces malformed pseudo-JSON in the prompt.  The model
        # tolerates it in practice; consider json.dumps for robustness.
        input_message = f"""{{"all_cluster_names": "{sub_cluster_names_str}", "title": "{topic}"}}"""
        messages = [{"role": "system", "content": self.instruction}, {"role": "user", "content": input_message}]
        payload = {
            "model": "google/gemma-3-27b-it",
            "messages": messages,
            "max_tokens": 500
        }
        try:
            async with session.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload) as resp:
                resp.raise_for_status()
                response = await resp.json()
                out = response['choices'][0]['message']['content']
                print("--------------------------------")
                print(f"title: {topic}")
                print(f"cluster_name: {cluster_name}")
                print(out)
                # The model is asked to reason first, so scan the free text for
                # the first JSON-looking answer: a cluster pick wins over an
                # outlier verdict.
                pattern = r'(\{"cluster".*?\})'
                matches = re.findall(pattern, out)
                for m in matches:
                    out_json = json.loads(m)
                    print(f"out_json: {out_json}")
                    if out_json.get("cluster") is not None:
                        print(sub_cluster_names[int(out_json.get("cluster"))])
                        return out_json
                pattern = r'(\{"outlier".*?\})'
                matches = re.findall(pattern, out)
                for m in matches:
                    out_json = json.loads(m)
                    print(f"out_json: {out_json}")
                    print("outlier")
                    return out_json
        except Exception as e:
            print(f"Error in llm as reranker: {e}")
            return 0

    async def run_llm_async(self, topics, cluster_names, cluster_sub_cluster_dict):
        """Classify a batch of (topic, cluster) pairs concurrently.

        Args:
            topics: Iterable of topic strings.
            cluster_names: Iterable of main-cluster names, aligned with topics.
            cluster_sub_cluster_dict: The cluster → sub-cluster mapping list
                passed through to :meth:`run_llm`.

        Returns:
            List of per-pair results, in input order (see :meth:`run_llm`).
        """
        async with aiohttp.ClientSession() as session:
            tasks = [self.run_llm(session, topic, cluster_name, cluster_sub_cluster_dict) for topic, cluster_name in zip(topics, cluster_names)]
            return await asyncio.gather(*tasks)

    def sanitize_for_excel(self, df):
        """Return a copy of ``df`` with every cell cleaned for Excel rendering.

        Removes zero-width/bidi control characters plus BOM and Tatweel, then
        collapses runs of whitespace to single spaces.  ``None`` cells become
        empty strings; every cell is coerced to ``str``.
        """
        def _sanitize_for_excel(text):
            """Remove zero-width and bidi control characters that can confuse Excel rendering."""
            if text is None:
                return ""
            s = str(text)
            # Characters to remove: ZWNJ, ZWJ, RLM, LRM, RLE, LRE, PDF, BOM, Tatweel
            remove_chars = [
                "\u200c",  # ZWNJ
                "\u200d",  # ZWJ
                "\u200e",  # LRM
                "\u200f",  # RLM
                "\u202a",  # LRE
                "\u202b",  # RLE
                "\u202c",  # PDF
                "\u202d",  # LRO
                "\u202e",  # RLO
                "\ufeff",  # BOM
                "\u0640",  # Tatweel
            ]
            for ch in remove_chars:
                s = s.replace(ch, "")
            # Normalize whitespace
            s = re.sub(r"\s+", " ", s).strip()
            return s
        df_copy = df.copy()
        for m in df.columns:
            for i in range(len(df_copy[m])):
                df_copy.loc[i, m] = _sanitize_for_excel(df_copy.loc[i, m])
        return df_copy

    def start_process(self, input_path, titles_path, output_path):
        """Run sub-clustering over a whole sheet and write the result.

        Args:
            input_path: Excel file with ``topic`` and ``cluster_llm`` columns.
            titles_path: JSON file mapping clusters to sub-cluster name lists.
            output_path: Destination Excel file (gains a ``sub_cluster`` column).
        """
        df = pd.read_excel(input_path)
        df_copy = df.copy()
        with open(titles_path, "r") as f:
            cluster_sub_cluster_list = json.load(f)
        batch_size = 100
        for i in tqdm(range(0, len(df["topic"]), batch_size)):
            start_time = time.time()
            result_list = asyncio.run(self.run_llm_async(df["topic"][i:i+batch_size], df["cluster_llm"][i:i+batch_size], cluster_sub_cluster_list))
            end_time = time.time()
            print(f"Time taken for llm as reranker: {end_time - start_time} seconds")
            # Brief pause between batches to avoid hammering the LLM endpoint.
            time.sleep(5)
            for j, result in enumerate(result_list):
                try:
                    if result is None:
                        # Catch-all cluster (or no sub-cluster entry): keep "متفرقه".
                        df_copy.at[i+j, "sub_cluster"] = "متفرقه"
                    elif result.get("outlier") == "yes":
                        df_copy.at[i+j, "sub_cluster"] = "موارد دیگر"
                    elif result.get("cluster") is not None:
                        sub_cluster_names = next(
                            (
                                entry["sub_cluster_names"]
                                for entry in cluster_sub_cluster_list
                                if entry["cluster_name"] == df["cluster_llm"][i+j]
                            ),
                            None,
                        )
                        df_copy.at[i+j, "sub_cluster"] = sub_cluster_names[int(result["cluster"])]
                    else:
                        df_copy.at[i+j, "sub_cluster"] = "موارد دیگر"
                except Exception as e:
                    # Error sentinel (0), bad index, or missing cluster entry
                    # all fall back to the generic "other" label.
                    print(f"Error in result_list: {e}")
                    df_copy.at[i+j, "sub_cluster"] = "موارد دیگر"
                print(df_copy.at[i+j, "sub_cluster"])
        df_copy = self.sanitize_for_excel(df_copy)
        df_copy.to_excel(output_path)
if __name__ == "__main__":
    # Standalone run against hard-coded paths: input Excel (topic +
    # cluster_llm columns), sub-cluster JSON mapping, output Excel.
    pipeline = PostSubClusterLLM()
    pipeline.start_process(
        "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3.xlsx",
        "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3_subcategory.json",
        "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3_subcategory.xlsx",
    )

View File

@ -12,6 +12,8 @@ import random
import re import re
import json import json
from post_sub_cluster import PostSubClusterLLM
START_K = 2 START_K = 2
END_K = 60 END_K = 60
@ -282,4 +284,9 @@ if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()
# extracting topics # extracting topics
main(args.input_file, args.output_file) sub_cluster_file = args.output_file.replace(".xlsx", "_sub_cluster.json")
main(args.input_file, sub_cluster_file)
# apply clustering
post_sub_cluster = PostSubClusterLLM()
post_sub_cluster.start_process(args.input_file, sub_cluster_file, args.output_file)