From 104aef2eef5bb13c7e56b7826b058cf960958140 Mon Sep 17 00:00:00 2001
From: SFirouzi
Date: Wed, 22 Oct 2025 14:31:39 +0330
Subject: [PATCH] add sub cluster

---
 README.md                  |  14 ++-
 clustering_pipeline.py     |   2 +-
 post_cluster.py            |   6 +-
 post_sub_cluster.py        | 214 +++++++++++++++++++++++++++++++++++++
 sub_clustering_pipeline.py |   9 +-
 5 files changed, 237 insertions(+), 8 deletions(-)
 create mode 100644 post_sub_cluster.py

diff --git a/README.md b/README.md
index 374732d..14bc303 100644
--- a/README.md
+++ b/README.md
@@ -32,9 +32,17 @@ A pipeline for clustering tweets
 By first clustering similar names (step 5), the input to GPT became more organized, which made step 6 much more effective.
 
-## How to use
+## How to extract main cluster
 
-You should give a excel file which has a column named "tweets" to this below command
+You should give an Excel file which has a column named "tweet" to the command below
 
 Overally it will take 15h time for 40,000 tweets
 
-    python3 clustering_pipeline.py --input_file tweets_file.xlsx --output_file tweets_file_output.xlsx
+    python3 clustering_pipeline.py --input_file tweets_file.xlsx --output_file tweets_file_cluster.xlsx
+
+
+## How to extract sub cluster
+
+You should first run the above code, which will give you an Excel file that has the columns "topic" and "cluster_llm"
+
+
+    python3 sub_clustering_pipeline.py --input_file tweets_file_cluster.xlsx --output_file tweets_file_sub_cluster.xlsx
diff --git a/clustering_pipeline.py b/clustering_pipeline.py
index ca661ac..9fcbd16 100644
--- a/clustering_pipeline.py
+++ b/clustering_pipeline.py
@@ -196,4 +196,4 @@ if __name__ == "__main__":
 
     # apply clustering
     post_cluster = PostClusterLLM()
-    post_cluster.start_process(topics_file, args.output_file)
\ No newline at end of file
+    post_cluster.start_process(topic_file, titles_file, args.output_file)
\ No newline at end of file
diff --git a/post_cluster.py b/post_cluster.py
index 257a95e..ccf0ced 100644
--- a/post_cluster.py
+++ b/post_cluster.py
@@
-153,11 +153,11 @@ class PostClusterLLM: return df_copy - def start_process(self, input_path, output_path): + def start_process(self, input_path, titles_path, output_path): df = pd.read_excel(input_path) df_copy = df.copy() - with open("titles_o3.txt", "r") as f: + with open(titles_path, "r") as f: titles = f.readlines() titles = [title.strip() for title in titles] @@ -200,4 +200,4 @@ class PostClusterLLM: if __name__ == "__main__": llm = PostClusterLLM() - llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic_recreation.xlsx", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3.xlsx") \ No newline at end of file + llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic_recreation.xlsx", "titles_o3.txt", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3.xlsx") \ No newline at end of file diff --git a/post_sub_cluster.py b/post_sub_cluster.py new file mode 100644 index 0000000..d19b91a --- /dev/null +++ b/post_sub_cluster.py @@ -0,0 +1,214 @@ +import asyncio +import aiohttp +import time +import re +import pandas as pd +import json +from tqdm import tqdm + +class PostSubClusterLLM: + def __init__(self): + + self.instruction = f""" + You will be given a title and a list of all cluster names. + Your task is to find the best fit cluster name for the title. + Go through the list of all cluster names and find the best fit cluster name for the title. + If you found a good fit, return the cluster name. + If you didn't find a good fit, return "outlier" is "yes". 
+ + #IMPORTANT: + - if you found a good fit use its id : {{"cluster" : "id_i"}} + - if the title is not related to any of the cluster names, return "outlier" is "yes" : {{"outlier" : "yes"}} + + Example-1: + - Input: + - title: "کتاب و درس" + - all_cluster_names: {{ + "1" : "کتابخوانی", + "2" : "فوتبال جام جهانی", + "3" : "ساختمان سازی شهری" }} + - Output: + - {{"cluster" : "1"}} + + Example-2: + - Input: + - title: "لپتاب و کامپیوتر" + - all_cluster_names: {{ + "1" : "کتابخوانی", + "2" : "فوتبال جام جهانی", + "3" : "ساختمان سازی شهری" }} + - Output: + - {{"outlier" : "yes"}} + + Example-3: + - Input: + - title: "ساختمان" + - all_cluster_names: {{ + "1" : "کتابخوانی", + "2" : "فوتبال جام جهانی", + "3" : "ساختمان سازی شهری" }} + - Output: + - {{"cluster" : "3"}} + + write a small reason and give the final answer. + """ + + + async def run_llm(self, session, topic, cluster_name, cluster_sub_cluster_list): + """ + Run the LLM as reranker. + Args: + session: The session to use for the request. + question: The question to rerank the documents. + chunk: The chunk to rerank. + Returns: + The score of the chunk. 
+ """ + if cluster_name == "متفرقه": + return None + + headers = {"Content-Type": "application/json",} + + for cluster_sub_cluster in cluster_sub_cluster_list: + if cluster_sub_cluster["cluster_name"] == cluster_name: + sub_cluster_names = cluster_sub_cluster["sub_cluster_names"] + break + + sub_cluster_names_str = "{\n" + for count, value in enumerate(sub_cluster_names): + sub_cluster_names_str += f"{count} : {value},\n" + + sub_cluster_names_str += "}" + + input_message = f"""{{"all_cluster_names": "{sub_cluster_names_str}", "title": "{topic}"}}""" + messages = [{"role": "system", "content": self.instruction}, {"role": "user", "content": input_message}] + + payload = { + "model": "google/gemma-3-27b-it", + "messages": messages, + "max_tokens": 500 + } + try: + async with session.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload) as resp: + resp.raise_for_status() + response = await resp.json() + + out = response['choices'][0]['message']['content'] + print("--------------------------------") + print(f"title: {topic}") + print(f"cluster_name: {cluster_name}") + print(out) + pattern = r'(\{"cluster".*?\})' + + matches = re.findall(pattern, out) + + for m in matches: + out_json = json.loads(m) + print(f"out_json: {out_json}") + if out_json.get("cluster") is not None: + print(sub_cluster_names[int(out_json.get("cluster"))]) + return out_json + + pattern = r'(\{"outlier".*?\})' + + matches = re.findall(pattern, out) + + for m in matches: + out_json = json.loads(m) + print(f"out_json: {out_json}") + print("outlier") + return out_json + except Exception as e: + print(f"Error in llm as reranker: {e}") + return 0 + + + async def run_llm_async(self, topics, cluster_names, cluster_sub_cluster_dict): + """ + Send all chunk requests concurrently. + Args: + topics: The topics to rerank. + cluster_names: The cluster names to rerank. + cluster_sub_cluster_dict: The cluster sub cluster dictionary. + Returns: + The scores of the chunks. 
+ """ + async with aiohttp.ClientSession() as session: + tasks = [self.run_llm(session, topic, cluster_name, cluster_sub_cluster_dict) for topic, cluster_name in zip(topics, cluster_names)] + scores_embed = await asyncio.gather(*tasks) + return scores_embed + + def sanitize_for_excel(self, df): + def _sanitize_for_excel(text): + """Remove zero-width and bidi control characters that can confuse Excel rendering.""" + if text is None: + return "" + s = str(text) + # Characters to remove: ZWNJ, ZWJ, RLM, LRM, RLE, LRE, PDF, BOM, Tatweel + remove_chars = [ + "\u200c", # ZWNJ + "\u200d", # ZWJ + "\u200e", # LRM + "\u200f", # RLM + "\u202a", # LRE + "\u202b", # RLE + "\u202c", # PDF + "\u202d", # LRO + "\u202e", # RLO + "\ufeff", # BOM + "\u0640", # Tatweel + ] + for ch in remove_chars: + s = s.replace(ch, "") + # Normalize whitespace + s = re.sub(r"\s+", " ", s).strip() + return s + + df_copy = df.copy() + for m in df.columns: + for i in range(len(df_copy[m])): + df_copy.loc[i, m] = _sanitize_for_excel(df_copy.loc[i, m]) + + return df_copy + + def start_process(self, input_path, titles_path, output_path): + df = pd.read_excel(input_path) + df_copy = df.copy() + + with open(titles_path, "r") as f: + cluster_sub_cluster_list = json.load(f) + + batch_size = 100 + for i in tqdm(range(0, len(df["topic"]), batch_size)): + start_time = time.time() + result_list = asyncio.run(self.run_llm_async(df["topic"][i:i+batch_size], df["cluster_llm"][i:i+batch_size], cluster_sub_cluster_list)) + end_time = time.time() + print(f"Time taken for llm as reranker: {end_time - start_time} seconds") + time.sleep(5) + + for j, result in enumerate(result_list): + try: + if result is None: + df_copy.at[i+j, "sub_cluster"] = "متفرقه" + elif result.get("outlier") == "yes": + df_copy.at[i+j, "sub_cluster"] = "موارد دیگر" + elif result.get("cluster") is not None: + for cluster_sub_cluster in cluster_sub_cluster_list: + if cluster_sub_cluster["cluster_name"] == df["cluster_llm"][i+j]: + 
sub_cluster_names = cluster_sub_cluster["sub_cluster_names"] + break + df_copy.at[i+j, "sub_cluster"] = sub_cluster_names[int(result["cluster"])] + else: + df_copy.at[i+j, "sub_cluster"] = "موارد دیگر" + + except Exception as e: + print(f"Error in result_list: {e}") + df_copy.at[i+j, "sub_cluster"] = "موارد دیگر" + + print(df_copy.at[i+j, "sub_cluster"]) + df_copy = self.sanitize_for_excel(df_copy) + df_copy.to_excel(output_path) + +if __name__ == "__main__": + llm = PostSubClusterLLM() + llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3.xlsx", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3_subcategory.json", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3_subcategory.xlsx") \ No newline at end of file diff --git a/sub_clustering_pipeline.py b/sub_clustering_pipeline.py index 65cd2cc..1996d7b 100644 --- a/sub_clustering_pipeline.py +++ b/sub_clustering_pipeline.py @@ -12,6 +12,8 @@ import random import re import json +from post_sub_cluster import PostSubClusterLLM + START_K = 2 END_K = 60 @@ -282,4 +284,9 @@ if __name__ == "__main__": args = parser.parse_args() # extracting topics - main(args.input_file, args.output_file) \ No newline at end of file + sub_cluster_file = args.output_file.replace(".xlsx", "_sub_cluster.json") + main(args.input_file, sub_cluster_file) + + # apply clustering + post_sub_cluster = PostSubClusterLLM() + post_sub_cluster.start_process(args.input_file, sub_cluster_file, args.output_file) \ No newline at end of file