add sub cluster
This commit is contained in:
parent
78656d9f4d
commit
104aef2eef
14
README.md
14
README.md
@ -32,9 +32,17 @@ A pipeline for clustering tweets
|
||||
By first clustering similar names (step 5), the input to GPT became more organized,
|
||||
which made step 6 much more effective.
|
||||
|
||||
## How to use
|
||||
## How to extract main cluster
|
||||
|
||||
You should give a excel file which has a column named "tweets" to this below command
|
||||
You should give an Excel file which has a column named "tweet" to the command below
|
||||
Overall, it will take about 15 hours for 40,000 tweets
|
||||
|
||||
python3 clustering_pipeline.py --input_file tweets_file.xlsx --output_file tweets_file_output.xlsx
|
||||
python3 clustering_pipeline.py --input_file tweets_file.xlsx --output_file tweets_file_cluster.xlsx
|
||||
|
||||
|
||||
## How to extract sub cluster
|
||||
|
||||
You should first run the above command, which will give you an Excel file that has the columns "topic" and "cluster_llm"
|
||||
|
||||
|
||||
python3 sub_clustering_pipeline.py --input_file tweets_file_cluster.xlsx --output_file tweets_file_sub_cluster.xlsx
|
||||
|
||||
@ -196,4 +196,4 @@ if __name__ == "__main__":
|
||||
|
||||
# apply clustering
|
||||
post_cluster = PostClusterLLM()
|
||||
post_cluster.start_process(topics_file, args.output_file)
|
||||
post_cluster.start_process(topic_file, titles_file, args.output_file)
|
||||
@ -153,11 +153,11 @@ class PostClusterLLM:
|
||||
|
||||
return df_copy
|
||||
|
||||
def start_process(self, input_path, output_path):
|
||||
def start_process(self, input_path, titles_path, output_path):
|
||||
df = pd.read_excel(input_path)
|
||||
df_copy = df.copy()
|
||||
|
||||
with open("titles_o3.txt", "r") as f:
|
||||
with open(titles_path, "r") as f:
|
||||
titles = f.readlines()
|
||||
|
||||
titles = [title.strip() for title in titles]
|
||||
@ -200,4 +200,4 @@ class PostClusterLLM:
|
||||
|
||||
if __name__ == "__main__":
|
||||
llm = PostClusterLLM()
|
||||
llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic_recreation.xlsx", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3.xlsx")
|
||||
llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic_recreation.xlsx", "titles_o3.txt", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3.xlsx")
|
||||
214
post_sub_cluster.py
Normal file
214
post_sub_cluster.py
Normal file
@ -0,0 +1,214 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import time
|
||||
import re
|
||||
import pandas as pd
|
||||
import json
|
||||
from tqdm import tqdm
|
||||
|
||||
class PostSubClusterLLM:
|
||||
def __init__(self):
|
||||
|
||||
self.instruction = f"""
|
||||
You will be given a title and a list of all cluster names.
|
||||
Your task is to find the best fit cluster name for the title.
|
||||
Go through the list of all cluster names and find the best fit cluster name for the title.
|
||||
If you found a good fit, return the cluster name.
|
||||
If you didn't find a good fit, return "outlier" is "yes".
|
||||
|
||||
#IMPORTANT:
|
||||
- if you found a good fit use its id : {{"cluster" : "id_i"}}
|
||||
- if the title is not related to any of the cluster names, return "outlier" is "yes" : {{"outlier" : "yes"}}
|
||||
|
||||
Example-1:
|
||||
- Input:
|
||||
- title: "کتاب و درس"
|
||||
- all_cluster_names: {{
|
||||
"1" : "کتابخوانی",
|
||||
"2" : "فوتبال جام جهانی",
|
||||
"3" : "ساختمان سازی شهری" }}
|
||||
- Output:
|
||||
- {{"cluster" : "1"}}
|
||||
|
||||
Example-2:
|
||||
- Input:
|
||||
- title: "لپتاب و کامپیوتر"
|
||||
- all_cluster_names: {{
|
||||
"1" : "کتابخوانی",
|
||||
"2" : "فوتبال جام جهانی",
|
||||
"3" : "ساختمان سازی شهری" }}
|
||||
- Output:
|
||||
- {{"outlier" : "yes"}}
|
||||
|
||||
Example-3:
|
||||
- Input:
|
||||
- title: "ساختمان"
|
||||
- all_cluster_names: {{
|
||||
"1" : "کتابخوانی",
|
||||
"2" : "فوتبال جام جهانی",
|
||||
"3" : "ساختمان سازی شهری" }}
|
||||
- Output:
|
||||
- {{"cluster" : "3"}}
|
||||
|
||||
write a small reason and give the final answer.
|
||||
"""
|
||||
|
||||
|
||||
async def run_llm(self, session, topic, cluster_name, cluster_sub_cluster_list):
|
||||
"""
|
||||
Run the LLM as reranker.
|
||||
Args:
|
||||
session: The session to use for the request.
|
||||
question: The question to rerank the documents.
|
||||
chunk: The chunk to rerank.
|
||||
Returns:
|
||||
The score of the chunk.
|
||||
"""
|
||||
if cluster_name == "متفرقه":
|
||||
return None
|
||||
|
||||
headers = {"Content-Type": "application/json",}
|
||||
|
||||
for cluster_sub_cluster in cluster_sub_cluster_list:
|
||||
if cluster_sub_cluster["cluster_name"] == cluster_name:
|
||||
sub_cluster_names = cluster_sub_cluster["sub_cluster_names"]
|
||||
break
|
||||
|
||||
sub_cluster_names_str = "{\n"
|
||||
for count, value in enumerate(sub_cluster_names):
|
||||
sub_cluster_names_str += f"{count} : {value},\n"
|
||||
|
||||
sub_cluster_names_str += "}"
|
||||
|
||||
input_message = f"""{{"all_cluster_names": "{sub_cluster_names_str}", "title": "{topic}"}}"""
|
||||
messages = [{"role": "system", "content": self.instruction}, {"role": "user", "content": input_message}]
|
||||
|
||||
payload = {
|
||||
"model": "google/gemma-3-27b-it",
|
||||
"messages": messages,
|
||||
"max_tokens": 500
|
||||
}
|
||||
try:
|
||||
async with session.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload) as resp:
|
||||
resp.raise_for_status()
|
||||
response = await resp.json()
|
||||
|
||||
out = response['choices'][0]['message']['content']
|
||||
print("--------------------------------")
|
||||
print(f"title: {topic}")
|
||||
print(f"cluster_name: {cluster_name}")
|
||||
print(out)
|
||||
pattern = r'(\{"cluster".*?\})'
|
||||
|
||||
matches = re.findall(pattern, out)
|
||||
|
||||
for m in matches:
|
||||
out_json = json.loads(m)
|
||||
print(f"out_json: {out_json}")
|
||||
if out_json.get("cluster") is not None:
|
||||
print(sub_cluster_names[int(out_json.get("cluster"))])
|
||||
return out_json
|
||||
|
||||
pattern = r'(\{"outlier".*?\})'
|
||||
|
||||
matches = re.findall(pattern, out)
|
||||
|
||||
for m in matches:
|
||||
out_json = json.loads(m)
|
||||
print(f"out_json: {out_json}")
|
||||
print("outlier")
|
||||
return out_json
|
||||
except Exception as e:
|
||||
print(f"Error in llm as reranker: {e}")
|
||||
return 0
|
||||
|
||||
|
||||
async def run_llm_async(self, topics, cluster_names, cluster_sub_cluster_dict):
|
||||
"""
|
||||
Send all chunk requests concurrently.
|
||||
Args:
|
||||
topics: The topics to rerank.
|
||||
cluster_names: The cluster names to rerank.
|
||||
cluster_sub_cluster_dict: The cluster sub cluster dictionary.
|
||||
Returns:
|
||||
The scores of the chunks.
|
||||
"""
|
||||
async with aiohttp.ClientSession() as session:
|
||||
tasks = [self.run_llm(session, topic, cluster_name, cluster_sub_cluster_dict) for topic, cluster_name in zip(topics, cluster_names)]
|
||||
scores_embed = await asyncio.gather(*tasks)
|
||||
return scores_embed
|
||||
|
||||
def sanitize_for_excel(self, df):
|
||||
def _sanitize_for_excel(text):
|
||||
"""Remove zero-width and bidi control characters that can confuse Excel rendering."""
|
||||
if text is None:
|
||||
return ""
|
||||
s = str(text)
|
||||
# Characters to remove: ZWNJ, ZWJ, RLM, LRM, RLE, LRE, PDF, BOM, Tatweel
|
||||
remove_chars = [
|
||||
"\u200c", # ZWNJ
|
||||
"\u200d", # ZWJ
|
||||
"\u200e", # LRM
|
||||
"\u200f", # RLM
|
||||
"\u202a", # LRE
|
||||
"\u202b", # RLE
|
||||
"\u202c", # PDF
|
||||
"\u202d", # LRO
|
||||
"\u202e", # RLO
|
||||
"\ufeff", # BOM
|
||||
"\u0640", # Tatweel
|
||||
]
|
||||
for ch in remove_chars:
|
||||
s = s.replace(ch, "")
|
||||
# Normalize whitespace
|
||||
s = re.sub(r"\s+", " ", s).strip()
|
||||
return s
|
||||
|
||||
df_copy = df.copy()
|
||||
for m in df.columns:
|
||||
for i in range(len(df_copy[m])):
|
||||
df_copy.loc[i, m] = _sanitize_for_excel(df_copy.loc[i, m])
|
||||
|
||||
return df_copy
|
||||
|
||||
def start_process(self, input_path, titles_path, output_path):
|
||||
df = pd.read_excel(input_path)
|
||||
df_copy = df.copy()
|
||||
|
||||
with open(titles_path, "r") as f:
|
||||
cluster_sub_cluster_list = json.load(f)
|
||||
|
||||
batch_size = 100
|
||||
for i in tqdm(range(0, len(df["topic"]), batch_size)):
|
||||
start_time = time.time()
|
||||
result_list = asyncio.run(self.run_llm_async(df["topic"][i:i+batch_size], df["cluster_llm"][i:i+batch_size], cluster_sub_cluster_list))
|
||||
end_time = time.time()
|
||||
print(f"Time taken for llm as reranker: {end_time - start_time} seconds")
|
||||
time.sleep(5)
|
||||
|
||||
for j, result in enumerate(result_list):
|
||||
try:
|
||||
if result is None:
|
||||
df_copy.at[i+j, "sub_cluster"] = "متفرقه"
|
||||
elif result.get("outlier") == "yes":
|
||||
df_copy.at[i+j, "sub_cluster"] = "موارد دیگر"
|
||||
elif result.get("cluster") is not None:
|
||||
for cluster_sub_cluster in cluster_sub_cluster_list:
|
||||
if cluster_sub_cluster["cluster_name"] == df["cluster_llm"][i+j]:
|
||||
sub_cluster_names = cluster_sub_cluster["sub_cluster_names"]
|
||||
break
|
||||
df_copy.at[i+j, "sub_cluster"] = sub_cluster_names[int(result["cluster"])]
|
||||
else:
|
||||
df_copy.at[i+j, "sub_cluster"] = "موارد دیگر"
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error in result_list: {e}")
|
||||
df_copy.at[i+j, "sub_cluster"] = "موارد دیگر"
|
||||
|
||||
print(df_copy.at[i+j, "sub_cluster"])
|
||||
df_copy = self.sanitize_for_excel(df_copy)
|
||||
df_copy.to_excel(output_path)
|
||||
|
||||
if __name__ == "__main__":
|
||||
llm = PostSubClusterLLM()
|
||||
llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3.xlsx", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3_subcategory.json", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3_subcategory.xlsx")
|
||||
@ -12,6 +12,8 @@ import random
|
||||
import re
|
||||
import json
|
||||
|
||||
from post_sub_cluster import PostSubClusterLLM
|
||||
|
||||
|
||||
START_K = 2
|
||||
END_K = 60
|
||||
@ -282,4 +284,9 @@ if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
|
||||
# extracting topics
|
||||
main(args.input_file, args.output_file)
|
||||
sub_cluster_file = args.output_file.replace(".xlsx", "_sub_cluster.json")
|
||||
main(args.input_file, sub_cluster_file)
|
||||
|
||||
# apply clustering
|
||||
post_sub_cluster = PostSubClusterLLM()
|
||||
post_sub_cluster.start_process(args.input_file, sub_cluster_file, args.output_file)
|
||||
Loading…
x
Reference in New Issue
Block a user