add codes
commit 78656d9f4d
README.md (new file, 40 lines)
@@ -0,0 +1,40 @@
# TEXT CLUSTERING

A pipeline for clustering tweets.

## Overall Pipeline for Cluster Extraction

1. Convert tweet text to categories using the Gemma model:
   Takes about 7 hours for 40,000 tweets.

2. Convert categories to embedding vectors using Jina:
   Takes about 3 minutes.

3. Perform clustering with K-Means:
   Choose the number of clusters with the highest silhouette score among 20–60 groups (a sketch of this selection appears after this list).
   Takes about 5 minutes.

4. Name the clusters using the Gemma model:
   Takes about 1 minute.

5. Cluster the generated names using K-Means to group similar names together:
   Takes about 1 minute.

6. Use GPT o3 to merge and refine cluster names:
   GPT is given the list of cluster names and asked to build new, higher-level clusters.
   Takes about 1 minute.

7. Assign each topic to its final cluster using the Gemma model:
   Takes about 7 hours.

Reason for step 5:
If the list of names were passed directly to step 6, GPT would not perform well.
By first clustering similar names (step 5), the input to GPT becomes more organized,
which makes step 6 much more effective.
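
The selection in step 3 fits K-Means once per candidate k and keeps the k with the best silhouette score. A minimal sketch of that selection (mirroring `get_best_k` in `clustering_pipeline.py`; the function name `pick_k` is only illustrative, and scikit-learn is assumed to be installed):

```python
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def pick_k(embeddings, start_k=20, end_k=60):
    """Return the k in [start_k, end_k) with the highest silhouette score."""
    best_k, best_score = start_k, -1.0
    for k in range(start_k, min(end_k, len(embeddings))):
        # fit K-Means for this candidate k and score the resulting labels
        labels = KMeans(n_clusters=k, random_state=42, n_init="auto").fit_predict(embeddings)
        score = silhouette_score(embeddings, labels)
        if score > best_score:
            best_k, best_score = k, score
    return best_k
```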
## How to use

Provide an Excel file that has a column named "tweet" to the command below
(a minimal sketch of building such a file is shown after the command).
Overall, the pipeline takes about 15 hours for 40,000 tweets.

python3 clustering_pipeline.py --input_file tweets_file.xlsx --output_file tweets_file_output.xlsx
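
If you do not already have such a file, this is a minimal sketch of building one with pandas (assuming pandas and openpyxl are installed; the file name and rows are only placeholders):

```python
import pandas as pd

# Two placeholder rows; the pipeline only requires the "tweet" column.
df = pd.DataFrame({"tweet": ["first tweet text", "second tweet text"]})
df.to_excel("tweets_file.xlsx", index=False)
```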
clustering_pipeline.py (new file, 199 lines)
@@ -0,0 +1,199 @@
import argparse
import pandas as pd
from transformers import AutoModel
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from hazm import Normalizer
from tqdm import tqdm
import requests
from openai import OpenAI
import httpx
import random

from post_cluster import PostClusterLLM
from topic_recreation import TopicRecreation


START_K = 20
END_K = 60


def get_best_k(embeddings):

    max_sil_score = 0
    best_k = START_K
    for k in range(START_K, min(END_K, len(embeddings))):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init="auto")
        labels = kmeans.fit_predict(embeddings)

        sil_score = silhouette_score(embeddings, labels)
        if sil_score > max_sil_score:
            max_sil_score = sil_score
            best_k = k

    # refit with the winning k to get the final labels
    kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(embeddings)

    return best_k, labels


def get_embeddings(names):
    model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True).to("cuda")

    normalizer = Normalizer()
    names = [normalizer.normalize(name) for name in names]

    # generic action/stop words that are stripped before embedding
    adjs = ["توهین", "انتقاد", "نقد", "حمایت", "مسائل", "مربوط", "تهدید", "عملکرد", "رفتار", "به", "از", "در"]

    names_new = []
    for name in names:
        for adj in adjs:
            name = name.replace(adj, "")
        names_new.append(name)

    embeddings = []
    for batch in tqdm(range(0, len(names_new), 50)):
        embeddings += model.encode(names_new[batch:batch+50], task="separation").tolist()

    return embeddings


def get_cluster_names(clusters):
    headers = {"Content-Type": "application/json"}

    prompt = """
    You are a helpful assistant that generates names for clusters of trends in Persian.
    I will give you a list of trends and you will generate a name for this cluster.
    There might be some different topics in the list, so only consider the dominant topic.
    Just give me the final answer in Persian.
    """

    cluster_names = []
    for data in clusters:
        cluster_samples = random.sample(data, min(20, len(data)))

        messages = [{"role": "system", "content": prompt}, {"role": "user", "content": str(cluster_samples)}]

        payload = {
            "model": "google/gemma-3-27b-it",
            "messages": messages,
            "max_tokens": 8000
        }

        response = requests.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload)
        our_response = response.json()['choices'][0]['message']['content']
        cluster_names.append(our_response)

    return cluster_names


def modify_cluster_names(cluster_names):
    PROXY_URL = "http://2zajDvJvJg:e0BtBiynhF@192.168.130.40:51371/"
    http_client = httpx.Client(proxy=PROXY_URL)
    client = OpenAI(api_key="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", http_client=http_client)

    prompt = """
    You are a topic modification expert.

    I will give you a list of topics.

    ## TASK
    Extract meaningful and distinct topics from the list. You can change the names of topics. Keep it to about 20-30 topics that cover all of them.

    ## RULES
    - You can combine or split or ... for doing this task.
    - You can change the name of a topic to make it more general or more specific.
    - The final topics must be distinct and have a specific meaning compared to the others.
    - Don't combine topics that are not related to each other, like economical with political with social with ...
    - Combine topics that are related to each other, like ghaza with palestine or ...

    ## MUST
    - All categories must be distinct and have a specific meaning compared to the other categories.
    - Two categories can not be similar to each other.

    I will trust your intelligence.
    Write the final answer in Persian.
    """

    response = client.chat.completions.create(
        model="o3",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": str(cluster_names)}
        ]
    )
    out = response.choices[0].message.content

    return out


def main(input_file, output_file):
    # read input file
    df = pd.read_excel(input_file)
    topics = df["topic_recreation"].tolist()

    # get embeddings
    embeddings = get_embeddings(topics)

    # get best k and labels of kmeans with best_k
    best_k, labels = get_best_k(embeddings)

    # fill clusters
    clusters = []
    for i in range(best_k):
        clusters.append([])

    for i in range(len(clusters)):
        for topic, label in zip(topics, labels):
            if label == i:
                clusters[i].append(topic)

    # get cluster names
    cluster_names = get_cluster_names(clusters)

    # get embeddings for cluster names
    cluster_names_embeddings = get_embeddings(cluster_names)

    # get best k and labels of kmeans with best_k
    best_k_cluster_names, labels_cluster_names = get_best_k(cluster_names_embeddings)

    # fill clusters of cluster_names
    clusters_cluster_names = []
    for i in range(best_k_cluster_names):
        clusters_cluster_names.append([])

    for i in range(len(clusters_cluster_names)):
        for cluster_name, label in zip(cluster_names, labels_cluster_names):
            if label == i:
                clusters_cluster_names[i].append(cluster_name)

    # get refined cluster names for the clusters of cluster_names
    cluster_names_modify = modify_cluster_names(clusters_cluster_names)

    # save cluster names (the o3 reply is plain text, expected to hold one cluster name per line)
    with open(output_file, "w") as f:
        final_names = cluster_names_modify.splitlines()
        for count, cluster_name in enumerate(final_names):
            if count == len(final_names) - 1:
                f.write(cluster_name)
            else:
                f.write(cluster_name + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True)
    parser.add_argument("--output_file", type=str, required=True)
    args = parser.parse_args()

    # apply topic_recreation
    topic_recreation = TopicRecreation()
    topic_file = args.output_file.replace(".xlsx", "_topic_recreation.xlsx")
    topic_recreation.start_process(args.input_file, topic_file)

    # extracting topics
    titles_file = args.output_file.replace(".xlsx", "_titles.txt")
    main(topic_file, titles_file)

    # apply clustering
    post_cluster = PostClusterLLM()
    post_cluster.start_process(topic_file, args.output_file)
post_cluster.py (new file, 203 lines)
@@ -0,0 +1,203 @@
import asyncio
import aiohttp
import time
import re
import pandas as pd
import json
from tqdm import tqdm


class PostClusterLLM:
    def __init__(self):

        self.instruction = f"""
        You will be given a title and a list of all cluster names.
        Your task is to find the best fitting cluster name for the title.
        Go through the list of all cluster names and find the best fitting cluster name for the title.
        If you find a good fit, return that cluster name.
        If you do not find a good fit, return "outlier" as "yes".

        #IMPORTANT:
        - if you found a good fit, use its id : {{"cluster" : "id_i"}}
        - if the title is not related to any of the cluster names, return "outlier" as "yes" : {{"outlier" : "yes"}}

        Example-1:
        - Input:
            - title: "کتاب و درس"
            - all_cluster_names: {{
                "1" : "کتابخوانی",
                "2" : "فوتبال جام جهانی",
                "3" : "ساختمان سازی شهری" }}
        - Output:
            - {{"cluster" : "1"}}

        Example-2:
        - Input:
            - title: "لپتاب و کامپیوتر"
            - all_cluster_names: {{
                "1" : "کتابخوانی",
                "2" : "فوتبال جام جهانی",
                "3" : "ساختمان سازی شهری" }}
        - Output:
            - {{"outlier" : "yes"}}

        Example-3:
        - Input:
            - title: "ساختمان"
            - all_cluster_names: {{
                "1" : "کتابخوانی",
                "2" : "فوتبال جام جهانی",
                "3" : "ساختمان سازی شهری" }}
        - Output:
            - {{"cluster" : "3"}}

        Write a short reason and then give the final answer.
        """


    async def run_llm(self, session, title, cluster_names):
        """
        Ask the LLM to assign a single title to one of the clusters.

        Args:
            session: The aiohttp session to use for the request.
            title: The title to assign.
            cluster_names: The formatted mapping of cluster ids to cluster names.

        Returns:
            The parsed answer, {"cluster": id} or {"outlier": "yes"}, or 0 on error.
        """
        headers = {"Content-Type": "application/json"}

        input_message = f"""{{"all_cluster_names": "{cluster_names}", "title": "{title}"}}"""
        messages = [{"role": "system", "content": self.instruction}, {"role": "user", "content": input_message}]

        payload = {
            "model": "google/gemma-3-27b-it",
            "messages": messages,
            "max_tokens": 500
        }
        try:
            async with session.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload) as resp:
                resp.raise_for_status()
                response = await resp.json()

                out = response['choices'][0]['message']['content']
                print("--------------------------------")
                print(f"title: {title}")
                print(out)

                # look for a {"cluster": ...} answer first
                pattern = r'(\{"cluster".*?\})'
                matches = re.findall(pattern, out)
                for m in matches:
                    out_json = json.loads(m)
                    print(f"out_json: {out_json}")
                    return out_json

                # otherwise look for an {"outlier": ...} answer
                pattern = r'(\{"outlier".*?\})'
                matches = re.findall(pattern, out)
                for m in matches:
                    out_json = json.loads(m)
                    print(f"out_json: {out_json}")
                    return out_json
        except Exception as e:
            print(f"Error in LLM cluster assignment: {e}")
            return 0


    async def run_llm_async(self, titles, cluster_names):
        """
        Send all title requests concurrently.

        Args:
            titles: The titles to assign to clusters.
            cluster_names: The formatted mapping of cluster ids to cluster names.

        Returns:
            The LLM answers, one per title.
        """
        async with aiohttp.ClientSession() as session:
            tasks = [self.run_llm(session, title, cluster_names) for title in titles]
            scores_embed = await asyncio.gather(*tasks)
            return scores_embed

    def sanitize_for_excel(self, df):
        def _sanitize_for_excel(text):
            """Remove zero-width and bidi control characters that can confuse Excel rendering."""
            if text is None:
                return ""
            s = str(text)
            # Characters to remove: ZWNJ, ZWJ, RLM, LRM, RLE, LRE, PDF, BOM, Tatweel
            remove_chars = [
                "\u200c",  # ZWNJ
                "\u200d",  # ZWJ
                "\u200e",  # LRM
                "\u200f",  # RLM
                "\u202a",  # LRE
                "\u202b",  # RLE
                "\u202c",  # PDF
                "\u202d",  # LRO
                "\u202e",  # RLO
                "\ufeff",  # BOM
                "\u0640",  # Tatweel
            ]
            for ch in remove_chars:
                s = s.replace(ch, "")
            # Normalize whitespace
            s = re.sub(r"\s+", " ", s).strip()
            return s

        df_copy = df.copy()
        for m in df.columns:
            for i in range(len(df_copy[m])):
                df_copy.loc[i, m] = _sanitize_for_excel(df_copy.loc[i, m])

        return df_copy

    def start_process(self, input_path, output_path):
        df = pd.read_excel(input_path)
        df_copy = df.copy()

        # final cluster names produced by the o3 refinement step, one per line
        with open("titles_o3.txt", "r") as f:
            titles = f.readlines()

        titles = [title.strip() for title in titles]

        cluster_names_dict = {}
        count = 1
        for item in titles:
            cluster_names_dict[str(count)] = item
            count += 1

        cluster_names = "{\n"
        for key, value in cluster_names_dict.items():
            cluster_names += f"{key} : {value},\n"

        cluster_names += "}"

        batch_size = 100
        for i in tqdm(range(0, len(df["topic"]), batch_size)):
            start_time = time.time()
            result_list = asyncio.run(self.run_llm_async(df["topic"][i:i+batch_size], cluster_names))
            end_time = time.time()
            print(f"Time taken for LLM cluster assignment: {end_time - start_time} seconds")
            time.sleep(5)

            for j, result in enumerate(result_list):
                try:
                    if result.get("outlier") == "yes":
                        df_copy.at[i+j, "cluster_llm"] = "متفرقه"
                    elif result.get("cluster") is not None:
                        df_copy.at[i+j, "cluster_llm"] = cluster_names_dict[result["cluster"]]
                    else:
                        df_copy.at[i+j, "cluster_llm"] = df_copy.at[i+j, "category"]

                except Exception as e:
                    print(f"Error in result_list: {e}")
                    df_copy.at[i+j, "cluster_llm"] = df_copy.at[i+j, "category"]

        df_copy = self.sanitize_for_excel(df_copy)
        df_copy.to_excel(output_path)


if __name__ == "__main__":
    llm = PostClusterLLM()
    llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic_recreation.xlsx", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3.xlsx")
sub_clustering_pipeline.py (new file, 285 lines)
@@ -0,0 +1,285 @@
import argparse
import pandas as pd
from transformers import AutoModel
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from hazm import Normalizer
from tqdm import tqdm
import requests
from openai import OpenAI
import httpx
import random
import re
import json


START_K = 2
END_K = 60


def sanitize_for_excel(text):
    """Remove zero-width and bidi control characters that can confuse Excel rendering."""
    if text is None:
        return ""
    s = str(text)
    # Characters to remove: ZWNJ, ZWJ, RLM, LRM, RLE, LRE, PDF, BOM, Tatweel
    remove_chars = [
        "\u200c",  # ZWNJ
        "\u200d",  # ZWJ
        "\u200e",  # LRM
        "\u200f",  # RLM
        "\u202a",  # LRE
        "\u202b",  # RLE
        "\u202c",  # PDF
        "\u202d",  # LRO
        "\u202e",  # RLO
        "\ufeff",  # BOM
        "\u0640",  # Tatweel
    ]
    for ch in remove_chars:
        s = s.replace(ch, "")
    # Normalize whitespace
    s = re.sub(r"\s+", " ", s).strip()
    return s


def get_best_k(embeddings):

    max_sil_score = 0
    best_k = START_K
    for k in range(START_K, min(END_K, len(embeddings))):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(embeddings)

        sil_score = silhouette_score(embeddings, labels)
        if sil_score > max_sil_score:
            max_sil_score = sil_score
            best_k = k

    # refit with the winning k to get the final labels
    kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(embeddings)

    return best_k, labels


def get_embeddings(names):
    model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True).to("cuda")

    normalizer = Normalizer()
    names = [normalizer.normalize(name) for name in names]

    # generic action/stop words that are stripped before embedding
    adjs = ["توهین", "انتقاد", "نقد", "حمایت", "مسائل", "مربوط", "تهدید", "عملکرد", "رفتار", "به", "از", "در"]

    names_new = []
    for name in names:
        for adj in adjs:
            name = name.replace(adj, "")
        names_new.append(name)

    embeddings = []
    for batch in tqdm(range(0, len(names_new), 50)):
        embeddings += model.encode(names_new[batch:batch+50], task="separation").tolist()

    return embeddings


def get_cluster_names(clusters):
    headers = {"Content-Type": "application/json"}

    prompt = """
    You are a helpful assistant that generates names for clusters of topics in Persian.
    I will give you a list of topics and you will generate a name for this cluster.
    There might be some different topics in the list, so only consider the dominant topic.
    Be specific about the cluster name.
    Just give me the final answer in Persian.
    """

    cluster_names = []
    for data in clusters:

        # skip very small clusters
        if len(data) < 10:
            continue

        cluster_samples = random.sample(data, min(20, len(data)))

        messages = [{"role": "system", "content": prompt}, {"role": "user", "content": str(cluster_samples)}]

        payload = {
            "model": "google/gemma-3-27b-it",
            "messages": messages,
            "max_tokens": 8000
        }

        response = requests.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload)
        our_response = response.json()['choices'][0]['message']['content']
        cluster_names.append(our_response)

    return cluster_names


def modify_cluster_names(cluster_names, title, best_k):
    PROXY_URL = "http://2zajDvJvJg:e0BtBiynhF@192.168.130.40:51371/"
    http_client = httpx.Client(proxy=PROXY_URL)
    client = OpenAI(api_key="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", http_client=http_client)

    # round half of best_k down to the nearest multiple of 10 to bound the number of sub categories
    start = (best_k // 2) - ((best_k // 2) % 10)
    if start == 0:
        start = 1

    prompt = f"""
    You are a sub category modification expert.

    I will give you a list of topics.

    All these topics belong to the {title} category.

    ## TASK
    Extract meaningful and distinct sub categories from the list. You can change the names of topics. Keep it to about {start}-{start+10} topics that cover all of them.

    ## RULES
    - You can combine or split or ... for doing this task.
    - You can change the name of a topic to make it more general or more specific.
    - The final topics must be distinct and have a specific meaning compared to the others.
    - Don't combine topics that are not related to each other, like economical with political with social with ...
    - Combine topics that are related to each other, like ghaza with palestine or ...

    ## MUST
    - All sub categories must be distinct and have a specific meaning compared to the other categories.
    - Two categories can not be similar to each other.
    - Be specific about sub categories.

    I will trust your intelligence.
    Write the final answer in Persian.
    """

    response = client.chat.completions.create(
        model="o3",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": str(cluster_names)}
        ]
    )
    out = response.choices[0].message.content

    return out


def extract_list(text, count):

    headers = {"Content-Type": "application/json"}

    prompt = """
    Extract the titles from this text and put them in a list.
    Just return the output in list format, do not include any other text: ["title_1", "title_2", ...]
    """

    messages = [{"role": "system", "content": prompt}, {"role": "user", "content": text}]

    payload = {
        "model": "google/gemma-3-27b-it",
        "messages": messages,
        "max_tokens": 8000
    }

    response = requests.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload)
    out = response.json()['choices'][0]['message']['content']
    try:
        out = json.loads(out)
    except Exception:
        # fall back to the raw text if the model did not return valid JSON
        print(f"error in extract list {count}")
    return out


def main(input_file, output_file):
    # read input file
    df = pd.read_excel(input_file)
    topics = df["topic"].tolist()
    cluster_llms = df["cluster_llm"].tolist()

    # get embeddings
    embeddings = get_embeddings(topics)

    # extract main cluster names
    cluster_names = []
    with open("titles_o3.txt", "r") as f:
        titles = f.readlines()

    titles = [sanitize_for_excel(title.strip()) for title in titles]

    embedding_cluster = []
    best_k = len(titles)
    for i in range(best_k):
        embedding_cluster.append([])

    topic_cluster = []
    best_k = len(titles)
    for i in range(best_k):
        topic_cluster.append([])

    for m in range(len(titles)):
        for embedding, cluster_name, topic in zip(embeddings, cluster_llms, topics):
            if cluster_name == titles[m]:
                embedding_cluster[m].append(embedding)
                topic_cluster[m].append(topic)

    sub_cluster_names = []
    for cluster_count in tqdm(range(len(titles))):
        print(f"start {cluster_count} \n")
        # get best k and labels of kmeans with best_k
        best_k, labels = get_best_k(embedding_cluster[cluster_count])
        print(f"initial best_k {best_k}\n")

        # fill clusters
        clusters = []
        for i in range(best_k):
            clusters.append([])

        for i in range(len(clusters)):
            for topic, label in zip(topic_cluster[cluster_count], labels):
                if label == i:
                    clusters[i].append(topic)

        # get cluster names
        cluster_names = get_cluster_names(clusters)

        if len(cluster_names) > 1:
            # get embeddings for cluster names
            cluster_names_embeddings = get_embeddings(cluster_names)

            # get best k and labels of kmeans with best_k
            best_k_cluster_names, labels_cluster_names = get_best_k(cluster_names_embeddings)
            print(f"second best_k {best_k_cluster_names}\n")

            # fill clusters of cluster_names
            clusters_cluster_names = []
            for i in range(best_k_cluster_names):
                clusters_cluster_names.append([])

            for i in range(len(clusters_cluster_names)):
                for cluster_name, label in zip(cluster_names, labels_cluster_names):
                    if label == i:
                        clusters_cluster_names[i].append(cluster_name)

            # get refined sub cluster names for the clusters of cluster_names
            cluster_names_modify = modify_cluster_names(clusters_cluster_names, titles[cluster_count], best_k)
            cluster_names_modify_list = extract_list(cluster_names_modify, cluster_count)
            sub_cluster_names.append({"id": cluster_count, "cluster_name": titles[cluster_count], "sub_cluster_names": cluster_names_modify_list})

        else:
            sub_cluster_names.append({"id": cluster_count, "cluster_name": titles[cluster_count], "sub_cluster_names": []})

    # save cluster names
    if not output_file.endswith(".json"):
        output_file = output_file + ".json"

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(sub_cluster_names, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True)
    parser.add_argument("--output_file", type=str, required=True)
    args = parser.parse_args()

    # extracting topics
    main(args.input_file, args.output_file)
test_saeed_tweet_2.ipynb (new file, 14724 lines)
File diff suppressed because one or more lines are too long
topic_recreation.py (new file, 142 lines)
@@ -0,0 +1,142 @@
import asyncio
import aiohttp
import time
import re
import pandas as pd
import json
from tqdm import tqdm


class TopicRecreation:
    def __init__(self):

        self.instruction = f"""
        You will be given a tweet text.
        Your task is to write a phrase category that this tweet is related to.
        This should be a combination of action + category:

        for example :
        انتقاد از سیاست ایران
        توهین به مقامات کشور
        حمایت از نظام جمهوری اسلامی
        جنگ اسراییل و قطر
        مسایل مربوط به موضوع هسته ای ایران
        مسایل مربوط به افغانستان

        The category should be in Persian.

        # Rules
        - If it does not have a specific meaning then write "متفرقه"
        - Be specific about the countries.
        - Do not be specific about the people.
        - You can consider different categories and write an action + category or just a simple category.

        Just return the category, do not include any other text.
        """


    async def run_llm(self, session, tweet):
        """
        Ask the LLM to generate a category phrase for a single tweet.

        Args:
            session: The aiohttp session to use for the request.
            tweet: The tweet text to categorize.

        Returns:
            The category of the tweet.
        """
        headers = {"Content-Type": "application/json"}

        # drop @mentions before sending the tweet to the model
        tweet = " ".join([m for m in tweet.split(" ") if "@" not in m])

        input_message = f"""{{"tweet": "{tweet}"}}"""
        messages = [{"role": "system", "content": self.instruction}, {"role": "user", "content": input_message}]

        payload = {
            "model": "google/gemma-3-27b-it",
            "messages": messages,
            "max_tokens": 500
        }
        # try:
        async with session.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload) as resp:
            resp.raise_for_status()
            response = await resp.json()

            out = response['choices'][0]['message']['content']

            return out

        # except Exception as e:
        #     print(f"Error in LLM categorization: {e}")
        #     return 0


    async def run_llm_async(self, tweets):
        """
        Send all tweet requests concurrently.

        Args:
            tweets: The tweets to categorize.

        Returns:
            The categories of the tweets.
        """
        async with aiohttp.ClientSession() as session:
            tasks = [self.run_llm(session, tweet) for tweet in tweets]
            scores_embed = await asyncio.gather(*tasks)
            return scores_embed

    def sanitize_for_excel(self, df):
        def _sanitize_for_excel(text):
            """Remove zero-width and bidi control characters that can confuse Excel rendering."""
            if text is None:
                return ""
            s = str(text)
            # Characters to remove: ZWNJ, ZWJ, RLM, LRM, RLE, LRE, PDF, BOM, Tatweel
            remove_chars = [
                "\u200c",  # ZWNJ
                "\u200d",  # ZWJ
                "\u200e",  # LRM
                "\u200f",  # RLM
                "\u202a",  # LRE
                "\u202b",  # RLE
                "\u202c",  # PDF
                "\u202d",  # LRO
                "\u202e",  # RLO
                "\ufeff",  # BOM
                "\u0640",  # Tatweel
            ]
            for ch in remove_chars:
                s = s.replace(ch, "")
            # Normalize whitespace
            s = re.sub(r"\s+", " ", s).strip()
            return s

        df_copy = df.copy()
        for m in ["category"]:
            for i in range(len(df_copy[m])):
                df_copy.loc[i, m] = _sanitize_for_excel(df_copy.loc[i, m])

        return df_copy

    def start_process(self, input_path, output_path):
        df = pd.read_excel(input_path)
        df_copy = df.copy()

        tweets = df["tweet"].tolist()

        for i in tqdm(range(0, len(tweets), 1000)):
            start_time = time.time()
            result_list = asyncio.run(self.run_llm_async(tweets[i:i+1000]))
            end_time = time.time()
            print(f"Time taken for LLM categorization: {end_time - start_time} seconds")

            time.sleep(5)

            for j, result in enumerate(result_list):
                df_copy.at[i+j, "category"] = result

        df_copy = self.sanitize_for_excel(df_copy)
        df_copy.to_excel(output_path)


if __name__ == "__main__":
    llm = TopicRecreation()
    llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic.xlsx", "/home/firouzi/trend_grouping_new/tweet_topic_recreation.xlsx")