add codes

Commit 78656d9f4d

README.md (new file, 40 lines)
# TEXT CLUSTERING

A pipeline for clustering tweets.

## Overall Pipeline for Cluster Extraction
1. Convert tweet text to categories using the Gemma model:
   Takes about 7 hours for 40,000 tweets.

2. Convert categories to embedding vectors using Jina:
   Takes about 3 minutes.

3. Perform clustering with K-Means:
   Choose the number of clusters with the highest silhouette score among 20–60 groups (a short sketch of this selection rule follows the list).
   Takes about 5 minutes.

4. Name the clusters using the Gemma model:
   Takes about 1 minute.

5. Cluster the generated names using K-Means and group similar names together:
   Takes about 1 minute.

6. Use GPT o3 to merge and refine cluster names:
   Provide GPT with the list of cluster names and ask it to build new, higher-level clusters.
   Takes about 1 minute.

7. Assign each topic to its final cluster using the Gemma model:
   Takes about 7 hours.
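A minimal sketch of the selection rule used in step 3 (it mirrors `get_best_k` in `clustering_pipeline.py`; the `pick_k` name and the random vectors standing in for the Jina embeddings are illustrative only):

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def pick_k(embeddings, k_min=20, k_max=60):
    """Return the k in [k_min, k_max) with the highest silhouette score."""
    best_k, best_score = k_min, -1.0
    for k in range(k_min, min(k_max, len(embeddings))):
        labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(embeddings)
        score = silhouette_score(embeddings, labels)
        if score > best_score:
            best_k, best_score = k, score
    return best_k

# toy example: random vectors stand in for the Jina embeddings
rng = np.random.default_rng(0)
print(pick_k(rng.normal(size=(200, 32))))
```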
Reason for step 5: If I had directly given the list of names to step 6, GPT wouldn't have performed well. By first clustering similar names (step 5), the input to GPT became more organized, which made step 6 much more effective.
## How to use

Give the command below an Excel file that has a column named "tweet".
Overall, it takes about 15 hours for 40,000 tweets.

python3 clustering_pipeline.py --input_file tweets_file.xlsx --output_file tweets_file_output.xlsx
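For reference, a minimal sketch of building an input file in the expected format with pandas (the file name and example rows are placeholders; `topic_recreation.py` reads the `tweet` column, and writing `.xlsx` assumes `openpyxl` is installed):

```python
import pandas as pd

# placeholder rows; in practice this is your collected tweet text
tweets = [
    "نمونه توییت اول ...",
    "نمونه توییت دوم ...",
]
pd.DataFrame({"tweet": tweets}).to_excel("tweets_file.xlsx", index=False)
```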
clustering_pipeline.py (new file, 199 lines)
import argparse
import pandas as pd
from transformers import AutoModel
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from hazm import Normalizer
from tqdm import tqdm
import requests
from openai import OpenAI
import httpx
import random

from post_cluster import PostClusterLLM
from topic_recreation import TopicRecreation


# search range for the number of K-Means clusters
START_K = 20
END_K = 60


def get_best_k(embeddings):
    """Pick the k in [START_K, END_K) with the highest silhouette score."""
    max_sil_score = 0
    best_k = START_K
    for k in range(START_K, min(END_K, len(embeddings))):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init="auto")
        labels = kmeans.fit_predict(embeddings)

        sil_score = silhouette_score(embeddings, labels)
        if sil_score > max_sil_score:
            max_sil_score = sil_score
            best_k = k

    # refit with the winning k to get the final labels
    kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(embeddings)

    return best_k, labels


def get_embeddings(names):
    """Embed the given texts with Jina v3 after normalizing and dropping filler words."""
    model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True).to("cuda")

    normalizer = Normalizer()
    names = [normalizer.normalize(name) for name in names]

    # frequent filler words that are stripped before embedding
    adjs = ["توهین", "انتقاد", "نقد", "حمایت", "مسائل", "مربوط", "تهدید", "عملکرد", "رفتار", "به", "از", "در"]

    names_new = []
    for name in names:
        for adj in adjs:
            name = name.replace(adj, "")
        names_new.append(name)

    embeddings = []
    for batch in tqdm(range(0, len(names_new), 50)):
        embeddings += model.encode(names_new[batch:batch+50], task="separation").tolist()

    return embeddings


def get_cluster_names(clusters):
    """Ask Gemma to name each cluster from a sample of its members."""
    headers = {"Content-Type": "application/json"}

    prompt = """
You are a helpful assistant that generates names for clusters of trends in Persian.
I will give you a list of trends and you will generate a name for this cluster.
There might be some different topics in the list, so just consider the dominant topic.
Just give me the final answer in Persian.
"""

    cluster_names = []
    for data in clusters:
        cluster_samples = random.sample(data, min(20, len(data)))

        messages = [{"role": "system", "content": prompt}, {"role": "user", "content": str(cluster_samples)}]

        payload = {
            "model": "google/gemma-3-27b-it",
            "messages": messages,
            "max_tokens": 8000
        }

        response = requests.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload)
        our_response = response.json()['choices'][0]['message']['content']
        cluster_names.append(our_response)

    return cluster_names


def modify_cluster_names(cluster_names):
    """Ask GPT o3 to merge the cluster names into about 20-30 distinct, higher-level topics."""
    PROXY_URL = "http://2zajDvJvJg:e0BtBiynhF@192.168.130.40:51371/"
    http_client = httpx.Client(proxy=PROXY_URL)
    client = OpenAI(api_key="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", http_client=http_client)

    prompt = """
You are a topic modification expert.

I will give you a list of topics.

## TASK
Extract meaningful and distinct topics from the list. You can change the name of topics. Just about 20-30 topics that cover all of them.

## RULES
- You can combine or split or ... for doing this task.
- You can change the name of topics to make it more general or more specific.
- The final topics must be distinct and have a specific meaning compared to the others.
- Don't combine topics that are not related to each other, like economical with political with social with ...
- Combine topics that are related to each other, like Gaza with Palestine or ...

## MUST
- All categories must be distinct and have a specific meaning separate from the other categories.
- Two categories cannot be similar to each other.

I will trust your intelligence.
Write the final answer in Persian.
"""

    response = client.chat.completions.create(
        model="o3",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": str(cluster_names)}
        ]
    )
    out = response.choices[0].message.content

    return out


def main(input_file, output_file):
    # read input file
    df = pd.read_excel(input_file)
    topics = df["topic_recreation"].tolist()

    # get embeddings
    embeddings = get_embeddings(topics)

    # get best k and labels of kmeans with best_k
    best_k, labels = get_best_k(embeddings)

    # fill clusters
    clusters = []
    for i in range(best_k):
        clusters.append([])

    for i in range(len(clusters)):
        for topic, label in zip(topics, labels):
            if label == i:
                clusters[i].append(topic)

    # get cluster names
    cluster_names = get_cluster_names(clusters)

    # get embeddings for cluster names
    cluster_names_embeddings = get_embeddings(cluster_names)

    # get best k and labels of kmeans with best_k
    best_k_cluster_names, labels_cluster_names = get_best_k(cluster_names_embeddings)

    # fill clusters of cluster_names
    clusters_cluster_names = []
    for i in range(best_k_cluster_names):
        clusters_cluster_names.append([])

    for i in range(len(clusters_cluster_names)):
        for cluster_name, label in zip(cluster_names, labels_cluster_names):
            if label == i:
                clusters_cluster_names[i].append(cluster_name)

    # get cluster names for clusters of cluster_names
    cluster_names_modify = modify_cluster_names(clusters_cluster_names)

    # save cluster names; modify_cluster_names returns a single text block, so write it line by line
    with open(output_file, "w") as f:
        lines = cluster_names_modify.splitlines()
        for count, cluster_name in enumerate(lines):
            if count == len(lines) - 1:
                f.write(cluster_name)
            else:
                f.write(cluster_name + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True)
    parser.add_argument("--output_file", type=str, required=True)
    args = parser.parse_args()

    # apply topic_recreation
    topic_recreation = TopicRecreation()
    topic_file = args.output_file.replace(".xlsx", "_topic_recreation.xlsx")
    topic_recreation.start_process(args.input_file, topic_file)

    # extracting topics
    titles_file = args.output_file.replace(".xlsx", "_titles.txt")
    main(topic_file, titles_file)

    # apply clustering
    post_cluster = PostClusterLLM()
    post_cluster.start_process(topic_file, args.output_file)
post_cluster.py (new file, 203 lines)
import asyncio
import aiohttp
import time
import re
import pandas as pd
import json
from tqdm import tqdm


class PostClusterLLM:
    def __init__(self):

        self.instruction = f"""
You will be given a title and a list of all cluster names.
Your task is to find the best fit cluster name for the title.
Go through the list of all cluster names and find the best fit cluster name for the title.
If you found a good fit, return the cluster name.
If you didn't find a good fit, return "outlier" is "yes".

#IMPORTANT:
- if you found a good fit use its id : {{"cluster" : "id_i"}}
- if the title is not related to any of the cluster names, return "outlier" is "yes" : {{"outlier" : "yes"}}

Example-1:
- Input:
    - title: "کتاب و درس"
    - all_cluster_names: {{
        "1" : "کتابخوانی",
        "2" : "فوتبال جام جهانی",
        "3" : "ساختمان سازی شهری" }}
- Output:
    - {{"cluster" : "1"}}

Example-2:
- Input:
    - title: "لپتاب و کامپیوتر"
    - all_cluster_names: {{
        "1" : "کتابخوانی",
        "2" : "فوتبال جام جهانی",
        "3" : "ساختمان سازی شهری" }}
- Output:
    - {{"outlier" : "yes"}}

Example-3:
- Input:
    - title: "ساختمان"
    - all_cluster_names: {{
        "1" : "کتابخوانی",
        "2" : "فوتبال جام جهانی",
        "3" : "ساختمان سازی شهری" }}
- Output:
    - {{"cluster" : "3"}}

write a small reason and give the final answer.
"""

    async def run_llm(self, session, title, cluster_names):
        """
        Run the LLM to pick the best cluster for one title.

        Args:
            session: The aiohttp session to use for the request.
            title: The title to assign to a cluster.
            cluster_names: The numbered list of candidate cluster names.

        Returns:
            The parsed {"cluster": ...} or {"outlier": "yes"} JSON, or 0 on error.
        """
        headers = {"Content-Type": "application/json"}

        input_message = f"""{{"all_cluster_names": "{cluster_names}", "title": "{title}"}}"""
        messages = [{"role": "system", "content": self.instruction}, {"role": "user", "content": input_message}]

        payload = {
            "model": "google/gemma-3-27b-it",
            "messages": messages,
            "max_tokens": 500
        }
        try:
            async with session.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload) as resp:
                resp.raise_for_status()
                response = await resp.json()

            out = response['choices'][0]['message']['content']
            print("--------------------------------")
            print(f"title: {title}")
            print(out)

            # the model answers in free text; pull out the small JSON object it was asked to emit
            pattern = r'(\{"cluster".*?\})'
            matches = re.findall(pattern, out)
            for m in matches:
                out_json = json.loads(m)
                print(f"out_json: {out_json}")
                return out_json

            pattern = r'(\{"outlier".*?\})'
            matches = re.findall(pattern, out)
            for m in matches:
                out_json = json.loads(m)
                print(f"out_json: {out_json}")
                return out_json

        except Exception as e:
            print(f"Error in llm as reranker: {e}")
            return 0

    async def run_llm_async(self, titles, cluster_names):
        """
        Send all title requests concurrently.

        Args:
            titles: The titles to assign to clusters.
            cluster_names: The numbered list of candidate cluster names.

        Returns:
            The per-title results.
        """
        async with aiohttp.ClientSession() as session:
            tasks = [self.run_llm(session, title, cluster_names) for title in titles]
            scores_embed = await asyncio.gather(*tasks)
            return scores_embed

    def sanitize_for_excel(self, df):
        def _sanitize_for_excel(text):
            """Remove zero-width and bidi control characters that can confuse Excel rendering."""
            if text is None:
                return ""
            s = str(text)
            # Characters to remove: ZWNJ, ZWJ, RLM, LRM, RLE, LRE, PDF, BOM, Tatweel
            remove_chars = [
                "\u200c",  # ZWNJ
                "\u200d",  # ZWJ
                "\u200e",  # LRM
                "\u200f",  # RLM
                "\u202a",  # LRE
                "\u202b",  # RLE
                "\u202c",  # PDF
                "\u202d",  # LRO
                "\u202e",  # RLO
                "\ufeff",  # BOM
                "\u0640",  # Tatweel
            ]
            for ch in remove_chars:
                s = s.replace(ch, "")
            # Normalize whitespace
            s = re.sub(r"\s+", " ", s).strip()
            return s

        df_copy = df.copy()
        for m in df.columns:
            for i in range(len(df_copy[m])):
                df_copy.loc[i, m] = _sanitize_for_excel(df_copy.loc[i, m])

        return df_copy

    def start_process(self, input_path, output_path):
        df = pd.read_excel(input_path)
        df_copy = df.copy()

        # final cluster titles produced by the o3 merge step
        with open("titles_o3.txt", "r") as f:
            titles = f.readlines()

        titles = [title.strip() for title in titles]

        cluster_names_dict = {}
        count = 1
        for item in titles:
            cluster_names_dict[str(count)] = item
            count += 1

        # render the numbered cluster list as text for the prompt
        cluster_names = "{\n"
        for key, value in cluster_names_dict.items():
            cluster_names += f"{key} : {value},\n"

        cluster_names += "}"

        batch_size = 100
        for i in tqdm(range(0, len(df["topic"]), batch_size)):
            start_time = time.time()
            result_list = asyncio.run(self.run_llm_async(df["topic"][i:i+batch_size], cluster_names))
            end_time = time.time()
            print(f"Time taken for llm as reranker: {end_time - start_time} seconds")
            time.sleep(5)

            for j, result in enumerate(result_list):
                try:
                    if result.get("outlier") == "yes":
                        df_copy.at[i+j, "cluster_llm"] = "متفرقه"
                    elif result.get("cluster") is not None:
                        df_copy.at[i+j, "cluster_llm"] = cluster_names_dict[result["cluster"]]
                    else:
                        df_copy.at[i+j, "cluster_llm"] = df_copy.at[i+j, "category"]

                except Exception as e:
                    print(f"Error in result_list: {e}")
                    df_copy.at[i+j, "cluster_llm"] = df_copy.at[i+j, "category"]

        df_copy = self.sanitize_for_excel(df_copy)
        df_copy.to_excel(output_path)


if __name__ == "__main__":
    llm = PostClusterLLM()
    llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic_recreation.xlsx", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3.xlsx")
sub_clustering_pipeline.py (new file, 285 lines)
import argparse
import pandas as pd
from transformers import AutoModel
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from hazm import Normalizer
from tqdm import tqdm
import requests
from openai import OpenAI
import httpx
import random
import re
import json


# search range for the number of K-Means sub-clusters
START_K = 2
END_K = 60


def sanitize_for_excel(text):
    """Remove zero-width and bidi control characters that can confuse Excel rendering."""
    if text is None:
        return ""
    s = str(text)
    # Characters to remove: ZWNJ, ZWJ, RLM, LRM, RLE, LRE, PDF, BOM, Tatweel
    remove_chars = [
        "\u200c",  # ZWNJ
        "\u200d",  # ZWJ
        "\u200e",  # LRM
        "\u200f",  # RLM
        "\u202a",  # LRE
        "\u202b",  # RLE
        "\u202c",  # PDF
        "\u202d",  # LRO
        "\u202e",  # RLO
        "\ufeff",  # BOM
        "\u0640",  # Tatweel
    ]
    for ch in remove_chars:
        s = s.replace(ch, "")
    # Normalize whitespace
    s = re.sub(r"\s+", " ", s).strip()
    return s


def get_best_k(embeddings):
    """Pick the k in [START_K, END_K) with the highest silhouette score."""
    max_sil_score = 0
    best_k = START_K
    for k in range(START_K, min(END_K, len(embeddings))):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(embeddings)

        sil_score = silhouette_score(embeddings, labels)
        if sil_score > max_sil_score:
            max_sil_score = sil_score
            best_k = k

    kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(embeddings)

    return best_k, labels


def get_embeddings(names):
    """Embed the given texts with Jina v3 after normalizing and dropping filler words."""
    model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True).to("cuda")

    normalizer = Normalizer()
    names = [normalizer.normalize(name) for name in names]

    adjs = ["توهین", "انتقاد", "نقد", "حمایت", "مسائل", "مربوط", "تهدید", "عملکرد", "رفتار", "به", "از", "در"]

    names_new = []
    for name in names:
        for adj in adjs:
            name = name.replace(adj, "")
        names_new.append(name)

    embeddings = []
    for batch in tqdm(range(0, len(names_new), 50)):
        embeddings += model.encode(names_new[batch:batch+50], task="separation").tolist()

    return embeddings


def get_cluster_names(clusters):
    """Ask Gemma to name each sub-cluster from a sample of its members."""
    headers = {"Content-Type": "application/json"}

    prompt = """
You are a helpful assistant that generates names for clusters of topics in Persian.
I will give you a list of topics and you will generate a name for this cluster.
There might be some different topics in the list, so just consider the dominant topic.
Be specific about the cluster name.
Just give me the final answer in Persian.
"""

    cluster_names = []
    for data in clusters:

        # skip very small clusters
        if len(data) < 10:
            continue

        cluster_samples = random.sample(data, min(20, len(data)))

        messages = [{"role": "system", "content": prompt}, {"role": "user", "content": str(cluster_samples)}]

        payload = {
            "model": "google/gemma-3-27b-it",
            "messages": messages,
            "max_tokens": 8000
        }

        response = requests.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload)
        our_response = response.json()['choices'][0]['message']['content']
        cluster_names.append(our_response)

    return cluster_names


def modify_cluster_names(cluster_names, title, best_k):
    """Ask GPT o3 to merge the sub-cluster names of one main cluster into distinct sub-categories."""
    PROXY_URL = "http://2zajDvJvJg:e0BtBiynhF@192.168.130.40:51371/"
    http_client = httpx.Client(proxy=PROXY_URL)
    client = OpenAI(api_key="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", http_client=http_client)

    # ask for roughly half as many sub-categories, rounded down to a multiple of 10
    start = (best_k // 2) - ((best_k // 2) % 10)
    if start == 0:
        start = 1

    prompt = f"""
You are a sub category modification expert.

I will give you a list of topics.

all these topics belong to the {title} category

## TASK
Extract meaningful and distinct sub categories from the list. You can change the name of topics. Just about {start}-{start+10} topics that cover all of them.

## RULES
- You can combine or split or ... for doing this task.
- You can change the name of topics to make it more general or more specific.
- The final topics must be distinct and have a specific meaning compared to the others.
- Don't combine topics that are not related to each other, like economical with political with social with ...
- Combine topics that are related to each other, like Gaza with Palestine or ...

## MUST
- All sub categories must be distinct and have a specific meaning separate from the other categories.
- Two categories cannot be similar to each other.
- Be specific about sub categories.

I will trust your intelligence.
Write the final answer in Persian.
"""

    response = client.chat.completions.create(
        model="o3",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": str(cluster_names)}
        ]
    )
    out = response.choices[0].message.content

    return out


def extract_list(text, count):
    """Ask Gemma to turn the free-text o3 answer into a plain JSON list of titles."""
    headers = {"Content-Type": "application/json"}

    prompt = """
Extract the titles from this text and put them in a list.
Just return the output in list format, do not include any other text : ["title_1", "title_2", ...]
"""

    messages = [{"role": "system", "content": prompt}, {"role": "user", "content": text}]

    payload = {
        "model": "google/gemma-3-27b-it",
        "messages": messages,
        "max_tokens": 8000
    }

    response = requests.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload)
    out = response.json()['choices'][0]['message']['content']
    try:
        out = json.loads(out)
    except Exception:
        print(f"error in extract list {count}")
    return out


def main(input_file, output_file):
    # read input file
    df = pd.read_excel(input_file)
    topics = df["topic"].tolist()
    cluster_llms = df["cluster_llm"].tolist()

    # get embeddings
    embeddings = get_embeddings(topics)

    # extract main cluster names
    cluster_names = []
    with open("titles_o3.txt", "r") as f:
        titles = f.readlines()

    titles = [sanitize_for_excel(title.strip()) for title in titles]

    embedding_cluster = []
    best_k = len(titles)
    for i in range(best_k):
        embedding_cluster.append([])

    topic_cluster = []
    best_k = len(titles)
    for i in range(best_k):
        topic_cluster.append([])

    # group embeddings and topics by the main cluster they were assigned to
    for m in range(len(titles)):
        for embedding, cluster_name, topic in zip(embeddings, cluster_llms, topics):
            if cluster_name == titles[m]:
                embedding_cluster[m].append(embedding)
                topic_cluster[m].append(topic)

    sub_cluster_names = []
    for cluster_count in tqdm(range(len(titles))):
        print(f"start {cluster_count} \n")
        # get best k and labels of kmeans with best_k
        best_k, labels = get_best_k(embedding_cluster[cluster_count])
        print(f"initial best_k {best_k}\n")

        # fill clusters
        clusters = []
        for i in range(best_k):
            clusters.append([])

        for i in range(len(clusters)):
            for topic, label in zip(topic_cluster[cluster_count], labels):
                if label == i:
                    clusters[i].append(topic)

        # get cluster names
        cluster_names = get_cluster_names(clusters)

        if len(cluster_names) > 1:
            # get embeddings for cluster names
            cluster_names_embeddings = get_embeddings(cluster_names)

            # get best k and labels of kmeans with best_k
            best_k_cluster_names, labels_cluster_names = get_best_k(cluster_names_embeddings)
            print(f"second best_k {best_k_cluster_names}\n")

            # fill clusters of cluster_names
            clusters_cluster_names = []
            for i in range(best_k_cluster_names):
                clusters_cluster_names.append([])

            for i in range(len(clusters_cluster_names)):
                for cluster_name, label in zip(cluster_names, labels_cluster_names):
                    if label == i:
                        clusters_cluster_names[i].append(cluster_name)

            # get cluster names for clusters of cluster_names
            cluster_names_modify = modify_cluster_names(clusters_cluster_names, titles[cluster_count], best_k)
            cluster_names_modify_list = extract_list(cluster_names_modify, cluster_count)
            sub_cluster_names.append({"id": cluster_count, "cluster_name": titles[cluster_count], "sub_cluster_names": cluster_names_modify_list})

        else:
            sub_cluster_names.append({"id": cluster_count, "cluster_name": titles[cluster_count], "sub_cluster_names": []})

    # save cluster names
    if not output_file.endswith(".json"):
        output_file = output_file + ".json"

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(sub_cluster_names, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True)
    parser.add_argument("--output_file", type=str, required=True)
    args = parser.parse_args()

    # extracting topics
    main(args.input_file, args.output_file)
test_saeed_tweet_2.ipynb (new file, 14724 lines; diff suppressed because one or more lines are too long)
topic_recreation.py (new file, 142 lines)
import asyncio
import aiohttp
import time
import re
import pandas as pd
import json
from tqdm import tqdm


class TopicRecreation:
    def __init__(self):

        self.instruction = f"""
You will be given a tweet text.
Your task is to write a phrase category for this tweet, which the tweet is related to.
This should be a combination of action + category:

for example :
انتقاد از سیاست ایران
توهین به مقامات کشور
حمایت از نظام جمهوری اسلامی
جنگ اسراییل و قطر
مسایل مربوط به موضوع هسته ای ایران
مسایل مربوط به افغانستان

The category should be in Persian.

# Rules
- If it does not have a specific meaning then write "متفرقه"
- Be specific about the countries.
- Do not be specific about the people.
- You can consider different categories and write an action + category or just a simple category.

Just return the category, do not include any other text.
"""

    async def run_llm(self, session, tweet):
        """
        Ask the LLM for a category for one tweet.

        Args:
            session: The aiohttp session to use for the request.
            tweet: The tweet to categorize.

        Returns:
            The category of the tweet.
        """
        headers = {"Content-Type": "application/json"}

        # drop @mentions before sending the tweet to the model
        tweet = " ".join([m for m in tweet.split(" ") if "@" not in m])

        input_message = f"""{{"tweet": "{tweet}"}}"""
        messages = [{"role": "system", "content": self.instruction}, {"role": "user", "content": input_message}]

        payload = {
            "model": "google/gemma-3-27b-it",
            "messages": messages,
            "max_tokens": 500
        }
        # try:
        async with session.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload) as resp:
            resp.raise_for_status()
            response = await resp.json()

            out = response['choices'][0]['message']['content']

            return out

        # except Exception as e:
        #     print(f"Error in llm as reranker: {e}")
        #     return 0

    async def run_llm_async(self, tweets):
        """
        Send all tweet requests concurrently.

        Args:
            tweets: The tweets to categorize.

        Returns:
            The categories of the tweets.
        """
        async with aiohttp.ClientSession() as session:
            tasks = [self.run_llm(session, tweet) for tweet in tweets]
            scores_embed = await asyncio.gather(*tasks)
            return scores_embed

    def sanitize_for_excel(self, df):
        def _sanitize_for_excel(text):
            """Remove zero-width and bidi control characters that can confuse Excel rendering."""
            if text is None:
                return ""
            s = str(text)
            # Characters to remove: ZWNJ, ZWJ, RLM, LRM, RLE, LRE, PDF, BOM, Tatweel
            remove_chars = [
                "\u200c",  # ZWNJ
                "\u200d",  # ZWJ
                "\u200e",  # LRM
                "\u200f",  # RLM
                "\u202a",  # LRE
                "\u202b",  # RLE
                "\u202c",  # PDF
                "\u202d",  # LRO
                "\u202e",  # RLO
                "\ufeff",  # BOM
                "\u0640",  # Tatweel
            ]
            for ch in remove_chars:
                s = s.replace(ch, "")
            # Normalize whitespace
            s = re.sub(r"\s+", " ", s).strip()
            return s

        df_copy = df.copy()
        for m in ["category"]:
            for i in range(len(df_copy[m])):
                df_copy.loc[i, m] = _sanitize_for_excel(df_copy.loc[i, m])

        return df_copy

    def start_process(self, input_path, output_path):
        df = pd.read_excel(input_path)
        df_copy = df.copy()

        tweets = df["tweet"].tolist()

        for i in tqdm(range(0, len(tweets), 1000)):
            start_time = time.time()
            result_list = asyncio.run(self.run_llm_async(tweets[i:i+1000]))
            end_time = time.time()
            print(f"Time taken for llm as reranker: {end_time - start_time} seconds")

            time.sleep(5)

            for j, result in enumerate(result_list):
                df_copy.at[i+j, "category"] = result

        df_copy = self.sanitize_for_excel(df_copy)
        df_copy.to_excel(output_path)


if __name__ == "__main__":
    llm = TopicRecreation()
    llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic.xlsx", "/home/firouzi/trend_grouping_new/tweet_topic_recreation.xlsx")