add codes

This commit is contained in:
SFirouzi 2025-10-21 11:14:59 +03:30
commit 78656d9f4d
6 changed files with 15593 additions and 0 deletions

40
README.md Normal file

@@ -0,0 +1,40 @@
# TEXT CLUSTERING
A pipeline for clustering tweets.
## Overall Pipeline for Cluster Extraction
1. Convert tweet text to categories using the Gemma model:
   takes about 7 hours for 40,000 tweets.
2. Convert categories to embedding vectors using Jina:
   takes about 3 minutes.
3. Perform clustering with K-Means:
   choose the number of clusters with the highest silhouette score among 20-60 groups (sketched below);
   takes about 5 minutes.
4. Name the clusters using the Gemma model:
   takes about 1 minute.
5. Cluster the generated names using K-Means and group similar names together:
   takes about 1 minute.
6. Use GPT o3 to merge and refine cluster names:
   provided GPT with the list of cluster names and asked it to build new, higher-level clusters;
   takes about 1 minute.
7. Assign each topic to its final cluster using the Gemma model:
   takes about 7 hours.

Reason for step 5:
If I had directly given the list of names to step 6, GPT wouldn't have performed well.
By first clustering similar names (step 5), the input to GPT became more organized,
which made step 6 much more effective.
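
For illustration, a minimal sketch of the silhouette-based K selection from step 3, assuming a recent scikit-learn; `pick_k` is a hypothetical helper name, and the full version used by the pipeline is `get_best_k` in `clustering_pipeline.py`:

```python
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def pick_k(embeddings, k_min=20, k_max=60):
    """Try every K in [k_min, k_max) and keep the one with the highest silhouette score."""
    best_k, best_score = k_min, -1.0
    for k in range(k_min, min(k_max, len(embeddings))):
        labels = KMeans(n_clusters=k, random_state=42, n_init="auto").fit_predict(embeddings)
        score = silhouette_score(embeddings, labels)
        if score > best_score:
            best_k, best_score = k, score
    return best_k
```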
## How to use
Provide an Excel file with a column named "tweet" (the column `topic_recreation.py` reads) to the command below.
Overall, it takes about 15 hours for 40,000 tweets.
python3 clustering_pipeline.py --input_file tweets_file.xlsx --output_file tweets_file_output.xlsx
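
A minimal sketch of preparing a valid input file, assuming pandas (with an Excel writer such as openpyxl) is installed; `my_tweets` is a placeholder list:

```python
import pandas as pd

# Placeholder list of tweet texts; replace with your own data.
my_tweets = ["first tweet text", "second tweet text"]

# The pipeline reads the column named "tweet" from the input Excel file.
pd.DataFrame({"tweet": my_tweets}).to_excel("tweets_file.xlsx", index=False)
```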

199
clustering_pipeline.py Normal file

@@ -0,0 +1,199 @@
import argparse
import pandas as pd
from transformers import AutoModel
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from hazm import Normalizer
from tqdm import tqdm
import requests
from openai import OpenAI
import httpx
import random
from post_cluster import PostClusterLLM
from topic_recreation import TopicRecreation
START_K = 20
END_K = 60
def get_best_k(embeddings):
    """Sweep K over [START_K, END_K) and keep the K with the highest silhouette score."""
    max_sil_score = 0
    best_k = START_K
    for k in range(START_K, min(END_K, len(embeddings))):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init="auto")
        labels = kmeans.fit_predict(embeddings)
        sil_score = silhouette_score(embeddings, labels)
        if sil_score > max_sil_score:
            max_sil_score = sil_score
            best_k = k
    # Refit K-Means with the best K and return its labels.
    kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(embeddings)
    return best_k, labels
def get_embeddings(names):
    """Normalize the texts, strip common action/stop words, and embed them with Jina v3."""
    model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True).to("cuda")
    normalizer = Normalizer()
    names = [normalizer.normalize(name) for name in names]
    adjs = ["توهین", "انتقاد", "نقد", "حمایت", "مسائل", "مربوط", "تهدید", "عملکرد", "رفتار", "به", "از", "در"]
    names_new = []
    for name in names:
        for adj in adjs:
            name = name.replace(adj, "")
        names_new.append(name)
    embeddings = []
    # Encode in batches of 50 to keep GPU memory bounded.
    for batch in tqdm(range(0, len(names_new), 50)):
        embeddings += model.encode(names_new[batch:batch+50], task="separation").tolist()
    return embeddings
def get_cluster_names(clusters):
    headers = {"Content-Type": "application/json"}
    prompt = """
    You are a helpful assistant that generates names for clusters of trends in persian.
    I will give you a list of trends and you will generate a name for this cluster.
    There might be some different topics in the list so you just consider the dominant topic.
    Just give me the final answer in persian.
    """
    cluster_names = []
    for data in clusters:
        # Sample up to 20 items per cluster so the prompt stays short.
        cluster_samples = random.sample(data, min(20, len(data)))
        messages = [{"role": "system", "content": prompt}, {"role": "user", "content": str(cluster_samples)}]
        payload = {
            "model": "google/gemma-3-27b-it",
            "messages": messages,
            "max_tokens": 8000
        }
        response = requests.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload)
        our_response = response.json()['choices'][0]['message']['content']
        cluster_names.append(our_response)
    return cluster_names
def modify_cluster_names(cluster_names):
    PROXY_URL = "http://2zajDvJvJg:e0BtBiynhF@192.168.130.40:51371/"
    http_client = httpx.Client(proxy=PROXY_URL)
    client = OpenAI(api_key="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", http_client=http_client)
    prompt = """
    You are a topic modification expert.
    I will give you a list of topics.
    ## TASK
    Extract meaningful and distinct topics from the list. You can change the name of topics. Just about 20-30 topics that cover all of them.
    ## RULES
    - You can combine or split or ... for doing this task.
    - You can change the name of topics to make it more general or more specific.
    - the final topics must be distinct and have specific meaning rather than others.
    - don't combine topics that are not related to each other. like economical with political with social with ...
    - combine topics that are related to each other. like ghaza with palestine or ...
    ## MUST
    - all categories must be distinct and have specific meaning from other categories.
    - two categories can not be similar to each other.
    I will trust your intelligence.
    write the final answer in persian.
    """
    response = client.chat.completions.create(
        model="o3",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": str(cluster_names)}
        ]
    )
    out = response.choices[0].message.content
    return out
def main(input_file, output_file):
    # read input file
    df = pd.read_excel(input_file)
    topics = df["topic_recreation"].tolist()
    # get embeddings
    embeddings = get_embeddings(topics)
    # get best k and labels of kmeans with best_k
    best_k, labels = get_best_k(embeddings)
    # fill clusters
    clusters = []
    for i in range(best_k):
        clusters.append([])
    for i in range(len(clusters)):
        for topic, label in zip(topics, labels):
            if label == i:
                clusters[i].append(topic)
    # get cluster names
    cluster_names = get_cluster_names(clusters)
    # get embeddings for cluster names
    cluster_names_embeddings = get_embeddings(cluster_names)
    # get best k and labels of kmeans with best_k
    best_k_cluster_names, labels_cluster_names = get_best_k(cluster_names_embeddings)
    # fill clusters of cluster_names
    clusters_cluster_names = []
    for i in range(best_k_cluster_names):
        clusters_cluster_names.append([])
    for i in range(len(clusters_cluster_names)):
        for cluster_name, label in zip(cluster_names, labels_cluster_names):
            if label == i:
                clusters_cluster_names[i].append(cluster_name)
    # get cluster names for clusters of cluster_names
    cluster_names_modify = modify_cluster_names(clusters_cluster_names)
    # save cluster names; modify_cluster_names returns a single text block from the LLM
    with open(output_file, "w") as f:
        f.write(cluster_names_modify)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True)
    parser.add_argument("--output_file", type=str, required=True)
    args = parser.parse_args()
    # apply topic_recreation
    topic_recreation = TopicRecreation()
    topic_file = args.output_file.replace(".xlsx", "_topic_recreation.xlsx")
    topic_recreation.start_process(args.input_file, topic_file)
    # extracting topics
    titles_file = args.output_file.replace(".xlsx", "_titles.txt")
    main(topic_file, titles_file)
    # apply clustering
    post_cluster = PostClusterLLM()
    post_cluster.start_process(topic_file, args.output_file)

203
post_cluster.py Normal file

@@ -0,0 +1,203 @@
import asyncio
import aiohttp
import time
import re
import pandas as pd
import json
from tqdm import tqdm
class PostClusterLLM:
    def __init__(self):
        self.instruction = f"""
        You will be given a title and a list of all cluster names.
        Your task is to find the best fit cluster name for the title.
        Go through the list of all cluster names and find the best fit cluster name for the title.
        If you found a good fit, return the cluster name.
        If you didn't find a good fit, return "outlier" is "yes".
        #IMPORTANT:
        - if you found a good fit use its id : {{"cluster" : "id_i"}}
        - if the title is not related to any of the cluster names, return "outlier" is "yes" : {{"outlier" : "yes"}}
        Example-1:
        - Input:
            - title: "کتاب و درس"
            - all_cluster_names: {{
                "1" : "کتابخوانی",
                "2" : "فوتبال جام جهانی",
                "3" : "ساختمان سازی شهری" }}
        - Output:
            - {{"cluster" : "1"}}
        Example-2:
        - Input:
            - title: "لپتاب و کامپیوتر"
            - all_cluster_names: {{
                "1" : "کتابخوانی",
                "2" : "فوتبال جام جهانی",
                "3" : "ساختمان سازی شهری" }}
        - Output:
            - {{"outlier" : "yes"}}
        Example-3:
        - Input:
            - title: "ساختمان"
            - all_cluster_names: {{
                "1" : "کتابخوانی",
                "2" : "فوتبال جام جهانی",
                "3" : "ساختمان سازی شهری" }}
        - Output:
            - {{"cluster" : "3"}}
        write a small reason and give the final answer.
        """
    async def run_llm(self, session, title, cluster_names):
        """
        Ask the LLM to assign a single title to one of the clusters.
        Args:
            session: The aiohttp session to use for the request.
            title: The title to assign.
            cluster_names: The formatted list of all cluster names.
        Returns:
            The parsed JSON answer ({"cluster": ...} or {"outlier": "yes"}), or 0 on error.
        """
        headers = {"Content-Type": "application/json"}
        input_message = f"""{{"all_cluster_names": "{cluster_names}", "title": "{title}"}}"""
        messages = [{"role": "system", "content": self.instruction}, {"role": "user", "content": input_message}]
        payload = {
            "model": "google/gemma-3-27b-it",
            "messages": messages,
            "max_tokens": 500
        }
        try:
            async with session.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload) as resp:
                resp.raise_for_status()
                response = await resp.json()
                out = response['choices'][0]['message']['content']
                print("--------------------------------")
                print(f"title: {title}")
                print(out)
                # The model answers in free text; extract the small JSON object from it.
                pattern = r'(\{"cluster".*?\})'
                matches = re.findall(pattern, out)
                for m in matches:
                    out_json = json.loads(m)
                    print(f"out_json: {out_json}")
                    return out_json
                pattern = r'(\{"outlier".*?\})'
                matches = re.findall(pattern, out)
                for m in matches:
                    out_json = json.loads(m)
                    print(f"out_json: {out_json}")
                    return out_json
        except Exception as e:
            print(f"Error in llm as reranker: {e}")
            return 0
    async def run_llm_async(self, titles, cluster_names):
        """
        Send all title requests concurrently.
        Args:
            titles: The titles to assign.
            cluster_names: The formatted list of all cluster names.
        Returns:
            The per-title answers.
        """
        async with aiohttp.ClientSession() as session:
            tasks = [self.run_llm(session, title, cluster_names) for title in titles]
            scores_embed = await asyncio.gather(*tasks)
            return scores_embed
    def sanitize_for_excel(self, df):
        def _sanitize_for_excel(text):
            """Remove zero-width and bidi control characters that can confuse Excel rendering."""
            if text is None:
                return ""
            s = str(text)
            # Characters to remove: ZWNJ, ZWJ, RLM, LRM, RLE, LRE, PDF, BOM, Tatweel
            remove_chars = [
                "\u200c",  # ZWNJ
                "\u200d",  # ZWJ
                "\u200e",  # LRM
                "\u200f",  # RLM
                "\u202a",  # LRE
                "\u202b",  # RLE
                "\u202c",  # PDF
                "\u202d",  # LRO
                "\u202e",  # RLO
                "\ufeff",  # BOM
                "\u0640",  # Tatweel
            ]
            for ch in remove_chars:
                s = s.replace(ch, "")
            # Normalize whitespace
            s = re.sub(r"\s+", " ", s).strip()
            return s
        df_copy = df.copy()
        for m in df.columns:
            for i in range(len(df_copy[m])):
                df_copy.loc[i, m] = _sanitize_for_excel(df_copy.loc[i, m])
        return df_copy
    def start_process(self, input_path, output_path):
        df = pd.read_excel(input_path)
        df_copy = df.copy()
        # Load the final cluster titles (one per line) produced by the o3 step.
        with open("titles_o3.txt", "r") as f:
            titles = f.readlines()
        titles = [title.strip() for title in titles]
        cluster_names_dict = {}
        count = 1
        for item in titles:
            cluster_names_dict[str(count)] = item
            count += 1
        cluster_names = "{\n"
        for key, value in cluster_names_dict.items():
            cluster_names += f"{key} : {value},\n"
        cluster_names += "}"
        batch_size = 100
        for i in tqdm(range(0, len(df["topic"]), batch_size)):
            start_time = time.time()
            result_list = asyncio.run(self.run_llm_async(df["topic"][i:i+batch_size], cluster_names))
            end_time = time.time()
            print(f"Time taken for llm as reranker: {end_time - start_time} seconds")
            time.sleep(5)
            for j, result in enumerate(result_list):
                try:
                    if result.get("outlier") == "yes":
                        df_copy.at[i+j, "cluster_llm"] = "متفرقه"
                    elif result.get("cluster") is not None:
                        df_copy.at[i+j, "cluster_llm"] = cluster_names_dict[result["cluster"]]
                    else:
                        df_copy.at[i+j, "cluster_llm"] = df_copy.at[i+j, "category"]
                except Exception as e:
                    print(f"Error in result_list: {e}")
                    df_copy.at[i+j, "cluster_llm"] = df_copy.at[i+j, "category"]
        df_copy = self.sanitize_for_excel(df_copy)
        df_copy.to_excel(output_path)
if __name__ == "__main__":
    llm = PostClusterLLM()
    llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic_recreation.xlsx", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3.xlsx")

285
sub_clustering_pipeline.py Normal file

@@ -0,0 +1,285 @@
import argparse
import pandas as pd
from transformers import AutoModel
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from hazm import Normalizer
from tqdm import tqdm
import requests
from openai import OpenAI
import httpx
import random
import re
import json
START_K = 2
END_K = 60
def sanitize_for_excel(text):
    """Remove zero-width and bidi control characters that can confuse Excel rendering."""
    if text is None:
        return ""
    s = str(text)
    # Characters to remove: ZWNJ, ZWJ, RLM, LRM, RLE, LRE, PDF, BOM, Tatweel
    remove_chars = [
        "\u200c",  # ZWNJ
        "\u200d",  # ZWJ
        "\u200e",  # LRM
        "\u200f",  # RLM
        "\u202a",  # LRE
        "\u202b",  # RLE
        "\u202c",  # PDF
        "\u202d",  # LRO
        "\u202e",  # RLO
        "\ufeff",  # BOM
        "\u0640",  # Tatweel
    ]
    for ch in remove_chars:
        s = s.replace(ch, "")
    # Normalize whitespace
    s = re.sub(r"\s+", " ", s).strip()
    return s
def get_best_k(embeddings):
    max_sil_score = 0
    best_k = START_K
    for k in range(START_K, min(END_K, len(embeddings))):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(embeddings)
        sil_score = silhouette_score(embeddings, labels)
        if sil_score > max_sil_score:
            max_sil_score = sil_score
            best_k = k
    kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(embeddings)
    return best_k, labels
def get_embeddings(names):
    model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True).to("cuda")
    normalizer = Normalizer()
    names = [normalizer.normalize(name) for name in names]
    adjs = ["توهین", "انتقاد", "نقد", "حمایت", "مسائل", "مربوط", "تهدید", "عملکرد", "رفتار", "به", "از", "در"]
    names_new = []
    for name in names:
        for adj in adjs:
            name = name.replace(adj, "")
        names_new.append(name)
    embeddings = []
    for batch in tqdm(range(0, len(names_new), 50)):
        embeddings += model.encode(names_new[batch:batch+50], task="separation").tolist()
    return embeddings
def get_cluster_names(clusters):
    headers = {"Content-Type": "application/json"}
    prompt = """
    You are a helpful assistant that generates names for clusters of topics in persian.
    I will give you a list of topics and you will generate a name for this cluster.
    There might be some different topics in the list so you just consider the dominant topic.
    be specific about the cluster name.
    Just give me the final answer in persian.
    """
    cluster_names = []
    for data in clusters:
        if len(data) < 10:
            continue
        cluster_samples = random.sample(data, min(20, len(data)))
        messages = [{"role": "system", "content": prompt}, {"role": "user", "content": str(cluster_samples)}]
        payload = {
            "model": "google/gemma-3-27b-it",
            "messages": messages,
            "max_tokens": 8000
        }
        response = requests.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload)
        our_response = response.json()['choices'][0]['message']['content']
        cluster_names.append(our_response)
    return cluster_names
def modify_cluster_names(cluster_names, title, best_k):
    PROXY_URL = "http://2zajDvJvJg:e0BtBiynhF@192.168.130.40:51371/"
    http_client = httpx.Client(proxy=PROXY_URL)
    client = OpenAI(api_key="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", http_client=http_client)
    # Aim for roughly half of best_k, rounded down to a multiple of 10 (at least 1).
    start = (best_k // 2) - ((best_k // 2) % 10)
    if start == 0:
        start = 1
    prompt = f"""
    You are a sub category modification expert.
    I will give you a list of topics.
    all these topics belong to the {title} category
    ## TASK
    Extract meaningful and distinct sub categories from the list. You can change the name of topics. Just about {start}-{start+10} topics that cover all of them.
    ## RULES
    - You can combine or split or ... for doing this task.
    - You can change the name of topics to make it more general or more specific.
    - the final topics must be distinct and have specific meaning rather than others.
    - don't combine topics that are not related to each other. like economical with political with social with ...
    - combine topics that are related to each other. like ghaza with palestine or ...
    ## MUST
    - all sub categories must be distinct and have specific meaning from other categories.
    - two categories can not be similar to each other.
    - be specific about sub categories
    I will trust your intelligence.
    write the final answer in persian.
    """
    response = client.chat.completions.create(
        model="o3",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": str(cluster_names)}
        ]
    )
    out = response.choices[0].message.content
    return out
def extract_list(text, count):
    headers = {"Content-Type": "application/json"}
    prompt = """
    extract the titles from this text and put it in a list.
    just return the output in list format, do not include any other text : ["title_1", "title_2", ...]
    """
    messages = [{"role": "system", "content": prompt}, {"role": "user", "content": text}]
    payload = {
        "model": "google/gemma-3-27b-it",
        "messages": messages,
        "max_tokens": 8000
    }
    response = requests.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload)
    out = response.json()['choices'][0]['message']['content']
    try:
        out = json.loads(out)
    except Exception:
        # If the model did not return valid JSON, fall back to the raw text.
        print(f"error in extract list {count}")
    return out
def main(input_file, output_file):
    # read input file
    df = pd.read_excel(input_file)
    topics = df["topic"].tolist()
    cluster_llms = df["cluster_llm"].tolist()
    # get embeddings
    embeddings = get_embeddings(topics)
    # extract main cluster names
    cluster_names = []
    with open("titles_o3.txt", "r") as f:
        titles = f.readlines()
    titles = [sanitize_for_excel(title.strip()) for title in titles]
    embedding_cluster = []
    best_k = len(titles)
    for i in range(best_k):
        embedding_cluster.append([])
    topic_cluster = []
    best_k = len(titles)
    for i in range(best_k):
        topic_cluster.append([])
    for m in range(len(titles)):
        for embedding, cluster_name, topic in zip(embeddings, cluster_llms, topics):
            if cluster_name == titles[m]:
                embedding_cluster[m].append(embedding)
                topic_cluster[m].append(topic)
    sub_cluster_names = []
    for cluster_count in tqdm(range(len(titles))):
        print(f"start {cluster_count} \n")
        # get best k and labels of kmeans with best_k
        best_k, labels = get_best_k(embedding_cluster[cluster_count])
        print(f"initial best_k {best_k}\n")
        # fill clusters
        clusters = []
        for i in range(best_k):
            clusters.append([])
        for i in range(len(clusters)):
            for topic, label in zip(topic_cluster[cluster_count], labels):
                if label == i:
                    clusters[i].append(topic)
        # get cluster names
        cluster_names = get_cluster_names(clusters)
        if len(cluster_names) > 1:
            # get embeddings for cluster names
            cluster_names_embeddings = get_embeddings(cluster_names)
            # get best k and labels of kmeans with best_k
            best_k_cluster_names, labels_cluster_names = get_best_k(cluster_names_embeddings)
            print(f"second best_k {best_k_cluster_names}\n")
            # fill clusters of cluster_names
            clusters_cluster_names = []
            for i in range(best_k_cluster_names):
                clusters_cluster_names.append([])
            for i in range(len(clusters_cluster_names)):
                for cluster_name, label in zip(cluster_names, labels_cluster_names):
                    if label == i:
                        clusters_cluster_names[i].append(cluster_name)
            # get cluster names for clusters of cluster_names
            cluster_names_modify = modify_cluster_names(clusters_cluster_names, titles[cluster_count], best_k)
            cluster_names_modify_list = extract_list(cluster_names_modify, cluster_count)
            sub_cluster_names.append({"id": cluster_count, "cluster_name": titles[cluster_count], "sub_cluster_names": cluster_names_modify_list})
        else:
            sub_cluster_names.append({"id": cluster_count, "cluster_name": titles[cluster_count], "sub_cluster_names": []})
    # save cluster names
    if not output_file.endswith(".json"):
        output_file = output_file + ".json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(sub_cluster_names, f, ensure_ascii=False, indent=2)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True)
    parser.add_argument("--output_file", type=str, required=True)
    args = parser.parse_args()
    # extracting topics
    main(args.input_file, args.output_file)

14724
test_saeed_tweet_2.ipynb Normal file

File diff suppressed because one or more lines are too long

142
topic_recreation.py Normal file

@@ -0,0 +1,142 @@
import asyncio
import aiohttp
import time
import re
import pandas as pd
import json
from tqdm import tqdm
class TopicRecreation:
    def __init__(self):
        self.instruction = f"""
        You will be given a tweet text.
        Your task is to write a phrase category that the tweet is related to.
        This should be a combination of action + category:
        for example :
        انتقاد از سیاست ایران
        توهین به مقامات کشور
        حمایت از نظام جمهوری اسلامی
        جنگ اسراییل و قطر
        مسایل مربوط به موضوع هسته ای ایران
        مسایل مربوط به افغانستان
        The category should be in persian.
        # Rules
        - If it does not have specific meaning then write "متفرقه"
        - Be specific about the countries.
        - Do not be specific about the people.
        - you can consider different categories and write an action + category or just a simple category
        Just return the category, do not include any other text.
        """
    async def run_llm(self, session, tweet):
        """
        Ask the LLM for a category for a single tweet.
        Args:
            session: The aiohttp session to use for the request.
            tweet: The tweet to categorize.
        Returns:
            The category of the tweet.
        """
        headers = {"Content-Type": "application/json"}
        # Drop @mentions before sending the tweet to the model.
        tweet = " ".join([m for m in tweet.split(" ") if "@" not in m])
        input_message = f"""{{"tweet": "{tweet}"}}"""
        messages = [{"role": "system", "content": self.instruction}, {"role": "user", "content": input_message}]
        payload = {
            "model": "google/gemma-3-27b-it",
            "messages": messages,
            "max_tokens": 500
        }
        # try:
        async with session.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload) as resp:
            resp.raise_for_status()
            response = await resp.json()
            out = response['choices'][0]['message']['content']
            return out
        # except Exception as e:
        #     print(f"Error in llm as reranker: {e}")
        #     return 0
    async def run_llm_async(self, tweets):
        """
        Send all tweet requests concurrently.
        Args:
            tweets: The tweets to categorize.
        Returns:
            The categories of the tweets.
        """
        async with aiohttp.ClientSession() as session:
            tasks = [self.run_llm(session, tweet) for tweet in tweets]
            scores_embed = await asyncio.gather(*tasks)
            return scores_embed
    def sanitize_for_excel(self, df):
        def _sanitize_for_excel(text):
            """Remove zero-width and bidi control characters that can confuse Excel rendering."""
            if text is None:
                return ""
            s = str(text)
            # Characters to remove: ZWNJ, ZWJ, RLM, LRM, RLE, LRE, PDF, BOM, Tatweel
            remove_chars = [
                "\u200c",  # ZWNJ
                "\u200d",  # ZWJ
                "\u200e",  # LRM
                "\u200f",  # RLM
                "\u202a",  # LRE
                "\u202b",  # RLE
                "\u202c",  # PDF
                "\u202d",  # LRO
                "\u202e",  # RLO
                "\ufeff",  # BOM
                "\u0640",  # Tatweel
            ]
            for ch in remove_chars:
                s = s.replace(ch, "")
            # Normalize whitespace
            s = re.sub(r"\s+", " ", s).strip()
            return s
        df_copy = df.copy()
        for m in ["category"]:
            for i in range(len(df_copy[m])):
                df_copy.loc[i, m] = _sanitize_for_excel(df_copy.loc[i, m])
        return df_copy
    def start_process(self, input_path, output_path):
        df = pd.read_excel(input_path)
        df_copy = df.copy()
        tweets = df["tweet"].tolist()
        for i in tqdm(range(0, len(tweets), 1000)):
            start_time = time.time()
            result_list = asyncio.run(self.run_llm_async(tweets[i:i+1000]))
            end_time = time.time()
            print(f"Time taken for llm as reranker: {end_time - start_time} seconds")
            time.sleep(5)
            for j, result in enumerate(result_list):
                df_copy.at[i+j, "category"] = result
        df_copy = self.sanitize_for_excel(df_copy)
        df_copy.to_excel(output_path)
if __name__ == "__main__":
    llm = TopicRecreation()
    llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic.xlsx", "/home/firouzi/trend_grouping_new/tweet_topic_recreation.xlsx")