add codes

Commit 78656d9f4d

README.md (new file, 40 lines)
# TEXT CLUSTERING

A pipeline for clustering tweets.

## Overall Pipeline for Cluster Extraction
1. Convert tweet text to categories using the Gemma model:
   Takes about 7 hours for 40,000 tweets.

2. Convert categories to embedding vectors using Jina:
   Takes about 3 minutes.

3. Perform clustering with K-Means:
   Choose the number of clusters with the highest silhouette score among 20–60 groups (a short sketch of this selection rule follows the list).
   Takes about 5 minutes.

4. Name the clusters using the Gemma model:
   Takes about 1 minute.

5. Cluster the generated names using K-Means and group similar names together:
   Takes about 1 minute.

6. Use GPT o3 to merge and refine cluster names:
   Provide GPT with the list of cluster names and ask it to build new, higher-level clusters.
   Takes about 1 minute.

7. Assign each topic to its final cluster using the Gemma model:
   Takes about 7 hours.
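A minimal sketch of the selection rule used in step 3 (it mirrors `get_best_k` in `clustering_pipeline.py`; the `pick_k` name and the random vectors standing in for the Jina embeddings are illustrative only):

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def pick_k(embeddings, k_min=20, k_max=60):
    """Return the k in [k_min, k_max) with the highest silhouette score."""
    best_k, best_score = k_min, -1.0
    for k in range(k_min, min(k_max, len(embeddings))):
        labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(embeddings)
        score = silhouette_score(embeddings, labels)
        if score > best_score:
            best_k, best_score = k, score
    return best_k

# toy example: random vectors stand in for the Jina embeddings
rng = np.random.default_rng(0)
print(pick_k(rng.normal(size=(200, 32))))
```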
Reason for step 5: If I had directly given the list of names to step 6, GPT wouldn't have performed well. By first clustering similar names (step 5), the input to GPT became more organized, which made step 6 much more effective.
## How to use

Give the command below an Excel file that has a column named "tweet".
Overall, it takes about 15 hours for 40,000 tweets.

python3 clustering_pipeline.py --input_file tweets_file.xlsx --output_file tweets_file_output.xlsx
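For reference, a minimal sketch of building an input file in the expected format with pandas (the file name and example rows are placeholders; `topic_recreation.py` reads the `tweet` column, and writing `.xlsx` assumes `openpyxl` is installed):

```python
import pandas as pd

# placeholder rows; in practice this is your collected tweet text
tweets = [
    "نمونه توییت اول ...",
    "نمونه توییت دوم ...",
]
pd.DataFrame({"tweet": tweets}).to_excel("tweets_file.xlsx", index=False)
```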
clustering_pipeline.py (new file, 199 lines)
import argparse
import pandas as pd
from transformers import AutoModel
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from hazm import Normalizer
from tqdm import tqdm
import requests
from openai import OpenAI
import httpx
import random

from post_cluster import PostClusterLLM
from topic_recreation import TopicRecreation


# search range for the number of K-Means clusters
START_K = 20
END_K = 60


def get_best_k(embeddings):
    """Pick the k in [START_K, END_K) with the highest silhouette score."""
    max_sil_score = 0
    best_k = START_K
    for k in range(START_K, min(END_K, len(embeddings))):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init="auto")
        labels = kmeans.fit_predict(embeddings)

        sil_score = silhouette_score(embeddings, labels)
        if sil_score > max_sil_score:
            max_sil_score = sil_score
            best_k = k

    # refit with the winning k to get the final labels
    kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(embeddings)

    return best_k, labels


def get_embeddings(names):
    """Embed the given texts with Jina v3 after normalizing and dropping filler words."""
    model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True).to("cuda")

    normalizer = Normalizer()
    names = [normalizer.normalize(name) for name in names]

    # frequent filler words that are stripped before embedding
    adjs = ["توهین", "انتقاد", "نقد", "حمایت", "مسائل", "مربوط", "تهدید", "عملکرد", "رفتار", "به", "از", "در"]

    names_new = []
    for name in names:
        for adj in adjs:
            name = name.replace(adj, "")
        names_new.append(name)

    embeddings = []
    for batch in tqdm(range(0, len(names_new), 50)):
        embeddings += model.encode(names_new[batch:batch+50], task="separation").tolist()

    return embeddings


def get_cluster_names(clusters):
    """Ask Gemma to name each cluster from a sample of its members."""
    headers = {"Content-Type": "application/json"}

    prompt = """
You are a helpful assistant that generates names for clusters of trends in Persian.
I will give you a list of trends and you will generate a name for this cluster.
There might be some different topics in the list, so just consider the dominant topic.
Just give me the final answer in Persian.
"""

    cluster_names = []
    for data in clusters:
        cluster_samples = random.sample(data, min(20, len(data)))

        messages = [{"role": "system", "content": prompt}, {"role": "user", "content": str(cluster_samples)}]

        payload = {
            "model": "google/gemma-3-27b-it",
            "messages": messages,
            "max_tokens": 8000
        }

        response = requests.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload)
        our_response = response.json()['choices'][0]['message']['content']
        cluster_names.append(our_response)

    return cluster_names


def modify_cluster_names(cluster_names):
    """Ask GPT o3 to merge the cluster names into about 20-30 distinct, higher-level topics."""
    PROXY_URL = "http://2zajDvJvJg:e0BtBiynhF@192.168.130.40:51371/"
    http_client = httpx.Client(proxy=PROXY_URL)
    client = OpenAI(api_key="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", http_client=http_client)

    prompt = """
You are a topic modification expert.

I will give you a list of topics.

## TASK
Extract meaningful and distinct topics from the list. You can change the name of topics. Just about 20-30 topics that cover all of them.

## RULES
- You can combine or split or ... for doing this task.
- You can change the name of topics to make it more general or more specific.
- The final topics must be distinct and have a specific meaning compared to the others.
- Don't combine topics that are not related to each other, like economical with political with social with ...
- Combine topics that are related to each other, like Gaza with Palestine or ...

## MUST
- All categories must be distinct and have a specific meaning separate from the other categories.
- Two categories cannot be similar to each other.

I will trust your intelligence.
Write the final answer in Persian.
"""

    response = client.chat.completions.create(
        model="o3",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": str(cluster_names)}
        ]
    )
    out = response.choices[0].message.content

    return out


def main(input_file, output_file):
    # read input file
    df = pd.read_excel(input_file)
    topics = df["topic_recreation"].tolist()

    # get embeddings
    embeddings = get_embeddings(topics)

    # get best k and labels of kmeans with best_k
    best_k, labels = get_best_k(embeddings)

    # fill clusters
    clusters = []
    for i in range(best_k):
        clusters.append([])

    for i in range(len(clusters)):
        for topic, label in zip(topics, labels):
            if label == i:
                clusters[i].append(topic)

    # get cluster names
    cluster_names = get_cluster_names(clusters)

    # get embeddings for cluster names
    cluster_names_embeddings = get_embeddings(cluster_names)

    # get best k and labels of kmeans with best_k
    best_k_cluster_names, labels_cluster_names = get_best_k(cluster_names_embeddings)

    # fill clusters of cluster_names
    clusters_cluster_names = []
    for i in range(best_k_cluster_names):
        clusters_cluster_names.append([])

    for i in range(len(clusters_cluster_names)):
        for cluster_name, label in zip(cluster_names, labels_cluster_names):
            if label == i:
                clusters_cluster_names[i].append(cluster_name)

    # get cluster names for clusters of cluster_names
    cluster_names_modify = modify_cluster_names(clusters_cluster_names)

    # save cluster names; modify_cluster_names returns a single text block, so write it line by line
    with open(output_file, "w") as f:
        lines = cluster_names_modify.splitlines()
        for count, cluster_name in enumerate(lines):
            if count == len(lines) - 1:
                f.write(cluster_name)
            else:
                f.write(cluster_name + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True)
    parser.add_argument("--output_file", type=str, required=True)
    args = parser.parse_args()

    # apply topic_recreation
    topic_recreation = TopicRecreation()
    topic_file = args.output_file.replace(".xlsx", "_topic_recreation.xlsx")
    topic_recreation.start_process(args.input_file, topic_file)

    # extracting topics
    titles_file = args.output_file.replace(".xlsx", "_titles.txt")
    main(topic_file, titles_file)

    # apply clustering
    post_cluster = PostClusterLLM()
    post_cluster.start_process(topic_file, args.output_file)
post_cluster.py (new file, 203 lines)
import asyncio
import aiohttp
import time
import re
import pandas as pd
import json
from tqdm import tqdm


class PostClusterLLM:
    def __init__(self):

        self.instruction = f"""
You will be given a title and a list of all cluster names.
Your task is to find the best fit cluster name for the title.
Go through the list of all cluster names and find the best fit cluster name for the title.
If you found a good fit, return the cluster name.
If you didn't find a good fit, return "outlier" is "yes".

#IMPORTANT:
- if you found a good fit use its id : {{"cluster" : "id_i"}}
- if the title is not related to any of the cluster names, return "outlier" is "yes" : {{"outlier" : "yes"}}

Example-1:
- Input:
    - title: "کتاب و درس"
    - all_cluster_names: {{
        "1" : "کتابخوانی",
        "2" : "فوتبال جام جهانی",
        "3" : "ساختمان سازی شهری" }}
- Output:
    - {{"cluster" : "1"}}

Example-2:
- Input:
    - title: "لپتاب و کامپیوتر"
    - all_cluster_names: {{
        "1" : "کتابخوانی",
        "2" : "فوتبال جام جهانی",
        "3" : "ساختمان سازی شهری" }}
- Output:
    - {{"outlier" : "yes"}}

Example-3:
- Input:
    - title: "ساختمان"
    - all_cluster_names: {{
        "1" : "کتابخوانی",
        "2" : "فوتبال جام جهانی",
        "3" : "ساختمان سازی شهری" }}
- Output:
    - {{"cluster" : "3"}}

write a small reason and give the final answer.
"""

    async def run_llm(self, session, title, cluster_names):
        """
        Run the LLM to pick the best cluster for one title.

        Args:
            session: The aiohttp session to use for the request.
            title: The title to assign to a cluster.
            cluster_names: The numbered list of candidate cluster names.

        Returns:
            The parsed {"cluster": ...} or {"outlier": "yes"} JSON, or 0 on error.
        """
        headers = {"Content-Type": "application/json"}

        input_message = f"""{{"all_cluster_names": "{cluster_names}", "title": "{title}"}}"""
        messages = [{"role": "system", "content": self.instruction}, {"role": "user", "content": input_message}]

        payload = {
            "model": "google/gemma-3-27b-it",
            "messages": messages,
            "max_tokens": 500
        }
        try:
            async with session.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload) as resp:
                resp.raise_for_status()
                response = await resp.json()

            out = response['choices'][0]['message']['content']
            print("--------------------------------")
            print(f"title: {title}")
            print(out)

            # the model answers in free text; pull out the small JSON object it was asked to emit
            pattern = r'(\{"cluster".*?\})'
            matches = re.findall(pattern, out)
            for m in matches:
                out_json = json.loads(m)
                print(f"out_json: {out_json}")
                return out_json

            pattern = r'(\{"outlier".*?\})'
            matches = re.findall(pattern, out)
            for m in matches:
                out_json = json.loads(m)
                print(f"out_json: {out_json}")
                return out_json

        except Exception as e:
            print(f"Error in llm as reranker: {e}")
            return 0

    async def run_llm_async(self, titles, cluster_names):
        """
        Send all title requests concurrently.

        Args:
            titles: The titles to assign to clusters.
            cluster_names: The numbered list of candidate cluster names.

        Returns:
            The per-title results.
        """
        async with aiohttp.ClientSession() as session:
            tasks = [self.run_llm(session, title, cluster_names) for title in titles]
            scores_embed = await asyncio.gather(*tasks)
            return scores_embed

    def sanitize_for_excel(self, df):
        def _sanitize_for_excel(text):
            """Remove zero-width and bidi control characters that can confuse Excel rendering."""
            if text is None:
                return ""
            s = str(text)
            # Characters to remove: ZWNJ, ZWJ, RLM, LRM, RLE, LRE, PDF, BOM, Tatweel
            remove_chars = [
                "\u200c",  # ZWNJ
                "\u200d",  # ZWJ
                "\u200e",  # LRM
                "\u200f",  # RLM
                "\u202a",  # LRE
                "\u202b",  # RLE
                "\u202c",  # PDF
                "\u202d",  # LRO
                "\u202e",  # RLO
                "\ufeff",  # BOM
                "\u0640",  # Tatweel
            ]
            for ch in remove_chars:
                s = s.replace(ch, "")
            # Normalize whitespace
            s = re.sub(r"\s+", " ", s).strip()
            return s

        df_copy = df.copy()
        for m in df.columns:
            for i in range(len(df_copy[m])):
                df_copy.loc[i, m] = _sanitize_for_excel(df_copy.loc[i, m])

        return df_copy

    def start_process(self, input_path, output_path):
        df = pd.read_excel(input_path)
        df_copy = df.copy()

        # final cluster titles produced by the o3 merge step
        with open("titles_o3.txt", "r") as f:
            titles = f.readlines()

        titles = [title.strip() for title in titles]

        cluster_names_dict = {}
        count = 1
        for item in titles:
            cluster_names_dict[str(count)] = item
            count += 1

        # render the numbered cluster list as text for the prompt
        cluster_names = "{\n"
        for key, value in cluster_names_dict.items():
            cluster_names += f"{key} : {value},\n"

        cluster_names += "}"

        batch_size = 100
        for i in tqdm(range(0, len(df["topic"]), batch_size)):
            start_time = time.time()
            result_list = asyncio.run(self.run_llm_async(df["topic"][i:i+batch_size], cluster_names))
            end_time = time.time()
            print(f"Time taken for llm as reranker: {end_time - start_time} seconds")
            time.sleep(5)

            for j, result in enumerate(result_list):
                try:
                    if result.get("outlier") == "yes":
                        df_copy.at[i+j, "cluster_llm"] = "متفرقه"
                    elif result.get("cluster") is not None:
                        df_copy.at[i+j, "cluster_llm"] = cluster_names_dict[result["cluster"]]
                    else:
                        df_copy.at[i+j, "cluster_llm"] = df_copy.at[i+j, "category"]

                except Exception as e:
                    print(f"Error in result_list: {e}")
                    df_copy.at[i+j, "cluster_llm"] = df_copy.at[i+j, "category"]

        df_copy = self.sanitize_for_excel(df_copy)
        df_copy.to_excel(output_path)


if __name__ == "__main__":
    llm = PostClusterLLM()
    llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic_recreation.xlsx", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3.xlsx")
sub_clustering_pipeline.py (new file, 285 lines)
import argparse
import pandas as pd
from transformers import AutoModel
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from hazm import Normalizer
from tqdm import tqdm
import requests
from openai import OpenAI
import httpx
import random
import re
import json


# search range for the number of K-Means sub-clusters
START_K = 2
END_K = 60


def sanitize_for_excel(text):
    """Remove zero-width and bidi control characters that can confuse Excel rendering."""
    if text is None:
        return ""
    s = str(text)
    # Characters to remove: ZWNJ, ZWJ, RLM, LRM, RLE, LRE, PDF, BOM, Tatweel
    remove_chars = [
        "\u200c",  # ZWNJ
        "\u200d",  # ZWJ
        "\u200e",  # LRM
        "\u200f",  # RLM
        "\u202a",  # LRE
        "\u202b",  # RLE
        "\u202c",  # PDF
        "\u202d",  # LRO
        "\u202e",  # RLO
        "\ufeff",  # BOM
        "\u0640",  # Tatweel
    ]
    for ch in remove_chars:
        s = s.replace(ch, "")
    # Normalize whitespace
    s = re.sub(r"\s+", " ", s).strip()
    return s


def get_best_k(embeddings):
    """Pick the k in [START_K, END_K) with the highest silhouette score."""
    max_sil_score = 0
    best_k = START_K
    for k in range(START_K, min(END_K, len(embeddings))):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(embeddings)

        sil_score = silhouette_score(embeddings, labels)
        if sil_score > max_sil_score:
            max_sil_score = sil_score
            best_k = k

    kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(embeddings)

    return best_k, labels


def get_embeddings(names):
    """Embed the given texts with Jina v3 after normalizing and dropping filler words."""
    model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True).to("cuda")

    normalizer = Normalizer()
    names = [normalizer.normalize(name) for name in names]

    adjs = ["توهین", "انتقاد", "نقد", "حمایت", "مسائل", "مربوط", "تهدید", "عملکرد", "رفتار", "به", "از", "در"]

    names_new = []
    for name in names:
        for adj in adjs:
            name = name.replace(adj, "")
        names_new.append(name)

    embeddings = []
    for batch in tqdm(range(0, len(names_new), 50)):
        embeddings += model.encode(names_new[batch:batch+50], task="separation").tolist()

    return embeddings


def get_cluster_names(clusters):
    """Ask Gemma to name each sub-cluster from a sample of its members."""
    headers = {"Content-Type": "application/json"}

    prompt = """
You are a helpful assistant that generates names for clusters of topics in Persian.
I will give you a list of topics and you will generate a name for this cluster.
There might be some different topics in the list, so just consider the dominant topic.
Be specific about the cluster name.
Just give me the final answer in Persian.
"""

    cluster_names = []
    for data in clusters:

        # skip very small clusters
        if len(data) < 10:
            continue

        cluster_samples = random.sample(data, min(20, len(data)))

        messages = [{"role": "system", "content": prompt}, {"role": "user", "content": str(cluster_samples)}]

        payload = {
            "model": "google/gemma-3-27b-it",
            "messages": messages,
            "max_tokens": 8000
        }

        response = requests.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload)
        our_response = response.json()['choices'][0]['message']['content']
        cluster_names.append(our_response)

    return cluster_names


def modify_cluster_names(cluster_names, title, best_k):
    """Ask GPT o3 to merge the sub-cluster names of one main cluster into distinct sub-categories."""
    PROXY_URL = "http://2zajDvJvJg:e0BtBiynhF@192.168.130.40:51371/"
    http_client = httpx.Client(proxy=PROXY_URL)
    client = OpenAI(api_key="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", http_client=http_client)

    # ask for roughly half as many sub-categories, rounded down to a multiple of 10
    start = (best_k // 2) - ((best_k // 2) % 10)
    if start == 0:
        start = 1

    prompt = f"""
You are a sub category modification expert.

I will give you a list of topics.

all these topics belong to the {title} category

## TASK
Extract meaningful and distinct sub categories from the list. You can change the name of topics. Just about {start}-{start+10} topics that cover all of them.

## RULES
- You can combine or split or ... for doing this task.
- You can change the name of topics to make it more general or more specific.
- The final topics must be distinct and have a specific meaning compared to the others.
- Don't combine topics that are not related to each other, like economical with political with social with ...
- Combine topics that are related to each other, like Gaza with Palestine or ...

## MUST
- All sub categories must be distinct and have a specific meaning separate from the other categories.
- Two categories cannot be similar to each other.
- Be specific about sub categories.

I will trust your intelligence.
Write the final answer in Persian.
"""

    response = client.chat.completions.create(
        model="o3",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": str(cluster_names)}
        ]
    )
    out = response.choices[0].message.content

    return out


def extract_list(text, count):
    """Ask Gemma to turn the free-text o3 answer into a plain JSON list of titles."""
    headers = {"Content-Type": "application/json"}

    prompt = """
Extract the titles from this text and put them in a list.
Just return the output in list format, do not include any other text : ["title_1", "title_2", ...]
"""

    messages = [{"role": "system", "content": prompt}, {"role": "user", "content": text}]

    payload = {
        "model": "google/gemma-3-27b-it",
        "messages": messages,
        "max_tokens": 8000
    }

    response = requests.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload)
    out = response.json()['choices'][0]['message']['content']
    try:
        out = json.loads(out)
    except Exception:
        print(f"error in extract list {count}")
    return out


def main(input_file, output_file):
    # read input file
    df = pd.read_excel(input_file)
    topics = df["topic"].tolist()
    cluster_llms = df["cluster_llm"].tolist()

    # get embeddings
    embeddings = get_embeddings(topics)

    # extract main cluster names
    cluster_names = []
    with open("titles_o3.txt", "r") as f:
        titles = f.readlines()

    titles = [sanitize_for_excel(title.strip()) for title in titles]

    embedding_cluster = []
    best_k = len(titles)
    for i in range(best_k):
        embedding_cluster.append([])

    topic_cluster = []
    best_k = len(titles)
    for i in range(best_k):
        topic_cluster.append([])

    # group embeddings and topics by the main cluster they were assigned to
    for m in range(len(titles)):
        for embedding, cluster_name, topic in zip(embeddings, cluster_llms, topics):
            if cluster_name == titles[m]:
                embedding_cluster[m].append(embedding)
                topic_cluster[m].append(topic)

    sub_cluster_names = []
    for cluster_count in tqdm(range(len(titles))):
        print(f"start {cluster_count} \n")
        # get best k and labels of kmeans with best_k
        best_k, labels = get_best_k(embedding_cluster[cluster_count])
        print(f"initial best_k {best_k}\n")

        # fill clusters
        clusters = []
        for i in range(best_k):
            clusters.append([])

        for i in range(len(clusters)):
            for topic, label in zip(topic_cluster[cluster_count], labels):
                if label == i:
                    clusters[i].append(topic)

        # get cluster names
        cluster_names = get_cluster_names(clusters)

        if len(cluster_names) > 1:
            # get embeddings for cluster names
            cluster_names_embeddings = get_embeddings(cluster_names)

            # get best k and labels of kmeans with best_k
            best_k_cluster_names, labels_cluster_names = get_best_k(cluster_names_embeddings)
            print(f"second best_k {best_k_cluster_names}\n")

            # fill clusters of cluster_names
            clusters_cluster_names = []
            for i in range(best_k_cluster_names):
                clusters_cluster_names.append([])

            for i in range(len(clusters_cluster_names)):
                for cluster_name, label in zip(cluster_names, labels_cluster_names):
                    if label == i:
                        clusters_cluster_names[i].append(cluster_name)

            # get cluster names for clusters of cluster_names
            cluster_names_modify = modify_cluster_names(clusters_cluster_names, titles[cluster_count], best_k)
            cluster_names_modify_list = extract_list(cluster_names_modify, cluster_count)
            sub_cluster_names.append({"id": cluster_count, "cluster_name": titles[cluster_count], "sub_cluster_names": cluster_names_modify_list})

        else:
            sub_cluster_names.append({"id": cluster_count, "cluster_name": titles[cluster_count], "sub_cluster_names": []})

    # save cluster names
    if not output_file.endswith(".json"):
        output_file = output_file + ".json"

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(sub_cluster_names, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True)
    parser.add_argument("--output_file", type=str, required=True)
    args = parser.parse_args()

    # extracting topics
    main(args.input_file, args.output_file)
test_saeed_tweet_2.ipynb (new file, 14724 lines; diff suppressed because one or more lines are too long)
topic_recreation.py (new file, 142 lines)
import asyncio
import aiohttp
import time
import re
import pandas as pd
import json
from tqdm import tqdm


class TopicRecreation:
    def __init__(self):

        self.instruction = f"""
You will be given a tweet text.
Your task is to write a phrase category for this tweet, which the tweet is related to.
This should be a combination of action + category:

for example :
انتقاد از سیاست ایران
توهین به مقامات کشور
حمایت از نظام جمهوری اسلامی
جنگ اسراییل و قطر
مسایل مربوط به موضوع هسته ای ایران
مسایل مربوط به افغانستان

The category should be in Persian.

# Rules
- If it does not have a specific meaning then write "متفرقه"
- Be specific about the countries.
- Do not be specific about the people.
- You can consider different categories and write an action + category or just a simple category.

Just return the category, do not include any other text.
"""

    async def run_llm(self, session, tweet):
        """
        Ask the LLM for a category for one tweet.

        Args:
            session: The aiohttp session to use for the request.
            tweet: The tweet to categorize.

        Returns:
            The category of the tweet.
        """
        headers = {"Content-Type": "application/json"}

        # drop @mentions before sending the tweet to the model
        tweet = " ".join([m for m in tweet.split(" ") if "@" not in m])

        input_message = f"""{{"tweet": "{tweet}"}}"""
        messages = [{"role": "system", "content": self.instruction}, {"role": "user", "content": input_message}]

        payload = {
            "model": "google/gemma-3-27b-it",
            "messages": messages,
            "max_tokens": 500
        }
        # try:
        async with session.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload) as resp:
            resp.raise_for_status()
            response = await resp.json()

            out = response['choices'][0]['message']['content']

            return out

        # except Exception as e:
        #     print(f"Error in llm as reranker: {e}")
        #     return 0

    async def run_llm_async(self, tweets):
        """
        Send all tweet requests concurrently.

        Args:
            tweets: The tweets to categorize.

        Returns:
            The categories of the tweets.
        """
        async with aiohttp.ClientSession() as session:
            tasks = [self.run_llm(session, tweet) for tweet in tweets]
            scores_embed = await asyncio.gather(*tasks)
            return scores_embed

    def sanitize_for_excel(self, df):
        def _sanitize_for_excel(text):
            """Remove zero-width and bidi control characters that can confuse Excel rendering."""
            if text is None:
                return ""
            s = str(text)
            # Characters to remove: ZWNJ, ZWJ, RLM, LRM, RLE, LRE, PDF, BOM, Tatweel
            remove_chars = [
                "\u200c",  # ZWNJ
                "\u200d",  # ZWJ
                "\u200e",  # LRM
                "\u200f",  # RLM
                "\u202a",  # LRE
                "\u202b",  # RLE
                "\u202c",  # PDF
                "\u202d",  # LRO
                "\u202e",  # RLO
                "\ufeff",  # BOM
                "\u0640",  # Tatweel
            ]
            for ch in remove_chars:
                s = s.replace(ch, "")
            # Normalize whitespace
            s = re.sub(r"\s+", " ", s).strip()
            return s

        df_copy = df.copy()
        for m in ["category"]:
            for i in range(len(df_copy[m])):
                df_copy.loc[i, m] = _sanitize_for_excel(df_copy.loc[i, m])

        return df_copy

    def start_process(self, input_path, output_path):
        df = pd.read_excel(input_path)
        df_copy = df.copy()

        tweets = df["tweet"].tolist()

        for i in tqdm(range(0, len(tweets), 1000)):
            start_time = time.time()
            result_list = asyncio.run(self.run_llm_async(tweets[i:i+1000]))
            end_time = time.time()
            print(f"Time taken for llm as reranker: {end_time - start_time} seconds")

            time.sleep(5)

            for j, result in enumerate(result_list):
                df_copy.at[i+j, "category"] = result

        df_copy = self.sanitize_for_excel(df_copy)
        df_copy.to_excel(output_path)


if __name__ == "__main__":
    llm = TopicRecreation()
    llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic.xlsx", "/home/firouzi/trend_grouping_new/tweet_topic_recreation.xlsx")