add codes
commit 78656d9f4d
README.md (new file, 40 lines)
@@ -0,0 +1,40 @@
# TEXT CLUSTERING

A pipeline for clustering tweets.

## Overall Pipeline for Cluster Extraction

1. Convert tweet text to categories using the Gemma model:
   Takes about 7 hours for 40,000 tweets.

2. Convert categories to embedding vectors using Jina:
   Takes about 3 minutes.

3. Perform clustering with K-Means:
   Choose the number of clusters with the highest silhouette score among 20–60 groups (a sketch of this selection appears after this list).
   Takes about 5 minutes.

4. Name the clusters using the Gemma model:
   Takes about 1 minute.

5. Cluster the generated names using K-Means to group similar names together:
   Takes about 1 minute.

6. Use GPT o3 to merge and refine cluster names:
   GPT is given the list of cluster names and asked to build new, higher-level clusters.
   Takes about 1 minute.

7. Assign each topic to its final cluster using the Gemma model:
   Takes about 7 hours.

Reason for step 5:
If the list of names were passed directly to step 6, GPT would not perform well.
By first clustering similar names (step 5), the input to GPT becomes more organized,
which makes step 6 much more effective.
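
The selection in step 3 fits K-Means once per candidate k and keeps the k with the best silhouette score. A minimal sketch of that selection (mirroring `get_best_k` in `clustering_pipeline.py`; the function name `pick_k` is only illustrative, and scikit-learn is assumed to be installed):

```python
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def pick_k(embeddings, start_k=20, end_k=60):
    """Return the k in [start_k, end_k) with the highest silhouette score."""
    best_k, best_score = start_k, -1.0
    for k in range(start_k, min(end_k, len(embeddings))):
        # fit K-Means for this candidate k and score the resulting labels
        labels = KMeans(n_clusters=k, random_state=42, n_init="auto").fit_predict(embeddings)
        score = silhouette_score(embeddings, labels)
        if score > best_score:
            best_k, best_score = k, score
    return best_k
```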
## How to use

Provide an Excel file that has a column named "tweet" to the command below
(a minimal sketch of building such a file is shown after the command).
Overall, the pipeline takes about 15 hours for 40,000 tweets.

python3 clustering_pipeline.py --input_file tweets_file.xlsx --output_file tweets_file_output.xlsx
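
If you do not already have such a file, this is a minimal sketch of building one with pandas (assuming pandas and openpyxl are installed; the file name and rows are only placeholders):

```python
import pandas as pd

# Two placeholder rows; the pipeline only requires the "tweet" column.
df = pd.DataFrame({"tweet": ["first tweet text", "second tweet text"]})
df.to_excel("tweets_file.xlsx", index=False)
```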
clustering_pipeline.py (new file, 199 lines)
@@ -0,0 +1,199 @@
import argparse
import pandas as pd
from transformers import AutoModel
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from hazm import Normalizer
from tqdm import tqdm
import requests
from openai import OpenAI
import httpx
import random

from post_cluster import PostClusterLLM
from topic_recreation import TopicRecreation


START_K = 20
END_K = 60


def get_best_k(embeddings):

    max_sil_score = 0
    best_k = START_K
    for k in range(START_K, min(END_K, len(embeddings))):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init="auto")
        labels = kmeans.fit_predict(embeddings)

        sil_score = silhouette_score(embeddings, labels)
        if sil_score > max_sil_score:
            max_sil_score = sil_score
            best_k = k

    # refit with the winning k to get the final labels
    kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(embeddings)

    return best_k, labels


def get_embeddings(names):
    model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True).to("cuda")

    normalizer = Normalizer()
    names = [normalizer.normalize(name) for name in names]

    # generic action/stop words that are stripped before embedding
    adjs = ["توهین", "انتقاد", "نقد", "حمایت", "مسائل", "مربوط", "تهدید", "عملکرد", "رفتار", "به", "از", "در"]

    names_new = []
    for name in names:
        for adj in adjs:
            name = name.replace(adj, "")
        names_new.append(name)

    embeddings = []
    for batch in tqdm(range(0, len(names_new), 50)):
        embeddings += model.encode(names_new[batch:batch+50], task="separation").tolist()

    return embeddings


def get_cluster_names(clusters):
    headers = {"Content-Type": "application/json"}

    prompt = """
    You are a helpful assistant that generates names for clusters of trends in Persian.
    I will give you a list of trends and you will generate a name for this cluster.
    There might be some different topics in the list, so only consider the dominant topic.
    Just give me the final answer in Persian.
    """

    cluster_names = []
    for data in clusters:
        cluster_samples = random.sample(data, min(20, len(data)))

        messages = [{"role": "system", "content": prompt}, {"role": "user", "content": str(cluster_samples)}]

        payload = {
            "model": "google/gemma-3-27b-it",
            "messages": messages,
            "max_tokens": 8000
        }

        response = requests.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload)
        our_response = response.json()['choices'][0]['message']['content']
        cluster_names.append(our_response)

    return cluster_names


def modify_cluster_names(cluster_names):
    PROXY_URL = "http://2zajDvJvJg:e0BtBiynhF@192.168.130.40:51371/"
    http_client = httpx.Client(proxy=PROXY_URL)
    client = OpenAI(api_key="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", http_client=http_client)

    prompt = """
    You are a topic modification expert.

    I will give you a list of topics.

    ## TASK
    Extract meaningful and distinct topics from the list. You can change the names of topics. Keep it to about 20-30 topics that cover all of them.

    ## RULES
    - You can combine or split or ... for doing this task.
    - You can change the name of a topic to make it more general or more specific.
    - The final topics must be distinct and have a specific meaning compared to the others.
    - Don't combine topics that are not related to each other, like economical with political with social with ...
    - Combine topics that are related to each other, like ghaza with palestine or ...

    ## MUST
    - All categories must be distinct and have a specific meaning compared to the other categories.
    - Two categories can not be similar to each other.

    I will trust your intelligence.
    Write the final answer in Persian.
    """

    response = client.chat.completions.create(
        model="o3",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": str(cluster_names)}
        ]
    )
    out = response.choices[0].message.content

    return out


def main(input_file, output_file):
    # read input file
    df = pd.read_excel(input_file)
    topics = df["topic_recreation"].tolist()

    # get embeddings
    embeddings = get_embeddings(topics)

    # get best k and labels of kmeans with best_k
    best_k, labels = get_best_k(embeddings)

    # fill clusters
    clusters = []
    for i in range(best_k):
        clusters.append([])

    for i in range(len(clusters)):
        for topic, label in zip(topics, labels):
            if label == i:
                clusters[i].append(topic)

    # get cluster names
    cluster_names = get_cluster_names(clusters)

    # get embeddings for cluster names
    cluster_names_embeddings = get_embeddings(cluster_names)

    # get best k and labels of kmeans with best_k
    best_k_cluster_names, labels_cluster_names = get_best_k(cluster_names_embeddings)

    # fill clusters of cluster_names
    clusters_cluster_names = []
    for i in range(best_k_cluster_names):
        clusters_cluster_names.append([])

    for i in range(len(clusters_cluster_names)):
        for cluster_name, label in zip(cluster_names, labels_cluster_names):
            if label == i:
                clusters_cluster_names[i].append(cluster_name)

    # get refined cluster names for the clusters of cluster_names
    cluster_names_modify = modify_cluster_names(clusters_cluster_names)

    # save cluster names (the o3 reply is plain text, expected to hold one cluster name per line)
    with open(output_file, "w") as f:
        final_names = cluster_names_modify.splitlines()
        for count, cluster_name in enumerate(final_names):
            if count == len(final_names) - 1:
                f.write(cluster_name)
            else:
                f.write(cluster_name + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True)
    parser.add_argument("--output_file", type=str, required=True)
    args = parser.parse_args()

    # apply topic_recreation
    topic_recreation = TopicRecreation()
    topic_file = args.output_file.replace(".xlsx", "_topic_recreation.xlsx")
    topic_recreation.start_process(args.input_file, topic_file)

    # extracting topics
    titles_file = args.output_file.replace(".xlsx", "_titles.txt")
    main(topic_file, titles_file)

    # apply clustering
    post_cluster = PostClusterLLM()
    post_cluster.start_process(topic_file, args.output_file)
post_cluster.py (new file, 203 lines)
@@ -0,0 +1,203 @@
import asyncio
import aiohttp
import time
import re
import pandas as pd
import json
from tqdm import tqdm


class PostClusterLLM:
    def __init__(self):

        self.instruction = f"""
        You will be given a title and a list of all cluster names.
        Your task is to find the best fitting cluster name for the title.
        Go through the list of all cluster names and find the best fitting cluster name for the title.
        If you find a good fit, return that cluster name.
        If you do not find a good fit, return "outlier" as "yes".

        #IMPORTANT:
        - if you found a good fit, use its id : {{"cluster" : "id_i"}}
        - if the title is not related to any of the cluster names, return "outlier" as "yes" : {{"outlier" : "yes"}}

        Example-1:
        - Input:
            - title: "کتاب و درس"
            - all_cluster_names: {{
                "1" : "کتابخوانی",
                "2" : "فوتبال جام جهانی",
                "3" : "ساختمان سازی شهری" }}
        - Output:
            - {{"cluster" : "1"}}

        Example-2:
        - Input:
            - title: "لپتاب و کامپیوتر"
            - all_cluster_names: {{
                "1" : "کتابخوانی",
                "2" : "فوتبال جام جهانی",
                "3" : "ساختمان سازی شهری" }}
        - Output:
            - {{"outlier" : "yes"}}

        Example-3:
        - Input:
            - title: "ساختمان"
            - all_cluster_names: {{
                "1" : "کتابخوانی",
                "2" : "فوتبال جام جهانی",
                "3" : "ساختمان سازی شهری" }}
        - Output:
            - {{"cluster" : "3"}}

        Write a short reason and then give the final answer.
        """


    async def run_llm(self, session, title, cluster_names):
        """
        Ask the LLM to assign a single title to one of the clusters.

        Args:
            session: The aiohttp session to use for the request.
            title: The title to assign.
            cluster_names: The formatted mapping of cluster ids to cluster names.

        Returns:
            The parsed answer, {"cluster": id} or {"outlier": "yes"}, or 0 on error.
        """
        headers = {"Content-Type": "application/json"}

        input_message = f"""{{"all_cluster_names": "{cluster_names}", "title": "{title}"}}"""
        messages = [{"role": "system", "content": self.instruction}, {"role": "user", "content": input_message}]

        payload = {
            "model": "google/gemma-3-27b-it",
            "messages": messages,
            "max_tokens": 500
        }
        try:
            async with session.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload) as resp:
                resp.raise_for_status()
                response = await resp.json()

                out = response['choices'][0]['message']['content']
                print("--------------------------------")
                print(f"title: {title}")
                print(out)

                # look for a {"cluster": ...} answer first
                pattern = r'(\{"cluster".*?\})'
                matches = re.findall(pattern, out)
                for m in matches:
                    out_json = json.loads(m)
                    print(f"out_json: {out_json}")
                    return out_json

                # otherwise look for an {"outlier": ...} answer
                pattern = r'(\{"outlier".*?\})'
                matches = re.findall(pattern, out)
                for m in matches:
                    out_json = json.loads(m)
                    print(f"out_json: {out_json}")
                    return out_json
        except Exception as e:
            print(f"Error in LLM cluster assignment: {e}")
            return 0


    async def run_llm_async(self, titles, cluster_names):
        """
        Send all title requests concurrently.

        Args:
            titles: The titles to assign to clusters.
            cluster_names: The formatted mapping of cluster ids to cluster names.

        Returns:
            The LLM answers, one per title.
        """
        async with aiohttp.ClientSession() as session:
            tasks = [self.run_llm(session, title, cluster_names) for title in titles]
            scores_embed = await asyncio.gather(*tasks)
            return scores_embed

    def sanitize_for_excel(self, df):
        def _sanitize_for_excel(text):
            """Remove zero-width and bidi control characters that can confuse Excel rendering."""
            if text is None:
                return ""
            s = str(text)
            # Characters to remove: ZWNJ, ZWJ, RLM, LRM, RLE, LRE, PDF, BOM, Tatweel
            remove_chars = [
                "\u200c",  # ZWNJ
                "\u200d",  # ZWJ
                "\u200e",  # LRM
                "\u200f",  # RLM
                "\u202a",  # LRE
                "\u202b",  # RLE
                "\u202c",  # PDF
                "\u202d",  # LRO
                "\u202e",  # RLO
                "\ufeff",  # BOM
                "\u0640",  # Tatweel
            ]
            for ch in remove_chars:
                s = s.replace(ch, "")
            # Normalize whitespace
            s = re.sub(r"\s+", " ", s).strip()
            return s

        df_copy = df.copy()
        for m in df.columns:
            for i in range(len(df_copy[m])):
                df_copy.loc[i, m] = _sanitize_for_excel(df_copy.loc[i, m])

        return df_copy

    def start_process(self, input_path, output_path):
        df = pd.read_excel(input_path)
        df_copy = df.copy()

        # final cluster names produced by the o3 refinement step, one per line
        with open("titles_o3.txt", "r") as f:
            titles = f.readlines()

        titles = [title.strip() for title in titles]

        cluster_names_dict = {}
        count = 1
        for item in titles:
            cluster_names_dict[str(count)] = item
            count += 1

        cluster_names = "{\n"
        for key, value in cluster_names_dict.items():
            cluster_names += f"{key} : {value},\n"

        cluster_names += "}"

        batch_size = 100
        for i in tqdm(range(0, len(df["topic"]), batch_size)):
            start_time = time.time()
            result_list = asyncio.run(self.run_llm_async(df["topic"][i:i+batch_size], cluster_names))
            end_time = time.time()
            print(f"Time taken for LLM cluster assignment: {end_time - start_time} seconds")
            time.sleep(5)

            for j, result in enumerate(result_list):
                try:
                    if result.get("outlier") == "yes":
                        df_copy.at[i+j, "cluster_llm"] = "متفرقه"
                    elif result.get("cluster") is not None:
                        df_copy.at[i+j, "cluster_llm"] = cluster_names_dict[result["cluster"]]
                    else:
                        df_copy.at[i+j, "cluster_llm"] = df_copy.at[i+j, "category"]

                except Exception as e:
                    print(f"Error in result_list: {e}")
                    df_copy.at[i+j, "cluster_llm"] = df_copy.at[i+j, "category"]

        df_copy = self.sanitize_for_excel(df_copy)
        df_copy.to_excel(output_path)


if __name__ == "__main__":
    llm = PostClusterLLM()
    llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic_recreation.xlsx", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3.xlsx")
sub_clustering_pipeline.py (new file, 285 lines)
@@ -0,0 +1,285 @@
import argparse
import pandas as pd
from transformers import AutoModel
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from hazm import Normalizer
from tqdm import tqdm
import requests
from openai import OpenAI
import httpx
import random
import re
import json


START_K = 2
END_K = 60


def sanitize_for_excel(text):
    """Remove zero-width and bidi control characters that can confuse Excel rendering."""
    if text is None:
        return ""
    s = str(text)
    # Characters to remove: ZWNJ, ZWJ, RLM, LRM, RLE, LRE, PDF, BOM, Tatweel
    remove_chars = [
        "\u200c",  # ZWNJ
        "\u200d",  # ZWJ
        "\u200e",  # LRM
        "\u200f",  # RLM
        "\u202a",  # LRE
        "\u202b",  # RLE
        "\u202c",  # PDF
        "\u202d",  # LRO
        "\u202e",  # RLO
        "\ufeff",  # BOM
        "\u0640",  # Tatweel
    ]
    for ch in remove_chars:
        s = s.replace(ch, "")
    # Normalize whitespace
    s = re.sub(r"\s+", " ", s).strip()
    return s


def get_best_k(embeddings):

    max_sil_score = 0
    best_k = START_K
    for k in range(START_K, min(END_K, len(embeddings))):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(embeddings)

        sil_score = silhouette_score(embeddings, labels)
        if sil_score > max_sil_score:
            max_sil_score = sil_score
            best_k = k

    # refit with the winning k to get the final labels
    kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(embeddings)

    return best_k, labels


def get_embeddings(names):
    model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True).to("cuda")

    normalizer = Normalizer()
    names = [normalizer.normalize(name) for name in names]

    # generic action/stop words that are stripped before embedding
    adjs = ["توهین", "انتقاد", "نقد", "حمایت", "مسائل", "مربوط", "تهدید", "عملکرد", "رفتار", "به", "از", "در"]

    names_new = []
    for name in names:
        for adj in adjs:
            name = name.replace(adj, "")
        names_new.append(name)

    embeddings = []
    for batch in tqdm(range(0, len(names_new), 50)):
        embeddings += model.encode(names_new[batch:batch+50], task="separation").tolist()

    return embeddings


def get_cluster_names(clusters):
    headers = {"Content-Type": "application/json"}

    prompt = """
    You are a helpful assistant that generates names for clusters of topics in Persian.
    I will give you a list of topics and you will generate a name for this cluster.
    There might be some different topics in the list, so only consider the dominant topic.
    Be specific about the cluster name.
    Just give me the final answer in Persian.
    """

    cluster_names = []
    for data in clusters:

        # skip very small clusters
        if len(data) < 10:
            continue

        cluster_samples = random.sample(data, min(20, len(data)))

        messages = [{"role": "system", "content": prompt}, {"role": "user", "content": str(cluster_samples)}]

        payload = {
            "model": "google/gemma-3-27b-it",
            "messages": messages,
            "max_tokens": 8000
        }

        response = requests.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload)
        our_response = response.json()['choices'][0]['message']['content']
        cluster_names.append(our_response)

    return cluster_names


def modify_cluster_names(cluster_names, title, best_k):
    PROXY_URL = "http://2zajDvJvJg:e0BtBiynhF@192.168.130.40:51371/"
    http_client = httpx.Client(proxy=PROXY_URL)
    client = OpenAI(api_key="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", http_client=http_client)

    # round half of best_k down to the nearest multiple of 10 to bound the number of sub categories
    start = (best_k // 2) - ((best_k // 2) % 10)
    if start == 0:
        start = 1

    prompt = f"""
    You are a sub category modification expert.

    I will give you a list of topics.

    All these topics belong to the {title} category.

    ## TASK
    Extract meaningful and distinct sub categories from the list. You can change the names of topics. Keep it to about {start}-{start+10} topics that cover all of them.

    ## RULES
    - You can combine or split or ... for doing this task.
    - You can change the name of a topic to make it more general or more specific.
    - The final topics must be distinct and have a specific meaning compared to the others.
    - Don't combine topics that are not related to each other, like economical with political with social with ...
    - Combine topics that are related to each other, like ghaza with palestine or ...

    ## MUST
    - All sub categories must be distinct and have a specific meaning compared to the other categories.
    - Two categories can not be similar to each other.
    - Be specific about sub categories.

    I will trust your intelligence.
    Write the final answer in Persian.
    """

    response = client.chat.completions.create(
        model="o3",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": str(cluster_names)}
        ]
    )
    out = response.choices[0].message.content

    return out


def extract_list(text, count):

    headers = {"Content-Type": "application/json"}

    prompt = """
    Extract the titles from this text and put them in a list.
    Just return the output in list format, do not include any other text: ["title_1", "title_2", ...]
    """

    messages = [{"role": "system", "content": prompt}, {"role": "user", "content": text}]

    payload = {
        "model": "google/gemma-3-27b-it",
        "messages": messages,
        "max_tokens": 8000
    }

    response = requests.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload)
    out = response.json()['choices'][0]['message']['content']
    try:
        out = json.loads(out)
    except Exception:
        # fall back to the raw text if the model did not return valid JSON
        print(f"error in extract list {count}")
    return out


def main(input_file, output_file):
    # read input file
    df = pd.read_excel(input_file)
    topics = df["topic"].tolist()
    cluster_llms = df["cluster_llm"].tolist()

    # get embeddings
    embeddings = get_embeddings(topics)

    # extract main cluster names
    cluster_names = []
    with open("titles_o3.txt", "r") as f:
        titles = f.readlines()

    titles = [sanitize_for_excel(title.strip()) for title in titles]

    embedding_cluster = []
    best_k = len(titles)
    for i in range(best_k):
        embedding_cluster.append([])

    topic_cluster = []
    best_k = len(titles)
    for i in range(best_k):
        topic_cluster.append([])

    for m in range(len(titles)):
        for embedding, cluster_name, topic in zip(embeddings, cluster_llms, topics):
            if cluster_name == titles[m]:
                embedding_cluster[m].append(embedding)
                topic_cluster[m].append(topic)

    sub_cluster_names = []
    for cluster_count in tqdm(range(len(titles))):
        print(f"start {cluster_count} \n")
        # get best k and labels of kmeans with best_k
        best_k, labels = get_best_k(embedding_cluster[cluster_count])
        print(f"initial best_k {best_k}\n")

        # fill clusters
        clusters = []
        for i in range(best_k):
            clusters.append([])

        for i in range(len(clusters)):
            for topic, label in zip(topic_cluster[cluster_count], labels):
                if label == i:
                    clusters[i].append(topic)

        # get cluster names
        cluster_names = get_cluster_names(clusters)

        if len(cluster_names) > 1:
            # get embeddings for cluster names
            cluster_names_embeddings = get_embeddings(cluster_names)

            # get best k and labels of kmeans with best_k
            best_k_cluster_names, labels_cluster_names = get_best_k(cluster_names_embeddings)
            print(f"second best_k {best_k_cluster_names}\n")

            # fill clusters of cluster_names
            clusters_cluster_names = []
            for i in range(best_k_cluster_names):
                clusters_cluster_names.append([])

            for i in range(len(clusters_cluster_names)):
                for cluster_name, label in zip(cluster_names, labels_cluster_names):
                    if label == i:
                        clusters_cluster_names[i].append(cluster_name)

            # get refined sub cluster names for the clusters of cluster_names
            cluster_names_modify = modify_cluster_names(clusters_cluster_names, titles[cluster_count], best_k)
            cluster_names_modify_list = extract_list(cluster_names_modify, cluster_count)
            sub_cluster_names.append({"id": cluster_count, "cluster_name": titles[cluster_count], "sub_cluster_names": cluster_names_modify_list})

        else:
            sub_cluster_names.append({"id": cluster_count, "cluster_name": titles[cluster_count], "sub_cluster_names": []})

    # save cluster names
    if not output_file.endswith(".json"):
        output_file = output_file + ".json"

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(sub_cluster_names, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True)
    parser.add_argument("--output_file", type=str, required=True)
    args = parser.parse_args()

    # extracting topics
    main(args.input_file, args.output_file)
test_saeed_tweet_2.ipynb (new file, 14724 lines)
File diff suppressed because one or more lines are too long
topic_recreation.py (new file, 142 lines)
@@ -0,0 +1,142 @@
import asyncio
import aiohttp
import time
import re
import pandas as pd
import json
from tqdm import tqdm


class TopicRecreation:
    def __init__(self):

        self.instruction = f"""
        You will be given a tweet text.
        Your task is to write a phrase category that this tweet is related to.
        This should be a combination of action + category:

        for example :
        انتقاد از سیاست ایران
        توهین به مقامات کشور
        حمایت از نظام جمهوری اسلامی
        جنگ اسراییل و قطر
        مسایل مربوط به موضوع هسته ای ایران
        مسایل مربوط به افغانستان

        The category should be in Persian.

        # Rules
        - If it does not have a specific meaning then write "متفرقه"
        - Be specific about the countries.
        - Do not be specific about the people.
        - You can consider different categories and write an action + category or just a simple category.

        Just return the category, do not include any other text.
        """


    async def run_llm(self, session, tweet):
        """
        Ask the LLM to generate a category phrase for a single tweet.

        Args:
            session: The aiohttp session to use for the request.
            tweet: The tweet text to categorize.

        Returns:
            The category of the tweet.
        """
        headers = {"Content-Type": "application/json"}

        # drop @mentions before sending the tweet to the model
        tweet = " ".join([m for m in tweet.split(" ") if "@" not in m])

        input_message = f"""{{"tweet": "{tweet}"}}"""
        messages = [{"role": "system", "content": self.instruction}, {"role": "user", "content": input_message}]

        payload = {
            "model": "google/gemma-3-27b-it",
            "messages": messages,
            "max_tokens": 500
        }
        # try:
        async with session.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload) as resp:
            resp.raise_for_status()
            response = await resp.json()

            out = response['choices'][0]['message']['content']

            return out

        # except Exception as e:
        #     print(f"Error in LLM categorization: {e}")
        #     return 0


    async def run_llm_async(self, tweets):
        """
        Send all tweet requests concurrently.

        Args:
            tweets: The tweets to categorize.

        Returns:
            The categories of the tweets.
        """
        async with aiohttp.ClientSession() as session:
            tasks = [self.run_llm(session, tweet) for tweet in tweets]
            scores_embed = await asyncio.gather(*tasks)
            return scores_embed

    def sanitize_for_excel(self, df):
        def _sanitize_for_excel(text):
            """Remove zero-width and bidi control characters that can confuse Excel rendering."""
            if text is None:
                return ""
            s = str(text)
            # Characters to remove: ZWNJ, ZWJ, RLM, LRM, RLE, LRE, PDF, BOM, Tatweel
            remove_chars = [
                "\u200c",  # ZWNJ
                "\u200d",  # ZWJ
                "\u200e",  # LRM
                "\u200f",  # RLM
                "\u202a",  # LRE
                "\u202b",  # RLE
                "\u202c",  # PDF
                "\u202d",  # LRO
                "\u202e",  # RLO
                "\ufeff",  # BOM
                "\u0640",  # Tatweel
            ]
            for ch in remove_chars:
                s = s.replace(ch, "")
            # Normalize whitespace
            s = re.sub(r"\s+", " ", s).strip()
            return s

        df_copy = df.copy()
        for m in ["category"]:
            for i in range(len(df_copy[m])):
                df_copy.loc[i, m] = _sanitize_for_excel(df_copy.loc[i, m])

        return df_copy

    def start_process(self, input_path, output_path):
        df = pd.read_excel(input_path)
        df_copy = df.copy()

        tweets = df["tweet"].tolist()

        for i in tqdm(range(0, len(tweets), 1000)):
            start_time = time.time()
            result_list = asyncio.run(self.run_llm_async(tweets[i:i+1000]))
            end_time = time.time()
            print(f"Time taken for LLM categorization: {end_time - start_time} seconds")

            time.sleep(5)

            for j, result in enumerate(result_list):
                df_copy.at[i+j, "category"] = result

        df_copy = self.sanitize_for_excel(df_copy)
        df_copy.to_excel(output_path)


if __name__ == "__main__":
    llm = TopicRecreation()
    llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic.xlsx", "/home/firouzi/trend_grouping_new/tweet_topic_recreation.xlsx")