Commit 104aef2eef: add sub cluster
Parent: 78656d9f4d

README.md | 14 changed lines
@@ -32,9 +32,17 @@ A pipeline for clustering tweets
 By first clustering similar names (step 5), the input to GPT became more organized,
 which made step 6 much more effective.
 
-## How to use
+## How to extract the main cluster
 
-You should give an Excel file that has a column named "tweets" to the command below.
+You should give an Excel file that has a column named "tweet" to the command below.
 Overall, it will take about 15 hours for 40,000 tweets.
 
-python3 clustering_pipeline.py --input_file tweets_file.xlsx --output_file tweets_file_output.xlsx
+python3 clustering_pipeline.py --input_file tweets_file.xlsx --output_file tweets_file_cluster.xlsx
+
+
+## How to extract sub clusters
+
+You should first run the command above, which will give you an Excel file with the columns "topic" and "cluster_llm".
+
+
+python3 sub_clustering_pipeline.py --input_file tweets_file_cluster.xlsx --output_file tweets_file_sub_cluster.xlsx
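[Editor's note] For context, a minimal sketch of preparing the input file the updated README describes, assuming only that the pipeline needs a column literally named "tweet"; the file name and contents are illustrative:

    # Build a toy input file with the single required "tweet" column.
    import pandas as pd

    tweets = ["first example tweet", "second example tweet"]
    pd.DataFrame({"tweet": tweets}).to_excel("tweets_file.xlsx", index=False)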
@@ -196,4 +196,4 @@ if __name__ == "__main__":
 
     # apply clustering
     post_cluster = PostClusterLLM()
-    post_cluster.start_process(topics_file, args.output_file)
+    post_cluster.start_process(topic_file, titles_file, args.output_file)
@@ -153,11 +153,11 @@ class PostClusterLLM:
 
         return df_copy
 
-    def start_process(self, input_path, output_path):
+    def start_process(self, input_path, titles_path, output_path):
        df = pd.read_excel(input_path)
        df_copy = df.copy()
 
-        with open("titles_o3.txt", "r") as f:
+        with open(titles_path, "r") as f:
            titles = f.readlines()
 
        titles = [title.strip() for title in titles]
@@ -200,4 +200,4 @@ class PostClusterLLM:
 
 if __name__ == "__main__":
     llm = PostClusterLLM()
-    llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic_recreation.xlsx", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3.xlsx")
+    llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic_recreation.xlsx", "titles_o3.txt", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3.xlsx")
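[Editor's note] Judging from the readlines()/strip() handling above, titles_o3.txt is read as plain text with one cluster title per line; a hypothetical example, reusing the cluster names from the prompt examples later in this commit:

    کتابخوانی
    فوتبال جام جهانی
    ساختمان سازی شهری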
post_sub_cluster.py | 214 added lines (new file)
@@ -0,0 +1,214 @@
import asyncio
import aiohttp
import time
import re
import pandas as pd
import json
from tqdm import tqdm


class PostSubClusterLLM:

    def __init__(self):

        self.instruction = f"""
        You will be given a title and a list of all cluster names.
        Your task is to find the best-fit cluster name for the title.
        Go through the list of all cluster names and find the best-fit cluster name for the title.
        If you find a good fit, return the cluster name.
        If you do not find a good fit, return "outlier" as "yes".

        #IMPORTANT:
        - if you found a good fit, use its id: {{"cluster" : "id_i"}}
        - if the title is not related to any of the cluster names, return {{"outlier" : "yes"}}

        Example-1:
        - Input:
            - title: "کتاب و درس"
            - all_cluster_names: {{
                "1" : "کتابخوانی",
                "2" : "فوتبال جام جهانی",
                "3" : "ساختمان سازی شهری" }}
        - Output:
            - {{"cluster" : "1"}}

        Example-2:
        - Input:
            - title: "لپتاب و کامپیوتر"
            - all_cluster_names: {{
                "1" : "کتابخوانی",
                "2" : "فوتبال جام جهانی",
                "3" : "ساختمان سازی شهری" }}
        - Output:
            - {{"outlier" : "yes"}}

        Example-3:
        - Input:
            - title: "ساختمان"
            - all_cluster_names: {{
                "1" : "کتابخوانی",
                "2" : "فوتبال جام جهانی",
                "3" : "ساختمان سازی شهری" }}
        - Output:
            - {{"cluster" : "3"}}

        Write a short reason, then give the final answer.
        """

    async def run_llm(self, session, topic, cluster_name, cluster_sub_cluster_list):
        """
        Ask the LLM to assign a topic title to one of its cluster's sub-clusters.

        Args:
            session: The aiohttp session to use for the request.
            topic: The topic title to classify.
            cluster_name: The main cluster the topic was assigned to.
            cluster_sub_cluster_list: List of dicts mapping each "cluster_name"
                to its "sub_cluster_names".

        Returns:
            The parsed answer ({"cluster": ...} or {"outlier": "yes"}),
            None for the "متفرقه" (miscellaneous) cluster, or 0 on error.
        """
        if cluster_name == "متفرقه":  # "miscellaneous": nothing to sub-cluster
            return None

        headers = {"Content-Type": "application/json"}

        # Look up the sub-cluster names that belong to this topic's main cluster.
        sub_cluster_names = []
        for cluster_sub_cluster in cluster_sub_cluster_list:
            if cluster_sub_cluster["cluster_name"] == cluster_name:
                sub_cluster_names = cluster_sub_cluster["sub_cluster_names"]
                break

        # Render the sub-cluster names as an id -> name mapping for the prompt.
        sub_cluster_names_str = "{\n"
        for count, value in enumerate(sub_cluster_names):
            sub_cluster_names_str += f"{count} : {value},\n"
        sub_cluster_names_str += "}"

        input_message = f"""{{"all_cluster_names": "{sub_cluster_names_str}", "title": "{topic}"}}"""
        messages = [{"role": "system", "content": self.instruction}, {"role": "user", "content": input_message}]

        payload = {
            "model": "google/gemma-3-27b-it",
            "messages": messages,
            "max_tokens": 500
        }
        try:
            async with session.post("http://192.168.130.206:4001/v1/chat/completions", headers=headers, json=payload) as resp:
                resp.raise_for_status()
                response = await resp.json()

            out = response['choices'][0]['message']['content']
            print("--------------------------------")
            print(f"title: {topic}")
            print(f"cluster_name: {cluster_name}")
            print(out)

            # The model answers in free text; pull out the first JSON verdict.
            pattern = r'(\{"cluster".*?\})'
            matches = re.findall(pattern, out)
            for m in matches:
                out_json = json.loads(m)
                print(f"out_json: {out_json}")
                if out_json.get("cluster") is not None:
                    print(sub_cluster_names[int(out_json.get("cluster"))])
                    return out_json

            pattern = r'(\{"outlier".*?\})'
            matches = re.findall(pattern, out)
            for m in matches:
                out_json = json.loads(m)
                print(f"out_json: {out_json}")
                print("outlier")
                return out_json
        except Exception as e:
            print(f"Error in llm as reranker: {e}")
            return 0

    async def run_llm_async(self, topics, cluster_names, cluster_sub_cluster_dict):
        """
        Send all classification requests concurrently.

        Args:
            topics: The topic titles to classify.
            cluster_names: The main cluster of each topic.
            cluster_sub_cluster_dict: The cluster-to-sub-cluster mapping.

        Returns:
            The per-topic answers, in input order.
        """
        async with aiohttp.ClientSession() as session:
            tasks = [self.run_llm(session, topic, cluster_name, cluster_sub_cluster_dict) for topic, cluster_name in zip(topics, cluster_names)]
            scores_embed = await asyncio.gather(*tasks)
            return scores_embed

    def sanitize_for_excel(self, df):
        def _sanitize_for_excel(text):
            """Remove zero-width and bidi control characters that can confuse Excel rendering."""
            if text is None:
                return ""
            s = str(text)
            # Characters to remove: ZWNJ, ZWJ, LRM, RLM, LRE, RLE, PDF, LRO, RLO, BOM, Tatweel
            remove_chars = [
                "\u200c",  # ZWNJ
                "\u200d",  # ZWJ
                "\u200e",  # LRM
                "\u200f",  # RLM
                "\u202a",  # LRE
                "\u202b",  # RLE
                "\u202c",  # PDF
                "\u202d",  # LRO
                "\u202e",  # RLO
                "\ufeff",  # BOM
                "\u0640",  # Tatweel
            ]
            for ch in remove_chars:
                s = s.replace(ch, "")
            # Normalize whitespace
            s = re.sub(r"\s+", " ", s).strip()
            return s

        df_copy = df.copy()
        for m in df.columns:
            for i in range(len(df_copy[m])):
                df_copy.loc[i, m] = _sanitize_for_excel(df_copy.loc[i, m])

        return df_copy

    def start_process(self, input_path, titles_path, output_path):
        df = pd.read_excel(input_path)
        df_copy = df.copy()

        # The titles file maps each main cluster to its sub-cluster names.
        with open(titles_path, "r") as f:
            cluster_sub_cluster_list = json.load(f)

        batch_size = 100
        for i in tqdm(range(0, len(df["topic"]), batch_size)):
            start_time = time.time()
            result_list = asyncio.run(self.run_llm_async(df["topic"][i:i+batch_size], df["cluster_llm"][i:i+batch_size], cluster_sub_cluster_list))
            end_time = time.time()
            print(f"Time taken for llm as reranker: {end_time - start_time} seconds")
            time.sleep(5)

            for j, result in enumerate(result_list):
                try:
                    if result is None:
                        df_copy.at[i+j, "sub_cluster"] = "متفرقه"  # "miscellaneous"
                    elif result.get("outlier") == "yes":
                        df_copy.at[i+j, "sub_cluster"] = "موارد دیگر"  # "other"
                    elif result.get("cluster") is not None:
                        # Map the returned sub-cluster id back to its name.
                        for cluster_sub_cluster in cluster_sub_cluster_list:
                            if cluster_sub_cluster["cluster_name"] == df["cluster_llm"][i+j]:
                                sub_cluster_names = cluster_sub_cluster["sub_cluster_names"]
                                break
                        df_copy.at[i+j, "sub_cluster"] = sub_cluster_names[int(result["cluster"])]
                    else:
                        df_copy.at[i+j, "sub_cluster"] = "موارد دیگر"  # "other"
                except Exception as e:
                    print(f"Error in result_list: {e}")
                    df_copy.at[i+j, "sub_cluster"] = "موارد دیگر"  # "other"

                print(df_copy.at[i+j, "sub_cluster"])

        df_copy = self.sanitize_for_excel(df_copy)
        df_copy.to_excel(output_path)


if __name__ == "__main__":
    llm = PostSubClusterLLM()
    llm.start_process("/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3.xlsx", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3_subcategory.json", "/home/firouzi/trend_grouping_new/tweet_topic_recreation_post_o3_subcategory.xlsx")
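[Editor's note] From how start_process and run_llm consume it, the sub-cluster titles file appears to be a JSON list of objects, each pairing a "cluster_name" with an ordered "sub_cluster_names" array; the id the LLM returns indexes into that array. A hypothetical example (sub-cluster names are placeholders):

    [
      {"cluster_name": "کتابخوانی", "sub_cluster_names": ["sub_topic_0", "sub_topic_1"]},
      {"cluster_name": "فوتبال جام جهانی", "sub_cluster_names": ["sub_topic_0", "sub_topic_1"]}
    ]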
@@ -12,6 +12,8 @@ import random
 import re
 import json
 
+from post_sub_cluster import PostSubClusterLLM
+
 
 START_K = 2
 END_K = 60
@@ -282,4 +284,9 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # extracting topics
-    main(args.input_file, args.output_file)
+    sub_cluster_file = args.output_file.replace(".xlsx", "_sub_cluster.json")
+    main(args.input_file, sub_cluster_file)
+
+    # apply clustering
+    post_sub_cluster = PostSubClusterLLM()
+    post_sub_cluster.start_process(args.input_file, sub_cluster_file, args.output_file)
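[Editor's note] Putting the pieces together, one run of the updated pipeline now covers both steps. With the README's example names, the intermediate and final outputs would be (a sketch of the derived file names, not verified output):

    python3 sub_clustering_pipeline.py --input_file tweets_file_cluster.xlsx --output_file tweets_file_sub_cluster.xlsx
    # intermediate: tweets_file_sub_cluster_sub_cluster.json  (sub-cluster titles per main cluster)
    # final:        tweets_file_sub_cluster.xlsx              (adds a "sub_cluster" column)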