import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sklearn.cluster import AgglomerativeClustering
from scipy.sparse import csr_matrix, vstack
from tqdm import tqdm


def batched_sparse_cosine_similarity(X, batch_size=1000):
    """Compute a cosine-similarity matrix in row batches, keeping the result sparse."""
    n_samples = X.shape[0]
    n_batches = (n_samples - 1) // batch_size + 1

    # Convert X to a sparse matrix if it isn't already
    if not isinstance(X, csr_matrix):
        X = csr_matrix(X)

    # L2-normalize the rows while keeping the matrix sparse
    # (element-wise division by the row norms would densify the matrix and can divide by zero)
    X_normalized = normalize(X, norm='l2', axis=1)

    similarities = []
    for i in tqdm(range(n_batches)):
        start = i * batch_size
        end = min((i + 1) * batch_size, n_samples)
        batch = X_normalized[start:end]
        batch_similarities = cosine_similarity(batch, X_normalized, dense_output=False)
        similarities.append(batch_similarities)

    # Stack the per-batch blocks into the full similarity matrix
    return vstack(similarities)

# Usage example:
# X = your_input_data  # your feature matrix
# similarity_matrix = batched_sparse_cosine_similarity(X)


def cluster_keywords(keywords, distance_threshold=4.5):
    """TF-IDF vectorize the keywords and group them with agglomerative clustering."""
    if len(keywords) <= 1:
        return [0] * len(keywords)

    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(keywords)

    # For very large keyword lists, switch to the batched helper above:
    # similarity_matrix = batched_sparse_cosine_similarity(X).toarray()
    similarity_matrix = cosine_similarity(X)

    # Each row of (1 - similarity) is treated as a feature vector for Ward linkage.
    # Note: the `metric` keyword replaced `affinity` in scikit-learn 1.2.
    clustering_model = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        metric='euclidean',
        linkage='ward',
    )
    return clustering_model.fit_predict(1 - similarity_matrix)


# Load your Excel file
file_path = "Downloads/filtered_scrape_data (1).xlsx"
df = pd.read_excel(file_path)

# Keywords are expected in a column named 'Keywords'.
# Drop empty rows first so the cluster labels line up with the DataFrame rows.
df = df.dropna(subset=['Keywords']).reset_index(drop=True)
keywords = df['Keywords'].tolist()

# Initial clustering
df['Cluster'] = cluster_keywords(keywords)

# Group keywords by their cluster labels
grouped_keywords = df.groupby('Cluster')['Keywords'].apply(list)


def iterative_clustering(groups, threshold=20, max_iterations=5):
    """Re-cluster groups larger than `threshold` with a tighter distance threshold.

    Each group taken off the queue counts as one iteration, so raise
    `max_iterations` if many large groups need splitting; anything still
    queued when the cap is hit is kept as-is.
    """
    final_groups = []
    groups_to_process = list(groups)
    iteration = 0

    while groups_to_process and iteration < max_iterations:
        iteration += 1
        current_group = groups_to_process.pop(0)

        if len(current_group) <= threshold:
            final_groups.append(current_group)
            continue

        sub_clusters = cluster_keywords(current_group, distance_threshold=1.8)
        unique_clusters = np.unique(sub_clusters)

        if len(unique_clusters) == 1:
            # The group cannot be split further, so keep it as is
            final_groups.append(current_group)
        else:
            for label in unique_clusters:
                sub_group = np.array(current_group)[sub_clusters == label].tolist()
                if threshold < len(sub_group) < len(current_group):
                    groups_to_process.append(sub_group)
                else:
                    final_groups.append(sub_group)

    # If the iteration cap was reached, keep the remaining groups as they are
    final_groups.extend(groups_to_process)
    return final_groups


# Apply iterative clustering
final_groups = iterative_clustering(grouped_keywords.values)

# Output the final grouped keywords
for i, group in enumerate(final_groups):
    print(f"Group {i}: {group}")

# Save the results back to Excel (one comma-separated string of keywords per group)
result_df = pd.DataFrame({
    'Group': range(len(final_groups)),
    'Keywords': [", ".join(map(str, group)) for group in final_groups],
})
result_df.to_excel("clustered_keywords_eos.xlsx", index=False)

print(f"Total number of groups: {len(final_groups)}")
print(f"Largest group size: {max(len(group) for group in final_groups)}")
print(f"Smallest group size: {min(len(group) for group in final_groups)}")