# Patterns for Discord markup that must not contribute words to a cluster
# name. Compiled once at import time instead of on every call.
_URL_RE = re.compile(r'http[s]?://\S+')
# User (<@id>, <@!id>), role (<@&id>) and channel (<#id>) mentions.
_MENTION_RE = re.compile(r'<(?:@[!&]?|#)\d+>')
# Static (<:name:id>) and animated (<a:name:id>) custom emojis.
_CUSTOM_EMOJI_RE = re.compile(r'<a?:\w+:\d+>')
_NON_WORD_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')

# Common English / chat filler words that carry no topical meaning.
_STOP_WORDS = frozenset({
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
    'by', 'from', 'up', 'about', 'into', 'through', 'during', 'before', 'after',
    'above', 'below', 'between', 'among', 'until', 'without', 'under', 'over',
    'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
    'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might',
    'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
    'my', 'your', 'his', 'its', 'our', 'their', 'this', 'that', 'these', 'those',
    'just', 'like', 'get', 'know', 'think', 'see', 'go', 'come', 'say', 'said',
    'yeah', 'yes', 'no', 'oh', 'ok', 'okay', 'well', 'so', 'if', 'when',
    'what', 'where', 'why', 'how', 'who', 'which', 'than', 'then', 'now', 'here',
    'there', 'also', 'too', 'very', 'really', 'pretty', 'much', 'more', 'most',
    'some', 'any', 'all', 'many', 'few', 'little', 'big', 'small', 'good', 'bad',
})


def summarize_cluster_content(cluster_messages, max_words=3):
    """
    Generate a short, human-readable name for a cluster from its messages.

    The messages are joined, lower-cased and stripped of URLs, Discord
    mentions, custom emojis (static and animated) and punctuation. The most
    frequent remaining words that are not stop words become the name, with
    near-duplicates (words sharing a 4-character prefix) collapsed.

    Args:
        cluster_messages: Iterable of message contents in the cluster. Missing
            values (None / NaN) are ignored for naming but still counted.
        max_words: Maximum number of words in the cluster name. Values below 1
            are treated as 1 so the name is never empty.

    Returns:
        str: Generated cluster name, e.g. "Deploy + Server (42)", or one of
        the fallbacks "Empty Cluster", "Empty Content", "Special Characters"
        or "Chat (N msgs)".
    """
    # Materialise once so any iterable (list, Series, ndarray) is accepted and
    # truthiness / len() are well defined.
    messages = [] if cluster_messages is None else list(cluster_messages)
    if not messages:
        return "Empty Cluster"

    # A non-positive limit previously produced a name like " (3)".
    max_words = max(1, max_words)

    text = " ".join(str(msg) for msg in messages if pd.notna(msg))
    if not text.strip():
        return "Empty Content"

    text = text.lower()
    # Markup must go before punctuation stripping, otherwise the angle
    # brackets disappear and emoji names survive as ordinary words.
    for pattern in (_URL_RE, _MENTION_RE, _CUSTOM_EMOJI_RE):
        text = pattern.sub('', text)
    text = _NON_WORD_RE.sub(' ', text)
    text = _WHITESPACE_RE.sub(' ', text).strip()

    if not text:
        return "Special Characters"

    # Keep alphabetic, reasonably sized words that are not stop words.
    candidates = [
        word for word in text.split()
        if word not in _STOP_WORDS
        and 3 <= len(word) <= 15
        and word.isalpha()
    ]
    if not candidates:
        return f"Chat ({len(messages)} msgs)"

    # Fetch twice as many as needed: some are dropped as near-duplicates.
    ranked = Counter(candidates).most_common(max_words * 2)

    selected_words = []
    for word, _count in ranked:
        is_similar = any(
            word.startswith(chosen[:4]) or chosen.startswith(word[:4])
            for chosen in selected_words
        )
        if is_similar:
            continue
        selected_words.append(word)
        if len(selected_words) >= max_words:
            break

    # The first ranked word is always accepted, so selected_words is never
    # empty here.
    label = " + ".join(selected_words).title()
    return f"{label} ({len(messages)})"


def generate_cluster_names(filtered_df, cluster_labels):
    """
    Generate names for all clusters based on their content.

    Args:
        filtered_df: DataFrame with message data; its 'content' column is read.
        cluster_labels: Sequence of cluster labels, one per row of
            ``filtered_df`` in row order (list, ndarray or Series).

    Returns:
        dict: Mapping from cluster_id to cluster_name. Label -1 is always
        named "Noise/Outliers". Empty when ``cluster_labels`` is None.

    Raises:
        ValueError: If the number of labels differs from the number of rows.
    """
    if cluster_labels is None:
        return {}

    # Coerce to ndarray: comparing a plain list with a scalar yields a single
    # bool rather than an element-wise mask.
    labels = np.asarray(cluster_labels)
    if len(labels) != len(filtered_df):
        raise ValueError(
            f"cluster_labels has {len(labels)} entries but filtered_df has "
            f"{len(filtered_df)} rows"
        )

    # Positional view of the column, so the DataFrame index does not matter.
    contents = filtered_df['content'].to_numpy()

    cluster_names = {}
    for cluster_id in np.unique(labels):
        if cluster_id == -1:
            cluster_names[cluster_id] = "Noise/Outliers"
            continue
        cluster_messages = contents[labels == cluster_id].tolist()
        cluster_names[cluster_id] = summarize_cluster_content(cluster_messages)

    return cluster_names
""" # Application settings -APP_TITLE = "Discord Chat Embeddings Visualizer" +APP_TITLE = "The Cult - Visualised" APP_ICON = "🗨️" APP_LAYOUT = "wide" @@ -14,6 +14,8 @@ CHAT_LOGS_PATH = "../../discord_chat_logs" DEFAULT_RANDOM_STATE = 42 DEFAULT_N_COMPONENTS = 2 DEFAULT_N_CLUSTERS = 5 +DEFAULT_DIMENSION_REDUCTION_METHOD = "t-SNE" +DEFAULT_CLUSTERING_METHOD = "None" # Visualization settings DEFAULT_POINT_SIZE = 8 diff --git a/apps/cluster_map/main.py b/apps/cluster_map/main.py index a8f6d9b..56a5097 100644 --- a/apps/cluster_map/main.py +++ b/apps/cluster_map/main.py @@ -17,10 +17,10 @@ from data_loader import ( from dimensionality_reduction import ( reduce_dimensions, apply_density_based_jittering ) -from clustering import apply_clustering +from clustering import apply_clustering, generate_cluster_names from visualization import ( create_visualization_plot, display_clustering_metrics, display_summary_stats, - display_clustering_results, display_data_table + display_clustering_results, display_data_table, display_cluster_summary ) @@ -51,11 +51,34 @@ def main(): # Get UI parameters params = get_all_ui_parameters(valid_df) + # Check if any sources are selected before proceeding + if not params['selected_sources']: + st.info("📂 **Select source files from the sidebar to begin visualization**") + st.markdown("### Available Data Sources:") + + # Show available sources as an informational table + source_info = [] + for source in valid_df['source_file'].unique(): + source_data = valid_df[valid_df['source_file'] == source] + source_info.append({ + 'Source File': source, + 'Messages': len(source_data), + 'Unique Authors': source_data['author_name'].nunique(), + 'Date Range': f"{source_data['timestamp_utc'].min()} to {source_data['timestamp_utc'].max()}" + }) + + import pandas as pd + source_df = pd.DataFrame(source_info) + st.dataframe(source_df, use_container_width=True, hide_index=True) + + st.markdown("👈 **Use the sidebar to select which sources to visualize**") + 
st.stop() + # Filter data filtered_df = filter_data(valid_df, params['selected_sources'], params['selected_authors']) if filtered_df.empty: - st.warning("No data matches the current filters!") + st.warning("No data matches the current filters! Try selecting different sources or authors.") st.stop() # Display performance warnings @@ -95,12 +118,22 @@ def main(): jitter_strength=params['jitter_strength'] ) + # Generate cluster names if clustering was applied + cluster_names = None + if cluster_labels is not None: + with st.spinner("Generating cluster names..."): + cluster_names = generate_cluster_names(filtered_df, cluster_labels) + # Display clustering metrics display_clustering_metrics( cluster_labels, silhouette_avg, calinski_harabasz, params['show_cluster_metrics'] ) + # Display cluster summary with names + if cluster_names: + display_cluster_summary(cluster_names, cluster_labels) + # Create and display the main plot fig = create_visualization_plot( reduced_embeddings=reduced_embeddings, @@ -113,7 +146,8 @@ def main(): point_opacity=params['point_opacity'], density_based_sizing=params['density_based_sizing'], size_variation=params['size_variation'], - enable_3d=params['enable_3d'] + enable_3d=params['enable_3d'], + cluster_names=cluster_names ) st.plotly_chart(fig, use_container_width=True) diff --git a/apps/cluster_map/ui_components.py b/apps/cluster_map/ui_components.py index a02c831..ec3e77c 100644 --- a/apps/cluster_map/ui_components.py +++ b/apps/cluster_map/ui_components.py @@ -7,7 +7,8 @@ import numpy as np from config import ( APP_TITLE, APP_ICON, APP_LAYOUT, METHOD_EXPLANATIONS, CLUSTERING_METHODS_REQUIRING_N_CLUSTERS, COMPUTATIONALLY_INTENSIVE_METHODS, - LARGE_DATASET_WARNING_THRESHOLD, MAX_DISPLAYED_AUTHORS + LARGE_DATASET_WARNING_THRESHOLD, MAX_DISPLAYED_AUTHORS, + DEFAULT_DIMENSION_REDUCTION_METHOD, DEFAULT_CLUSTERING_METHOD ) @@ -38,17 +39,23 @@ def create_method_controls(): ) # Dimension reduction method + method_options = ["PCA", "t-SNE", "UMAP", 
def create_clustering_controls(clustering_method):
    """Render the sidebar "Number of Clusters" slider and return its value.

    The slider is always shown. It is enabled only when ``clustering_method``
    is listed in CLUSTERING_METHODS_REQUIRING_N_CLUSTERS; otherwise it is
    disabled and its help text explains why the setting has no effect.
    """
    uses_n_clusters = clustering_method in CLUSTERING_METHODS_REQUIRING_N_CLUSTERS

    # Membership is checked first, so it wins even for the "None" method.
    if uses_n_clusters:
        help_text = "Number of clusters to create. This setting affects the clustering algorithm."
    elif clustering_method == "None":
        help_text = "Clustering is disabled. This setting has no effect."
    else:
        help_text = f"{clustering_method} automatically determines the number of clusters. This setting has no effect."

    return st.sidebar.slider(
        "Number of Clusters",
        min_value=2,
        max_value=20,
        value=5,
        disabled=not uses_n_clusters,
        help=help_text,
    )
def display_cluster_summary(cluster_names, cluster_labels):
    """Display a table of cluster names with their sizes and shares.

    Args:
        cluster_names: Mapping from cluster_id to display name.
        cluster_labels: Sequence of cluster labels, one per message (list,
            ndarray or Series).

    Renders nothing when there are no names or no labels.
    """
    if not cluster_names or cluster_labels is None:
        return

    # NOTE(review): the visible patch adds no pandas import to this module;
    # importing locally keeps the function self-contained. Hoist to the file
    # header if pandas is already imported there.
    import pandas as pd

    # Coerce to ndarray: comparing a plain list with a scalar yields a single
    # bool, which would report every cluster as empty.
    labels = np.asarray(cluster_labels)
    total = len(labels)
    if total == 0:
        # Avoids a division by zero in the percentage column.
        return

    st.subheader("🏷️ Cluster Summary")

    rows = []
    for cluster_id, name in cluster_names.items():
        # Plain int so sorting and display do not depend on numpy scalars.
        count = int(np.count_nonzero(labels == cluster_id))
        rows.append({
            'Cluster ID': cluster_id,
            'Cluster Name': name,
            'Message Count': count,
            'Percentage': f"{100 * count / total:.1f}%",
        })

    # Largest clusters first.
    rows.sort(key=lambda row: row['Message Count'], reverse=True)

    st.dataframe(pd.DataFrame(rows), use_container_width=True, hide_index=True)