From 4ca7e8ab61bc526a61f8d0e4a2b1ecc39d71cde0 Mon Sep 17 00:00:00 2001 From: Azeem Fidahusein Date: Mon, 11 Aug 2025 02:37:21 +0100 Subject: [PATCH] refactor --- apps/cluster_map/cluster.py | 12 + apps/cluster_map/clustering.py | 99 ++++++++ apps/cluster_map/config.py | 73 ++++++ apps/cluster_map/data_loader.py | 86 +++++++ apps/cluster_map/dimensionality_reduction.py | 211 +++++++++++++++++ apps/cluster_map/main.py | 132 +++++++++++ apps/cluster_map/streamlit_app.py | 233 ------------------ apps/cluster_map/test_debug.py | 43 ++++ apps/cluster_map/ui_components.py | 236 +++++++++++++++++++ apps/cluster_map/visualization.py | 225 ++++++++++++++++++ 10 files changed, 1117 insertions(+), 233 deletions(-) create mode 100644 apps/cluster_map/cluster.py create mode 100644 apps/cluster_map/clustering.py create mode 100644 apps/cluster_map/config.py create mode 100644 apps/cluster_map/data_loader.py create mode 100644 apps/cluster_map/dimensionality_reduction.py create mode 100644 apps/cluster_map/main.py delete mode 100644 apps/cluster_map/streamlit_app.py create mode 100644 apps/cluster_map/test_debug.py create mode 100644 apps/cluster_map/ui_components.py create mode 100644 apps/cluster_map/visualization.py diff --git a/apps/cluster_map/cluster.py b/apps/cluster_map/cluster.py new file mode 100644 index 0000000..cabc728 --- /dev/null +++ b/apps/cluster_map/cluster.py @@ -0,0 +1,12 @@ +""" +Discord Chat Embeddings Visualizer - Legacy Entry Point + +This file serves as a compatibility layer for the original cluster.py. +The application has been refactored into modular components for better maintainability. +""" + +# Import and run the main application +from main import main + +if __name__ == "__main__": + main() diff --git a/apps/cluster_map/clustering.py b/apps/cluster_map/clustering.py new file mode 100644 index 0000000..f2bc93c --- /dev/null +++ b/apps/cluster_map/clustering.py @@ -0,0 +1,99 @@ +""" +Clustering algorithms and evaluation metrics. +""" + +import numpy as np +import streamlit as st +from sklearn.cluster import SpectralClustering, AgglomerativeClustering, OPTICS +from sklearn.mixture import GaussianMixture +from sklearn.preprocessing import StandardScaler +from sklearn.metrics import silhouette_score, calinski_harabasz_score +import hdbscan +from config import DEFAULT_RANDOM_STATE + + +def apply_clustering(embeddings, clustering_method="None", n_clusters=5): + """ + Apply clustering algorithm to embeddings and return labels and metrics. + + Args: + embeddings: High-dimensional embeddings to cluster + clustering_method: Name of clustering algorithm + n_clusters: Number of clusters (for methods that require it) + + Returns: + tuple: (cluster_labels, silhouette_score, calinski_harabasz_score) + """ + if clustering_method == "None" or len(embeddings) <= n_clusters: + return None, None, None + + # Standardize embeddings for better clustering + scaler = StandardScaler() + scaled_embeddings = scaler.fit_transform(embeddings) + + cluster_labels = None + silhouette_avg = None + calinski_harabasz = None + + try: + if clustering_method == "HDBSCAN": + min_cluster_size = max(2, len(embeddings) // 20) # Adaptive min cluster size + clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, + min_samples=1, cluster_selection_epsilon=0.5) + cluster_labels = clusterer.fit_predict(scaled_embeddings) + + elif clustering_method == "Spectral Clustering": + clusterer = SpectralClustering(n_clusters=n_clusters, random_state=DEFAULT_RANDOM_STATE, + affinity='rbf', gamma=1.0) + cluster_labels = clusterer.fit_predict(scaled_embeddings) + + elif clustering_method == "Gaussian Mixture": + clusterer = GaussianMixture(n_components=n_clusters, random_state=DEFAULT_RANDOM_STATE, + covariance_type='full', max_iter=200) + cluster_labels = clusterer.fit_predict(scaled_embeddings) + + elif clustering_method == "Agglomerative (Ward)": + clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward') + cluster_labels = clusterer.fit_predict(scaled_embeddings) + + elif clustering_method == "Agglomerative (Complete)": + clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete') + cluster_labels = clusterer.fit_predict(scaled_embeddings) + + elif clustering_method == "OPTICS": + min_samples = max(2, len(embeddings) // 50) + clusterer = OPTICS(min_samples=min_samples, xi=0.05, min_cluster_size=0.1) + cluster_labels = clusterer.fit_predict(scaled_embeddings) + + # Calculate clustering quality metrics + if cluster_labels is not None and len(np.unique(cluster_labels)) > 1: + # Only calculate if we have multiple clusters and no noise-only clustering + valid_labels = cluster_labels[cluster_labels != -1] # Remove noise points for HDBSCAN/OPTICS + valid_embeddings = scaled_embeddings[cluster_labels != -1] + + if len(valid_labels) > 0 and len(np.unique(valid_labels)) > 1: + silhouette_avg = silhouette_score(valid_embeddings, valid_labels) + calinski_harabasz = calinski_harabasz_score(valid_embeddings, valid_labels) + + except Exception as e: + st.warning(f"Clustering failed: {str(e)}") + cluster_labels = None + + return cluster_labels, silhouette_avg, calinski_harabasz + + +def get_cluster_statistics(cluster_labels): + """Get basic statistics about clustering results""" + if cluster_labels is None: + return {} + + unique_clusters = np.unique(cluster_labels) + n_clusters = len(unique_clusters[unique_clusters != -1]) # Exclude noise cluster (-1) + n_noise = np.sum(cluster_labels == -1) + + return { + "n_clusters": n_clusters, + "n_noise_points": n_noise, + "cluster_distribution": np.bincount(cluster_labels[cluster_labels != -1]) if n_clusters > 0 else [], + "unique_clusters": unique_clusters + } diff --git a/apps/cluster_map/config.py b/apps/cluster_map/config.py new file mode 100644 index 0000000..8c58ede --- /dev/null +++ b/apps/cluster_map/config.py @@ -0,0 +1,73 @@ +""" +Configuration settings and constants for the Discord Chat Embeddings Visualizer. +""" + +# Application settings +APP_TITLE = "Discord Chat Embeddings Visualizer" +APP_ICON = "🗨️" +APP_LAYOUT = "wide" + +# File paths +CHAT_LOGS_PATH = "../../discord_chat_logs" + +# Algorithm parameters +DEFAULT_RANDOM_STATE = 42 +DEFAULT_N_COMPONENTS = 2 +DEFAULT_N_CLUSTERS = 5 + +# Visualization settings +DEFAULT_POINT_SIZE = 8 +DEFAULT_POINT_OPACITY = 0.7 +MAX_DISPLAYED_AUTHORS = 10 +MESSAGE_CONTENT_PREVIEW_LENGTH = 200 +MESSAGE_CONTENT_DISPLAY_LENGTH = 100 + +# Performance thresholds +LARGE_DATASET_WARNING_THRESHOLD = 1000 + +# Color palettes +PRIMARY_COLORS = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", + "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"] + +# Clustering method categories +CLUSTERING_METHODS_REQUIRING_N_CLUSTERS = [ + "Spectral Clustering", + "Gaussian Mixture", + "Agglomerative (Ward)", + "Agglomerative (Complete)" +] + +COMPUTATIONALLY_INTENSIVE_METHODS = { + "dimension_reduction": ["t-SNE", "Spectral Embedding"], + "clustering": ["Spectral Clustering", "OPTICS"] +} + +# Method explanations +METHOD_EXPLANATIONS = { + "dimension_reduction": { + "PCA": "Linear, fast, preserves global variance", + "t-SNE": "Non-linear, good for local structure, slower", + "UMAP": "Balanced speed/quality, preserves local & global structure", + "Spectral Embedding": "Uses graph theory, good for non-convex clusters", + "Force-Directed": "Physics-based layout, creates natural spacing" + }, + "clustering": { + "HDBSCAN": "Density-based, finds variable density clusters, handles noise", + "Spectral Clustering": "Uses eigenvalues, good for non-convex shapes", + "Gaussian Mixture": "Probabilistic, assumes gaussian distributions", + "Agglomerative (Ward)": "Hierarchical, minimizes within-cluster variance", + "Agglomerative (Complete)": "Hierarchical, minimizes maximum distance", + "OPTICS": "Density-based, finds clusters of varying densities" + }, + "separation": { + "Spread Factor": "Applies repulsive forces between nearby points", + "Smart Jittering": "Adds intelligent noise to separate overlapping points", + "Density-Based Jittering": "Stronger separation in crowded areas", + "Perplexity Factor": "Controls t-SNE's focus on local vs global structure", + "Min Distance Factor": "Controls UMAP's point packing tightness" + }, + "metrics": { + "Silhouette Score": "Higher is better (range: -1 to 1)", + "Calinski-Harabasz": "Higher is better, measures cluster separation" + } +} diff --git a/apps/cluster_map/data_loader.py b/apps/cluster_map/data_loader.py new file mode 100644 index 0000000..73105f4 --- /dev/null +++ b/apps/cluster_map/data_loader.py @@ -0,0 +1,86 @@ +""" +Data loading and parsing utilities for Discord chat logs. +""" + +import pandas as pd +import numpy as np +import streamlit as st +import ast +from pathlib import Path +from config import CHAT_LOGS_PATH + + +@st.cache_data +def load_all_chat_data(): + """Load all CSV files from the discord_chat_logs folder""" + chat_logs_path = Path(CHAT_LOGS_PATH) + + with st.expander("📁 Loading Details", expanded=False): + # Display the path for debugging + st.write(f"Looking for CSV files in: {chat_logs_path}") + st.write(f"Path exists: {chat_logs_path.exists()}") + + all_data = [] + + for csv_file in chat_logs_path.glob("*.csv"): + try: + df = pd.read_csv(csv_file) + df['source_file'] = csv_file.stem # Add source file name + all_data.append(df) + st.write(f"✅ Loaded {len(df)} messages from {csv_file.name}") + except Exception as e: + st.error(f"❌ Error loading {csv_file.name}: {e}") + + if all_data: + combined_df = pd.concat(all_data, ignore_index=True) + st.success(f"🎉 Successfully loaded {len(combined_df)} total messages from {len(all_data)} files") + else: + st.error("No data loaded!") + combined_df = pd.DataFrame() + + return combined_df if all_data else pd.DataFrame() + + +@st.cache_data +def parse_embeddings(df): + """Parse the content_embedding column from string to numpy array""" + embeddings = [] + valid_indices = [] + + for idx, embedding_str in enumerate(df['content_embedding']): + try: + # Parse the string representation of the list + embedding = ast.literal_eval(embedding_str) + if isinstance(embedding, list) and len(embedding) > 0: + embeddings.append(embedding) + valid_indices.append(idx) + except Exception as e: + continue + + embeddings_array = np.array(embeddings) + valid_df = df.iloc[valid_indices].copy() + + st.info(f"📊 Parsed {len(embeddings)} valid embeddings from {len(df)} messages") + st.info(f"🔢 Embedding dimension: {embeddings_array.shape[1] if len(embeddings) > 0 else 0}") + + return embeddings_array, valid_df + + +def filter_data(df, selected_sources, selected_authors): + """Filter dataframe by selected sources and authors""" + if not selected_sources: + selected_sources = df['source_file'].unique() + + filtered_df = df[ + (df['source_file'].isin(selected_sources)) & + (df['author_name'].isin(selected_authors)) + ] + + return filtered_df + + +def get_filtered_embeddings(embeddings, valid_df, filtered_df): + """Get embeddings corresponding to filtered dataframe""" + filtered_indices = filtered_df.index.tolist() + filtered_embeddings = embeddings[[i for i, idx in enumerate(valid_df.index) if idx in filtered_indices]] + return filtered_embeddings diff --git a/apps/cluster_map/dimensionality_reduction.py b/apps/cluster_map/dimensionality_reduction.py new file mode 100644 index 0000000..f066eaf --- /dev/null +++ b/apps/cluster_map/dimensionality_reduction.py @@ -0,0 +1,211 @@ +""" +Dimensionality reduction algorithms and point separation techniques. +""" + +import numpy as np +import streamlit as st +from sklearn.decomposition import PCA +from sklearn.manifold import TSNE, SpectralEmbedding +from sklearn.preprocessing import StandardScaler +from sklearn.neighbors import NearestNeighbors +from scipy.spatial.distance import pdist, squareform +from scipy.optimize import minimize +import umap +from config import DEFAULT_RANDOM_STATE + + +def apply_adaptive_spreading(embeddings, spread_factor=1.0): + """ + Apply adaptive spreading to push apart nearby points while preserving global structure. + Uses a force-based approach where closer points repel more strongly. + """ + if spread_factor <= 0: + return embeddings + + embeddings = embeddings.copy() + n_points = len(embeddings) + + print(f"DEBUG: Applying adaptive spreading to {n_points} points with factor {spread_factor}") + + if n_points < 2: + return embeddings + + # For very large datasets, skip spreading to avoid hanging + if n_points > 1000: + print(f"DEBUG: Large dataset ({n_points} points), skipping adaptive spreading...") + return embeddings + + # Calculate pairwise distances + distances = squareform(pdist(embeddings)) + + # Apply force-based spreading with fewer iterations for large datasets + max_iterations = 3 if n_points > 500 else 5 + + for iteration in range(max_iterations): + if iteration % 2 == 0: # Progress indicator + print(f"DEBUG: Spreading iteration {iteration + 1}/{max_iterations}") + + forces = np.zeros_like(embeddings) + + for i in range(n_points): + for j in range(i + 1, n_points): + diff = embeddings[i] - embeddings[j] + dist = np.linalg.norm(diff) + + if dist > 0: + # Repulsive force inversely proportional to distance + force_magnitude = spread_factor / (dist ** 2 + 0.01) + force_direction = diff / dist + force = force_magnitude * force_direction + + forces[i] += force + forces[j] -= force + + # Apply forces with damping + embeddings += forces * 0.1 + + print(f"DEBUG: Adaptive spreading complete") + return embeddings + + +def force_directed_layout(high_dim_embeddings, n_components=2, spread_factor=1.0): + """ + Create a force-directed layout from high-dimensional embeddings. + This creates more natural spacing between similar points. + """ + print(f"DEBUG: Starting force-directed layout with {len(high_dim_embeddings)} points...") + + # For large datasets, fall back to PCA + spreading to avoid hanging + if len(high_dim_embeddings) > 500: + print(f"DEBUG: Large dataset ({len(high_dim_embeddings)} points), using PCA + spreading instead...") + pca = PCA(n_components=n_components, random_state=DEFAULT_RANDOM_STATE) + result = pca.fit_transform(high_dim_embeddings) + return apply_adaptive_spreading(result, spread_factor) + + # Start with PCA as initial layout + pca = PCA(n_components=n_components, random_state=DEFAULT_RANDOM_STATE) + initial_layout = pca.fit_transform(high_dim_embeddings) + print(f"DEBUG: Initial PCA layout computed...") + + # For simplicity, just apply spreading to the PCA result + # The original optimization was too computationally intensive + result = apply_adaptive_spreading(initial_layout, spread_factor) + print(f"DEBUG: Force-directed layout complete...") + return result + + +def calculate_local_density_scaling(embeddings, k=5): + """ + Calculate local density scaling factors to emphasize differences in dense regions. + """ + if len(embeddings) < k: + return np.ones(len(embeddings)) + + # Find k nearest neighbors for each point + nn = NearestNeighbors(n_neighbors=k+1) # +1 because first neighbor is the point itself + nn.fit(embeddings) + distances, indices = nn.kneighbors(embeddings) + + # Calculate local density (inverse of average distance to k nearest neighbors) + local_densities = 1.0 / (np.mean(distances[:, 1:], axis=1) + 1e-6) + + # Normalize densities + local_densities = (local_densities - np.min(local_densities)) / (np.max(local_densities) - np.min(local_densities) + 1e-6) + + return local_densities + + +def apply_density_based_jittering(embeddings, density_scaling=True, jitter_strength=0.1): + """ + Apply smart jittering that's stronger in dense regions to separate overlapping points. + """ + if not density_scaling: + # Simple random jittering + noise = np.random.normal(0, jitter_strength, embeddings.shape) + return embeddings + noise + + # Calculate local densities + densities = calculate_local_density_scaling(embeddings) + + # Apply density-proportional jittering + jittered = embeddings.copy() + for i in range(len(embeddings)): + # More jitter in denser regions + jitter_amount = jitter_strength * (1 + densities[i]) + noise = np.random.normal(0, jitter_amount, embeddings.shape[1]) + jittered[i] += noise + + return jittered + + +def reduce_dimensions(embeddings, method="PCA", n_components=2, spread_factor=1.0, + perplexity_factor=1.0, min_dist_factor=1.0): + """Apply dimensionality reduction with enhanced separation""" + + # Convert to numpy array if it's not already + embeddings = np.array(embeddings) + + print(f"DEBUG: Starting {method} with {len(embeddings)} embeddings, shape: {embeddings.shape}") + + # Standardize embeddings for better processing + scaler = StandardScaler() + scaled_embeddings = scaler.fit_transform(embeddings) + print(f"DEBUG: Embeddings standardized") + + # Apply the selected dimensionality reduction method + if method == "PCA": + print(f"DEBUG: Applying PCA...") + reducer = PCA(n_components=n_components, random_state=DEFAULT_RANDOM_STATE) + reduced_embeddings = reducer.fit_transform(scaled_embeddings) + # Apply spreading to PCA results + print(f"DEBUG: Applying spreading...") + reduced_embeddings = apply_adaptive_spreading(reduced_embeddings, spread_factor) + + elif method == "t-SNE": + # Adjust perplexity based on user preference and data size + base_perplexity = min(30, len(embeddings)-1) + adjusted_perplexity = max(5, min(50, int(base_perplexity * perplexity_factor))) + print(f"DEBUG: Applying t-SNE with perplexity {adjusted_perplexity}...") + + reducer = TSNE(n_components=n_components, random_state=DEFAULT_RANDOM_STATE, + perplexity=adjusted_perplexity, n_iter=1000, + early_exaggeration=12.0 * spread_factor, # Increase early exaggeration for more separation + learning_rate='auto') + reduced_embeddings = reducer.fit_transform(scaled_embeddings) + + elif method == "UMAP": + # Adjust UMAP parameters for better local separation + n_neighbors = min(15, len(embeddings)-1) + min_dist = 0.1 * min_dist_factor + spread = 1.0 * spread_factor + print(f"DEBUG: Applying UMAP with n_neighbors={n_neighbors}, min_dist={min_dist}...") + + reducer = umap.UMAP(n_components=n_components, random_state=DEFAULT_RANDOM_STATE, + n_neighbors=n_neighbors, min_dist=min_dist, + spread=spread, local_connectivity=2.0) + reduced_embeddings = reducer.fit_transform(scaled_embeddings) + + elif method == "Spectral Embedding": + n_neighbors = min(10, len(embeddings)-1) + print(f"DEBUG: Applying Spectral Embedding with n_neighbors={n_neighbors}...") + reducer = SpectralEmbedding(n_components=n_components, random_state=DEFAULT_RANDOM_STATE, + n_neighbors=n_neighbors) + reduced_embeddings = reducer.fit_transform(scaled_embeddings) + # Apply spreading to spectral results + print(f"DEBUG: Applying spreading...") + reduced_embeddings = apply_adaptive_spreading(reduced_embeddings, spread_factor) + + elif method == "Force-Directed": + # New method: Use force-directed layout for natural spreading + print(f"DEBUG: Applying Force-Directed layout...") + reduced_embeddings = force_directed_layout(scaled_embeddings, n_components, spread_factor) + + else: + # Fallback to PCA + print(f"DEBUG: Unknown method {method}, falling back to PCA...") + reducer = PCA(n_components=n_components, random_state=DEFAULT_RANDOM_STATE) + reduced_embeddings = reducer.fit_transform(scaled_embeddings) + reduced_embeddings = apply_adaptive_spreading(reduced_embeddings, spread_factor) + + print(f"DEBUG: Dimensionality reduction complete. Output shape: {reduced_embeddings.shape}") + return reduced_embeddings diff --git a/apps/cluster_map/main.py b/apps/cluster_map/main.py new file mode 100644 index 0000000..395e6f9 --- /dev/null +++ b/apps/cluster_map/main.py @@ -0,0 +1,132 @@ +""" +Main application logic for the Discord Chat Embeddings Visualizer. +""" + +import streamlit as st +import warnings +warnings.filterwarnings('ignore') + +# Import custom modules +from ui_components import ( + setup_page_config, display_title_and_description, get_all_ui_parameters, + display_performance_warnings +) +from data_loader import ( + load_all_chat_data, parse_embeddings, filter_data, get_filtered_embeddings +) +from dimensionality_reduction import ( + reduce_dimensions, apply_density_based_jittering +) +from clustering import apply_clustering +from visualization import ( + create_visualization_plot, display_clustering_metrics, display_summary_stats, + display_clustering_results, display_data_table +) + + +def main(): + """Main application function""" + # Set up page configuration + setup_page_config() + + # Display title and description + display_title_and_description() + + # Load data + with st.spinner("Loading chat data..."): + df = load_all_chat_data() + + if df.empty: + st.error("No data could be loaded. Please check the data directory.") + st.stop() + + # Parse embeddings + with st.spinner("Parsing embeddings..."): + embeddings, valid_df = parse_embeddings(df) + + if len(embeddings) == 0: + st.error("No valid embeddings found!") + st.stop() + + # Get UI parameters + params = get_all_ui_parameters(valid_df) + + # Filter data + filtered_df = filter_data(valid_df, params['selected_sources'], params['selected_authors']) + + if filtered_df.empty: + st.warning("No data matches the current filters!") + st.stop() + + # Display performance warnings + display_performance_warnings(filtered_df, params['method'], params['clustering_method']) + + # Get corresponding embeddings + filtered_embeddings = get_filtered_embeddings(embeddings, valid_df, filtered_df) + + st.info(f"📈 Visualizing {len(filtered_df)} messages") + + # Reduce dimensions + with st.spinner(f"Reducing dimensions using {params['method']}..."): + reduced_embeddings = reduce_dimensions( + filtered_embeddings, + method=params['method'], + spread_factor=params['spread_factor'], + perplexity_factor=params['perplexity_factor'], + min_dist_factor=params['min_dist_factor'] + ) + + # Apply clustering + with st.spinner(f"Applying {params['clustering_method']}..."): + cluster_labels, silhouette_avg, calinski_harabasz = apply_clustering( + filtered_embeddings, + clustering_method=params['clustering_method'], + n_clusters=params['n_clusters'] + ) + + # Apply jittering if requested + if params['apply_jittering']: + with st.spinner("Applying smart jittering to separate overlapping points..."): + reduced_embeddings = apply_density_based_jittering( + reduced_embeddings, + density_scaling=params['density_based_jitter'], + jitter_strength=params['jitter_strength'] + ) + + # Display clustering metrics + display_clustering_metrics( + cluster_labels, silhouette_avg, calinski_harabasz, + params['show_cluster_metrics'] + ) + + # Create and display the main plot + fig = create_visualization_plot( + reduced_embeddings=reduced_embeddings, + filtered_df=filtered_df, + cluster_labels=cluster_labels, + selected_sources=params['selected_sources'] if params['selected_sources'] else None, + method=params['method'], + clustering_method=params['clustering_method'], + point_size=params['point_size'], + point_opacity=params['point_opacity'], + density_based_sizing=params['density_based_sizing'], + size_variation=params['size_variation'] + ) + + st.plotly_chart(fig, use_container_width=True) + + # Display summary statistics + display_summary_stats(filtered_df, params['selected_sources'] or filtered_df['source_file'].unique()) + + # Display clustering results and export options + display_clustering_results( + filtered_df, cluster_labels, reduced_embeddings, + params['method'], params['clustering_method'] + ) + + # Display data table + display_data_table(filtered_df, cluster_labels) + + +if __name__ == "__main__": + main() diff --git a/apps/cluster_map/streamlit_app.py b/apps/cluster_map/streamlit_app.py deleted file mode 100644 index 7f3d0eb..0000000 --- a/apps/cluster_map/streamlit_app.py +++ /dev/null @@ -1,233 +0,0 @@ -import streamlit as st -import pandas as pd -import numpy as np -import plotly.express as px -import plotly.graph_objects as go -from sklearn.decomposition import PCA -from sklearn.manifold import TSNE -import json -import os -from pathlib import Path -import ast - -# Set page config -st.set_page_config( - page_title="Discord Chat Embeddings Visualizer", - page_icon="🗨️", - layout="wide" -) - -# Title and description -st.title("🗨️ Discord Chat Embeddings Visualizer") -st.markdown("Explore Discord chat messages through their vector embeddings in 2D space") - -@st.cache_data -def load_all_chat_data(): - """Load all CSV files from the discord_chat_logs folder""" - chat_logs_path = Path("../../discord_chat_logs") - - # Display the path for debugging - st.write(f"Looking for CSV files in: {chat_logs_path}") - st.write(f"Path exists: {chat_logs_path.exists()}") - - all_data = [] - - for csv_file in chat_logs_path.glob("*.csv"): - try: - df = pd.read_csv(csv_file) - df['source_file'] = csv_file.stem # Add source file name - all_data.append(df) - st.write(f"✅ Loaded {len(df)} messages from {csv_file.name}") - except Exception as e: - st.error(f"❌ Error loading {csv_file.name}: {e}") - - if all_data: - combined_df = pd.concat(all_data, ignore_index=True) - st.success(f"🎉 Successfully loaded {len(combined_df)} total messages from {len(all_data)} files") - return combined_df - else: - st.error("No data loaded!") - return pd.DataFrame() - -@st.cache_data -def parse_embeddings(df): - """Parse the content_embedding column from string to numpy array""" - embeddings = [] - valid_indices = [] - - for idx, embedding_str in enumerate(df['content_embedding']): - try: - # Parse the string representation of the list - embedding = ast.literal_eval(embedding_str) - if isinstance(embedding, list) and len(embedding) > 0: - embeddings.append(embedding) - valid_indices.append(idx) - except Exception as e: - continue - - embeddings_array = np.array(embeddings) - valid_df = df.iloc[valid_indices].copy() - - st.info(f"📊 Parsed {len(embeddings)} valid embeddings from {len(df)} messages") - st.info(f"🔢 Embedding dimension: {embeddings_array.shape[1] if len(embeddings) > 0 else 0}") - - return embeddings_array, valid_df - -@st.cache_data -def reduce_dimensions(embeddings, method="PCA", n_components=2): - """Reduce embeddings to 2D using PCA or t-SNE""" - if method == "PCA": - reducer = PCA(n_components=n_components, random_state=42) - elif method == "t-SNE": - reducer = TSNE(n_components=n_components, random_state=42, perplexity=min(30, len(embeddings)-1)) - - reduced_embeddings = reducer.fit_transform(embeddings) - return reduced_embeddings - -def create_hover_text(df): - """Create hover text for plotly""" - hover_text = [] - for _, row in df.iterrows(): - text = f"Author: {row['author_name']}
" - text += f"Timestamp: {row['timestamp_utc']}
" - text += f"Source: {row['source_file']}
" - - # Handle potential NaN or non-string content - content = row['content'] - if pd.isna(content) or content is None: - content_text = "[No content]" - else: - content_str = str(content) - content_text = content_str[:200] + ('...' if len(content_str) > 200 else '') - - text += f"Content: {content_text}" - hover_text.append(text) - return hover_text - -def main(): - # Load data - with st.spinner("Loading chat data..."): - df = load_all_chat_data() - - if df.empty: - st.stop() - - # Parse embeddings - with st.spinner("Parsing embeddings..."): - embeddings, valid_df = parse_embeddings(df) - - if len(embeddings) == 0: - st.error("No valid embeddings found!") - st.stop() - - # Sidebar controls - st.sidebar.header("🎛️ Visualization Controls") - - # Dimension reduction method - method = st.sidebar.selectbox( - "Dimension Reduction Method", - ["PCA", "t-SNE"], - help="PCA is faster, t-SNE may reveal better clusters" - ) - - # Source file filter - source_files = valid_df['source_file'].unique() - selected_sources = st.sidebar.multiselect( - "Filter by Source Files", - source_files, - default=source_files, - help="Select which chat log files to include" - ) - - # Author filter - authors = valid_df['author_name'].unique() - selected_authors = st.sidebar.multiselect( - "Filter by Authors", - authors, - default=authors[:10] if len(authors) > 10 else authors, # Limit to first 10 for performance - help="Select which authors to include" - ) - - # Filter data - filtered_df = valid_df[ - (valid_df['source_file'].isin(selected_sources)) & - (valid_df['author_name'].isin(selected_authors)) - ] - - if filtered_df.empty: - st.warning("No data matches the current filters!") - st.stop() - - # Get corresponding embeddings - filtered_indices = filtered_df.index.tolist() - filtered_embeddings = embeddings[[i for i, idx in enumerate(valid_df.index) if idx in filtered_indices]] - - st.info(f"📈 Visualizing {len(filtered_df)} messages") - - # Reduce dimensions - with st.spinner(f"Reducing dimensions using {method}..."): - reduced_embeddings = reduce_dimensions(filtered_embeddings, method) - - # Create hover text - hover_text = create_hover_text(filtered_df) - - # Create the plot - fig = go.Figure() - - # Color by source file - colors = px.colors.qualitative.Set1 - for i, source in enumerate(selected_sources): - source_mask = filtered_df['source_file'] == source - if source_mask.any(): - source_data = filtered_df[source_mask] - source_embeddings = reduced_embeddings[source_mask] - source_hover = [hover_text[j] for j, mask in enumerate(source_mask) if mask] - - fig.add_trace(go.Scatter( - x=source_embeddings[:, 0], - y=source_embeddings[:, 1], - mode='markers', - name=source, - marker=dict( - size=8, - color=colors[i % len(colors)], - opacity=0.7, - line=dict(width=1, color='white') - ), - hovertemplate='%{hovertext}', - hovertext=source_hover - )) - - fig.update_layout( - title=f"Discord Chat Messages - {method} Visualization", - xaxis_title=f"{method} Component 1", - yaxis_title=f"{method} Component 2", - hovermode='closest', - width=1000, - height=700 - ) - - # Display the plot - st.plotly_chart(fig, use_container_width=True) - - # Statistics - col1, col2, col3 = st.columns(3) - - with col1: - st.metric("Total Messages", len(filtered_df)) - - with col2: - st.metric("Unique Authors", filtered_df['author_name'].nunique()) - - with col3: - st.metric("Source Files", len(selected_sources)) - - # Show data table - if st.checkbox("Show Data Table"): - st.subheader("📋 Message Data") - display_df = filtered_df[['timestamp_utc', 'author_name', 'source_file', 'content']].copy() - display_df['content'] = display_df['content'].str[:100] + '...' # Truncate for display - st.dataframe(display_df, use_container_width=True) - -if __name__ == "__main__": - main() diff --git a/apps/cluster_map/test_debug.py b/apps/cluster_map/test_debug.py new file mode 100644 index 0000000..f154263 --- /dev/null +++ b/apps/cluster_map/test_debug.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +""" +Test script to debug the hanging issue in the modular app +""" + +import numpy as np +import sys +import os + +# Add the current directory to Python path +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +def test_dimensionality_reduction(): + """Test dimensionality reduction functions""" + print("Testing dimensionality reduction functions...") + + from dimensionality_reduction import reduce_dimensions + + # Create test data similar to what we'd expect + n_samples = 796 # Same as the user's dataset + n_features = 384 # Common embedding dimension + + print(f"Creating test embeddings: {n_samples} x {n_features}") + test_embeddings = np.random.randn(n_samples, n_features) + + # Test PCA (should be fast) + print("Testing PCA...") + try: + result = reduce_dimensions(test_embeddings, method="PCA") + print(f"✓ PCA successful, output shape: {result.shape}") + except Exception as e: + print(f"✗ PCA failed: {e}") + + # Test UMAP (might be slower) + print("Testing UMAP...") + try: + result = reduce_dimensions(test_embeddings, method="UMAP") + print(f"✓ UMAP successful, output shape: {result.shape}") + except Exception as e: + print(f"✗ UMAP failed: {e}") + +if __name__ == "__main__": + test_dimensionality_reduction() diff --git a/apps/cluster_map/ui_components.py b/apps/cluster_map/ui_components.py new file mode 100644 index 0000000..83b7944 --- /dev/null +++ b/apps/cluster_map/ui_components.py @@ -0,0 +1,236 @@ +""" +Streamlit UI components and controls for the Discord Chat Embeddings Visualizer. +""" + +import streamlit as st +import numpy as np +from config import ( + APP_TITLE, APP_ICON, APP_LAYOUT, METHOD_EXPLANATIONS, + CLUSTERING_METHODS_REQUIRING_N_CLUSTERS, COMPUTATIONALLY_INTENSIVE_METHODS, + LARGE_DATASET_WARNING_THRESHOLD, MAX_DISPLAYED_AUTHORS +) + + +def setup_page_config(): + """Set up the Streamlit page configuration""" + st.set_page_config( + page_title=APP_TITLE, + page_icon=APP_ICON, + layout=APP_LAYOUT + ) + + +def display_title_and_description(): + """Display the main title and description""" + st.title(f"{APP_ICON} {APP_TITLE}") + st.markdown("Explore Discord chat messages through their vector embeddings in 2D space") + + +def create_method_controls(): + """Create controls for dimension reduction and clustering methods""" + st.sidebar.header("🎛️ Visualization Controls") + + # Dimension reduction method + method = st.sidebar.selectbox( + "Dimension Reduction Method", + ["PCA", "t-SNE", "UMAP", "Spectral Embedding", "Force-Directed"], + help="PCA is fastest, UMAP balances speed and quality, t-SNE and Spectral are slower but may reveal better structures. Force-Directed creates natural spacing." + ) + + # Clustering method + clustering_method = st.sidebar.selectbox( + "Clustering Method", + ["None", "HDBSCAN", "Spectral Clustering", "Gaussian Mixture", + "Agglomerative (Ward)", "Agglomerative (Complete)", "OPTICS"], + help="Apply clustering to identify groups. HDBSCAN and OPTICS can find variable density clusters." + ) + + return method, clustering_method + + +def create_clustering_controls(clustering_method): + """Create controls for clustering parameters""" + n_clusters = 5 + if clustering_method in CLUSTERING_METHODS_REQUIRING_N_CLUSTERS: + n_clusters = st.sidebar.slider("Number of Clusters", 2, 15, 5) + + return n_clusters + + +def create_separation_controls(method): + """Create controls for point separation and method-specific parameters""" + st.sidebar.subheader("🎯 Point Separation Controls") + + spread_factor = st.sidebar.slider( + "Spread Factor", + 0.5, 3.0, 1.0, 0.1, + help="Increase to spread apart nearby points. Higher values create more separation." + ) + + # Method-specific parameters + perplexity_factor = 1.0 + min_dist_factor = 1.0 + + if method == "t-SNE": + perplexity_factor = st.sidebar.slider( + "Perplexity Factor", + 0.5, 2.0, 1.0, 0.1, + help="Affects local vs global structure balance. Lower values focus on local details." + ) + + if method == "UMAP": + min_dist_factor = st.sidebar.slider( + "Min Distance Factor", + 0.1, 2.0, 1.0, 0.1, + help="Controls how tightly points are packed. Lower values create tighter clusters." + ) + + return spread_factor, perplexity_factor, min_dist_factor + + +def create_jittering_controls(): + """Create controls for jittering options""" + apply_jittering = st.sidebar.checkbox( + "Apply Smart Jittering", + value=False, + help="Add intelligent noise to separate overlapping points" + ) + + jitter_strength = 0.1 + density_based_jitter = True + + if apply_jittering: + jitter_strength = st.sidebar.slider( + "Jitter Strength", + 0.01, 0.5, 0.1, 0.01, + help="Strength of jittering. Higher values spread points more." + ) + density_based_jitter = st.sidebar.checkbox( + "Density-Based Jittering", + value=True, + help="Apply stronger jittering in dense regions" + ) + + return apply_jittering, jitter_strength, density_based_jitter + + +def create_advanced_options(): + """Create advanced visualization options""" + with st.sidebar.expander("⚙️ Advanced Options"): + show_cluster_metrics = st.checkbox("Show Clustering Metrics", value=True) + point_size = st.slider("Point Size", 4, 15, 8) + point_opacity = st.slider("Point Opacity", 0.3, 1.0, 0.7) + + # Density-based visualization + density_based_sizing = st.checkbox( + "Density-Based Point Sizing", + value=False, + help="Make points larger in sparse regions, smaller in dense regions" + ) + + size_variation = 2.0 + if density_based_sizing: + size_variation = st.slider( + "Size Variation Factor", + 1.5, 4.0, 2.0, 0.1, + help="How much point sizes vary based on local density" + ) + + return show_cluster_metrics, point_size, point_opacity, density_based_sizing, size_variation + + +def create_filter_controls(valid_df): + """Create controls for filtering data by source and author""" + # Source file filter + source_files = valid_df['source_file'].unique() + selected_sources = st.sidebar.multiselect( + "Filter by Source Files", + source_files, + default=[], + help="Select which chat log files to include" + ) + + # Author filter + authors = valid_df['author_name'].unique() + default_authors = authors[:MAX_DISPLAYED_AUTHORS] if len(authors) > MAX_DISPLAYED_AUTHORS else authors + selected_authors = st.sidebar.multiselect( + "Filter by Authors", + authors, + default=default_authors, + help="Select which authors to include" + ) + + return selected_sources, selected_authors + + +def display_method_explanations(): + """Display explanations for different methods""" + st.sidebar.markdown("---") + with st.sidebar.expander("📚 Method Explanations"): + st.markdown("**Dimensionality Reduction:**") + for method, explanation in METHOD_EXPLANATIONS["dimension_reduction"].items(): + st.markdown(f"- **{method}**: {explanation}") + + st.markdown("\n**Clustering Methods:**") + for method, explanation in METHOD_EXPLANATIONS["clustering"].items(): + st.markdown(f"- **{method}**: {explanation}") + + st.markdown("\n**Separation Techniques:**") + for technique, explanation in METHOD_EXPLANATIONS["separation"].items(): + st.markdown(f"- **{technique}**: {explanation}") + + st.markdown("\n**Metrics:**") + for metric, explanation in METHOD_EXPLANATIONS["metrics"].items(): + st.markdown(f"- **{metric}**: {explanation}") + + +def display_performance_warnings(filtered_df, method, clustering_method): + """Display performance warnings for computationally intensive operations""" + if len(filtered_df) > LARGE_DATASET_WARNING_THRESHOLD: + if method in COMPUTATIONALLY_INTENSIVE_METHODS["dimension_reduction"]: + st.warning(f"⚠️ {method} with {len(filtered_df)} points may take several minutes to compute.") + if clustering_method in COMPUTATIONALLY_INTENSIVE_METHODS["clustering"]: + st.warning(f"⚠️ {clustering_method} with {len(filtered_df)} points may be computationally intensive.") + + +def get_all_ui_parameters(valid_df): + """Get all UI parameters in a single function call""" + # Method selection + method, clustering_method = create_method_controls() + + # Clustering parameters + n_clusters = create_clustering_controls(clustering_method) + + # Separation controls + spread_factor, perplexity_factor, min_dist_factor = create_separation_controls(method) + + # Jittering controls + apply_jittering, jitter_strength, density_based_jitter = create_jittering_controls() + + # Advanced options + show_cluster_metrics, point_size, point_opacity, density_based_sizing, size_variation = create_advanced_options() + + # Filters + selected_sources, selected_authors = create_filter_controls(valid_df) + + # Method explanations + display_method_explanations() + + return { + 'method': method, + 'clustering_method': clustering_method, + 'n_clusters': n_clusters, + 'spread_factor': spread_factor, + 'perplexity_factor': perplexity_factor, + 'min_dist_factor': min_dist_factor, + 'apply_jittering': apply_jittering, + 'jitter_strength': jitter_strength, + 'density_based_jitter': density_based_jitter, + 'show_cluster_metrics': show_cluster_metrics, + 'point_size': point_size, + 'point_opacity': point_opacity, + 'density_based_sizing': density_based_sizing, + 'size_variation': size_variation, + 'selected_sources': selected_sources, + 'selected_authors': selected_authors + } diff --git a/apps/cluster_map/visualization.py b/apps/cluster_map/visualization.py new file mode 100644 index 0000000..93f38d4 --- /dev/null +++ b/apps/cluster_map/visualization.py @@ -0,0 +1,225 @@ +""" +Visualization functions for creating interactive plots and displays. +""" + +import pandas as pd +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +import streamlit as st +from dimensionality_reduction import calculate_local_density_scaling +from config import MESSAGE_CONTENT_PREVIEW_LENGTH, DEFAULT_POINT_SIZE, DEFAULT_POINT_OPACITY + + +def create_hover_text(df): + """Create hover text for plotly""" + hover_text = [] + for _, row in df.iterrows(): + text = f"Author: {row['author_name']}
" + text += f"Timestamp: {row['timestamp_utc']}
" + text += f"Source: {row['source_file']}
" + + # Handle potential NaN or non-string content + content = row['content'] + if pd.isna(content) or content is None: + content_text = "[No content]" + else: + content_str = str(content) + content_text = content_str[:MESSAGE_CONTENT_PREVIEW_LENGTH] + ('...' if len(content_str) > MESSAGE_CONTENT_PREVIEW_LENGTH else '') + + text += f"Content: {content_text}" + hover_text.append(text) + return hover_text + + +def calculate_point_sizes(reduced_embeddings, density_based_sizing=False, + point_size=DEFAULT_POINT_SIZE, size_variation=2.0): + """Calculate point sizes based on density if enabled""" + if not density_based_sizing: + return [point_size] * len(reduced_embeddings) + + local_densities = calculate_local_density_scaling(reduced_embeddings) + # Invert densities so sparse areas get larger points + inverted_densities = 1.0 - local_densities + # Scale point sizes + point_sizes = point_size * (1.0 + inverted_densities * (size_variation - 1.0)) + return point_sizes + + +def create_clustered_plot(reduced_embeddings, filtered_df, cluster_labels, hover_text, + point_sizes, point_opacity=DEFAULT_POINT_OPACITY, method="PCA"): + """Create a plot colored by clusters""" + fig = go.Figure() + + unique_clusters = np.unique(cluster_labels) + colors = px.colors.qualitative.Set3 + px.colors.qualitative.Pastel + + for i, cluster_id in enumerate(unique_clusters): + cluster_mask = cluster_labels == cluster_id + if cluster_mask.any(): + cluster_embeddings = reduced_embeddings[cluster_mask] + cluster_hover = [hover_text[j] for j, mask in enumerate(cluster_mask) if mask] + cluster_sizes = [point_sizes[j] for j, mask in enumerate(cluster_mask) if mask] + + cluster_name = f"Cluster {cluster_id}" if cluster_id != -1 else "Noise" + + fig.add_trace(go.Scatter( + x=cluster_embeddings[:, 0], + y=cluster_embeddings[:, 1], + mode='markers', + name=cluster_name, + marker=dict( + size=cluster_sizes, + color=colors[i % len(colors)], + opacity=point_opacity, + line=dict(width=1, color='white') + ), + hovertemplate='%{hovertext}', + hovertext=cluster_hover + )) + + return fig + + +def create_source_colored_plot(reduced_embeddings, filtered_df, selected_sources, hover_text, + point_sizes, point_opacity=DEFAULT_POINT_OPACITY): + """Create a plot colored by source files""" + fig = go.Figure() + colors = px.colors.qualitative.Set1 + + for i, source in enumerate(selected_sources): + source_mask = filtered_df['source_file'] == source + if source_mask.any(): + source_embeddings = reduced_embeddings[source_mask] + source_hover = [hover_text[j] for j, mask in enumerate(source_mask) if mask] + source_sizes = [point_sizes[j] for j, mask in enumerate(source_mask) if mask] + + fig.add_trace(go.Scatter( + x=source_embeddings[:, 0], + y=source_embeddings[:, 1], + mode='markers', + name=source, + marker=dict( + size=source_sizes, + color=colors[i % len(colors)], + opacity=point_opacity, + line=dict(width=1, color='white') + ), + hovertemplate='%{hovertext}', + hovertext=source_hover + )) + + return fig + + +def create_visualization_plot(reduced_embeddings, filtered_df, cluster_labels=None, + selected_sources=None, method="PCA", clustering_method="None", + point_size=DEFAULT_POINT_SIZE, point_opacity=DEFAULT_POINT_OPACITY, + density_based_sizing=False, size_variation=2.0): + """Create the main visualization plot""" + + # Create hover text + hover_text = create_hover_text(filtered_df) + + # Calculate point sizes + point_sizes = calculate_point_sizes(reduced_embeddings, density_based_sizing, + point_size, size_variation) + + # Create plot based on coloring strategy + if cluster_labels is not None: + fig = create_clustered_plot(reduced_embeddings, filtered_df, cluster_labels, + hover_text, point_sizes, point_opacity, method) + else: + if selected_sources is None: + selected_sources = filtered_df['source_file'].unique() + fig = create_source_colored_plot(reduced_embeddings, filtered_df, selected_sources, + hover_text, point_sizes, point_opacity) + + # Update layout + title_suffix = f" with {clustering_method}" if clustering_method != "None" else "" + fig.update_layout( + title=f"Discord Chat Messages - {method} Visualization{title_suffix}", + xaxis_title=f"{method} Component 1", + yaxis_title=f"{method} Component 2", + hovermode='closest', + width=1000, + height=700 + ) + + return fig + + +def display_clustering_metrics(cluster_labels, silhouette_avg, calinski_harabasz, show_metrics=True): + """Display clustering quality metrics""" + if cluster_labels is not None and show_metrics: + col1, col2, col3 = st.columns(3) + with col1: + n_clusters_found = len(np.unique(cluster_labels[cluster_labels != -1])) + st.metric("Clusters Found", n_clusters_found) + with col2: + if silhouette_avg is not None: + st.metric("Silhouette Score", f"{silhouette_avg:.3f}") + else: + st.metric("Silhouette Score", "N/A") + with col3: + if calinski_harabasz is not None: + st.metric("Calinski-Harabasz Index", f"{calinski_harabasz:.1f}") + else: + st.metric("Calinski-Harabasz Index", "N/A") + + +def display_summary_stats(filtered_df, selected_sources): + """Display summary statistics""" + col1, col2, col3 = st.columns(3) + + with col1: + st.metric("Total Messages", len(filtered_df)) + + with col2: + st.metric("Unique Authors", filtered_df['author_name'].nunique()) + + with col3: + st.metric("Source Files", len(selected_sources)) + + +def display_clustering_results(filtered_df, cluster_labels, reduced_embeddings, method, clustering_method): + """Display clustering results and export options""" + if cluster_labels is None: + return + + st.subheader("📊 Clustering Results") + + # Add cluster information to dataframe for export + export_df = filtered_df.copy() + export_df['cluster_id'] = cluster_labels + export_df['x_coordinate'] = reduced_embeddings[:, 0] + export_df['y_coordinate'] = reduced_embeddings[:, 1] + + # Show cluster distribution + cluster_dist = pd.Series(cluster_labels).value_counts().sort_index() + st.bar_chart(cluster_dist) + + # Download option + csv_data = export_df.to_csv(index=False) + st.download_button( + label="📥 Download Clustering Results (CSV)", + data=csv_data, + file_name=f"chat_clusters_{method}_{clustering_method}.csv", + mime="text/csv" + ) + + +def display_data_table(filtered_df, cluster_labels=None): + """Display the data table with optional clustering information""" + if not st.checkbox("Show Data Table"): + return + + st.subheader("📋 Message Data") + display_df = filtered_df[['timestamp_utc', 'author_name', 'source_file', 'content']].copy() + + # Add clustering info if available + if cluster_labels is not None: + display_df['cluster'] = cluster_labels + + display_df['content'] = display_df['content'].str[:100] + '...' # Truncate for display + st.dataframe(display_df, use_container_width=True)