diff --git a/apps/cluster_map/cluster.py b/apps/cluster_map/cluster.py
new file mode 100644
index 0000000..cabc728
--- /dev/null
+++ b/apps/cluster_map/cluster.py
@@ -0,0 +1,12 @@
+"""
+Discord Chat Embeddings Visualizer - Legacy Entry Point
+
+This file serves as a compatibility layer for the original cluster.py.
+The application has been refactored into modular components for better maintainability.
+"""
+
+# Delegate to the refactored entry point in main.py so that executing this
+# legacy file directly still launches the application.
+from main import main
+
+# Only launch when executed as a script; importing this module has no effect
+# beyond importing main.
+if __name__ == "__main__":
+    main()
diff --git a/apps/cluster_map/clustering.py b/apps/cluster_map/clustering.py
new file mode 100644
index 0000000..f2bc93c
--- /dev/null
+++ b/apps/cluster_map/clustering.py
@@ -0,0 +1,99 @@
+"""
+Clustering algorithms and evaluation metrics.
+"""
+
+import numpy as np
+import streamlit as st
+from sklearn.cluster import SpectralClustering, AgglomerativeClustering, OPTICS
+from sklearn.mixture import GaussianMixture
+from sklearn.preprocessing import StandardScaler
+from sklearn.metrics import silhouette_score, calinski_harabasz_score
+import hdbscan
+from config import DEFAULT_RANDOM_STATE
+
+
+def apply_clustering(embeddings, clustering_method="None", n_clusters=5):
+    """
+    Apply a clustering algorithm to embeddings and return labels and metrics.
+
+    The embeddings are standardized before clustering. Quality metrics are
+    computed on the standardized data with noise points (label -1) excluded,
+    and only when at least two distinct clusters remain.
+
+    Args:
+        embeddings: High-dimensional embeddings to cluster
+        clustering_method: Name of clustering algorithm ("None" disables
+            clustering; unrecognized names also yield no clustering)
+        n_clusters: Number of clusters (for methods that require it)
+
+    Returns:
+        tuple: (cluster_labels, silhouette_score, calinski_harabasz_score).
+        All three are None when clustering is skipped or fails. The two
+        metrics are None (with labels still returned) when they cannot be
+        computed for the resulting labels.
+    """
+    if clustering_method == "None" or len(embeddings) <= n_clusters:
+        return None, None, None
+
+    # Standardize embeddings for better clustering
+    scaled_embeddings = StandardScaler().fit_transform(embeddings)
+    n_points = len(embeddings)
+
+    # Factories are lazy so only the selected estimator is constructed.
+    clusterer_factories = {
+        "HDBSCAN": lambda: hdbscan.HDBSCAN(
+            min_cluster_size=max(2, n_points // 20),  # Adaptive min cluster size
+            min_samples=1, cluster_selection_epsilon=0.5),
+        "Spectral Clustering": lambda: SpectralClustering(
+            n_clusters=n_clusters, random_state=DEFAULT_RANDOM_STATE,
+            affinity='rbf', gamma=1.0),
+        "Gaussian Mixture": lambda: GaussianMixture(
+            n_components=n_clusters, random_state=DEFAULT_RANDOM_STATE,
+            covariance_type='full', max_iter=200),
+        "Agglomerative (Ward)": lambda: AgglomerativeClustering(
+            n_clusters=n_clusters, linkage='ward'),
+        "Agglomerative (Complete)": lambda: AgglomerativeClustering(
+            n_clusters=n_clusters, linkage='complete'),
+        "OPTICS": lambda: OPTICS(
+            min_samples=max(2, n_points // 50), xi=0.05, min_cluster_size=0.1),
+    }
+
+    factory = clusterer_factories.get(clustering_method)
+    if factory is None:
+        return None, None, None
+
+    # Broad catch is deliberate: this is a UI boundary and any estimator
+    # failure should surface as a warning instead of crashing the page.
+    try:
+        cluster_labels = factory().fit_predict(scaled_embeddings)
+    except Exception as e:
+        st.warning(f"Clustering failed: {str(e)}")
+        return None, None, None
+
+    silhouette_avg = None
+    calinski_harabasz = None
+
+    # Remove noise points (label -1, produced by HDBSCAN/OPTICS) before scoring
+    clustered_mask = cluster_labels != -1
+    valid_labels = cluster_labels[clustered_mask]
+
+    if len(np.unique(valid_labels)) > 1:
+        valid_embeddings = scaled_embeddings[clustered_mask]
+        # Metrics are scored separately from clustering so that a scoring
+        # failure no longer throws away an otherwise valid set of labels.
+        try:
+            silhouette_avg = silhouette_score(valid_embeddings, valid_labels)
+            calinski_harabasz = calinski_harabasz_score(valid_embeddings, valid_labels)
+        except Exception as e:
+            # Keep the pair consistent: either both metrics or neither.
+            silhouette_avg = None
+            calinski_harabasz = None
+            st.warning(f"Cluster metrics unavailable: {e}")
+
+    return cluster_labels, silhouette_avg, calinski_harabasz
+
+
+def get_cluster_statistics(cluster_labels):
+    """
+    Get basic statistics about clustering results.
+
+    Args:
+        cluster_labels: Per-point integer cluster labels (array or plain
+            sequence), where -1 marks noise, or None when no clustering ran.
+
+    Returns:
+        dict: Empty when cluster_labels is None. Otherwise contains
+        "n_clusters" (distinct non-noise labels), "n_noise_points",
+        "cluster_distribution" (np.bincount of non-noise labels, or [] when
+        there are no clusters) and "unique_clusters" (all distinct labels,
+        noise included).
+    """
+    if cluster_labels is None:
+        return {}
+
+    # Coerce first: on a plain list, `labels == -1` is a single bool and
+    # the mask-based counting/indexing below would be wrong.
+    labels = np.asarray(cluster_labels)
+    noise_mask = labels == -1
+
+    unique_clusters = np.unique(labels)
+    n_clusters = len(unique_clusters[unique_clusters != -1])  # Exclude noise cluster (-1)
+    n_noise = np.sum(noise_mask)
+
+    return {
+        "n_clusters": n_clusters,
+        "n_noise_points": n_noise,
+        "cluster_distribution": np.bincount(labels[~noise_mask]) if n_clusters > 0 else [],
+        "unique_clusters": unique_clusters
+    }
diff --git a/apps/cluster_map/config.py b/apps/cluster_map/config.py
new file mode 100644
index 0000000..8c58ede
--- /dev/null
+++ b/apps/cluster_map/config.py
@@ -0,0 +1,73 @@
+"""
+Configuration settings and constants for the Discord Chat Embeddings Visualizer.
+"""
+
+# Application settings (page title, icon and layout string for the Streamlit page)
+APP_TITLE = "Discord Chat Embeddings Visualizer"
+APP_ICON = "🗨️"
+APP_LAYOUT = "wide"
+
+# File paths
+# NOTE: this is a relative path, so it is resolved against the process's
+# current working directory rather than against this file's location.
+CHAT_LOGS_PATH = "../../discord_chat_logs"
+
+# Algorithm parameters
+DEFAULT_RANDOM_STATE = 42  # shared seed so repeated runs are reproducible
+DEFAULT_N_COMPONENTS = 2   # target dimensionality of the reduced embeddings
+DEFAULT_N_CLUSTERS = 5     # default cluster count for methods that need one
+
+# Visualization settings
+DEFAULT_POINT_SIZE = 8
+DEFAULT_POINT_OPACITY = 0.7
+MAX_DISPLAYED_AUTHORS = 10
+# Character limits for message text; presumably the longer one is for hover
+# previews and the shorter one for table display - confirm against callers.
+MESSAGE_CONTENT_PREVIEW_LENGTH = 200
+MESSAGE_CONTENT_DISPLAY_LENGTH = 100
+
+# Performance thresholds
+# Presumably a message count above which the UI warns about slow methods -
+# confirm against the code that reads it.
+LARGE_DATASET_WARNING_THRESHOLD = 1000
+
+# Color palettes (ten hex colors)
+PRIMARY_COLORS = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
+                  "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
+
+# Clustering method categories
+# Method names here must match the strings offered in the clustering selector.
+CLUSTERING_METHODS_REQUIRING_N_CLUSTERS = [
+    "Spectral Clustering",
+    "Gaussian Mixture",
+    "Agglomerative (Ward)",
+    "Agglomerative (Complete)"
+]
+
+# Methods considered expensive, grouped by pipeline stage
+COMPUTATIONALLY_INTENSIVE_METHODS = {
+    "dimension_reduction": ["t-SNE", "Spectral Embedding"],
+    "clustering": ["Spectral Clustering", "OPTICS"]
+}
+
+# Method explanations
+# Short human-readable descriptions keyed by category, then by method/option name.
+METHOD_EXPLANATIONS = {
+    "dimension_reduction": {
+        "PCA": "Linear, fast, preserves global variance",
+        "t-SNE": "Non-linear, good for local structure, slower",
+        "UMAP": "Balanced speed/quality, preserves local & global structure",
+        "Spectral Embedding": "Uses graph theory, good for non-convex clusters",
+        "Force-Directed": "Physics-based layout, creates natural spacing"
+    },
+    "clustering": {
+        "HDBSCAN": "Density-based, finds variable density clusters, handles noise",
+        "Spectral Clustering": "Uses eigenvalues, good for non-convex shapes",
+        "Gaussian Mixture": "Probabilistic, assumes gaussian distributions",
+        "Agglomerative (Ward)": "Hierarchical, minimizes within-cluster variance",
+        "Agglomerative (Complete)": "Hierarchical, minimizes maximum distance",
+        "OPTICS": "Density-based, finds clusters of varying densities"
+    },
+    "separation": {
+        "Spread Factor": "Applies repulsive forces between nearby points",
+        "Smart Jittering": "Adds intelligent noise to separate overlapping points",
+        "Density-Based Jittering": "Stronger separation in crowded areas",
+        "Perplexity Factor": "Controls t-SNE's focus on local vs global structure",
+        "Min Distance Factor": "Controls UMAP's point packing tightness"
+    },
+    "metrics": {
+        "Silhouette Score": "Higher is better (range: -1 to 1)",
+        "Calinski-Harabasz": "Higher is better, measures cluster separation"
+    }
+}
diff --git a/apps/cluster_map/data_loader.py b/apps/cluster_map/data_loader.py
new file mode 100644
index 0000000..73105f4
--- /dev/null
+++ b/apps/cluster_map/data_loader.py
@@ -0,0 +1,86 @@
+"""
+Data loading and parsing utilities for Discord chat logs.
+"""
+
+import pandas as pd
+import numpy as np
+import streamlit as st
+import ast
+from pathlib import Path
+from config import CHAT_LOGS_PATH
+
+
+@st.cache_data
+def load_all_chat_data():
+    """Load all CSV files from the discord_chat_logs folder.
+
+    Every ``*.csv`` directly inside CHAT_LOGS_PATH is read with pandas and
+    tagged with a ``source_file`` column holding the file name without its
+    extension. Files that fail to load are reported and skipped.
+
+    Returns:
+        pandas.DataFrame: All loaded files concatenated with a fresh integer
+        index, or an empty DataFrame when nothing could be loaded.
+    """
+    chat_logs_path = Path(CHAT_LOGS_PATH)
+
+    # NOTE(review): indentation was lost in the reviewed copy; the loading
+    # steps are assumed to sit inside the expander so their messages render
+    # there - confirm against the real file.
+    with st.expander("📁 Loading Details", expanded=False):
+        # Display the path for debugging
+        st.write(f"Looking for CSV files in: {chat_logs_path}")
+        st.write(f"Path exists: {chat_logs_path.exists()}")
+
+        all_data = []
+
+        for csv_file in chat_logs_path.glob("*.csv"):
+            try:
+                df = pd.read_csv(csv_file)
+                df['source_file'] = csv_file.stem  # Add source file name
+                all_data.append(df)
+                st.write(f"✅ Loaded {len(df)} messages from {csv_file.name}")
+            except Exception as e:
+                # Best effort: report the bad file and keep loading the others
+                st.error(f"❌ Error loading {csv_file.name}: {e}")
+
+        if all_data:
+            # ignore_index gives the combined frame a single 0..n-1 index
+            combined_df = pd.concat(all_data, ignore_index=True)
+            st.success(f"🎉 Successfully loaded {len(combined_df)} total messages from {len(all_data)} files")
+        else:
+            st.error("No data loaded!")
+            combined_df = pd.DataFrame()
+
+    return combined_df if all_data else pd.DataFrame()
+
+
+@st.cache_data
+def parse_embeddings(df):
+    """Parse the content_embedding column from string to numpy array.
+
+    Each cell is expected to hold the string form of a non-empty Python list.
+    Cells that are missing, malformed, or not a non-empty list are skipped.
+    Rows whose embedding length differs from the most common length are
+    skipped as well, so the result is always a rectangular array.
+
+    Args:
+        df: DataFrame with a 'content_embedding' column.
+
+    Returns:
+        tuple: (embeddings_array, valid_df) where embeddings_array has one
+        row per kept message and valid_df holds the matching rows of df,
+        in the same order and with their original index labels.
+    """
+    # Local import keeps the module-level import block untouched
+    from collections import Counter
+
+    parsed = []
+    for idx, embedding_str in enumerate(df['content_embedding']):
+        try:
+            # Parse the string representation of the list
+            embedding = ast.literal_eval(embedding_str)
+        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
+            # These are the errors literal_eval raises for missing (NaN) or
+            # malformed cells; skip the row instead of aborting the load.
+            continue
+        if isinstance(embedding, list) and len(embedding) > 0:
+            parsed.append((idx, embedding))
+
+    if parsed:
+        # Ragged rows cannot be stacked into a 2-D array, so keep only the
+        # dominant embedding length.
+        expected_dim = Counter(len(vec) for _, vec in parsed).most_common(1)[0][0]
+        parsed = [(idx, vec) for idx, vec in parsed if len(vec) == expected_dim]
+
+    valid_indices = [idx for idx, _ in parsed]
+    embeddings = [vec for _, vec in parsed]
+
+    embeddings_array = np.array(embeddings)
+    valid_df = df.iloc[valid_indices].copy()
+
+    st.info(f"📊 Parsed {len(embeddings)} valid embeddings from {len(df)} messages")
+    st.info(f"🔢 Embedding dimension: {embeddings_array.shape[1] if len(embeddings) > 0 else 0}")
+
+    return embeddings_array, valid_df
+
+
+def filter_data(df, selected_sources, selected_authors):
+    """Return the rows of df matching the chosen source files and authors.
+
+    An empty source selection means "all sources"; an empty author selection
+    matches no rows.
+    """
+    sources = selected_sources if selected_sources else df['source_file'].unique()
+
+    source_mask = df['source_file'].isin(sources)
+    author_mask = df['author_name'].isin(selected_authors)
+
+    return df[source_mask & author_mask]
+
+
+def get_filtered_embeddings(embeddings, valid_df, filtered_df):
+    """Get embeddings corresponding to filtered dataframe.
+
+    Args:
+        embeddings: Array with one row per row of valid_df, in the same order.
+        valid_df: DataFrame the embeddings were parsed from.
+        filtered_df: Subset of valid_df (matched by index label).
+
+    Returns:
+        The rows of embeddings whose valid_df index label also appears in
+        filtered_df, in valid_df order.
+    """
+    # A vectorized membership test replaces a per-row `in list` scan, which
+    # was quadratic in the number of messages.
+    keep_mask = valid_df.index.isin(filtered_df.index)
+    return embeddings[keep_mask]
diff --git a/apps/cluster_map/dimensionality_reduction.py b/apps/cluster_map/dimensionality_reduction.py
new file mode 100644
index 0000000..f066eaf
--- /dev/null
+++ b/apps/cluster_map/dimensionality_reduction.py
@@ -0,0 +1,211 @@
+"""
+Dimensionality reduction algorithms and point separation techniques.
+"""
+
+import numpy as np
+import streamlit as st
+from sklearn.decomposition import PCA
+from sklearn.manifold import TSNE, SpectralEmbedding
+from sklearn.preprocessing import StandardScaler
+from sklearn.neighbors import NearestNeighbors
+from scipy.spatial.distance import pdist, squareform
+from scipy.optimize import minimize
+import umap
+from config import DEFAULT_RANDOM_STATE
+
+
+def apply_adaptive_spreading(embeddings, spread_factor=1.0):
+    """
+    Apply adaptive spreading to push apart nearby points while preserving global structure.
+
+    Every pair of distinct points repels with magnitude
+    ``spread_factor / (dist ** 2 + 0.01)`` along the line joining them, so
+    closer points repel more strongly. The summed forces are applied with a
+    0.1 damping factor for a small, fixed number of iterations.
+
+    Args:
+        embeddings: Point coordinates; assumes a 2-D float array of shape
+            (n_points, n_dims) - TODO confirm no caller passes another shape.
+        spread_factor: Repulsion strength. Values <= 0 disable spreading.
+
+    Returns:
+        The input array itself when spread_factor <= 0. Otherwise a copy,
+        which is left unspread when there are fewer than 2 or more than 1000
+        points.
+    """
+    if spread_factor <= 0:
+        return embeddings
+
+    embeddings = embeddings.copy()
+    n_points = len(embeddings)
+
+    print(f"DEBUG: Applying adaptive spreading to {n_points} points with factor {spread_factor}")
+
+    if n_points < 2:
+        return embeddings
+
+    # For very large datasets, skip spreading to avoid hanging
+    if n_points > 1000:
+        print(f"DEBUG: Large dataset ({n_points} points), skipping adaptive spreading...")
+        return embeddings
+
+    # Apply force-based spreading with fewer iterations for large datasets
+    max_iterations = 3 if n_points > 500 else 5
+
+    for iteration in range(max_iterations):
+        if iteration % 2 == 0:  # Progress indicator
+            print(f"DEBUG: Spreading iteration {iteration + 1}/{max_iterations}")
+
+        # diff[i, j] = embeddings[i] - embeddings[j] for every ordered pair
+        diff = embeddings[:, np.newaxis, :] - embeddings[np.newaxis, :, :]
+        dist = np.linalg.norm(diff, axis=-1)
+
+        # Coincident points (dist == 0, which includes each point paired with
+        # itself) have no defined direction and exert no force.
+        interacting = dist > 0
+        safe_dist = np.where(interacting, dist, 1.0)
+
+        # Repulsive force inversely proportional to distance
+        magnitude = np.where(interacting, spread_factor / (dist ** 2 + 0.01), 0.0)
+        forces = ((magnitude / safe_dist)[:, :, np.newaxis] * diff).sum(axis=1)
+
+        # Apply forces with damping
+        embeddings += forces * 0.1
+
+    print(f"DEBUG: Adaptive spreading complete")
+    return embeddings
+
+
+def force_directed_layout(high_dim_embeddings, n_components=2, spread_factor=1.0):
+    """
+    Build a spread-out low-dimensional layout from high-dimensional embeddings.
+
+    The layout is a PCA projection followed by adaptive spreading. Datasets
+    with more than 500 points take the same computation and differ only in
+    the progress messages printed.
+    """
+    n_points = len(high_dim_embeddings)
+    print(f"DEBUG: Starting force-directed layout with {n_points} points...")
+
+    # For large datasets, fall back to PCA + spreading to avoid hanging
+    is_large = n_points > 500
+    if is_large:
+        print(f"DEBUG: Large dataset ({n_points} points), using PCA + spreading instead...")
+
+    # PCA provides the initial layout in both cases
+    projector = PCA(n_components=n_components, random_state=DEFAULT_RANDOM_STATE)
+    layout = projector.fit_transform(high_dim_embeddings)
+    if not is_large:
+        print(f"DEBUG: Initial PCA layout computed...")
+
+    # Spreading stands in for a full force simulation, which was too
+    # computationally intensive.
+    layout = apply_adaptive_spreading(layout, spread_factor)
+    if not is_large:
+        print(f"DEBUG: Force-directed layout complete...")
+    return layout
+
+
+def calculate_local_density_scaling(embeddings, k=5):
+    """
+    Calculate local density scaling factors to emphasize differences in dense regions.
+
+    Density is the inverse of the mean distance to the k nearest neighbors,
+    min-max normalized over all points.
+
+    Args:
+        embeddings: Array with one row per point.
+        k: Number of neighbors (excluding the point itself) to average over.
+
+    Returns:
+        numpy.ndarray: One value per point in [0, 1). All ones when there are
+        too few points (k or fewer) to find k neighbors for each.
+    """
+    # The neighbor query below asks for k + 1 neighbors, so at least k + 1
+    # points are required; exactly k points used to slip past this guard and
+    # make the query raise.
+    if len(embeddings) <= k:
+        return np.ones(len(embeddings))
+
+    # Find k nearest neighbors for each point
+    nn = NearestNeighbors(n_neighbors=k+1)  # +1 because first neighbor is the point itself
+    nn.fit(embeddings)
+    distances, _ = nn.kneighbors(embeddings)
+
+    # Calculate local density (inverse of average distance to k nearest neighbors)
+    local_densities = 1.0 / (np.mean(distances[:, 1:], axis=1) + 1e-6)
+
+    # Normalize densities (the epsilon avoids division by zero when all are equal)
+    local_densities = (local_densities - np.min(local_densities)) / (np.max(local_densities) - np.min(local_densities) + 1e-6)
+
+    return local_densities
+
+
+def apply_density_based_jittering(embeddings, density_scaling=True, jitter_strength=0.1):
+    """
+    Add Gaussian jitter to separate overlapping points.
+
+    With density_scaling enabled, each point's noise scale is
+    ``jitter_strength * (1 + density)``, so crowded regions get more jitter;
+    otherwise every coordinate gets noise with scale jitter_strength.
+    The input array is not modified.
+    """
+    if density_scaling:
+        # Per-point density in [0, 1) drives the noise scale
+        local_density = calculate_local_density_scaling(embeddings)
+
+        result = embeddings.copy()
+        for row in range(len(embeddings)):
+            # More jitter in denser regions
+            sigma = jitter_strength * (1 + local_density[row])
+            result[row] += np.random.normal(0, sigma, embeddings.shape[1])
+        return result
+
+    # Simple random jittering
+    return embeddings + np.random.normal(0, jitter_strength, embeddings.shape)
+
+
+def reduce_dimensions(embeddings, method="PCA", n_components=2, spread_factor=1.0,
+                      perplexity_factor=1.0, min_dist_factor=1.0):
+    """Apply dimensionality reduction with enhanced separation.
+
+    Embeddings are standardized and then reduced with the chosen method.
+    PCA, Spectral Embedding and the fallback additionally get adaptive
+    spreading; t-SNE and UMAP use spread_factor through their own parameters.
+
+    Args:
+        embeddings: Array-like with one row per item.
+        method: One of "PCA", "t-SNE", "UMAP", "Spectral Embedding" or
+            "Force-Directed"; any other value falls back to PCA.
+        n_components: Number of output dimensions.
+        spread_factor: Separation strength (spreading force, t-SNE early
+            exaggeration multiplier, or UMAP spread, depending on method).
+        perplexity_factor: Multiplier on the base t-SNE perplexity.
+        min_dist_factor: Multiplier on the base UMAP min_dist of 0.1.
+
+    Returns:
+        numpy.ndarray: Reduced embeddings, one row per input row.
+    """
+
+    # Convert to numpy array if it's not already
+    embeddings = np.array(embeddings)
+    n_samples = len(embeddings)
+
+    print(f"DEBUG: Starting {method} with {len(embeddings)} embeddings, shape: {embeddings.shape}")
+
+    # Standardize embeddings for better processing
+    scaler = StandardScaler()
+    scaled_embeddings = scaler.fit_transform(embeddings)
+    print(f"DEBUG: Embeddings standardized")
+
+    # Apply the selected dimensionality reduction method
+    if method == "PCA":
+        print(f"DEBUG: Applying PCA...")
+        reducer = PCA(n_components=n_components, random_state=DEFAULT_RANDOM_STATE)
+        reduced_embeddings = reducer.fit_transform(scaled_embeddings)
+        # Apply spreading to PCA results
+        print(f"DEBUG: Applying spreading...")
+        reduced_embeddings = apply_adaptive_spreading(reduced_embeddings, spread_factor)
+
+    elif method == "t-SNE":
+        # Adjust perplexity based on user preference and data size.
+        # scikit-learn requires perplexity < n_samples, so the usual [5, 50]
+        # range is capped at n_samples - 1 for small datasets.
+        max_perplexity = max(1, min(50, n_samples - 1))
+        base_perplexity = min(30, n_samples - 1)
+        adjusted_perplexity = min(max_perplexity, max(5, int(base_perplexity * perplexity_factor)))
+        print(f"DEBUG: Applying t-SNE with perplexity {adjusted_perplexity}...")
+
+        # The iteration count is left at the library default (1000, the value
+        # previously passed explicitly) because the keyword for it was renamed
+        # between scikit-learn releases.
+        reducer = TSNE(n_components=n_components, random_state=DEFAULT_RANDOM_STATE,
+                       perplexity=adjusted_perplexity,
+                       early_exaggeration=12.0 * spread_factor,  # Increase early exaggeration for more separation
+                       learning_rate='auto')
+        reduced_embeddings = reducer.fit_transform(scaled_embeddings)
+
+    elif method == "UMAP":
+        # Adjust UMAP parameters for better local separation
+        n_neighbors = min(15, n_samples - 1)
+        spread = 1.0 * spread_factor
+        # UMAP rejects min_dist > spread, so cap it instead of failing
+        min_dist = min(0.1 * min_dist_factor, spread)
+        print(f"DEBUG: Applying UMAP with n_neighbors={n_neighbors}, min_dist={min_dist}...")
+
+        reducer = umap.UMAP(n_components=n_components, random_state=DEFAULT_RANDOM_STATE,
+                            n_neighbors=n_neighbors, min_dist=min_dist,
+                            spread=spread, local_connectivity=2.0)
+        reduced_embeddings = reducer.fit_transform(scaled_embeddings)
+
+    elif method == "Spectral Embedding":
+        n_neighbors = min(10, n_samples - 1)
+        print(f"DEBUG: Applying Spectral Embedding with n_neighbors={n_neighbors}...")
+        reducer = SpectralEmbedding(n_components=n_components, random_state=DEFAULT_RANDOM_STATE,
+                                    n_neighbors=n_neighbors)
+        reduced_embeddings = reducer.fit_transform(scaled_embeddings)
+        # Apply spreading to spectral results
+        print(f"DEBUG: Applying spreading...")
+        reduced_embeddings = apply_adaptive_spreading(reduced_embeddings, spread_factor)
+
+    elif method == "Force-Directed":
+        # Use force-directed layout for natural spreading
+        print(f"DEBUG: Applying Force-Directed layout...")
+        reduced_embeddings = force_directed_layout(scaled_embeddings, n_components, spread_factor)
+
+    else:
+        # Fallback to PCA
+        print(f"DEBUG: Unknown method {method}, falling back to PCA...")
+        reducer = PCA(n_components=n_components, random_state=DEFAULT_RANDOM_STATE)
+        reduced_embeddings = reducer.fit_transform(scaled_embeddings)
+        reduced_embeddings = apply_adaptive_spreading(reduced_embeddings, spread_factor)
+
+    print(f"DEBUG: Dimensionality reduction complete. Output shape: {reduced_embeddings.shape}")
+    return reduced_embeddings
diff --git a/apps/cluster_map/main.py b/apps/cluster_map/main.py
new file mode 100644
index 0000000..395e6f9
--- /dev/null
+++ b/apps/cluster_map/main.py
@@ -0,0 +1,132 @@
+"""
+Main application logic for the Discord Chat Embeddings Visualizer.
+"""
+
+import streamlit as st
+import warnings
+warnings.filterwarnings('ignore')
+
+# Import custom modules
+from ui_components import (
+ setup_page_config, display_title_and_description, get_all_ui_parameters,
+ display_performance_warnings
+)
+from data_loader import (
+ load_all_chat_data, parse_embeddings, filter_data, get_filtered_embeddings
+)
+from dimensionality_reduction import (
+ reduce_dimensions, apply_density_based_jittering
+)
+from clustering import apply_clustering
+from visualization import (
+ create_visualization_plot, display_clustering_metrics, display_summary_stats,
+ display_clustering_results, display_data_table
+)
+
+
+def main():
+    """Main application function.
+
+    Runs the whole page top to bottom: page setup, data loading, embedding
+    parsing, sidebar parameters, filtering, dimensionality reduction,
+    clustering, optional jittering, and finally the plot, statistics,
+    clustering results and data table. Execution stops early (st.stop) when
+    there is no data, no valid embeddings, or nothing matches the filters.
+    """
+    # Set up page configuration
+    setup_page_config()
+
+    # Display title and description
+    display_title_and_description()
+
+    # Load data
+    with st.spinner("Loading chat data..."):
+        df = load_all_chat_data()
+
+    if df.empty:
+        st.error("No data could be loaded. Please check the data directory.")
+        st.stop()
+
+    # Parse embeddings
+    with st.spinner("Parsing embeddings..."):
+        embeddings, valid_df = parse_embeddings(df)
+
+    if len(embeddings) == 0:
+        st.error("No valid embeddings found!")
+        st.stop()
+
+    # Get UI parameters (a dict keyed by parameter name, read below)
+    params = get_all_ui_parameters(valid_df)
+
+    # Filter data
+    filtered_df = filter_data(valid_df, params['selected_sources'], params['selected_authors'])
+
+    if filtered_df.empty:
+        st.warning("No data matches the current filters!")
+        st.stop()
+
+    # Display performance warnings
+    display_performance_warnings(filtered_df, params['method'], params['clustering_method'])
+
+    # Get corresponding embeddings
+    filtered_embeddings = get_filtered_embeddings(embeddings, valid_df, filtered_df)
+
+    st.info(f"📈 Visualizing {len(filtered_df)} messages")
+
+    # Reduce dimensions
+    with st.spinner(f"Reducing dimensions using {params['method']}..."):
+        reduced_embeddings = reduce_dimensions(
+            filtered_embeddings,
+            method=params['method'],
+            spread_factor=params['spread_factor'],
+            perplexity_factor=params['perplexity_factor'],
+            min_dist_factor=params['min_dist_factor']
+        )
+
+    # Apply clustering
+    # Clustering runs on the original (unreduced) embeddings, not on the
+    # reduced coordinates used for plotting.
+    with st.spinner(f"Applying {params['clustering_method']}..."):
+        cluster_labels, silhouette_avg, calinski_harabasz = apply_clustering(
+            filtered_embeddings,
+            clustering_method=params['clustering_method'],
+            n_clusters=params['n_clusters']
+        )
+
+    # Apply jittering if requested
+    # Jitter only changes the plotted coordinates; it happens after
+    # clustering and does not feed into it.
+    if params['apply_jittering']:
+        with st.spinner("Applying smart jittering to separate overlapping points..."):
+            reduced_embeddings = apply_density_based_jittering(
+                reduced_embeddings,
+                density_scaling=params['density_based_jitter'],
+                jitter_strength=params['jitter_strength']
+            )
+
+    # Display clustering metrics
+    display_clustering_metrics(
+        cluster_labels, silhouette_avg, calinski_harabasz,
+        params['show_cluster_metrics']
+    )
+
+    # Create and display the main plot
+    fig = create_visualization_plot(
+        reduced_embeddings=reduced_embeddings,
+        filtered_df=filtered_df,
+        cluster_labels=cluster_labels,
+        # An empty source selection is passed on as None
+        selected_sources=params['selected_sources'] if params['selected_sources'] else None,
+        method=params['method'],
+        clustering_method=params['clustering_method'],
+        point_size=params['point_size'],
+        point_opacity=params['point_opacity'],
+        density_based_sizing=params['density_based_sizing'],
+        size_variation=params['size_variation']
+    )
+
+    st.plotly_chart(fig, use_container_width=True)
+
+    # Display summary statistics
+    # With no explicit source selection, fall back to the sources present
+    # in the filtered data.
+    display_summary_stats(filtered_df, params['selected_sources'] or filtered_df['source_file'].unique())
+
+    # Display clustering results and export options
+    display_clustering_results(
+        filtered_df, cluster_labels, reduced_embeddings,
+        params['method'], params['clustering_method']
+    )
+
+    # Display data table
+    display_data_table(filtered_df, cluster_labels)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/apps/cluster_map/streamlit_app.py b/apps/cluster_map/streamlit_app.py
deleted file mode 100644
index 7f3d0eb..0000000
--- a/apps/cluster_map/streamlit_app.py
+++ /dev/null
@@ -1,233 +0,0 @@
-import streamlit as st
-import pandas as pd
-import numpy as np
-import plotly.express as px
-import plotly.graph_objects as go
-from sklearn.decomposition import PCA
-from sklearn.manifold import TSNE
-import json
-import os
-from pathlib import Path
-import ast
-
-# Set page config
-st.set_page_config(
- page_title="Discord Chat Embeddings Visualizer",
- page_icon="🗨️",
- layout="wide"
-)
-
-# Title and description
-st.title("🗨️ Discord Chat Embeddings Visualizer")
-st.markdown("Explore Discord chat messages through their vector embeddings in 2D space")
-
-@st.cache_data
-def load_all_chat_data():
- """Load all CSV files from the discord_chat_logs folder"""
- chat_logs_path = Path("../../discord_chat_logs")
-
- # Display the path for debugging
- st.write(f"Looking for CSV files in: {chat_logs_path}")
- st.write(f"Path exists: {chat_logs_path.exists()}")
-
- all_data = []
-
- for csv_file in chat_logs_path.glob("*.csv"):
- try:
- df = pd.read_csv(csv_file)
- df['source_file'] = csv_file.stem # Add source file name
- all_data.append(df)
- st.write(f"✅ Loaded {len(df)} messages from {csv_file.name}")
- except Exception as e:
- st.error(f"❌ Error loading {csv_file.name}: {e}")
-
- if all_data:
- combined_df = pd.concat(all_data, ignore_index=True)
- st.success(f"🎉 Successfully loaded {len(combined_df)} total messages from {len(all_data)} files")
- return combined_df
- else:
- st.error("No data loaded!")
- return pd.DataFrame()
-
-@st.cache_data
-def parse_embeddings(df):
- """Parse the content_embedding column from string to numpy array"""
- embeddings = []
- valid_indices = []
-
- for idx, embedding_str in enumerate(df['content_embedding']):
- try:
- # Parse the string representation of the list
- embedding = ast.literal_eval(embedding_str)
- if isinstance(embedding, list) and len(embedding) > 0:
- embeddings.append(embedding)
- valid_indices.append(idx)
- except Exception as e:
- continue
-
- embeddings_array = np.array(embeddings)
- valid_df = df.iloc[valid_indices].copy()
-
- st.info(f"📊 Parsed {len(embeddings)} valid embeddings from {len(df)} messages")
- st.info(f"🔢 Embedding dimension: {embeddings_array.shape[1] if len(embeddings) > 0 else 0}")
-
- return embeddings_array, valid_df
-
-@st.cache_data
-def reduce_dimensions(embeddings, method="PCA", n_components=2):
- """Reduce embeddings to 2D using PCA or t-SNE"""
- if method == "PCA":
- reducer = PCA(n_components=n_components, random_state=42)
- elif method == "t-SNE":
- reducer = TSNE(n_components=n_components, random_state=42, perplexity=min(30, len(embeddings)-1))
-
- reduced_embeddings = reducer.fit_transform(embeddings)
- return reduced_embeddings
-
-def create_hover_text(df):
- """Create hover text for plotly"""
- hover_text = []
- for _, row in df.iterrows():
- text = f"Author: {row['author_name']}
"
- text += f"Timestamp: {row['timestamp_utc']}
"
- text += f"Source: {row['source_file']}
"
-
- # Handle potential NaN or non-string content
- content = row['content']
- if pd.isna(content) or content is None:
- content_text = "[No content]"
- else:
- content_str = str(content)
- content_text = content_str[:200] + ('...' if len(content_str) > 200 else '')
-
- text += f"Content: {content_text}"
- hover_text.append(text)
- return hover_text
-
-def main():
- # Load data
- with st.spinner("Loading chat data..."):
- df = load_all_chat_data()
-
- if df.empty:
- st.stop()
-
- # Parse embeddings
- with st.spinner("Parsing embeddings..."):
- embeddings, valid_df = parse_embeddings(df)
-
- if len(embeddings) == 0:
- st.error("No valid embeddings found!")
- st.stop()
-
- # Sidebar controls
- st.sidebar.header("🎛️ Visualization Controls")
-
- # Dimension reduction method
- method = st.sidebar.selectbox(
- "Dimension Reduction Method",
- ["PCA", "t-SNE"],
- help="PCA is faster, t-SNE may reveal better clusters"
- )
-
- # Source file filter
- source_files = valid_df['source_file'].unique()
- selected_sources = st.sidebar.multiselect(
- "Filter by Source Files",
- source_files,
- default=source_files,
- help="Select which chat log files to include"
- )
-
- # Author filter
- authors = valid_df['author_name'].unique()
- selected_authors = st.sidebar.multiselect(
- "Filter by Authors",
- authors,
- default=authors[:10] if len(authors) > 10 else authors, # Limit to first 10 for performance
- help="Select which authors to include"
- )
-
- # Filter data
- filtered_df = valid_df[
- (valid_df['source_file'].isin(selected_sources)) &
- (valid_df['author_name'].isin(selected_authors))
- ]
-
- if filtered_df.empty:
- st.warning("No data matches the current filters!")
- st.stop()
-
- # Get corresponding embeddings
- filtered_indices = filtered_df.index.tolist()
- filtered_embeddings = embeddings[[i for i, idx in enumerate(valid_df.index) if idx in filtered_indices]]
-
- st.info(f"📈 Visualizing {len(filtered_df)} messages")
-
- # Reduce dimensions
- with st.spinner(f"Reducing dimensions using {method}..."):
- reduced_embeddings = reduce_dimensions(filtered_embeddings, method)
-
- # Create hover text
- hover_text = create_hover_text(filtered_df)
-
- # Create the plot
- fig = go.Figure()
-
- # Color by source file
- colors = px.colors.qualitative.Set1
- for i, source in enumerate(selected_sources):
- source_mask = filtered_df['source_file'] == source
- if source_mask.any():
- source_data = filtered_df[source_mask]
- source_embeddings = reduced_embeddings[source_mask]
- source_hover = [hover_text[j] for j, mask in enumerate(source_mask) if mask]
-
- fig.add_trace(go.Scatter(
- x=source_embeddings[:, 0],
- y=source_embeddings[:, 1],
- mode='markers',
- name=source,
- marker=dict(
- size=8,
- color=colors[i % len(colors)],
- opacity=0.7,
- line=dict(width=1, color='white')
- ),
- hovertemplate='%{hovertext}',
- hovertext=source_hover
- ))
-
- fig.update_layout(
- title=f"Discord Chat Messages - {method} Visualization",
- xaxis_title=f"{method} Component 1",
- yaxis_title=f"{method} Component 2",
- hovermode='closest',
- width=1000,
- height=700
- )
-
- # Display the plot
- st.plotly_chart(fig, use_container_width=True)
-
- # Statistics
- col1, col2, col3 = st.columns(3)
-
- with col1:
- st.metric("Total Messages", len(filtered_df))
-
- with col2:
- st.metric("Unique Authors", filtered_df['author_name'].nunique())
-
- with col3:
- st.metric("Source Files", len(selected_sources))
-
- # Show data table
- if st.checkbox("Show Data Table"):
- st.subheader("📋 Message Data")
- display_df = filtered_df[['timestamp_utc', 'author_name', 'source_file', 'content']].copy()
- display_df['content'] = display_df['content'].str[:100] + '...' # Truncate for display
- st.dataframe(display_df, use_container_width=True)
-
-if __name__ == "__main__":
- main()
diff --git a/apps/cluster_map/test_debug.py b/apps/cluster_map/test_debug.py
new file mode 100644
index 0000000..f154263
--- /dev/null
+++ b/apps/cluster_map/test_debug.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+"""
+Test script to debug the hanging issue in the modular app
+"""
+
+import numpy as np
+import sys
+import os
+
+# Add the current directory to Python path
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+def test_dimensionality_reduction():
+    """Smoke-test reduce_dimensions with PCA and UMAP on random data.
+
+    Failures are printed rather than raised, so every method is attempted.
+    """
+    print("Testing dimensionality reduction functions...")
+
+    from dimensionality_reduction import reduce_dimensions
+
+    # Random data shaped like the real input: 796 samples (same as the
+    # user's dataset) by 384 features (a common embedding dimension)
+    n_samples, n_features = 796, 384
+
+    print(f"Creating test embeddings: {n_samples} x {n_features}")
+    test_embeddings = np.random.randn(n_samples, n_features)
+
+    # PCA should be fast; UMAP might be slower
+    for method_name in ("PCA", "UMAP"):
+        print(f"Testing {method_name}...")
+        try:
+            result = reduce_dimensions(test_embeddings, method=method_name)
+            print(f"✓ {method_name} successful, output shape: {result.shape}")
+        except Exception as e:
+            print(f"✗ {method_name} failed: {e}")
+
+if __name__ == "__main__":
+    test_dimensionality_reduction()
diff --git a/apps/cluster_map/ui_components.py b/apps/cluster_map/ui_components.py
new file mode 100644
index 0000000..83b7944
--- /dev/null
+++ b/apps/cluster_map/ui_components.py
@@ -0,0 +1,236 @@
+"""
+Streamlit UI components and controls for the Discord Chat Embeddings Visualizer.
+"""
+
+import streamlit as st
+import numpy as np
+from config import (
+ APP_TITLE, APP_ICON, APP_LAYOUT, METHOD_EXPLANATIONS,
+ CLUSTERING_METHODS_REQUIRING_N_CLUSTERS, COMPUTATIONALLY_INTENSIVE_METHODS,
+ LARGE_DATASET_WARNING_THRESHOLD, MAX_DISPLAYED_AUTHORS
+)
+
+
+def setup_page_config():
+    """Apply the app-wide Streamlit page settings.
+
+    Title, icon and layout come from APP_TITLE, APP_ICON and APP_LAYOUT.
+    """
+    page_settings = dict(page_title=APP_TITLE, page_icon=APP_ICON, layout=APP_LAYOUT)
+    st.set_page_config(**page_settings)
+
+
+def display_title_and_description():
+ """Render the page heading (APP_ICON followed by APP_TITLE) and a one-line tagline."""
+ st.title(f"{APP_ICON} {APP_TITLE}")
+ st.markdown("Explore Discord chat messages through their vector embeddings in 2D space")
+
+
+def create_method_controls():
+    """Add the sidebar header and the two algorithm pickers.
+
+    Returns:
+        tuple: (dimension-reduction method name, clustering method name) as
+        chosen in the sidebar select boxes.
+    """
+    sidebar = st.sidebar
+    sidebar.header("🎛️ Visualization Controls")
+
+    reduction_choices = ["PCA", "t-SNE", "UMAP", "Spectral Embedding", "Force-Directed"]
+    reduction_help = "PCA is fastest, UMAP balances speed and quality, t-SNE and Spectral are slower but may reveal better structures. Force-Directed creates natural spacing."
+    clustering_choices = ["None", "HDBSCAN", "Spectral Clustering", "Gaussian Mixture",
+                          "Agglomerative (Ward)", "Agglomerative (Complete)", "OPTICS"]
+    clustering_help = "Apply clustering to identify groups. HDBSCAN and OPTICS can find variable density clusters."
+
+    # Widgets are created in display order: reduction picker, then clustering picker.
+    method = sidebar.selectbox("Dimension Reduction Method", reduction_choices, help=reduction_help)
+    clustering_method = sidebar.selectbox("Clustering Method", clustering_choices, help=clustering_help)
+    return method, clustering_method
+
+
+def create_clustering_controls(clustering_method):
+    """Return the cluster count: a sidebar slider if the method needs one, else 5."""
+    if clustering_method not in CLUSTERING_METHODS_REQUIRING_N_CLUSTERS:
+        return 5  # Fixed default; no slider is shown
+
+    # Positional arguments: label, minimum, maximum, initial value.
+    return st.sidebar.slider("Number of Clusters", 2, 15, 5)
+
+
+def create_separation_controls(method):
+    """Add the point-separation sliders to the sidebar.
+
+    The spread slider is always shown; the perplexity slider only for "t-SNE"
+    and the min-distance slider only for "UMAP". A hidden slider's factor is 1.0.
+
+    Returns:
+        tuple: (spread_factor, perplexity_factor, min_dist_factor)
+    """
+    sidebar = st.sidebar
+    sidebar.subheader("🎯 Point Separation Controls")
+
+    # Slider arguments are (label, min, max, initial value, step).
+    spread_factor = sidebar.slider(
+        "Spread Factor", 0.5, 3.0, 1.0, 0.1,
+        help="Increase to spread apart nearby points. Higher values create more separation.")
+
+    # Method-specific factors stay at 1.0 unless their slider is shown.
+    perplexity_factor = min_dist_factor = 1.0
+    if method == "t-SNE":
+        perplexity_factor = sidebar.slider(
+            "Perplexity Factor", 0.5, 2.0, 1.0, 0.1,
+            help="Affects local vs global structure balance. Lower values focus on local details.")
+    elif method == "UMAP":
+        min_dist_factor = sidebar.slider(
+            "Min Distance Factor", 0.1, 2.0, 1.0, 0.1,
+            help="Controls how tightly points are packed. Lower values create tighter clusters.")
+
+    return spread_factor, perplexity_factor, min_dist_factor
+
+
+def create_jittering_controls():
+    """Add the jittering toggle and, while it is on, its tuning widgets.
+
+    Returns:
+        tuple: (apply_jittering, jitter_strength, density_based_jitter). With
+        the toggle off the last two are the defaults 0.1 and True.
+    """
+    sidebar = st.sidebar
+
+    apply_jittering = sidebar.checkbox(
+        "Apply Smart Jittering", value=False,
+        help="Add intelligent noise to separate overlapping points")
+    if not apply_jittering:
+        # Tuning widgets stay hidden; report their default values.
+        return apply_jittering, 0.1, True
+
+    jitter_strength = sidebar.slider(
+        "Jitter Strength", 0.01, 0.5, 0.1, 0.01,
+        help="Strength of jittering. Higher values spread points more.")
+    density_based_jitter = sidebar.checkbox(
+        "Density-Based Jittering", value=True,
+        help="Apply stronger jittering in dense regions")
+
+    return apply_jittering, jitter_strength, density_based_jitter
+
+
+def create_advanced_options():
+    """Add the collapsible "Advanced Options" group to the sidebar.
+
+    Returns:
+        tuple: (show_cluster_metrics, point_size, point_opacity,
+        density_based_sizing, size_variation). size_variation is 2.0 whenever
+        density-based sizing is off.
+    """
+    with st.sidebar.expander("⚙️ Advanced Options"):
+        show_cluster_metrics = st.checkbox("Show Clustering Metrics", value=True)
+        point_size = st.slider("Point Size", 4, 15, 8)
+        point_opacity = st.slider("Point Opacity", 0.3, 1.0, 0.7)
+
+        density_based_sizing = st.checkbox(
+            "Density-Based Point Sizing", value=False,
+            help="Make points larger in sparse regions, smaller in dense regions")
+
+        # The variation slider is only offered while density-based sizing is on.
+        size_variation = 2.0 if not density_based_sizing else st.slider(
+            "Size Variation Factor", 1.5, 4.0, 2.0, 0.1,
+            help="How much point sizes vary based on local density")
+
+    return show_cluster_metrics, point_size, point_opacity, density_based_sizing, size_variation
+
+
+def create_filter_controls(valid_df):
+    """Add the source-file and author multiselect filters to the sidebar.
+
+    Args:
+        valid_df: DataFrame with 'source_file' and 'author_name' columns.
+
+    Returns:
+        tuple: (selected_sources, selected_authors)
+    """
+    # No source file is pre-selected.
+    source_options = valid_df['source_file'].unique()
+    selected_sources = st.sidebar.multiselect(
+        "Filter by Source Files", source_options, default=[],
+        help="Select which chat log files to include")
+
+    # Pre-select at most the first MAX_DISPLAYED_AUTHORS authors.
+    author_options = valid_df['author_name'].unique()
+    selected_authors = st.sidebar.multiselect(
+        "Filter by Authors", author_options, default=author_options[:MAX_DISPLAYED_AUTHORS],
+        help="Select which authors to include")
+
+    return selected_sources, selected_authors
+
+
+def display_method_explanations():
+    """Render a sidebar expander listing every entry of METHOD_EXPLANATIONS.
+
+    Entries are grouped under four headings, one per key of the mapping:
+    "dimension_reduction", "clustering", "separation" and "metrics".
+    """
+    # (heading markdown, METHOD_EXPLANATIONS key), in display order.
+    sections = (
+        ("**Dimensionality Reduction:**", "dimension_reduction"),
+        ("\n**Clustering Methods:**", "clustering"),
+        ("\n**Separation Techniques:**", "separation"),
+        ("\n**Metrics:**", "metrics"),
+    )
+    st.sidebar.markdown("---")
+    with st.sidebar.expander("📚 Method Explanations"):
+        for heading, key in sections:
+            st.markdown(heading)
+            for name, explanation in METHOD_EXPLANATIONS[key].items():
+                st.markdown(f"- **{name}**: {explanation}")
+
+
+def display_performance_warnings(filtered_df, method, clustering_method):
+    """Warn about slow methods when the data exceeds LARGE_DATASET_WARNING_THRESHOLD rows."""
+    point_count = len(filtered_df)
+    if point_count > LARGE_DATASET_WARNING_THRESHOLD and method in COMPUTATIONALLY_INTENSIVE_METHODS["dimension_reduction"]:
+        st.warning(f"⚠️ {method} with {point_count} points may take several minutes to compute.")
+    if point_count > LARGE_DATASET_WARNING_THRESHOLD and clustering_method in COMPUTATIONALLY_INTENSIVE_METHODS["clustering"]:
+        st.warning(f"⚠️ {clustering_method} with {point_count} points may be computationally intensive.")
+
+
+def get_all_ui_parameters(valid_df):
+    """Build every sidebar control and collect the chosen values in one dict.
+
+    Controls are created in this order: method pickers, cluster count,
+    separation, jittering, advanced options, filters, method explanations.
+
+    Args:
+        valid_df: DataFrame passed to the source/author filter controls.
+
+    Returns:
+        dict: Maps each parameter name (e.g. 'method', 'n_clusters',
+        'selected_authors') to the value selected in the UI.
+    """
+    method, clustering_method = create_method_controls()
+    n_clusters = create_clustering_controls(clustering_method)
+    spread_factor, perplexity_factor, min_dist_factor = create_separation_controls(method)
+    apply_jittering, jitter_strength, density_based_jitter = create_jittering_controls()
+    (show_cluster_metrics, point_size, point_opacity,
+     density_based_sizing, size_variation) = create_advanced_options()
+    selected_sources, selected_authors = create_filter_controls(valid_df)
+    # Called last, after every control has been created.
+    display_method_explanations()
+
+    return dict(
+        method=method,
+        clustering_method=clustering_method,
+        n_clusters=n_clusters,
+        spread_factor=spread_factor,
+        perplexity_factor=perplexity_factor,
+        min_dist_factor=min_dist_factor,
+        apply_jittering=apply_jittering,
+        jitter_strength=jitter_strength,
+        density_based_jitter=density_based_jitter,
+        show_cluster_metrics=show_cluster_metrics,
+        point_size=point_size,
+        point_opacity=point_opacity,
+        density_based_sizing=density_based_sizing,
+        size_variation=size_variation,
+        selected_sources=selected_sources,
+        selected_authors=selected_authors,
+    )
diff --git a/apps/cluster_map/visualization.py b/apps/cluster_map/visualization.py
new file mode 100644
index 0000000..93f38d4
--- /dev/null
+++ b/apps/cluster_map/visualization.py
@@ -0,0 +1,225 @@
+"""
+Visualization functions for creating interactive plots and displays.
+"""
+
+import pandas as pd
+import numpy as np
+import plotly.express as px
+import plotly.graph_objects as go
+import streamlit as st
+from dimensionality_reduction import calculate_local_density_scaling
+from config import MESSAGE_CONTENT_PREVIEW_LENGTH, DEFAULT_POINT_SIZE, DEFAULT_POINT_OPACITY
+
+
+def create_hover_text(df):
+    """Build one Plotly hover label per row of *df*.
+
+    Lines are joined with ``<br>``, the line break Plotly hover text understands.
+    Missing content becomes "[No content]"; anything else is passed through str()
+    and cut to MESSAGE_CONTENT_PREVIEW_LENGTH characters plus "..." if longer.
+    """
+    hover_text = []
+    for _, row in df.iterrows():
+        content = row['content']
+        if pd.isna(content) or content is None:
+            content_text = "[No content]"
+        else:
+            content_str = str(content)
+            ellipsis = '...' if len(content_str) > MESSAGE_CONTENT_PREVIEW_LENGTH else ''
+            content_text = content_str[:MESSAGE_CONTENT_PREVIEW_LENGTH] + ellipsis
+        hover_text.append(f"Author: {row['author_name']}<br>Timestamp: {row['timestamp_utc']}<br>"
+                          f"Source: {row['source_file']}<br>Content: {content_text}")
+    return hover_text
+
+
+def calculate_point_sizes(reduced_embeddings, density_based_sizing=False,
+                          point_size=DEFAULT_POINT_SIZE, size_variation=2.0):
+    """Return one marker size per point, optionally scaled by local density.
+
+    Without density-based sizing this is a list repeating *point_size*. With
+    it, each size is point_size * (1 + (1 - d) * (size_variation - 1)), where
+    d is the value calculate_local_density_scaling returns for that point.
+    """
+    if density_based_sizing:
+        sparsity = 1.0 - calculate_local_density_scaling(reduced_embeddings)
+        return point_size * (1.0 + sparsity * (size_variation - 1.0))
+    return [point_size] * len(reduced_embeddings)
+
+
+def create_clustered_plot(reduced_embeddings, filtered_df, cluster_labels, hover_text,
+                          point_sizes, point_opacity=DEFAULT_POINT_OPACITY, method="PCA"):
+    """Build a scatter figure with one trace (and color) per cluster label.
+
+    Label -1 is named "Noise"; every other label is named "Cluster <id>".
+    Colors cycle through Plotly's Set3 palette followed by Pastel.
+    *filtered_df* and *method* are accepted but not used here.
+    """
+    palette = px.colors.qualitative.Set3 + px.colors.qualitative.Pastel
+    fig = go.Figure()
+    for position, label in enumerate(np.unique(cluster_labels)):
+        member_mask = cluster_labels == label
+        if not member_mask.any():
+            continue
+
+        # Keep only the hover texts and sizes whose mask entry is set.
+        member_hover = [hover_text[j] for j, keep in enumerate(member_mask) if keep]
+        member_sizes = [point_sizes[j] for j, keep in enumerate(member_mask) if keep]
+        points = reduced_embeddings[member_mask]
+        marker_style = dict(
+            size=member_sizes,
+            color=palette[position % len(palette)],
+            opacity=point_opacity,
+            line=dict(width=1, color='white'),
+        )
+        fig.add_trace(go.Scatter(
+            x=points[:, 0], y=points[:, 1],
+            mode='markers', name="Noise" if label == -1 else f"Cluster {label}",
+            marker=marker_style, hovertemplate='%{hovertext}',
+            hovertext=member_hover,
+        ))
+
+    return fig
+
+
+def create_source_colored_plot(reduced_embeddings, filtered_df, selected_sources, hover_text,
+                               point_sizes, point_opacity=DEFAULT_POINT_OPACITY):
+    """Build a scatter figure with one trace (and color) per selected source file.
+
+    Sources with no matching rows in *filtered_df* get no trace. Colors cycle
+    through Plotly's Set1 palette by position in *selected_sources*.
+    """
+    palette = px.colors.qualitative.Set1
+    fig = go.Figure()
+    for position, source in enumerate(selected_sources):
+        row_mask = filtered_df['source_file'] == source
+        if not row_mask.any():
+            continue
+        # Keep only the hover texts and sizes whose mask entry is set.
+        source_hover = [hover_text[j] for j, keep in enumerate(row_mask) if keep]
+        source_sizes = [point_sizes[j] for j, keep in enumerate(row_mask) if keep]
+        points = reduced_embeddings[row_mask]
+        marker_style = dict(
+            size=source_sizes,
+            color=palette[position % len(palette)],
+            opacity=point_opacity,
+            line=dict(width=1, color='white'),
+        )
+        fig.add_trace(go.Scatter(
+            x=points[:, 0], y=points[:, 1], mode='markers', name=source,
+            marker=marker_style, hovertemplate='%{hovertext}',
+            hovertext=source_hover,
+        ))
+    return fig
+
+
+def create_visualization_plot(reduced_embeddings, filtered_df, cluster_labels=None,
+                              selected_sources=None, method="PCA", clustering_method="None",
+                              point_size=DEFAULT_POINT_SIZE, point_opacity=DEFAULT_POINT_OPACITY,
+                              density_based_sizing=False, size_variation=2.0):
+    """Create the main scatter figure, colored by cluster or by source file.
+
+    Points are colored by cluster when *cluster_labels* is given; otherwise by
+    source file, using every source in *filtered_df* if *selected_sources* is
+    None. The title and axis labels mention *method*, and the title also
+    mentions *clustering_method* unless it is "None".
+
+    Returns:
+        The Plotly figure, sized 1000 x 700 with closest-point hover.
+    """
+    hover_text = create_hover_text(filtered_df)
+    point_sizes = calculate_point_sizes(
+        reduced_embeddings, density_based_sizing, point_size, size_variation)
+
+    if cluster_labels is None:
+        sources = filtered_df['source_file'].unique() if selected_sources is None else selected_sources
+        fig = create_source_colored_plot(
+            reduced_embeddings, filtered_df, sources, hover_text, point_sizes, point_opacity)
+    else:
+        fig = create_clustered_plot(
+            reduced_embeddings, filtered_df, cluster_labels, hover_text,
+            point_sizes, point_opacity, method)
+
+    clustering_note = "" if clustering_method == "None" else f" with {clustering_method}"
+    fig.update_layout(
+        title=f"Discord Chat Messages - {method} Visualization{clustering_note}",
+        xaxis_title=f"{method} Component 1", yaxis_title=f"{method} Component 2",
+        hovermode='closest', width=1000, height=700,
+    )
+
+    return fig
+
+
+def display_clustering_metrics(cluster_labels, silhouette_avg, calinski_harabasz, show_metrics=True):
+    """Show cluster count, silhouette score and Calinski-Harabasz index side by side.
+
+    Nothing is shown when *cluster_labels* is None or *show_metrics* is false.
+    The -1 label is excluded from the cluster count; a score of None shows "N/A".
+    """
+    if cluster_labels is None or not show_metrics:
+        return
+
+    count_col, silhouette_col, calinski_col = st.columns(3)
+    with count_col:
+        # Label -1 is not counted as a cluster.
+        st.metric("Clusters Found", len(np.unique(cluster_labels[cluster_labels != -1])))
+    with silhouette_col:
+        st.metric("Silhouette Score", "N/A" if silhouette_avg is None else f"{silhouette_avg:.3f}")
+    with calinski_col:
+        st.metric("Calinski-Harabasz Index", "N/A" if calinski_harabasz is None else f"{calinski_harabasz:.1f}")
+
+
+def display_summary_stats(filtered_df, selected_sources):
+    """Show message, author and source-file counts as three side-by-side metrics.
+
+    The counts are the number of rows in *filtered_df*, the number of distinct
+    'author_name' values in it, and the length of *selected_sources*.
+    """
+    messages_col, authors_col, sources_col = st.columns(3)
+
+    # Calling .metric on a column renders the metric inside that column.
+    messages_col.metric("Total Messages", len(filtered_df))
+    authors_col.metric("Unique Authors", filtered_df['author_name'].nunique())
+    sources_col.metric("Source Files", len(selected_sources))
+
+
+def display_clustering_results(filtered_df, cluster_labels, reduced_embeddings, method, clustering_method):
+    """Show the cluster size chart and offer the clustered data as a CSV download.
+
+    Does nothing when *cluster_labels* is None. The CSV holds the columns of
+    *filtered_df* plus 'cluster_id', 'x_coordinate' and 'y_coordinate' (the
+    first two columns of *reduced_embeddings*).
+    """
+    if cluster_labels is None:
+        return
+
+    st.subheader("📊 Clustering Results")
+
+    # assign() returns a new frame, so filtered_df itself is left untouched.
+    export_df = filtered_df.assign(
+        cluster_id=cluster_labels,
+        x_coordinate=reduced_embeddings[:, 0],
+        y_coordinate=reduced_embeddings[:, 1],
+    )
+
+    # Bar chart of the number of points per cluster label, ordered by label.
+    st.bar_chart(pd.Series(cluster_labels).value_counts().sort_index())
+    st.download_button(
+        label="📥 Download Clustering Results (CSV)", data=export_df.to_csv(index=False),
+        file_name=f"chat_clusters_{method}_{clustering_method}.csv", mime="text/csv",
+    )
+
+
+def display_data_table(filtered_df, cluster_labels=None):
+    """Show the message table (plus cluster ids, if given) behind a checkbox.
+
+    Content is cut to 100 characters; "..." is appended only to cut messages.
+    """
+    if not st.checkbox("Show Data Table"):
+        return
+    st.subheader("📋 Message Data")
+    display_df = filtered_df[['timestamp_utc', 'author_name', 'source_file', 'content']].copy()
+    if cluster_labels is not None:
+        display_df['cluster'] = cluster_labels
+    content = display_df['content']  # missing values stay missing below
+    display_df['content'] = content.str[:100].where(content.str.len() <= 100, content.str[:100] + '...')
+    st.dataframe(display_df, use_container_width=True)