This commit is contained in:
2025-08-11 02:37:21 +01:00
parent 6d35b42b27
commit 4ca7e8ab61
10 changed files with 1117 additions and 233 deletions

View File

@@ -0,0 +1,12 @@
"""
Discord Chat Embeddings Visualizer - legacy entry point.

Compatibility shim kept in place of the original ``cluster.py``. The
application itself now lives in separate modules; this file only forwards
to ``main.main``.
"""
# Delegate to the refactored application entry point
from main import main

if __name__ == "__main__":
    # Start the app only when run as a script, not when imported.
    main()

View File

@@ -0,0 +1,99 @@
"""
Clustering algorithms and evaluation metrics.
"""
import numpy as np
import streamlit as st
from sklearn.cluster import SpectralClustering, AgglomerativeClustering, OPTICS
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score
import hdbscan
from config import DEFAULT_RANDOM_STATE
def apply_clustering(embeddings, clustering_method="None", n_clusters=5):
    """
    Cluster the embeddings with the selected algorithm and score the result.

    Args:
        embeddings: High-dimensional embeddings to cluster.
        clustering_method: Display name of the algorithm; "None" disables clustering.
        n_clusters: Requested number of clusters (only used by algorithms that take one).

    Returns:
        tuple: (cluster_labels, silhouette_score, calinski_harabasz_score).
        All three are None when clustering is disabled or there are not more
        points than ``n_clusters``. Labels are None when the method name is
        unknown or an exception was raised (a Streamlit warning is shown).
        The two scores stay None unless at least two non-noise clusters exist.
    """
    if clustering_method == "None" or len(embeddings) <= n_clusters:
        return None, None, None

    # Standardize features before clustering
    features = StandardScaler().fit_transform(embeddings)
    n_points = len(embeddings)

    labels = None
    silhouette = None
    ch_score = None

    try:
        # Build the estimator for the requested method; unknown names leave it as None.
        model = None
        if clustering_method == "HDBSCAN":
            model = hdbscan.HDBSCAN(
                min_cluster_size=max(2, n_points // 20),  # scales with dataset size
                min_samples=1,
                cluster_selection_epsilon=0.5,
            )
        elif clustering_method == "Spectral Clustering":
            model = SpectralClustering(
                n_clusters=n_clusters,
                random_state=DEFAULT_RANDOM_STATE,
                affinity='rbf',
                gamma=1.0,
            )
        elif clustering_method == "Gaussian Mixture":
            model = GaussianMixture(
                n_components=n_clusters,
                random_state=DEFAULT_RANDOM_STATE,
                covariance_type='full',
                max_iter=200,
            )
        elif clustering_method == "Agglomerative (Ward)":
            model = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
        elif clustering_method == "Agglomerative (Complete)":
            model = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete')
        elif clustering_method == "OPTICS":
            model = OPTICS(
                min_samples=max(2, n_points // 50),
                xi=0.05,
                min_cluster_size=0.1,
            )

        if model is not None:
            labels = model.fit_predict(features)

        # Quality metrics are computed on non-noise points only (label -1 is noise).
        if labels is not None and len(np.unique(labels)) > 1:
            is_clustered = labels != -1
            kept_labels = labels[is_clustered]
            kept_features = features[is_clustered]
            if len(kept_labels) > 0 and len(np.unique(kept_labels)) > 1:
                silhouette = silhouette_score(kept_features, kept_labels)
                ch_score = calinski_harabasz_score(kept_features, kept_labels)
    except Exception as exc:
        st.warning(f"Clustering failed: {exc}")
        labels = None

    return labels, silhouette, ch_score
def get_cluster_statistics(cluster_labels):
    """
    Summarize a clustering result.

    Args:
        cluster_labels: Sequence or array of integer labels where -1 marks
            noise, or None when no clustering was performed.

    Returns:
        dict: Empty when ``cluster_labels`` is None. Otherwise contains
        ``n_clusters`` (distinct non-noise labels), ``n_noise_points``,
        ``cluster_distribution`` (``np.bincount`` of the non-noise labels,
        or ``[]`` when there are no clusters) and ``unique_clusters``
        (sorted distinct labels, noise included).
    """
    if cluster_labels is None:
        return {}

    # Coerce so plain lists work with the boolean masks below.
    labels = np.asarray(cluster_labels)
    noise_mask = labels == -1

    unique_clusters = np.unique(labels)
    n_clusters = len(unique_clusters[unique_clusters != -1])  # noise label excluded

    return {
        "n_clusters": n_clusters,
        # Plain int so the value displays/serializes without numpy scalar quirks.
        "n_noise_points": int(np.sum(noise_mask)),
        "cluster_distribution": np.bincount(labels[~noise_mask]) if n_clusters > 0 else [],
        "unique_clusters": unique_clusters,
    }

View File

@@ -0,0 +1,73 @@
"""
Configuration settings and constants for the Discord Chat Embeddings Visualizer.
"""

# --- Application settings (passed to the Streamlit page config) ---
APP_TITLE = "Discord Chat Embeddings Visualizer"
APP_ICON = "🗨️"
APP_LAYOUT = "wide"

# --- File paths ---
# Relative path; it is resolved against the process working directory.
CHAT_LOGS_PATH = "../../discord_chat_logs"

# --- Algorithm parameters ---
DEFAULT_RANDOM_STATE = 42
DEFAULT_N_COMPONENTS = 2
DEFAULT_N_CLUSTERS = 5

# --- Visualization settings ---
DEFAULT_POINT_SIZE = 8
DEFAULT_POINT_OPACITY = 0.7
MAX_DISPLAYED_AUTHORS = 10
MESSAGE_CONTENT_PREVIEW_LENGTH = 200
MESSAGE_CONTENT_DISPLAY_LENGTH = 100

# --- Performance thresholds ---
# Point count above which slow methods trigger a warning in the UI.
LARGE_DATASET_WARNING_THRESHOLD = 1000

# --- Color palettes ---
PRIMARY_COLORS = [
    "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
    "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf",
]

# --- Clustering method categories ---
# Methods for which the UI shows a "Number of Clusters" slider.
CLUSTERING_METHODS_REQUIRING_N_CLUSTERS = [
    "Spectral Clustering",
    "Gaussian Mixture",
    "Agglomerative (Ward)",
    "Agglomerative (Complete)",
]

# Methods that get a performance warning on large datasets.
COMPUTATIONALLY_INTENSIVE_METHODS = {
    "dimension_reduction": ["t-SNE", "Spectral Embedding"],
    "clustering": ["Spectral Clustering", "OPTICS"],
}

# --- Method explanations (short help texts, grouped by category) ---
METHOD_EXPLANATIONS = {
    "dimension_reduction": {
        "PCA": "Linear, fast, preserves global variance",
        "t-SNE": "Non-linear, good for local structure, slower",
        "UMAP": "Balanced speed/quality, preserves local & global structure",
        "Spectral Embedding": "Uses graph theory, good for non-convex clusters",
        "Force-Directed": "Physics-based layout, creates natural spacing",
    },
    "clustering": {
        "HDBSCAN": "Density-based, finds variable density clusters, handles noise",
        "Spectral Clustering": "Uses eigenvalues, good for non-convex shapes",
        "Gaussian Mixture": "Probabilistic, assumes gaussian distributions",
        "Agglomerative (Ward)": "Hierarchical, minimizes within-cluster variance",
        "Agglomerative (Complete)": "Hierarchical, minimizes maximum distance",
        "OPTICS": "Density-based, finds clusters of varying densities",
    },
    "separation": {
        "Spread Factor": "Applies repulsive forces between nearby points",
        "Smart Jittering": "Adds intelligent noise to separate overlapping points",
        "Density-Based Jittering": "Stronger separation in crowded areas",
        "Perplexity Factor": "Controls t-SNE's focus on local vs global structure",
        "Min Distance Factor": "Controls UMAP's point packing tightness",
    },
    "metrics": {
        "Silhouette Score": "Higher is better (range: -1 to 1)",
        "Calinski-Harabasz": "Higher is better, measures cluster separation",
    },
}

View File

@@ -0,0 +1,86 @@
"""
Data loading and parsing utilities for Discord chat logs.
"""
import pandas as pd
import numpy as np
import streamlit as st
import ast
from pathlib import Path
from config import CHAT_LOGS_PATH
@st.cache_data
def load_all_chat_data():
    """
    Read every ``*.csv`` file under ``CHAT_LOGS_PATH`` into one DataFrame.

    Each file's rows are tagged with a ``source_file`` column holding the
    file stem. Progress and errors are reported inside a collapsed
    Streamlit expander; a file that fails to load is reported and skipped.

    Returns:
        pandas.DataFrame: All rows concatenated with a fresh index, or an
        empty DataFrame when no file could be loaded.
    """
    logs_dir = Path(CHAT_LOGS_PATH)
    frames = []

    with st.expander("📁 Loading Details", expanded=False):
        # Show the resolved location to help diagnose path problems
        st.write(f"Looking for CSV files in: {logs_dir}")
        st.write(f"Path exists: {logs_dir.exists()}")

        for csv_path in logs_dir.glob("*.csv"):
            try:
                frame = pd.read_csv(csv_path)
                frame['source_file'] = csv_path.stem  # remember where each row came from
                frames.append(frame)
                st.write(f"✅ Loaded {len(frame)} messages from {csv_path.name}")
            except Exception as e:
                st.error(f"❌ Error loading {csv_path.name}: {e}")

        if not frames:
            st.error("No data loaded!")
            return pd.DataFrame()

        merged = pd.concat(frames, ignore_index=True)
        st.success(f"🎉 Successfully loaded {len(merged)} total messages from {len(frames)} files")
        return merged
@st.cache_data
def parse_embeddings(df):
    """
    Convert the ``content_embedding`` column from text to a numpy array.

    Each cell is parsed with ``ast.literal_eval``; rows whose value cannot be
    parsed, or does not parse to a non-empty list, are dropped.

    Args:
        df: DataFrame with a ``content_embedding`` column.

    Returns:
        tuple: (embeddings_array, valid_df) where ``valid_df`` is a copy of
        the rows that produced an embedding, in the same order as the array.
    """
    vectors = []
    kept_positions = []

    for position, raw_value in enumerate(df['content_embedding']):
        try:
            # Cells hold the string representation of a list
            parsed = ast.literal_eval(raw_value)
        except Exception:
            # Best effort: unparsable cells are simply skipped
            continue
        if isinstance(parsed, list) and len(parsed) > 0:
            vectors.append(parsed)
            kept_positions.append(position)

    embeddings_array = np.array(vectors)
    valid_df = df.iloc[kept_positions].copy()

    st.info(f"📊 Parsed {len(vectors)} valid embeddings from {len(df)} messages")
    dimension = embeddings_array.shape[1] if len(vectors) > 0 else 0
    st.info(f"🔢 Embedding dimension: {dimension}")

    return embeddings_array, valid_df
def filter_data(df, selected_sources, selected_authors):
    """
    Filter the dataframe by source file and author.

    Args:
        df: DataFrame with ``source_file`` and ``author_name`` columns.
        selected_sources: List-like of source names to keep. ``None`` or an
            empty selection means "all sources".
        selected_authors: List-like of author names to keep. An empty
            selection keeps no rows.

    Returns:
        pandas.DataFrame: Rows of ``df`` matching both filters (original index kept).
    """
    # Explicit emptiness test: truthiness is ambiguous for numpy arrays and
    # pandas Index objects, which callers may pass (e.g. from ``unique()``).
    if selected_sources is None or len(selected_sources) == 0:
        selected_sources = df['source_file'].unique()

    source_match = df['source_file'].isin(selected_sources)
    author_match = df['author_name'].isin(selected_authors)
    return df[source_match & author_match]
def get_filtered_embeddings(embeddings, valid_df, filtered_df):
    """
    Select the embedding rows that belong to the filtered dataframe.

    Row ``i`` of ``embeddings`` corresponds to row ``i`` of ``valid_df``; a
    row is kept when its ``valid_df`` index label also appears in
    ``filtered_df``'s index.

    Args:
        embeddings: Array with one row per row of ``valid_df``.
        valid_df: DataFrame aligned positionally with ``embeddings``.
        filtered_df: Subset of ``valid_df`` (same index labels).

    Returns:
        numpy.ndarray: The matching rows, in ``valid_df`` order.
    """
    # Hashed membership test instead of scanning a Python list once per row.
    keep_mask = valid_df.index.isin(filtered_df.index)
    return np.asarray(embeddings)[keep_mask]

View File

@@ -0,0 +1,211 @@
"""
Dimensionality reduction algorithms and point separation techniques.
"""
import numpy as np
import streamlit as st
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, SpectralEmbedding
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import pdist, squareform
from scipy.optimize import minimize
import umap
from config import DEFAULT_RANDOM_STATE
def apply_adaptive_spreading(embeddings, spread_factor=1.0):
    """
    Push nearby points apart with a few rounds of pairwise repulsion.

    Every pair of distinct-position points repels along the line joining them
    with magnitude ``spread_factor / (dist**2 + 0.01)``; the summed forces are
    applied with a damping factor of 0.1 on each iteration.

    Args:
        embeddings: Array of shape (n_points, n_dims).
        spread_factor: Repulsion strength. Values <= 0 disable spreading and
            return the input object unchanged.

    Returns:
        numpy.ndarray: A spread copy of the input. Inputs with fewer than 2
        points or more than 1000 points are returned as an unmodified copy.
    """
    if spread_factor <= 0:
        return embeddings

    embeddings = np.array(embeddings)  # np.array copies, so the caller's data is untouched
    if not np.issubdtype(embeddings.dtype, np.floating):
        # The in-place update below needs a floating-point buffer.
        embeddings = embeddings.astype(float)

    n_points = len(embeddings)
    print(f"DEBUG: Applying adaptive spreading to {n_points} points with factor {spread_factor}")

    if n_points < 2:
        return embeddings

    # For very large datasets, skip spreading to avoid hanging
    if n_points > 1000:
        print(f"DEBUG: Large dataset ({n_points} points), skipping adaptive spreading...")
        return embeddings

    # Fewer iterations for larger datasets
    max_iterations = 3 if n_points > 500 else 5

    for iteration in range(max_iterations):
        if iteration % 2 == 0:  # Progress indicator
            print(f"DEBUG: Spreading iteration {iteration + 1}/{max_iterations}")

        # Pairwise distances for the current positions (n_points x n_points)
        distances = squareform(pdist(embeddings))

        # Per-pair coefficient: force magnitude divided by distance, so that
        # multiplying by (p_i - p_j) yields magnitude * unit direction.
        with np.errstate(divide="ignore", invalid="ignore"):
            coeff = spread_factor / ((distances ** 2 + 0.01) * distances)
        # Coincident points (and the diagonal) have no direction: no force.
        coeff[distances == 0] = 0.0

        # sum_j coeff_ij * (p_i - p_j), written without an n x n x dims tensor
        forces = coeff.sum(axis=1)[:, None] * embeddings - coeff @ embeddings

        # Apply forces with damping
        embeddings += forces * 0.1

    print("DEBUG: Adaptive spreading complete")
    return embeddings
def force_directed_layout(high_dim_embeddings, n_components=2, spread_factor=1.0):
    """
    Lay out high-dimensional embeddings as a PCA projection followed by
    adaptive spreading.

    Both the regular path and the large-dataset path (more than 500 points)
    perform the same computation; they differ only in the debug messages
    printed.

    Args:
        high_dim_embeddings: Array of shape (n_points, n_features).
        n_components: Number of output dimensions.
        spread_factor: Strength passed to ``apply_adaptive_spreading``.

    Returns:
        The spread low-dimensional layout.
    """
    n_points = len(high_dim_embeddings)
    print(f"DEBUG: Starting force-directed layout with {n_points} points...")

    is_large = n_points > 500
    if is_large:
        print(f"DEBUG: Large dataset ({n_points} points), using PCA + spreading instead...")

    # PCA gives the initial (and, in practice, the only) projection; a full
    # force optimization was judged too expensive by the original author.
    projection = PCA(n_components=n_components, random_state=DEFAULT_RANDOM_STATE)
    layout = projection.fit_transform(high_dim_embeddings)
    if not is_large:
        print("DEBUG: Initial PCA layout computed...")

    spread_layout = apply_adaptive_spreading(layout, spread_factor)
    if not is_large:
        print("DEBUG: Force-directed layout complete...")
    return spread_layout
def calculate_local_density_scaling(embeddings, k=5):
    """
    Compute a normalized local-density score for every point.

    Density is the inverse of the mean distance to the ``k`` nearest
    neighbours, min-max scaled over all points.

    Args:
        embeddings: Array of shape (n_points, n_dims).
        k: Number of neighbours (excluding the point itself) to average over.

    Returns:
        numpy.ndarray: One score per point. When there are not enough points
        to find ``k`` neighbours besides the point itself (``n_points <= k``),
        an array of ones is returned.
    """
    n_points = len(embeddings)
    # The query below asks for k + 1 neighbours (the point itself included),
    # so at least k + 1 samples are required.
    if n_points <= k:
        return np.ones(n_points)

    # Find k nearest neighbors for each point (+1: first neighbor is the point itself)
    nn = NearestNeighbors(n_neighbors=k + 1)
    nn.fit(embeddings)
    distances, _ = nn.kneighbors(embeddings)

    # Local density: inverse of the average distance to the k nearest neighbors
    local_densities = 1.0 / (np.mean(distances[:, 1:], axis=1) + 1e-6)

    # Min-max normalize; the epsilon guards against a zero range
    lowest = np.min(local_densities)
    highest = np.max(local_densities)
    return (local_densities - lowest) / (highest - lowest + 1e-6)
def apply_density_based_jittering(embeddings, density_scaling=True, jitter_strength=0.1):
    """
    Add Gaussian noise to the points to separate overlapping ones.

    Args:
        embeddings: Array of shape (n_points, n_dims).
        density_scaling: When True, each point's noise standard deviation is
            ``jitter_strength * (1 + density)`` with the density taken from
            ``calculate_local_density_scaling``; when False, all points use
            ``jitter_strength``.
        jitter_strength: Base standard deviation of the noise.

    Returns:
        A new array with the jittered coordinates. Noise is drawn from the
        global ``np.random`` state.
    """
    if density_scaling:
        densities = calculate_local_density_scaling(embeddings)
        jittered = embeddings.copy()
        for row in range(len(embeddings)):
            # Denser neighbourhoods receive proportionally more jitter
            scale = jitter_strength * (1 + densities[row])
            jittered[row] += np.random.normal(0, scale, embeddings.shape[1])
        return jittered

    # Uniform jitter: same noise scale for every point
    return embeddings + np.random.normal(0, jitter_strength, embeddings.shape)
def reduce_dimensions(embeddings, method="PCA", n_components=2, spread_factor=1.0,
                      perplexity_factor=1.0, min_dist_factor=1.0):
    """
    Apply dimensionality reduction with enhanced point separation.

    Args:
        embeddings: Array-like of shape (n_points, n_features).
        method: "PCA", "t-SNE", "UMAP", "Spectral Embedding" or
            "Force-Directed". Any other value falls back to PCA.
        n_components: Number of output dimensions.
        spread_factor: Separation strength. Used for adaptive spreading
            (PCA, Spectral Embedding, Force-Directed), scales t-SNE's early
            exaggeration and sets UMAP's ``spread``.
        perplexity_factor: Multiplier on the base t-SNE perplexity.
        min_dist_factor: Multiplier on UMAP's base ``min_dist`` of 0.1.

    Returns:
        numpy.ndarray: Reduced embeddings of shape (n_points, n_components).
    """
    # Convert to numpy array if it's not already
    embeddings = np.array(embeddings)
    n_samples = len(embeddings)

    print(f"DEBUG: Starting {method} with {n_samples} embeddings, shape: {embeddings.shape}")

    # Standardize embeddings for better processing
    scaled_embeddings = StandardScaler().fit_transform(embeddings)
    print("DEBUG: Embeddings standardized")

    if method == "PCA":
        print("DEBUG: Applying PCA...")
        reducer = PCA(n_components=n_components, random_state=DEFAULT_RANDOM_STATE)
        reduced_embeddings = reducer.fit_transform(scaled_embeddings)
        # Apply spreading to PCA results
        print("DEBUG: Applying spreading...")
        reduced_embeddings = apply_adaptive_spreading(reduced_embeddings, spread_factor)

    elif method == "t-SNE":
        # Adjust perplexity based on user preference and data size.
        base_perplexity = min(30, n_samples - 1)
        # t-SNE requires perplexity < n_samples, so the floor of 5 must not
        # be allowed to exceed what a tiny dataset can support.
        max_perplexity = max(1, min(50, n_samples - 1))
        adjusted_perplexity = min(max_perplexity,
                                  max(5, int(base_perplexity * perplexity_factor)))
        print(f"DEBUG: Applying t-SNE with perplexity {adjusted_perplexity}...")

        # The iteration count is left at the library default (1000, the value
        # previously passed explicitly): the keyword was renamed from
        # ``n_iter`` to ``max_iter`` in newer scikit-learn releases, so
        # omitting it works on both.
        reducer = TSNE(n_components=n_components, random_state=DEFAULT_RANDOM_STATE,
                       perplexity=adjusted_perplexity,
                       early_exaggeration=12.0 * spread_factor,  # more exaggeration, more separation
                       learning_rate='auto')
        reduced_embeddings = reducer.fit_transform(scaled_embeddings)

    elif method == "UMAP":
        # Adjust UMAP parameters for better local separation
        n_neighbors = min(15, n_samples - 1)
        min_dist = 0.1 * min_dist_factor
        spread = 1.0 * spread_factor
        print(f"DEBUG: Applying UMAP with n_neighbors={n_neighbors}, min_dist={min_dist}...")

        reducer = umap.UMAP(n_components=n_components, random_state=DEFAULT_RANDOM_STATE,
                            n_neighbors=n_neighbors, min_dist=min_dist,
                            spread=spread, local_connectivity=2.0)
        reduced_embeddings = reducer.fit_transform(scaled_embeddings)

    elif method == "Spectral Embedding":
        n_neighbors = min(10, n_samples - 1)
        print(f"DEBUG: Applying Spectral Embedding with n_neighbors={n_neighbors}...")
        reducer = SpectralEmbedding(n_components=n_components, random_state=DEFAULT_RANDOM_STATE,
                                    n_neighbors=n_neighbors)
        reduced_embeddings = reducer.fit_transform(scaled_embeddings)
        # Apply spreading to spectral results
        print("DEBUG: Applying spreading...")
        reduced_embeddings = apply_adaptive_spreading(reduced_embeddings, spread_factor)

    elif method == "Force-Directed":
        # Force-directed layout for natural spreading
        print("DEBUG: Applying Force-Directed layout...")
        reduced_embeddings = force_directed_layout(scaled_embeddings, n_components, spread_factor)

    else:
        # Fallback to PCA
        print(f"DEBUG: Unknown method {method}, falling back to PCA...")
        reducer = PCA(n_components=n_components, random_state=DEFAULT_RANDOM_STATE)
        reduced_embeddings = reducer.fit_transform(scaled_embeddings)
        reduced_embeddings = apply_adaptive_spreading(reduced_embeddings, spread_factor)

    print(f"DEBUG: Dimensionality reduction complete. Output shape: {reduced_embeddings.shape}")
    return reduced_embeddings

132
apps/cluster_map/main.py Normal file
View File

@@ -0,0 +1,132 @@
"""
Main application logic for the Discord Chat Embeddings Visualizer.
"""
import streamlit as st
import warnings
warnings.filterwarnings('ignore')
# Import custom modules
from ui_components import (
setup_page_config, display_title_and_description, get_all_ui_parameters,
display_performance_warnings
)
from data_loader import (
load_all_chat_data, parse_embeddings, filter_data, get_filtered_embeddings
)
from dimensionality_reduction import (
reduce_dimensions, apply_density_based_jittering
)
from clustering import apply_clustering
from visualization import (
create_visualization_plot, display_clustering_metrics, display_summary_stats,
display_clustering_results, display_data_table
)
def main():
    """
    Run the visualizer end to end.

    Steps: page setup, data loading, embedding parsing, sidebar parameters,
    filtering, dimensionality reduction, clustering, optional jittering, and
    rendering of the plot, statistics, clustering results and data table.
    Execution stops early (``st.stop``) when there is no usable data.
    """
    # Page chrome
    setup_page_config()
    display_title_and_description()

    # Load data
    with st.spinner("Loading chat data..."):
        chat_df = load_all_chat_data()
    if chat_df.empty:
        st.error("No data could be loaded. Please check the data directory.")
        st.stop()

    # Parse embeddings
    with st.spinner("Parsing embeddings..."):
        embeddings, valid_df = parse_embeddings(chat_df)
    if len(embeddings) == 0:
        st.error("No valid embeddings found!")
        st.stop()

    # Collect all sidebar settings
    params = get_all_ui_parameters(valid_df)
    method = params['method']
    clustering_method = params['clustering_method']
    selected_sources = params['selected_sources']

    # Filter data
    filtered_df = filter_data(valid_df, selected_sources, params['selected_authors'])
    if filtered_df.empty:
        st.warning("No data matches the current filters!")
        st.stop()

    # Warn about slow method/dataset combinations
    display_performance_warnings(filtered_df, method, clustering_method)

    # Embeddings matching the filtered rows
    filtered_embeddings = get_filtered_embeddings(embeddings, valid_df, filtered_df)
    st.info(f"📈 Visualizing {len(filtered_df)} messages")

    # Reduce dimensions
    with st.spinner(f"Reducing dimensions using {method}..."):
        reduced_embeddings = reduce_dimensions(
            filtered_embeddings,
            method=method,
            spread_factor=params['spread_factor'],
            perplexity_factor=params['perplexity_factor'],
            min_dist_factor=params['min_dist_factor'],
        )

    # Cluster in the original (high-dimensional) embedding space
    with st.spinner(f"Applying {clustering_method}..."):
        cluster_labels, silhouette_avg, calinski_harabasz = apply_clustering(
            filtered_embeddings,
            clustering_method=clustering_method,
            n_clusters=params['n_clusters'],
        )

    # Optional jittering of the 2D coordinates
    if params['apply_jittering']:
        with st.spinner("Applying smart jittering to separate overlapping points..."):
            reduced_embeddings = apply_density_based_jittering(
                reduced_embeddings,
                density_scaling=params['density_based_jitter'],
                jitter_strength=params['jitter_strength'],
            )

    # Clustering metrics
    display_clustering_metrics(
        cluster_labels, silhouette_avg, calinski_harabasz,
        params['show_cluster_metrics'],
    )

    # Main plot; an empty source selection is passed on as None
    fig = create_visualization_plot(
        reduced_embeddings=reduced_embeddings,
        filtered_df=filtered_df,
        cluster_labels=cluster_labels,
        selected_sources=selected_sources or None,
        method=method,
        clustering_method=clustering_method,
        point_size=params['point_size'],
        point_opacity=params['point_opacity'],
        density_based_sizing=params['density_based_sizing'],
        size_variation=params['size_variation'],
    )
    st.plotly_chart(fig, use_container_width=True)

    # Summary statistics (fall back to the sources present in the data)
    display_summary_stats(filtered_df, selected_sources or filtered_df['source_file'].unique())

    # Clustering results and export options
    display_clustering_results(
        filtered_df, cluster_labels, reduced_embeddings,
        method, clustering_method,
    )

    # Data table
    display_data_table(filtered_df, cluster_labels)


if __name__ == "__main__":
    main()

View File

@@ -1,233 +0,0 @@
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import json
import os
from pathlib import Path
import ast
# Page configuration (done at import time, before any other Streamlit call below)
st.set_page_config(
    page_title="Discord Chat Embeddings Visualizer",
    page_icon="🗨️",
    layout="wide",
)

# Page heading and one-line description
st.title("🗨️ Discord Chat Embeddings Visualizer")
st.markdown("Explore Discord chat messages through their vector embeddings in 2D space")
@st.cache_data
def load_all_chat_data():
    """
    Read every ``*.csv`` file in the ``discord_chat_logs`` folder.

    Rows from each file get a ``source_file`` column with the file stem.
    Progress and errors are written to the Streamlit page; a file that fails
    to load is reported and skipped.

    Returns:
        pandas.DataFrame: All rows concatenated with a fresh index, or an
        empty DataFrame when nothing could be loaded.
    """
    logs_dir = Path("../../discord_chat_logs")

    # Show the location to help diagnose path problems
    st.write(f"Looking for CSV files in: {logs_dir}")
    st.write(f"Path exists: {logs_dir.exists()}")

    frames = []
    for csv_path in logs_dir.glob("*.csv"):
        try:
            frame = pd.read_csv(csv_path)
            frame['source_file'] = csv_path.stem  # remember where each row came from
            frames.append(frame)
            st.write(f"✅ Loaded {len(frame)} messages from {csv_path.name}")
        except Exception as e:
            st.error(f"❌ Error loading {csv_path.name}: {e}")

    if not frames:
        st.error("No data loaded!")
        return pd.DataFrame()

    merged = pd.concat(frames, ignore_index=True)
    st.success(f"🎉 Successfully loaded {len(merged)} total messages from {len(frames)} files")
    return merged
@st.cache_data
def parse_embeddings(df):
    """
    Turn the textual ``content_embedding`` column into a numpy array.

    Cells are parsed with ``ast.literal_eval``; anything that fails to parse
    or is not a non-empty list is skipped.

    Args:
        df: DataFrame with a ``content_embedding`` column.

    Returns:
        tuple: (embeddings_array, valid_df), where ``valid_df`` is a copy of
        the rows that yielded an embedding, in array order.
    """
    parsed_rows = []
    for position, cell in enumerate(df['content_embedding']):
        try:
            # The cell is the string form of a Python list
            candidate = ast.literal_eval(cell)
        except Exception:
            # Best effort: unparsable cells are skipped
            continue
        if isinstance(candidate, list) and len(candidate) > 0:
            parsed_rows.append((position, candidate))

    valid_indices = [position for position, _ in parsed_rows]
    embeddings = [vector for _, vector in parsed_rows]

    embeddings_array = np.array(embeddings)
    valid_df = df.iloc[valid_indices].copy()

    st.info(f"📊 Parsed {len(embeddings)} valid embeddings from {len(df)} messages")
    dimension = embeddings_array.shape[1] if len(embeddings) > 0 else 0
    st.info(f"🔢 Embedding dimension: {dimension}")

    return embeddings_array, valid_df
@st.cache_data
def reduce_dimensions(embeddings, method="PCA", n_components=2):
    """
    Reduce embeddings to ``n_components`` dimensions using PCA or t-SNE.

    Args:
        embeddings: Array of shape (n_points, n_features).
        method: "PCA" or "t-SNE".
        n_components: Number of output dimensions.

    Returns:
        numpy.ndarray: The reduced embeddings.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method == "PCA":
        reducer = PCA(n_components=n_components, random_state=42)
    elif method == "t-SNE":
        # Perplexity is capped at n_points - 1
        reducer = TSNE(n_components=n_components, random_state=42,
                       perplexity=min(30, len(embeddings) - 1))
    else:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError(f"Unsupported dimension reduction method: {method!r}")

    return reducer.fit_transform(embeddings)
def create_hover_text(df):
    """
    Build one HTML hover label per row for the Plotly scatter.

    Args:
        df: DataFrame with ``author_name``, ``timestamp_utc``, ``source_file``
            and ``content`` columns.

    Returns:
        list[str]: Labels in row order. Content is cut to 200 characters
        (with ``...`` appended when truncated); missing content is shown as
        ``[No content]``.
    """
    def _label(row):
        header = (
            f"<b>Author:</b> {row['author_name']}<br>"
            f"<b>Timestamp:</b> {row['timestamp_utc']}<br>"
            f"<b>Source:</b> {row['source_file']}<br>"
        )
        # Content may be NaN or a non-string value
        raw = row['content']
        if pd.isna(raw) or raw is None:
            preview = "[No content]"
        else:
            as_text = str(raw)
            suffix = '...' if len(as_text) > 200 else ''
            preview = as_text[:200] + suffix
        return header + f"<b>Content:</b> {preview}"

    return [_label(row) for _, row in df.iterrows()]
def _truncate_for_display(value, limit=100):
    """Return ``value`` as text cut to ``limit`` characters, with '...' only when cut; missing values pass through."""
    if pd.isna(value):
        return value
    text = str(value)
    return text[:limit] + ('...' if len(text) > limit else '')


def main():
    """
    Run the visualizer: load the chat logs, parse embeddings, read the
    sidebar filters, project the embeddings to 2D and render the scatter
    plot, summary metrics and optional data table.

    Execution stops early (``st.stop``) when there is no data, no valid
    embedding, or nothing matches the filters.
    """
    # Load data
    with st.spinner("Loading chat data..."):
        df = load_all_chat_data()
    if df.empty:
        st.stop()

    # Parse embeddings
    with st.spinner("Parsing embeddings..."):
        embeddings, valid_df = parse_embeddings(df)
    if len(embeddings) == 0:
        st.error("No valid embeddings found!")
        st.stop()

    # Sidebar controls
    st.sidebar.header("🎛️ Visualization Controls")

    # Dimension reduction method
    method = st.sidebar.selectbox(
        "Dimension Reduction Method",
        ["PCA", "t-SNE"],
        help="PCA is faster, t-SNE may reveal better clusters"
    )

    # Source file filter
    source_files = valid_df['source_file'].unique()
    selected_sources = st.sidebar.multiselect(
        "Filter by Source Files",
        source_files,
        default=source_files,
        help="Select which chat log files to include"
    )

    # Author filter
    authors = valid_df['author_name'].unique()
    selected_authors = st.sidebar.multiselect(
        "Filter by Authors",
        authors,
        default=authors[:10] if len(authors) > 10 else authors,  # Limit to first 10 for performance
        help="Select which authors to include"
    )

    # Filter data
    filtered_df = valid_df[
        (valid_df['source_file'].isin(selected_sources)) &
        (valid_df['author_name'].isin(selected_authors))
    ]
    if filtered_df.empty:
        st.warning("No data matches the current filters!")
        st.stop()

    # Embeddings for the rows that survived filtering (rows of ``embeddings``
    # are aligned positionally with ``valid_df``)
    filtered_indices = filtered_df.index.tolist()
    filtered_embeddings = embeddings[[i for i, idx in enumerate(valid_df.index) if idx in filtered_indices]]

    st.info(f"📈 Visualizing {len(filtered_df)} messages")

    # Reduce dimensions
    with st.spinner(f"Reducing dimensions using {method}..."):
        reduced_embeddings = reduce_dimensions(filtered_embeddings, method)

    # Hover labels, one per filtered row
    hover_text = create_hover_text(filtered_df)

    # One scatter trace per source file, colored from a qualitative palette
    fig = go.Figure()
    colors = px.colors.qualitative.Set1
    for i, source in enumerate(selected_sources):
        source_mask = filtered_df['source_file'] == source
        if not source_mask.any():
            continue

        source_embeddings = reduced_embeddings[source_mask]
        source_hover = [text for text, keep in zip(hover_text, source_mask) if keep]

        fig.add_trace(go.Scatter(
            x=source_embeddings[:, 0],
            y=source_embeddings[:, 1],
            mode='markers',
            name=source,
            marker=dict(
                size=8,
                color=colors[i % len(colors)],  # cycle when there are more sources than colors
                opacity=0.7,
                line=dict(width=1, color='white')
            ),
            hovertemplate='%{hovertext}<extra></extra>',
            hovertext=source_hover
        ))

    fig.update_layout(
        title=f"Discord Chat Messages - {method} Visualization",
        xaxis_title=f"{method} Component 1",
        yaxis_title=f"{method} Component 2",
        hovermode='closest',
        width=1000,
        height=700
    )

    # Display the plot
    st.plotly_chart(fig, use_container_width=True)

    # Statistics
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Messages", len(filtered_df))
    with col2:
        st.metric("Unique Authors", filtered_df['author_name'].nunique())
    with col3:
        st.metric("Source Files", len(selected_sources))

    # Show data table
    if st.checkbox("Show Data Table"):
        st.subheader("📋 Message Data")
        display_df = filtered_df[['timestamp_utc', 'author_name', 'source_file', 'content']].copy()
        # Truncate for display; the ellipsis is added only when text was cut,
        # and non-string values are tolerated.
        display_df['content'] = display_df['content'].map(_truncate_for_display)
        st.dataframe(display_df, use_container_width=True)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,43 @@
#!/usr/bin/env python3
"""
Test script to debug the hanging issue in the modular app
"""
import numpy as np
import sys
import os
# Make sibling modules importable regardless of the working directory
sys.path.append(os.path.dirname(os.path.abspath(__file__)))


def test_dimensionality_reduction():
    """
    Smoke-test ``reduce_dimensions`` with PCA and UMAP on random data.

    Uses a 796 x 384 random matrix and prints a success or failure line per
    method; exceptions are reported rather than raised.
    """
    print("Testing dimensionality reduction functions...")

    from dimensionality_reduction import reduce_dimensions

    # Synthetic data sized like the dataset that exposed the hang
    n_samples = 796  # Same as the user's dataset
    n_features = 384  # Common embedding dimension
    print(f"Creating test embeddings: {n_samples} x {n_features}")
    test_embeddings = np.random.randn(n_samples, n_features)

    # PCA should be fast; UMAP might be slower
    for method in ("PCA", "UMAP"):
        print(f"Testing {method}...")
        try:
            result = reduce_dimensions(test_embeddings, method=method)
            print(f"✓ {method} successful, output shape: {result.shape}")
        except Exception as e:
            print(f"✗ {method} failed: {e}")


if __name__ == "__main__":
    test_dimensionality_reduction()

View File

@@ -0,0 +1,236 @@
"""
Streamlit UI components and controls for the Discord Chat Embeddings Visualizer.
"""
import streamlit as st
import numpy as np
from config import (
APP_TITLE, APP_ICON, APP_LAYOUT, METHOD_EXPLANATIONS,
CLUSTERING_METHODS_REQUIRING_N_CLUSTERS, COMPUTATIONALLY_INTENSIVE_METHODS,
LARGE_DATASET_WARNING_THRESHOLD, MAX_DISPLAYED_AUTHORS
)
def setup_page_config():
    """Configure the Streamlit page (title, icon, layout) from the config constants."""
    page_settings = {
        "page_title": APP_TITLE,
        "page_icon": APP_ICON,
        "layout": APP_LAYOUT,
    }
    st.set_page_config(**page_settings)
def display_title_and_description():
    """Render the page heading (icon + title) and the one-line description."""
    heading = f"{APP_ICON} {APP_TITLE}"
    st.title(heading)
    st.markdown("Explore Discord chat messages through their vector embeddings in 2D space")
def create_method_controls():
    """
    Render the sidebar selectors for the reduction and clustering methods.

    Returns:
        tuple: (method, clustering_method) as selected by the user.
    """
    reduction_choices = ["PCA", "t-SNE", "UMAP", "Spectral Embedding", "Force-Directed"]
    clustering_choices = [
        "None", "HDBSCAN", "Spectral Clustering", "Gaussian Mixture",
        "Agglomerative (Ward)", "Agglomerative (Complete)", "OPTICS",
    ]

    st.sidebar.header("🎛️ Visualization Controls")

    # Dimension reduction method
    method = st.sidebar.selectbox(
        "Dimension Reduction Method",
        reduction_choices,
        help="PCA is fastest, UMAP balances speed and quality, t-SNE and Spectral are slower but may reveal better structures. Force-Directed creates natural spacing."
    )

    # Clustering method
    clustering_method = st.sidebar.selectbox(
        "Clustering Method",
        clustering_choices,
        help="Apply clustering to identify groups. HDBSCAN and OPTICS can find variable density clusters."
    )

    return method, clustering_method
def create_clustering_controls(clustering_method):
    """
    Return the number of clusters to use.

    A sidebar slider (2-15, default 5) is shown only for methods listed in
    ``CLUSTERING_METHODS_REQUIRING_N_CLUSTERS``; other methods get 5.
    """
    if clustering_method not in CLUSTERING_METHODS_REQUIRING_N_CLUSTERS:
        return 5
    return st.sidebar.slider("Number of Clusters", 2, 15, 5)
def create_separation_controls(method):
    """
    Render the point-separation sliders for the chosen reduction method.

    The spread slider is always shown; the perplexity slider appears only for
    t-SNE and the min-distance slider only for UMAP. Hidden factors are 1.0.

    Returns:
        tuple: (spread_factor, perplexity_factor, min_dist_factor)
    """
    st.sidebar.subheader("🎯 Point Separation Controls")

    spread_factor = st.sidebar.slider(
        "Spread Factor",
        0.5, 3.0, 1.0, 0.1,
        help="Increase to spread apart nearby points. Higher values create more separation."
    )

    # Method-specific factors: neutral unless the matching slider is shown
    if method == "t-SNE":
        perplexity_factor = st.sidebar.slider(
            "Perplexity Factor",
            0.5, 2.0, 1.0, 0.1,
            help="Affects local vs global structure balance. Lower values focus on local details."
        )
    else:
        perplexity_factor = 1.0

    if method == "UMAP":
        min_dist_factor = st.sidebar.slider(
            "Min Distance Factor",
            0.1, 2.0, 1.0, 0.1,
            help="Controls how tightly points are packed. Lower values create tighter clusters."
        )
    else:
        min_dist_factor = 1.0

    return spread_factor, perplexity_factor, min_dist_factor
def create_jittering_controls():
    """
    Render the jittering options in the sidebar.

    The strength slider and density checkbox are shown only when jittering is
    enabled; otherwise the values 0.1 and True are returned for them.

    Returns:
        tuple: (apply_jittering, jitter_strength, density_based_jitter)
    """
    apply_jittering = st.sidebar.checkbox(
        "Apply Smart Jittering",
        value=False,
        help="Add intelligent noise to separate overlapping points"
    )

    if not apply_jittering:
        # Detail controls stay hidden; hand back their default values
        return apply_jittering, 0.1, True

    jitter_strength = st.sidebar.slider(
        "Jitter Strength",
        0.01, 0.5, 0.1, 0.01,
        help="Strength of jittering. Higher values spread points more."
    )
    density_based_jitter = st.sidebar.checkbox(
        "Density-Based Jittering",
        value=True,
        help="Apply stronger jittering in dense regions"
    )
    return apply_jittering, jitter_strength, density_based_jitter
def create_advanced_options():
    """
    Render the "Advanced Options" sidebar expander.

    The size-variation slider is shown only when density-based sizing is
    enabled; otherwise 2.0 is returned for it.

    Returns:
        tuple: (show_cluster_metrics, point_size, point_opacity,
        density_based_sizing, size_variation)
    """
    with st.sidebar.expander("⚙️ Advanced Options"):
        show_cluster_metrics = st.checkbox("Show Clustering Metrics", value=True)
        point_size = st.slider("Point Size", 4, 15, 8)
        point_opacity = st.slider("Point Opacity", 0.3, 1.0, 0.7)

        # Density-based visualization
        density_based_sizing = st.checkbox(
            "Density-Based Point Sizing",
            value=False,
            help="Make points larger in sparse regions, smaller in dense regions"
        )

        if density_based_sizing:
            size_variation = st.slider(
                "Size Variation Factor",
                1.5, 4.0, 2.0, 0.1,
                help="How much point sizes vary based on local density"
            )
        else:
            size_variation = 2.0

    return (
        show_cluster_metrics,
        point_size,
        point_opacity,
        density_based_sizing,
        size_variation,
    )
def create_filter_controls(valid_df):
    """Render the sidebar multiselects for source-file and author filtering.

    Args:
        valid_df: DataFrame providing the ``source_file`` and ``author_name``
            columns whose unique values become the selectable options.

    Returns:
        tuple: (selected_sources, selected_authors) as returned by the two
        multiselect widgets.
    """
    # Source files: nothing is pre-selected.
    source_options = valid_df['source_file'].unique()
    chosen_sources = st.sidebar.multiselect(
        "Filter by Source Files",
        source_options,
        default=[],
        help="Select which chat log files to include"
    )

    # Authors: pre-select at most MAX_DISPLAYED_AUTHORS of them. A slice
    # already yields every author when there are fewer than the cap.
    author_options = valid_df['author_name'].unique()
    preselected = author_options[:MAX_DISPLAYED_AUTHORS]
    chosen_authors = st.sidebar.multiselect(
        "Filter by Authors",
        author_options,
        default=preselected,
        help="Select which authors to include"
    )

    return chosen_sources, chosen_authors
def display_method_explanations():
    """Render a sidebar expander listing the METHOD_EXPLANATIONS entries.

    Each section prints a heading followed by one bullet per
    (name, explanation) pair found under the matching METHOD_EXPLANATIONS key.
    """
    # (heading markdown, METHOD_EXPLANATIONS key), in display order.
    sections = (
        ("**Dimensionality Reduction:**", "dimension_reduction"),
        ("\n**Clustering Methods:**", "clustering"),
        ("\n**Separation Techniques:**", "separation"),
        ("\n**Metrics:**", "metrics"),
    )

    st.sidebar.markdown("---")
    with st.sidebar.expander("📚 Method Explanations"):
        for heading, key in sections:
            st.markdown(heading)
            for name, description in METHOD_EXPLANATIONS[key].items():
                st.markdown(f"- **{name}**: {description}")
def display_performance_warnings(filtered_df, method, clustering_method):
    """Warn when a large dataset is combined with an expensive method.

    Nothing is shown unless ``filtered_df`` has more rows than
    LARGE_DATASET_WARNING_THRESHOLD. Above that, one warning is emitted for
    the dimensionality-reduction method and one for the clustering method,
    each only if it is listed in COMPUTATIONALLY_INTENSIVE_METHODS.

    Args:
        filtered_df: Data about to be processed; only its length is used.
        method: Name of the selected dimensionality-reduction method.
        clustering_method: Name of the selected clustering method.
    """
    n_points = len(filtered_df)
    if n_points <= LARGE_DATASET_WARNING_THRESHOLD:
        return

    if method in COMPUTATIONALLY_INTENSIVE_METHODS["dimension_reduction"]:
        st.warning(f"⚠️ {method} with {n_points} points may take several minutes to compute.")

    if clustering_method in COMPUTATIONALLY_INTENSIVE_METHODS["clustering"]:
        st.warning(f"⚠️ {clustering_method} with {n_points} points may be computationally intensive.")
def get_all_ui_parameters(valid_df):
    """Render every sidebar control group and gather the values in one dict.

    The control groups are created in a fixed order: method selection,
    clustering, separation, jittering, advanced options, filters, and finally
    the method explanations.

    Args:
        valid_df: DataFrame handed to the filter controls.

    Returns:
        dict: One entry per UI parameter, keyed by parameter name.
    """
    ui = {}

    # Method selection drives which dependent controls are shown next.
    ui['method'], ui['clustering_method'] = create_method_controls()
    ui['n_clusters'] = create_clustering_controls(ui['clustering_method'])

    (ui['spread_factor'],
     ui['perplexity_factor'],
     ui['min_dist_factor']) = create_separation_controls(ui['method'])

    (ui['apply_jittering'],
     ui['jitter_strength'],
     ui['density_based_jitter']) = create_jittering_controls()

    (ui['show_cluster_metrics'],
     ui['point_size'],
     ui['point_opacity'],
     ui['density_based_sizing'],
     ui['size_variation']) = create_advanced_options()

    ui['selected_sources'], ui['selected_authors'] = create_filter_controls(valid_df)

    # Purely informational; contributes no parameters.
    display_method_explanations()

    return ui

View File

@@ -0,0 +1,225 @@
"""
Visualization functions for creating interactive plots and displays.
"""
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from dimensionality_reduction import calculate_local_density_scaling
from config import MESSAGE_CONTENT_PREVIEW_LENGTH, DEFAULT_POINT_SIZE, DEFAULT_POINT_OPACITY
def create_hover_text(df):
    """Build one HTML hover label per row of ``df`` for the Plotly traces.

    Each label shows author, timestamp, source file and a content preview
    limited to MESSAGE_CONTENT_PREVIEW_LENGTH characters.

    Args:
        df: DataFrame with ``author_name``, ``timestamp_utc``,
            ``source_file`` and ``content`` columns.

    Returns:
        list[str]: Hover labels in row order.
    """
    def _preview(raw):
        # Missing content gets a placeholder; anything else is stringified.
        if pd.isna(raw) or raw is None:
            return "[No content]"
        as_text = str(raw)
        tail = '...' if len(as_text) > MESSAGE_CONTENT_PREVIEW_LENGTH else ''
        return as_text[:MESSAGE_CONTENT_PREVIEW_LENGTH] + tail

    return [
        "<br>".join((
            f"<b>Author:</b> {record['author_name']}",
            f"<b>Timestamp:</b> {record['timestamp_utc']}",
            f"<b>Source:</b> {record['source_file']}",
            f"<b>Content:</b> {_preview(record['content'])}",
        ))
        for _, record in df.iterrows()
    ]
def calculate_point_sizes(reduced_embeddings, density_based_sizing=False,
                          point_size=DEFAULT_POINT_SIZE, size_variation=2.0):
    """Return the marker size to use for every point.

    Args:
        reduced_embeddings: Projected points; one size is produced per entry.
        density_based_sizing: When False, every point gets ``point_size``.
        point_size: Base marker size.
        size_variation: Multiplier applied to ``point_size`` where the
            density value is 0; the multiplier is 1 where the density is 1.

    Returns:
        A list of identical sizes when density sizing is off; otherwise the
        result of scaling ``point_size`` by the inverted local densities.
    """
    if density_based_sizing:
        # NOTE(review): assumes calculate_local_density_scaling returns
        # values normalised to [0, 1] - confirm against its implementation.
        density = calculate_local_density_scaling(reduced_embeddings)
        sparsity = 1.0 - density  # sparse regions -> larger markers
        return point_size * (1.0 + sparsity * (size_variation - 1.0))

    return [point_size] * len(reduced_embeddings)
def create_clustered_plot(reduced_embeddings, filtered_df, cluster_labels, hover_text,
                          point_sizes, point_opacity=DEFAULT_POINT_OPACITY, method="PCA"):
    """Build a scatter figure with one trace per cluster label.

    Args:
        reduced_embeddings: Array whose first two columns are plotted as x/y.
        filtered_df: Not used by this function.
        cluster_labels: Per-point cluster ids; the id -1 is shown as "Noise".
        hover_text: Per-point hover labels.
        point_sizes: Per-point marker sizes.
        point_opacity: Marker opacity for every trace.
        method: Not used by this function.

    Returns:
        The populated ``go.Figure``.
    """
    palette = px.colors.qualitative.Set3 + px.colors.qualitative.Pastel
    figure = go.Figure()

    for color_idx, label in enumerate(np.unique(cluster_labels)):
        members = cluster_labels == label
        if not members.any():
            continue

        # Positions of this cluster's points, reused for hover text and sizes.
        positions = [pos for pos, hit in enumerate(members) if hit]
        points = reduced_embeddings[members]

        figure.add_trace(go.Scatter(
            x=points[:, 0],
            y=points[:, 1],
            mode='markers',
            name="Noise" if label == -1 else f"Cluster {label}",
            marker=dict(
                size=[point_sizes[pos] for pos in positions],
                color=palette[color_idx % len(palette)],  # cycle the palette
                opacity=point_opacity,
                line=dict(width=1, color='white')
            ),
            hovertemplate='%{hovertext}<extra></extra>',
            hovertext=[hover_text[pos] for pos in positions]
        ))

    return figure
def create_source_colored_plot(reduced_embeddings, filtered_df, selected_sources, hover_text,
                               point_sizes, point_opacity=DEFAULT_POINT_OPACITY):
    """Build a scatter figure with one trace per selected source file.

    Args:
        reduced_embeddings: Array whose first two columns are plotted as x/y.
        filtered_df: DataFrame whose ``source_file`` column assigns each point
            to a source.
        selected_sources: Sources to draw; a source with no rows is skipped.
        hover_text: Per-point hover labels.
        point_sizes: Per-point marker sizes.
        point_opacity: Marker opacity for every trace.

    Returns:
        The populated ``go.Figure``.
    """
    palette = px.colors.qualitative.Set1
    figure = go.Figure()

    for color_idx, source in enumerate(selected_sources):
        belongs = filtered_df['source_file'] == source
        if not belongs.any():
            continue

        # Positions of this source's points, reused for hover text and sizes.
        positions = [pos for pos, hit in enumerate(belongs) if hit]
        points = reduced_embeddings[belongs]

        figure.add_trace(go.Scatter(
            x=points[:, 0],
            y=points[:, 1],
            mode='markers',
            name=source,
            marker=dict(
                size=[point_sizes[pos] for pos in positions],
                color=palette[color_idx % len(palette)],  # cycle the palette
                opacity=point_opacity,
                line=dict(width=1, color='white')
            ),
            hovertemplate='%{hovertext}<extra></extra>',
            hovertext=[hover_text[pos] for pos in positions]
        ))

    return figure
def create_visualization_plot(reduced_embeddings, filtered_df, cluster_labels=None,
                              selected_sources=None, method="PCA", clustering_method="None",
                              point_size=DEFAULT_POINT_SIZE, point_opacity=DEFAULT_POINT_OPACITY,
                              density_based_sizing=False, size_variation=2.0):
    """Create the main scatter plot, coloured by cluster or by source file.

    Args:
        reduced_embeddings: Projected points to plot.
        filtered_df: Message rows matching ``reduced_embeddings``.
        cluster_labels: When given, points are coloured by cluster; otherwise
            they are coloured by source file.
        selected_sources: Sources to draw when colouring by source; ``None``
            means every source present in ``filtered_df``.
        method: Dimensionality-reduction name used in the title and axis labels.
        clustering_method: Clustering name appended to the title unless "None".
        point_size: Base marker size.
        point_opacity: Marker opacity.
        density_based_sizing: Whether marker sizes depend on local density.
        size_variation: Size variation factor for density-based sizing.

    Returns:
        The configured figure.
    """
    hover_labels = create_hover_text(filtered_df)
    marker_sizes = calculate_point_sizes(reduced_embeddings, density_based_sizing,
                                         point_size, size_variation)

    # Pick the colouring strategy.
    if cluster_labels is None:
        sources = (
            filtered_df['source_file'].unique()
            if selected_sources is None
            else selected_sources
        )
        fig = create_source_colored_plot(reduced_embeddings, filtered_df, sources,
                                         hover_labels, marker_sizes, point_opacity)
    else:
        fig = create_clustered_plot(reduced_embeddings, filtered_df, cluster_labels,
                                    hover_labels, marker_sizes, point_opacity, method)

    clustering_note = "" if clustering_method == "None" else f" with {clustering_method}"
    fig.update_layout(
        title=f"Discord Chat Messages - {method} Visualization{clustering_note}",
        xaxis_title=f"{method} Component 1",
        yaxis_title=f"{method} Component 2",
        hovermode='closest',
        width=1000,
        height=700
    )

    return fig
def display_clustering_metrics(cluster_labels, silhouette_avg, calinski_harabasz, show_metrics=True):
    """Show cluster count and quality scores in three side-by-side metrics.

    Nothing is rendered when ``cluster_labels`` is None or ``show_metrics``
    is falsy.

    Args:
        cluster_labels: Per-point cluster ids; entries equal to -1 are
            excluded from the cluster count.
        silhouette_avg: Silhouette score, or None to display "N/A".
        calinski_harabasz: Calinski-Harabasz index, or None to display "N/A".
        show_metrics: Toggle for the whole display.
    """
    if cluster_labels is None or not show_metrics:
        return

    count_col, silhouette_col, ch_col = st.columns(3)

    with count_col:
        real_labels = cluster_labels[cluster_labels != -1]
        st.metric("Clusters Found", len(np.unique(real_labels)))

    with silhouette_col:
        silhouette_text = "N/A" if silhouette_avg is None else f"{silhouette_avg:.3f}"
        st.metric("Silhouette Score", silhouette_text)

    with ch_col:
        ch_text = "N/A" if calinski_harabasz is None else f"{calinski_harabasz:.1f}"
        st.metric("Calinski-Harabasz Index", ch_text)
def display_summary_stats(filtered_df, selected_sources):
    """Show message, author and source-file counts as three metrics.

    Args:
        filtered_df: Messages being visualised; its ``author_name`` column
            supplies the unique-author count.
        selected_sources: Collection whose length is shown as "Source Files".
    """
    # (label, value getter) per column; values are computed only once the
    # matching column is active.
    readings = (
        ("Total Messages", lambda: len(filtered_df)),
        ("Unique Authors", lambda: filtered_df['author_name'].nunique()),
        ("Source Files", lambda: len(selected_sources)),
    )

    for column, (label, compute) in zip(st.columns(3), readings):
        with column:
            st.metric(label, compute())
def display_clustering_results(filtered_df, cluster_labels, reduced_embeddings, method, clustering_method):
    """Show the cluster size distribution and offer the results as a CSV.

    Does nothing when ``cluster_labels`` is None.

    Args:
        filtered_df: Messages that were clustered; not modified.
        cluster_labels: Per-row cluster ids.
        reduced_embeddings: Array whose first two columns are exported as
            ``x_coordinate`` / ``y_coordinate``.
        method: Dimensionality-reduction name, used in the download file name.
        clustering_method: Clustering name, used in the download file name.
    """
    if cluster_labels is None:
        return

    st.subheader("📊 Clustering Results")

    # Work on a copy that carries the cluster id and the 2-D position.
    export_df = filtered_df.assign(
        cluster_id=cluster_labels,
        x_coordinate=reduced_embeddings[:, 0],
        y_coordinate=reduced_embeddings[:, 1],
    )

    # Points per cluster, ordered by cluster id.
    st.bar_chart(pd.Series(cluster_labels).value_counts().sort_index())

    st.download_button(
        label="📥 Download Clustering Results (CSV)",
        data=export_df.to_csv(index=False),
        file_name=f"chat_clusters_{method}_{clustering_method}.csv",
        mime="text/csv"
    )
def display_data_table(filtered_df, cluster_labels=None, preview_length=100):
    """Display the message table behind a "Show Data Table" checkbox.

    Args:
        filtered_df: DataFrame with ``timestamp_utc``, ``author_name``,
            ``source_file`` and ``content`` columns.
        cluster_labels: Optional per-row cluster ids, shown in an extra
            ``cluster`` column.
        preview_length: Maximum number of content characters shown per row.
            Longer content is cut and marked with a trailing ``'...'``.
    """
    if not st.checkbox("Show Data Table"):
        return

    st.subheader("📋 Message Data")
    display_df = filtered_df[['timestamp_utc', 'author_name', 'source_file', 'content']].copy()

    # Add clustering info if available
    if cluster_labels is not None:
        display_df['cluster'] = cluster_labels

    def _preview(value):
        # Missing cells are passed through untouched so they render as empty.
        if not isinstance(value, str):
            if pd.api.types.is_scalar(value) and pd.isna(value):
                return value
            value = str(value)
        # Only mark content as truncated when something was actually cut.
        if len(value) <= preview_length:
            return value
        return value[:preview_length] + '...'

    # Per-cell mapping avoids the `.str` accessor, which raises on columns
    # that are not string-typed (e.g. a column holding only NaN).
    display_df['content'] = display_df['content'].map(_preview)

    st.dataframe(display_df, use_container_width=True)