Compare commits
2 Commits
248cc5765f
...
4ca7e8ab61
| Author | SHA1 | Date | |
|---|---|---|---|
| 4ca7e8ab61 | |||
| 6d35b42b27 |
12
apps/cluster_map/cluster.py
Normal file
12
apps/cluster_map/cluster.py
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
"""
|
||||||
|
Discord Chat Embeddings Visualizer - Legacy Entry Point
|
||||||
|
|
||||||
|
This file serves as a compatibility layer for the original cluster.py.
|
||||||
|
The application has been refactored into modular components for better maintainability.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Pull the entry point from the refactored `main` module so that launching
# this legacy file behaves the same as launching `main.py`.
from main import main

# Only start the app when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||||
99
apps/cluster_map/clustering.py
Normal file
99
apps/cluster_map/clustering.py
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
"""
|
||||||
|
Clustering algorithms and evaluation metrics.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import streamlit as st
|
||||||
|
from sklearn.cluster import SpectralClustering, AgglomerativeClustering, OPTICS
|
||||||
|
from sklearn.mixture import GaussianMixture
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
from sklearn.metrics import silhouette_score, calinski_harabasz_score
|
||||||
|
import hdbscan
|
||||||
|
from config import DEFAULT_RANDOM_STATE
|
||||||
|
|
||||||
|
|
||||||
|
def apply_clustering(embeddings, clustering_method="None", n_clusters=5):
    """
    Cluster embeddings with the selected algorithm and score the result.

    Args:
        embeddings: High-dimensional embeddings to cluster
        clustering_method: Name of clustering algorithm; "None" disables clustering
        n_clusters: Number of clusters (for methods that require it)

    Returns:
        tuple: (cluster_labels, silhouette_score, calinski_harabasz_score).
        All three are None when clustering is skipped, the method name is
        not recognised, or the clustering itself fails. The two scores are
        None on their own when they cannot be computed for the labels.
    """
    if clustering_method == "None" or len(embeddings) <= n_clusters:
        return None, None, None

    # Standardize embeddings for better clustering
    scaled_embeddings = StandardScaler().fit_transform(embeddings)

    cluster_labels = None
    try:
        if clustering_method == "HDBSCAN":
            # Adaptive min cluster size: 5% of the data, never below 2
            min_cluster_size = max(2, len(embeddings) // 20)
            clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                                        min_samples=1, cluster_selection_epsilon=0.5)
            cluster_labels = clusterer.fit_predict(scaled_embeddings)

        elif clustering_method == "Spectral Clustering":
            clusterer = SpectralClustering(n_clusters=n_clusters, random_state=DEFAULT_RANDOM_STATE,
                                           affinity='rbf', gamma=1.0)
            cluster_labels = clusterer.fit_predict(scaled_embeddings)

        elif clustering_method == "Gaussian Mixture":
            clusterer = GaussianMixture(n_components=n_clusters, random_state=DEFAULT_RANDOM_STATE,
                                        covariance_type='full', max_iter=200)
            cluster_labels = clusterer.fit_predict(scaled_embeddings)

        elif clustering_method == "Agglomerative (Ward)":
            clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
            cluster_labels = clusterer.fit_predict(scaled_embeddings)

        elif clustering_method == "Agglomerative (Complete)":
            clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete')
            cluster_labels = clusterer.fit_predict(scaled_embeddings)

        elif clustering_method == "OPTICS":
            min_samples = max(2, len(embeddings) // 50)
            clusterer = OPTICS(min_samples=min_samples, xi=0.05, min_cluster_size=0.1)
            cluster_labels = clusterer.fit_predict(scaled_embeddings)

    except Exception as e:
        st.warning(f"Clustering failed: {str(e)}")
        return None, None, None

    if cluster_labels is None:
        return None, None, None

    # Score separately from fitting: a metric that cannot be computed must
    # not throw away labels that were produced successfully.
    silhouette_avg = None
    calinski_harabasz = None
    try:
        # Noise points (-1, produced by HDBSCAN/OPTICS) are excluded from scoring
        is_clustered = cluster_labels != -1
        valid_labels = cluster_labels[is_clustered]
        if len(np.unique(valid_labels)) > 1:
            valid_embeddings = scaled_embeddings[is_clustered]
            silhouette_avg = silhouette_score(valid_embeddings, valid_labels)
            calinski_harabasz = calinski_harabasz_score(valid_embeddings, valid_labels)
    except Exception as e:
        st.warning(f"Could not compute clustering metrics: {e}")
        # Report both scores as missing rather than a half-filled pair
        silhouette_avg = None
        calinski_harabasz = None

    return cluster_labels, silhouette_avg, calinski_harabasz
|
||||||
|
|
||||||
|
|
||||||
|
def get_cluster_statistics(cluster_labels):
    """Get basic statistics about clustering results.

    Args:
        cluster_labels: Sequence or array of integer cluster labels where
            -1 marks noise points, or None when no clustering was run.

    Returns:
        dict: Empty when ``cluster_labels`` is None. Otherwise contains
        ``n_clusters`` (distinct labels excluding -1), ``n_noise_points``
        (count of -1 labels), ``cluster_distribution`` (``np.bincount`` of
        the non-noise labels, or ``[]`` when there are no clusters) and
        ``unique_clusters`` (sorted distinct labels, noise included).
    """
    if cluster_labels is None:
        return {}

    # Coerce once so plain lists are compared element-wise like arrays
    labels = np.asarray(cluster_labels)
    is_noise = labels == -1

    unique_clusters = np.unique(labels)
    n_clusters = int(np.count_nonzero(unique_clusters != -1))  # Exclude noise cluster (-1)

    return {
        "n_clusters": n_clusters,
        "n_noise_points": int(np.count_nonzero(is_noise)),
        "cluster_distribution": np.bincount(labels[~is_noise]) if n_clusters > 0 else [],
        "unique_clusters": unique_clusters,
    }
|
||||||
73
apps/cluster_map/config.py
Normal file
73
apps/cluster_map/config.py
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
"""
|
||||||
|
Configuration settings and constants for the Discord Chat Embeddings Visualizer.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# --- Page-level settings ---------------------------------------------------
APP_TITLE = "Discord Chat Embeddings Visualizer"
APP_ICON = "🗨️"
APP_LAYOUT = "wide"

# --- Input data location ---------------------------------------------------
CHAT_LOGS_PATH = "../../discord_chat_logs"

# --- Algorithm defaults ----------------------------------------------------
DEFAULT_RANDOM_STATE = 42
DEFAULT_N_COMPONENTS = 2
DEFAULT_N_CLUSTERS = 5

# --- Plot appearance -------------------------------------------------------
DEFAULT_POINT_SIZE = 8
DEFAULT_POINT_OPACITY = 0.7
MAX_DISPLAYED_AUTHORS = 10
MESSAGE_CONTENT_PREVIEW_LENGTH = 200
MESSAGE_CONTENT_DISPLAY_LENGTH = 100

# --- Dataset size at which a performance warning applies -------------------
LARGE_DATASET_WARNING_THRESHOLD = 1000

# --- Colour palette (hex strings) ------------------------------------------
PRIMARY_COLORS = [
    "#1f77b4",
    "#ff7f0e",
    "#2ca02c",
    "#d62728",
    "#9467bd",
    "#8c564b",
    "#e377c2",
    "#7f7f7f",
    "#bcbd22",
    "#17becf",
]

# --- Clustering methods that take an explicit cluster count ----------------
CLUSTERING_METHODS_REQUIRING_N_CLUSTERS = [
    "Spectral Clustering",
    "Gaussian Mixture",
    "Agglomerative (Ward)",
    "Agglomerative (Complete)",
]

# --- Methods flagged as slow, grouped by pipeline stage --------------------
COMPUTATIONALLY_INTENSIVE_METHODS = {
    "dimension_reduction": ["t-SNE", "Spectral Embedding"],
    "clustering": ["Spectral Clustering", "OPTICS"],
}

# --- One-line explanations, keyed by category then by option name ----------
METHOD_EXPLANATIONS = {
    "dimension_reduction": {
        "PCA": "Linear, fast, preserves global variance",
        "t-SNE": "Non-linear, good for local structure, slower",
        "UMAP": "Balanced speed/quality, preserves local & global structure",
        "Spectral Embedding": "Uses graph theory, good for non-convex clusters",
        "Force-Directed": "Physics-based layout, creates natural spacing",
    },
    "clustering": {
        "HDBSCAN": "Density-based, finds variable density clusters, handles noise",
        "Spectral Clustering": "Uses eigenvalues, good for non-convex shapes",
        "Gaussian Mixture": "Probabilistic, assumes gaussian distributions",
        "Agglomerative (Ward)": "Hierarchical, minimizes within-cluster variance",
        "Agglomerative (Complete)": "Hierarchical, minimizes maximum distance",
        "OPTICS": "Density-based, finds clusters of varying densities",
    },
    "separation": {
        "Spread Factor": "Applies repulsive forces between nearby points",
        "Smart Jittering": "Adds intelligent noise to separate overlapping points",
        "Density-Based Jittering": "Stronger separation in crowded areas",
        "Perplexity Factor": "Controls t-SNE's focus on local vs global structure",
        "Min Distance Factor": "Controls UMAP's point packing tightness",
    },
    "metrics": {
        "Silhouette Score": "Higher is better (range: -1 to 1)",
        "Calinski-Harabasz": "Higher is better, measures cluster separation",
    },
}
|
||||||
86
apps/cluster_map/data_loader.py
Normal file
86
apps/cluster_map/data_loader.py
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
"""
|
||||||
|
Data loading and parsing utilities for Discord chat logs.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import streamlit as st
|
||||||
|
import ast
|
||||||
|
from pathlib import Path
|
||||||
|
from config import CHAT_LOGS_PATH
|
||||||
|
|
||||||
|
|
||||||
|
@st.cache_data
def load_all_chat_data():
    """Read every ``*.csv`` under CHAT_LOGS_PATH into one DataFrame.

    Each file's rows are tagged with a ``source_file`` column holding the
    file stem. Files that fail to load are reported and skipped.

    Returns:
        pd.DataFrame: All loaded rows concatenated with a fresh index, or
        an empty DataFrame when no file could be loaded.
    """
    logs_dir = Path(CHAT_LOGS_PATH)
    frames = []

    # NOTE(review): the original indentation was lost in this view; the
    # extent of the expander block is reconstructed - confirm against the repo.
    with st.expander("📁 Loading Details", expanded=False):
        # Show where we are looking, to help diagnose path problems
        st.write(f"Looking for CSV files in: {logs_dir}")
        st.write(f"Path exists: {logs_dir.exists()}")

        for csv_path in logs_dir.glob("*.csv"):
            try:
                frame = pd.read_csv(csv_path)
                frame['source_file'] = csv_path.stem  # remember which log each row came from
                frames.append(frame)
                st.write(f"✅ Loaded {len(frame)} messages from {csv_path.name}")
            except Exception as e:
                st.error(f"❌ Error loading {csv_path.name}: {e}")

        if frames:
            combined_df = pd.concat(frames, ignore_index=True)
            st.success(f"🎉 Successfully loaded {len(combined_df)} total messages from {len(frames)} files")
        else:
            st.error("No data loaded!")
            combined_df = pd.DataFrame()

    return combined_df
|
||||||
|
|
||||||
|
|
||||||
|
@st.cache_data
def parse_embeddings(df):
    """Parse the content_embedding column from string to numpy array.

    Args:
        df: DataFrame with a ``content_embedding`` column whose cells are
            string representations of Python lists.

    Returns:
        tuple: ``(embeddings_array, valid_df)`` where ``embeddings_array``
        stacks every successfully parsed, non-empty list and ``valid_df``
        is a copy of the matching rows of ``df`` in the same order.
    """
    embeddings = []
    valid_indices = []

    for idx, embedding_str in enumerate(df['content_embedding']):
        try:
            # Parse the string representation of the list
            embedding = ast.literal_eval(embedding_str)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # These are the documented failure modes of ast.literal_eval for
            # malformed input (including NaN cells); skip such rows.
            continue

        if isinstance(embedding, list) and len(embedding) > 0:
            embeddings.append(embedding)
            valid_indices.append(idx)

    embeddings_array = np.array(embeddings)
    # idx is positional (from enumerate), so select rows with iloc
    valid_df = df.iloc[valid_indices].copy()

    st.info(f"📊 Parsed {len(embeddings)} valid embeddings from {len(df)} messages")
    st.info(f"🔢 Embedding dimension: {embeddings_array.shape[1] if len(embeddings) > 0 else 0}")

    return embeddings_array, valid_df
|
||||||
|
|
||||||
|
|
||||||
|
def filter_data(df, selected_sources, selected_authors):
    """Keep only the rows whose source file and author are both selected.

    An empty ``selected_sources`` means every source file in ``df``; an
    empty ``selected_authors`` matches no rows.
    """
    sources = selected_sources if selected_sources else df['source_file'].unique()

    source_matches = df['source_file'].isin(sources)
    author_matches = df['author_name'].isin(selected_authors)

    return df[source_matches & author_matches]
|
||||||
|
|
||||||
|
|
||||||
|
def get_filtered_embeddings(embeddings, valid_df, filtered_df):
    """Get embeddings corresponding to filtered dataframe.

    Args:
        embeddings: Array with one row per row of ``valid_df``, in the
            same order.
        valid_df: DataFrame the embeddings were parsed from.
        filtered_df: Subset of ``valid_df`` (sharing its index labels).

    Returns:
        The rows of ``embeddings`` whose ``valid_df`` index label also
        appears in ``filtered_df``, in ``valid_df`` order.
    """
    # Index.isin hashes the labels once, instead of scanning a Python list
    # for every row of valid_df.
    keep_mask = valid_df.index.isin(filtered_df.index)
    return embeddings[keep_mask]
|
||||||
211
apps/cluster_map/dimensionality_reduction.py
Normal file
211
apps/cluster_map/dimensionality_reduction.py
Normal file
@@ -0,0 +1,211 @@
|
|||||||
|
"""
|
||||||
|
Dimensionality reduction algorithms and point separation techniques.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import streamlit as st
|
||||||
|
from sklearn.decomposition import PCA
|
||||||
|
from sklearn.manifold import TSNE, SpectralEmbedding
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
from sklearn.neighbors import NearestNeighbors
|
||||||
|
from scipy.spatial.distance import pdist, squareform
|
||||||
|
from scipy.optimize import minimize
|
||||||
|
import umap
|
||||||
|
from config import DEFAULT_RANDOM_STATE
|
||||||
|
|
||||||
|
|
||||||
|
def apply_adaptive_spreading(embeddings, spread_factor=1.0):
    """
    Apply adaptive spreading to push apart nearby points while preserving global structure.
    Uses a force-based approach where closer points repel more strongly.

    Args:
        embeddings: 2-D array-like of point coordinates, one row per point.
        spread_factor: Strength of the repulsion. Values <= 0 disable
            spreading and return ``embeddings`` itself, unchanged.

    Returns:
        A new array of spread coordinates. Inputs with fewer than 2 or
        more than 1000 points are returned as an unmodified copy.
    """
    if spread_factor <= 0:
        return embeddings

    # Work on a copy so the caller's array is never modified
    embeddings = np.array(embeddings)
    n_points = len(embeddings)

    print(f"DEBUG: Applying adaptive spreading to {n_points} points with factor {spread_factor}")

    if n_points < 2:
        return embeddings

    # For very large datasets, skip spreading to avoid hanging
    if n_points > 1000:
        print(f"DEBUG: Large dataset ({n_points} points), skipping adaptive spreading...")
        return embeddings

    # The in-place float update below cannot be cast back into an integer array
    if not np.issubdtype(embeddings.dtype, np.floating):
        embeddings = embeddings.astype(float)

    # Apply force-based spreading with fewer iterations for large datasets
    max_iterations = 3 if n_points > 500 else 5

    for iteration in range(max_iterations):
        if iteration % 2 == 0:  # Progress indicator
            print(f"DEBUG: Spreading iteration {iteration + 1}/{max_iterations}")

        # All pairwise displacement vectors at once: diff[i, j] = p_i - p_j.
        # Memory is n_points^2 * n_dims, bounded by the 1000-point cap above.
        diff = embeddings[:, np.newaxis, :] - embeddings[np.newaxis, :, :]
        dist = np.linalg.norm(diff, axis=-1)

        # Coincident points (and each point with itself) exert no force
        separated = dist > 0
        safe_dist = np.where(separated, dist, 1.0)

        # Repulsion of magnitude spread_factor / (dist^2 + 0.01) along the
        # unit vector diff / dist, folded into one per-pair scale factor.
        scale = np.where(separated, spread_factor / ((dist ** 2 + 0.01) * safe_dist), 0.0)
        forces = (diff * scale[:, :, np.newaxis]).sum(axis=1)

        # Apply forces with damping
        embeddings += forces * 0.1

    print("DEBUG: Adaptive spreading complete")
    return embeddings
|
||||||
|
|
||||||
|
|
||||||
|
def force_directed_layout(high_dim_embeddings, n_components=2, spread_factor=1.0):
    """
    Lay out high-dimensional embeddings as a PCA projection followed by
    adaptive spreading.

    Both dataset-size paths run the same PCA + spreading pipeline; they
    differ only in the debug messages they print.
    """
    n_points = len(high_dim_embeddings)
    print(f"DEBUG: Starting force-directed layout with {n_points} points...")

    large_dataset = n_points > 500
    if large_dataset:
        print(f"DEBUG: Large dataset ({n_points} points), using PCA + spreading instead...")

    # PCA gives the starting positions for the spreading step
    projector = PCA(n_components=n_components, random_state=DEFAULT_RANDOM_STATE)
    layout = projector.fit_transform(high_dim_embeddings)

    if large_dataset:
        return apply_adaptive_spreading(layout, spread_factor)

    print("DEBUG: Initial PCA layout computed...")

    # No separate optimisation is run; spreading is applied directly to
    # the PCA result.
    spread_layout = apply_adaptive_spreading(layout, spread_factor)
    print("DEBUG: Force-directed layout complete...")
    return spread_layout
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_local_density_scaling(embeddings, k=5):
    """
    Calculate local density scaling factors to emphasize differences in dense regions.

    Args:
        embeddings: 2-D array of point coordinates, one row per point.
        k: Number of nearest neighbours (excluding the point itself) used
            to estimate density.

    Returns:
        1-D array with one value per point, min-max normalised to roughly
        [0, 1] (higher means denser). When there are not enough points to
        find ``k`` neighbours each, an array of ones is returned.
    """
    n_points = len(embeddings)

    # The query below asks for k + 1 neighbours (the point itself plus k
    # others), so at least k + 1 points are required.
    if n_points <= k:
        return np.ones(n_points)

    # Find k nearest neighbors for each point
    nn = NearestNeighbors(n_neighbors=k + 1)  # +1 because first neighbor is the point itself
    nn.fit(embeddings)
    distances, indices = nn.kneighbors(embeddings)

    # Calculate local density (inverse of average distance to k nearest neighbors);
    # the small epsilon avoids division by zero for duplicated points
    local_densities = 1.0 / (np.mean(distances[:, 1:], axis=1) + 1e-6)

    # Normalize densities
    lowest = np.min(local_densities)
    highest = np.max(local_densities)
    local_densities = (local_densities - lowest) / (highest - lowest + 1e-6)

    return local_densities
|
||||||
|
|
||||||
|
|
||||||
|
def apply_density_based_jittering(embeddings, density_scaling=True, jitter_strength=0.1):
    """
    Add Gaussian noise to the points to separate overlapping ones.

    With ``density_scaling`` the noise standard deviation of each point is
    ``jitter_strength * (1 + density)``, so denser regions receive more
    jitter; without it every coordinate gets noise of standard deviation
    ``jitter_strength``. The input array is not modified.
    """
    if density_scaling:
        densities = calculate_local_density_scaling(embeddings)
        jittered = embeddings.copy()

        # Each row is a view into `jittered`, so the in-place add updates it
        for point, density in zip(jittered, densities):
            sigma = jitter_strength * (1 + density)
            point += np.random.normal(0, sigma, embeddings.shape[1])

        return jittered

    # Uniform jitter: same noise scale everywhere
    return embeddings + np.random.normal(0, jitter_strength, embeddings.shape)
|
||||||
|
|
||||||
|
|
||||||
|
def reduce_dimensions(embeddings, method="PCA", n_components=2, spread_factor=1.0,
                      perplexity_factor=1.0, min_dist_factor=1.0):
    """Apply dimensionality reduction with enhanced separation.

    Args:
        embeddings: Array-like of high-dimensional vectors, one row per item.
        method: "PCA", "t-SNE", "UMAP", "Spectral Embedding" or
            "Force-Directed"; any other value falls back to PCA.
        n_components: Number of output dimensions.
        spread_factor: Separation strength. Scales adaptive spreading
            (PCA, Spectral Embedding, Force-Directed), t-SNE early
            exaggeration and UMAP spread.
        perplexity_factor: Multiplier for the t-SNE perplexity.
        min_dist_factor: Multiplier for the UMAP ``min_dist``.

    Returns:
        Array with one row per input row and ``n_components`` columns.
    """
    # Convert to numpy array if it's not already
    embeddings = np.array(embeddings)
    n_samples = len(embeddings)

    print(f"DEBUG: Starting {method} with {n_samples} embeddings, shape: {embeddings.shape}")

    # Standardize embeddings for better processing
    scaled_embeddings = StandardScaler().fit_transform(embeddings)
    print("DEBUG: Embeddings standardized")

    # Apply the selected dimensionality reduction method
    if method == "PCA":
        print("DEBUG: Applying PCA...")
        reducer = PCA(n_components=n_components, random_state=DEFAULT_RANDOM_STATE)
        reduced_embeddings = reducer.fit_transform(scaled_embeddings)
        # Apply spreading to PCA results
        print("DEBUG: Applying spreading...")
        reduced_embeddings = apply_adaptive_spreading(reduced_embeddings, spread_factor)

    elif method == "t-SNE":
        # Adjust perplexity based on user preference and data size
        base_perplexity = min(30, n_samples - 1)
        adjusted_perplexity = max(5, min(50, int(base_perplexity * perplexity_factor)))
        # scikit-learn requires perplexity < n_samples; the floor of 5 above
        # would violate that on very small datasets.
        adjusted_perplexity = max(1, min(adjusted_perplexity, n_samples - 1))
        print(f"DEBUG: Applying t-SNE with perplexity {adjusted_perplexity}...")

        # The iteration count is left at the library default (1000): the
        # keyword is `n_iter` in older and `max_iter` in newer scikit-learn.
        # Early exaggeration must be >= 1, so small spread factors are clamped.
        reducer = TSNE(n_components=n_components, random_state=DEFAULT_RANDOM_STATE,
                       perplexity=adjusted_perplexity,
                       early_exaggeration=max(1.0, 12.0 * spread_factor),
                       learning_rate='auto')
        reduced_embeddings = reducer.fit_transform(scaled_embeddings)

    elif method == "UMAP":
        # Adjust UMAP parameters for better local separation
        n_neighbors = min(15, n_samples - 1)
        spread = 1.0 * spread_factor
        # UMAP rejects min_dist > spread, so cap it at the chosen spread
        min_dist = min(0.1 * min_dist_factor, spread)
        print(f"DEBUG: Applying UMAP with n_neighbors={n_neighbors}, min_dist={min_dist}...")

        reducer = umap.UMAP(n_components=n_components, random_state=DEFAULT_RANDOM_STATE,
                            n_neighbors=n_neighbors, min_dist=min_dist,
                            spread=spread, local_connectivity=2.0)
        reduced_embeddings = reducer.fit_transform(scaled_embeddings)

    elif method == "Spectral Embedding":
        n_neighbors = min(10, n_samples - 1)
        print(f"DEBUG: Applying Spectral Embedding with n_neighbors={n_neighbors}...")
        reducer = SpectralEmbedding(n_components=n_components, random_state=DEFAULT_RANDOM_STATE,
                                    n_neighbors=n_neighbors)
        reduced_embeddings = reducer.fit_transform(scaled_embeddings)
        # Apply spreading to spectral results
        print("DEBUG: Applying spreading...")
        reduced_embeddings = apply_adaptive_spreading(reduced_embeddings, spread_factor)

    elif method == "Force-Directed":
        # Use force-directed layout for natural spreading
        print("DEBUG: Applying Force-Directed layout...")
        reduced_embeddings = force_directed_layout(scaled_embeddings, n_components, spread_factor)

    else:
        # Fallback to PCA
        print(f"DEBUG: Unknown method {method}, falling back to PCA...")
        reducer = PCA(n_components=n_components, random_state=DEFAULT_RANDOM_STATE)
        reduced_embeddings = reducer.fit_transform(scaled_embeddings)
        reduced_embeddings = apply_adaptive_spreading(reduced_embeddings, spread_factor)

    print(f"DEBUG: Dimensionality reduction complete. Output shape: {reduced_embeddings.shape}")
    return reduced_embeddings
|
||||||
132
apps/cluster_map/main.py
Normal file
132
apps/cluster_map/main.py
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
"""
|
||||||
|
Main application logic for the Discord Chat Embeddings Visualizer.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import streamlit as st
|
||||||
|
import warnings
|
||||||
|
warnings.filterwarnings('ignore')
|
||||||
|
|
||||||
|
# Import custom modules
|
||||||
|
from ui_components import (
|
||||||
|
setup_page_config, display_title_and_description, get_all_ui_parameters,
|
||||||
|
display_performance_warnings
|
||||||
|
)
|
||||||
|
from data_loader import (
|
||||||
|
load_all_chat_data, parse_embeddings, filter_data, get_filtered_embeddings
|
||||||
|
)
|
||||||
|
from dimensionality_reduction import (
|
||||||
|
reduce_dimensions, apply_density_based_jittering
|
||||||
|
)
|
||||||
|
from clustering import apply_clustering
|
||||||
|
from visualization import (
|
||||||
|
create_visualization_plot, display_clustering_metrics, display_summary_stats,
|
||||||
|
display_clustering_results, display_data_table
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the Streamlit app: load, filter, reduce, cluster and display."""
    # Page chrome
    setup_page_config()
    display_title_and_description()

    # Stage 1: load the raw chat logs
    with st.spinner("Loading chat data..."):
        df = load_all_chat_data()

    if df.empty:
        st.error("No data could be loaded. Please check the data directory.")
        st.stop()

    # Stage 2: turn the stored embedding strings into an array
    with st.spinner("Parsing embeddings..."):
        embeddings, valid_df = parse_embeddings(df)

    if len(embeddings) == 0:
        st.error("No valid embeddings found!")
        st.stop()

    # Stage 3: read the sidebar controls and apply the source/author filters
    params = get_all_ui_parameters(valid_df)
    selected_sources = params['selected_sources']

    filtered_df = filter_data(valid_df, selected_sources, params['selected_authors'])

    if filtered_df.empty:
        st.warning("No data matches the current filters!")
        st.stop()

    method = params['method']
    clustering_method = params['clustering_method']

    display_performance_warnings(filtered_df, method, clustering_method)

    # Embedding rows that belong to the filtered messages
    filtered_embeddings = get_filtered_embeddings(embeddings, valid_df, filtered_df)

    st.info(f"📈 Visualizing {len(filtered_df)} messages")

    # Stage 4: project to the plotting space
    with st.spinner(f"Reducing dimensions using {method}..."):
        reduced_embeddings = reduce_dimensions(
            filtered_embeddings,
            method=method,
            spread_factor=params['spread_factor'],
            perplexity_factor=params['perplexity_factor'],
            min_dist_factor=params['min_dist_factor'],
        )

    # Stage 5: cluster on the original (unreduced) embeddings
    with st.spinner(f"Applying {clustering_method}..."):
        cluster_labels, silhouette_avg, calinski_harabasz = apply_clustering(
            filtered_embeddings,
            clustering_method=clustering_method,
            n_clusters=params['n_clusters'],
        )

    # Optional jitter is applied to the plotted coordinates only
    if params['apply_jittering']:
        with st.spinner("Applying smart jittering to separate overlapping points..."):
            reduced_embeddings = apply_density_based_jittering(
                reduced_embeddings,
                density_scaling=params['density_based_jitter'],
                jitter_strength=params['jitter_strength'],
            )

    # Stage 6: render metrics, plot, summaries and the data table
    display_clustering_metrics(
        cluster_labels,
        silhouette_avg,
        calinski_harabasz,
        params['show_cluster_metrics'],
    )

    fig = create_visualization_plot(
        reduced_embeddings=reduced_embeddings,
        filtered_df=filtered_df,
        cluster_labels=cluster_labels,
        # An empty selection is passed on as None
        selected_sources=selected_sources or None,
        method=method,
        clustering_method=clustering_method,
        point_size=params['point_size'],
        point_opacity=params['point_opacity'],
        density_based_sizing=params['density_based_sizing'],
        size_variation=params['size_variation'],
    )
    st.plotly_chart(fig, use_container_width=True)

    # With no explicit selection, summarise every source present in the data
    display_summary_stats(filtered_df, selected_sources or filtered_df['source_file'].unique())

    display_clustering_results(
        filtered_df,
        cluster_labels,
        reduced_embeddings,
        method,
        clustering_method,
    )

    display_data_table(filtered_df, cluster_labels)


if __name__ == "__main__":
    main()
|
||||||
@@ -3,3 +3,6 @@ pandas>=1.5.0
|
|||||||
numpy>=1.24.0
|
numpy>=1.24.0
|
||||||
plotly>=5.15.0
|
plotly>=5.15.0
|
||||||
scikit-learn>=1.3.0
|
scikit-learn>=1.3.0
|
||||||
|
umap-learn>=0.5.3
|
||||||
|
hdbscan>=0.8.29
|
||||||
|
scipy>=1.10.0
|
||||||
|
|||||||
@@ -1,233 +0,0 @@
|
|||||||
import streamlit as st
|
|
||||||
import pandas as pd
|
|
||||||
import numpy as np
|
|
||||||
import plotly.express as px
|
|
||||||
import plotly.graph_objects as go
|
|
||||||
from sklearn.decomposition import PCA
|
|
||||||
from sklearn.manifold import TSNE
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
import ast
|
|
||||||
|
|
||||||
# Configure the browser tab (title, icon) and use the full-width layout
st.set_page_config(page_title="Discord Chat Embeddings Visualizer", page_icon="🗨️", layout="wide")

# Page heading followed by a one-line description of the app
st.title("🗨️ Discord Chat Embeddings Visualizer")
st.markdown("Explore Discord chat messages through their vector embeddings in 2D space")
|
|
||||||
|
|
||||||
@st.cache_data
def load_all_chat_data():
    """Read every ``*.csv`` in the discord_chat_logs folder into one DataFrame.

    Rows are tagged with a ``source_file`` column (the file stem). Files
    that fail to load are reported and skipped. Returns an empty DataFrame
    when nothing could be loaded.
    """
    logs_dir = Path("../../discord_chat_logs")

    # Show where we are looking, to help diagnose path problems
    st.write(f"Looking for CSV files in: {logs_dir}")
    st.write(f"Path exists: {logs_dir.exists()}")

    frames = []
    for csv_path in logs_dir.glob("*.csv"):
        try:
            frame = pd.read_csv(csv_path)
            frame['source_file'] = csv_path.stem  # remember which log each row came from
            frames.append(frame)
            st.write(f"✅ Loaded {len(frame)} messages from {csv_path.name}")
        except Exception as e:
            st.error(f"❌ Error loading {csv_path.name}: {e}")

    if not frames:
        st.error("No data loaded!")
        return pd.DataFrame()

    merged = pd.concat(frames, ignore_index=True)
    st.success(f"🎉 Successfully loaded {len(merged)} total messages from {len(frames)} files")
    return merged
|
|
||||||
|
|
||||||
@st.cache_data
def parse_embeddings(df):
    """Parse the content_embedding column from string to numpy array.

    Args:
        df: DataFrame with a ``content_embedding`` column whose cells are
            string representations of Python lists.

    Returns:
        tuple: ``(embeddings_array, valid_df)`` - the stacked non-empty
        lists that parsed successfully, and a copy of the matching rows of
        ``df`` in the same order.
    """
    embeddings = []
    valid_indices = []

    for idx, embedding_str in enumerate(df['content_embedding']):
        try:
            # Parse the string representation of the list
            embedding = ast.literal_eval(embedding_str)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Documented failure modes of ast.literal_eval for malformed
            # input (including NaN cells); such rows are skipped.
            continue

        if isinstance(embedding, list) and len(embedding) > 0:
            embeddings.append(embedding)
            valid_indices.append(idx)

    embeddings_array = np.array(embeddings)
    # idx is positional (from enumerate), so select rows with iloc
    valid_df = df.iloc[valid_indices].copy()

    st.info(f"📊 Parsed {len(embeddings)} valid embeddings from {len(df)} messages")
    st.info(f"🔢 Embedding dimension: {embeddings_array.shape[1] if len(embeddings) > 0 else 0}")

    return embeddings_array, valid_df
|
|
||||||
|
|
||||||
@st.cache_data
def reduce_dimensions(embeddings, method="PCA", n_components=2):
    """Reduce embeddings to ``n_components`` dimensions using PCA or t-SNE.

    Args:
        embeddings: Array of high-dimensional vectors, one row per item.
        method: "PCA" or "t-SNE".
        n_components: Number of output dimensions.

    Returns:
        Array with one row per input row and ``n_components`` columns.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method == "PCA":
        reducer = PCA(n_components=n_components, random_state=42)
    elif method == "t-SNE":
        # Perplexity is capped below the sample count for small inputs
        reducer = TSNE(n_components=n_components, random_state=42, perplexity=min(30, len(embeddings)-1))
    else:
        # Fail with a clear message instead of an unbound-variable error below
        raise ValueError(f"Unsupported dimension reduction method: {method!r}")

    reduced_embeddings = reducer.fit_transform(embeddings)
    return reduced_embeddings
|
|
||||||
|
|
||||||
def create_hover_text(df):
    """Build one HTML hover label per row of ``df`` for plotly.

    Each label lists the author, timestamp, source file and the first
    200 characters of the message content.
    """
    def _content_preview(content):
        # Missing content (NaN/None) gets a placeholder instead of "nan"
        if pd.isna(content) or content is None:
            return "[No content]"
        as_text = str(content)
        ellipsis = '...' if len(as_text) > 200 else ''
        return as_text[:200] + ellipsis

    return [
        f"<b>Author:</b> {row['author_name']}<br>"
        f"<b>Timestamp:</b> {row['timestamp_utc']}<br>"
        f"<b>Source:</b> {row['source_file']}<br>"
        f"<b>Content:</b> {_content_preview(row['content'])}"
        for _, row in df.iterrows()
    ]
|
|
||||||
|
|
||||||
def main():
    """Run the Streamlit app: load chats, filter, reduce to 2D and plot.

    Steps: load the chat data, parse embeddings, read the sidebar controls
    (reduction method, source-file filter, author filter), reduce the
    filtered embeddings, draw one scatter trace per source file, then show
    summary metrics and an optional data table. Calls ``st.stop()`` when
    there is nothing to display.
    """
    # Load data
    with st.spinner("Loading chat data..."):
        df = load_all_chat_data()

    if df.empty:
        st.stop()

    # Parse embeddings
    with st.spinner("Parsing embeddings..."):
        embeddings, valid_df = parse_embeddings(df)

    if len(embeddings) == 0:
        st.error("No valid embeddings found!")
        st.stop()

    # Sidebar controls
    st.sidebar.header("🎛️ Visualization Controls")

    # Dimension reduction method
    method = st.sidebar.selectbox(
        "Dimension Reduction Method",
        ["PCA", "t-SNE"],
        help="PCA is faster, t-SNE may reveal better clusters"
    )

    # Source file filter
    source_files = valid_df['source_file'].unique()
    selected_sources = st.sidebar.multiselect(
        "Filter by Source Files",
        source_files,
        default=source_files,
        help="Select which chat log files to include"
    )

    # Author filter
    authors = valid_df['author_name'].unique()
    selected_authors = st.sidebar.multiselect(
        "Filter by Authors",
        authors,
        default=authors[:10] if len(authors) > 10 else authors,  # Limit to first 10 for performance
        help="Select which authors to include"
    )

    # Filter data. The mask is positional: row i of valid_df corresponds to
    # row i of `embeddings`, so the same mask selects both. This replaces a
    # per-row `idx in list` lookup on index labels, which was quadratic and
    # would pick the wrong rows if index labels were ever repeated.
    keep = (
        valid_df['source_file'].isin(selected_sources)
        & valid_df['author_name'].isin(selected_authors)
    ).to_numpy()
    filtered_df = valid_df[keep]

    if filtered_df.empty:
        st.warning("No data matches the current filters!")
        st.stop()

    # Get corresponding embeddings
    filtered_embeddings = embeddings[keep]

    st.info(f"📈 Visualizing {len(filtered_df)} messages")

    # Reduce dimensions
    with st.spinner(f"Reducing dimensions using {method}..."):
        reduced_embeddings = reduce_dimensions(filtered_embeddings, method)

    # Create hover text
    hover_text = create_hover_text(filtered_df)

    # Create the plot
    fig = go.Figure()

    # Color by source file
    colors = px.colors.qualitative.Set1
    for i, source in enumerate(selected_sources):
        source_mask = (filtered_df['source_file'] == source).to_numpy()
        if source_mask.any():
            source_embeddings = reduced_embeddings[source_mask]
            source_hover = [text for text, selected in zip(hover_text, source_mask) if selected]

            fig.add_trace(go.Scatter(
                x=source_embeddings[:, 0],
                y=source_embeddings[:, 1],
                mode='markers',
                name=source,
                marker=dict(
                    size=8,
                    color=colors[i % len(colors)],  # cycle when sources outnumber colors
                    opacity=0.7,
                    line=dict(width=1, color='white')
                ),
                hovertemplate='%{hovertext}<extra></extra>',
                hovertext=source_hover
            ))

    fig.update_layout(
        title=f"Discord Chat Messages - {method} Visualization",
        xaxis_title=f"{method} Component 1",
        yaxis_title=f"{method} Component 2",
        hovermode='closest',
        width=1000,
        height=700
    )

    # Display the plot
    st.plotly_chart(fig, use_container_width=True)

    # Statistics
    col1, col2, col3 = st.columns(3)

    with col1:
        st.metric("Total Messages", len(filtered_df))

    with col2:
        st.metric("Unique Authors", filtered_df['author_name'].nunique())

    with col3:
        st.metric("Source Files", len(selected_sources))

    # Show data table
    if st.checkbox("Show Data Table"):
        st.subheader("📋 Message Data")
        display_df = filtered_df[['timestamp_utc', 'author_name', 'source_file', 'content']].copy()
        # Truncate for display: only add the ellipsis when text was actually
        # cut, and tolerate missing / non-string content (shown as empty).
        as_text = display_df['content'].fillna('').astype(str)
        display_df['content'] = as_text.where(as_text.str.len() <= 100, as_text.str[:100] + '...')
        st.dataframe(display_df, use_container_width=True)


if __name__ == "__main__":
    main()
|
|
||||||
43
apps/cluster_map/test_debug.py
Normal file
43
apps/cluster_map/test_debug.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test script to debug the hanging issue in the modular app
|
||||||
|
"""
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Add the current directory to Python path
|
||||||
|
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
|
||||||
|
def test_dimensionality_reduction():
    """Smoke-test ``reduce_dimensions`` with PCA and UMAP on random data.

    Each method is tried independently; success or failure is reported on
    stdout and exceptions from one method do not prevent the next from running.
    """
    print("Testing dimensionality reduction functions...")

    # Imported lazily so the banner above prints even if the import hangs or fails.
    from dimensionality_reduction import reduce_dimensions

    # Synthetic stand-in for the real data: 796 rows matches the user's
    # dataset, 384 is a common embedding width.
    rows, cols = 796, 384

    print(f"Creating test embeddings: {rows} x {cols}")
    sample = np.random.randn(rows, cols)

    # PCA first (should be fast)
    print("Testing PCA...")
    try:
        projected = reduce_dimensions(sample, method="PCA")
        print(f"✓ PCA successful, output shape: {projected.shape}")
    except Exception as err:
        print(f"✗ PCA failed: {err}")

    # UMAP second (might be slower)
    print("Testing UMAP...")
    try:
        projected = reduce_dimensions(sample, method="UMAP")
        print(f"✓ UMAP successful, output shape: {projected.shape}")
    except Exception as err:
        print(f"✗ UMAP failed: {err}")


if __name__ == "__main__":
    test_dimensionality_reduction()
|
||||||
236
apps/cluster_map/ui_components.py
Normal file
236
apps/cluster_map/ui_components.py
Normal file
@@ -0,0 +1,236 @@
|
|||||||
|
"""
|
||||||
|
Streamlit UI components and controls for the Discord Chat Embeddings Visualizer.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import streamlit as st
|
||||||
|
import numpy as np
|
||||||
|
from config import (
|
||||||
|
APP_TITLE, APP_ICON, APP_LAYOUT, METHOD_EXPLANATIONS,
|
||||||
|
CLUSTERING_METHODS_REQUIRING_N_CLUSTERS, COMPUTATIONALLY_INTENSIVE_METHODS,
|
||||||
|
LARGE_DATASET_WARNING_THRESHOLD, MAX_DISPLAYED_AUTHORS
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def setup_page_config():
    """Apply the page title, icon and layout from ``config`` to the Streamlit page."""
    page_settings = {
        "page_title": APP_TITLE,
        "page_icon": APP_ICON,
        "layout": APP_LAYOUT,
    }
    st.set_page_config(**page_settings)
|
||||||
|
|
||||||
|
|
||||||
|
def display_title_and_description():
    """Render the app heading (icon + title) followed by a one-line description."""
    heading = f"{APP_ICON} {APP_TITLE}"
    st.title(heading)
    st.markdown("Explore Discord chat messages through their vector embeddings in 2D space")
|
||||||
|
|
||||||
|
|
||||||
|
def create_method_controls():
    """Render the sidebar selectors for the reduction and clustering methods.

    Returns:
        tuple: ``(method, clustering_method)`` as chosen in the two select boxes.
    """
    sidebar = st.sidebar
    sidebar.header("🎛️ Visualization Controls")

    # Dimension reduction method
    reduction_options = ["PCA", "t-SNE", "UMAP", "Spectral Embedding", "Force-Directed"]
    method = sidebar.selectbox(
        "Dimension Reduction Method",
        reduction_options,
        help="PCA is fastest, UMAP balances speed and quality, t-SNE and Spectral are slower but may reveal better structures. Force-Directed creates natural spacing."
    )

    # Clustering method
    clustering_options = [
        "None", "HDBSCAN", "Spectral Clustering", "Gaussian Mixture",
        "Agglomerative (Ward)", "Agglomerative (Complete)", "OPTICS",
    ]
    clustering_method = sidebar.selectbox(
        "Clustering Method",
        clustering_options,
        help="Apply clustering to identify groups. HDBSCAN and OPTICS can find variable density clusters."
    )

    return method, clustering_method
|
||||||
|
|
||||||
|
|
||||||
|
def create_clustering_controls(clustering_method):
    """Return the requested number of clusters.

    A slider (2-15, default 5) is shown only for methods listed in
    ``CLUSTERING_METHODS_REQUIRING_N_CLUSTERS``; otherwise the default of 5
    is returned without rendering a control.
    """
    if clustering_method not in CLUSTERING_METHODS_REQUIRING_N_CLUSTERS:
        return 5

    return st.sidebar.slider("Number of Clusters", 2, 15, 5)
|
||||||
|
|
||||||
|
|
||||||
|
def create_separation_controls(method):
    """Render point-separation sliders, including the method-specific one.

    Args:
        method: Selected dimension reduction method; ``"t-SNE"`` adds a
            perplexity slider and ``"UMAP"`` adds a min-distance slider.

    Returns:
        tuple: ``(spread_factor, perplexity_factor, min_dist_factor)``; the
        two method-specific factors stay at 1.0 when their slider is hidden.
    """
    sidebar = st.sidebar
    sidebar.subheader("🎯 Point Separation Controls")

    spread_factor = sidebar.slider(
        "Spread Factor",
        0.5, 3.0, 1.0, 0.1,
        help="Increase to spread apart nearby points. Higher values create more separation."
    )

    # Neutral defaults for whichever method-specific slider is not shown.
    perplexity_factor, min_dist_factor = 1.0, 1.0

    if method == "t-SNE":
        perplexity_factor = sidebar.slider(
            "Perplexity Factor",
            0.5, 2.0, 1.0, 0.1,
            help="Affects local vs global structure balance. Lower values focus on local details."
        )
    elif method == "UMAP":
        min_dist_factor = sidebar.slider(
            "Min Distance Factor",
            0.1, 2.0, 1.0, 0.1,
            help="Controls how tightly points are packed. Lower values create tighter clusters."
        )

    return spread_factor, perplexity_factor, min_dist_factor
|
||||||
|
|
||||||
|
|
||||||
|
def create_jittering_controls():
    """Render the jittering toggle and, when enabled, its tuning widgets.

    Returns:
        tuple: ``(apply_jittering, jitter_strength, density_based_jitter)``.
        With jittering off, the strength is 0.1 and density-based is True.
    """
    sidebar = st.sidebar
    apply_jittering = sidebar.checkbox(
        "Apply Smart Jittering",
        value=False,
        help="Add intelligent noise to separate overlapping points"
    )

    if not apply_jittering:
        # Tuning widgets stay hidden; hand back their default values.
        return apply_jittering, 0.1, True

    jitter_strength = sidebar.slider(
        "Jitter Strength",
        0.01, 0.5, 0.1, 0.01,
        help="Strength of jittering. Higher values spread points more."
    )
    density_based_jitter = sidebar.checkbox(
        "Density-Based Jittering",
        value=True,
        help="Apply stronger jittering in dense regions"
    )

    return apply_jittering, jitter_strength, density_based_jitter
|
||||||
|
|
||||||
|
|
||||||
|
def create_advanced_options():
    """Render the "Advanced Options" sidebar expander.

    Returns:
        tuple: ``(show_cluster_metrics, point_size, point_opacity,
        density_based_sizing, size_variation)``; ``size_variation`` is 2.0
        unless density-based sizing is enabled and its slider is moved.
    """
    advanced_panel = st.sidebar.expander("⚙️ Advanced Options")
    with advanced_panel:
        show_cluster_metrics = st.checkbox("Show Clustering Metrics", value=True)
        point_size = st.slider("Point Size", 4, 15, 8)
        point_opacity = st.slider("Point Opacity", 0.3, 1.0, 0.7)

        # Density-based visualization
        density_based_sizing = st.checkbox(
            "Density-Based Point Sizing",
            value=False,
            help="Make points larger in sparse regions, smaller in dense regions"
        )

        if density_based_sizing:
            size_variation = st.slider(
                "Size Variation Factor",
                1.5, 4.0, 2.0, 0.1,
                help="How much point sizes vary based on local density"
            )
        else:
            size_variation = 2.0

    return show_cluster_metrics, point_size, point_opacity, density_based_sizing, size_variation
|
||||||
|
|
||||||
|
|
||||||
|
def create_filter_controls(valid_df):
    """Render the source-file and author multiselect filters.

    Args:
        valid_df: DataFrame with ``source_file`` and ``author_name`` columns.

    Returns:
        tuple: ``(selected_sources, selected_authors)``. No source file is
        preselected; authors default to at most ``MAX_DISPLAYED_AUTHORS``.
    """
    sidebar = st.sidebar

    # Source file filter
    selected_sources = sidebar.multiselect(
        "Filter by Source Files",
        valid_df['source_file'].unique(),
        default=[],
        help="Select which chat log files to include"
    )

    # Author filter
    authors = valid_df['author_name'].unique()
    if len(authors) > MAX_DISPLAYED_AUTHORS:
        default_authors = authors[:MAX_DISPLAYED_AUTHORS]
    else:
        default_authors = authors
    selected_authors = sidebar.multiselect(
        "Filter by Authors",
        authors,
        default=default_authors,
        help="Select which authors to include"
    )

    return selected_sources, selected_authors
|
||||||
|
|
||||||
|
|
||||||
|
def display_method_explanations():
    """Show the ``METHOD_EXPLANATIONS`` glossary in a sidebar expander.

    Four sections are rendered in order (dimensionality reduction,
    clustering, separation, metrics), each as a heading followed by one
    bullet per entry.
    """
    # (heading markdown, key into METHOD_EXPLANATIONS), in display order.
    sections = (
        ("**Dimensionality Reduction:**", "dimension_reduction"),
        ("\n**Clustering Methods:**", "clustering"),
        ("\n**Separation Techniques:**", "separation"),
        ("\n**Metrics:**", "metrics"),
    )

    st.sidebar.markdown("---")
    with st.sidebar.expander("📚 Method Explanations"):
        for heading, key in sections:
            st.markdown(heading)
            for name, explanation in METHOD_EXPLANATIONS[key].items():
                st.markdown(f"- **{name}**: {explanation}")
|
||||||
|
|
||||||
|
|
||||||
|
def display_performance_warnings(filtered_df, method, clustering_method):
    """Warn when a costly method is combined with a large dataset.

    Nothing is shown unless ``filtered_df`` has more rows than
    ``LARGE_DATASET_WARNING_THRESHOLD``; then one warning is emitted for each
    of the reduction / clustering methods listed in
    ``COMPUTATIONALLY_INTENSIVE_METHODS``.
    """
    n_points = len(filtered_df)
    if n_points <= LARGE_DATASET_WARNING_THRESHOLD:
        return

    if method in COMPUTATIONALLY_INTENSIVE_METHODS["dimension_reduction"]:
        st.warning(f"⚠️ {method} with {n_points} points may take several minutes to compute.")
    if clustering_method in COMPUTATIONALLY_INTENSIVE_METHODS["clustering"]:
        st.warning(f"⚠️ {clustering_method} with {n_points} points may be computationally intensive.")
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_ui_parameters(valid_df):
    """Render every sidebar control and collect the chosen values.

    Args:
        valid_df: DataFrame passed through to the filter controls.

    Returns:
        dict: Settings keyed by name (``method``, ``clustering_method``,
        ``n_clusters``, the separation / jittering / advanced options, and
        ``selected_sources`` / ``selected_authors``).
    """
    params = {}

    # Method selection
    params['method'], params['clustering_method'] = create_method_controls()

    # Clustering parameters
    params['n_clusters'] = create_clustering_controls(params['clustering_method'])

    # Separation controls
    (params['spread_factor'],
     params['perplexity_factor'],
     params['min_dist_factor']) = create_separation_controls(params['method'])

    # Jittering controls
    (params['apply_jittering'],
     params['jitter_strength'],
     params['density_based_jitter']) = create_jittering_controls()

    # Advanced options
    (params['show_cluster_metrics'],
     params['point_size'],
     params['point_opacity'],
     params['density_based_sizing'],
     params['size_variation']) = create_advanced_options()

    # Filters
    params['selected_sources'], params['selected_authors'] = create_filter_controls(valid_df)

    # Method explanations are rendered last so they sit below the controls.
    display_method_explanations()

    return params
|
||||||
225
apps/cluster_map/visualization.py
Normal file
225
apps/cluster_map/visualization.py
Normal file
@@ -0,0 +1,225 @@
|
|||||||
|
"""
|
||||||
|
Visualization functions for creating interactive plots and displays.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import plotly.express as px
|
||||||
|
import plotly.graph_objects as go
|
||||||
|
import streamlit as st
|
||||||
|
from dimensionality_reduction import calculate_local_density_scaling
|
||||||
|
from config import MESSAGE_CONTENT_PREVIEW_LENGTH, DEFAULT_POINT_SIZE, DEFAULT_POINT_OPACITY
|
||||||
|
|
||||||
|
|
||||||
|
def create_hover_text(df):
    """Build one HTML hover label per row of ``df`` for the Plotly scatter.

    Args:
        df: DataFrame providing the ``author_name``, ``timestamp_utc``,
            ``source_file`` and ``content`` columns.

    Returns:
        list[str]: One label per row, in row order. Content is cut to
        ``MESSAGE_CONTENT_PREVIEW_LENGTH`` characters (with a trailing
        ``...`` when truncated); missing content is shown as ``[No content]``.
    """
    hover_text = []
    # Iterate the four columns in lockstep instead of materialising a Series
    # per row with iterrows().
    columns = zip(df['author_name'], df['timestamp_utc'], df['source_file'], df['content'])
    for author, timestamp, source, content in columns:
        # pd.isna covers both None and NaN, so no separate None check is needed.
        if pd.isna(content):
            content_text = "[No content]"
        else:
            content_str = str(content)
            content_text = content_str[:MESSAGE_CONTENT_PREVIEW_LENGTH] + (
                '...' if len(content_str) > MESSAGE_CONTENT_PREVIEW_LENGTH else ''
            )

        hover_text.append(
            f"<b>Author:</b> {author}<br>"
            f"<b>Timestamp:</b> {timestamp}<br>"
            f"<b>Source:</b> {source}<br>"
            f"<b>Content:</b> {content_text}"
        )
    return hover_text
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_point_sizes(reduced_embeddings, density_based_sizing=False,
                          point_size=DEFAULT_POINT_SIZE, size_variation=2.0):
    """Return a marker size for every point.

    With ``density_based_sizing`` off, every point gets ``point_size`` (as a
    list). Otherwise the size grows with ``1 - local density`` so that points
    in sparser areas are drawn larger, up to ``point_size * size_variation``
    when the inverted density is 1.
    """
    if density_based_sizing:
        # NOTE(review): presumably the densities are normalised to [0, 1] — confirm
        # against calculate_local_density_scaling.
        sparsity = 1.0 - calculate_local_density_scaling(reduced_embeddings)
        return point_size * (1.0 + sparsity * (size_variation - 1.0))

    return [point_size] * len(reduced_embeddings)
|
||||||
|
|
||||||
|
|
||||||
|
def create_clustered_plot(reduced_embeddings, filtered_df, cluster_labels, hover_text,
                          point_sizes, point_opacity=DEFAULT_POINT_OPACITY, method="PCA"):
    """Build a scatter figure with one trace per cluster label.

    Label ``-1`` is named "Noise"; every other label becomes "Cluster <id>".
    Trace colors cycle through the combined Set3 + Pastel palettes.
    ``filtered_df`` and ``method`` are accepted but not used here.

    Returns:
        plotly.graph_objects.Figure: The figure holding the cluster traces.
    """
    fig = go.Figure()
    palette = px.colors.qualitative.Set3 + px.colors.qualitative.Pastel

    for position, cluster_id in enumerate(np.unique(cluster_labels)):
        members = cluster_labels == cluster_id
        if not members.any():
            continue

        # Positions of this cluster's points, used to pick the matching
        # hover labels and sizes.
        member_positions = [j for j, is_member in enumerate(members) if is_member]
        points = reduced_embeddings[members]
        trace_name = "Noise" if cluster_id == -1 else f"Cluster {cluster_id}"

        marker_style = dict(
            size=[point_sizes[j] for j in member_positions],
            color=palette[position % len(palette)],
            opacity=point_opacity,
            line=dict(width=1, color='white')
        )
        fig.add_trace(go.Scatter(
            x=points[:, 0],
            y=points[:, 1],
            mode='markers',
            name=trace_name,
            marker=marker_style,
            hovertemplate='%{hovertext}<extra></extra>',
            hovertext=[hover_text[j] for j in member_positions]
        ))

    return fig
|
||||||
|
|
||||||
|
|
||||||
|
def create_source_colored_plot(reduced_embeddings, filtered_df, selected_sources, hover_text,
                               point_sizes, point_opacity=DEFAULT_POINT_OPACITY):
    """Build a scatter figure with one trace per selected source file.

    Sources with no rows in ``filtered_df`` are skipped. Trace colors cycle
    through the Set1 palette by the source's position in ``selected_sources``.

    Returns:
        plotly.graph_objects.Figure: The figure holding the source traces.
    """
    fig = go.Figure()
    palette = px.colors.qualitative.Set1

    for position, source in enumerate(selected_sources):
        source_mask = filtered_df['source_file'] == source
        if not source_mask.any():
            continue

        # Positions of this source's rows, used to pick the matching hover
        # labels and sizes.
        row_positions = [j for j, selected in enumerate(source_mask) if selected]
        points = reduced_embeddings[source_mask]

        marker_style = dict(
            size=[point_sizes[j] for j in row_positions],
            color=palette[position % len(palette)],
            opacity=point_opacity,
            line=dict(width=1, color='white')
        )
        fig.add_trace(go.Scatter(
            x=points[:, 0],
            y=points[:, 1],
            mode='markers',
            name=source,
            marker=marker_style,
            hovertemplate='%{hovertext}<extra></extra>',
            hovertext=[hover_text[j] for j in row_positions]
        ))

    return fig
|
||||||
|
|
||||||
|
|
||||||
|
def create_visualization_plot(reduced_embeddings, filtered_df, cluster_labels=None,
                              selected_sources=None, method="PCA", clustering_method="None",
                              point_size=DEFAULT_POINT_SIZE, point_opacity=DEFAULT_POINT_OPACITY,
                              density_based_sizing=False, size_variation=2.0):
    """Create the main scatter figure.

    Points are colored by cluster when ``cluster_labels`` is given, otherwise
    by source file (all sources present in ``filtered_df`` when
    ``selected_sources`` is None). The title and axis labels mention
    ``method`` and, unless it is "None", ``clustering_method``.

    Returns:
        plotly.graph_objects.Figure: The configured figure.
    """
    hover_text = create_hover_text(filtered_df)
    point_sizes = calculate_point_sizes(
        reduced_embeddings, density_based_sizing, point_size, size_variation
    )

    # Pick the coloring strategy.
    if cluster_labels is None:
        sources = filtered_df['source_file'].unique() if selected_sources is None else selected_sources
        fig = create_source_colored_plot(
            reduced_embeddings, filtered_df, sources, hover_text, point_sizes, point_opacity
        )
    else:
        fig = create_clustered_plot(
            reduced_embeddings, filtered_df, cluster_labels,
            hover_text, point_sizes, point_opacity, method
        )

    # Update layout
    title_suffix = "" if clustering_method == "None" else f" with {clustering_method}"
    layout = dict(
        title=f"Discord Chat Messages - {method} Visualization{title_suffix}",
        xaxis_title=f"{method} Component 1",
        yaxis_title=f"{method} Component 2",
        hovermode='closest',
        width=1000,
        height=700
    )
    fig.update_layout(**layout)

    return fig
|
||||||
|
|
||||||
|
|
||||||
|
def display_clustering_metrics(cluster_labels, silhouette_avg, calinski_harabasz, show_metrics=True):
    """Show cluster count, silhouette score and Calinski-Harabasz index.

    Nothing is rendered when there are no labels or ``show_metrics`` is
    false. The cluster count ignores label ``-1``; a score of ``None`` is
    displayed as "N/A".
    """
    if cluster_labels is None or not show_metrics:
        return

    count_col, silhouette_col, calinski_col = st.columns(3)

    with count_col:
        non_noise_labels = cluster_labels[cluster_labels != -1]
        st.metric("Clusters Found", len(np.unique(non_noise_labels)))

    with silhouette_col:
        silhouette_text = "N/A" if silhouette_avg is None else f"{silhouette_avg:.3f}"
        st.metric("Silhouette Score", silhouette_text)

    with calinski_col:
        calinski_text = "N/A" if calinski_harabasz is None else f"{calinski_harabasz:.1f}"
        st.metric("Calinski-Harabasz Index", calinski_text)
|
||||||
|
|
||||||
|
|
||||||
|
def display_summary_stats(filtered_df, selected_sources):
    """Show message, unique-author and source-file counts in three columns."""
    messages_col, authors_col, sources_col = st.columns(3)

    with messages_col:
        total_messages = len(filtered_df)
        st.metric("Total Messages", total_messages)

    with authors_col:
        unique_authors = filtered_df['author_name'].nunique()
        st.metric("Unique Authors", unique_authors)

    with sources_col:
        source_count = len(selected_sources)
        st.metric("Source Files", source_count)
|
||||||
|
|
||||||
|
|
||||||
|
def display_clustering_results(filtered_df, cluster_labels, reduced_embeddings, method, clustering_method):
    """Show the cluster size distribution and offer the results as a CSV download.

    Does nothing when ``cluster_labels`` is None. The export is a copy of
    ``filtered_df`` with ``cluster_id``, ``x_coordinate`` and
    ``y_coordinate`` columns appended.
    """
    if cluster_labels is None:
        return

    st.subheader("📊 Clustering Results")

    # Work on a copy so the caller's dataframe is not modified.
    export_df = filtered_df.copy()
    extra_columns = (
        ('cluster_id', cluster_labels),
        ('x_coordinate', reduced_embeddings[:, 0]),
        ('y_coordinate', reduced_embeddings[:, 1]),
    )
    for column_name, values in extra_columns:
        export_df[column_name] = values

    # Cluster distribution, ordered by cluster label.
    label_counts = pd.Series(cluster_labels).value_counts()
    st.bar_chart(label_counts.sort_index())

    # Download option
    download_name = f"chat_clusters_{method}_{clustering_method}.csv"
    st.download_button(
        label="📥 Download Clustering Results (CSV)",
        data=export_df.to_csv(index=False),
        file_name=download_name,
        mime="text/csv"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def display_data_table(filtered_df, cluster_labels=None):
    """Show the message table behind a "Show Data Table" checkbox.

    Args:
        filtered_df: DataFrame with ``timestamp_utc``, ``author_name``,
            ``source_file`` and ``content`` columns.
        cluster_labels: Optional per-row labels, added as a ``cluster`` column.

    Content is previewed at 100 characters. The ellipsis is appended only
    when the text was actually cut (matching ``create_hover_text``), and
    missing or non-string content is shown as an empty string instead of
    breaking the ``.str`` accessor.
    """
    if not st.checkbox("Show Data Table"):
        return

    st.subheader("📋 Message Data")
    display_df = filtered_df[['timestamp_utc', 'author_name', 'source_file', 'content']].copy()

    # Add clustering info if available
    if cluster_labels is not None:
        display_df['cluster'] = cluster_labels

    # Truncate for display without marking short messages as truncated.
    as_text = display_df['content'].fillna('').astype(str)
    display_df['content'] = as_text.where(as_text.str.len() <= 100, as_text.str[:100] + '...')
    st.dataframe(display_df, use_container_width=True)
|
||||||
Reference in New Issue
Block a user